diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,26860 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 400, + "global_step": 17412, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017229496898690558, + "grad_norm": 2.182401180267334, + "learning_rate": 1.148105625717566e-10, + "logits/chosen": -2.967046022415161, + "logits/rejected": -2.9243061542510986, + "logps/chosen": -43.99115753173828, + "logps/rejected": -41.627906799316406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0017229496898690559, + "grad_norm": 2.3828177452087402, + "learning_rate": 1.148105625717566e-09, + "logits/chosen": -3.0551016330718994, + "logits/rejected": -3.025693893432617, + "logps/chosen": -50.46271514892578, + "logps/rejected": -49.61042785644531, + "loss": 0.693, + "rewards/accuracies": 0.5069444179534912, + "rewards/chosen": 3.583024226827547e-05, + "rewards/margins": 0.0002894425706472248, + "rewards/rejected": -0.0002536123211029917, + "step": 10 + }, + { + "epoch": 0.0034458993797381117, + "grad_norm": 2.242837429046631, + "learning_rate": 2.296211251435132e-09, + "logits/chosen": -3.119117259979248, + "logits/rejected": -3.1108996868133545, + "logps/chosen": -52.652076721191406, + "logps/rejected": -52.98986053466797, + "loss": 0.6932, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": 0.00011003723921021447, + "rewards/margins": -4.4298911234363914e-05, + "rewards/rejected": 0.00015433612861670554, + "step": 20 + }, + { + "epoch": 0.005168849069607168, + "grad_norm": 2.5732250213623047, + "learning_rate": 3.4443168771526976e-09, + "logits/chosen": -3.0916590690612793, + "logits/rejected": -3.067901611328125, + "logps/chosen": -56.798622131347656, + "logps/rejected": -58.4221076965332, + "loss": 0.6932, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -2.500688970030751e-05, + "rewards/margins": -0.00017597868281882256, + "rewards/rejected": 0.00015097178402356803, + "step": 30 + }, + { + "epoch": 0.006891798759476223, + "grad_norm": 2.0121591091156006, + "learning_rate": 4.592422502870264e-09, + "logits/chosen": -3.1051547527313232, + "logits/rejected": -3.0735793113708496, + "logps/chosen": -55.268089294433594, + "logps/rejected": -50.67551803588867, + "loss": 0.6932, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 4.075562537764199e-05, + "rewards/margins": -5.7307326642330736e-05, + "rewards/rejected": 9.806293383007869e-05, + "step": 40 + }, + { + "epoch": 0.00861474844934528, + "grad_norm": 2.3856751918792725, + "learning_rate": 5.74052812858783e-09, + "logits/chosen": -3.100912570953369, + "logits/rejected": -3.0844902992248535, + "logps/chosen": -53.11623001098633, + "logps/rejected": -51.5071907043457, + "loss": 0.6931, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 3.748986728169257e-06, + "rewards/margins": 7.726665353402495e-05, + "rewards/rejected": -7.35176945454441e-05, + "step": 50 + }, + { + "epoch": 0.010337698139214336, + "grad_norm": 2.7930078506469727, + "learning_rate": 6.888633754305395e-09, + "logits/chosen": -3.1540441513061523, + "logits/rejected": -3.124316453933716, + "logps/chosen": -57.598358154296875, + "logps/rejected": -54.17271041870117, + "loss": 0.6933, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.00011934386566281319, + "rewards/margins": -0.00021098754950799048, + "rewards/rejected": 9.164368384517729e-05, + "step": 60 + }, + { + "epoch": 0.012060647829083391, + "grad_norm": 2.204110860824585, + "learning_rate": 8.036739380022962e-09, + "logits/chosen": -3.0506539344787598, + "logits/rejected": -3.030632495880127, + "logps/chosen": -53.753807067871094, + "logps/rejected": -53.215538024902344, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.000154320354340598, + "rewards/margins": 6.41578808426857e-05, + "rewards/rejected": 9.016246622195467e-05, + "step": 70 + }, + { + "epoch": 0.013783597518952447, + "grad_norm": 2.4381542205810547, + "learning_rate": 9.184845005740529e-09, + "logits/chosen": -3.1597819328308105, + "logits/rejected": -3.1264419555664062, + "logps/chosen": -59.09504318237305, + "logps/rejected": -54.12665557861328, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 1.3803789443045389e-05, + "rewards/margins": 0.0003244551189709455, + "rewards/rejected": -0.0003106513468082994, + "step": 80 + }, + { + "epoch": 0.015506547208821502, + "grad_norm": 2.4764792919158936, + "learning_rate": 1.0332950631458094e-08, + "logits/chosen": -2.9934918880462646, + "logits/rejected": -2.9788215160369873, + "logps/chosen": -53.476707458496094, + "logps/rejected": -52.831390380859375, + "loss": 0.6932, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -5.649983722832985e-05, + "rewards/margins": -0.00016349827637895942, + "rewards/rejected": 0.00010699845006456599, + "step": 90 + }, + { + "epoch": 0.01722949689869056, + "grad_norm": 2.485914707183838, + "learning_rate": 1.148105625717566e-08, + "logits/chosen": -3.1696791648864746, + "logits/rejected": -3.107633590698242, + "logps/chosen": -55.949684143066406, + "logps/rejected": -49.63793182373047, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -4.7669574996689335e-05, + "rewards/margins": 3.545046638464555e-05, + "rewards/rejected": -8.312005229527131e-05, + "step": 100 + }, + { + "epoch": 0.018952446588559616, + "grad_norm": 2.541780710220337, + "learning_rate": 1.2629161882893224e-08, + "logits/chosen": -3.1218652725219727, + "logits/rejected": -3.098087787628174, + "logps/chosen": -55.605926513671875, + "logps/rejected": -52.333740234375, + "loss": 0.6933, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -0.00028869349625892937, + "rewards/margins": -0.0002611152012832463, + "rewards/rejected": -2.7578294975683093e-05, + "step": 110 + }, + { + "epoch": 0.02067539627842867, + "grad_norm": 2.5565247535705566, + "learning_rate": 1.377726750861079e-08, + "logits/chosen": -3.065533399581909, + "logits/rejected": -3.0499966144561768, + "logps/chosen": -53.1763916015625, + "logps/rejected": -55.58396530151367, + "loss": 0.6931, + "rewards/accuracies": 0.5625, + "rewards/chosen": 5.7340563216712326e-05, + "rewards/margins": 0.00017559928528498858, + "rewards/rejected": -0.00011825871479231864, + "step": 120 + }, + { + "epoch": 0.022398345968297727, + "grad_norm": 2.139101982116699, + "learning_rate": 1.4925373134328357e-08, + "logits/chosen": -3.1009914875030518, + "logits/rejected": -3.08695650100708, + "logps/chosen": -55.18524169921875, + "logps/rejected": -53.770782470703125, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": 8.389687718590721e-05, + "rewards/margins": 0.00018740523955784738, + "rewards/rejected": -0.00010350835509598255, + "step": 130 + }, + { + "epoch": 0.024121295658166782, + "grad_norm": 2.431870460510254, + "learning_rate": 1.6073478760045924e-08, + "logits/chosen": -3.122816562652588, + "logits/rejected": -3.104506015777588, + "logps/chosen": -54.189476013183594, + "logps/rejected": -53.77021408081055, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -3.4902594052255154e-05, + "rewards/margins": 9.448556374991313e-05, + "rewards/rejected": -0.00012938815052621067, + "step": 140 + }, + { + "epoch": 0.025844245348035838, + "grad_norm": 2.2152774333953857, + "learning_rate": 1.722158438576349e-08, + "logits/chosen": -3.0275559425354004, + "logits/rejected": -3.0098488330841064, + "logps/chosen": -52.62406539916992, + "logps/rejected": -52.41263961791992, + "loss": 0.6931, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 2.73335517704254e-05, + "rewards/margins": 0.00013241718988865614, + "rewards/rejected": -0.00010508365085115656, + "step": 150 + }, + { + "epoch": 0.027567195037904894, + "grad_norm": 2.1547353267669678, + "learning_rate": 1.8369690011481057e-08, + "logits/chosen": -3.0887503623962402, + "logits/rejected": -3.067896604537964, + "logps/chosen": -53.48332595825195, + "logps/rejected": -54.71419143676758, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": 3.922378527931869e-05, + "rewards/margins": 0.00015319202793762088, + "rewards/rejected": -0.00011396827176213264, + "step": 160 + }, + { + "epoch": 0.02929014472777395, + "grad_norm": 2.3667101860046387, + "learning_rate": 1.9517795637198624e-08, + "logits/chosen": -3.076423168182373, + "logits/rejected": -3.0568714141845703, + "logps/chosen": -56.27741622924805, + "logps/rejected": -51.32807540893555, + "loss": 0.6931, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 7.188701420091093e-05, + "rewards/margins": 0.00013863443746231496, + "rewards/rejected": -6.674742326140404e-05, + "step": 170 + }, + { + "epoch": 0.031013094417643005, + "grad_norm": 2.604196548461914, + "learning_rate": 2.0665901262916187e-08, + "logits/chosen": -3.0623817443847656, + "logits/rejected": -3.043731451034546, + "logps/chosen": -56.39360809326172, + "logps/rejected": -53.7656364440918, + "loss": 0.6932, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 0.00012041317677358165, + "rewards/margins": -3.166842361679301e-05, + "rewards/rejected": 0.00015208160039037466, + "step": 180 + }, + { + "epoch": 0.03273604410751206, + "grad_norm": 2.6357362270355225, + "learning_rate": 2.1814006888633754e-08, + "logits/chosen": -3.1240854263305664, + "logits/rejected": -3.080690860748291, + "logps/chosen": -58.188934326171875, + "logps/rejected": -52.56037521362305, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -5.388389763538726e-05, + "rewards/margins": 0.00017177227709908038, + "rewards/rejected": -0.00022565617109648883, + "step": 190 + }, + { + "epoch": 0.03445899379738112, + "grad_norm": 2.5758800506591797, + "learning_rate": 2.296211251435132e-08, + "logits/chosen": -3.059539556503296, + "logits/rejected": -3.0438239574432373, + "logps/chosen": -54.10227584838867, + "logps/rejected": -54.71996307373047, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.00020430810400284827, + "rewards/margins": 0.0001816313888411969, + "rewards/rejected": 2.2676715161651373e-05, + "step": 200 + }, + { + "epoch": 0.03618194348725017, + "grad_norm": 2.2871100902557373, + "learning_rate": 2.4110218140068887e-08, + "logits/chosen": -3.0141561031341553, + "logits/rejected": -3.00553560256958, + "logps/chosen": -53.250831604003906, + "logps/rejected": -57.2822380065918, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.00013214505452197045, + "rewards/margins": 4.95816238981206e-05, + "rewards/rejected": 8.256339060608298e-05, + "step": 210 + }, + { + "epoch": 0.03790489317711923, + "grad_norm": 2.3293843269348145, + "learning_rate": 2.5258323765786448e-08, + "logits/chosen": -3.0508525371551514, + "logits/rejected": -3.01947021484375, + "logps/chosen": -52.181129455566406, + "logps/rejected": -51.32151412963867, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0001834220893215388, + "rewards/margins": 0.00023680910817347467, + "rewards/rejected": -5.338704795576632e-05, + "step": 220 + }, + { + "epoch": 0.03962784286698828, + "grad_norm": 2.391742467880249, + "learning_rate": 2.6406429391504014e-08, + "logits/chosen": -3.0510756969451904, + "logits/rejected": -3.0328288078308105, + "logps/chosen": -48.904396057128906, + "logps/rejected": -49.960792541503906, + "loss": 0.693, + "rewards/accuracies": 0.53125, + "rewards/chosen": 5.220425009611063e-05, + "rewards/margins": 0.00021240771457087249, + "rewards/rejected": -0.00016020346083678305, + "step": 230 + }, + { + "epoch": 0.04135079255685734, + "grad_norm": 2.250674247741699, + "learning_rate": 2.755453501722158e-08, + "logits/chosen": -3.0246095657348633, + "logits/rejected": -2.9822471141815186, + "logps/chosen": -55.94166946411133, + "logps/rejected": -52.1525993347168, + "loss": 0.6931, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 0.00012501263699959964, + "rewards/margins": 0.00010458445467520505, + "rewards/rejected": 2.0428178686415777e-05, + "step": 240 + }, + { + "epoch": 0.043073742246726394, + "grad_norm": 2.317246437072754, + "learning_rate": 2.8702640642939148e-08, + "logits/chosen": -3.1179134845733643, + "logits/rejected": -3.097510576248169, + "logps/chosen": -52.26668167114258, + "logps/rejected": -51.089698791503906, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0001339766022283584, + "rewards/margins": 5.490519106388092e-05, + "rewards/rejected": 7.907139661256224e-05, + "step": 250 + }, + { + "epoch": 0.044796691936595454, + "grad_norm": 2.3133351802825928, + "learning_rate": 2.9850746268656714e-08, + "logits/chosen": -3.094203472137451, + "logits/rejected": -3.0818848609924316, + "logps/chosen": -54.8641357421875, + "logps/rejected": -56.6263313293457, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 5.03903029311914e-05, + "rewards/margins": 0.00014110926713328809, + "rewards/rejected": -9.071899694390595e-05, + "step": 260 + }, + { + "epoch": 0.046519641626464506, + "grad_norm": 2.2149734497070312, + "learning_rate": 3.099885189437428e-08, + "logits/chosen": -3.033080577850342, + "logits/rejected": -3.014913558959961, + "logps/chosen": -53.12932586669922, + "logps/rejected": -54.3194465637207, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -9.781783592188731e-05, + "rewards/margins": 4.450721462490037e-05, + "rewards/rejected": -0.00014232503599487245, + "step": 270 + }, + { + "epoch": 0.048242591316333565, + "grad_norm": 2.4271368980407715, + "learning_rate": 3.214695752009185e-08, + "logits/chosen": -3.1243245601654053, + "logits/rejected": -3.090181589126587, + "logps/chosen": -57.60817337036133, + "logps/rejected": -53.423431396484375, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 5.003236583434045e-05, + "rewards/margins": 0.0001398597814841196, + "rewards/rejected": -8.982741564977914e-05, + "step": 280 + }, + { + "epoch": 0.04996554100620262, + "grad_norm": 2.24564790725708, + "learning_rate": 3.3295063145809414e-08, + "logits/chosen": -3.047020673751831, + "logits/rejected": -3.0328316688537598, + "logps/chosen": -55.384986877441406, + "logps/rejected": -54.282264709472656, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -8.540546696167439e-05, + "rewards/margins": -8.022228576010093e-05, + "rewards/rejected": -5.1831566452165134e-06, + "step": 290 + }, + { + "epoch": 0.051688490696071676, + "grad_norm": 2.359998941421509, + "learning_rate": 3.444316877152698e-08, + "logits/chosen": -3.0025739669799805, + "logits/rejected": -2.9939587116241455, + "logps/chosen": -52.850181579589844, + "logps/rejected": -53.94788360595703, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -7.816951256245375e-05, + "rewards/margins": -4.090732545591891e-05, + "rewards/rejected": -3.726218710653484e-05, + "step": 300 + }, + { + "epoch": 0.05341144038594073, + "grad_norm": 2.47426176071167, + "learning_rate": 3.559127439724455e-08, + "logits/chosen": -3.0653576850891113, + "logits/rejected": -3.05991268157959, + "logps/chosen": -53.51900100708008, + "logps/rejected": -53.30791473388672, + "loss": 0.6931, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": 4.028494004160166e-06, + "rewards/margins": 0.00012998899910598993, + "rewards/rejected": -0.00012596049054991454, + "step": 310 + }, + { + "epoch": 0.05513439007580979, + "grad_norm": 2.410942554473877, + "learning_rate": 3.6739380022962115e-08, + "logits/chosen": -3.0229759216308594, + "logits/rejected": -2.9965834617614746, + "logps/chosen": -54.520713806152344, + "logps/rejected": -49.27983093261719, + "loss": 0.693, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0001271664659725502, + "rewards/margins": 0.00026925824931822717, + "rewards/rejected": -0.0003964246716350317, + "step": 320 + }, + { + "epoch": 0.05685733976567884, + "grad_norm": 2.349635124206543, + "learning_rate": 3.788748564867968e-08, + "logits/chosen": -3.0833523273468018, + "logits/rejected": -3.059699535369873, + "logps/chosen": -55.03084182739258, + "logps/rejected": -52.24028778076172, + "loss": 0.693, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 2.294254045409616e-05, + "rewards/margins": 0.00032292449031956494, + "rewards/rejected": -0.00029998194077052176, + "step": 330 + }, + { + "epoch": 0.0585802894555479, + "grad_norm": 2.1602048873901367, + "learning_rate": 3.903559127439725e-08, + "logits/chosen": -3.0051302909851074, + "logits/rejected": -2.983626127243042, + "logps/chosen": -52.538604736328125, + "logps/rejected": -51.94645309448242, + "loss": 0.6932, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.00012052619422320276, + "rewards/margins": -1.1196988452866208e-05, + "rewards/rejected": -0.00010932923760265112, + "step": 340 + }, + { + "epoch": 0.06030323914541695, + "grad_norm": 2.3185722827911377, + "learning_rate": 4.018369690011481e-08, + "logits/chosen": -2.9774973392486572, + "logits/rejected": -2.9381320476531982, + "logps/chosen": -56.23899459838867, + "logps/rejected": -53.59648513793945, + "loss": 0.6931, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.0002953378134407103, + "rewards/margins": 0.0001039146663970314, + "rewards/rejected": -0.00039925254532136023, + "step": 350 + }, + { + "epoch": 0.06202618883528601, + "grad_norm": 2.4112558364868164, + "learning_rate": 4.1331802525832375e-08, + "logits/chosen": -3.1283843517303467, + "logits/rejected": -3.105407238006592, + "logps/chosen": -54.58465576171875, + "logps/rejected": -50.559539794921875, + "loss": 0.6929, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 2.8427713914425112e-05, + "rewards/margins": 0.0004831443657167256, + "rewards/rejected": -0.0004547166754491627, + "step": 360 + }, + { + "epoch": 0.06374913852515507, + "grad_norm": 2.3298943042755127, + "learning_rate": 4.247990815154994e-08, + "logits/chosen": -3.103458881378174, + "logits/rejected": -3.0742361545562744, + "logps/chosen": -52.39043426513672, + "logps/rejected": -51.40144729614258, + "loss": 0.6929, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -5.273760689306073e-05, + "rewards/margins": 0.0005517008830793202, + "rewards/rejected": -0.0006044383626431227, + "step": 370 + }, + { + "epoch": 0.06547208821502412, + "grad_norm": 2.0821239948272705, + "learning_rate": 4.362801377726751e-08, + "logits/chosen": -3.2037720680236816, + "logits/rejected": -3.1789143085479736, + "logps/chosen": -53.529457092285156, + "logps/rejected": -52.24095916748047, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0002573663368821144, + "rewards/margins": 0.0002463909622747451, + "rewards/rejected": -0.0005037573864683509, + "step": 380 + }, + { + "epoch": 0.06719503790489317, + "grad_norm": 2.40598464012146, + "learning_rate": 4.4776119402985075e-08, + "logits/chosen": -3.1002163887023926, + "logits/rejected": -3.0747485160827637, + "logps/chosen": -56.086753845214844, + "logps/rejected": -55.247955322265625, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.00021222772193141282, + "rewards/margins": 0.00019325126777403057, + "rewards/rejected": -0.0004054790479131043, + "step": 390 + }, + { + "epoch": 0.06891798759476224, + "grad_norm": 2.0966548919677734, + "learning_rate": 4.592422502870264e-08, + "logits/chosen": -3.0705161094665527, + "logits/rejected": -3.054738998413086, + "logps/chosen": -52.68198776245117, + "logps/rejected": -52.8104362487793, + "loss": 0.6931, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.0004277491825632751, + "rewards/margins": 1.7190934613608988e-06, + "rewards/rejected": -0.00042946828762069345, + "step": 400 + }, + { + "epoch": 0.06891798759476224, + "eval_logits/chosen": -3.1630496978759766, + "eval_logits/rejected": -3.157418727874756, + "eval_logps/chosen": -58.6923942565918, + "eval_logps/rejected": -63.1541748046875, + "eval_loss": 0.6931801438331604, + "eval_rewards/accuracies": 0.46538102626800537, + "eval_rewards/chosen": 0.00019500928465276957, + "eval_rewards/margins": -6.448826025007293e-05, + "eval_rewards/rejected": 0.0002594975521788001, + "eval_runtime": 382.9357, + "eval_samples_per_second": 11.239, + "eval_steps_per_second": 1.405, + "step": 400 + }, + { + "epoch": 0.07064093728463129, + "grad_norm": 2.1526947021484375, + "learning_rate": 4.707233065442021e-08, + "logits/chosen": -3.0743823051452637, + "logits/rejected": -3.070225238800049, + "logps/chosen": -50.76953125, + "logps/rejected": -55.5819206237793, + "loss": 0.693, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.00024037156254053116, + "rewards/margins": 0.00029858676134608686, + "rewards/rejected": -0.0005389582947827876, + "step": 410 + }, + { + "epoch": 0.07236388697450034, + "grad_norm": 2.527576208114624, + "learning_rate": 4.8220436280137775e-08, + "logits/chosen": -3.0596532821655273, + "logits/rejected": -3.051694631576538, + "logps/chosen": -54.21385955810547, + "logps/rejected": -53.94188690185547, + "loss": 0.6932, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0004788221849594265, + "rewards/margins": -0.00019773165695369244, + "rewards/rejected": -0.00028109049890190363, + "step": 420 + }, + { + "epoch": 0.0740868366643694, + "grad_norm": 2.2473433017730713, + "learning_rate": 4.9368541905855335e-08, + "logits/chosen": -3.0878098011016846, + "logits/rejected": -3.0740981101989746, + "logps/chosen": -53.11975860595703, + "logps/rejected": -54.20466232299805, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0002864287234842777, + "rewards/margins": 0.00012910208897665143, + "rewards/rejected": -0.00041553081246092916, + "step": 430 + }, + { + "epoch": 0.07580978635423846, + "grad_norm": 2.511948823928833, + "learning_rate": 5.0516647531572895e-08, + "logits/chosen": -3.1331732273101807, + "logits/rejected": -3.0978844165802, + "logps/chosen": -54.333213806152344, + "logps/rejected": -53.343994140625, + "loss": 0.6928, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.00028464937349781394, + "rewards/margins": 0.0006941338069736958, + "rewards/rejected": -0.000978783005848527, + "step": 440 + }, + { + "epoch": 0.07753273604410751, + "grad_norm": 2.281709909439087, + "learning_rate": 5.166475315729046e-08, + "logits/chosen": -3.0553770065307617, + "logits/rejected": -3.0229177474975586, + "logps/chosen": -56.12068557739258, + "logps/rejected": -54.6532096862793, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.000248014519456774, + "rewards/margins": 0.0002685516665223986, + "rewards/rejected": -0.0005165661568753421, + "step": 450 + }, + { + "epoch": 0.07925568573397657, + "grad_norm": 2.403916597366333, + "learning_rate": 5.281285878300803e-08, + "logits/chosen": -3.024060010910034, + "logits/rejected": -3.0043814182281494, + "logps/chosen": -56.222320556640625, + "logps/rejected": -53.07564163208008, + "loss": 0.6928, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.00022003026970196515, + "rewards/margins": 0.0006629715790040791, + "rewards/rejected": -0.000883001834154129, + "step": 460 + }, + { + "epoch": 0.08097863542384562, + "grad_norm": 2.208056688308716, + "learning_rate": 5.3960964408725595e-08, + "logits/chosen": -3.0470211505889893, + "logits/rejected": -3.015340566635132, + "logps/chosen": -53.32251739501953, + "logps/rejected": -51.46124267578125, + "loss": 0.6931, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.0005692066042684019, + "rewards/margins": 4.9208720156457275e-05, + "rewards/rejected": -0.0006184153025969863, + "step": 470 + }, + { + "epoch": 0.08270158511371468, + "grad_norm": 2.447040557861328, + "learning_rate": 5.510907003444316e-08, + "logits/chosen": -3.044581890106201, + "logits/rejected": -3.0393683910369873, + "logps/chosen": -54.26829147338867, + "logps/rejected": -59.047401428222656, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.00029101339168846607, + "rewards/margins": 0.0005864130798727274, + "rewards/rejected": -0.0008774265879765153, + "step": 480 + }, + { + "epoch": 0.08442453480358374, + "grad_norm": 2.480241298675537, + "learning_rate": 5.625717566016073e-08, + "logits/chosen": -2.9544525146484375, + "logits/rejected": -2.9047305583953857, + "logps/chosen": -60.655059814453125, + "logps/rejected": -51.4771842956543, + "loss": 0.6927, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0003955605789087713, + "rewards/margins": 0.0009884096216410398, + "rewards/rejected": -0.001383970258757472, + "step": 490 + }, + { + "epoch": 0.08614748449345279, + "grad_norm": 2.2628631591796875, + "learning_rate": 5.7405281285878295e-08, + "logits/chosen": -3.0166900157928467, + "logits/rejected": -2.987926721572876, + "logps/chosen": -55.03630447387695, + "logps/rejected": -51.71287155151367, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.0008643764886073768, + "rewards/margins": 0.00037768454058095813, + "rewards/rejected": -0.0012420611456036568, + "step": 500 + }, + { + "epoch": 0.08787043418332184, + "grad_norm": 2.239893674850464, + "learning_rate": 5.855338691159586e-08, + "logits/chosen": -3.008843421936035, + "logits/rejected": -2.9875636100769043, + "logps/chosen": -58.34012985229492, + "logps/rejected": -52.08111572265625, + "loss": 0.693, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.0006519248127005994, + "rewards/margins": 0.00021181194460950792, + "rewards/rejected": -0.000863736669998616, + "step": 510 + }, + { + "epoch": 0.08959338387319091, + "grad_norm": 2.0704708099365234, + "learning_rate": 5.970149253731343e-08, + "logits/chosen": -3.056964159011841, + "logits/rejected": -3.0313286781311035, + "logps/chosen": -56.494102478027344, + "logps/rejected": -51.76349639892578, + "loss": 0.6929, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.0008717130986042321, + "rewards/margins": 0.00046385274617932737, + "rewards/rejected": -0.001335565815679729, + "step": 520 + }, + { + "epoch": 0.09131633356305996, + "grad_norm": 2.0647060871124268, + "learning_rate": 6.084959816303099e-08, + "logits/chosen": -3.0535197257995605, + "logits/rejected": -3.011583089828491, + "logps/chosen": -55.7488899230957, + "logps/rejected": -51.35005569458008, + "loss": 0.6926, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0004926534602418542, + "rewards/margins": 0.001148594543337822, + "rewards/rejected": -0.0016412477707490325, + "step": 530 + }, + { + "epoch": 0.09303928325292901, + "grad_norm": 2.2363359928131104, + "learning_rate": 6.199770378874856e-08, + "logits/chosen": -3.039377212524414, + "logits/rejected": -3.022956132888794, + "logps/chosen": -52.94477081298828, + "logps/rejected": -53.070411682128906, + "loss": 0.6928, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.0009251947631128132, + "rewards/margins": 0.0006015413091517985, + "rewards/rejected": -0.0015267360722646117, + "step": 540 + }, + { + "epoch": 0.09476223294279806, + "grad_norm": 2.260488510131836, + "learning_rate": 6.314580941446614e-08, + "logits/chosen": -3.1000843048095703, + "logits/rejected": -3.082019329071045, + "logps/chosen": -53.64750289916992, + "logps/rejected": -52.13825607299805, + "loss": 0.6926, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0007364696939475834, + "rewards/margins": 0.001018259208649397, + "rewards/rejected": -0.0017547288443893194, + "step": 550 + }, + { + "epoch": 0.09648518263266713, + "grad_norm": 2.5739121437072754, + "learning_rate": 6.42939150401837e-08, + "logits/chosen": -3.0712523460388184, + "logits/rejected": -3.063311815261841, + "logps/chosen": -52.2054557800293, + "logps/rejected": -55.1589469909668, + "loss": 0.6927, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0009688477148301899, + "rewards/margins": 0.0009185401722788811, + "rewards/rejected": -0.00188738782890141, + "step": 560 + }, + { + "epoch": 0.09820813232253618, + "grad_norm": 2.2335901260375977, + "learning_rate": 6.544202066590127e-08, + "logits/chosen": -3.04362154006958, + "logits/rejected": -3.034945249557495, + "logps/chosen": -51.44483184814453, + "logps/rejected": -54.00239944458008, + "loss": 0.6927, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0008267102530226111, + "rewards/margins": 0.0009768879972398281, + "rewards/rejected": -0.0018035981338471174, + "step": 570 + }, + { + "epoch": 0.09993108201240523, + "grad_norm": 1.7748967409133911, + "learning_rate": 6.659012629161883e-08, + "logits/chosen": -3.050416946411133, + "logits/rejected": -3.044624090194702, + "logps/chosen": -51.22686767578125, + "logps/rejected": -53.419944763183594, + "loss": 0.6929, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0011879531666636467, + "rewards/margins": 0.0005926621379330754, + "rewards/rejected": -0.001780615421012044, + "step": 580 + }, + { + "epoch": 0.1016540317022743, + "grad_norm": 2.0524795055389404, + "learning_rate": 6.77382319173364e-08, + "logits/chosen": -3.0492122173309326, + "logits/rejected": -3.027358293533325, + "logps/chosen": -54.976097106933594, + "logps/rejected": -54.95697784423828, + "loss": 0.6928, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0009717288194224238, + "rewards/margins": 0.0007361652096733451, + "rewards/rejected": -0.001707894029095769, + "step": 590 + }, + { + "epoch": 0.10337698139214335, + "grad_norm": 2.3293092250823975, + "learning_rate": 6.888633754305396e-08, + "logits/chosen": -3.0223147869110107, + "logits/rejected": -2.9981188774108887, + "logps/chosen": -53.92238235473633, + "logps/rejected": -56.933319091796875, + "loss": 0.692, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.00038632837822660804, + "rewards/margins": 0.0022632996551692486, + "rewards/rejected": -0.0026496287900954485, + "step": 600 + }, + { + "epoch": 0.1050999310820124, + "grad_norm": 2.2301552295684814, + "learning_rate": 7.003444316877152e-08, + "logits/chosen": -2.9881367683410645, + "logits/rejected": -2.985593318939209, + "logps/chosen": -52.6776123046875, + "logps/rejected": -53.41646194458008, + "loss": 0.6929, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.001130065182223916, + "rewards/margins": 0.0004246587341185659, + "rewards/rejected": -0.0015547239454463124, + "step": 610 + }, + { + "epoch": 0.10682288077188146, + "grad_norm": 2.37221097946167, + "learning_rate": 7.11825487944891e-08, + "logits/chosen": -3.146998882293701, + "logits/rejected": -3.120384931564331, + "logps/chosen": -55.118141174316406, + "logps/rejected": -53.546974182128906, + "loss": 0.6922, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0009913553949445486, + "rewards/margins": 0.0018093215767294168, + "rewards/rejected": -0.002800677204504609, + "step": 620 + }, + { + "epoch": 0.10854583046175052, + "grad_norm": 2.475659132003784, + "learning_rate": 7.233065442020666e-08, + "logits/chosen": -3.130199432373047, + "logits/rejected": -3.1033434867858887, + "logps/chosen": -54.01044464111328, + "logps/rejected": -51.10059356689453, + "loss": 0.6926, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0014563563745468855, + "rewards/margins": 0.0010107369162142277, + "rewards/rejected": -0.002467093523591757, + "step": 630 + }, + { + "epoch": 0.11026878015161957, + "grad_norm": 2.473634958267212, + "learning_rate": 7.347876004592423e-08, + "logits/chosen": -3.098565101623535, + "logits/rejected": -3.0876259803771973, + "logps/chosen": -53.07587432861328, + "logps/rejected": -54.5706672668457, + "loss": 0.6928, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.001637846464291215, + "rewards/margins": 0.0008041782421059906, + "rewards/rejected": -0.0024420248810201883, + "step": 640 + }, + { + "epoch": 0.11199172984148863, + "grad_norm": 2.685940980911255, + "learning_rate": 7.462686567164179e-08, + "logits/chosen": -3.1109936237335205, + "logits/rejected": -3.112290859222412, + "logps/chosen": -51.69086456298828, + "logps/rejected": -55.07471466064453, + "loss": 0.693, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0021597391460090876, + "rewards/margins": 0.00037304952275007963, + "rewards/rejected": -0.0025327885523438454, + "step": 650 + }, + { + "epoch": 0.11371467953135768, + "grad_norm": 2.2419979572296143, + "learning_rate": 7.577497129735936e-08, + "logits/chosen": -3.0010995864868164, + "logits/rejected": -2.9949069023132324, + "logps/chosen": -54.76444625854492, + "logps/rejected": -52.52055740356445, + "loss": 0.6925, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0013639924582093954, + "rewards/margins": 0.0012060193112120032, + "rewards/rejected": -0.0025700118858367205, + "step": 660 + }, + { + "epoch": 0.11543762922122675, + "grad_norm": 2.2122037410736084, + "learning_rate": 7.692307692307692e-08, + "logits/chosen": -3.023953914642334, + "logits/rejected": -3.019078493118286, + "logps/chosen": -53.1700553894043, + "logps/rejected": -57.7303466796875, + "loss": 0.6929, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.0016445998335257173, + "rewards/margins": 0.0004528749850578606, + "rewards/rejected": -0.002097474876791239, + "step": 670 + }, + { + "epoch": 0.1171605789110958, + "grad_norm": 2.33010196685791, + "learning_rate": 7.80711825487945e-08, + "logits/chosen": -2.975424289703369, + "logits/rejected": -2.950366497039795, + "logps/chosen": -54.11586380004883, + "logps/rejected": -51.203956604003906, + "loss": 0.6924, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.001826353371143341, + "rewards/margins": 0.001559465192258358, + "rewards/rejected": -0.0033858187962323427, + "step": 680 + }, + { + "epoch": 0.11888352860096485, + "grad_norm": 2.5904860496520996, + "learning_rate": 7.921928817451206e-08, + "logits/chosen": -3.1240360736846924, + "logits/rejected": -3.094136953353882, + "logps/chosen": -59.23029327392578, + "logps/rejected": -50.781585693359375, + "loss": 0.6927, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0014779084594920278, + "rewards/margins": 0.0009200021740980446, + "rewards/rejected": -0.0023979106917977333, + "step": 690 + }, + { + "epoch": 0.1206064782908339, + "grad_norm": 2.203641891479492, + "learning_rate": 8.036739380022962e-08, + "logits/chosen": -3.0816588401794434, + "logits/rejected": -3.053264856338501, + "logps/chosen": -55.8792839050293, + "logps/rejected": -53.30298614501953, + "loss": 0.6923, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0015104495687410235, + "rewards/margins": 0.001711997203528881, + "rewards/rejected": -0.0032224461901932955, + "step": 700 + }, + { + "epoch": 0.12232942798070297, + "grad_norm": 2.2730891704559326, + "learning_rate": 8.151549942594719e-08, + "logits/chosen": -3.060011625289917, + "logits/rejected": -3.031322956085205, + "logps/chosen": -54.78464889526367, + "logps/rejected": -55.0402946472168, + "loss": 0.6921, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0013594934716820717, + "rewards/margins": 0.002062887419015169, + "rewards/rejected": -0.003422380657866597, + "step": 710 + }, + { + "epoch": 0.12405237767057202, + "grad_norm": 2.5072357654571533, + "learning_rate": 8.266360505166475e-08, + "logits/chosen": -3.021775722503662, + "logits/rejected": -3.0175387859344482, + "logps/chosen": -53.49358367919922, + "logps/rejected": -54.68156051635742, + "loss": 0.6926, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0017647970234975219, + "rewards/margins": 0.0010740322759374976, + "rewards/rejected": -0.0028388292994350195, + "step": 720 + }, + { + "epoch": 0.12577532736044109, + "grad_norm": 2.3823907375335693, + "learning_rate": 8.381171067738232e-08, + "logits/chosen": -3.1382100582122803, + "logits/rejected": -3.1131160259246826, + "logps/chosen": -56.51377487182617, + "logps/rejected": -52.933837890625, + "loss": 0.692, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0018151247641071677, + "rewards/margins": 0.002312289085239172, + "rewards/rejected": -0.0041274139657616615, + "step": 730 + }, + { + "epoch": 0.12749827705031014, + "grad_norm": 2.1981279850006104, + "learning_rate": 8.495981630309988e-08, + "logits/chosen": -3.0200366973876953, + "logits/rejected": -2.994292736053467, + "logps/chosen": -55.04505157470703, + "logps/rejected": -54.068115234375, + "loss": 0.692, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0012819484109058976, + "rewards/margins": 0.00240796385332942, + "rewards/rejected": -0.0036899116821587086, + "step": 740 + }, + { + "epoch": 0.1292212267401792, + "grad_norm": 2.430384874343872, + "learning_rate": 8.610792192881746e-08, + "logits/chosen": -3.1935439109802246, + "logits/rejected": -3.1655354499816895, + "logps/chosen": -56.11168670654297, + "logps/rejected": -54.45726776123047, + "loss": 0.6914, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.000991794397123158, + "rewards/margins": 0.003526625456288457, + "rewards/rejected": -0.004518419038504362, + "step": 750 + }, + { + "epoch": 0.13094417643004824, + "grad_norm": 2.590021848678589, + "learning_rate": 8.725602755453502e-08, + "logits/chosen": -3.043562412261963, + "logits/rejected": -3.004751205444336, + "logps/chosen": -54.59693145751953, + "logps/rejected": -49.988407135009766, + "loss": 0.6916, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0016047193203121424, + "rewards/margins": 0.0031021684408187866, + "rewards/rejected": -0.004706887062638998, + "step": 760 + }, + { + "epoch": 0.1326671261199173, + "grad_norm": 2.0914759635925293, + "learning_rate": 8.840413318025258e-08, + "logits/chosen": -3.093982219696045, + "logits/rejected": -3.070664405822754, + "logps/chosen": -53.08367156982422, + "logps/rejected": -52.61655807495117, + "loss": 0.6919, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.001643743016757071, + "rewards/margins": 0.002435127506032586, + "rewards/rejected": -0.004078870639204979, + "step": 770 + }, + { + "epoch": 0.13439007580978635, + "grad_norm": 2.6623942852020264, + "learning_rate": 8.955223880597015e-08, + "logits/chosen": -3.0886335372924805, + "logits/rejected": -3.0576045513153076, + "logps/chosen": -53.36533737182617, + "logps/rejected": -51.709129333496094, + "loss": 0.6916, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.002253578510135412, + "rewards/margins": 0.003126527415588498, + "rewards/rejected": -0.005380106158554554, + "step": 780 + }, + { + "epoch": 0.1361130254996554, + "grad_norm": 2.2624731063842773, + "learning_rate": 9.070034443168771e-08, + "logits/chosen": -3.096177339553833, + "logits/rejected": -3.062889337539673, + "logps/chosen": -54.07451629638672, + "logps/rejected": -54.05188751220703, + "loss": 0.6916, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0016787642380222678, + "rewards/margins": 0.0031538617331534624, + "rewards/rejected": -0.004832625854760408, + "step": 790 + }, + { + "epoch": 0.13783597518952448, + "grad_norm": 2.6842644214630127, + "learning_rate": 9.184845005740528e-08, + "logits/chosen": -2.9823784828186035, + "logits/rejected": -2.956275224685669, + "logps/chosen": -55.31383514404297, + "logps/rejected": -54.95196533203125, + "loss": 0.692, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.003162782173603773, + "rewards/margins": 0.002404420403763652, + "rewards/rejected": -0.005567202344536781, + "step": 800 + }, + { + "epoch": 0.13783597518952448, + "eval_logits/chosen": -3.157433271408081, + "eval_logits/rejected": -3.1517605781555176, + "eval_logps/chosen": -58.558616638183594, + "eval_logps/rejected": -63.0954704284668, + "eval_loss": 0.6928107142448425, + "eval_rewards/accuracies": 0.5525093078613281, + "eval_rewards/chosen": 0.0015327819855883718, + "eval_rewards/margins": 0.0006862352602183819, + "eval_rewards/rejected": 0.0008465467253699899, + "eval_runtime": 382.9449, + "eval_samples_per_second": 11.239, + "eval_steps_per_second": 1.405, + "step": 800 + }, + { + "epoch": 0.13955892487939353, + "grad_norm": 2.204132080078125, + "learning_rate": 9.299655568312284e-08, + "logits/chosen": -3.0566883087158203, + "logits/rejected": -3.0285804271698, + "logps/chosen": -56.864402770996094, + "logps/rejected": -56.03449630737305, + "loss": 0.6919, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.002270194236189127, + "rewards/margins": 0.00244377669878304, + "rewards/rejected": -0.004713970236480236, + "step": 810 + }, + { + "epoch": 0.14128187456926258, + "grad_norm": 2.1584360599517822, + "learning_rate": 9.414466130884042e-08, + "logits/chosen": -3.1056180000305176, + "logits/rejected": -3.0800392627716064, + "logps/chosen": -51.924049377441406, + "logps/rejected": -51.29270553588867, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.003763598622754216, + "rewards/margins": 0.0018941021990031004, + "rewards/rejected": -0.005657701287418604, + "step": 820 + }, + { + "epoch": 0.14300482425913164, + "grad_norm": 2.4463465213775635, + "learning_rate": 9.529276693455798e-08, + "logits/chosen": -3.03351092338562, + "logits/rejected": -3.0181891918182373, + "logps/chosen": -54.88848114013672, + "logps/rejected": -54.726600646972656, + "loss": 0.691, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0030868500471115112, + "rewards/margins": 0.004416943993419409, + "rewards/rejected": -0.007503793574869633, + "step": 830 + }, + { + "epoch": 0.1447277739490007, + "grad_norm": 2.5564217567443848, + "learning_rate": 9.644087256027555e-08, + "logits/chosen": -3.136671781539917, + "logits/rejected": -3.1101253032684326, + "logps/chosen": -54.3847770690918, + "logps/rejected": -50.26315689086914, + "loss": 0.691, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.002898902166634798, + "rewards/margins": 0.0043100654147565365, + "rewards/rejected": -0.007208968047052622, + "step": 840 + }, + { + "epoch": 0.14645072363886974, + "grad_norm": 2.2932486534118652, + "learning_rate": 9.758897818599311e-08, + "logits/chosen": -3.0120320320129395, + "logits/rejected": -3.0013999938964844, + "logps/chosen": -51.23966598510742, + "logps/rejected": -55.53287887573242, + "loss": 0.6923, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.004185699392110109, + "rewards/margins": 0.0017863849643617868, + "rewards/rejected": -0.0059720841236412525, + "step": 850 + }, + { + "epoch": 0.1481736733287388, + "grad_norm": 2.3204004764556885, + "learning_rate": 9.873708381171067e-08, + "logits/chosen": -3.037720203399658, + "logits/rejected": -3.017627239227295, + "logps/chosen": -53.264244079589844, + "logps/rejected": -53.15632247924805, + "loss": 0.691, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0040500895120203495, + "rewards/margins": 0.004448921885341406, + "rewards/rejected": -0.008499011397361755, + "step": 860 + }, + { + "epoch": 0.14989662301860784, + "grad_norm": 1.9834500551223755, + "learning_rate": 9.988518943742824e-08, + "logits/chosen": -3.111067771911621, + "logits/rejected": -3.1072709560394287, + "logps/chosen": -51.75238800048828, + "logps/rejected": -54.507606506347656, + "loss": 0.6912, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.004508176352828741, + "rewards/margins": 0.004009455442428589, + "rewards/rejected": -0.008517631329596043, + "step": 870 + }, + { + "epoch": 0.15161957270847692, + "grad_norm": 1.9233384132385254, + "learning_rate": 1.0103329506314579e-07, + "logits/chosen": -3.024372100830078, + "logits/rejected": -3.0014374256134033, + "logps/chosen": -52.12944412231445, + "logps/rejected": -52.31949996948242, + "loss": 0.6905, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.003995339386165142, + "rewards/margins": 0.005398184061050415, + "rewards/rejected": -0.009393523447215557, + "step": 880 + }, + { + "epoch": 0.15334252239834598, + "grad_norm": 2.241018772125244, + "learning_rate": 1.0218140068886336e-07, + "logits/chosen": -3.0522618293762207, + "logits/rejected": -3.0149013996124268, + "logps/chosen": -58.669822692871094, + "logps/rejected": -54.974082946777344, + "loss": 0.69, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.0018275838810950518, + "rewards/margins": 0.006310337223112583, + "rewards/rejected": -0.008137920871376991, + "step": 890 + }, + { + "epoch": 0.15506547208821503, + "grad_norm": 2.1978955268859863, + "learning_rate": 1.0332950631458092e-07, + "logits/chosen": -3.072181463241577, + "logits/rejected": -3.061849355697632, + "logps/chosen": -54.629608154296875, + "logps/rejected": -53.2902946472168, + "loss": 0.6911, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.004928673151880503, + "rewards/margins": 0.00411958945915103, + "rewards/rejected": -0.009048262611031532, + "step": 900 + }, + { + "epoch": 0.15678842177808408, + "grad_norm": 2.142141342163086, + "learning_rate": 1.044776119402985e-07, + "logits/chosen": -3.0427145957946777, + "logits/rejected": -3.0346322059631348, + "logps/chosen": -51.631103515625, + "logps/rejected": -52.90343475341797, + "loss": 0.692, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.006040601991117001, + "rewards/margins": 0.0024170693941414356, + "rewards/rejected": -0.008457671850919724, + "step": 910 + }, + { + "epoch": 0.15851137146795313, + "grad_norm": 2.318749189376831, + "learning_rate": 1.0562571756601606e-07, + "logits/chosen": -3.08345627784729, + "logits/rejected": -3.040051221847534, + "logps/chosen": -54.85624313354492, + "logps/rejected": -50.60798263549805, + "loss": 0.6901, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.005340777337551117, + "rewards/margins": 0.0062098256312310696, + "rewards/rejected": -0.011550603434443474, + "step": 920 + }, + { + "epoch": 0.16023432115782218, + "grad_norm": 2.554981231689453, + "learning_rate": 1.0677382319173363e-07, + "logits/chosen": -3.1441140174865723, + "logits/rejected": -3.1281702518463135, + "logps/chosen": -53.05458450317383, + "logps/rejected": -55.630455017089844, + "loss": 0.6904, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.005707239266484976, + "rewards/margins": 0.0056891003623604774, + "rewards/rejected": -0.01139634009450674, + "step": 930 + }, + { + "epoch": 0.16195727084769124, + "grad_norm": 2.219067335128784, + "learning_rate": 1.0792192881745119e-07, + "logits/chosen": -3.1239795684814453, + "logits/rejected": -3.0865120887756348, + "logps/chosen": -60.679237365722656, + "logps/rejected": -55.87761688232422, + "loss": 0.6906, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.00473904749378562, + "rewards/margins": 0.005185187328606844, + "rewards/rejected": -0.009924234822392464, + "step": 940 + }, + { + "epoch": 0.16368022053756032, + "grad_norm": 2.2362265586853027, + "learning_rate": 1.0907003444316875e-07, + "logits/chosen": -2.916612148284912, + "logits/rejected": -2.8993961811065674, + "logps/chosen": -56.0440788269043, + "logps/rejected": -56.89391326904297, + "loss": 0.6908, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.007726993411779404, + "rewards/margins": 0.004901893436908722, + "rewards/rejected": -0.012628885917365551, + "step": 950 + }, + { + "epoch": 0.16540317022742937, + "grad_norm": 2.317995071411133, + "learning_rate": 1.1021814006888632e-07, + "logits/chosen": -2.885284662246704, + "logits/rejected": -2.8884658813476562, + "logps/chosen": -51.720481872558594, + "logps/rejected": -56.540191650390625, + "loss": 0.6934, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.009791770949959755, + "rewards/margins": -0.0003909044316969812, + "rewards/rejected": -0.0094008669257164, + "step": 960 + }, + { + "epoch": 0.16712611991729842, + "grad_norm": 2.359971046447754, + "learning_rate": 1.1136624569460388e-07, + "logits/chosen": -3.055280923843384, + "logits/rejected": -3.0196239948272705, + "logps/chosen": -61.49090576171875, + "logps/rejected": -53.53166580200195, + "loss": 0.6919, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.008273814804852009, + "rewards/margins": 0.002594894263893366, + "rewards/rejected": -0.010868709534406662, + "step": 970 + }, + { + "epoch": 0.16884906960716747, + "grad_norm": 3.1766042709350586, + "learning_rate": 1.1251435132032146e-07, + "logits/chosen": -3.1567161083221436, + "logits/rejected": -3.136969804763794, + "logps/chosen": -56.458778381347656, + "logps/rejected": -56.0153694152832, + "loss": 0.6894, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.004100353457033634, + "rewards/margins": 0.007654036395251751, + "rewards/rejected": -0.011754389852285385, + "step": 980 + }, + { + "epoch": 0.17057201929703653, + "grad_norm": 2.504091262817383, + "learning_rate": 1.1366245694603902e-07, + "logits/chosen": -3.0233750343322754, + "logits/rejected": -3.000009298324585, + "logps/chosen": -55.2623291015625, + "logps/rejected": -53.919044494628906, + "loss": 0.6902, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.006930728908628225, + "rewards/margins": 0.0060745240189135075, + "rewards/rejected": -0.013005253858864307, + "step": 990 + }, + { + "epoch": 0.17229496898690558, + "grad_norm": 2.235285758972168, + "learning_rate": 1.1481056257175659e-07, + "logits/chosen": -2.9751508235931396, + "logits/rejected": -2.9474892616271973, + "logps/chosen": -57.668296813964844, + "logps/rejected": -52.80499267578125, + "loss": 0.6919, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.008000878617167473, + "rewards/margins": 0.0026205407921224833, + "rewards/rejected": -0.010621419176459312, + "step": 1000 + }, + { + "epoch": 0.17401791867677463, + "grad_norm": 2.3575587272644043, + "learning_rate": 1.1595866819747415e-07, + "logits/chosen": -2.9106006622314453, + "logits/rejected": -2.9210867881774902, + "logps/chosen": -54.70320510864258, + "logps/rejected": -59.270790100097656, + "loss": 0.691, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.009453857317566872, + "rewards/margins": 0.0043923440389335155, + "rewards/rejected": -0.0138462008908391, + "step": 1010 + }, + { + "epoch": 0.17574086836664368, + "grad_norm": 2.255159378051758, + "learning_rate": 1.1710677382319172e-07, + "logits/chosen": -3.0922813415527344, + "logits/rejected": -3.0569264888763428, + "logps/chosen": -57.53850173950195, + "logps/rejected": -55.62078094482422, + "loss": 0.6885, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.004358768928796053, + "rewards/margins": 0.009538348764181137, + "rewards/rejected": -0.013897115364670753, + "step": 1020 + }, + { + "epoch": 0.17746381805651276, + "grad_norm": 2.396897554397583, + "learning_rate": 1.1825487944890928e-07, + "logits/chosen": -3.140779495239258, + "logits/rejected": -3.1136345863342285, + "logps/chosen": -54.39406204223633, + "logps/rejected": -53.09972381591797, + "loss": 0.6907, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.006752696819603443, + "rewards/margins": 0.00511940149590373, + "rewards/rejected": -0.011872097849845886, + "step": 1030 + }, + { + "epoch": 0.17918676774638181, + "grad_norm": 2.4822428226470947, + "learning_rate": 1.1940298507462686e-07, + "logits/chosen": -3.056056499481201, + "logits/rejected": -3.0457711219787598, + "logps/chosen": -54.37267303466797, + "logps/rejected": -56.8503532409668, + "loss": 0.6915, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.00974307581782341, + "rewards/margins": 0.003385114250704646, + "rewards/rejected": -0.013128191232681274, + "step": 1040 + }, + { + "epoch": 0.18090971743625087, + "grad_norm": 2.523308515548706, + "learning_rate": 1.205510907003444e-07, + "logits/chosen": -2.9768359661102295, + "logits/rejected": -2.941450595855713, + "logps/chosen": -57.94348907470703, + "logps/rejected": -52.01409912109375, + "loss": 0.6901, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008004303090274334, + "rewards/margins": 0.0064055053517222404, + "rewards/rejected": -0.014409807510674, + "step": 1050 + }, + { + "epoch": 0.18263266712611992, + "grad_norm": 2.466860771179199, + "learning_rate": 1.2169919632606198e-07, + "logits/chosen": -2.985286235809326, + "logits/rejected": -2.966055154800415, + "logps/chosen": -57.37578201293945, + "logps/rejected": -56.16017532348633, + "loss": 0.6917, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.008745206519961357, + "rewards/margins": 0.003092522034421563, + "rewards/rejected": -0.011837727390229702, + "step": 1060 + }, + { + "epoch": 0.18435561681598897, + "grad_norm": 2.3267135620117188, + "learning_rate": 1.2284730195177955e-07, + "logits/chosen": -3.128039598464966, + "logits/rejected": -3.0941824913024902, + "logps/chosen": -56.73784255981445, + "logps/rejected": -54.831199645996094, + "loss": 0.6894, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.008065931499004364, + "rewards/margins": 0.007762957364320755, + "rewards/rejected": -0.01582888886332512, + "step": 1070 + }, + { + "epoch": 0.18607856650585802, + "grad_norm": 2.292827606201172, + "learning_rate": 1.2399540757749712e-07, + "logits/chosen": -3.105078935623169, + "logits/rejected": -3.0817818641662598, + "logps/chosen": -56.80012130737305, + "logps/rejected": -52.38414764404297, + "loss": 0.69, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.010143356397747993, + "rewards/margins": 0.006392681505531073, + "rewards/rejected": -0.016536036506295204, + "step": 1080 + }, + { + "epoch": 0.18780151619572708, + "grad_norm": 2.412367343902588, + "learning_rate": 1.251435132032147e-07, + "logits/chosen": -2.9820761680603027, + "logits/rejected": -2.9737863540649414, + "logps/chosen": -53.27097702026367, + "logps/rejected": -54.16463088989258, + "loss": 0.6908, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.010393026284873486, + "rewards/margins": 0.004915698431432247, + "rewards/rejected": -0.015308722853660583, + "step": 1090 + }, + { + "epoch": 0.18952446588559613, + "grad_norm": 2.1496472358703613, + "learning_rate": 1.2629161882893227e-07, + "logits/chosen": -3.0467867851257324, + "logits/rejected": -3.0438222885131836, + "logps/chosen": -51.78764724731445, + "logps/rejected": -56.13716506958008, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00880212802439928, + "rewards/margins": 0.006492135114967823, + "rewards/rejected": -0.015294264070689678, + "step": 1100 + }, + { + "epoch": 0.1912474155754652, + "grad_norm": 2.670915126800537, + "learning_rate": 1.2743972445464984e-07, + "logits/chosen": -3.0431346893310547, + "logits/rejected": -3.055330753326416, + "logps/chosen": -54.185951232910156, + "logps/rejected": -58.2843132019043, + "loss": 0.6906, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.012901161797344685, + "rewards/margins": 0.005421520210802555, + "rewards/rejected": -0.01832268200814724, + "step": 1110 + }, + { + "epoch": 0.19297036526533426, + "grad_norm": 2.4121639728546143, + "learning_rate": 1.285878300803674e-07, + "logits/chosen": -3.0725626945495605, + "logits/rejected": -3.0507113933563232, + "logps/chosen": -57.42089080810547, + "logps/rejected": -55.515769958496094, + "loss": 0.6897, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.010498611256480217, + "rewards/margins": 0.007218754850327969, + "rewards/rejected": -0.01771736517548561, + "step": 1120 + }, + { + "epoch": 0.1946933149552033, + "grad_norm": 2.235563039779663, + "learning_rate": 1.2973593570608496e-07, + "logits/chosen": -3.1377358436584473, + "logits/rejected": -3.1132779121398926, + "logps/chosen": -53.177711486816406, + "logps/rejected": -56.41312789916992, + "loss": 0.6879, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.011453317478299141, + "rewards/margins": 0.010754810646176338, + "rewards/rejected": -0.02220812812447548, + "step": 1130 + }, + { + "epoch": 0.19641626464507236, + "grad_norm": 2.4700300693511963, + "learning_rate": 1.3088404133180254e-07, + "logits/chosen": -3.044579029083252, + "logits/rejected": -3.011229991912842, + "logps/chosen": -57.91096878051758, + "logps/rejected": -54.86474609375, + "loss": 0.6891, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.010296146385371685, + "rewards/margins": 0.008502911776304245, + "rewards/rejected": -0.018799057230353355, + "step": 1140 + }, + { + "epoch": 0.19813921433494142, + "grad_norm": 2.472872734069824, + "learning_rate": 1.3203214695752008e-07, + "logits/chosen": -2.9843618869781494, + "logits/rejected": -2.9652435779571533, + "logps/chosen": -54.42316818237305, + "logps/rejected": -56.3864631652832, + "loss": 0.6891, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.01066792942583561, + "rewards/margins": 0.008340856991708279, + "rewards/rejected": -0.019008787348866463, + "step": 1150 + }, + { + "epoch": 0.19986216402481047, + "grad_norm": 2.2732763290405273, + "learning_rate": 1.3318025258323766e-07, + "logits/chosen": -3.079559087753296, + "logits/rejected": -3.070949077606201, + "logps/chosen": -54.765602111816406, + "logps/rejected": -56.289215087890625, + "loss": 0.6883, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.007399639580398798, + "rewards/margins": 0.010012407787144184, + "rewards/rejected": -0.01741204783320427, + "step": 1160 + }, + { + "epoch": 0.20158511371467952, + "grad_norm": 2.170346736907959, + "learning_rate": 1.3432835820895523e-07, + "logits/chosen": -2.9518837928771973, + "logits/rejected": -2.939192056655884, + "logps/chosen": -53.35938262939453, + "logps/rejected": -56.720787048339844, + "loss": 0.6895, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.012800860218703747, + "rewards/margins": 0.007673123385757208, + "rewards/rejected": -0.020473983138799667, + "step": 1170 + }, + { + "epoch": 0.2033080634045486, + "grad_norm": 2.554912567138672, + "learning_rate": 1.354764638346728e-07, + "logits/chosen": -2.9232382774353027, + "logits/rejected": -2.8921735286712646, + "logps/chosen": -54.116477966308594, + "logps/rejected": -53.69866943359375, + "loss": 0.6864, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.013146810233592987, + "rewards/margins": 0.013819403946399689, + "rewards/rejected": -0.026966210454702377, + "step": 1180 + }, + { + "epoch": 0.20503101309441765, + "grad_norm": 2.5223007202148438, + "learning_rate": 1.3662456946039035e-07, + "logits/chosen": -3.118698835372925, + "logits/rejected": -3.0836944580078125, + "logps/chosen": -60.1918830871582, + "logps/rejected": -52.9239616394043, + "loss": 0.6882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.014924025163054466, + "rewards/margins": 0.010316677391529083, + "rewards/rejected": -0.0252407006919384, + "step": 1190 + }, + { + "epoch": 0.2067539627842867, + "grad_norm": 2.1731607913970947, + "learning_rate": 1.3777267508610792e-07, + "logits/chosen": -2.9499802589416504, + "logits/rejected": -2.9351534843444824, + "logps/chosen": -54.7470588684082, + "logps/rejected": -53.473411560058594, + "loss": 0.6902, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.015825878828763962, + "rewards/margins": 0.006190788466483355, + "rewards/rejected": -0.02201666869223118, + "step": 1200 + }, + { + "epoch": 0.2067539627842867, + "eval_logits/chosen": -3.1337568759918213, + "eval_logits/rejected": -3.128148317337036, + "eval_logps/chosen": -58.61870193481445, + "eval_logps/rejected": -63.452667236328125, + "eval_loss": 0.691386878490448, + "eval_rewards/accuracies": 0.5875929594039917, + "eval_rewards/chosen": 0.0009319494129158556, + "eval_rewards/margins": 0.003657379886135459, + "eval_rewards/rejected": -0.002725430764257908, + "eval_runtime": 382.7439, + "eval_samples_per_second": 11.245, + "eval_steps_per_second": 1.406, + "step": 1200 + }, + { + "epoch": 0.20847691247415576, + "grad_norm": 2.374234676361084, + "learning_rate": 1.389207807118255e-07, + "logits/chosen": -3.0486347675323486, + "logits/rejected": -3.0146355628967285, + "logps/chosen": -55.02860641479492, + "logps/rejected": -55.492454528808594, + "loss": 0.6872, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.012403490021824837, + "rewards/margins": 0.012364232912659645, + "rewards/rejected": -0.024767722934484482, + "step": 1210 + }, + { + "epoch": 0.2101998621640248, + "grad_norm": 2.142857551574707, + "learning_rate": 1.4006888633754304e-07, + "logits/chosen": -3.058962345123291, + "logits/rejected": -3.0320382118225098, + "logps/chosen": -54.947837829589844, + "logps/rejected": -55.0828857421875, + "loss": 0.6872, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.01566535234451294, + "rewards/margins": 0.012328686192631721, + "rewards/rejected": -0.02799403667449951, + "step": 1220 + }, + { + "epoch": 0.21192281185389386, + "grad_norm": 2.823793888092041, + "learning_rate": 1.4121699196326062e-07, + "logits/chosen": -3.1282544136047363, + "logits/rejected": -3.086381196975708, + "logps/chosen": -57.08588790893555, + "logps/rejected": -55.1170654296875, + "loss": 0.6856, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.01335026603192091, + "rewards/margins": 0.01557250041514635, + "rewards/rejected": -0.02892276644706726, + "step": 1230 + }, + { + "epoch": 0.2136457615437629, + "grad_norm": 2.2466061115264893, + "learning_rate": 1.423650975889782e-07, + "logits/chosen": -2.9896018505096436, + "logits/rejected": -2.972954273223877, + "logps/chosen": -54.30778121948242, + "logps/rejected": -55.87935256958008, + "loss": 0.6874, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.018706928938627243, + "rewards/margins": 0.011885268613696098, + "rewards/rejected": -0.03059219755232334, + "step": 1240 + }, + { + "epoch": 0.21536871123363197, + "grad_norm": 2.7906625270843506, + "learning_rate": 1.4351320321469576e-07, + "logits/chosen": -3.1079976558685303, + "logits/rejected": -3.0718159675598145, + "logps/chosen": -55.47816848754883, + "logps/rejected": -54.13701248168945, + "loss": 0.6875, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.017741341143846512, + "rewards/margins": 0.01175488717854023, + "rewards/rejected": -0.029496226459741592, + "step": 1250 + }, + { + "epoch": 0.21709166092350105, + "grad_norm": 2.181622266769409, + "learning_rate": 1.446613088404133e-07, + "logits/chosen": -2.956184148788452, + "logits/rejected": -2.9249396324157715, + "logps/chosen": -54.747764587402344, + "logps/rejected": -53.68642044067383, + "loss": 0.6853, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.01772422529757023, + "rewards/margins": 0.016296565532684326, + "rewards/rejected": -0.034020788967609406, + "step": 1260 + }, + { + "epoch": 0.2188146106133701, + "grad_norm": 2.5509865283966064, + "learning_rate": 1.4580941446613089e-07, + "logits/chosen": -3.0164220333099365, + "logits/rejected": -3.0100362300872803, + "logps/chosen": -53.903045654296875, + "logps/rejected": -57.88615798950195, + "loss": 0.6877, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.020272260531783104, + "rewards/margins": 0.011540655978024006, + "rewards/rejected": -0.031812917441129684, + "step": 1270 + }, + { + "epoch": 0.22053756030323915, + "grad_norm": 2.4054272174835205, + "learning_rate": 1.4695752009184846e-07, + "logits/chosen": -3.0869784355163574, + "logits/rejected": -3.093924045562744, + "logps/chosen": -53.26460647583008, + "logps/rejected": -62.8282470703125, + "loss": 0.6893, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.020930523052811623, + "rewards/margins": 0.008218497969210148, + "rewards/rejected": -0.029149020090699196, + "step": 1280 + }, + { + "epoch": 0.2222605099931082, + "grad_norm": 2.3104968070983887, + "learning_rate": 1.4810562571756603e-07, + "logits/chosen": -2.986077070236206, + "logits/rejected": -2.95453143119812, + "logps/chosen": -56.5821647644043, + "logps/rejected": -53.85289764404297, + "loss": 0.6862, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.019812434911727905, + "rewards/margins": 0.014519277028739452, + "rewards/rejected": -0.03433171287178993, + "step": 1290 + }, + { + "epoch": 0.22398345968297725, + "grad_norm": 2.2120158672332764, + "learning_rate": 1.4925373134328358e-07, + "logits/chosen": -3.0651228427886963, + "logits/rejected": -3.052384614944458, + "logps/chosen": -53.11201095581055, + "logps/rejected": -57.268531799316406, + "loss": 0.6902, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.028970792889595032, + "rewards/margins": 0.006490548141300678, + "rewards/rejected": -0.03546133637428284, + "step": 1300 + }, + { + "epoch": 0.2257064093728463, + "grad_norm": 2.054224729537964, + "learning_rate": 1.5040183696900115e-07, + "logits/chosen": -3.064964771270752, + "logits/rejected": -3.0295050144195557, + "logps/chosen": -55.623008728027344, + "logps/rejected": -53.59694290161133, + "loss": 0.6841, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.022824782878160477, + "rewards/margins": 0.018944334238767624, + "rewards/rejected": -0.0417691171169281, + "step": 1310 + }, + { + "epoch": 0.22742935906271536, + "grad_norm": 2.443293809890747, + "learning_rate": 1.5154994259471873e-07, + "logits/chosen": -3.037405490875244, + "logits/rejected": -3.011997699737549, + "logps/chosen": -57.403045654296875, + "logps/rejected": -54.382476806640625, + "loss": 0.6852, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.019832521677017212, + "rewards/margins": 0.016630280762910843, + "rewards/rejected": -0.036462802439928055, + "step": 1320 + }, + { + "epoch": 0.22915230875258444, + "grad_norm": 2.500528335571289, + "learning_rate": 1.5269804822043627e-07, + "logits/chosen": -3.1157054901123047, + "logits/rejected": -3.085151195526123, + "logps/chosen": -57.14472579956055, + "logps/rejected": -53.741905212402344, + "loss": 0.6851, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.019271235913038254, + "rewards/margins": 0.01675734482705593, + "rewards/rejected": -0.036028582602739334, + "step": 1330 + }, + { + "epoch": 0.2308752584424535, + "grad_norm": 2.3414456844329834, + "learning_rate": 1.5384615384615385e-07, + "logits/chosen": -2.956932306289673, + "logits/rejected": -2.9354348182678223, + "logps/chosen": -57.207542419433594, + "logps/rejected": -56.502777099609375, + "loss": 0.6857, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.020681411027908325, + "rewards/margins": 0.01566999778151512, + "rewards/rejected": -0.03635140508413315, + "step": 1340 + }, + { + "epoch": 0.23259820813232254, + "grad_norm": 2.3005292415618896, + "learning_rate": 1.5499425947187142e-07, + "logits/chosen": -3.0213208198547363, + "logits/rejected": -2.996798038482666, + "logps/chosen": -56.907737731933594, + "logps/rejected": -57.270530700683594, + "loss": 0.6874, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0280645452439785, + "rewards/margins": 0.012301048263907433, + "rewards/rejected": -0.04036559537053108, + "step": 1350 + }, + { + "epoch": 0.2343211578221916, + "grad_norm": 2.42112135887146, + "learning_rate": 1.56142365097589e-07, + "logits/chosen": -2.978275775909424, + "logits/rejected": -2.965906858444214, + "logps/chosen": -58.08185958862305, + "logps/rejected": -59.959800720214844, + "loss": 0.6883, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.028793543577194214, + "rewards/margins": 0.010555420070886612, + "rewards/rejected": -0.039348963648080826, + "step": 1360 + }, + { + "epoch": 0.23604410751206065, + "grad_norm": 2.564072370529175, + "learning_rate": 1.5729047072330654e-07, + "logits/chosen": -3.0901427268981934, + "logits/rejected": -3.0705947875976562, + "logps/chosen": -55.34904861450195, + "logps/rejected": -57.388999938964844, + "loss": 0.6839, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.026742001995444298, + "rewards/margins": 0.01927506923675537, + "rewards/rejected": -0.04601707309484482, + "step": 1370 + }, + { + "epoch": 0.2377670572019297, + "grad_norm": 2.3225302696228027, + "learning_rate": 1.584385763490241e-07, + "logits/chosen": -2.9823920726776123, + "logits/rejected": -2.9642717838287354, + "logps/chosen": -56.900108337402344, + "logps/rejected": -56.28680419921875, + "loss": 0.6865, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02567340061068535, + "rewards/margins": 0.014020757749676704, + "rewards/rejected": -0.0396941602230072, + "step": 1380 + }, + { + "epoch": 0.23949000689179875, + "grad_norm": 2.4447555541992188, + "learning_rate": 1.5958668197474169e-07, + "logits/chosen": -3.1299426555633545, + "logits/rejected": -3.102720022201538, + "logps/chosen": -59.336936950683594, + "logps/rejected": -57.85896682739258, + "loss": 0.6838, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02443106099963188, + "rewards/margins": 0.01959383673965931, + "rewards/rejected": -0.04402489587664604, + "step": 1390 + }, + { + "epoch": 0.2412129565816678, + "grad_norm": 2.328646659851074, + "learning_rate": 1.6073478760045923e-07, + "logits/chosen": -2.9504268169403076, + "logits/rejected": -2.924954891204834, + "logps/chosen": -58.6168098449707, + "logps/rejected": -57.66655731201172, + "loss": 0.6834, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.02808014489710331, + "rewards/margins": 0.02053389698266983, + "rewards/rejected": -0.04861404374241829, + "step": 1400 + }, + { + "epoch": 0.24293590627153688, + "grad_norm": 2.178410053253174, + "learning_rate": 1.618828932261768e-07, + "logits/chosen": -3.0127053260803223, + "logits/rejected": -3.0005548000335693, + "logps/chosen": -57.48663330078125, + "logps/rejected": -58.0041389465332, + "loss": 0.6851, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.031129727140069008, + "rewards/margins": 0.01690850965678692, + "rewards/rejected": -0.048038236796855927, + "step": 1410 + }, + { + "epoch": 0.24465885596140594, + "grad_norm": 2.2679238319396973, + "learning_rate": 1.6303099885189438e-07, + "logits/chosen": -2.9296984672546387, + "logits/rejected": -2.9353113174438477, + "logps/chosen": -54.37672805786133, + "logps/rejected": -59.217491149902344, + "loss": 0.6891, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03622571751475334, + "rewards/margins": 0.009116034023463726, + "rewards/rejected": -0.045341745018959045, + "step": 1420 + }, + { + "epoch": 0.246381805651275, + "grad_norm": 2.6715142726898193, + "learning_rate": 1.6417910447761195e-07, + "logits/chosen": -3.0467171669006348, + "logits/rejected": -3.0351033210754395, + "logps/chosen": -56.65338134765625, + "logps/rejected": -61.58381271362305, + "loss": 0.6835, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.030687406659126282, + "rewards/margins": 0.020251978188753128, + "rewards/rejected": -0.05093938857316971, + "step": 1430 + }, + { + "epoch": 0.24810475534114404, + "grad_norm": 2.2624526023864746, + "learning_rate": 1.653272101033295e-07, + "logits/chosen": -2.9727156162261963, + "logits/rejected": -2.9416565895080566, + "logps/chosen": -54.01691818237305, + "logps/rejected": -53.66893768310547, + "loss": 0.6849, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03554430976510048, + "rewards/margins": 0.01750084012746811, + "rewards/rejected": -0.05304514244198799, + "step": 1440 + }, + { + "epoch": 0.2498277050310131, + "grad_norm": 3.1223669052124023, + "learning_rate": 1.6647531572904707e-07, + "logits/chosen": -2.984633684158325, + "logits/rejected": -2.9475743770599365, + "logps/chosen": -58.33637237548828, + "logps/rejected": -57.2099609375, + "loss": 0.6816, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.028612574562430382, + "rewards/margins": 0.024286650121212006, + "rewards/rejected": -0.05289921909570694, + "step": 1450 + }, + { + "epoch": 0.25155065472088217, + "grad_norm": 2.475480318069458, + "learning_rate": 1.6762342135476465e-07, + "logits/chosen": -2.9874236583709717, + "logits/rejected": -2.955101490020752, + "logps/chosen": -55.609344482421875, + "logps/rejected": -56.4217643737793, + "loss": 0.6836, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.03909724950790405, + "rewards/margins": 0.02046862244606018, + "rewards/rejected": -0.05956587195396423, + "step": 1460 + }, + { + "epoch": 0.2532736044107512, + "grad_norm": 2.6421852111816406, + "learning_rate": 1.687715269804822e-07, + "logits/chosen": -3.1511387825012207, + "logits/rejected": -3.115359306335449, + "logps/chosen": -60.091941833496094, + "logps/rejected": -60.51912307739258, + "loss": 0.6799, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.03068288043141365, + "rewards/margins": 0.02774442359805107, + "rewards/rejected": -0.05842730402946472, + "step": 1470 + }, + { + "epoch": 0.2549965541006203, + "grad_norm": 2.4771311283111572, + "learning_rate": 1.6991963260619977e-07, + "logits/chosen": -3.1178431510925293, + "logits/rejected": -3.082062244415283, + "logps/chosen": -55.12421417236328, + "logps/rejected": -57.14054489135742, + "loss": 0.6858, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.03813482075929642, + "rewards/margins": 0.015866823494434357, + "rewards/rejected": -0.054001640528440475, + "step": 1480 + }, + { + "epoch": 0.2567195037904893, + "grad_norm": 2.576782703399658, + "learning_rate": 1.7106773823191734e-07, + "logits/chosen": -3.0479483604431152, + "logits/rejected": -3.023475408554077, + "logps/chosen": -61.34899139404297, + "logps/rejected": -59.27949142456055, + "loss": 0.6854, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03965304046869278, + "rewards/margins": 0.01644364930689335, + "rewards/rejected": -0.05609668418765068, + "step": 1490 + }, + { + "epoch": 0.2584424534803584, + "grad_norm": 2.63468337059021, + "learning_rate": 1.722158438576349e-07, + "logits/chosen": -2.957155466079712, + "logits/rejected": -2.954725980758667, + "logps/chosen": -54.37346649169922, + "logps/rejected": -58.872047424316406, + "loss": 0.6848, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04384751245379448, + "rewards/margins": 0.018207985907793045, + "rewards/rejected": -0.06205549091100693, + "step": 1500 + }, + { + "epoch": 0.2601654031702274, + "grad_norm": 2.59952712059021, + "learning_rate": 1.7336394948335246e-07, + "logits/chosen": -2.9072952270507812, + "logits/rejected": -2.872072696685791, + "logps/chosen": -62.11091232299805, + "logps/rejected": -59.270965576171875, + "loss": 0.6792, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03504981845617294, + "rewards/margins": 0.029394099488854408, + "rewards/rejected": -0.0644439235329628, + "step": 1510 + }, + { + "epoch": 0.2618883528600965, + "grad_norm": 2.8741984367370605, + "learning_rate": 1.7451205510907003e-07, + "logits/chosen": -3.0184130668640137, + "logits/rejected": -3.010338544845581, + "logps/chosen": -56.67668914794922, + "logps/rejected": -56.772361755371094, + "loss": 0.6895, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0442604199051857, + "rewards/margins": 0.008484233170747757, + "rewards/rejected": -0.05274464935064316, + "step": 1520 + }, + { + "epoch": 0.26361130254996556, + "grad_norm": 2.7853004932403564, + "learning_rate": 1.756601607347876e-07, + "logits/chosen": -2.9038796424865723, + "logits/rejected": -2.8876137733459473, + "logps/chosen": -56.257667541503906, + "logps/rejected": -56.9356803894043, + "loss": 0.6858, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.04569043964147568, + "rewards/margins": 0.01612422987818718, + "rewards/rejected": -0.06181466579437256, + "step": 1530 + }, + { + "epoch": 0.2653342522398346, + "grad_norm": 2.535432815551758, + "learning_rate": 1.7680826636050515e-07, + "logits/chosen": -3.03082013130188, + "logits/rejected": -2.999049663543701, + "logps/chosen": -60.42095184326172, + "logps/rejected": -58.095306396484375, + "loss": 0.6811, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.045154839754104614, + "rewards/margins": 0.02576422318816185, + "rewards/rejected": -0.07091905921697617, + "step": 1540 + }, + { + "epoch": 0.26705720192970367, + "grad_norm": 2.455566883087158, + "learning_rate": 1.7795637198622273e-07, + "logits/chosen": -2.9487860202789307, + "logits/rejected": -2.929208278656006, + "logps/chosen": -57.921913146972656, + "logps/rejected": -57.43787384033203, + "loss": 0.6821, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.045525889843702316, + "rewards/margins": 0.023446694016456604, + "rewards/rejected": -0.06897258758544922, + "step": 1550 + }, + { + "epoch": 0.2687801516195727, + "grad_norm": 2.513192653656006, + "learning_rate": 1.791044776119403e-07, + "logits/chosen": -2.960799217224121, + "logits/rejected": -2.9433281421661377, + "logps/chosen": -55.99836349487305, + "logps/rejected": -57.2844123840332, + "loss": 0.6864, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.051505934447050095, + "rewards/margins": 0.014735942706465721, + "rewards/rejected": -0.06624187529087067, + "step": 1560 + }, + { + "epoch": 0.2705031013094418, + "grad_norm": 2.5723841190338135, + "learning_rate": 1.8025258323765787e-07, + "logits/chosen": -3.010369300842285, + "logits/rejected": -3.0123367309570312, + "logps/chosen": -56.7002067565918, + "logps/rejected": -62.82074737548828, + "loss": 0.6847, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.04784129932522774, + "rewards/margins": 0.01806465908885002, + "rewards/rejected": -0.06590595096349716, + "step": 1570 + }, + { + "epoch": 0.2722260509993108, + "grad_norm": 2.942056655883789, + "learning_rate": 1.8140068886337542e-07, + "logits/chosen": -2.943998098373413, + "logits/rejected": -2.93428373336792, + "logps/chosen": -57.99327850341797, + "logps/rejected": -60.913169860839844, + "loss": 0.6841, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.050780169665813446, + "rewards/margins": 0.019445115700364113, + "rewards/rejected": -0.07022528350353241, + "step": 1580 + }, + { + "epoch": 0.2739490006891799, + "grad_norm": 2.7961862087249756, + "learning_rate": 1.82548794489093e-07, + "logits/chosen": -2.9881751537323, + "logits/rejected": -2.9737207889556885, + "logps/chosen": -58.699737548828125, + "logps/rejected": -61.12406539916992, + "loss": 0.6842, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04569312185049057, + "rewards/margins": 0.019125862047076225, + "rewards/rejected": -0.06481898576021194, + "step": 1590 + }, + { + "epoch": 0.27567195037904896, + "grad_norm": 2.5653514862060547, + "learning_rate": 1.8369690011481057e-07, + "logits/chosen": -2.9790871143341064, + "logits/rejected": -2.941410541534424, + "logps/chosen": -57.3028450012207, + "logps/rejected": -58.16033935546875, + "loss": 0.6835, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.051235903054475784, + "rewards/margins": 0.020867522805929184, + "rewards/rejected": -0.07210342586040497, + "step": 1600 + }, + { + "epoch": 0.27567195037904896, + "eval_logits/chosen": -3.089492082595825, + "eval_logits/rejected": -3.08384108543396, + "eval_logps/chosen": -60.95978927612305, + "eval_logps/rejected": -66.38330078125, + "eval_loss": 0.688758134841919, + "eval_rewards/accuracies": 0.5864312052726746, + "eval_rewards/chosen": -0.02247888222336769, + "eval_rewards/margins": 0.009552864357829094, + "eval_rewards/rejected": -0.032031744718551636, + "eval_runtime": 382.6729, + "eval_samples_per_second": 11.247, + "eval_steps_per_second": 1.406, + "step": 1600 + }, + { + "epoch": 0.277394900068918, + "grad_norm": 2.465613842010498, + "learning_rate": 1.848450057405281e-07, + "logits/chosen": -3.018049716949463, + "logits/rejected": -2.9930763244628906, + "logps/chosen": -61.77216720581055, + "logps/rejected": -65.25670623779297, + "loss": 0.6856, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.05878068879246712, + "rewards/margins": 0.016800960525870323, + "rewards/rejected": -0.07558164745569229, + "step": 1610 + }, + { + "epoch": 0.27911784975878706, + "grad_norm": 2.5315089225769043, + "learning_rate": 1.8599311136624569e-07, + "logits/chosen": -2.940201997756958, + "logits/rejected": -2.91823410987854, + "logps/chosen": -57.43720626831055, + "logps/rejected": -62.375892639160156, + "loss": 0.6826, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.059971462935209274, + "rewards/margins": 0.02313486859202385, + "rewards/rejected": -0.08310633152723312, + "step": 1620 + }, + { + "epoch": 0.2808407994486561, + "grad_norm": 2.86327862739563, + "learning_rate": 1.8714121699196326e-07, + "logits/chosen": -3.0095534324645996, + "logits/rejected": -2.9834182262420654, + "logps/chosen": -60.75006866455078, + "logps/rejected": -59.5101203918457, + "loss": 0.6813, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.052403099834918976, + "rewards/margins": 0.025434961542487144, + "rewards/rejected": -0.07783806324005127, + "step": 1630 + }, + { + "epoch": 0.28256374913852517, + "grad_norm": 2.7473204135894775, + "learning_rate": 1.8828932261768083e-07, + "logits/chosen": -3.0492942333221436, + "logits/rejected": -3.0127370357513428, + "logps/chosen": -62.9014892578125, + "logps/rejected": -59.5329475402832, + "loss": 0.6832, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.05947121977806091, + "rewards/margins": 0.02242801897227764, + "rewards/rejected": -0.0818992406129837, + "step": 1640 + }, + { + "epoch": 0.2842866988283942, + "grad_norm": 2.928473949432373, + "learning_rate": 1.8943742824339838e-07, + "logits/chosen": -3.0190649032592773, + "logits/rejected": -2.991811990737915, + "logps/chosen": -62.157310485839844, + "logps/rejected": -62.88653564453125, + "loss": 0.6791, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.049208804965019226, + "rewards/margins": 0.029801160097122192, + "rewards/rejected": -0.07900996506214142, + "step": 1650 + }, + { + "epoch": 0.28600964851826327, + "grad_norm": 2.6895644664764404, + "learning_rate": 1.9058553386911595e-07, + "logits/chosen": -3.053314208984375, + "logits/rejected": -3.007117986679077, + "logps/chosen": -60.38581466674805, + "logps/rejected": -58.10612869262695, + "loss": 0.6793, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.05414261296391487, + "rewards/margins": 0.02976139448583126, + "rewards/rejected": -0.08390400558710098, + "step": 1660 + }, + { + "epoch": 0.2877325982081323, + "grad_norm": 2.900505304336548, + "learning_rate": 1.9173363949483353e-07, + "logits/chosen": -3.0389440059661865, + "logits/rejected": -3.016704559326172, + "logps/chosen": -59.47711944580078, + "logps/rejected": -63.44648361206055, + "loss": 0.6775, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.049625031650066376, + "rewards/margins": 0.032785750925540924, + "rewards/rejected": -0.0824107900261879, + "step": 1670 + }, + { + "epoch": 0.2894555478980014, + "grad_norm": 2.6400303840637207, + "learning_rate": 1.928817451205511e-07, + "logits/chosen": -2.993952512741089, + "logits/rejected": -2.9681200981140137, + "logps/chosen": -61.86346435546875, + "logps/rejected": -63.476051330566406, + "loss": 0.6801, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.06255128979682922, + "rewards/margins": 0.028126254677772522, + "rewards/rejected": -0.09067754447460175, + "step": 1680 + }, + { + "epoch": 0.29117849758787046, + "grad_norm": 2.792930841445923, + "learning_rate": 1.9402985074626865e-07, + "logits/chosen": -2.9742238521575928, + "logits/rejected": -2.958693504333496, + "logps/chosen": -61.657623291015625, + "logps/rejected": -59.42774200439453, + "loss": 0.6887, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.061123304069042206, + "rewards/margins": 0.010420399717986584, + "rewards/rejected": -0.07154370844364166, + "step": 1690 + }, + { + "epoch": 0.2929014472777395, + "grad_norm": 2.930274248123169, + "learning_rate": 1.9517795637198622e-07, + "logits/chosen": -2.8819127082824707, + "logits/rejected": -2.8864543437957764, + "logps/chosen": -56.14990234375, + "logps/rejected": -63.5261116027832, + "loss": 0.6859, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.06586851179599762, + "rewards/margins": 0.016264760866761208, + "rewards/rejected": -0.08213327825069427, + "step": 1700 + }, + { + "epoch": 0.29462439696760856, + "grad_norm": 3.0649280548095703, + "learning_rate": 1.963260619977038e-07, + "logits/chosen": -3.0112526416778564, + "logits/rejected": -2.9711148738861084, + "logps/chosen": -64.08343505859375, + "logps/rejected": -62.7235221862793, + "loss": 0.6813, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05673649162054062, + "rewards/margins": 0.02566385641694069, + "rewards/rejected": -0.08240034431219101, + "step": 1710 + }, + { + "epoch": 0.2963473466574776, + "grad_norm": 2.632606029510498, + "learning_rate": 1.9747416762342134e-07, + "logits/chosen": -3.0191071033477783, + "logits/rejected": -2.990119457244873, + "logps/chosen": -62.924537658691406, + "logps/rejected": -59.43379592895508, + "loss": 0.6845, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0649864599108696, + "rewards/margins": 0.01879298873245716, + "rewards/rejected": -0.08377943933010101, + "step": 1720 + }, + { + "epoch": 0.29807029634734666, + "grad_norm": 2.6209614276885986, + "learning_rate": 1.9862227324913891e-07, + "logits/chosen": -3.013237953186035, + "logits/rejected": -2.996584177017212, + "logps/chosen": -60.8912353515625, + "logps/rejected": -61.02091598510742, + "loss": 0.6863, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.06956915557384491, + "rewards/margins": 0.015582660213112831, + "rewards/rejected": -0.085151806473732, + "step": 1730 + }, + { + "epoch": 0.2997932460372157, + "grad_norm": 2.7946395874023438, + "learning_rate": 1.997703788748565e-07, + "logits/chosen": -2.961860179901123, + "logits/rejected": -2.9518237113952637, + "logps/chosen": -59.7307243347168, + "logps/rejected": -63.86859130859375, + "loss": 0.6834, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.07629448175430298, + "rewards/margins": 0.021778276190161705, + "rewards/rejected": -0.09807275980710983, + "step": 1740 + }, + { + "epoch": 0.30151619572708477, + "grad_norm": 3.019209146499634, + "learning_rate": 1.999998713790723e-07, + "logits/chosen": -3.002354621887207, + "logits/rejected": -2.9911231994628906, + "logps/chosen": -61.43891525268555, + "logps/rejected": -64.77537536621094, + "loss": 0.6816, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.06252885609865189, + "rewards/margins": 0.02495855651795864, + "rewards/rejected": -0.08748741447925568, + "step": 1750 + }, + { + "epoch": 0.30323914541695385, + "grad_norm": 2.858008623123169, + "learning_rate": 1.999993488571206e-07, + "logits/chosen": -2.9882655143737793, + "logits/rejected": -2.952385663986206, + "logps/chosen": -62.764404296875, + "logps/rejected": -63.62236404418945, + "loss": 0.6748, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.06741251796483994, + "rewards/margins": 0.03932555764913559, + "rewards/rejected": -0.10673806816339493, + "step": 1760 + }, + { + "epoch": 0.3049620951068229, + "grad_norm": 2.529355525970459, + "learning_rate": 1.9999842439743547e-07, + "logits/chosen": -3.0135600566864014, + "logits/rejected": -2.9775524139404297, + "logps/chosen": -62.50413131713867, + "logps/rejected": -60.10844039916992, + "loss": 0.676, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.07522428780794144, + "rewards/margins": 0.03643802925944328, + "rewards/rejected": -0.11166232824325562, + "step": 1770 + }, + { + "epoch": 0.30668504479669195, + "grad_norm": 3.3260445594787598, + "learning_rate": 1.999970980037328e-07, + "logits/chosen": -2.9299685955047607, + "logits/rejected": -2.931786298751831, + "logps/chosen": -62.187744140625, + "logps/rejected": -69.33695220947266, + "loss": 0.6782, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07669190317392349, + "rewards/margins": 0.032053664326667786, + "rewards/rejected": -0.10874556005001068, + "step": 1780 + }, + { + "epoch": 0.308407994486561, + "grad_norm": 3.0353615283966064, + "learning_rate": 1.999953696813438e-07, + "logits/chosen": -3.0692267417907715, + "logits/rejected": -3.0471017360687256, + "logps/chosen": -59.89692306518555, + "logps/rejected": -64.16006469726562, + "loss": 0.6757, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07901540398597717, + "rewards/margins": 0.037279874086380005, + "rewards/rejected": -0.11629529297351837, + "step": 1790 + }, + { + "epoch": 0.31013094417643006, + "grad_norm": 2.8321800231933594, + "learning_rate": 1.9999323943721533e-07, + "logits/chosen": -3.033538818359375, + "logits/rejected": -3.0094218254089355, + "logps/chosen": -61.10071563720703, + "logps/rejected": -64.22647857666016, + "loss": 0.6796, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.09298542886972427, + "rewards/margins": 0.02937093935906887, + "rewards/rejected": -0.1223563551902771, + "step": 1800 + }, + { + "epoch": 0.3118538938662991, + "grad_norm": 3.0810434818267822, + "learning_rate": 1.9999070727990972e-07, + "logits/chosen": -3.033754587173462, + "logits/rejected": -2.998857021331787, + "logps/chosen": -65.45790100097656, + "logps/rejected": -65.5028305053711, + "loss": 0.6798, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.08840756863355637, + "rewards/margins": 0.02971474826335907, + "rewards/rejected": -0.11812230199575424, + "step": 1810 + }, + { + "epoch": 0.31357684355616816, + "grad_norm": 2.9397025108337402, + "learning_rate": 1.999877732196047e-07, + "logits/chosen": -3.002478837966919, + "logits/rejected": -2.9685733318328857, + "logps/chosen": -64.43415069580078, + "logps/rejected": -62.759742736816406, + "loss": 0.6806, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09272059053182602, + "rewards/margins": 0.028056111186742783, + "rewards/rejected": -0.1207766979932785, + "step": 1820 + }, + { + "epoch": 0.31529979324603724, + "grad_norm": 3.0365073680877686, + "learning_rate": 1.9998443726809344e-07, + "logits/chosen": -2.8997771739959717, + "logits/rejected": -2.8866164684295654, + "logps/chosen": -64.41737365722656, + "logps/rejected": -65.77619934082031, + "loss": 0.6836, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0887407660484314, + "rewards/margins": 0.021692968904972076, + "rewards/rejected": -0.11043374240398407, + "step": 1830 + }, + { + "epoch": 0.31702274293590627, + "grad_norm": 3.104240655899048, + "learning_rate": 1.9998069943878452e-07, + "logits/chosen": -3.102999210357666, + "logits/rejected": -3.0991158485412598, + "logps/chosen": -65.93641662597656, + "logps/rejected": -68.23253631591797, + "loss": 0.6842, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.11116300523281097, + "rewards/margins": 0.020699962973594666, + "rewards/rejected": -0.13186296820640564, + "step": 1840 + }, + { + "epoch": 0.31874569262577535, + "grad_norm": 3.036222219467163, + "learning_rate": 1.9997655974670177e-07, + "logits/chosen": -2.9688706398010254, + "logits/rejected": -2.9688243865966797, + "logps/chosen": -64.49559020996094, + "logps/rejected": -67.51258850097656, + "loss": 0.6869, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.112858846783638, + "rewards/margins": 0.015191557817161083, + "rewards/rejected": -0.12805040180683136, + "step": 1850 + }, + { + "epoch": 0.32046864231564437, + "grad_norm": 3.0601251125335693, + "learning_rate": 1.9997201820848421e-07, + "logits/chosen": -2.902102470397949, + "logits/rejected": -2.8667800426483154, + "logps/chosen": -66.10860443115234, + "logps/rejected": -66.68751525878906, + "loss": 0.6738, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08816654980182648, + "rewards/margins": 0.041354451328516006, + "rewards/rejected": -0.12952101230621338, + "step": 1860 + }, + { + "epoch": 0.32219159200551345, + "grad_norm": 3.619654893875122, + "learning_rate": 1.999670748423862e-07, + "logits/chosen": -2.9479846954345703, + "logits/rejected": -2.9228718280792236, + "logps/chosen": -65.58842468261719, + "logps/rejected": -66.9212646484375, + "loss": 0.6772, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.09638717025518417, + "rewards/margins": 0.03494124859571457, + "rewards/rejected": -0.13132841885089874, + "step": 1870 + }, + { + "epoch": 0.3239145416953825, + "grad_norm": 2.9565136432647705, + "learning_rate": 1.9996172966827712e-07, + "logits/chosen": -2.99045991897583, + "logits/rejected": -2.953990936279297, + "logps/chosen": -61.8843994140625, + "logps/rejected": -65.32537841796875, + "loss": 0.6717, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.09168566763401031, + "rewards/margins": 0.04686136916279793, + "rewards/rejected": -0.13854703307151794, + "step": 1880 + }, + { + "epoch": 0.32563749138525155, + "grad_norm": 3.3497560024261475, + "learning_rate": 1.9995598270764132e-07, + "logits/chosen": -3.0127041339874268, + "logits/rejected": -3.006329298019409, + "logps/chosen": -61.48860549926758, + "logps/rejected": -67.71702575683594, + "loss": 0.6709, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.08430846035480499, + "rewards/margins": 0.04838328808546066, + "rewards/rejected": -0.13269174098968506, + "step": 1890 + }, + { + "epoch": 0.32736044107512063, + "grad_norm": 3.203680992126465, + "learning_rate": 1.9994983398357822e-07, + "logits/chosen": -2.9438915252685547, + "logits/rejected": -2.9119534492492676, + "logps/chosen": -65.86426544189453, + "logps/rejected": -64.35572814941406, + "loss": 0.6761, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.08424468338489532, + "rewards/margins": 0.03669450432062149, + "rewards/rejected": -0.12093917280435562, + "step": 1900 + }, + { + "epoch": 0.32908339076498966, + "grad_norm": 3.236323118209839, + "learning_rate": 1.9994328352080197e-07, + "logits/chosen": -2.8468799591064453, + "logits/rejected": -2.812983989715576, + "logps/chosen": -64.58326721191406, + "logps/rejected": -67.55799865722656, + "loss": 0.6741, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.10115577280521393, + "rewards/margins": 0.04144899174571037, + "rewards/rejected": -0.1426047533750534, + "step": 1910 + }, + { + "epoch": 0.33080634045485874, + "grad_norm": 3.068075656890869, + "learning_rate": 1.9993633134564157e-07, + "logits/chosen": -2.9570934772491455, + "logits/rejected": -2.9299278259277344, + "logps/chosen": -65.09169006347656, + "logps/rejected": -67.30238342285156, + "loss": 0.6753, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.09723018109798431, + "rewards/margins": 0.0387737974524498, + "rewards/rejected": -0.1360039860010147, + "step": 1920 + }, + { + "epoch": 0.33252929014472776, + "grad_norm": 2.9155988693237305, + "learning_rate": 1.9992897748604057e-07, + "logits/chosen": -2.903533935546875, + "logits/rejected": -2.867462635040283, + "logps/chosen": -64.53730773925781, + "logps/rejected": -67.99651336669922, + "loss": 0.677, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10646890103816986, + "rewards/margins": 0.03499242290854454, + "rewards/rejected": -0.1414613425731659, + "step": 1930 + }, + { + "epoch": 0.33425223983459684, + "grad_norm": 3.012897491455078, + "learning_rate": 1.9992122197155713e-07, + "logits/chosen": -2.9248719215393066, + "logits/rejected": -2.9068443775177, + "logps/chosen": -61.72211837768555, + "logps/rejected": -63.50091552734375, + "loss": 0.6812, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.10782128572463989, + "rewards/margins": 0.02694891020655632, + "rewards/rejected": -0.1347701996564865, + "step": 1940 + }, + { + "epoch": 0.33597518952446587, + "grad_norm": 2.9323947429656982, + "learning_rate": 1.9991306483336379e-07, + "logits/chosen": -2.9290108680725098, + "logits/rejected": -2.927438974380493, + "logps/chosen": -63.5114860534668, + "logps/rejected": -69.5857162475586, + "loss": 0.6763, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.11003844439983368, + "rewards/margins": 0.03804415836930275, + "rewards/rejected": -0.14808261394500732, + "step": 1950 + }, + { + "epoch": 0.33769813921433495, + "grad_norm": 3.069072961807251, + "learning_rate": 1.9990450610424739e-07, + "logits/chosen": -2.93648624420166, + "logits/rejected": -2.9172258377075195, + "logps/chosen": -64.36561584472656, + "logps/rejected": -68.8530502319336, + "loss": 0.6731, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.10006101429462433, + "rewards/margins": 0.04350076615810394, + "rewards/rejected": -0.14356176555156708, + "step": 1960 + }, + { + "epoch": 0.33942108890420397, + "grad_norm": 3.011516809463501, + "learning_rate": 1.9989554581860885e-07, + "logits/chosen": -2.9762659072875977, + "logits/rejected": -2.9475595951080322, + "logps/chosen": -66.1217041015625, + "logps/rejected": -65.79177856445312, + "loss": 0.6751, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.09975095093250275, + "rewards/margins": 0.03971099108457565, + "rewards/rejected": -0.1394619345664978, + "step": 1970 + }, + { + "epoch": 0.34114403859407305, + "grad_norm": 3.3382675647735596, + "learning_rate": 1.9988618401246327e-07, + "logits/chosen": -2.88445782661438, + "logits/rejected": -2.8788936138153076, + "logps/chosen": -67.1238021850586, + "logps/rejected": -67.88463592529297, + "loss": 0.6839, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.11536003649234772, + "rewards/margins": 0.02309250645339489, + "rewards/rejected": -0.13845254480838776, + "step": 1980 + }, + { + "epoch": 0.34286698828394213, + "grad_norm": 3.3324198722839355, + "learning_rate": 1.9987642072343948e-07, + "logits/chosen": -3.0067853927612305, + "logits/rejected": -2.9725756645202637, + "logps/chosen": -62.18952560424805, + "logps/rejected": -64.51532745361328, + "loss": 0.6701, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1135512962937355, + "rewards/margins": 0.049906175583601, + "rewards/rejected": -0.1634574681520462, + "step": 1990 + }, + { + "epoch": 0.34458993797381116, + "grad_norm": 3.651040554046631, + "learning_rate": 1.9986625599078007e-07, + "logits/chosen": -2.928395986557007, + "logits/rejected": -2.934624195098877, + "logps/chosen": -62.90681838989258, + "logps/rejected": -73.9938735961914, + "loss": 0.6778, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.12945915758609772, + "rewards/margins": 0.03435561805963516, + "rewards/rejected": -0.16381476819515228, + "step": 2000 + }, + { + "epoch": 0.34458993797381116, + "eval_logits/chosen": -3.0270333290100098, + "eval_logits/rejected": -3.0213301181793213, + "eval_logps/chosen": -65.9486312866211, + "eval_logps/rejected": -72.35735321044922, + "eval_loss": 0.6844969391822815, + "eval_rewards/accuracies": 0.5975836515426636, + "eval_rewards/chosen": -0.072367362678051, + "eval_rewards/margins": 0.019404985010623932, + "eval_rewards/rejected": -0.09177234023809433, + "eval_runtime": 382.4638, + "eval_samples_per_second": 11.253, + "eval_steps_per_second": 1.407, + "step": 2000 + }, + { + "epoch": 0.34631288766368024, + "grad_norm": 3.3070271015167236, + "learning_rate": 1.9985568985534123e-07, + "logits/chosen": -2.9318108558654785, + "logits/rejected": -2.903789520263672, + "logps/chosen": -66.5789794921875, + "logps/rejected": -66.69670104980469, + "loss": 0.6752, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.10977102816104889, + "rewards/margins": 0.039171889424324036, + "rewards/rejected": -0.14894291758537292, + "step": 2010 + }, + { + "epoch": 0.34803583735354926, + "grad_norm": 3.1021924018859863, + "learning_rate": 1.9984472235959246e-07, + "logits/chosen": -2.9195027351379395, + "logits/rejected": -2.8980581760406494, + "logps/chosen": -62.109039306640625, + "logps/rejected": -71.7960205078125, + "loss": 0.671, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.12279726564884186, + "rewards/margins": 0.04837704449892044, + "rewards/rejected": -0.1711743175983429, + "step": 2020 + }, + { + "epoch": 0.34975878704341834, + "grad_norm": 3.9570415019989014, + "learning_rate": 1.9983335354761662e-07, + "logits/chosen": -3.003624677658081, + "logits/rejected": -2.9824397563934326, + "logps/chosen": -68.96121978759766, + "logps/rejected": -70.75452423095703, + "loss": 0.6767, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.11908937990665436, + "rewards/margins": 0.03664074093103409, + "rewards/rejected": -0.15573014318943024, + "step": 2030 + }, + { + "epoch": 0.35148173673328736, + "grad_norm": 3.131145715713501, + "learning_rate": 1.9982158346510952e-07, + "logits/chosen": -2.868227005004883, + "logits/rejected": -2.8557090759277344, + "logps/chosen": -64.66195678710938, + "logps/rejected": -70.04840087890625, + "loss": 0.6745, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11946003139019012, + "rewards/margins": 0.04148910567164421, + "rewards/rejected": -0.16094914078712463, + "step": 2040 + }, + { + "epoch": 0.35320468642315644, + "grad_norm": 3.7799129486083984, + "learning_rate": 1.998094121593799e-07, + "logits/chosen": -2.980715274810791, + "logits/rejected": -2.956112861633301, + "logps/chosen": -63.249267578125, + "logps/rejected": -69.17818450927734, + "loss": 0.6813, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.12027259171009064, + "rewards/margins": 0.02740442380309105, + "rewards/rejected": -0.14767701923847198, + "step": 2050 + }, + { + "epoch": 0.3549276361130255, + "grad_norm": 3.3789472579956055, + "learning_rate": 1.9979683967934911e-07, + "logits/chosen": -2.96757173538208, + "logits/rejected": -2.929948329925537, + "logps/chosen": -66.1915512084961, + "logps/rejected": -67.29234313964844, + "loss": 0.6708, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.10862097889184952, + "rewards/margins": 0.04848029464483261, + "rewards/rejected": -0.15710125863552094, + "step": 2060 + }, + { + "epoch": 0.35665058580289455, + "grad_norm": 3.4836807250976562, + "learning_rate": 1.9978386607555103e-07, + "logits/chosen": -2.983682870864868, + "logits/rejected": -2.960925579071045, + "logps/chosen": -67.11554718017578, + "logps/rejected": -71.07099151611328, + "loss": 0.6733, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.11822348833084106, + "rewards/margins": 0.04466177895665169, + "rewards/rejected": -0.16288527846336365, + "step": 2070 + }, + { + "epoch": 0.35837353549276363, + "grad_norm": 3.548471450805664, + "learning_rate": 1.9977049140013183e-07, + "logits/chosen": -2.9241182804107666, + "logits/rejected": -2.895223617553711, + "logps/chosen": -65.55860137939453, + "logps/rejected": -69.46336364746094, + "loss": 0.6696, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.1369078904390335, + "rewards/margins": 0.051416944712400436, + "rewards/rejected": -0.18832483887672424, + "step": 2080 + }, + { + "epoch": 0.36009648518263265, + "grad_norm": 3.2138357162475586, + "learning_rate": 1.997567157068497e-07, + "logits/chosen": -2.9462406635284424, + "logits/rejected": -2.9429733753204346, + "logps/chosen": -67.5681381225586, + "logps/rejected": -70.5682144165039, + "loss": 0.6813, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.13173379004001617, + "rewards/margins": 0.027362842112779617, + "rewards/rejected": -0.1590966284275055, + "step": 2090 + }, + { + "epoch": 0.36181943487250173, + "grad_norm": 3.7781026363372803, + "learning_rate": 1.997425390510747e-07, + "logits/chosen": -2.894129991531372, + "logits/rejected": -2.872572422027588, + "logps/chosen": -68.9821548461914, + "logps/rejected": -68.93914794921875, + "loss": 0.6749, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.13635702431201935, + "rewards/margins": 0.04054417088627815, + "rewards/rejected": -0.1769011914730072, + "step": 2100 + }, + { + "epoch": 0.36354238456237076, + "grad_norm": 3.3131191730499268, + "learning_rate": 1.9972796148978856e-07, + "logits/chosen": -2.948805332183838, + "logits/rejected": -2.951103448867798, + "logps/chosen": -63.747772216796875, + "logps/rejected": -73.74634552001953, + "loss": 0.6771, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1484815776348114, + "rewards/margins": 0.03708801046013832, + "rewards/rejected": -0.18556959927082062, + "step": 2110 + }, + { + "epoch": 0.36526533425223984, + "grad_norm": 3.6554408073425293, + "learning_rate": 1.9971298308158441e-07, + "logits/chosen": -2.8126590251922607, + "logits/rejected": -2.786180019378662, + "logps/chosen": -65.94208526611328, + "logps/rejected": -67.56100463867188, + "loss": 0.6739, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1236642599105835, + "rewards/margins": 0.04265039786696434, + "rewards/rejected": -0.16631464660167694, + "step": 2120 + }, + { + "epoch": 0.3669882839421089, + "grad_norm": 3.0757296085357666, + "learning_rate": 1.9969760388666645e-07, + "logits/chosen": -2.846762180328369, + "logits/rejected": -2.8287978172302246, + "logps/chosen": -68.62953186035156, + "logps/rejected": -72.33243560791016, + "loss": 0.6674, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.14159207046031952, + "rewards/margins": 0.05636315792798996, + "rewards/rejected": -0.19795522093772888, + "step": 2130 + }, + { + "epoch": 0.36871123363197794, + "grad_norm": 3.5154972076416016, + "learning_rate": 1.996818239668499e-07, + "logits/chosen": -2.8251287937164307, + "logits/rejected": -2.817430019378662, + "logps/chosen": -64.94082641601562, + "logps/rejected": -73.19986724853516, + "loss": 0.678, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.1628565490245819, + "rewards/margins": 0.033618878573179245, + "rewards/rejected": -0.19647544622421265, + "step": 2140 + }, + { + "epoch": 0.370434183321847, + "grad_norm": 3.5373153686523438, + "learning_rate": 1.9966564338556065e-07, + "logits/chosen": -2.8717055320739746, + "logits/rejected": -2.8266823291778564, + "logps/chosen": -67.66168212890625, + "logps/rejected": -67.52949523925781, + "loss": 0.6628, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1297309845685959, + "rewards/margins": 0.06547585129737854, + "rewards/rejected": -0.19520683586597443, + "step": 2150 + }, + { + "epoch": 0.37215713301171605, + "grad_norm": 3.6125190258026123, + "learning_rate": 1.9964906220783492e-07, + "logits/chosen": -2.839944839477539, + "logits/rejected": -2.8248395919799805, + "logps/chosen": -69.63507080078125, + "logps/rejected": -68.79278564453125, + "loss": 0.6733, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1487804651260376, + "rewards/margins": 0.04360898584127426, + "rewards/rejected": -0.19238945841789246, + "step": 2160 + }, + { + "epoch": 0.3738800827015851, + "grad_norm": 3.6442248821258545, + "learning_rate": 1.9963208050031922e-07, + "logits/chosen": -2.954251766204834, + "logits/rejected": -2.939054250717163, + "logps/chosen": -68.50241088867188, + "logps/rejected": -70.56828308105469, + "loss": 0.6704, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1417117863893509, + "rewards/margins": 0.0491500198841095, + "rewards/rejected": -0.1908618062734604, + "step": 2170 + }, + { + "epoch": 0.37560303239145415, + "grad_norm": 3.8122751712799072, + "learning_rate": 1.9961469833126987e-07, + "logits/chosen": -3.001730442047119, + "logits/rejected": -2.96539306640625, + "logps/chosen": -73.8094482421875, + "logps/rejected": -74.3231430053711, + "loss": 0.667, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15785323083400726, + "rewards/margins": 0.05803511291742325, + "rewards/rejected": -0.2158883512020111, + "step": 2180 + }, + { + "epoch": 0.37732598208132323, + "grad_norm": 3.7190444469451904, + "learning_rate": 1.995969157705528e-07, + "logits/chosen": -3.0624592304229736, + "logits/rejected": -3.056671142578125, + "logps/chosen": -69.03919982910156, + "logps/rejected": -72.68977355957031, + "loss": 0.6822, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.16197362542152405, + "rewards/margins": 0.026371484622359276, + "rewards/rejected": -0.18834510445594788, + "step": 2190 + }, + { + "epoch": 0.37904893177119225, + "grad_norm": 3.5991036891937256, + "learning_rate": 1.995787328896433e-07, + "logits/chosen": -2.8910741806030273, + "logits/rejected": -2.8688979148864746, + "logps/chosen": -66.85711669921875, + "logps/rejected": -76.20103454589844, + "loss": 0.6618, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.14548395574092865, + "rewards/margins": 0.06858919560909271, + "rewards/rejected": -0.21407318115234375, + "step": 2200 + }, + { + "epoch": 0.38077188146106133, + "grad_norm": 3.7989940643310547, + "learning_rate": 1.9956014976162572e-07, + "logits/chosen": -2.947467565536499, + "logits/rejected": -2.9316937923431396, + "logps/chosen": -69.19676971435547, + "logps/rejected": -73.41373443603516, + "loss": 0.6696, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.1447041779756546, + "rewards/margins": 0.05233711004257202, + "rewards/rejected": -0.19704128801822662, + "step": 2210 + }, + { + "epoch": 0.3824948311509304, + "grad_norm": 3.6562952995300293, + "learning_rate": 1.9954116646119315e-07, + "logits/chosen": -2.7782294750213623, + "logits/rejected": -2.7709176540374756, + "logps/chosen": -68.47245788574219, + "logps/rejected": -72.8316421508789, + "loss": 0.6741, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.14594446122646332, + "rewards/margins": 0.043900761753320694, + "rewards/rejected": -0.1898452192544937, + "step": 2220 + }, + { + "epoch": 0.38421778084079944, + "grad_norm": 3.846614122390747, + "learning_rate": 1.9952178306464708e-07, + "logits/chosen": -2.9279448986053467, + "logits/rejected": -2.8969149589538574, + "logps/chosen": -70.74501037597656, + "logps/rejected": -71.7994155883789, + "loss": 0.6756, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.17067433893680573, + "rewards/margins": 0.039899904280900955, + "rewards/rejected": -0.21057423949241638, + "step": 2230 + }, + { + "epoch": 0.3859407305306685, + "grad_norm": 3.7794787883758545, + "learning_rate": 1.9950199964989728e-07, + "logits/chosen": -2.883044958114624, + "logits/rejected": -2.848534107208252, + "logps/chosen": -72.35716247558594, + "logps/rejected": -72.94497680664062, + "loss": 0.6765, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15547525882720947, + "rewards/margins": 0.03803917020559311, + "rewards/rejected": -0.19351443648338318, + "step": 2240 + }, + { + "epoch": 0.38766368022053754, + "grad_norm": 4.301860332489014, + "learning_rate": 1.9948181629646125e-07, + "logits/chosen": -2.840498447418213, + "logits/rejected": -2.807094097137451, + "logps/chosen": -71.03971862792969, + "logps/rejected": -73.32917785644531, + "loss": 0.6743, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.15689852833747864, + "rewards/margins": 0.042939841747283936, + "rewards/rejected": -0.19983838498592377, + "step": 2250 + }, + { + "epoch": 0.3893866299104066, + "grad_norm": 3.486368179321289, + "learning_rate": 1.99461233085464e-07, + "logits/chosen": -2.800408124923706, + "logits/rejected": -2.775947332382202, + "logps/chosen": -78.2021484375, + "logps/rejected": -77.9363784790039, + "loss": 0.6819, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.17394503951072693, + "rewards/margins": 0.031767066568136215, + "rewards/rejected": -0.20571212470531464, + "step": 2260 + }, + { + "epoch": 0.39110957960027565, + "grad_norm": 4.095204830169678, + "learning_rate": 1.9944025009963783e-07, + "logits/chosen": -2.784682273864746, + "logits/rejected": -2.752465009689331, + "logps/chosen": -70.94514465332031, + "logps/rejected": -73.77107238769531, + "loss": 0.6702, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.1688375473022461, + "rewards/margins": 0.05170854926109314, + "rewards/rejected": -0.22054609656333923, + "step": 2270 + }, + { + "epoch": 0.3928325292901447, + "grad_norm": 3.7926547527313232, + "learning_rate": 1.9941886742332175e-07, + "logits/chosen": -2.856924533843994, + "logits/rejected": -2.8466103076934814, + "logps/chosen": -68.77479553222656, + "logps/rejected": -75.07438659667969, + "loss": 0.6744, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.15331251919269562, + "rewards/margins": 0.04212985187768936, + "rewards/rejected": -0.1954423487186432, + "step": 2280 + }, + { + "epoch": 0.3945554789800138, + "grad_norm": 3.7058446407318115, + "learning_rate": 1.9939708514246143e-07, + "logits/chosen": -2.7546420097351074, + "logits/rejected": -2.727194309234619, + "logps/chosen": -69.0736312866211, + "logps/rejected": -75.74372863769531, + "loss": 0.6666, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.16190746426582336, + "rewards/margins": 0.05792677402496338, + "rewards/rejected": -0.21983423829078674, + "step": 2290 + }, + { + "epoch": 0.39627842866988283, + "grad_norm": 3.861417770385742, + "learning_rate": 1.9937490334460857e-07, + "logits/chosen": -2.9293429851531982, + "logits/rejected": -2.8968148231506348, + "logps/chosen": -71.00596618652344, + "logps/rejected": -74.87997436523438, + "loss": 0.6655, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.15060889720916748, + "rewards/margins": 0.0616321787238121, + "rewards/rejected": -0.21224108338356018, + "step": 2300 + }, + { + "epoch": 0.3980013783597519, + "grad_norm": 3.798541307449341, + "learning_rate": 1.9935232211892083e-07, + "logits/chosen": -2.8329617977142334, + "logits/rejected": -2.817756175994873, + "logps/chosen": -67.91639709472656, + "logps/rejected": -73.6534194946289, + "loss": 0.6691, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.1683826744556427, + "rewards/margins": 0.05380697920918465, + "rewards/rejected": -0.22218966484069824, + "step": 2310 + }, + { + "epoch": 0.39972432804962094, + "grad_norm": 3.9587326049804688, + "learning_rate": 1.9932934155616127e-07, + "logits/chosen": -2.9256093502044678, + "logits/rejected": -2.8858180046081543, + "logps/chosen": -73.00454711914062, + "logps/rejected": -73.79489135742188, + "loss": 0.6608, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.15563161671161652, + "rewards/margins": 0.07084403187036514, + "rewards/rejected": -0.22647564113140106, + "step": 2320 + }, + { + "epoch": 0.40144727773949, + "grad_norm": 3.805995225906372, + "learning_rate": 1.9930596174869797e-07, + "logits/chosen": -2.8376352787017822, + "logits/rejected": -2.816253900527954, + "logps/chosen": -72.3519287109375, + "logps/rejected": -76.95233154296875, + "loss": 0.6572, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.16542547941207886, + "rewards/margins": 0.0815449133515358, + "rewards/rejected": -0.24697038531303406, + "step": 2330 + }, + { + "epoch": 0.40317022742935904, + "grad_norm": 3.8098678588867188, + "learning_rate": 1.992821827905039e-07, + "logits/chosen": -2.8756024837493896, + "logits/rejected": -2.863811492919922, + "logps/chosen": -70.9607162475586, + "logps/rejected": -77.6399917602539, + "loss": 0.6782, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.18659153580665588, + "rewards/margins": 0.036031901836395264, + "rewards/rejected": -0.22262343764305115, + "step": 2340 + }, + { + "epoch": 0.4048931771192281, + "grad_norm": 3.8881237506866455, + "learning_rate": 1.9925800477715623e-07, + "logits/chosen": -2.855710506439209, + "logits/rejected": -2.8338699340820312, + "logps/chosen": -74.32742309570312, + "logps/rejected": -78.49640655517578, + "loss": 0.6608, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.16066120564937592, + "rewards/margins": 0.07114093005657196, + "rewards/rejected": -0.2318021059036255, + "step": 2350 + }, + { + "epoch": 0.4066161268090972, + "grad_norm": 3.6372482776641846, + "learning_rate": 1.992334278058362e-07, + "logits/chosen": -2.872138738632202, + "logits/rejected": -2.859128475189209, + "logps/chosen": -67.64442443847656, + "logps/rejected": -73.90852355957031, + "loss": 0.6686, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1728234887123108, + "rewards/margins": 0.054965026676654816, + "rewards/rejected": -0.2277885228395462, + "step": 2360 + }, + { + "epoch": 0.4083390764989662, + "grad_norm": 4.352206707000732, + "learning_rate": 1.9920845197532854e-07, + "logits/chosen": -2.8816115856170654, + "logits/rejected": -2.8750884532928467, + "logps/chosen": -72.4279556274414, + "logps/rejected": -79.2039794921875, + "loss": 0.6624, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1866598129272461, + "rewards/margins": 0.06673283874988556, + "rewards/rejected": -0.25339263677597046, + "step": 2370 + }, + { + "epoch": 0.4100620261888353, + "grad_norm": 3.87642765045166, + "learning_rate": 1.991830773860212e-07, + "logits/chosen": -2.8102853298187256, + "logits/rejected": -2.7858879566192627, + "logps/chosen": -71.7208251953125, + "logps/rejected": -75.96289825439453, + "loss": 0.668, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.17378661036491394, + "rewards/margins": 0.05719633772969246, + "rewards/rejected": -0.2309829741716385, + "step": 2380 + }, + { + "epoch": 0.41178497587870433, + "grad_norm": 4.1682448387146, + "learning_rate": 1.9915730413990486e-07, + "logits/chosen": -2.889021158218384, + "logits/rejected": -2.8628406524658203, + "logps/chosen": -74.9619140625, + "logps/rejected": -78.55941772460938, + "loss": 0.6629, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18727919459342957, + "rewards/margins": 0.06737373024225235, + "rewards/rejected": -0.2546529471874237, + "step": 2390 + }, + { + "epoch": 0.4135079255685734, + "grad_norm": 3.591698408126831, + "learning_rate": 1.9913113234057264e-07, + "logits/chosen": -2.933715343475342, + "logits/rejected": -2.9128355979919434, + "logps/chosen": -70.69978332519531, + "logps/rejected": -77.25373077392578, + "loss": 0.6688, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.18399231135845184, + "rewards/margins": 0.055909980088472366, + "rewards/rejected": -0.2399022877216339, + "step": 2400 + }, + { + "epoch": 0.4135079255685734, + "eval_logits/chosen": -2.9427647590637207, + "eval_logits/rejected": -2.937041997909546, + "eval_logps/chosen": -72.73748779296875, + "eval_logps/rejected": -80.43446350097656, + "eval_loss": 0.6791625618934631, + "eval_rewards/accuracies": 0.6031598448753357, + "eval_rewards/chosen": -0.14025600254535675, + "eval_rewards/margins": 0.032287415117025375, + "eval_rewards/rejected": -0.17254340648651123, + "eval_runtime": 383.1472, + "eval_samples_per_second": 11.233, + "eval_steps_per_second": 1.404, + "step": 2400 + }, + { + "epoch": 0.41523087525844243, + "grad_norm": 3.8721492290496826, + "learning_rate": 1.9910456209321956e-07, + "logits/chosen": -2.9030094146728516, + "logits/rejected": -2.8650403022766113, + "logps/chosen": -72.07716369628906, + "logps/rejected": -74.85731506347656, + "loss": 0.6597, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.18226821720600128, + "rewards/margins": 0.0736222192645073, + "rewards/rejected": -0.255890429019928, + "step": 2410 + }, + { + "epoch": 0.4169538249483115, + "grad_norm": 4.3863983154296875, + "learning_rate": 1.9907759350464212e-07, + "logits/chosen": -2.8956375122070312, + "logits/rejected": -2.8728861808776855, + "logps/chosen": -76.08689880371094, + "logps/rejected": -81.4808349609375, + "loss": 0.6634, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19904786348342896, + "rewards/margins": 0.06673558056354523, + "rewards/rejected": -0.265783429145813, + "step": 2420 + }, + { + "epoch": 0.41867677463818054, + "grad_norm": 4.163143157958984, + "learning_rate": 1.9905022668323803e-07, + "logits/chosen": -2.8559088706970215, + "logits/rejected": -2.8340187072753906, + "logps/chosen": -75.20133209228516, + "logps/rejected": -76.68769073486328, + "loss": 0.6737, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.2035834789276123, + "rewards/margins": 0.04596007615327835, + "rewards/rejected": -0.24954351782798767, + "step": 2430 + }, + { + "epoch": 0.4203997243280496, + "grad_norm": 4.649513244628906, + "learning_rate": 1.9902246173900554e-07, + "logits/chosen": -2.8532347679138184, + "logits/rejected": -2.840039014816284, + "logps/chosen": -73.97209930419922, + "logps/rejected": -80.97435760498047, + "loss": 0.6589, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.18915219604969025, + "rewards/margins": 0.0755903422832489, + "rewards/rejected": -0.26474255323410034, + "step": 2440 + }, + { + "epoch": 0.4221226740179187, + "grad_norm": 4.157593250274658, + "learning_rate": 1.9899429878354318e-07, + "logits/chosen": -2.8119709491729736, + "logits/rejected": -2.78918194770813, + "logps/chosen": -74.1239013671875, + "logps/rejected": -79.02185821533203, + "loss": 0.6675, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.21065564453601837, + "rewards/margins": 0.056983523070812225, + "rewards/rejected": -0.2676391303539276, + "step": 2450 + }, + { + "epoch": 0.4238456237077877, + "grad_norm": 3.678806781768799, + "learning_rate": 1.989657379300492e-07, + "logits/chosen": -2.7902090549468994, + "logits/rejected": -2.7585668563842773, + "logps/chosen": -76.15962219238281, + "logps/rejected": -78.26510620117188, + "loss": 0.6684, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.20018188655376434, + "rewards/margins": 0.05713933706283569, + "rewards/rejected": -0.25732123851776123, + "step": 2460 + }, + { + "epoch": 0.4255685733976568, + "grad_norm": 4.094069957733154, + "learning_rate": 1.9893677929332123e-07, + "logits/chosen": -2.9425644874572754, + "logits/rejected": -2.9215643405914307, + "logps/chosen": -74.55903625488281, + "logps/rejected": -80.36302185058594, + "loss": 0.658, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.19514811038970947, + "rewards/margins": 0.08139944821596146, + "rewards/rejected": -0.2765475809574127, + "step": 2470 + }, + { + "epoch": 0.4272915230875258, + "grad_norm": 4.179010391235352, + "learning_rate": 1.9890742298975574e-07, + "logits/chosen": -2.8428502082824707, + "logits/rejected": -2.813183307647705, + "logps/chosen": -76.90948486328125, + "logps/rejected": -78.09593200683594, + "loss": 0.6704, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2253132164478302, + "rewards/margins": 0.051389217376708984, + "rewards/rejected": -0.2767024636268616, + "step": 2480 + }, + { + "epoch": 0.4290144727773949, + "grad_norm": 4.350039482116699, + "learning_rate": 1.9887766913734748e-07, + "logits/chosen": -2.828249216079712, + "logits/rejected": -2.8191637992858887, + "logps/chosen": -71.01896667480469, + "logps/rejected": -79.3835678100586, + "loss": 0.6633, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.20980136096477509, + "rewards/margins": 0.06808502972126007, + "rewards/rejected": -0.27788639068603516, + "step": 2490 + }, + { + "epoch": 0.43073742246726393, + "grad_norm": 4.809699058532715, + "learning_rate": 1.9884751785568928e-07, + "logits/chosen": -2.8965935707092285, + "logits/rejected": -2.8732640743255615, + "logps/chosen": -80.31697082519531, + "logps/rejected": -85.91722106933594, + "loss": 0.6676, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.234262615442276, + "rewards/margins": 0.059213198721408844, + "rewards/rejected": -0.29347580671310425, + "step": 2500 + }, + { + "epoch": 0.432460372157133, + "grad_norm": 4.566841125488281, + "learning_rate": 1.9881696926597125e-07, + "logits/chosen": -2.7498779296875, + "logits/rejected": -2.7402052879333496, + "logps/chosen": -75.13661193847656, + "logps/rejected": -82.50086975097656, + "loss": 0.6584, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.22750934958457947, + "rewards/margins": 0.07768379151821136, + "rewards/rejected": -0.30519312620162964, + "step": 2510 + }, + { + "epoch": 0.4341833218470021, + "grad_norm": 4.17630672454834, + "learning_rate": 1.987860234909805e-07, + "logits/chosen": -2.735560417175293, + "logits/rejected": -2.7091479301452637, + "logps/chosen": -74.96757507324219, + "logps/rejected": -78.909912109375, + "loss": 0.6607, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22499346733093262, + "rewards/margins": 0.07408694922924042, + "rewards/rejected": -0.29908040165901184, + "step": 2520 + }, + { + "epoch": 0.4359062715368711, + "grad_norm": 4.669484615325928, + "learning_rate": 1.987546806551006e-07, + "logits/chosen": -2.8178813457489014, + "logits/rejected": -2.813152551651001, + "logps/chosen": -75.33357238769531, + "logps/rejected": -82.6530990600586, + "loss": 0.6757, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.24164001643657684, + "rewards/margins": 0.04336600750684738, + "rewards/rejected": -0.28500601649284363, + "step": 2530 + }, + { + "epoch": 0.4376292212267402, + "grad_norm": 4.2985920906066895, + "learning_rate": 1.9872294088431105e-07, + "logits/chosen": -2.856102705001831, + "logits/rejected": -2.845292568206787, + "logps/chosen": -78.64314270019531, + "logps/rejected": -86.57007598876953, + "loss": 0.6597, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.25102704763412476, + "rewards/margins": 0.07619601488113403, + "rewards/rejected": -0.3272230923175812, + "step": 2540 + }, + { + "epoch": 0.4393521709166092, + "grad_norm": 4.795299530029297, + "learning_rate": 1.9869080430618684e-07, + "logits/chosen": -2.799677610397339, + "logits/rejected": -2.7678658962249756, + "logps/chosen": -81.40946197509766, + "logps/rejected": -86.08482360839844, + "loss": 0.6552, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2272636592388153, + "rewards/margins": 0.09386886656284332, + "rewards/rejected": -0.3211325407028198, + "step": 2550 + }, + { + "epoch": 0.4410751206064783, + "grad_norm": 4.634361743927002, + "learning_rate": 1.9865827104989774e-07, + "logits/chosen": -2.8855044841766357, + "logits/rejected": -2.8617072105407715, + "logps/chosen": -77.7742691040039, + "logps/rejected": -83.0869369506836, + "loss": 0.665, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22513917088508606, + "rewards/margins": 0.06552886217832565, + "rewards/rejected": -0.2906680107116699, + "step": 2560 + }, + { + "epoch": 0.4427980702963473, + "grad_norm": 4.259562969207764, + "learning_rate": 1.9862534124620814e-07, + "logits/chosen": -2.8211474418640137, + "logits/rejected": -2.8036704063415527, + "logps/chosen": -81.92951965332031, + "logps/rejected": -86.38294982910156, + "loss": 0.6652, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.23327799141407013, + "rewards/margins": 0.06551525741815567, + "rewards/rejected": -0.2987932562828064, + "step": 2570 + }, + { + "epoch": 0.4445210199862164, + "grad_norm": 4.979623317718506, + "learning_rate": 1.9859201502747614e-07, + "logits/chosen": -2.836080312728882, + "logits/rejected": -2.8181748390197754, + "logps/chosen": -79.82194519042969, + "logps/rejected": -84.47944641113281, + "loss": 0.6733, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.2643613815307617, + "rewards/margins": 0.048605453222990036, + "rewards/rejected": -0.31296685338020325, + "step": 2580 + }, + { + "epoch": 0.4462439696760855, + "grad_norm": 4.920752048492432, + "learning_rate": 1.985582925276533e-07, + "logits/chosen": -2.8163769245147705, + "logits/rejected": -2.7815186977386475, + "logps/chosen": -80.47447967529297, + "logps/rejected": -82.33802032470703, + "loss": 0.661, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.23780038952827454, + "rewards/margins": 0.07311274856328964, + "rewards/rejected": -0.3109131455421448, + "step": 2590 + }, + { + "epoch": 0.4479669193659545, + "grad_norm": 5.04128885269165, + "learning_rate": 1.9852417388228392e-07, + "logits/chosen": -2.7918379306793213, + "logits/rejected": -2.746398448944092, + "logps/chosen": -82.25102233886719, + "logps/rejected": -80.16227722167969, + "loss": 0.6657, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2515484392642975, + "rewards/margins": 0.06543479859828949, + "rewards/rejected": -0.3169831931591034, + "step": 2600 + }, + { + "epoch": 0.4496898690558236, + "grad_norm": 4.687989711761475, + "learning_rate": 1.9848965922850464e-07, + "logits/chosen": -2.7723536491394043, + "logits/rejected": -2.7345359325408936, + "logps/chosen": -83.00138092041016, + "logps/rejected": -83.72332000732422, + "loss": 0.6616, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.24622325599193573, + "rewards/margins": 0.07428522408008575, + "rewards/rejected": -0.32050850987434387, + "step": 2610 + }, + { + "epoch": 0.4514128187456926, + "grad_norm": 4.529170989990234, + "learning_rate": 1.9845474870504378e-07, + "logits/chosen": -2.8512473106384277, + "logits/rejected": -2.8228654861450195, + "logps/chosen": -75.3122787475586, + "logps/rejected": -83.6035385131836, + "loss": 0.6521, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.2404632866382599, + "rewards/margins": 0.09109614789485931, + "rewards/rejected": -0.3315594792366028, + "step": 2620 + }, + { + "epoch": 0.4531357684355617, + "grad_norm": 4.675422191619873, + "learning_rate": 1.984194424522208e-07, + "logits/chosen": -2.7441370487213135, + "logits/rejected": -2.7133846282958984, + "logps/chosen": -77.77023315429688, + "logps/rejected": -85.73268127441406, + "loss": 0.6508, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2295670509338379, + "rewards/margins": 0.09620517492294312, + "rewards/rejected": -0.325772225856781, + "step": 2630 + }, + { + "epoch": 0.4548587181254307, + "grad_norm": 4.241116523742676, + "learning_rate": 1.9838374061194575e-07, + "logits/chosen": -2.759497880935669, + "logits/rejected": -2.738795518875122, + "logps/chosen": -74.74937438964844, + "logps/rejected": -81.0289077758789, + "loss": 0.6614, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.24653956294059753, + "rewards/margins": 0.07387407124042511, + "rewards/rejected": -0.32041364908218384, + "step": 2640 + }, + { + "epoch": 0.4565816678152998, + "grad_norm": 4.396022319793701, + "learning_rate": 1.983476433277188e-07, + "logits/chosen": -2.730039596557617, + "logits/rejected": -2.713178873062134, + "logps/chosen": -75.72853088378906, + "logps/rejected": -85.95320129394531, + "loss": 0.6572, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.23297707736492157, + "rewards/margins": 0.08371174335479736, + "rewards/rejected": -0.3166888356208801, + "step": 2650 + }, + { + "epoch": 0.4583046175051689, + "grad_norm": 4.776400089263916, + "learning_rate": 1.9831115074462944e-07, + "logits/chosen": -2.7560505867004395, + "logits/rejected": -2.7167718410491943, + "logps/chosen": -84.48075866699219, + "logps/rejected": -86.92762756347656, + "loss": 0.658, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2619093954563141, + "rewards/margins": 0.08262166380882263, + "rewards/rejected": -0.34453102946281433, + "step": 2660 + }, + { + "epoch": 0.4600275671950379, + "grad_norm": 4.16822624206543, + "learning_rate": 1.982742630093561e-07, + "logits/chosen": -2.729417324066162, + "logits/rejected": -2.7020583152770996, + "logps/chosen": -81.297119140625, + "logps/rejected": -87.87712097167969, + "loss": 0.6574, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2735114097595215, + "rewards/margins": 0.08464406430721283, + "rewards/rejected": -0.3581554591655731, + "step": 2670 + }, + { + "epoch": 0.461750516884907, + "grad_norm": 5.366155624389648, + "learning_rate": 1.9823698027016548e-07, + "logits/chosen": -2.7723546028137207, + "logits/rejected": -2.7467968463897705, + "logps/chosen": -86.6219253540039, + "logps/rejected": -87.81824493408203, + "loss": 0.6694, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.2837437391281128, + "rewards/margins": 0.05781107023358345, + "rewards/rejected": -0.34155479073524475, + "step": 2680 + }, + { + "epoch": 0.463473466574776, + "grad_norm": 4.76614236831665, + "learning_rate": 1.98199302676912e-07, + "logits/chosen": -2.7463812828063965, + "logits/rejected": -2.7254321575164795, + "logps/chosen": -78.85734558105469, + "logps/rejected": -86.11651611328125, + "loss": 0.6582, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.28216251730918884, + "rewards/margins": 0.08013720065355301, + "rewards/rejected": -0.36229974031448364, + "step": 2690 + }, + { + "epoch": 0.4651964162646451, + "grad_norm": 4.918941497802734, + "learning_rate": 1.9816123038103701e-07, + "logits/chosen": -2.7731640338897705, + "logits/rejected": -2.7475695610046387, + "logps/chosen": -81.00579071044922, + "logps/rejected": -88.36027526855469, + "loss": 0.6624, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2777502238750458, + "rewards/margins": 0.07108843326568604, + "rewards/rejected": -0.3488386571407318, + "step": 2700 + }, + { + "epoch": 0.4669193659545141, + "grad_norm": 5.564276218414307, + "learning_rate": 1.9812276353556852e-07, + "logits/chosen": -2.7926807403564453, + "logits/rejected": -2.774944543838501, + "logps/chosen": -86.74163055419922, + "logps/rejected": -88.55531311035156, + "loss": 0.6688, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.27294981479644775, + "rewards/margins": 0.059096015989780426, + "rewards/rejected": -0.33204585313796997, + "step": 2710 + }, + { + "epoch": 0.4686423156443832, + "grad_norm": 5.371655464172363, + "learning_rate": 1.9808390229512026e-07, + "logits/chosen": -2.784646511077881, + "logits/rejected": -2.7882354259490967, + "logps/chosen": -80.06517028808594, + "logps/rejected": -89.0752182006836, + "loss": 0.6631, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.2801792025566101, + "rewards/margins": 0.07294531166553497, + "rewards/rejected": -0.35312455892562866, + "step": 2720 + }, + { + "epoch": 0.4703652653342522, + "grad_norm": 5.554327487945557, + "learning_rate": 1.980446468158912e-07, + "logits/chosen": -2.7829484939575195, + "logits/rejected": -2.764782667160034, + "logps/chosen": -87.25930786132812, + "logps/rejected": -90.30667114257812, + "loss": 0.6665, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2869259715080261, + "rewards/margins": 0.06388825923204422, + "rewards/rejected": -0.35081425309181213, + "step": 2730 + }, + { + "epoch": 0.4720882150241213, + "grad_norm": 5.064526081085205, + "learning_rate": 1.9800499725566506e-07, + "logits/chosen": -2.7626073360443115, + "logits/rejected": -2.742617130279541, + "logps/chosen": -81.52635192871094, + "logps/rejected": -84.69229125976562, + "loss": 0.6665, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.28667086362838745, + "rewards/margins": 0.06642518192529678, + "rewards/rejected": -0.353096067905426, + "step": 2740 + }, + { + "epoch": 0.4738111647139904, + "grad_norm": 5.177687644958496, + "learning_rate": 1.9796495377380933e-07, + "logits/chosen": -2.696913242340088, + "logits/rejected": -2.692275047302246, + "logps/chosen": -78.03648376464844, + "logps/rejected": -90.65962219238281, + "loss": 0.6574, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.2877274453639984, + "rewards/margins": 0.08459886163473129, + "rewards/rejected": -0.3723262846469879, + "step": 2750 + }, + { + "epoch": 0.4755341144038594, + "grad_norm": 5.387844562530518, + "learning_rate": 1.9792451653127496e-07, + "logits/chosen": -2.7484130859375, + "logits/rejected": -2.7317137718200684, + "logps/chosen": -82.39192199707031, + "logps/rejected": -91.6829605102539, + "loss": 0.6564, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.28116923570632935, + "rewards/margins": 0.08760281652212143, + "rewards/rejected": -0.36877208948135376, + "step": 2760 + }, + { + "epoch": 0.4772570640937285, + "grad_norm": 6.261878490447998, + "learning_rate": 1.9788368569059551e-07, + "logits/chosen": -2.866014003753662, + "logits/rejected": -2.8366146087646484, + "logps/chosen": -84.28050231933594, + "logps/rejected": -88.85655212402344, + "loss": 0.6653, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2906574010848999, + "rewards/margins": 0.06752223521471024, + "rewards/rejected": -0.35817962884902954, + "step": 2770 + }, + { + "epoch": 0.4789800137835975, + "grad_norm": 5.086827754974365, + "learning_rate": 1.9784246141588662e-07, + "logits/chosen": -2.690002918243408, + "logits/rejected": -2.661464214324951, + "logps/chosen": -83.01453399658203, + "logps/rejected": -87.8421859741211, + "loss": 0.6639, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.3018570840358734, + "rewards/margins": 0.07041344046592712, + "rewards/rejected": -0.37227049469947815, + "step": 2780 + }, + { + "epoch": 0.4807029634734666, + "grad_norm": 4.994007587432861, + "learning_rate": 1.9780084387284535e-07, + "logits/chosen": -2.798659563064575, + "logits/rejected": -2.7668426036834717, + "logps/chosen": -80.95232391357422, + "logps/rejected": -86.96568298339844, + "loss": 0.6599, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.29192060232162476, + "rewards/margins": 0.07660763710737228, + "rewards/rejected": -0.3685282766819, + "step": 2790 + }, + { + "epoch": 0.4824259131633356, + "grad_norm": 5.847742557525635, + "learning_rate": 1.977588332287493e-07, + "logits/chosen": -2.784656286239624, + "logits/rejected": -2.760721206665039, + "logps/chosen": -92.74882507324219, + "logps/rejected": -95.89972686767578, + "loss": 0.6675, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.32713156938552856, + "rewards/margins": 0.06348985433578491, + "rewards/rejected": -0.3906214237213135, + "step": 2800 + }, + { + "epoch": 0.4824259131633356, + "eval_logits/chosen": -2.86350417137146, + "eval_logits/rejected": -2.857557773590088, + "eval_logps/chosen": -81.5435791015625, + "eval_logps/rejected": -90.7352523803711, + "eval_loss": 0.6732187867164612, + "eval_rewards/accuracies": 0.6057156324386597, + "eval_rewards/chosen": -0.22831681370735168, + "eval_rewards/margins": 0.0472344309091568, + "eval_rewards/rejected": -0.2755512595176697, + "eval_runtime": 382.7509, + "eval_samples_per_second": 11.245, + "eval_steps_per_second": 1.406, + "step": 2800 + }, + { + "epoch": 0.4841488628532047, + "grad_norm": 6.044860363006592, + "learning_rate": 1.9771642965245623e-07, + "logits/chosen": -2.6763038635253906, + "logits/rejected": -2.6524975299835205, + "logps/chosen": -85.02580261230469, + "logps/rejected": -94.72159576416016, + "loss": 0.6535, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2969000041484833, + "rewards/margins": 0.09268021583557129, + "rewards/rejected": -0.38958021998405457, + "step": 2810 + }, + { + "epoch": 0.48587181254307377, + "grad_norm": 5.906956672668457, + "learning_rate": 1.9767363331440324e-07, + "logits/chosen": -2.819481372833252, + "logits/rejected": -2.8079781532287598, + "logps/chosen": -84.50180053710938, + "logps/rejected": -89.02869415283203, + "loss": 0.6699, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30174607038497925, + "rewards/margins": 0.05834274739027023, + "rewards/rejected": -0.3600887656211853, + "step": 2820 + }, + { + "epoch": 0.4875947622329428, + "grad_norm": 5.0853800773620605, + "learning_rate": 1.9763044438660606e-07, + "logits/chosen": -2.6298704147338867, + "logits/rejected": -2.61495304107666, + "logps/chosen": -84.22966766357422, + "logps/rejected": -95.0593032836914, + "loss": 0.6485, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.28729158639907837, + "rewards/margins": 0.11365656554698944, + "rewards/rejected": -0.40094810724258423, + "step": 2830 + }, + { + "epoch": 0.48931771192281187, + "grad_norm": 5.301225662231445, + "learning_rate": 1.9758686304265845e-07, + "logits/chosen": -2.7911629676818848, + "logits/rejected": -2.7801711559295654, + "logps/chosen": -83.1259994506836, + "logps/rejected": -89.50416564941406, + "loss": 0.6615, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3048972487449646, + "rewards/margins": 0.07634397596120834, + "rewards/rejected": -0.38124123215675354, + "step": 2840 + }, + { + "epoch": 0.4910406616126809, + "grad_norm": 6.339087963104248, + "learning_rate": 1.975428894577314e-07, + "logits/chosen": -2.749023675918579, + "logits/rejected": -2.730348825454712, + "logps/chosen": -84.1259994506836, + "logps/rejected": -97.1973876953125, + "loss": 0.6493, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.31879574060440063, + "rewards/margins": 0.10469119250774384, + "rewards/rejected": -0.42348694801330566, + "step": 2850 + }, + { + "epoch": 0.49276361130255, + "grad_norm": 5.268611431121826, + "learning_rate": 1.9749852380857247e-07, + "logits/chosen": -2.73268461227417, + "logits/rejected": -2.7092766761779785, + "logps/chosen": -85.26544952392578, + "logps/rejected": -92.43037414550781, + "loss": 0.6503, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3030092120170593, + "rewards/margins": 0.10151870548725128, + "rewards/rejected": -0.4045279622077942, + "step": 2860 + }, + { + "epoch": 0.494486560992419, + "grad_norm": 6.336197853088379, + "learning_rate": 1.9745376627350515e-07, + "logits/chosen": -2.770284652709961, + "logits/rejected": -2.7484183311462402, + "logps/chosen": -86.85330963134766, + "logps/rejected": -90.93848419189453, + "loss": 0.6643, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.30737534165382385, + "rewards/margins": 0.07163959741592407, + "rewards/rejected": -0.3790149688720703, + "step": 2870 + }, + { + "epoch": 0.4962095106822881, + "grad_norm": 7.067396640777588, + "learning_rate": 1.9740861703242797e-07, + "logits/chosen": -2.8400158882141113, + "logits/rejected": -2.814021587371826, + "logps/chosen": -87.88722229003906, + "logps/rejected": -92.84794616699219, + "loss": 0.6548, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.325391948223114, + "rewards/margins": 0.09382455050945282, + "rewards/rejected": -0.4192165434360504, + "step": 2880 + }, + { + "epoch": 0.49793246037215716, + "grad_norm": 5.568375587463379, + "learning_rate": 1.97363076266814e-07, + "logits/chosen": -2.847001075744629, + "logits/rejected": -2.841013193130493, + "logps/chosen": -83.87931823730469, + "logps/rejected": -94.69303131103516, + "loss": 0.6453, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.30921635031700134, + "rewards/margins": 0.11252041906118393, + "rewards/rejected": -0.42173680663108826, + "step": 2890 + }, + { + "epoch": 0.4996554100620262, + "grad_norm": 5.770374298095703, + "learning_rate": 1.9731714415970998e-07, + "logits/chosen": -2.7627999782562256, + "logits/rejected": -2.753859281539917, + "logps/chosen": -82.06367492675781, + "logps/rejected": -89.87281036376953, + "loss": 0.6576, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2957562506198883, + "rewards/margins": 0.08762596547603607, + "rewards/rejected": -0.3833822011947632, + "step": 2900 + }, + { + "epoch": 0.5013783597518953, + "grad_norm": 6.5320916175842285, + "learning_rate": 1.9727082089573552e-07, + "logits/chosen": -2.8038830757141113, + "logits/rejected": -2.7914395332336426, + "logps/chosen": -86.36532592773438, + "logps/rejected": -99.83525085449219, + "loss": 0.6332, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.30258145928382874, + "rewards/margins": 0.13676601648330688, + "rewards/rejected": -0.43934744596481323, + "step": 2910 + }, + { + "epoch": 0.5031013094417643, + "grad_norm": 6.383522987365723, + "learning_rate": 1.9722410666108251e-07, + "logits/chosen": -2.7484636306762695, + "logits/rejected": -2.744802474975586, + "logps/chosen": -87.0767593383789, + "logps/rejected": -100.16703033447266, + "loss": 0.6474, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.33506637811660767, + "rewards/margins": 0.11154337227344513, + "rewards/rejected": -0.4466097354888916, + "step": 2920 + }, + { + "epoch": 0.5048242591316333, + "grad_norm": 5.712911128997803, + "learning_rate": 1.9717700164351435e-07, + "logits/chosen": -2.7168633937835693, + "logits/rejected": -2.6899285316467285, + "logps/chosen": -85.99942779541016, + "logps/rejected": -95.53802490234375, + "loss": 0.6447, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.32548198103904724, + "rewards/margins": 0.11321850121021271, + "rewards/rejected": -0.43870049715042114, + "step": 2930 + }, + { + "epoch": 0.5065472088215024, + "grad_norm": 5.64019775390625, + "learning_rate": 1.9712950603236508e-07, + "logits/chosen": -2.745177745819092, + "logits/rejected": -2.7103383541107178, + "logps/chosen": -84.22530364990234, + "logps/rejected": -90.6386489868164, + "loss": 0.6651, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.32764965295791626, + "rewards/margins": 0.07088522613048553, + "rewards/rejected": -0.3985348641872406, + "step": 2940 + }, + { + "epoch": 0.5082701585113715, + "grad_norm": 6.2562994956970215, + "learning_rate": 1.9708162001853873e-07, + "logits/chosen": -2.7670552730560303, + "logits/rejected": -2.7497591972351074, + "logps/chosen": -86.6700668334961, + "logps/rejected": -98.61284637451172, + "loss": 0.6444, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.3234894871711731, + "rewards/margins": 0.11730514466762543, + "rewards/rejected": -0.44079461693763733, + "step": 2950 + }, + { + "epoch": 0.5099931082012406, + "grad_norm": 6.609861373901367, + "learning_rate": 1.9703334379450855e-07, + "logits/chosen": -2.7286181449890137, + "logits/rejected": -2.702838182449341, + "logps/chosen": -86.29169464111328, + "logps/rejected": -95.97723388671875, + "loss": 0.6496, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.32343727350234985, + "rewards/margins": 0.1173778548836708, + "rewards/rejected": -0.44081512093544006, + "step": 2960 + }, + { + "epoch": 0.5117160578911096, + "grad_norm": 6.852441310882568, + "learning_rate": 1.969846775543161e-07, + "logits/chosen": -2.6641182899475098, + "logits/rejected": -2.6382086277008057, + "logps/chosen": -91.28890228271484, + "logps/rejected": -96.90291595458984, + "loss": 0.6594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.360513836145401, + "rewards/margins": 0.08437645435333252, + "rewards/rejected": -0.44489026069641113, + "step": 2970 + }, + { + "epoch": 0.5134390075809786, + "grad_norm": 6.942179203033447, + "learning_rate": 1.9693562149357072e-07, + "logits/chosen": -2.6576151847839355, + "logits/rejected": -2.632829189300537, + "logps/chosen": -86.34416961669922, + "logps/rejected": -96.5618896484375, + "loss": 0.6436, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.324049711227417, + "rewards/margins": 0.1187606081366539, + "rewards/rejected": -0.4428102970123291, + "step": 2980 + }, + { + "epoch": 0.5151619572708477, + "grad_norm": 6.220181941986084, + "learning_rate": 1.9688617580944843e-07, + "logits/chosen": -2.702073812484741, + "logits/rejected": -2.6761860847473145, + "logps/chosen": -93.23859405517578, + "logps/rejected": -100.00730895996094, + "loss": 0.6582, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.366287499666214, + "rewards/margins": 0.08763855695724487, + "rewards/rejected": -0.4539260268211365, + "step": 2990 + }, + { + "epoch": 0.5168849069607168, + "grad_norm": 5.371936321258545, + "learning_rate": 1.9683634070069143e-07, + "logits/chosen": -2.7104570865631104, + "logits/rejected": -2.7045562267303467, + "logps/chosen": -85.96195983886719, + "logps/rejected": -97.77884674072266, + "loss": 0.6586, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.33949604630470276, + "rewards/margins": 0.09934855997562408, + "rewards/rejected": -0.43884459137916565, + "step": 3000 + }, + { + "epoch": 0.5186078566505858, + "grad_norm": 6.489460468292236, + "learning_rate": 1.967861163676071e-07, + "logits/chosen": -2.733281373977661, + "logits/rejected": -2.7114508152008057, + "logps/chosen": -92.49397277832031, + "logps/rejected": -98.18407440185547, + "loss": 0.662, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3730631172657013, + "rewards/margins": 0.07783212512731552, + "rewards/rejected": -0.4508952498435974, + "step": 3010 + }, + { + "epoch": 0.5203308063404548, + "grad_norm": 6.768235683441162, + "learning_rate": 1.9673550301206733e-07, + "logits/chosen": -2.808366060256958, + "logits/rejected": -2.778860092163086, + "logps/chosen": -93.14388275146484, + "logps/rejected": -99.8772201538086, + "loss": 0.6571, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.38905346393585205, + "rewards/margins": 0.10656271129846573, + "rewards/rejected": -0.4956161379814148, + "step": 3020 + }, + { + "epoch": 0.5220537560303239, + "grad_norm": 6.508683681488037, + "learning_rate": 1.9668450083750762e-07, + "logits/chosen": -2.727835178375244, + "logits/rejected": -2.7069993019104004, + "logps/chosen": -93.80626678466797, + "logps/rejected": -99.74931335449219, + "loss": 0.6678, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3813135027885437, + "rewards/margins": 0.06728534400463104, + "rewards/rejected": -0.4485989212989807, + "step": 3030 + }, + { + "epoch": 0.523776705720193, + "grad_norm": 6.18773078918457, + "learning_rate": 1.9663311004892628e-07, + "logits/chosen": -2.815324306488037, + "logits/rejected": -2.8085083961486816, + "logps/chosen": -87.60061645507812, + "logps/rejected": -101.3532485961914, + "loss": 0.6486, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3641282916069031, + "rewards/margins": 0.10624756664037704, + "rewards/rejected": -0.4703758656978607, + "step": 3040 + }, + { + "epoch": 0.525499655410062, + "grad_norm": 7.5851149559021, + "learning_rate": 1.9658133085288365e-07, + "logits/chosen": -2.6851487159729004, + "logits/rejected": -2.673161029815674, + "logps/chosen": -90.39569854736328, + "logps/rejected": -100.58292388916016, + "loss": 0.6598, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3683760166168213, + "rewards/margins": 0.08387802541255951, + "rewards/rejected": -0.4522539973258972, + "step": 3050 + }, + { + "epoch": 0.5272226050999311, + "grad_norm": 7.758167743682861, + "learning_rate": 1.965291634575011e-07, + "logits/chosen": -2.714484214782715, + "logits/rejected": -2.6961376667022705, + "logps/chosen": -92.89412689208984, + "logps/rejected": -101.2403564453125, + "loss": 0.6545, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.37218329310417175, + "rewards/margins": 0.0965455174446106, + "rewards/rejected": -0.46872884035110474, + "step": 3060 + }, + { + "epoch": 0.5289455547898001, + "grad_norm": 6.4621477127075195, + "learning_rate": 1.9647660807246063e-07, + "logits/chosen": -2.679758071899414, + "logits/rejected": -2.6543891429901123, + "logps/chosen": -95.65350341796875, + "logps/rejected": -102.36808013916016, + "loss": 0.6485, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.37710902094841003, + "rewards/margins": 0.10719014704227448, + "rewards/rejected": -0.4842991232872009, + "step": 3070 + }, + { + "epoch": 0.5306685044796692, + "grad_norm": 6.19595193862915, + "learning_rate": 1.9642366490900337e-07, + "logits/chosen": -2.5938596725463867, + "logits/rejected": -2.5772056579589844, + "logps/chosen": -91.1055908203125, + "logps/rejected": -105.87294006347656, + "loss": 0.6573, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4024714529514313, + "rewards/margins": 0.10824602842330933, + "rewards/rejected": -0.510717511177063, + "step": 3080 + }, + { + "epoch": 0.5323914541695383, + "grad_norm": 6.512645244598389, + "learning_rate": 1.9637033417992936e-07, + "logits/chosen": -2.6764976978302, + "logits/rejected": -2.6590423583984375, + "logps/chosen": -92.03446960449219, + "logps/rejected": -103.1094741821289, + "loss": 0.6477, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3842238485813141, + "rewards/margins": 0.10982821881771088, + "rewards/rejected": -0.49405208230018616, + "step": 3090 + }, + { + "epoch": 0.5341144038594073, + "grad_norm": 8.616137504577637, + "learning_rate": 1.9631661609959628e-07, + "logits/chosen": -2.6933939456939697, + "logits/rejected": -2.665048599243164, + "logps/chosen": -91.06763458251953, + "logps/rejected": -101.47669982910156, + "loss": 0.6419, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.37147217988967896, + "rewards/margins": 0.12758705019950867, + "rewards/rejected": -0.49905920028686523, + "step": 3100 + }, + { + "epoch": 0.5358373535492763, + "grad_norm": 7.095156192779541, + "learning_rate": 1.9626251088391876e-07, + "logits/chosen": -2.6438610553741455, + "logits/rejected": -2.652470350265503, + "logps/chosen": -92.75547790527344, + "logps/rejected": -106.07810974121094, + "loss": 0.6578, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4214300215244293, + "rewards/margins": 0.08987243473529816, + "rewards/rejected": -0.5113024711608887, + "step": 3110 + }, + { + "epoch": 0.5375603032391454, + "grad_norm": 6.420622825622559, + "learning_rate": 1.9620801875036753e-07, + "logits/chosen": -2.6833789348602295, + "logits/rejected": -2.6618340015411377, + "logps/chosen": -93.40895080566406, + "logps/rejected": -107.32088470458984, + "loss": 0.6337, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3739815950393677, + "rewards/margins": 0.14117519557476044, + "rewards/rejected": -0.5151568651199341, + "step": 3120 + }, + { + "epoch": 0.5392832529290145, + "grad_norm": 6.547939300537109, + "learning_rate": 1.9615313991796843e-07, + "logits/chosen": -2.6036171913146973, + "logits/rejected": -2.594822645187378, + "logps/chosen": -89.96730041503906, + "logps/rejected": -101.12642669677734, + "loss": 0.6542, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3788634240627289, + "rewards/margins": 0.09900254011154175, + "rewards/rejected": -0.477865993976593, + "step": 3130 + }, + { + "epoch": 0.5410062026188835, + "grad_norm": 7.179741859436035, + "learning_rate": 1.960978746073016e-07, + "logits/chosen": -2.701573133468628, + "logits/rejected": -2.687563419342041, + "logps/chosen": -94.88987731933594, + "logps/rejected": -110.26725769042969, + "loss": 0.6407, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.43192750215530396, + "rewards/margins": 0.1333298236131668, + "rewards/rejected": -0.5652573108673096, + "step": 3140 + }, + { + "epoch": 0.5427291523087526, + "grad_norm": 7.94010591506958, + "learning_rate": 1.9604222304050074e-07, + "logits/chosen": -2.7010350227355957, + "logits/rejected": -2.673696517944336, + "logps/chosen": -96.3143310546875, + "logps/rejected": -106.46876525878906, + "loss": 0.6503, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.4194537103176117, + "rewards/margins": 0.10921194404363632, + "rewards/rejected": -0.5286656618118286, + "step": 3150 + }, + { + "epoch": 0.5444521019986216, + "grad_norm": 8.393180847167969, + "learning_rate": 1.9598618544125184e-07, + "logits/chosen": -2.630312204360962, + "logits/rejected": -2.6040008068084717, + "logps/chosen": -95.65312957763672, + "logps/rejected": -107.04170989990234, + "loss": 0.6428, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.41506823897361755, + "rewards/margins": 0.12901504337787628, + "rewards/rejected": -0.544083297252655, + "step": 3160 + }, + { + "epoch": 0.5461750516884907, + "grad_norm": 6.8773908615112305, + "learning_rate": 1.9592976203479266e-07, + "logits/chosen": -2.6714940071105957, + "logits/rejected": -2.641033411026001, + "logps/chosen": -98.60674285888672, + "logps/rejected": -106.5346908569336, + "loss": 0.6407, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.42717328667640686, + "rewards/margins": 0.143313467502594, + "rewards/rejected": -0.5704867243766785, + "step": 3170 + }, + { + "epoch": 0.5478980013783598, + "grad_norm": 6.960276126861572, + "learning_rate": 1.9587295304791164e-07, + "logits/chosen": -2.725450038909912, + "logits/rejected": -2.690068244934082, + "logps/chosen": -95.82426452636719, + "logps/rejected": -105.81490325927734, + "loss": 0.6465, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.42567867040634155, + "rewards/margins": 0.11819012463092804, + "rewards/rejected": -0.5438688397407532, + "step": 3180 + }, + { + "epoch": 0.5496209510682288, + "grad_norm": 7.8112053871154785, + "learning_rate": 1.95815758708947e-07, + "logits/chosen": -2.7114315032958984, + "logits/rejected": -2.7045350074768066, + "logps/chosen": -93.44751739501953, + "logps/rejected": -117.18695068359375, + "loss": 0.6181, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.4114983081817627, + "rewards/margins": 0.19371816515922546, + "rewards/rejected": -0.6052165031433105, + "step": 3190 + }, + { + "epoch": 0.5513439007580979, + "grad_norm": 7.148016929626465, + "learning_rate": 1.957581792477859e-07, + "logits/chosen": -2.6281604766845703, + "logits/rejected": -2.6143136024475098, + "logps/chosen": -96.54183197021484, + "logps/rejected": -106.0041275024414, + "loss": 0.6437, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.4121316373348236, + "rewards/margins": 0.12658736109733582, + "rewards/rejected": -0.5387190580368042, + "step": 3200 + }, + { + "epoch": 0.5513439007580979, + "eval_logits/chosen": -2.760655403137207, + "eval_logits/rejected": -2.754596710205078, + "eval_logps/chosen": -94.27955627441406, + "eval_logps/rejected": -105.83222198486328, + "eval_loss": 0.6645947098731995, + "eval_rewards/accuracies": 0.6119888424873352, + "eval_rewards/chosen": -0.3556765913963318, + "eval_rewards/margins": 0.0708443745970726, + "eval_rewards/rejected": -0.426520973443985, + "eval_runtime": 382.9537, + "eval_samples_per_second": 11.239, + "eval_steps_per_second": 1.405, + "step": 3200 + }, + { + "epoch": 0.5530668504479669, + "grad_norm": 9.388651847839355, + "learning_rate": 1.9570021489586344e-07, + "logits/chosen": -2.5501980781555176, + "logits/rejected": -2.5273725986480713, + "logps/chosen": -99.22215270996094, + "logps/rejected": -108.93840026855469, + "loss": 0.6486, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4527076780796051, + "rewards/margins": 0.11170338094234467, + "rewards/rejected": -0.5644110441207886, + "step": 3210 + }, + { + "epoch": 0.554789800137836, + "grad_norm": 7.1553521156311035, + "learning_rate": 1.956418658861617e-07, + "logits/chosen": -2.5821588039398193, + "logits/rejected": -2.5769202709198, + "logps/chosen": -94.36402130126953, + "logps/rejected": -108.30653381347656, + "loss": 0.6499, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4402020573616028, + "rewards/margins": 0.11403479427099228, + "rewards/rejected": -0.5542367696762085, + "step": 3220 + }, + { + "epoch": 0.556512749827705, + "grad_norm": 8.118807792663574, + "learning_rate": 1.9558313245320888e-07, + "logits/chosen": -2.63601016998291, + "logits/rejected": -2.6221766471862793, + "logps/chosen": -98.5857162475586, + "logps/rejected": -110.01104736328125, + "loss": 0.6584, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4442833960056305, + "rewards/margins": 0.09203995019197464, + "rewards/rejected": -0.5363233685493469, + "step": 3230 + }, + { + "epoch": 0.5582356995175741, + "grad_norm": 7.089800834655762, + "learning_rate": 1.955240148330784e-07, + "logits/chosen": -2.7510554790496826, + "logits/rejected": -2.7306642532348633, + "logps/chosen": -105.6661376953125, + "logps/rejected": -110.5372543334961, + "loss": 0.6636, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4942947030067444, + "rewards/margins": 0.08564119040966034, + "rewards/rejected": -0.5799359083175659, + "step": 3240 + }, + { + "epoch": 0.5599586492074431, + "grad_norm": 8.295072555541992, + "learning_rate": 1.954645132633878e-07, + "logits/chosen": -2.637881278991699, + "logits/rejected": -2.6184113025665283, + "logps/chosen": -95.91987609863281, + "logps/rejected": -110.17707824707031, + "loss": 0.6399, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4334731101989746, + "rewards/margins": 0.13521215319633484, + "rewards/rejected": -0.5686852335929871, + "step": 3250 + }, + { + "epoch": 0.5616815988973122, + "grad_norm": 7.124377727508545, + "learning_rate": 1.9540462798329788e-07, + "logits/chosen": -2.655236005783081, + "logits/rejected": -2.6376397609710693, + "logps/chosen": -94.26342010498047, + "logps/rejected": -111.944091796875, + "loss": 0.6335, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.42727988958358765, + "rewards/margins": 0.16477133333683014, + "rewards/rejected": -0.5920512080192566, + "step": 3260 + }, + { + "epoch": 0.5634045485871813, + "grad_norm": 7.825984477996826, + "learning_rate": 1.953443592335118e-07, + "logits/chosen": -2.70601224899292, + "logits/rejected": -2.7016124725341797, + "logps/chosen": -102.71690368652344, + "logps/rejected": -119.76414489746094, + "loss": 0.6376, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.47587305307388306, + "rewards/margins": 0.1428886353969574, + "rewards/rejected": -0.6187616586685181, + "step": 3270 + }, + { + "epoch": 0.5651274982770503, + "grad_norm": 8.646442413330078, + "learning_rate": 1.9528370725627393e-07, + "logits/chosen": -2.702390670776367, + "logits/rejected": -2.6908962726593018, + "logps/chosen": -94.289306640625, + "logps/rejected": -109.12205505371094, + "loss": 0.6474, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.44868603348731995, + "rewards/margins": 0.11855638027191162, + "rewards/rejected": -0.567242443561554, + "step": 3280 + }, + { + "epoch": 0.5668504479669194, + "grad_norm": 7.565232276916504, + "learning_rate": 1.9522267229536907e-07, + "logits/chosen": -2.7394003868103027, + "logits/rejected": -2.709407091140747, + "logps/chosen": -98.76429748535156, + "logps/rejected": -113.52610778808594, + "loss": 0.6365, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.45598071813583374, + "rewards/margins": 0.14605209231376648, + "rewards/rejected": -0.6020327806472778, + "step": 3290 + }, + { + "epoch": 0.5685733976567884, + "grad_norm": 8.72111701965332, + "learning_rate": 1.9516125459612133e-07, + "logits/chosen": -2.6218557357788086, + "logits/rejected": -2.6021950244903564, + "logps/chosen": -105.73250579833984, + "logps/rejected": -115.9620590209961, + "loss": 0.6523, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5081601738929749, + "rewards/margins": 0.11569005250930786, + "rewards/rejected": -0.6238502264022827, + "step": 3300 + }, + { + "epoch": 0.5702963473466575, + "grad_norm": 7.255007743835449, + "learning_rate": 1.9509945440539328e-07, + "logits/chosen": -2.556091070175171, + "logits/rejected": -2.5282187461853027, + "logps/chosen": -100.8625717163086, + "logps/rejected": -112.12745666503906, + "loss": 0.6411, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.47763776779174805, + "rewards/margins": 0.13664332032203674, + "rewards/rejected": -0.6142809987068176, + "step": 3310 + }, + { + "epoch": 0.5720192970365265, + "grad_norm": 9.09738826751709, + "learning_rate": 1.9503727197158475e-07, + "logits/chosen": -2.612281560897827, + "logits/rejected": -2.5788486003875732, + "logps/chosen": -102.8791275024414, + "logps/rejected": -110.78157806396484, + "loss": 0.6458, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4917148947715759, + "rewards/margins": 0.12664994597434998, + "rewards/rejected": -0.6183648705482483, + "step": 3320 + }, + { + "epoch": 0.5737422467263956, + "grad_norm": 7.518438816070557, + "learning_rate": 1.949747075446321e-07, + "logits/chosen": -2.7091097831726074, + "logits/rejected": -2.6815297603607178, + "logps/chosen": -99.87444305419922, + "logps/rejected": -116.1915512084961, + "loss": 0.6286, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4566105008125305, + "rewards/margins": 0.16854511201381683, + "rewards/rejected": -0.6251556873321533, + "step": 3330 + }, + { + "epoch": 0.5754651964162646, + "grad_norm": 9.097807884216309, + "learning_rate": 1.9491176137600695e-07, + "logits/chosen": -2.6520981788635254, + "logits/rejected": -2.6339221000671387, + "logps/chosen": -106.4759521484375, + "logps/rejected": -119.20014953613281, + "loss": 0.637, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5056281685829163, + "rewards/margins": 0.1465793251991272, + "rewards/rejected": -0.6522074341773987, + "step": 3340 + }, + { + "epoch": 0.5771881461061337, + "grad_norm": 8.62578010559082, + "learning_rate": 1.9484843371871538e-07, + "logits/chosen": -2.545076370239258, + "logits/rejected": -2.530463933944702, + "logps/chosen": -105.77069091796875, + "logps/rejected": -119.581787109375, + "loss": 0.6471, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.49992021918296814, + "rewards/margins": 0.12265481799840927, + "rewards/rejected": -0.6225749850273132, + "step": 3350 + }, + { + "epoch": 0.5789110957960028, + "grad_norm": 7.896541118621826, + "learning_rate": 1.9478472482729677e-07, + "logits/chosen": -2.649341106414795, + "logits/rejected": -2.618516445159912, + "logps/chosen": -102.70530700683594, + "logps/rejected": -112.48616790771484, + "loss": 0.6476, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4789007604122162, + "rewards/margins": 0.12364625930786133, + "rewards/rejected": -0.6025470495223999, + "step": 3360 + }, + { + "epoch": 0.5806340454858718, + "grad_norm": 8.509905815124512, + "learning_rate": 1.947206349578229e-07, + "logits/chosen": -2.603302240371704, + "logits/rejected": -2.594250202178955, + "logps/chosen": -95.80339813232422, + "logps/rejected": -114.97441101074219, + "loss": 0.6297, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.44848114252090454, + "rewards/margins": 0.1611265242099762, + "rewards/rejected": -0.6096076369285583, + "step": 3370 + }, + { + "epoch": 0.5823569951757409, + "grad_norm": 8.58442497253418, + "learning_rate": 1.9465616436789683e-07, + "logits/chosen": -2.685575485229492, + "logits/rejected": -2.6578125953674316, + "logps/chosen": -100.21031951904297, + "logps/rejected": -107.1775131225586, + "loss": 0.6475, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.44749441742897034, + "rewards/margins": 0.12284334003925323, + "rewards/rejected": -0.57033771276474, + "step": 3380 + }, + { + "epoch": 0.5840799448656099, + "grad_norm": 8.441810607910156, + "learning_rate": 1.9459131331665183e-07, + "logits/chosen": -2.606628656387329, + "logits/rejected": -2.582200765609741, + "logps/chosen": -104.09676361083984, + "logps/rejected": -112.6886215209961, + "loss": 0.6599, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5036908388137817, + "rewards/margins": 0.10885962098836899, + "rewards/rejected": -0.6125504970550537, + "step": 3390 + }, + { + "epoch": 0.585802894555479, + "grad_norm": 7.212275981903076, + "learning_rate": 1.9452608206475044e-07, + "logits/chosen": -2.643218755722046, + "logits/rejected": -2.6058707237243652, + "logps/chosen": -99.18804168701172, + "logps/rejected": -114.4738540649414, + "loss": 0.6285, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.46331319212913513, + "rewards/margins": 0.16757920384407043, + "rewards/rejected": -0.6308923959732056, + "step": 3400 + }, + { + "epoch": 0.587525844245348, + "grad_norm": 10.034046173095703, + "learning_rate": 1.9446047087438342e-07, + "logits/chosen": -2.543471097946167, + "logits/rejected": -2.515993118286133, + "logps/chosen": -92.6207275390625, + "logps/rejected": -103.92476654052734, + "loss": 0.6382, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.4304746985435486, + "rewards/margins": 0.13809053599834442, + "rewards/rejected": -0.5685652494430542, + "step": 3410 + }, + { + "epoch": 0.5892487939352171, + "grad_norm": 9.963957786560059, + "learning_rate": 1.9439448000926859e-07, + "logits/chosen": -2.567805528640747, + "logits/rejected": -2.5440430641174316, + "logps/chosen": -96.545166015625, + "logps/rejected": -112.3097915649414, + "loss": 0.6323, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.43777838349342346, + "rewards/margins": 0.15233097970485687, + "rewards/rejected": -0.5901094675064087, + "step": 3420 + }, + { + "epoch": 0.5909717436250862, + "grad_norm": 7.235942363739014, + "learning_rate": 1.9432810973464988e-07, + "logits/chosen": -2.6433088779449463, + "logits/rejected": -2.6230881214141846, + "logps/chosen": -97.69755554199219, + "logps/rejected": -115.63801574707031, + "loss": 0.6353, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.46356654167175293, + "rewards/margins": 0.14649716019630432, + "rewards/rejected": -0.6100637316703796, + "step": 3430 + }, + { + "epoch": 0.5926946933149552, + "grad_norm": 10.129012107849121, + "learning_rate": 1.942613603172962e-07, + "logits/chosen": -2.561520576477051, + "logits/rejected": -2.5459980964660645, + "logps/chosen": -104.91939544677734, + "logps/rejected": -118.30328369140625, + "loss": 0.6436, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5234652161598206, + "rewards/margins": 0.13155731558799744, + "rewards/rejected": -0.6550225019454956, + "step": 3440 + }, + { + "epoch": 0.5944176430048242, + "grad_norm": 10.185026168823242, + "learning_rate": 1.9419423202550037e-07, + "logits/chosen": -2.780900478363037, + "logits/rejected": -2.7437312602996826, + "logps/chosen": -112.07551574707031, + "logps/rejected": -121.580810546875, + "loss": 0.643, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5269603133201599, + "rewards/margins": 0.13937778770923615, + "rewards/rejected": -0.6663380861282349, + "step": 3450 + }, + { + "epoch": 0.5961405926946933, + "grad_norm": 9.492935180664062, + "learning_rate": 1.9412672512907812e-07, + "logits/chosen": -2.5769448280334473, + "logits/rejected": -2.5640530586242676, + "logps/chosen": -98.23934173583984, + "logps/rejected": -112.2879409790039, + "loss": 0.6454, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.460599422454834, + "rewards/margins": 0.12627866864204407, + "rewards/rejected": -0.5868780612945557, + "step": 3460 + }, + { + "epoch": 0.5978635423845624, + "grad_norm": 8.52971363067627, + "learning_rate": 1.940588398993669e-07, + "logits/chosen": -2.652571439743042, + "logits/rejected": -2.6381473541259766, + "logps/chosen": -107.09454345703125, + "logps/rejected": -122.2138900756836, + "loss": 0.6393, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5128433704376221, + "rewards/margins": 0.1659374237060547, + "rewards/rejected": -0.6787808537483215, + "step": 3470 + }, + { + "epoch": 0.5995864920744314, + "grad_norm": 8.951239585876465, + "learning_rate": 1.9399057660922482e-07, + "logits/chosen": -2.583982467651367, + "logits/rejected": -2.5551466941833496, + "logps/chosen": -109.26493072509766, + "logps/rejected": -121.90692138671875, + "loss": 0.6445, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5415034294128418, + "rewards/margins": 0.15873458981513977, + "rewards/rejected": -0.7002379298210144, + "step": 3480 + }, + { + "epoch": 0.6013094417643005, + "grad_norm": 12.640382766723633, + "learning_rate": 1.939219355330296e-07, + "logits/chosen": -2.5890984535217285, + "logits/rejected": -2.5623886585235596, + "logps/chosen": -108.95796203613281, + "logps/rejected": -125.61396789550781, + "loss": 0.631, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5468857884407043, + "rewards/margins": 0.164025217294693, + "rewards/rejected": -0.7109109163284302, + "step": 3490 + }, + { + "epoch": 0.6030323914541695, + "grad_norm": 9.470446586608887, + "learning_rate": 1.9385291694667742e-07, + "logits/chosen": -2.538329601287842, + "logits/rejected": -2.531400442123413, + "logps/chosen": -106.4929428100586, + "logps/rejected": -120.96571350097656, + "loss": 0.6506, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5449027419090271, + "rewards/margins": 0.11813728511333466, + "rewards/rejected": -0.663040041923523, + "step": 3500 + }, + { + "epoch": 0.6047553411440386, + "grad_norm": 7.284107685089111, + "learning_rate": 1.9378352112758182e-07, + "logits/chosen": -2.47944974899292, + "logits/rejected": -2.4565722942352295, + "logps/chosen": -108.488525390625, + "logps/rejected": -117.57743835449219, + "loss": 0.6528, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5492126941680908, + "rewards/margins": 0.11511759459972382, + "rewards/rejected": -0.6643303036689758, + "step": 3510 + }, + { + "epoch": 0.6064782908339077, + "grad_norm": 9.556658744812012, + "learning_rate": 1.937137483546726e-07, + "logits/chosen": -2.514282464981079, + "logits/rejected": -2.495223045349121, + "logps/chosen": -107.26896667480469, + "logps/rejected": -116.1639404296875, + "loss": 0.6492, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5193467140197754, + "rewards/margins": 0.12163758277893066, + "rewards/rejected": -0.6409842371940613, + "step": 3520 + }, + { + "epoch": 0.6082012405237767, + "grad_norm": 10.154839515686035, + "learning_rate": 1.936435989083947e-07, + "logits/chosen": -2.6612658500671387, + "logits/rejected": -2.6214406490325928, + "logps/chosen": -107.97154235839844, + "logps/rejected": -120.12506103515625, + "loss": 0.6297, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5201407670974731, + "rewards/margins": 0.171969935297966, + "rewards/rejected": -0.6921107172966003, + "step": 3530 + }, + { + "epoch": 0.6099241902136457, + "grad_norm": 9.493666648864746, + "learning_rate": 1.9357307307070706e-07, + "logits/chosen": -2.5017638206481934, + "logits/rejected": -2.480271816253662, + "logps/chosen": -107.59370422363281, + "logps/rejected": -121.32795715332031, + "loss": 0.6428, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5368096828460693, + "rewards/margins": 0.14079925417900085, + "rewards/rejected": -0.677608847618103, + "step": 3540 + }, + { + "epoch": 0.6116471399035148, + "grad_norm": 9.391371726989746, + "learning_rate": 1.9350217112508145e-07, + "logits/chosen": -2.5833280086517334, + "logits/rejected": -2.5742995738983154, + "logps/chosen": -112.63236999511719, + "logps/rejected": -122.10465240478516, + "loss": 0.6552, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5806922912597656, + "rewards/margins": 0.11484189331531525, + "rewards/rejected": -0.6955341696739197, + "step": 3550 + }, + { + "epoch": 0.6133700895933839, + "grad_norm": 8.803918838500977, + "learning_rate": 1.934308933565014e-07, + "logits/chosen": -2.5108482837677, + "logits/rejected": -2.486712694168091, + "logps/chosen": -109.10555267333984, + "logps/rejected": -121.26512145996094, + "loss": 0.65, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5620559453964233, + "rewards/margins": 0.125113382935524, + "rewards/rejected": -0.6871693134307861, + "step": 3560 + }, + { + "epoch": 0.6150930392832529, + "grad_norm": 10.215635299682617, + "learning_rate": 1.9335924005146106e-07, + "logits/chosen": -2.635714530944824, + "logits/rejected": -2.6039419174194336, + "logps/chosen": -117.2509765625, + "logps/rejected": -122.74668884277344, + "loss": 0.6779, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.603782594203949, + "rewards/margins": 0.09146207571029663, + "rewards/rejected": -0.6952446699142456, + "step": 3570 + }, + { + "epoch": 0.616815988973122, + "grad_norm": 8.089561462402344, + "learning_rate": 1.9328721149796392e-07, + "logits/chosen": -2.5828123092651367, + "logits/rejected": -2.5639376640319824, + "logps/chosen": -112.38520812988281, + "logps/rejected": -126.69588470458984, + "loss": 0.6375, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5419041514396667, + "rewards/margins": 0.150339737534523, + "rewards/rejected": -0.6922438740730286, + "step": 3580 + }, + { + "epoch": 0.618538938662991, + "grad_norm": 8.862298011779785, + "learning_rate": 1.9321480798552184e-07, + "logits/chosen": -2.5745413303375244, + "logits/rejected": -2.5591864585876465, + "logps/chosen": -113.92549133300781, + "logps/rejected": -127.76713562011719, + "loss": 0.6494, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5950049161911011, + "rewards/margins": 0.14280660450458527, + "rewards/rejected": -0.7378115653991699, + "step": 3590 + }, + { + "epoch": 0.6202618883528601, + "grad_norm": 10.446368217468262, + "learning_rate": 1.9314202980515378e-07, + "logits/chosen": -2.5526838302612305, + "logits/rejected": -2.5284199714660645, + "logps/chosen": -107.75823974609375, + "logps/rejected": -116.94807434082031, + "loss": 0.6516, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5311118364334106, + "rewards/margins": 0.12276077270507812, + "rewards/rejected": -0.653872549533844, + "step": 3600 + }, + { + "epoch": 0.6202618883528601, + "eval_logits/chosen": -2.6612462997436523, + "eval_logits/rejected": -2.6547348499298096, + "eval_logps/chosen": -99.96431732177734, + "eval_logps/rejected": -112.99539184570312, + "eval_loss": 0.6601956486701965, + "eval_rewards/accuracies": 0.6177973747253418, + "eval_rewards/chosen": -0.4125242233276367, + "eval_rewards/margins": 0.0856284648180008, + "eval_rewards/rejected": -0.4981527328491211, + "eval_runtime": 382.6214, + "eval_samples_per_second": 11.249, + "eval_steps_per_second": 1.406, + "step": 3600 + }, + { + "epoch": 0.6219848380427292, + "grad_norm": 8.14836597442627, + "learning_rate": 1.9306887724938452e-07, + "logits/chosen": -2.520163059234619, + "logits/rejected": -2.5044562816619873, + "logps/chosen": -107.54792785644531, + "logps/rejected": -117.61376953125, + "loss": 0.6539, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5605472922325134, + "rewards/margins": 0.12117090076208115, + "rewards/rejected": -0.6817181706428528, + "step": 3610 + }, + { + "epoch": 0.6237077877325982, + "grad_norm": 9.616267204284668, + "learning_rate": 1.929953506122438e-07, + "logits/chosen": -2.4756884574890137, + "logits/rejected": -2.4509835243225098, + "logps/chosen": -101.85519409179688, + "logps/rejected": -119.7405014038086, + "loss": 0.6128, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.4634733200073242, + "rewards/margins": 0.197960764169693, + "rewards/rejected": -0.6614340543746948, + "step": 3620 + }, + { + "epoch": 0.6254307374224672, + "grad_norm": 9.02615737915039, + "learning_rate": 1.9292145018926478e-07, + "logits/chosen": -2.6216444969177246, + "logits/rejected": -2.614626407623291, + "logps/chosen": -102.73152923583984, + "logps/rejected": -127.5108871459961, + "loss": 0.6145, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5159082412719727, + "rewards/margins": 0.2023492306470871, + "rewards/rejected": -0.718257486820221, + "step": 3630 + }, + { + "epoch": 0.6271536871123363, + "grad_norm": 10.349617004394531, + "learning_rate": 1.9284717627748308e-07, + "logits/chosen": -2.575312376022339, + "logits/rejected": -2.561131000518799, + "logps/chosen": -104.86790466308594, + "logps/rejected": -123.4863052368164, + "loss": 0.6254, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.49824801087379456, + "rewards/margins": 0.17481979727745056, + "rewards/rejected": -0.6730678081512451, + "step": 3640 + }, + { + "epoch": 0.6288766368022054, + "grad_norm": 11.060629844665527, + "learning_rate": 1.9277252917543557e-07, + "logits/chosen": -2.5221903324127197, + "logits/rejected": -2.524599552154541, + "logps/chosen": -102.74385833740234, + "logps/rejected": -124.79080963134766, + "loss": 0.6239, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5166233777999878, + "rewards/margins": 0.18595512211322784, + "rewards/rejected": -0.7025784850120544, + "step": 3650 + }, + { + "epoch": 0.6305995864920745, + "grad_norm": 9.572096824645996, + "learning_rate": 1.92697509183159e-07, + "logits/chosen": -2.557387351989746, + "logits/rejected": -2.5276927947998047, + "logps/chosen": -113.19233703613281, + "logps/rejected": -132.50997924804688, + "loss": 0.6212, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5933358073234558, + "rewards/margins": 0.19637107849121094, + "rewards/rejected": -0.7897068858146667, + "step": 3660 + }, + { + "epoch": 0.6323225361819435, + "grad_norm": 10.522073745727539, + "learning_rate": 1.926221166021891e-07, + "logits/chosen": -2.5148067474365234, + "logits/rejected": -2.4904632568359375, + "logps/chosen": -114.75852966308594, + "logps/rejected": -127.93223571777344, + "loss": 0.6534, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6190735697746277, + "rewards/margins": 0.13477261364459991, + "rewards/rejected": -0.7538461089134216, + "step": 3670 + }, + { + "epoch": 0.6340454858718125, + "grad_norm": 9.894550323486328, + "learning_rate": 1.9254635173555895e-07, + "logits/chosen": -2.541055679321289, + "logits/rejected": -2.5070159435272217, + "logps/chosen": -116.0576400756836, + "logps/rejected": -125.56229400634766, + "loss": 0.6492, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6138181686401367, + "rewards/margins": 0.14079603552818298, + "rewards/rejected": -0.7546142339706421, + "step": 3680 + }, + { + "epoch": 0.6357684355616816, + "grad_norm": 11.63806438446045, + "learning_rate": 1.9247021488779817e-07, + "logits/chosen": -2.482922077178955, + "logits/rejected": -2.4876933097839355, + "logps/chosen": -107.5826416015625, + "logps/rejected": -136.26242065429688, + "loss": 0.6146, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5548892021179199, + "rewards/margins": 0.21679215133190155, + "rewards/rejected": -0.7716813087463379, + "step": 3690 + }, + { + "epoch": 0.6374913852515507, + "grad_norm": 10.26459789276123, + "learning_rate": 1.923937063649315e-07, + "logits/chosen": -2.519047260284424, + "logits/rejected": -2.4944159984588623, + "logps/chosen": -115.27989196777344, + "logps/rejected": -134.55308532714844, + "loss": 0.6318, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5768235921859741, + "rewards/margins": 0.17483854293823242, + "rewards/rejected": -0.7516621351242065, + "step": 3700 + }, + { + "epoch": 0.6392143349414197, + "grad_norm": 10.61504077911377, + "learning_rate": 1.9231682647447757e-07, + "logits/chosen": -2.5426928997039795, + "logits/rejected": -2.5145814418792725, + "logps/chosen": -113.14909362792969, + "logps/rejected": -125.3436050415039, + "loss": 0.644, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5928040742874146, + "rewards/margins": 0.1502991020679474, + "rewards/rejected": -0.7431031465530396, + "step": 3710 + }, + { + "epoch": 0.6409372846312887, + "grad_norm": 8.570870399475098, + "learning_rate": 1.9223957552544762e-07, + "logits/chosen": -2.553067207336426, + "logits/rejected": -2.5354630947113037, + "logps/chosen": -106.05216979980469, + "logps/rejected": -128.2598419189453, + "loss": 0.6134, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5415540933609009, + "rewards/margins": 0.20719122886657715, + "rewards/rejected": -0.748745322227478, + "step": 3720 + }, + { + "epoch": 0.6426602343211578, + "grad_norm": 10.781991958618164, + "learning_rate": 1.9216195382834445e-07, + "logits/chosen": -2.514934539794922, + "logits/rejected": -2.4916977882385254, + "logps/chosen": -113.71707916259766, + "logps/rejected": -129.2454833984375, + "loss": 0.6254, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5919945240020752, + "rewards/margins": 0.19340413808822632, + "rewards/rejected": -0.7853986024856567, + "step": 3730 + }, + { + "epoch": 0.6443831840110269, + "grad_norm": 11.913883209228516, + "learning_rate": 1.9208396169516092e-07, + "logits/chosen": -2.5093138217926025, + "logits/rejected": -2.484928607940674, + "logps/chosen": -113.5191650390625, + "logps/rejected": -135.16090393066406, + "loss": 0.6264, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.600841224193573, + "rewards/margins": 0.18667316436767578, + "rewards/rejected": -0.7875143885612488, + "step": 3740 + }, + { + "epoch": 0.646106133700896, + "grad_norm": 9.88403606414795, + "learning_rate": 1.9200559943937895e-07, + "logits/chosen": -2.5667591094970703, + "logits/rejected": -2.5409789085388184, + "logps/chosen": -115.15147399902344, + "logps/rejected": -131.19515991210938, + "loss": 0.6395, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6183281540870667, + "rewards/margins": 0.16283147037029266, + "rewards/rejected": -0.7811595797538757, + "step": 3750 + }, + { + "epoch": 0.647829083390765, + "grad_norm": 14.569170951843262, + "learning_rate": 1.91926867375968e-07, + "logits/chosen": -2.5101914405822754, + "logits/rejected": -2.5057315826416016, + "logps/chosen": -124.47206115722656, + "logps/rejected": -136.46078491210938, + "loss": 0.6619, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6859841346740723, + "rewards/margins": 0.11802731454372406, + "rewards/rejected": -0.8040115237236023, + "step": 3760 + }, + { + "epoch": 0.649552033080634, + "grad_norm": 11.981207847595215, + "learning_rate": 1.9184776582138408e-07, + "logits/chosen": -2.497622013092041, + "logits/rejected": -2.4783318042755127, + "logps/chosen": -116.83412170410156, + "logps/rejected": -131.28945922851562, + "loss": 0.6503, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6329242587089539, + "rewards/margins": 0.13447043299674988, + "rewards/rejected": -0.7673946619033813, + "step": 3770 + }, + { + "epoch": 0.6512749827705031, + "grad_norm": 8.799779891967773, + "learning_rate": 1.9176829509356817e-07, + "logits/chosen": -2.5327377319335938, + "logits/rejected": -2.4975881576538086, + "logps/chosen": -116.349853515625, + "logps/rejected": -129.2434844970703, + "loss": 0.6319, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6227124929428101, + "rewards/margins": 0.17571154236793518, + "rewards/rejected": -0.7984240651130676, + "step": 3780 + }, + { + "epoch": 0.6529979324603722, + "grad_norm": 9.702862739562988, + "learning_rate": 1.9168845551194526e-07, + "logits/chosen": -2.4920172691345215, + "logits/rejected": -2.465822219848633, + "logps/chosen": -113.99241638183594, + "logps/rejected": -130.11317443847656, + "loss": 0.6261, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6000428199768066, + "rewards/margins": 0.1911824643611908, + "rewards/rejected": -0.7912253737449646, + "step": 3790 + }, + { + "epoch": 0.6547208821502413, + "grad_norm": 14.17463207244873, + "learning_rate": 1.916082473974228e-07, + "logits/chosen": -2.5084242820739746, + "logits/rejected": -2.4781010150909424, + "logps/chosen": -117.53678894042969, + "logps/rejected": -131.0016326904297, + "loss": 0.6248, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5984351634979248, + "rewards/margins": 0.18290486931800842, + "rewards/rejected": -0.7813400030136108, + "step": 3800 + }, + { + "epoch": 0.6564438318401102, + "grad_norm": 14.620065689086914, + "learning_rate": 1.9152767107238957e-07, + "logits/chosen": -2.520502805709839, + "logits/rejected": -2.48211407661438, + "logps/chosen": -124.7673110961914, + "logps/rejected": -135.38389587402344, + "loss": 0.6401, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6729920506477356, + "rewards/margins": 0.16020779311656952, + "rewards/rejected": -0.8331997990608215, + "step": 3810 + }, + { + "epoch": 0.6581667815299793, + "grad_norm": 10.79979419708252, + "learning_rate": 1.9144672686071437e-07, + "logits/chosen": -2.4938557147979736, + "logits/rejected": -2.4761714935302734, + "logps/chosen": -115.97245025634766, + "logps/rejected": -133.52169799804688, + "loss": 0.6249, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6187423467636108, + "rewards/margins": 0.1855977475643158, + "rewards/rejected": -0.8043401837348938, + "step": 3820 + }, + { + "epoch": 0.6598897312198484, + "grad_norm": 12.743062973022461, + "learning_rate": 1.913654150877446e-07, + "logits/chosen": -2.5338752269744873, + "logits/rejected": -2.492554187774658, + "logps/chosen": -125.34283447265625, + "logps/rejected": -134.8113250732422, + "loss": 0.6326, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6666573286056519, + "rewards/margins": 0.16785266995429993, + "rewards/rejected": -0.8345099687576294, + "step": 3830 + }, + { + "epoch": 0.6616126809097175, + "grad_norm": 9.466545104980469, + "learning_rate": 1.9128373608030513e-07, + "logits/chosen": -2.4681429862976074, + "logits/rejected": -2.4535365104675293, + "logps/chosen": -116.46714782714844, + "logps/rejected": -146.2787322998047, + "loss": 0.6043, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6238076090812683, + "rewards/margins": 0.24866314232349396, + "rewards/rejected": -0.8724706768989563, + "step": 3840 + }, + { + "epoch": 0.6633356305995864, + "grad_norm": 10.31405258178711, + "learning_rate": 1.9120169016669683e-07, + "logits/chosen": -2.5796279907226562, + "logits/rejected": -2.564216136932373, + "logps/chosen": -122.28788757324219, + "logps/rejected": -137.9895782470703, + "loss": 0.624, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6498677134513855, + "rewards/margins": 0.19236890971660614, + "rewards/rejected": -0.8422366380691528, + "step": 3850 + }, + { + "epoch": 0.6650585802894555, + "grad_norm": 12.240978240966797, + "learning_rate": 1.9111927767669531e-07, + "logits/chosen": -2.577061176300049, + "logits/rejected": -2.553990125656128, + "logps/chosen": -124.99577331542969, + "logps/rejected": -136.03915405273438, + "loss": 0.6561, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.7081534266471863, + "rewards/margins": 0.12281046062707901, + "rewards/rejected": -0.8309639096260071, + "step": 3860 + }, + { + "epoch": 0.6667815299793246, + "grad_norm": 14.770833015441895, + "learning_rate": 1.9103649894154965e-07, + "logits/chosen": -2.467924118041992, + "logits/rejected": -2.4463753700256348, + "logps/chosen": -125.22386169433594, + "logps/rejected": -144.5591278076172, + "loss": 0.6129, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.663710355758667, + "rewards/margins": 0.22524094581604004, + "rewards/rejected": -0.8889514207839966, + "step": 3870 + }, + { + "epoch": 0.6685044796691937, + "grad_norm": 11.515402793884277, + "learning_rate": 1.90953354293981e-07, + "logits/chosen": -2.509796142578125, + "logits/rejected": -2.514373540878296, + "logps/chosen": -122.02311706542969, + "logps/rejected": -137.66896057128906, + "loss": 0.6515, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.711441159248352, + "rewards/margins": 0.13033203780651093, + "rewards/rejected": -0.8417732119560242, + "step": 3880 + }, + { + "epoch": 0.6702274293590628, + "grad_norm": 10.92282772064209, + "learning_rate": 1.908698440681812e-07, + "logits/chosen": -2.5480504035949707, + "logits/rejected": -2.524164915084839, + "logps/chosen": -118.02398681640625, + "logps/rejected": -135.6583251953125, + "loss": 0.6239, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6383758783340454, + "rewards/margins": 0.196736142039299, + "rewards/rejected": -0.8351120948791504, + "step": 3890 + }, + { + "epoch": 0.6719503790489317, + "grad_norm": 11.879098892211914, + "learning_rate": 1.9078596859981163e-07, + "logits/chosen": -2.5341193675994873, + "logits/rejected": -2.484837532043457, + "logps/chosen": -123.85850524902344, + "logps/rejected": -134.4687042236328, + "loss": 0.6352, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6702991724014282, + "rewards/margins": 0.1717648208141327, + "rewards/rejected": -0.8420640230178833, + "step": 3900 + }, + { + "epoch": 0.6736733287388008, + "grad_norm": 10.429671287536621, + "learning_rate": 1.9070172822600152e-07, + "logits/chosen": -2.5086872577667236, + "logits/rejected": -2.4919564723968506, + "logps/chosen": -124.47847747802734, + "logps/rejected": -147.82156372070312, + "loss": 0.6062, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6883711814880371, + "rewards/margins": 0.24378545582294464, + "rewards/rejected": -0.9321566820144653, + "step": 3910 + }, + { + "epoch": 0.6753962784286699, + "grad_norm": 10.995051383972168, + "learning_rate": 1.90617123285347e-07, + "logits/chosen": -2.5535805225372314, + "logits/rejected": -2.5131783485412598, + "logps/chosen": -124.58341979980469, + "logps/rejected": -136.53872680664062, + "loss": 0.625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6591401100158691, + "rewards/margins": 0.20176732540130615, + "rewards/rejected": -0.8609074354171753, + "step": 3920 + }, + { + "epoch": 0.677119228118539, + "grad_norm": 14.10901927947998, + "learning_rate": 1.9053215411790945e-07, + "logits/chosen": -2.5570924282073975, + "logits/rejected": -2.551347017288208, + "logps/chosen": -119.05476379394531, + "logps/rejected": -138.29513549804688, + "loss": 0.6376, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6779693961143494, + "rewards/margins": 0.17716078460216522, + "rewards/rejected": -0.8551301956176758, + "step": 3930 + }, + { + "epoch": 0.6788421778084079, + "grad_norm": 13.000458717346191, + "learning_rate": 1.9044682106521428e-07, + "logits/chosen": -2.404637336730957, + "logits/rejected": -2.387035846710205, + "logps/chosen": -117.692626953125, + "logps/rejected": -136.10415649414062, + "loss": 0.6281, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6419261693954468, + "rewards/margins": 0.1821344792842865, + "rewards/rejected": -0.8240607380867004, + "step": 3940 + }, + { + "epoch": 0.680565127498277, + "grad_norm": 11.085746765136719, + "learning_rate": 1.903611244702494e-07, + "logits/chosen": -2.3949618339538574, + "logits/rejected": -2.3535492420196533, + "logps/chosen": -121.14100646972656, + "logps/rejected": -135.42344665527344, + "loss": 0.6269, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6566764712333679, + "rewards/margins": 0.1880909651517868, + "rewards/rejected": -0.8447673916816711, + "step": 3950 + }, + { + "epoch": 0.6822880771881461, + "grad_norm": 11.905729293823242, + "learning_rate": 1.9027506467746404e-07, + "logits/chosen": -2.492781639099121, + "logits/rejected": -2.489943027496338, + "logps/chosen": -120.00762939453125, + "logps/rejected": -153.7581024169922, + "loss": 0.6004, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6730445623397827, + "rewards/margins": 0.2705869674682617, + "rewards/rejected": -0.943631649017334, + "step": 3960 + }, + { + "epoch": 0.6840110268780152, + "grad_norm": 12.589399337768555, + "learning_rate": 1.901886420327672e-07, + "logits/chosen": -2.486377477645874, + "logits/rejected": -2.4649770259857178, + "logps/chosen": -122.77647399902344, + "logps/rejected": -142.85531616210938, + "loss": 0.6216, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.687035858631134, + "rewards/margins": 0.21148011088371277, + "rewards/rejected": -0.8985159993171692, + "step": 3970 + }, + { + "epoch": 0.6857339765678843, + "grad_norm": 11.657106399536133, + "learning_rate": 1.9010185688352643e-07, + "logits/chosen": -2.3694095611572266, + "logits/rejected": -2.3498167991638184, + "logps/chosen": -123.49932861328125, + "logps/rejected": -147.36175537109375, + "loss": 0.6094, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7015981674194336, + "rewards/margins": 0.24004308879375458, + "rewards/rejected": -0.9416411519050598, + "step": 3980 + }, + { + "epoch": 0.6874569262577532, + "grad_norm": 10.493300437927246, + "learning_rate": 1.9001470957856615e-07, + "logits/chosen": -2.459582805633545, + "logits/rejected": -2.4476981163024902, + "logps/chosen": -125.8658676147461, + "logps/rejected": -145.31004333496094, + "loss": 0.639, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7264918088912964, + "rewards/margins": 0.1739417016506195, + "rewards/rejected": -0.9004335403442383, + "step": 3990 + }, + { + "epoch": 0.6891798759476223, + "grad_norm": 11.909307479858398, + "learning_rate": 1.8992720046816664e-07, + "logits/chosen": -2.5346808433532715, + "logits/rejected": -2.5022826194763184, + "logps/chosen": -130.37509155273438, + "logps/rejected": -145.04820251464844, + "loss": 0.6264, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7317085266113281, + "rewards/margins": 0.1911483258008957, + "rewards/rejected": -0.9228569269180298, + "step": 4000 + }, + { + "epoch": 0.6891798759476223, + "eval_logits/chosen": -2.5324018001556396, + "eval_logits/rejected": -2.5251705646514893, + "eval_logps/chosen": -117.29443359375, + "eval_logps/rejected": -133.6785125732422, + "eval_loss": 0.6514254808425903, + "eval_rewards/accuracies": 0.63150554895401, + "eval_rewards/chosen": -0.5858253836631775, + "eval_rewards/margins": 0.11915845423936844, + "eval_rewards/rejected": -0.7049838900566101, + "eval_runtime": 384.7783, + "eval_samples_per_second": 11.186, + "eval_steps_per_second": 1.398, + "step": 4000 + }, + { + "epoch": 0.6909028256374914, + "grad_norm": 10.766988754272461, + "learning_rate": 1.8983932990406229e-07, + "logits/chosen": -2.4037575721740723, + "logits/rejected": -2.391706943511963, + "logps/chosen": -120.1826400756836, + "logps/rejected": -148.49423217773438, + "loss": 0.6115, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7216323614120483, + "rewards/margins": 0.23313522338867188, + "rewards/rejected": -0.9547675251960754, + "step": 4010 + }, + { + "epoch": 0.6926257753273605, + "grad_norm": 12.511350631713867, + "learning_rate": 1.8975109823944039e-07, + "logits/chosen": -2.412825107574463, + "logits/rejected": -2.400951385498047, + "logps/chosen": -122.57711029052734, + "logps/rejected": -142.6312713623047, + "loss": 0.6284, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.703916072845459, + "rewards/margins": 0.1911393702030182, + "rewards/rejected": -0.8950554132461548, + "step": 4020 + }, + { + "epoch": 0.6943487250172296, + "grad_norm": 10.538165092468262, + "learning_rate": 1.8966250582893953e-07, + "logits/chosen": -2.4241652488708496, + "logits/rejected": -2.4017891883850098, + "logps/chosen": -120.19538879394531, + "logps/rejected": -143.36984252929688, + "loss": 0.6065, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6414139270782471, + "rewards/margins": 0.23436374962329865, + "rewards/rejected": -0.8757776021957397, + "step": 4030 + }, + { + "epoch": 0.6960716747070985, + "grad_norm": 13.03740406036377, + "learning_rate": 1.8957355302864842e-07, + "logits/chosen": -2.489417314529419, + "logits/rejected": -2.4761695861816406, + "logps/chosen": -125.21080017089844, + "logps/rejected": -147.3764190673828, + "loss": 0.6079, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6901857852935791, + "rewards/margins": 0.24308910965919495, + "rewards/rejected": -0.933275043964386, + "step": 4040 + }, + { + "epoch": 0.6977946243969676, + "grad_norm": 11.657258987426758, + "learning_rate": 1.894842401961042e-07, + "logits/chosen": -2.3974480628967285, + "logits/rejected": -2.378633737564087, + "logps/chosen": -116.8731689453125, + "logps/rejected": -145.9193115234375, + "loss": 0.6029, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6630774736404419, + "rewards/margins": 0.26788225769996643, + "rewards/rejected": -0.9309597015380859, + "step": 4050 + }, + { + "epoch": 0.6995175740868367, + "grad_norm": 12.036991119384766, + "learning_rate": 1.8939456769029122e-07, + "logits/chosen": -2.394946575164795, + "logits/rejected": -2.3649299144744873, + "logps/chosen": -135.7313995361328, + "logps/rejected": -149.5013885498047, + "loss": 0.634, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7937172651290894, + "rewards/margins": 0.179894357919693, + "rewards/rejected": -0.9736116528511047, + "step": 4060 + }, + { + "epoch": 0.7012405237767058, + "grad_norm": 11.258556365966797, + "learning_rate": 1.8930453587163949e-07, + "logits/chosen": -2.38643217086792, + "logits/rejected": -2.364267349243164, + "logps/chosen": -130.9126739501953, + "logps/rejected": -153.92453002929688, + "loss": 0.6031, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.770807683467865, + "rewards/margins": 0.23965749144554138, + "rewards/rejected": -1.0104650259017944, + "step": 4070 + }, + { + "epoch": 0.7029634734665747, + "grad_norm": 11.272856712341309, + "learning_rate": 1.8921414510202317e-07, + "logits/chosen": -2.3410181999206543, + "logits/rejected": -2.3312878608703613, + "logps/chosen": -126.29512023925781, + "logps/rejected": -151.48361206054688, + "loss": 0.6124, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7439530491828918, + "rewards/margins": 0.23066206276416779, + "rewards/rejected": -0.9746150970458984, + "step": 4080 + }, + { + "epoch": 0.7046864231564438, + "grad_norm": 11.280725479125977, + "learning_rate": 1.8912339574475925e-07, + "logits/chosen": -2.4383339881896973, + "logits/rejected": -2.4078497886657715, + "logps/chosen": -126.5016860961914, + "logps/rejected": -149.1123504638672, + "loss": 0.6162, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7102442383766174, + "rewards/margins": 0.23562097549438477, + "rewards/rejected": -0.9458651542663574, + "step": 4090 + }, + { + "epoch": 0.7064093728463129, + "grad_norm": 15.935708045959473, + "learning_rate": 1.8903228816460598e-07, + "logits/chosen": -2.373109817504883, + "logits/rejected": -2.346482753753662, + "logps/chosen": -124.05479431152344, + "logps/rejected": -145.33522033691406, + "loss": 0.6066, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7121135592460632, + "rewards/margins": 0.24879172444343567, + "rewards/rejected": -0.9609053730964661, + "step": 4100 + }, + { + "epoch": 0.708132322536182, + "grad_norm": 11.416752815246582, + "learning_rate": 1.8894082272776156e-07, + "logits/chosen": -2.3425259590148926, + "logits/rejected": -2.331468105316162, + "logps/chosen": -138.4196014404297, + "logps/rejected": -147.4912872314453, + "loss": 0.664, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.8235718011856079, + "rewards/margins": 0.13630910217761993, + "rewards/rejected": -0.9598809480667114, + "step": 4110 + }, + { + "epoch": 0.709855272226051, + "grad_norm": 11.454225540161133, + "learning_rate": 1.8884899980186248e-07, + "logits/chosen": -2.3015871047973633, + "logits/rejected": -2.3006720542907715, + "logps/chosen": -130.07545471191406, + "logps/rejected": -158.34634399414062, + "loss": 0.616, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8011056184768677, + "rewards/margins": 0.23448316752910614, + "rewards/rejected": -1.0355888605117798, + "step": 4120 + }, + { + "epoch": 0.71157822191592, + "grad_norm": 12.713473320007324, + "learning_rate": 1.8875681975598207e-07, + "logits/chosen": -2.391552686691284, + "logits/rejected": -2.370664119720459, + "logps/chosen": -126.0354232788086, + "logps/rejected": -149.48287963867188, + "loss": 0.5988, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7206748723983765, + "rewards/margins": 0.2488236129283905, + "rewards/rejected": -0.9694985151290894, + "step": 4130 + }, + { + "epoch": 0.7133011716057891, + "grad_norm": 11.148635864257812, + "learning_rate": 1.8866428296062916e-07, + "logits/chosen": -2.3824214935302734, + "logits/rejected": -2.358895778656006, + "logps/chosen": -138.39901733398438, + "logps/rejected": -146.14036560058594, + "loss": 0.6656, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.8287339210510254, + "rewards/margins": 0.12563897669315338, + "rewards/rejected": -0.954372763633728, + "step": 4140 + }, + { + "epoch": 0.7150241212956582, + "grad_norm": 11.041769981384277, + "learning_rate": 1.8857138978774647e-07, + "logits/chosen": -2.4081053733825684, + "logits/rejected": -2.3844470977783203, + "logps/chosen": -136.33596801757812, + "logps/rejected": -152.49148559570312, + "loss": 0.6193, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7849429249763489, + "rewards/margins": 0.22173313796520233, + "rewards/rejected": -1.0066759586334229, + "step": 4150 + }, + { + "epoch": 0.7167470709855273, + "grad_norm": 13.000495910644531, + "learning_rate": 1.8847814061070917e-07, + "logits/chosen": -2.3095197677612305, + "logits/rejected": -2.2821574211120605, + "logps/chosen": -130.6693878173828, + "logps/rejected": -151.56906127929688, + "loss": 0.6095, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.7899118661880493, + "rewards/margins": 0.23466038703918457, + "rewards/rejected": -1.0245721340179443, + "step": 4160 + }, + { + "epoch": 0.7184700206753962, + "grad_norm": 13.73046875, + "learning_rate": 1.8838453580432328e-07, + "logits/chosen": -2.356701374053955, + "logits/rejected": -2.347198963165283, + "logps/chosen": -126.84205627441406, + "logps/rejected": -151.11383056640625, + "loss": 0.6175, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7544603943824768, + "rewards/margins": 0.2208242416381836, + "rewards/rejected": -0.97528475522995, + "step": 4170 + }, + { + "epoch": 0.7201929703652653, + "grad_norm": 13.165528297424316, + "learning_rate": 1.882905757448243e-07, + "logits/chosen": -2.337136745452881, + "logits/rejected": -2.3184616565704346, + "logps/chosen": -131.48397827148438, + "logps/rejected": -157.45358276367188, + "loss": 0.608, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7613840699195862, + "rewards/margins": 0.2502623498439789, + "rewards/rejected": -1.0116463899612427, + "step": 4180 + }, + { + "epoch": 0.7219159200551344, + "grad_norm": 14.414857864379883, + "learning_rate": 1.8819626080987567e-07, + "logits/chosen": -2.326634168624878, + "logits/rejected": -2.3124537467956543, + "logps/chosen": -134.14418029785156, + "logps/rejected": -160.92916870117188, + "loss": 0.6139, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8307603597640991, + "rewards/margins": 0.2533273994922638, + "rewards/rejected": -1.08408784866333, + "step": 4190 + }, + { + "epoch": 0.7236388697450035, + "grad_norm": 13.043105125427246, + "learning_rate": 1.881015913785671e-07, + "logits/chosen": -2.3766627311706543, + "logits/rejected": -2.361971378326416, + "logps/chosen": -134.19444274902344, + "logps/rejected": -143.70457458496094, + "loss": 0.6605, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.7848796248435974, + "rewards/margins": 0.12627723813056946, + "rewards/rejected": -0.9111568331718445, + "step": 4200 + }, + { + "epoch": 0.7253618194348725, + "grad_norm": 14.597874641418457, + "learning_rate": 1.880065678314133e-07, + "logits/chosen": -2.3638992309570312, + "logits/rejected": -2.343294620513916, + "logps/chosen": -138.7797393798828, + "logps/rejected": -149.86465454101562, + "loss": 0.6656, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8348299264907837, + "rewards/margins": 0.13606666028499603, + "rewards/rejected": -0.9708966016769409, + "step": 4210 + }, + { + "epoch": 0.7270847691247415, + "grad_norm": 11.53864860534668, + "learning_rate": 1.8791119055035221e-07, + "logits/chosen": -2.243114948272705, + "logits/rejected": -2.224510431289673, + "logps/chosen": -124.7751693725586, + "logps/rejected": -145.98992919921875, + "loss": 0.6256, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7255649566650391, + "rewards/margins": 0.2072831392288208, + "rewards/rejected": -0.9328479766845703, + "step": 4220 + }, + { + "epoch": 0.7288077188146106, + "grad_norm": 11.614204406738281, + "learning_rate": 1.8781545991874362e-07, + "logits/chosen": -2.4114151000976562, + "logits/rejected": -2.39131236076355, + "logps/chosen": -132.70994567871094, + "logps/rejected": -155.8234405517578, + "loss": 0.6129, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7740388512611389, + "rewards/margins": 0.24319903552532196, + "rewards/rejected": -1.017237901687622, + "step": 4230 + }, + { + "epoch": 0.7305306685044797, + "grad_norm": 12.244373321533203, + "learning_rate": 1.8771937632136753e-07, + "logits/chosen": -2.320223331451416, + "logits/rejected": -2.2961888313293457, + "logps/chosen": -129.49264526367188, + "logps/rejected": -157.07241821289062, + "loss": 0.6121, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7474852800369263, + "rewards/margins": 0.2636989653110504, + "rewards/rejected": -1.0111842155456543, + "step": 4240 + }, + { + "epoch": 0.7322536181943488, + "grad_norm": 15.389056205749512, + "learning_rate": 1.8762294014442275e-07, + "logits/chosen": -2.3215842247009277, + "logits/rejected": -2.305537223815918, + "logps/chosen": -127.04652404785156, + "logps/rejected": -145.0000457763672, + "loss": 0.6248, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.7199771404266357, + "rewards/margins": 0.20333869755268097, + "rewards/rejected": -0.9233158230781555, + "step": 4250 + }, + { + "epoch": 0.7339765678842178, + "grad_norm": 13.800179481506348, + "learning_rate": 1.8752615177552515e-07, + "logits/chosen": -2.301645040512085, + "logits/rejected": -2.287914276123047, + "logps/chosen": -126.49116516113281, + "logps/rejected": -148.6343536376953, + "loss": 0.6291, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7196300625801086, + "rewards/margins": 0.21817633509635925, + "rewards/rejected": -0.9378064274787903, + "step": 4260 + }, + { + "epoch": 0.7356995175740868, + "grad_norm": 12.630337715148926, + "learning_rate": 1.8742901160370629e-07, + "logits/chosen": -2.3138930797576904, + "logits/rejected": -2.2898266315460205, + "logps/chosen": -116.51651763916016, + "logps/rejected": -137.6942138671875, + "loss": 0.6192, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.649080753326416, + "rewards/margins": 0.2223900854587555, + "rewards/rejected": -0.8714709281921387, + "step": 4270 + }, + { + "epoch": 0.7374224672639559, + "grad_norm": 14.70296573638916, + "learning_rate": 1.8733152001941162e-07, + "logits/chosen": -2.3043904304504395, + "logits/rejected": -2.280935764312744, + "logps/chosen": -118.82975769042969, + "logps/rejected": -144.32821655273438, + "loss": 0.6148, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6711223125457764, + "rewards/margins": 0.2516058087348938, + "rewards/rejected": -0.9227281808853149, + "step": 4280 + }, + { + "epoch": 0.739145416953825, + "grad_norm": 13.53441047668457, + "learning_rate": 1.872336774144992e-07, + "logits/chosen": -2.315648317337036, + "logits/rejected": -2.2937397956848145, + "logps/chosen": -130.9637451171875, + "logps/rejected": -150.52886962890625, + "loss": 0.6304, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7572386860847473, + "rewards/margins": 0.23190510272979736, + "rewards/rejected": -0.9891437292098999, + "step": 4290 + }, + { + "epoch": 0.740868366643694, + "grad_norm": 10.746184349060059, + "learning_rate": 1.8713548418223797e-07, + "logits/chosen": -2.346978187561035, + "logits/rejected": -2.333615779876709, + "logps/chosen": -122.73223876953125, + "logps/rejected": -146.6703338623047, + "loss": 0.6128, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7017590403556824, + "rewards/margins": 0.23429390788078308, + "rewards/rejected": -0.9360530972480774, + "step": 4300 + }, + { + "epoch": 0.742591316333563, + "grad_norm": 13.176383972167969, + "learning_rate": 1.8703694071730612e-07, + "logits/chosen": -2.283977508544922, + "logits/rejected": -2.2644619941711426, + "logps/chosen": -127.2890625, + "logps/rejected": -144.92465209960938, + "loss": 0.6235, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7323711514472961, + "rewards/margins": 0.2010023295879364, + "rewards/rejected": -0.9333734512329102, + "step": 4310 + }, + { + "epoch": 0.7443142660234321, + "grad_norm": 10.389663696289062, + "learning_rate": 1.8693804741578964e-07, + "logits/chosen": -2.370084047317505, + "logits/rejected": -2.341132402420044, + "logps/chosen": -133.34310913085938, + "logps/rejected": -160.8234100341797, + "loss": 0.5774, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7470110654830933, + "rewards/margins": 0.31445130705833435, + "rewards/rejected": -1.061462163925171, + "step": 4320 + }, + { + "epoch": 0.7460372157133012, + "grad_norm": 13.39374828338623, + "learning_rate": 1.8683880467518055e-07, + "logits/chosen": -2.2760860919952393, + "logits/rejected": -2.246753692626953, + "logps/chosen": -132.9408416748047, + "logps/rejected": -146.30142211914062, + "loss": 0.6354, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7745003700256348, + "rewards/margins": 0.19058740139007568, + "rewards/rejected": -0.9650877714157104, + "step": 4330 + }, + { + "epoch": 0.7477601654031703, + "grad_norm": 12.95674991607666, + "learning_rate": 1.8673921289437554e-07, + "logits/chosen": -2.2742409706115723, + "logits/rejected": -2.2547922134399414, + "logps/chosen": -120.75160217285156, + "logps/rejected": -148.62075805664062, + "loss": 0.6065, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.717552900314331, + "rewards/margins": 0.26367202401161194, + "rewards/rejected": -0.9812248945236206, + "step": 4340 + }, + { + "epoch": 0.7494831150930393, + "grad_norm": 13.893998146057129, + "learning_rate": 1.8663927247367407e-07, + "logits/chosen": -2.28212308883667, + "logits/rejected": -2.2669732570648193, + "logps/chosen": -120.74113464355469, + "logps/rejected": -147.4693145751953, + "loss": 0.6028, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6973130106925964, + "rewards/margins": 0.2454204559326172, + "rewards/rejected": -0.9427334666252136, + "step": 4350 + }, + { + "epoch": 0.7512060647829083, + "grad_norm": 10.430903434753418, + "learning_rate": 1.865389838147771e-07, + "logits/chosen": -2.3136277198791504, + "logits/rejected": -2.2893054485321045, + "logps/chosen": -134.9327850341797, + "logps/rejected": -152.28640747070312, + "loss": 0.6473, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8126112818717957, + "rewards/margins": 0.19375188648700714, + "rewards/rejected": -1.0063631534576416, + "step": 4360 + }, + { + "epoch": 0.7529290144727774, + "grad_norm": 12.522747993469238, + "learning_rate": 1.864383473207852e-07, + "logits/chosen": -2.3265700340270996, + "logits/rejected": -2.3115689754486084, + "logps/chosen": -131.7525634765625, + "logps/rejected": -153.37396240234375, + "loss": 0.6252, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7679398655891418, + "rewards/margins": 0.21781209111213684, + "rewards/rejected": -0.9857519268989563, + "step": 4370 + }, + { + "epoch": 0.7546519641626465, + "grad_norm": 11.650630950927734, + "learning_rate": 1.8633736339619702e-07, + "logits/chosen": -2.360729932785034, + "logits/rejected": -2.338486909866333, + "logps/chosen": -128.01376342773438, + "logps/rejected": -149.15093994140625, + "loss": 0.6255, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.7371021509170532, + "rewards/margins": 0.23261289298534393, + "rewards/rejected": -0.9697149991989136, + "step": 4380 + }, + { + "epoch": 0.7563749138525155, + "grad_norm": 12.601362228393555, + "learning_rate": 1.8623603244690772e-07, + "logits/chosen": -2.3034393787384033, + "logits/rejected": -2.2892696857452393, + "logps/chosen": -126.63724517822266, + "logps/rejected": -150.16226196289062, + "loss": 0.614, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7340060472488403, + "rewards/margins": 0.2326422929763794, + "rewards/rejected": -0.966648280620575, + "step": 4390 + }, + { + "epoch": 0.7580978635423845, + "grad_norm": 13.8900146484375, + "learning_rate": 1.861343548802073e-07, + "logits/chosen": -2.3105218410491943, + "logits/rejected": -2.2936127185821533, + "logps/chosen": -132.64756774902344, + "logps/rejected": -154.4315185546875, + "loss": 0.6109, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.757422924041748, + "rewards/margins": 0.23268385231494904, + "rewards/rejected": -0.9901068806648254, + "step": 4400 + }, + { + "epoch": 0.7580978635423845, + "eval_logits/chosen": -2.412363290786743, + "eval_logits/rejected": -2.404085397720337, + "eval_logps/chosen": -120.88501739501953, + "eval_logps/rejected": -139.04837036132812, + "eval_loss": 0.6474176049232483, + "eval_rewards/accuracies": 0.6312732100486755, + "eval_rewards/chosen": -0.6217312216758728, + "eval_rewards/margins": 0.1369512975215912, + "eval_rewards/rejected": -0.7586825489997864, + "eval_runtime": 384.2018, + "eval_samples_per_second": 11.202, + "eval_steps_per_second": 1.4, + "step": 4400 + }, + { + "epoch": 0.7598208132322536, + "grad_norm": 13.378119468688965, + "learning_rate": 1.8603233110477884e-07, + "logits/chosen": -2.268012762069702, + "logits/rejected": -2.2514090538024902, + "logps/chosen": -134.5967559814453, + "logps/rejected": -156.74072265625, + "loss": 0.6342, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.7970510721206665, + "rewards/margins": 0.22004970908164978, + "rewards/rejected": -1.0171008110046387, + "step": 4410 + }, + { + "epoch": 0.7615437629221227, + "grad_norm": 14.908544540405273, + "learning_rate": 1.8592996153069715e-07, + "logits/chosen": -2.3738467693328857, + "logits/rejected": -2.33332896232605, + "logps/chosen": -133.03750610351562, + "logps/rejected": -147.9682159423828, + "loss": 0.6339, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7822148203849792, + "rewards/margins": 0.19276809692382812, + "rewards/rejected": -0.9749830365180969, + "step": 4420 + }, + { + "epoch": 0.7632667126119917, + "grad_norm": 13.850336074829102, + "learning_rate": 1.8582724656942683e-07, + "logits/chosen": -2.265064001083374, + "logits/rejected": -2.233612298965454, + "logps/chosen": -129.28952026367188, + "logps/rejected": -146.53509521484375, + "loss": 0.6389, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7550686597824097, + "rewards/margins": 0.19707316160202026, + "rewards/rejected": -0.9521418809890747, + "step": 4430 + }, + { + "epoch": 0.7649896623018608, + "grad_norm": 14.160219192504883, + "learning_rate": 1.8572418663382074e-07, + "logits/chosen": -2.32669734954834, + "logits/rejected": -2.2965919971466064, + "logps/chosen": -141.6306915283203, + "logps/rejected": -165.29336547851562, + "loss": 0.6015, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8222866058349609, + "rewards/margins": 0.28041699528694153, + "rewards/rejected": -1.10270357131958, + "step": 4440 + }, + { + "epoch": 0.7667126119917298, + "grad_norm": 12.661294937133789, + "learning_rate": 1.8562078213811833e-07, + "logits/chosen": -2.204195737838745, + "logits/rejected": -2.192131757736206, + "logps/chosen": -128.37451171875, + "logps/rejected": -152.55081176757812, + "loss": 0.6088, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7265924215316772, + "rewards/margins": 0.25704535841941833, + "rewards/rejected": -0.9836376905441284, + "step": 4450 + }, + { + "epoch": 0.7684355616815989, + "grad_norm": 12.920085906982422, + "learning_rate": 1.8551703349794406e-07, + "logits/chosen": -2.31882643699646, + "logits/rejected": -2.3102874755859375, + "logps/chosen": -126.8609848022461, + "logps/rejected": -150.83541870117188, + "loss": 0.6177, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7412899136543274, + "rewards/margins": 0.24046726524829865, + "rewards/rejected": -0.9817571640014648, + "step": 4460 + }, + { + "epoch": 0.770158511371468, + "grad_norm": 12.629962921142578, + "learning_rate": 1.854129411303055e-07, + "logits/chosen": -2.367927312850952, + "logits/rejected": -2.33909010887146, + "logps/chosen": -130.4448699951172, + "logps/rejected": -145.80923461914062, + "loss": 0.6269, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.7452287077903748, + "rewards/margins": 0.2104768306016922, + "rewards/rejected": -0.9557055234909058, + "step": 4470 + }, + { + "epoch": 0.771881461061337, + "grad_norm": 18.944599151611328, + "learning_rate": 1.8530850545359193e-07, + "logits/chosen": -2.384981393814087, + "logits/rejected": -2.3748526573181152, + "logps/chosen": -131.04205322265625, + "logps/rejected": -155.17080688476562, + "loss": 0.6267, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8003128170967102, + "rewards/margins": 0.2289293110370636, + "rewards/rejected": -1.0292421579360962, + "step": 4480 + }, + { + "epoch": 0.7736044107512061, + "grad_norm": 15.200418472290039, + "learning_rate": 1.8520372688757245e-07, + "logits/chosen": -2.2746169567108154, + "logits/rejected": -2.24771785736084, + "logps/chosen": -130.04827880859375, + "logps/rejected": -150.07791137695312, + "loss": 0.6338, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7890744805335999, + "rewards/margins": 0.21333102881908417, + "rewards/rejected": -1.0024055242538452, + "step": 4490 + }, + { + "epoch": 0.7753273604410751, + "grad_norm": 12.581011772155762, + "learning_rate": 1.8509860585339446e-07, + "logits/chosen": -2.285783290863037, + "logits/rejected": -2.265993595123291, + "logps/chosen": -135.75062561035156, + "logps/rejected": -159.28656005859375, + "loss": 0.6125, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7941531538963318, + "rewards/margins": 0.26907879114151, + "rewards/rejected": -1.0632318258285522, + "step": 4500 + }, + { + "epoch": 0.7770503101309442, + "grad_norm": 17.554412841796875, + "learning_rate": 1.8499314277358167e-07, + "logits/chosen": -2.3803791999816895, + "logits/rejected": -2.3529820442199707, + "logps/chosen": -133.89425659179688, + "logps/rejected": -163.8766632080078, + "loss": 0.6036, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.804618239402771, + "rewards/margins": 0.2779730558395386, + "rewards/rejected": -1.0825912952423096, + "step": 4510 + }, + { + "epoch": 0.7787732598208132, + "grad_norm": 14.42990779876709, + "learning_rate": 1.848873380720329e-07, + "logits/chosen": -2.336332321166992, + "logits/rejected": -2.314807653427124, + "logps/chosen": -135.14236450195312, + "logps/rejected": -152.7788543701172, + "loss": 0.6363, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.8254430890083313, + "rewards/margins": 0.2015375792980194, + "rewards/rejected": -1.0269806385040283, + "step": 4520 + }, + { + "epoch": 0.7804962095106823, + "grad_norm": 14.05636978149414, + "learning_rate": 1.8478119217401985e-07, + "logits/chosen": -2.318453788757324, + "logits/rejected": -2.3068490028381348, + "logps/chosen": -128.04605102539062, + "logps/rejected": -146.30068969726562, + "loss": 0.6409, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.7427166700363159, + "rewards/margins": 0.1854708194732666, + "rewards/rejected": -0.9281874895095825, + "step": 4530 + }, + { + "epoch": 0.7822191592005513, + "grad_norm": 19.649593353271484, + "learning_rate": 1.8467470550618574e-07, + "logits/chosen": -2.2332546710968018, + "logits/rejected": -2.2186062335968018, + "logps/chosen": -125.69355773925781, + "logps/rejected": -147.77760314941406, + "loss": 0.6121, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7296175360679626, + "rewards/margins": 0.2388092577457428, + "rewards/rejected": -0.9684268236160278, + "step": 4540 + }, + { + "epoch": 0.7839421088904204, + "grad_norm": 16.796405792236328, + "learning_rate": 1.8456787849654347e-07, + "logits/chosen": -2.3180625438690186, + "logits/rejected": -2.293102741241455, + "logps/chosen": -131.48257446289062, + "logps/rejected": -154.91592407226562, + "loss": 0.6116, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7512409090995789, + "rewards/margins": 0.2640026807785034, + "rewards/rejected": -1.015243649482727, + "step": 4550 + }, + { + "epoch": 0.7856650585802895, + "grad_norm": 13.212952613830566, + "learning_rate": 1.844607115744739e-07, + "logits/chosen": -2.2414722442626953, + "logits/rejected": -2.203615665435791, + "logps/chosen": -133.26007080078125, + "logps/rejected": -159.36962890625, + "loss": 0.596, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7704302072525024, + "rewards/margins": 0.2923758625984192, + "rewards/rejected": -1.0628061294555664, + "step": 4560 + }, + { + "epoch": 0.7873880082701585, + "grad_norm": 19.492271423339844, + "learning_rate": 1.8435320517072408e-07, + "logits/chosen": -2.24662709236145, + "logits/rejected": -2.2220511436462402, + "logps/chosen": -140.2692108154297, + "logps/rejected": -164.09080505371094, + "loss": 0.6398, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8917884826660156, + "rewards/margins": 0.23158666491508484, + "rewards/rejected": -1.1233750581741333, + "step": 4570 + }, + { + "epoch": 0.7891109579600276, + "grad_norm": 14.586845397949219, + "learning_rate": 1.842453597174057e-07, + "logits/chosen": -2.214069128036499, + "logits/rejected": -2.1866512298583984, + "logps/chosen": -127.54747009277344, + "logps/rejected": -150.66769409179688, + "loss": 0.6064, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7328786253929138, + "rewards/margins": 0.25618213415145874, + "rewards/rejected": -0.9890606999397278, + "step": 4580 + }, + { + "epoch": 0.7908339076498966, + "grad_norm": 14.758131980895996, + "learning_rate": 1.841371756479931e-07, + "logits/chosen": -2.3069567680358887, + "logits/rejected": -2.2812230587005615, + "logps/chosen": -140.52098083496094, + "logps/rejected": -155.68270874023438, + "loss": 0.6581, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.8563669323921204, + "rewards/margins": 0.17839594185352325, + "rewards/rejected": -1.034762978553772, + "step": 4590 + }, + { + "epoch": 0.7925568573397657, + "grad_norm": 14.336187362670898, + "learning_rate": 1.8402865339732171e-07, + "logits/chosen": -2.2076327800750732, + "logits/rejected": -2.1841959953308105, + "logps/chosen": -134.13148498535156, + "logps/rejected": -170.171875, + "loss": 0.572, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8076637387275696, + "rewards/margins": 0.35054484009742737, + "rewards/rejected": -1.1582086086273193, + "step": 4600 + }, + { + "epoch": 0.7942798070296347, + "grad_norm": 15.751847267150879, + "learning_rate": 1.8391979340158627e-07, + "logits/chosen": -2.219324827194214, + "logits/rejected": -2.2080159187316895, + "logps/chosen": -141.0472412109375, + "logps/rejected": -158.53619384765625, + "loss": 0.649, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8990927934646606, + "rewards/margins": 0.17679741978645325, + "rewards/rejected": -1.075890302658081, + "step": 4610 + }, + { + "epoch": 0.7960027567195038, + "grad_norm": 15.971353530883789, + "learning_rate": 1.8381059609833904e-07, + "logits/chosen": -2.2696173191070557, + "logits/rejected": -2.243626594543457, + "logps/chosen": -134.827392578125, + "logps/rejected": -163.3299102783203, + "loss": 0.5942, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7910867929458618, + "rewards/margins": 0.2870883345603943, + "rewards/rejected": -1.0781750679016113, + "step": 4620 + }, + { + "epoch": 0.7977257064093728, + "grad_norm": 14.739949226379395, + "learning_rate": 1.83701061926488e-07, + "logits/chosen": -2.2830615043640137, + "logits/rejected": -2.247520923614502, + "logps/chosen": -144.0731658935547, + "logps/rejected": -171.52520751953125, + "loss": 0.6025, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8678949475288391, + "rewards/margins": 0.2814059257507324, + "rewards/rejected": -1.1493009328842163, + "step": 4630 + }, + { + "epoch": 0.7994486560992419, + "grad_norm": 17.518966674804688, + "learning_rate": 1.8359119132629522e-07, + "logits/chosen": -2.2724461555480957, + "logits/rejected": -2.2560017108917236, + "logps/chosen": -151.77694702148438, + "logps/rejected": -177.51280212402344, + "loss": 0.6233, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9702235460281372, + "rewards/margins": 0.24939396977424622, + "rewards/rejected": -1.219617486000061, + "step": 4640 + }, + { + "epoch": 0.801171605789111, + "grad_norm": 16.50297737121582, + "learning_rate": 1.8348098473937498e-07, + "logits/chosen": -2.2693591117858887, + "logits/rejected": -2.2382216453552246, + "logps/chosen": -145.84854125976562, + "logps/rejected": -167.67819213867188, + "loss": 0.6194, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9169775247573853, + "rewards/margins": 0.252015620470047, + "rewards/rejected": -1.1689932346343994, + "step": 4650 + }, + { + "epoch": 0.80289455547898, + "grad_norm": 13.565765380859375, + "learning_rate": 1.8337044260869195e-07, + "logits/chosen": -2.2698116302490234, + "logits/rejected": -2.2529296875, + "logps/chosen": -135.17910766601562, + "logps/rejected": -159.684326171875, + "loss": 0.6098, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8087183833122253, + "rewards/margins": 0.24533787369728088, + "rewards/rejected": -1.0540562868118286, + "step": 4660 + }, + { + "epoch": 0.8046175051688491, + "grad_norm": 15.313465118408203, + "learning_rate": 1.8325956537855964e-07, + "logits/chosen": -2.2522294521331787, + "logits/rejected": -2.228555202484131, + "logps/chosen": -133.0584259033203, + "logps/rejected": -157.33486938476562, + "loss": 0.6001, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8063106536865234, + "rewards/margins": 0.276062548160553, + "rewards/rejected": -1.0823732614517212, + "step": 4670 + }, + { + "epoch": 0.8063404548587181, + "grad_norm": 14.039936065673828, + "learning_rate": 1.8314835349463834e-07, + "logits/chosen": -2.249237537384033, + "logits/rejected": -2.2180287837982178, + "logps/chosen": -136.5815887451172, + "logps/rejected": -163.4698944091797, + "loss": 0.6178, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8409102559089661, + "rewards/margins": 0.27688902616500854, + "rewards/rejected": -1.1177994012832642, + "step": 4680 + }, + { + "epoch": 0.8080634045485872, + "grad_norm": 13.222373008728027, + "learning_rate": 1.8303680740393354e-07, + "logits/chosen": -2.2840068340301514, + "logits/rejected": -2.2594456672668457, + "logps/chosen": -142.51416015625, + "logps/rejected": -176.42428588867188, + "loss": 0.5895, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8675103187561035, + "rewards/margins": 0.3537306487560272, + "rewards/rejected": -1.2212409973144531, + "step": 4690 + }, + { + "epoch": 0.8097863542384562, + "grad_norm": 16.832406997680664, + "learning_rate": 1.829249275547939e-07, + "logits/chosen": -2.2416279315948486, + "logits/rejected": -2.227914333343506, + "logps/chosen": -143.00843811035156, + "logps/rejected": -179.38323974609375, + "loss": 0.6022, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9127408862113953, + "rewards/margins": 0.32419341802597046, + "rewards/rejected": -1.2369344234466553, + "step": 4700 + }, + { + "epoch": 0.8115093039283253, + "grad_norm": 17.441082000732422, + "learning_rate": 1.8281271439690972e-07, + "logits/chosen": -2.2823708057403564, + "logits/rejected": -2.2493019104003906, + "logps/chosen": -148.35629272460938, + "logps/rejected": -171.28524780273438, + "loss": 0.6138, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9040901064872742, + "rewards/margins": 0.2849331796169281, + "rewards/rejected": -1.1890233755111694, + "step": 4710 + }, + { + "epoch": 0.8132322536181944, + "grad_norm": 15.347607612609863, + "learning_rate": 1.8270016838131098e-07, + "logits/chosen": -2.170727491378784, + "logits/rejected": -2.140810489654541, + "logps/chosen": -144.2772216796875, + "logps/rejected": -172.93899536132812, + "loss": 0.5901, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8747490048408508, + "rewards/margins": 0.3303028643131256, + "rewards/rejected": -1.2050518989562988, + "step": 4720 + }, + { + "epoch": 0.8149552033080634, + "grad_norm": 18.199466705322266, + "learning_rate": 1.825872899603655e-07, + "logits/chosen": -2.1585171222686768, + "logits/rejected": -2.132734537124634, + "logps/chosen": -146.39041137695312, + "logps/rejected": -168.12551879882812, + "loss": 0.6366, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9264416694641113, + "rewards/margins": 0.24258160591125488, + "rewards/rejected": -1.1690232753753662, + "step": 4730 + }, + { + "epoch": 0.8166781529979324, + "grad_norm": 13.123265266418457, + "learning_rate": 1.824740795877772e-07, + "logits/chosen": -2.236109972000122, + "logits/rejected": -2.2206127643585205, + "logps/chosen": -134.59371948242188, + "logps/rejected": -167.7301483154297, + "loss": 0.5892, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8015215992927551, + "rewards/margins": 0.3370732367038727, + "rewards/rejected": -1.1385948657989502, + "step": 4740 + }, + { + "epoch": 0.8184011026878015, + "grad_norm": 14.976317405700684, + "learning_rate": 1.8236053771858428e-07, + "logits/chosen": -2.2023322582244873, + "logits/rejected": -2.186938524246216, + "logps/chosen": -141.57130432128906, + "logps/rejected": -165.6098175048828, + "loss": 0.612, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8617923855781555, + "rewards/margins": 0.24837076663970947, + "rewards/rejected": -1.1101632118225098, + "step": 4750 + }, + { + "epoch": 0.8201240523776706, + "grad_norm": 17.25830078125, + "learning_rate": 1.8224666480915732e-07, + "logits/chosen": -2.2000133991241455, + "logits/rejected": -2.175978660583496, + "logps/chosen": -141.7084197998047, + "logps/rejected": -169.6337127685547, + "loss": 0.6029, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8923633694648743, + "rewards/margins": 0.28839367628097534, + "rewards/rejected": -1.1807570457458496, + "step": 4760 + }, + { + "epoch": 0.8218470020675396, + "grad_norm": 12.114882469177246, + "learning_rate": 1.8213246131719746e-07, + "logits/chosen": -2.258085250854492, + "logits/rejected": -2.2356557846069336, + "logps/chosen": -157.0770721435547, + "logps/rejected": -176.11485290527344, + "loss": 0.6458, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.9748696088790894, + "rewards/margins": 0.23428723216056824, + "rewards/rejected": -1.2091567516326904, + "step": 4770 + }, + { + "epoch": 0.8235699517574087, + "grad_norm": 13.910969734191895, + "learning_rate": 1.8201792770173462e-07, + "logits/chosen": -2.1453006267547607, + "logits/rejected": -2.1221261024475098, + "logps/chosen": -137.5989990234375, + "logps/rejected": -174.49765014648438, + "loss": 0.5723, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.861288845539093, + "rewards/margins": 0.34425026178359985, + "rewards/rejected": -1.2055391073226929, + "step": 4780 + }, + { + "epoch": 0.8252929014472777, + "grad_norm": 13.05284309387207, + "learning_rate": 1.8190306442312565e-07, + "logits/chosen": -2.2314467430114746, + "logits/rejected": -2.208369731903076, + "logps/chosen": -151.15231323242188, + "logps/rejected": -163.9604034423828, + "loss": 0.6475, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.9336099624633789, + "rewards/margins": 0.1781330555677414, + "rewards/rejected": -1.1117430925369263, + "step": 4790 + }, + { + "epoch": 0.8270158511371468, + "grad_norm": 15.045424461364746, + "learning_rate": 1.8178787194305239e-07, + "logits/chosen": -2.179490089416504, + "logits/rejected": -2.160287857055664, + "logps/chosen": -134.5084228515625, + "logps/rejected": -158.16839599609375, + "loss": 0.6153, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8251282572746277, + "rewards/margins": 0.25325918197631836, + "rewards/rejected": -1.0783874988555908, + "step": 4800 + }, + { + "epoch": 0.8270158511371468, + "eval_logits/chosen": -2.3301966190338135, + "eval_logits/rejected": -2.3205788135528564, + "eval_logps/chosen": -129.8305206298828, + "eval_logps/rejected": -150.38137817382812, + "eval_loss": 0.6432415246963501, + "eval_rewards/accuracies": 0.6266263723373413, + "eval_rewards/chosen": -0.7111861705780029, + "eval_rewards/margins": 0.16082628071308136, + "eval_rewards/rejected": -0.8720124363899231, + "eval_runtime": 384.3644, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 4800 + }, + { + "epoch": 0.8287388008270159, + "grad_norm": 13.558989524841309, + "learning_rate": 1.816723507245199e-07, + "logits/chosen": -2.1871228218078613, + "logits/rejected": -2.151096820831299, + "logps/chosen": -139.2497100830078, + "logps/rejected": -166.04983520507812, + "loss": 0.5912, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.820334255695343, + "rewards/margins": 0.31642287969589233, + "rewards/rejected": -1.1367571353912354, + "step": 4810 + }, + { + "epoch": 0.8304617505168849, + "grad_norm": 13.416345596313477, + "learning_rate": 1.8155650123185458e-07, + "logits/chosen": -2.226797580718994, + "logits/rejected": -2.213923931121826, + "logps/chosen": -133.97592163085938, + "logps/rejected": -164.05328369140625, + "loss": 0.6038, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8281854391098022, + "rewards/margins": 0.2941720485687256, + "rewards/rejected": -1.1223574876785278, + "step": 4820 + }, + { + "epoch": 0.832184700206754, + "grad_norm": 13.031325340270996, + "learning_rate": 1.8144032393070225e-07, + "logits/chosen": -2.2374587059020996, + "logits/rejected": -2.2140958309173584, + "logps/chosen": -139.13052368164062, + "logps/rejected": -157.33258056640625, + "loss": 0.6539, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8555358648300171, + "rewards/margins": 0.19151292741298676, + "rewards/rejected": -1.047048807144165, + "step": 4830 + }, + { + "epoch": 0.833907649896623, + "grad_norm": 12.416264533996582, + "learning_rate": 1.8132381928802643e-07, + "logits/chosen": -2.192883253097534, + "logits/rejected": -2.1586222648620605, + "logps/chosen": -144.0219268798828, + "logps/rejected": -174.36602783203125, + "loss": 0.5878, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8785157203674316, + "rewards/margins": 0.3410136103630066, + "rewards/rejected": -1.219529390335083, + "step": 4840 + }, + { + "epoch": 0.8356305995864921, + "grad_norm": 14.267515182495117, + "learning_rate": 1.8120698777210626e-07, + "logits/chosen": -2.24122953414917, + "logits/rejected": -2.222362518310547, + "logps/chosen": -140.9573211669922, + "logps/rejected": -168.92770385742188, + "loss": 0.6085, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8682325482368469, + "rewards/margins": 0.28492265939712524, + "rewards/rejected": -1.1531550884246826, + "step": 4850 + }, + { + "epoch": 0.8373535492763611, + "grad_norm": 17.367626190185547, + "learning_rate": 1.8108982985253472e-07, + "logits/chosen": -2.249375104904175, + "logits/rejected": -2.2173800468444824, + "logps/chosen": -148.92501831054688, + "logps/rejected": -161.76766967773438, + "loss": 0.638, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9046787023544312, + "rewards/margins": 0.20723576843738556, + "rewards/rejected": -1.1119145154953003, + "step": 4860 + }, + { + "epoch": 0.8390764989662302, + "grad_norm": 17.7515869140625, + "learning_rate": 1.8097234600021679e-07, + "logits/chosen": -2.258147954940796, + "logits/rejected": -2.229663133621216, + "logps/chosen": -149.1612091064453, + "logps/rejected": -175.52978515625, + "loss": 0.6017, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.9169160723686218, + "rewards/margins": 0.3234608769416809, + "rewards/rejected": -1.2403769493103027, + "step": 4870 + }, + { + "epoch": 0.8407994486560992, + "grad_norm": 15.29587459564209, + "learning_rate": 1.8085453668736745e-07, + "logits/chosen": -2.1662745475769043, + "logits/rejected": -2.133847713470459, + "logps/chosen": -138.71878051757812, + "logps/rejected": -161.3764190673828, + "loss": 0.622, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8553798794746399, + "rewards/margins": 0.2521751821041107, + "rewards/rejected": -1.1075549125671387, + "step": 4880 + }, + { + "epoch": 0.8425223983459683, + "grad_norm": 14.815744400024414, + "learning_rate": 1.8073640238750988e-07, + "logits/chosen": -2.2213339805603027, + "logits/rejected": -2.1952900886535645, + "logps/chosen": -144.75059509277344, + "logps/rejected": -175.25985717773438, + "loss": 0.5898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8932998776435852, + "rewards/margins": 0.32396167516708374, + "rewards/rejected": -1.217261552810669, + "step": 4890 + }, + { + "epoch": 0.8442453480358374, + "grad_norm": 13.8853178024292, + "learning_rate": 1.806179435754735e-07, + "logits/chosen": -2.196868419647217, + "logits/rejected": -2.164306640625, + "logps/chosen": -144.29612731933594, + "logps/rejected": -169.0148468017578, + "loss": 0.6377, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9109374284744263, + "rewards/margins": 0.2767733931541443, + "rewards/rejected": -1.1877108812332153, + "step": 4900 + }, + { + "epoch": 0.8459682977257064, + "grad_norm": 15.792616844177246, + "learning_rate": 1.804991607273921e-07, + "logits/chosen": -2.306757926940918, + "logits/rejected": -2.2741053104400635, + "logps/chosen": -147.36080932617188, + "logps/rejected": -171.72479248046875, + "loss": 0.6104, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9102157354354858, + "rewards/margins": 0.2826886773109436, + "rewards/rejected": -1.1929042339324951, + "step": 4910 + }, + { + "epoch": 0.8476912474155754, + "grad_norm": 15.132826805114746, + "learning_rate": 1.8038005432070183e-07, + "logits/chosen": -2.151120662689209, + "logits/rejected": -2.1305134296417236, + "logps/chosen": -142.0108184814453, + "logps/rejected": -177.2761993408203, + "loss": 0.5886, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8817094564437866, + "rewards/margins": 0.3563971221446991, + "rewards/rejected": -1.238106608390808, + "step": 4920 + }, + { + "epoch": 0.8494141971054445, + "grad_norm": 15.987706184387207, + "learning_rate": 1.8026062483413943e-07, + "logits/chosen": -2.318948984146118, + "logits/rejected": -2.3007540702819824, + "logps/chosen": -147.85739135742188, + "logps/rejected": -180.6566619873047, + "loss": 0.6267, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.947142481803894, + "rewards/margins": 0.27939143776893616, + "rewards/rejected": -1.2265340089797974, + "step": 4930 + }, + { + "epoch": 0.8511371467953136, + "grad_norm": 15.91757869720459, + "learning_rate": 1.8014087274774018e-07, + "logits/chosen": -2.2689530849456787, + "logits/rejected": -2.238652467727661, + "logps/chosen": -147.04481506347656, + "logps/rejected": -176.6737060546875, + "loss": 0.5941, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.9112297296524048, + "rewards/margins": 0.323239266872406, + "rewards/rejected": -1.2344690561294556, + "step": 4940 + }, + { + "epoch": 0.8528600964851827, + "grad_norm": 15.712467193603516, + "learning_rate": 1.8002079854283605e-07, + "logits/chosen": -2.0711658000946045, + "logits/rejected": -2.0532054901123047, + "logps/chosen": -141.21932983398438, + "logps/rejected": -167.3578338623047, + "loss": 0.6104, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8639213442802429, + "rewards/margins": 0.2684737741947174, + "rewards/rejected": -1.1323951482772827, + "step": 4950 + }, + { + "epoch": 0.8545830461750517, + "grad_norm": 19.658069610595703, + "learning_rate": 1.799004027020537e-07, + "logits/chosen": -2.1870884895324707, + "logits/rejected": -2.1787643432617188, + "logps/chosen": -137.68484497070312, + "logps/rejected": -175.98887634277344, + "loss": 0.5754, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.8543106317520142, + "rewards/margins": 0.34138739109039307, + "rewards/rejected": -1.1956980228424072, + "step": 4960 + }, + { + "epoch": 0.8563059958649207, + "grad_norm": 14.821301460266113, + "learning_rate": 1.7977968570931262e-07, + "logits/chosen": -2.135411500930786, + "logits/rejected": -2.118692636489868, + "logps/chosen": -145.12217712402344, + "logps/rejected": -184.40428161621094, + "loss": 0.5729, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9374872446060181, + "rewards/margins": 0.387998104095459, + "rewards/rejected": -1.3254854679107666, + "step": 4970 + }, + { + "epoch": 0.8580289455547898, + "grad_norm": 14.007376670837402, + "learning_rate": 1.796586480498231e-07, + "logits/chosen": -2.1761584281921387, + "logits/rejected": -2.161414384841919, + "logps/chosen": -148.2333221435547, + "logps/rejected": -183.5047607421875, + "loss": 0.5911, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9537581205368042, + "rewards/margins": 0.3324365019798279, + "rewards/rejected": -1.2861945629119873, + "step": 4980 + }, + { + "epoch": 0.8597518952446589, + "grad_norm": 18.360790252685547, + "learning_rate": 1.7953729021008434e-07, + "logits/chosen": -2.110150098800659, + "logits/rejected": -2.0944671630859375, + "logps/chosen": -151.00375366210938, + "logps/rejected": -184.45541381835938, + "loss": 0.6066, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9921935200691223, + "rewards/margins": 0.30348965525627136, + "rewards/rejected": -1.2956831455230713, + "step": 4990 + }, + { + "epoch": 0.8614748449345279, + "grad_norm": 15.967220306396484, + "learning_rate": 1.7941561267788245e-07, + "logits/chosen": -2.1210286617279053, + "logits/rejected": -2.093567371368408, + "logps/chosen": -151.39828491210938, + "logps/rejected": -182.8577880859375, + "loss": 0.5942, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9559431076049805, + "rewards/margins": 0.3455808162689209, + "rewards/rejected": -1.3015239238739014, + "step": 5000 + }, + { + "epoch": 0.8631977946243969, + "grad_norm": 17.03342056274414, + "learning_rate": 1.7929361594228852e-07, + "logits/chosen": -2.1279661655426025, + "logits/rejected": -2.0970072746276855, + "logps/chosen": -153.00637817382812, + "logps/rejected": -183.12176513671875, + "loss": 0.5895, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.999555766582489, + "rewards/margins": 0.3205220103263855, + "rewards/rejected": -1.320077896118164, + "step": 5010 + }, + { + "epoch": 0.864920744314266, + "grad_norm": 16.8343448638916, + "learning_rate": 1.7917130049365672e-07, + "logits/chosen": -2.1045081615448, + "logits/rejected": -2.074673652648926, + "logps/chosen": -152.74563598632812, + "logps/rejected": -183.15377807617188, + "loss": 0.6032, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.992462158203125, + "rewards/margins": 0.3191111385822296, + "rewards/rejected": -1.3115732669830322, + "step": 5020 + }, + { + "epoch": 0.8666436940041351, + "grad_norm": 19.641525268554688, + "learning_rate": 1.7904866682362213e-07, + "logits/chosen": -2.0697884559631348, + "logits/rejected": -2.053832530975342, + "logps/chosen": -151.6670379638672, + "logps/rejected": -182.6899871826172, + "loss": 0.6099, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9904986619949341, + "rewards/margins": 0.2968147397041321, + "rewards/rejected": -1.2873132228851318, + "step": 5030 + }, + { + "epoch": 0.8683666436940042, + "grad_norm": 18.75517463684082, + "learning_rate": 1.7892571542509896e-07, + "logits/chosen": -2.2520158290863037, + "logits/rejected": -2.215366840362549, + "logps/chosen": -163.31063842773438, + "logps/rejected": -180.81187438964844, + "loss": 0.6286, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0615317821502686, + "rewards/margins": 0.24211840331554413, + "rewards/rejected": -1.3036502599716187, + "step": 5040 + }, + { + "epoch": 0.8700895933838731, + "grad_norm": 17.165555953979492, + "learning_rate": 1.7880244679227853e-07, + "logits/chosen": -2.1506662368774414, + "logits/rejected": -2.1234805583953857, + "logps/chosen": -151.9202880859375, + "logps/rejected": -183.21421813964844, + "loss": 0.6096, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9631596803665161, + "rewards/margins": 0.3219161629676819, + "rewards/rejected": -1.2850759029388428, + "step": 5050 + }, + { + "epoch": 0.8718125430737422, + "grad_norm": 21.54743766784668, + "learning_rate": 1.7867886142062717e-07, + "logits/chosen": -2.0237479209899902, + "logits/rejected": -2.013625383377075, + "logps/chosen": -146.44781494140625, + "logps/rejected": -173.47976684570312, + "loss": 0.6347, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.9597591161727905, + "rewards/margins": 0.2546851933002472, + "rewards/rejected": -1.2144443988800049, + "step": 5060 + }, + { + "epoch": 0.8735354927636113, + "grad_norm": 16.986095428466797, + "learning_rate": 1.785549598068844e-07, + "logits/chosen": -2.1160712242126465, + "logits/rejected": -2.089094638824463, + "logps/chosen": -145.29318237304688, + "logps/rejected": -166.83309936523438, + "loss": 0.6317, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9254485368728638, + "rewards/margins": 0.261610746383667, + "rewards/rejected": -1.1870592832565308, + "step": 5070 + }, + { + "epoch": 0.8752584424534804, + "grad_norm": 11.260225296020508, + "learning_rate": 1.7843074244906075e-07, + "logits/chosen": -2.313349485397339, + "logits/rejected": -2.273677349090576, + "logps/chosen": -141.82994079589844, + "logps/rejected": -171.1905975341797, + "loss": 0.5794, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8432556390762329, + "rewards/margins": 0.3597295582294464, + "rewards/rejected": -1.202985167503357, + "step": 5080 + }, + { + "epoch": 0.8769813921433495, + "grad_norm": 19.612459182739258, + "learning_rate": 1.7830620984643597e-07, + "logits/chosen": -2.1832711696624756, + "logits/rejected": -2.146843671798706, + "logps/chosen": -141.8723907470703, + "logps/rejected": -175.12730407714844, + "loss": 0.5775, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8758166432380676, + "rewards/margins": 0.36045199632644653, + "rewards/rejected": -1.2362687587738037, + "step": 5090 + }, + { + "epoch": 0.8787043418332184, + "grad_norm": 21.284860610961914, + "learning_rate": 1.7818136249955678e-07, + "logits/chosen": -2.020749568939209, + "logits/rejected": -1.985507607460022, + "logps/chosen": -154.8480987548828, + "logps/rejected": -173.80126953125, + "loss": 0.6379, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9978119134902954, + "rewards/margins": 0.21697509288787842, + "rewards/rejected": -1.2147870063781738, + "step": 5100 + }, + { + "epoch": 0.8804272915230875, + "grad_norm": 15.660050392150879, + "learning_rate": 1.7805620091023505e-07, + "logits/chosen": -2.145282745361328, + "logits/rejected": -2.1087963581085205, + "logps/chosen": -161.85922241210938, + "logps/rejected": -181.78924560546875, + "loss": 0.6592, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.03986394405365, + "rewards/margins": 0.2396329641342163, + "rewards/rejected": -1.2794967889785767, + "step": 5110 + }, + { + "epoch": 0.8821502412129566, + "grad_norm": 18.590288162231445, + "learning_rate": 1.7793072558154573e-07, + "logits/chosen": -2.2191407680511475, + "logits/rejected": -2.187811851501465, + "logps/chosen": -146.4526824951172, + "logps/rejected": -164.67807006835938, + "loss": 0.6344, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9257931709289551, + "rewards/margins": 0.21812419593334198, + "rewards/rejected": -1.1439173221588135, + "step": 5120 + }, + { + "epoch": 0.8838731909028257, + "grad_norm": 15.55690860748291, + "learning_rate": 1.778049370178248e-07, + "logits/chosen": -2.229480743408203, + "logits/rejected": -2.1971240043640137, + "logps/chosen": -149.29954528808594, + "logps/rejected": -176.12698364257812, + "loss": 0.6021, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9573342204093933, + "rewards/margins": 0.3041648268699646, + "rewards/rejected": -1.2614991664886475, + "step": 5130 + }, + { + "epoch": 0.8855961405926946, + "grad_norm": 17.51070213317871, + "learning_rate": 1.7767883572466726e-07, + "logits/chosen": -2.1650819778442383, + "logits/rejected": -2.1310112476348877, + "logps/chosen": -149.53402709960938, + "logps/rejected": -166.9405975341797, + "loss": 0.6326, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9171573519706726, + "rewards/margins": 0.2225378453731537, + "rewards/rejected": -1.139695167541504, + "step": 5140 + }, + { + "epoch": 0.8873190902825637, + "grad_norm": 16.04876136779785, + "learning_rate": 1.7755242220892507e-07, + "logits/chosen": -2.190516233444214, + "logits/rejected": -2.175175189971924, + "logps/chosen": -142.1367950439453, + "logps/rejected": -170.53720092773438, + "loss": 0.6166, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8900700807571411, + "rewards/margins": 0.26656752824783325, + "rewards/rejected": -1.1566376686096191, + "step": 5150 + }, + { + "epoch": 0.8890420399724328, + "grad_norm": 19.764726638793945, + "learning_rate": 1.7742569697870512e-07, + "logits/chosen": -2.173518419265747, + "logits/rejected": -2.1372475624084473, + "logps/chosen": -137.65684509277344, + "logps/rejected": -161.5216064453125, + "loss": 0.6132, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.862400233745575, + "rewards/margins": 0.27269431948661804, + "rewards/rejected": -1.135094404220581, + "step": 5160 + }, + { + "epoch": 0.8907649896623019, + "grad_norm": 15.33234691619873, + "learning_rate": 1.7729866054336734e-07, + "logits/chosen": -2.157505750656128, + "logits/rejected": -2.1266400814056396, + "logps/chosen": -136.25643920898438, + "logps/rejected": -175.56077575683594, + "loss": 0.5571, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.8095985651016235, + "rewards/margins": 0.4275360703468323, + "rewards/rejected": -1.2371346950531006, + "step": 5170 + }, + { + "epoch": 0.892487939352171, + "grad_norm": 24.214967727661133, + "learning_rate": 1.7717131341352235e-07, + "logits/chosen": -2.2335801124572754, + "logits/rejected": -2.217602252960205, + "logps/chosen": -159.39169311523438, + "logps/rejected": -182.57766723632812, + "loss": 0.6406, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0398868322372437, + "rewards/margins": 0.240803524851799, + "rewards/rejected": -1.2806904315948486, + "step": 5180 + }, + { + "epoch": 0.8942108890420399, + "grad_norm": 19.098445892333984, + "learning_rate": 1.770436561010297e-07, + "logits/chosen": -2.11548113822937, + "logits/rejected": -2.0990681648254395, + "logps/chosen": -151.03292846679688, + "logps/rejected": -174.67657470703125, + "loss": 0.6363, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.981927216053009, + "rewards/margins": 0.22593578696250916, + "rewards/rejected": -1.2078630924224854, + "step": 5190 + }, + { + "epoch": 0.895933838731909, + "grad_norm": 14.947031021118164, + "learning_rate": 1.7691568911899556e-07, + "logits/chosen": -2.1943612098693848, + "logits/rejected": -2.1697402000427246, + "logps/chosen": -144.95828247070312, + "logps/rejected": -175.0249786376953, + "loss": 0.6107, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9287967681884766, + "rewards/margins": 0.28082185983657837, + "rewards/rejected": -1.2096188068389893, + "step": 5200 + }, + { + "epoch": 0.895933838731909, + "eval_logits/chosen": -2.2476398944854736, + "eval_logits/rejected": -2.236311197280884, + "eval_logps/chosen": -133.41664123535156, + "eval_logps/rejected": -155.67408752441406, + "eval_loss": 0.6407474279403687, + "eval_rewards/accuracies": 0.6349906921386719, + "eval_rewards/chosen": -0.747047483921051, + "eval_rewards/margins": 0.17789211869239807, + "eval_rewards/rejected": -0.9249395728111267, + "eval_runtime": 384.3229, + "eval_samples_per_second": 11.199, + "eval_steps_per_second": 1.4, + "step": 5200 + }, + { + "epoch": 0.8976567884217781, + "grad_norm": 14.731963157653809, + "learning_rate": 1.7678741298177092e-07, + "logits/chosen": -2.138547658920288, + "logits/rejected": -2.119096279144287, + "logps/chosen": -143.7112274169922, + "logps/rejected": -167.04486083984375, + "loss": 0.6323, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9207198023796082, + "rewards/margins": 0.24972346425056458, + "rewards/rejected": -1.1704432964324951, + "step": 5210 + }, + { + "epoch": 0.8993797381116472, + "grad_norm": 13.480340003967285, + "learning_rate": 1.766588282049494e-07, + "logits/chosen": -2.1872940063476562, + "logits/rejected": -2.1705737113952637, + "logps/chosen": -134.97857666015625, + "logps/rejected": -158.5009307861328, + "loss": 0.6425, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8236749768257141, + "rewards/margins": 0.20152375102043152, + "rewards/rejected": -1.0251986980438232, + "step": 5220 + }, + { + "epoch": 0.9011026878015161, + "grad_norm": 16.07693099975586, + "learning_rate": 1.7652993530536497e-07, + "logits/chosen": -2.1148829460144043, + "logits/rejected": -2.096147298812866, + "logps/chosen": -139.06875610351562, + "logps/rejected": -180.09439086914062, + "loss": 0.5651, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8380593061447144, + "rewards/margins": 0.40176233649253845, + "rewards/rejected": -1.2398215532302856, + "step": 5230 + }, + { + "epoch": 0.9028256374913852, + "grad_norm": 16.134185791015625, + "learning_rate": 1.764007348010903e-07, + "logits/chosen": -2.11022686958313, + "logits/rejected": -2.07845401763916, + "logps/chosen": -138.83816528320312, + "logps/rejected": -184.94107055664062, + "loss": 0.5544, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8606517910957336, + "rewards/margins": 0.4552828371524811, + "rewards/rejected": -1.3159345388412476, + "step": 5240 + }, + { + "epoch": 0.9045485871812543, + "grad_norm": 14.774961471557617, + "learning_rate": 1.762712272114343e-07, + "logits/chosen": -2.062854051589966, + "logits/rejected": -2.043586015701294, + "logps/chosen": -137.57249450683594, + "logps/rejected": -168.33363342285156, + "loss": 0.6076, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8659477233886719, + "rewards/margins": 0.3085930049419403, + "rewards/rejected": -1.174540638923645, + "step": 5250 + }, + { + "epoch": 0.9062715368711234, + "grad_norm": 25.076032638549805, + "learning_rate": 1.7614141305694029e-07, + "logits/chosen": -2.1341543197631836, + "logits/rejected": -2.0903890132904053, + "logps/chosen": -144.52560424804688, + "logps/rejected": -169.38229370117188, + "loss": 0.6099, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8774024844169617, + "rewards/margins": 0.30131369829177856, + "rewards/rejected": -1.1787161827087402, + "step": 5260 + }, + { + "epoch": 0.9079944865609925, + "grad_norm": 13.271434783935547, + "learning_rate": 1.7601129285938364e-07, + "logits/chosen": -2.2262303829193115, + "logits/rejected": -2.208543300628662, + "logps/chosen": -142.20352172851562, + "logps/rejected": -175.04721069335938, + "loss": 0.5965, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9064930081367493, + "rewards/margins": 0.33251988887786865, + "rewards/rejected": -1.2390129566192627, + "step": 5270 + }, + { + "epoch": 0.9097174362508614, + "grad_norm": 17.105276107788086, + "learning_rate": 1.7588086714177003e-07, + "logits/chosen": -2.1332719326019287, + "logits/rejected": -2.0988264083862305, + "logps/chosen": -152.53726196289062, + "logps/rejected": -174.6946563720703, + "loss": 0.6222, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9816780090332031, + "rewards/margins": 0.2585660517215729, + "rewards/rejected": -1.2402441501617432, + "step": 5280 + }, + { + "epoch": 0.9114403859407305, + "grad_norm": 24.475873947143555, + "learning_rate": 1.7575013642833295e-07, + "logits/chosen": -2.0810658931732178, + "logits/rejected": -2.039071798324585, + "logps/chosen": -147.7109832763672, + "logps/rejected": -168.82142639160156, + "loss": 0.6285, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9185523986816406, + "rewards/margins": 0.24926848709583282, + "rewards/rejected": -1.167820930480957, + "step": 5290 + }, + { + "epoch": 0.9131633356305996, + "grad_norm": 13.721856117248535, + "learning_rate": 1.7561910124453195e-07, + "logits/chosen": -2.087953805923462, + "logits/rejected": -2.0669302940368652, + "logps/chosen": -144.81130981445312, + "logps/rejected": -178.23941040039062, + "loss": 0.5901, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8971312642097473, + "rewards/margins": 0.3337869942188263, + "rewards/rejected": -1.2309181690216064, + "step": 5300 + }, + { + "epoch": 0.9148862853204687, + "grad_norm": 14.560609817504883, + "learning_rate": 1.7548776211705034e-07, + "logits/chosen": -2.1995584964752197, + "logits/rejected": -2.185981512069702, + "logps/chosen": -152.6493682861328, + "logps/rejected": -172.60911560058594, + "loss": 0.6313, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9451559782028198, + "rewards/margins": 0.25696811079978943, + "rewards/rejected": -1.2021242380142212, + "step": 5310 + }, + { + "epoch": 0.9166092350103378, + "grad_norm": 18.688629150390625, + "learning_rate": 1.7535611957379302e-07, + "logits/chosen": -2.129035472869873, + "logits/rejected": -2.074998140335083, + "logps/chosen": -151.6221160888672, + "logps/rejected": -174.55166625976562, + "loss": 0.6073, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9406827092170715, + "rewards/margins": 0.30224713683128357, + "rewards/rejected": -1.2429296970367432, + "step": 5320 + }, + { + "epoch": 0.9183321847002067, + "grad_norm": 12.732282638549805, + "learning_rate": 1.7522417414388446e-07, + "logits/chosen": -2.0805225372314453, + "logits/rejected": -2.074148416519165, + "logps/chosen": -141.38742065429688, + "logps/rejected": -182.73350524902344, + "loss": 0.5767, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9053370356559753, + "rewards/margins": 0.38774365186691284, + "rewards/rejected": -1.2930806875228882, + "step": 5330 + }, + { + "epoch": 0.9200551343900758, + "grad_norm": 16.328292846679688, + "learning_rate": 1.7509192635766664e-07, + "logits/chosen": -2.1107449531555176, + "logits/rejected": -2.062073230743408, + "logps/chosen": -146.4422607421875, + "logps/rejected": -170.7941131591797, + "loss": 0.6002, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.90080326795578, + "rewards/margins": 0.2933877110481262, + "rewards/rejected": -1.1941907405853271, + "step": 5340 + }, + { + "epoch": 0.9217780840799449, + "grad_norm": 19.853805541992188, + "learning_rate": 1.7495937674669675e-07, + "logits/chosen": -2.062171459197998, + "logits/rejected": -2.0297489166259766, + "logps/chosen": -144.7355499267578, + "logps/rejected": -170.14242553710938, + "loss": 0.6209, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9209309816360474, + "rewards/margins": 0.2607792317867279, + "rewards/rejected": -1.181710124015808, + "step": 5350 + }, + { + "epoch": 0.923501033769814, + "grad_norm": 16.074657440185547, + "learning_rate": 1.7482652584374514e-07, + "logits/chosen": -2.160583019256592, + "logits/rejected": -2.1394407749176025, + "logps/chosen": -142.66517639160156, + "logps/rejected": -187.32321166992188, + "loss": 0.5671, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8945107460021973, + "rewards/margins": 0.4139057993888855, + "rewards/rejected": -1.3084166049957275, + "step": 5360 + }, + { + "epoch": 0.9252239834596829, + "grad_norm": 20.958538055419922, + "learning_rate": 1.7469337418279325e-07, + "logits/chosen": -2.069669008255005, + "logits/rejected": -2.050961971282959, + "logps/chosen": -143.38893127441406, + "logps/rejected": -171.64334106445312, + "loss": 0.6235, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9146418571472168, + "rewards/margins": 0.26775017380714417, + "rewards/rejected": -1.1823922395706177, + "step": 5370 + }, + { + "epoch": 0.926946933149552, + "grad_norm": 19.33513069152832, + "learning_rate": 1.7455992229903133e-07, + "logits/chosen": -2.16186785697937, + "logits/rejected": -2.122178077697754, + "logps/chosen": -155.24649047851562, + "logps/rejected": -181.39578247070312, + "loss": 0.5992, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9784836769104004, + "rewards/margins": 0.3191712498664856, + "rewards/rejected": -1.2976548671722412, + "step": 5380 + }, + { + "epoch": 0.9286698828394211, + "grad_norm": 19.882524490356445, + "learning_rate": 1.7442617072885627e-07, + "logits/chosen": -2.08012056350708, + "logits/rejected": -2.0367953777313232, + "logps/chosen": -161.18124389648438, + "logps/rejected": -183.7856903076172, + "loss": 0.6114, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0261393785476685, + "rewards/margins": 0.29558470845222473, + "rewards/rejected": -1.3217241764068604, + "step": 5390 + }, + { + "epoch": 0.9303928325292902, + "grad_norm": 16.86815643310547, + "learning_rate": 1.7429212000986965e-07, + "logits/chosen": -2.0811963081359863, + "logits/rejected": -2.0536980628967285, + "logps/chosen": -143.79843139648438, + "logps/rejected": -189.55067443847656, + "loss": 0.563, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9269205927848816, + "rewards/margins": 0.4296680986881256, + "rewards/rejected": -1.35658860206604, + "step": 5400 + }, + { + "epoch": 0.9321157822191593, + "grad_norm": 27.655717849731445, + "learning_rate": 1.7415777068087545e-07, + "logits/chosen": -2.086303949356079, + "logits/rejected": -2.0706019401550293, + "logps/chosen": -162.78172302246094, + "logps/rejected": -184.86241149902344, + "loss": 0.6245, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0614235401153564, + "rewards/margins": 0.2463240623474121, + "rewards/rejected": -1.3077476024627686, + "step": 5410 + }, + { + "epoch": 0.9338387319090282, + "grad_norm": 19.206357955932617, + "learning_rate": 1.7402312328187776e-07, + "logits/chosen": -2.1491780281066895, + "logits/rejected": -2.1271309852600098, + "logps/chosen": -153.69996643066406, + "logps/rejected": -181.80645751953125, + "loss": 0.6144, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9998170733451843, + "rewards/margins": 0.28072595596313477, + "rewards/rejected": -1.2805430889129639, + "step": 5420 + }, + { + "epoch": 0.9355616815988973, + "grad_norm": 19.00049591064453, + "learning_rate": 1.7388817835407884e-07, + "logits/chosen": -2.1044201850891113, + "logits/rejected": -2.08538556098938, + "logps/chosen": -156.75936889648438, + "logps/rejected": -189.45535278320312, + "loss": 0.5936, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0316128730773926, + "rewards/margins": 0.3454335331916809, + "rewards/rejected": -1.3770463466644287, + "step": 5430 + }, + { + "epoch": 0.9372846312887664, + "grad_norm": 14.75564956665039, + "learning_rate": 1.737529364398768e-07, + "logits/chosen": -2.086437225341797, + "logits/rejected": -2.061494827270508, + "logps/chosen": -159.58126831054688, + "logps/rejected": -198.78977966308594, + "loss": 0.5698, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0337985754013062, + "rewards/margins": 0.4016556739807129, + "rewards/rejected": -1.4354543685913086, + "step": 5440 + }, + { + "epoch": 0.9390075809786355, + "grad_norm": 24.83152961730957, + "learning_rate": 1.7361739808286343e-07, + "logits/chosen": -2.0081675052642822, + "logits/rejected": -1.988793969154358, + "logps/chosen": -162.98727416992188, + "logps/rejected": -194.2002716064453, + "loss": 0.5992, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0740816593170166, + "rewards/margins": 0.3331459164619446, + "rewards/rejected": -1.407227635383606, + "step": 5450 + }, + { + "epoch": 0.9407305306685044, + "grad_norm": 17.985929489135742, + "learning_rate": 1.7348156382782215e-07, + "logits/chosen": -2.0747618675231934, + "logits/rejected": -2.0514020919799805, + "logps/chosen": -161.99270629882812, + "logps/rejected": -185.65159606933594, + "loss": 0.6304, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.0814931392669678, + "rewards/margins": 0.2549132704734802, + "rewards/rejected": -1.3364064693450928, + "step": 5460 + }, + { + "epoch": 0.9424534803583735, + "grad_norm": 18.192277908325195, + "learning_rate": 1.733454342207256e-07, + "logits/chosen": -2.0366196632385254, + "logits/rejected": -2.017728328704834, + "logps/chosen": -159.3045654296875, + "logps/rejected": -180.2318878173828, + "loss": 0.6695, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.0794590711593628, + "rewards/margins": 0.21801552176475525, + "rewards/rejected": -1.29747474193573, + "step": 5470 + }, + { + "epoch": 0.9441764300482426, + "grad_norm": 19.787668228149414, + "learning_rate": 1.732090098087336e-07, + "logits/chosen": -2.0741186141967773, + "logits/rejected": -2.04555082321167, + "logps/chosen": -153.68495178222656, + "logps/rejected": -189.20364379882812, + "loss": 0.5878, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0074174404144287, + "rewards/margins": 0.36818450689315796, + "rewards/rejected": -1.3756020069122314, + "step": 5480 + }, + { + "epoch": 0.9458993797381117, + "grad_norm": 14.216739654541016, + "learning_rate": 1.7307229114019091e-07, + "logits/chosen": -2.0435163974761963, + "logits/rejected": -2.010690212249756, + "logps/chosen": -154.7379150390625, + "logps/rejected": -180.75912475585938, + "loss": 0.6133, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9917277097702026, + "rewards/margins": 0.29374217987060547, + "rewards/rejected": -1.285469889640808, + "step": 5490 + }, + { + "epoch": 0.9476223294279807, + "grad_norm": 21.31083106994629, + "learning_rate": 1.7293527876462504e-07, + "logits/chosen": -2.1244988441467285, + "logits/rejected": -2.1008124351501465, + "logps/chosen": -150.3030242919922, + "logps/rejected": -189.2754364013672, + "loss": 0.581, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9649218320846558, + "rewards/margins": 0.3811757564544678, + "rewards/rejected": -1.346097707748413, + "step": 5500 + }, + { + "epoch": 0.9493452791178497, + "grad_norm": 17.781944274902344, + "learning_rate": 1.72797973232744e-07, + "logits/chosen": -2.0382659435272217, + "logits/rejected": -2.017784833908081, + "logps/chosen": -165.6508026123047, + "logps/rejected": -191.04080200195312, + "loss": 0.6376, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1242339611053467, + "rewards/margins": 0.2588551640510559, + "rewards/rejected": -1.3830890655517578, + "step": 5510 + }, + { + "epoch": 0.9510682288077188, + "grad_norm": 21.734769821166992, + "learning_rate": 1.726603750964341e-07, + "logits/chosen": -2.031649112701416, + "logits/rejected": -2.0036509037017822, + "logps/chosen": -155.42190551757812, + "logps/rejected": -185.03111267089844, + "loss": 0.5963, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9959745407104492, + "rewards/margins": 0.33889469504356384, + "rewards/rejected": -1.334869146347046, + "step": 5520 + }, + { + "epoch": 0.9527911784975879, + "grad_norm": 20.869234085083008, + "learning_rate": 1.725224849087578e-07, + "logits/chosen": -2.066890239715576, + "logits/rejected": -2.0298147201538086, + "logps/chosen": -162.42332458496094, + "logps/rejected": -187.11082458496094, + "loss": 0.6107, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0251661539077759, + "rewards/margins": 0.3152082860469818, + "rewards/rejected": -1.34037446975708, + "step": 5530 + }, + { + "epoch": 0.954514128187457, + "grad_norm": 22.375978469848633, + "learning_rate": 1.723843032239514e-07, + "logits/chosen": -2.0838558673858643, + "logits/rejected": -2.078803539276123, + "logps/chosen": -147.0901641845703, + "logps/rejected": -188.95034790039062, + "loss": 0.584, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9474297761917114, + "rewards/margins": 0.37506046891212463, + "rewards/rejected": -1.3224903345108032, + "step": 5540 + }, + { + "epoch": 0.956237077877326, + "grad_norm": 16.93791961669922, + "learning_rate": 1.722458305974229e-07, + "logits/chosen": -1.950770378112793, + "logits/rejected": -1.9323211908340454, + "logps/chosen": -156.38357543945312, + "logps/rejected": -177.25633239746094, + "loss": 0.6578, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.0376572608947754, + "rewards/margins": 0.19414012134075165, + "rewards/rejected": -1.231797456741333, + "step": 5550 + }, + { + "epoch": 0.957960027567195, + "grad_norm": 17.297195434570312, + "learning_rate": 1.7210706758574957e-07, + "logits/chosen": -2.0867764949798584, + "logits/rejected": -2.0538828372955322, + "logps/chosen": -140.0636444091797, + "logps/rejected": -172.29058837890625, + "loss": 0.5909, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.881769061088562, + "rewards/margins": 0.3462349772453308, + "rewards/rejected": -1.2280040979385376, + "step": 5560 + }, + { + "epoch": 0.9596829772570641, + "grad_norm": 16.964792251586914, + "learning_rate": 1.71968014746676e-07, + "logits/chosen": -2.115882635116577, + "logits/rejected": -2.0990474224090576, + "logps/chosen": -137.8494415283203, + "logps/rejected": -174.75584411621094, + "loss": 0.6051, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8718293905258179, + "rewards/margins": 0.30636200308799744, + "rewards/rejected": -1.1781913042068481, + "step": 5570 + }, + { + "epoch": 0.9614059269469332, + "grad_norm": 14.352399826049805, + "learning_rate": 1.7182867263911163e-07, + "logits/chosen": -1.999959945678711, + "logits/rejected": -1.9783000946044922, + "logps/chosen": -146.61141967773438, + "logps/rejected": -180.71133422851562, + "loss": 0.5877, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.934338390827179, + "rewards/margins": 0.3349146246910095, + "rewards/rejected": -1.2692530155181885, + "step": 5580 + }, + { + "epoch": 0.9631288766368022, + "grad_norm": 29.507858276367188, + "learning_rate": 1.7168904182312863e-07, + "logits/chosen": -2.149902820587158, + "logits/rejected": -2.106049060821533, + "logps/chosen": -155.22250366210938, + "logps/rejected": -183.86978149414062, + "loss": 0.6186, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9839137196540833, + "rewards/margins": 0.3032621741294861, + "rewards/rejected": -1.2871758937835693, + "step": 5590 + }, + { + "epoch": 0.9648518263266712, + "grad_norm": 18.434415817260742, + "learning_rate": 1.715491228599596e-07, + "logits/chosen": -2.1140358448028564, + "logits/rejected": -2.101685047149658, + "logps/chosen": -149.51068115234375, + "logps/rejected": -186.94764709472656, + "loss": 0.6061, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9700676798820496, + "rewards/margins": 0.326327383518219, + "rewards/rejected": -1.296394944190979, + "step": 5600 + }, + { + "epoch": 0.9648518263266712, + "eval_logits/chosen": -2.1859350204467773, + "eval_logits/rejected": -2.1733474731445312, + "eval_logps/chosen": -137.2255096435547, + "eval_logps/rejected": -160.406982421875, + "eval_loss": 0.6391822695732117, + "eval_rewards/accuracies": 0.63150554895401, + "eval_rewards/chosen": -0.785136342048645, + "eval_rewards/margins": 0.1871323138475418, + "eval_rewards/rejected": -0.9722687005996704, + "eval_runtime": 384.4615, + "eval_samples_per_second": 11.195, + "eval_steps_per_second": 1.399, + "step": 5600 + }, + { + "epoch": 0.9665747760165403, + "grad_norm": 16.30390739440918, + "learning_rate": 1.7140891631199533e-07, + "logits/chosen": -2.121743679046631, + "logits/rejected": -2.09936785697937, + "logps/chosen": -145.47705078125, + "logps/rejected": -184.77903747558594, + "loss": 0.5891, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9307653307914734, + "rewards/margins": 0.3514396548271179, + "rewards/rejected": -1.2822052240371704, + "step": 5610 + }, + { + "epoch": 0.9682977257064094, + "grad_norm": 22.45163345336914, + "learning_rate": 1.7126842274278245e-07, + "logits/chosen": -2.036102771759033, + "logits/rejected": -2.013686418533325, + "logps/chosen": -157.7967987060547, + "logps/rejected": -178.23263549804688, + "loss": 0.6294, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0277711153030396, + "rewards/margins": 0.2493065595626831, + "rewards/rejected": -1.2770777940750122, + "step": 5620 + }, + { + "epoch": 0.9700206753962785, + "grad_norm": 18.08307456970215, + "learning_rate": 1.7112764271702135e-07, + "logits/chosen": -2.1119580268859863, + "logits/rejected": -2.081120729446411, + "logps/chosen": -154.29696655273438, + "logps/rejected": -172.24014282226562, + "loss": 0.6441, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9838927388191223, + "rewards/margins": 0.226963073015213, + "rewards/rejected": -1.2108559608459473, + "step": 5630 + }, + { + "epoch": 0.9717436250861475, + "grad_norm": 15.348020553588867, + "learning_rate": 1.7098657680056373e-07, + "logits/chosen": -2.0765204429626465, + "logits/rejected": -2.053340196609497, + "logps/chosen": -139.92092895507812, + "logps/rejected": -175.59921264648438, + "loss": 0.6002, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8597497940063477, + "rewards/margins": 0.3353044092655182, + "rewards/rejected": -1.195054054260254, + "step": 5640 + }, + { + "epoch": 0.9734665747760165, + "grad_norm": 12.885562896728516, + "learning_rate": 1.7084522556041049e-07, + "logits/chosen": -2.015348434448242, + "logits/rejected": -1.9957529306411743, + "logps/chosen": -138.64785766601562, + "logps/rejected": -175.66854858398438, + "loss": 0.593, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8663733601570129, + "rewards/margins": 0.3682767152786255, + "rewards/rejected": -1.2346501350402832, + "step": 5650 + }, + { + "epoch": 0.9751895244658856, + "grad_norm": 16.89599609375, + "learning_rate": 1.7070358956470923e-07, + "logits/chosen": -2.0125277042388916, + "logits/rejected": -1.9972074031829834, + "logps/chosen": -148.81884765625, + "logps/rejected": -184.15538024902344, + "loss": 0.6028, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9695499539375305, + "rewards/margins": 0.3619958460330963, + "rewards/rejected": -1.3315457105636597, + "step": 5660 + }, + { + "epoch": 0.9769124741557547, + "grad_norm": 19.88907241821289, + "learning_rate": 1.705616693827522e-07, + "logits/chosen": -2.034695863723755, + "logits/rejected": -2.0065276622772217, + "logps/chosen": -140.55987548828125, + "logps/rejected": -173.42185974121094, + "loss": 0.5839, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.8651742935180664, + "rewards/margins": 0.3406986594200134, + "rewards/rejected": -1.2058730125427246, + "step": 5670 + }, + { + "epoch": 0.9786354238456237, + "grad_norm": 23.023468017578125, + "learning_rate": 1.7041946558497388e-07, + "logits/chosen": -2.056576728820801, + "logits/rejected": -2.014734983444214, + "logps/chosen": -158.1607666015625, + "logps/rejected": -190.6458282470703, + "loss": 0.5909, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0378226041793823, + "rewards/margins": 0.3521498739719391, + "rewards/rejected": -1.389972448348999, + "step": 5680 + }, + { + "epoch": 0.9803583735354927, + "grad_norm": 16.29109764099121, + "learning_rate": 1.7027697874294867e-07, + "logits/chosen": -2.086655855178833, + "logits/rejected": -2.049394130706787, + "logps/chosen": -158.97598266601562, + "logps/rejected": -188.78822326660156, + "loss": 0.5832, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0097744464874268, + "rewards/margins": 0.3614472448825836, + "rewards/rejected": -1.3712217807769775, + "step": 5690 + }, + { + "epoch": 0.9820813232253618, + "grad_norm": 29.7178955078125, + "learning_rate": 1.7013420942938876e-07, + "logits/chosen": -1.9082027673721313, + "logits/rejected": -1.887874960899353, + "logps/chosen": -162.42620849609375, + "logps/rejected": -187.9700469970703, + "loss": 0.627, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0464036464691162, + "rewards/margins": 0.30022451281547546, + "rewards/rejected": -1.346628189086914, + "step": 5700 + }, + { + "epoch": 0.9838042729152309, + "grad_norm": 19.80901336669922, + "learning_rate": 1.6999115821814155e-07, + "logits/chosen": -2.0822360515594482, + "logits/rejected": -2.052886486053467, + "logps/chosen": -158.82598876953125, + "logps/rejected": -193.48214721679688, + "loss": 0.5986, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0371277332305908, + "rewards/margins": 0.3718651533126831, + "rewards/rejected": -1.4089930057525635, + "step": 5710 + }, + { + "epoch": 0.9855272226051, + "grad_norm": 17.461763381958008, + "learning_rate": 1.6984782568418766e-07, + "logits/chosen": -2.0551178455352783, + "logits/rejected": -2.0250158309936523, + "logps/chosen": -150.8203887939453, + "logps/rejected": -192.51698303222656, + "loss": 0.5634, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9880839586257935, + "rewards/margins": 0.4387296736240387, + "rewards/rejected": -1.4268134832382202, + "step": 5720 + }, + { + "epoch": 0.987250172294969, + "grad_norm": 23.234567642211914, + "learning_rate": 1.697042124036383e-07, + "logits/chosen": -2.117779493331909, + "logits/rejected": -2.1022117137908936, + "logps/chosen": -154.16024780273438, + "logps/rejected": -186.769287109375, + "loss": 0.6134, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.024356484413147, + "rewards/margins": 0.324504554271698, + "rewards/rejected": -1.3488609790802002, + "step": 5730 + }, + { + "epoch": 0.988973121984838, + "grad_norm": 17.887039184570312, + "learning_rate": 1.6956031895373327e-07, + "logits/chosen": -2.0169944763183594, + "logits/rejected": -1.9780069589614868, + "logps/chosen": -170.80160522460938, + "logps/rejected": -204.9024658203125, + "loss": 0.5819, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1159305572509766, + "rewards/margins": 0.38865792751312256, + "rewards/rejected": -1.5045883655548096, + "step": 5740 + }, + { + "epoch": 0.9906960716747071, + "grad_norm": 18.053930282592773, + "learning_rate": 1.6941614591283834e-07, + "logits/chosen": -2.1555233001708984, + "logits/rejected": -2.126132011413574, + "logps/chosen": -162.896484375, + "logps/rejected": -179.764892578125, + "loss": 0.6383, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0623223781585693, + "rewards/margins": 0.23261627554893494, + "rewards/rejected": -1.2949388027191162, + "step": 5750 + }, + { + "epoch": 0.9924190213645762, + "grad_norm": 18.37833595275879, + "learning_rate": 1.6927169386044313e-07, + "logits/chosen": -2.0529894828796387, + "logits/rejected": -2.0172603130340576, + "logps/chosen": -156.34518432617188, + "logps/rejected": -190.53683471679688, + "loss": 0.5971, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9993540048599243, + "rewards/margins": 0.3503246307373047, + "rewards/rejected": -1.34967839717865, + "step": 5760 + }, + { + "epoch": 0.9941419710544452, + "grad_norm": 17.409399032592773, + "learning_rate": 1.691269633771588e-07, + "logits/chosen": -2.0082180500030518, + "logits/rejected": -1.975886583328247, + "logps/chosen": -144.2236328125, + "logps/rejected": -185.03004455566406, + "loss": 0.5754, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9236896634101868, + "rewards/margins": 0.38613972067832947, + "rewards/rejected": -1.3098294734954834, + "step": 5770 + }, + { + "epoch": 0.9958649207443143, + "grad_norm": 18.62607192993164, + "learning_rate": 1.6898195504471552e-07, + "logits/chosen": -2.02447509765625, + "logits/rejected": -1.9947541952133179, + "logps/chosen": -151.97547912597656, + "logps/rejected": -192.77957153320312, + "loss": 0.5888, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9951189756393433, + "rewards/margins": 0.39299628138542175, + "rewards/rejected": -1.3881151676177979, + "step": 5780 + }, + { + "epoch": 0.9975878704341833, + "grad_norm": 14.568831443786621, + "learning_rate": 1.688366694459603e-07, + "logits/chosen": -1.955736517906189, + "logits/rejected": -1.9136276245117188, + "logps/chosen": -142.00694274902344, + "logps/rejected": -181.83102416992188, + "loss": 0.5724, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9053285717964172, + "rewards/margins": 0.37364915013313293, + "rewards/rejected": -1.278977632522583, + "step": 5790 + }, + { + "epoch": 0.9993108201240524, + "grad_norm": 18.89543342590332, + "learning_rate": 1.6869110716485456e-07, + "logits/chosen": -2.0314784049987793, + "logits/rejected": -1.9947092533111572, + "logps/chosen": -153.63650512695312, + "logps/rejected": -198.30746459960938, + "loss": 0.5658, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0288561582565308, + "rewards/margins": 0.4323458671569824, + "rewards/rejected": -1.4612020254135132, + "step": 5800 + }, + { + "epoch": 1.0010337698139213, + "grad_norm": 18.528011322021484, + "learning_rate": 1.6854526878647186e-07, + "logits/chosen": -2.0780978202819824, + "logits/rejected": -2.0521152019500732, + "logps/chosen": -163.8245086669922, + "logps/rejected": -199.75381469726562, + "loss": 0.5987, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.097534418106079, + "rewards/margins": 0.36402350664138794, + "rewards/rejected": -1.4615581035614014, + "step": 5810 + }, + { + "epoch": 1.0027567195037905, + "grad_norm": 18.017547607421875, + "learning_rate": 1.6839915489699545e-07, + "logits/chosen": -2.062495708465576, + "logits/rejected": -2.0183212757110596, + "logps/chosen": -163.41049194335938, + "logps/rejected": -210.4263916015625, + "loss": 0.5397, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1038446426391602, + "rewards/margins": 0.5071204304695129, + "rewards/rejected": -1.6109651327133179, + "step": 5820 + }, + { + "epoch": 1.0044796691936595, + "grad_norm": 26.828323364257812, + "learning_rate": 1.682527660837161e-07, + "logits/chosen": -1.954272985458374, + "logits/rejected": -1.9177658557891846, + "logps/chosen": -158.9732666015625, + "logps/rejected": -200.26998901367188, + "loss": 0.5782, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0547670125961304, + "rewards/margins": 0.4192841053009033, + "rewards/rejected": -1.4740509986877441, + "step": 5830 + }, + { + "epoch": 1.0062026188835287, + "grad_norm": 15.48154354095459, + "learning_rate": 1.6810610293502944e-07, + "logits/chosen": -1.9852135181427002, + "logits/rejected": -1.9541308879852295, + "logps/chosen": -156.57272338867188, + "logps/rejected": -206.9540252685547, + "loss": 0.5533, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0418469905853271, + "rewards/margins": 0.4819900393486023, + "rewards/rejected": -1.5238369703292847, + "step": 5840 + }, + { + "epoch": 1.0079255685733977, + "grad_norm": 19.590543746948242, + "learning_rate": 1.679591660404339e-07, + "logits/chosen": -2.031728982925415, + "logits/rejected": -1.9990825653076172, + "logps/chosen": -155.19406127929688, + "logps/rejected": -213.0499725341797, + "loss": 0.5278, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.042792797088623, + "rewards/margins": 0.5427265167236328, + "rewards/rejected": -1.5855190753936768, + "step": 5850 + }, + { + "epoch": 1.0096485182632666, + "grad_norm": 15.18802547454834, + "learning_rate": 1.6781195599052807e-07, + "logits/chosen": -1.8982799053192139, + "logits/rejected": -1.8733913898468018, + "logps/chosen": -160.97203063964844, + "logps/rejected": -217.6151123046875, + "loss": 0.5504, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0919866561889648, + "rewards/margins": 0.5518103837966919, + "rewards/rejected": -1.6437969207763672, + "step": 5860 + }, + { + "epoch": 1.0113714679531358, + "grad_norm": 27.201187133789062, + "learning_rate": 1.6766447337700865e-07, + "logits/chosen": -1.9276330471038818, + "logits/rejected": -1.8953415155410767, + "logps/chosen": -177.0043487548828, + "logps/rejected": -222.3863067626953, + "loss": 0.5629, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2082350254058838, + "rewards/margins": 0.4653599262237549, + "rewards/rejected": -1.6735950708389282, + "step": 5870 + }, + { + "epoch": 1.0130944176430048, + "grad_norm": 22.19955825805664, + "learning_rate": 1.6751671879266769e-07, + "logits/chosen": -1.9740365743637085, + "logits/rejected": -1.9525846242904663, + "logps/chosen": -168.96153259277344, + "logps/rejected": -208.54513549804688, + "loss": 0.5868, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1421959400177002, + "rewards/margins": 0.4119526743888855, + "rewards/rejected": -1.5541484355926514, + "step": 5880 + }, + { + "epoch": 1.014817367332874, + "grad_norm": 27.656230926513672, + "learning_rate": 1.673686928313905e-07, + "logits/chosen": -1.9965871572494507, + "logits/rejected": -1.968044638633728, + "logps/chosen": -170.11952209472656, + "logps/rejected": -217.99942016601562, + "loss": 0.5625, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1495411396026611, + "rewards/margins": 0.44836121797561646, + "rewards/rejected": -1.597902536392212, + "step": 5890 + }, + { + "epoch": 1.016540317022743, + "grad_norm": 18.465089797973633, + "learning_rate": 1.6722039608815315e-07, + "logits/chosen": -1.9055583477020264, + "logits/rejected": -1.8747007846832275, + "logps/chosen": -174.04336547851562, + "logps/rejected": -228.24948120117188, + "loss": 0.5291, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1837732791900635, + "rewards/margins": 0.5520281791687012, + "rewards/rejected": -1.7358014583587646, + "step": 5900 + }, + { + "epoch": 1.018263266712612, + "grad_norm": 20.942235946655273, + "learning_rate": 1.670718291590201e-07, + "logits/chosen": -1.9033527374267578, + "logits/rejected": -1.8923523426055908, + "logps/chosen": -169.00942993164062, + "logps/rejected": -211.85433959960938, + "loss": 0.5828, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1649696826934814, + "rewards/margins": 0.3764055669307709, + "rewards/rejected": -1.5413752794265747, + "step": 5910 + }, + { + "epoch": 1.019986216402481, + "grad_norm": 25.44744873046875, + "learning_rate": 1.6692299264114178e-07, + "logits/chosen": -1.9109452962875366, + "logits/rejected": -1.8838907480239868, + "logps/chosen": -172.25010681152344, + "logps/rejected": -210.68661499023438, + "loss": 0.6112, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2076642513275146, + "rewards/margins": 0.3641297221183777, + "rewards/rejected": -1.5717939138412476, + "step": 5920 + }, + { + "epoch": 1.02170916609235, + "grad_norm": 15.050667762756348, + "learning_rate": 1.6677388713275224e-07, + "logits/chosen": -1.9521329402923584, + "logits/rejected": -1.9319088459014893, + "logps/chosen": -176.0291290283203, + "logps/rejected": -216.5508270263672, + "loss": 0.6086, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2580091953277588, + "rewards/margins": 0.39582735300064087, + "rewards/rejected": -1.6538364887237549, + "step": 5930 + }, + { + "epoch": 1.0234321157822193, + "grad_norm": 18.323843002319336, + "learning_rate": 1.6662451323316663e-07, + "logits/chosen": -1.946067452430725, + "logits/rejected": -1.9008439779281616, + "logps/chosen": -157.49227905273438, + "logps/rejected": -209.2881622314453, + "loss": 0.5384, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0325511693954468, + "rewards/margins": 0.529666543006897, + "rewards/rejected": -1.5622177124023438, + "step": 5940 + }, + { + "epoch": 1.0251550654720882, + "grad_norm": 14.298197746276855, + "learning_rate": 1.6647487154277897e-07, + "logits/chosen": -1.8379230499267578, + "logits/rejected": -1.8213342428207397, + "logps/chosen": -162.3971405029297, + "logps/rejected": -204.32907104492188, + "loss": 0.5784, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0869194269180298, + "rewards/margins": 0.4166596531867981, + "rewards/rejected": -1.5035792589187622, + "step": 5950 + }, + { + "epoch": 1.0268780151619572, + "grad_norm": 14.643122673034668, + "learning_rate": 1.6632496266305958e-07, + "logits/chosen": -1.8929506540298462, + "logits/rejected": -1.8443950414657593, + "logps/chosen": -174.22885131835938, + "logps/rejected": -198.41079711914062, + "loss": 0.6354, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1590553522109985, + "rewards/margins": 0.30704209208488464, + "rewards/rejected": -1.4660975933074951, + "step": 5960 + }, + { + "epoch": 1.0286009648518264, + "grad_norm": 17.613201141357422, + "learning_rate": 1.661747871965527e-07, + "logits/chosen": -1.8706638813018799, + "logits/rejected": -1.8416591882705688, + "logps/chosen": -161.9023895263672, + "logps/rejected": -208.8050994873047, + "loss": 0.5632, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1000502109527588, + "rewards/margins": 0.48885440826416016, + "rewards/rejected": -1.588904857635498, + "step": 5970 + }, + { + "epoch": 1.0303239145416954, + "grad_norm": 21.86261749267578, + "learning_rate": 1.6602434574687417e-07, + "logits/chosen": -1.883933424949646, + "logits/rejected": -1.8692964315414429, + "logps/chosen": -155.93893432617188, + "logps/rejected": -211.02001953125, + "loss": 0.5424, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.033689260482788, + "rewards/margins": 0.5113634467124939, + "rewards/rejected": -1.5450528860092163, + "step": 5980 + }, + { + "epoch": 1.0320468642315643, + "grad_norm": 25.24880027770996, + "learning_rate": 1.658736389187089e-07, + "logits/chosen": -1.933140516281128, + "logits/rejected": -1.8884773254394531, + "logps/chosen": -170.1532745361328, + "logps/rejected": -207.06857299804688, + "loss": 0.5904, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.150805115699768, + "rewards/margins": 0.39992770552635193, + "rewards/rejected": -1.5507327318191528, + "step": 5990 + }, + { + "epoch": 1.0337698139214335, + "grad_norm": 26.61385726928711, + "learning_rate": 1.6572266731780842e-07, + "logits/chosen": -1.9090297222137451, + "logits/rejected": -1.877764344215393, + "logps/chosen": -169.45245361328125, + "logps/rejected": -214.60702514648438, + "loss": 0.5701, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1734836101531982, + "rewards/margins": 0.44985073804855347, + "rewards/rejected": -1.623334288597107, + "step": 6000 + }, + { + "epoch": 1.0337698139214335, + "eval_logits/chosen": -2.029165029525757, + "eval_logits/rejected": -2.0121874809265137, + "eval_logps/chosen": -159.05809020996094, + "eval_logps/rejected": -187.67584228515625, + "eval_loss": 0.6356069445610046, + "eval_rewards/accuracies": 0.6291821599006653, + "eval_rewards/chosen": -1.0034619569778442, + "eval_rewards/margins": 0.2414952963590622, + "eval_rewards/rejected": -1.2449570894241333, + "eval_runtime": 384.1954, + "eval_samples_per_second": 11.203, + "eval_steps_per_second": 1.4, + "step": 6000 + }, + { + "epoch": 1.0354927636113025, + "grad_norm": 20.915616989135742, + "learning_rate": 1.655714315509885e-07, + "logits/chosen": -1.9238481521606445, + "logits/rejected": -1.9024574756622314, + "logps/chosen": -169.83511352539062, + "logps/rejected": -201.43551635742188, + "loss": 0.6298, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1679986715316772, + "rewards/margins": 0.3322969377040863, + "rewards/rejected": -1.5002957582473755, + "step": 6010 + }, + { + "epoch": 1.0372157133011717, + "grad_norm": 16.894756317138672, + "learning_rate": 1.654199322261267e-07, + "logits/chosen": -2.0713367462158203, + "logits/rejected": -2.0292153358459473, + "logps/chosen": -170.26837158203125, + "logps/rejected": -205.8231964111328, + "loss": 0.61, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1588108539581299, + "rewards/margins": 0.40042543411254883, + "rewards/rejected": -1.5592362880706787, + "step": 6020 + }, + { + "epoch": 1.0389386629910407, + "grad_norm": 17.70704460144043, + "learning_rate": 1.6526816995215995e-07, + "logits/chosen": -1.775228500366211, + "logits/rejected": -1.7515833377838135, + "logps/chosen": -160.17477416992188, + "logps/rejected": -196.04910278320312, + "loss": 0.6146, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1033680438995361, + "rewards/margins": 0.3539595901966095, + "rewards/rejected": -1.4573276042938232, + "step": 6030 + }, + { + "epoch": 1.0406616126809096, + "grad_norm": 20.00725746154785, + "learning_rate": 1.651161453390821e-07, + "logits/chosen": -1.9683364629745483, + "logits/rejected": -1.9444955587387085, + "logps/chosen": -154.38311767578125, + "logps/rejected": -191.213623046875, + "loss": 0.578, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0067485570907593, + "rewards/margins": 0.3886244297027588, + "rewards/rejected": -1.3953731060028076, + "step": 6040 + }, + { + "epoch": 1.0423845623707788, + "grad_norm": 18.94013786315918, + "learning_rate": 1.6496385899794135e-07, + "logits/chosen": -1.873522400856018, + "logits/rejected": -1.8403030633926392, + "logps/chosen": -168.37149047851562, + "logps/rejected": -204.64468383789062, + "loss": 0.5708, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.0902937650680542, + "rewards/margins": 0.419158399105072, + "rewards/rejected": -1.5094521045684814, + "step": 6050 + }, + { + "epoch": 1.0441075120606478, + "grad_norm": 18.95502281188965, + "learning_rate": 1.64811311540838e-07, + "logits/chosen": -1.9236536026000977, + "logits/rejected": -1.890880823135376, + "logps/chosen": -158.81686401367188, + "logps/rejected": -197.5561981201172, + "loss": 0.5943, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0532209873199463, + "rewards/margins": 0.40269631147384644, + "rewards/rejected": -1.4559170007705688, + "step": 6060 + }, + { + "epoch": 1.045830461750517, + "grad_norm": 24.840234756469727, + "learning_rate": 1.6465850358092184e-07, + "logits/chosen": -1.9287309646606445, + "logits/rejected": -1.8931522369384766, + "logps/chosen": -159.619384765625, + "logps/rejected": -200.43862915039062, + "loss": 0.5665, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.0567705631256104, + "rewards/margins": 0.43039005994796753, + "rewards/rejected": -1.487160563468933, + "step": 6070 + }, + { + "epoch": 1.047553411440386, + "grad_norm": 16.706314086914062, + "learning_rate": 1.645054357323897e-07, + "logits/chosen": -1.9066253900527954, + "logits/rejected": -1.8798481225967407, + "logps/chosen": -169.624755859375, + "logps/rejected": -208.3575439453125, + "loss": 0.5701, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1054916381835938, + "rewards/margins": 0.4423116147518158, + "rewards/rejected": -1.5478031635284424, + "step": 6080 + }, + { + "epoch": 1.049276361130255, + "grad_norm": 21.22140121459961, + "learning_rate": 1.6435210861048302e-07, + "logits/chosen": -1.9500453472137451, + "logits/rejected": -1.9190781116485596, + "logps/chosen": -159.1141357421875, + "logps/rejected": -212.4842071533203, + "loss": 0.5258, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.077060341835022, + "rewards/margins": 0.5342671871185303, + "rewards/rejected": -1.6113275289535522, + "step": 6090 + }, + { + "epoch": 1.050999310820124, + "grad_norm": 19.14046287536621, + "learning_rate": 1.6419852283148535e-07, + "logits/chosen": -1.916421890258789, + "logits/rejected": -1.8944313526153564, + "logps/chosen": -177.45870971679688, + "logps/rejected": -230.8317413330078, + "loss": 0.5576, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2067549228668213, + "rewards/margins": 0.56285560131073, + "rewards/rejected": -1.7696106433868408, + "step": 6100 + }, + { + "epoch": 1.052722260509993, + "grad_norm": 17.196231842041016, + "learning_rate": 1.6404467901271998e-07, + "logits/chosen": -1.9056333303451538, + "logits/rejected": -1.872330904006958, + "logps/chosen": -179.09910583496094, + "logps/rejected": -231.61837768554688, + "loss": 0.5544, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2740018367767334, + "rewards/margins": 0.5072966814041138, + "rewards/rejected": -1.7812986373901367, + "step": 6110 + }, + { + "epoch": 1.0544452101998623, + "grad_norm": 21.852598190307617, + "learning_rate": 1.6389057777254722e-07, + "logits/chosen": -1.995835542678833, + "logits/rejected": -1.9330600500106812, + "logps/chosen": -174.4574432373047, + "logps/rejected": -239.14688110351562, + "loss": 0.5111, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2013572454452515, + "rewards/margins": 0.6553361415863037, + "rewards/rejected": -1.8566935062408447, + "step": 6120 + }, + { + "epoch": 1.0561681598897312, + "grad_norm": 15.418428421020508, + "learning_rate": 1.6373621973036224e-07, + "logits/chosen": -1.896733045578003, + "logits/rejected": -1.8596582412719727, + "logps/chosen": -176.39080810546875, + "logps/rejected": -231.16891479492188, + "loss": 0.5421, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2199915647506714, + "rewards/margins": 0.5522419810295105, + "rewards/rejected": -1.7722337245941162, + "step": 6130 + }, + { + "epoch": 1.0578911095796002, + "grad_norm": 33.1326904296875, + "learning_rate": 1.6358160550659213e-07, + "logits/chosen": -1.9121757745742798, + "logits/rejected": -1.8819659948349, + "logps/chosen": -178.7617645263672, + "logps/rejected": -224.37741088867188, + "loss": 0.5819, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2568280696868896, + "rewards/margins": 0.4621516168117523, + "rewards/rejected": -1.7189795970916748, + "step": 6140 + }, + { + "epoch": 1.0596140592694694, + "grad_norm": 24.849287033081055, + "learning_rate": 1.6342673572269398e-07, + "logits/chosen": -1.8367702960968018, + "logits/rejected": -1.81298828125, + "logps/chosen": -182.08493041992188, + "logps/rejected": -224.43612670898438, + "loss": 0.6006, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2907702922821045, + "rewards/margins": 0.45370227098464966, + "rewards/rejected": -1.7444725036621094, + "step": 6150 + }, + { + "epoch": 1.0613370089593384, + "grad_norm": 27.562496185302734, + "learning_rate": 1.632716110011519e-07, + "logits/chosen": -1.76801335811615, + "logits/rejected": -1.7465245723724365, + "logps/chosen": -173.99270629882812, + "logps/rejected": -217.61288452148438, + "loss": 0.5933, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2301690578460693, + "rewards/margins": 0.42046982049942017, + "rewards/rejected": -1.6506388187408447, + "step": 6160 + }, + { + "epoch": 1.0630599586492075, + "grad_norm": 20.34008026123047, + "learning_rate": 1.6311623196547474e-07, + "logits/chosen": -1.8881728649139404, + "logits/rejected": -1.8623225688934326, + "logps/chosen": -203.97837829589844, + "logps/rejected": -253.9075927734375, + "loss": 0.5641, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.445177435874939, + "rewards/margins": 0.5291798710823059, + "rewards/rejected": -1.9743572473526, + "step": 6170 + }, + { + "epoch": 1.0647829083390765, + "grad_norm": 23.07094955444336, + "learning_rate": 1.6296059924019353e-07, + "logits/chosen": -1.8910945653915405, + "logits/rejected": -1.859135627746582, + "logps/chosen": -198.97625732421875, + "logps/rejected": -236.320068359375, + "loss": 0.6122, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4082109928131104, + "rewards/margins": 0.4251943528652191, + "rewards/rejected": -1.8334052562713623, + "step": 6180 + }, + { + "epoch": 1.0665058580289455, + "grad_norm": 25.002544403076172, + "learning_rate": 1.6280471345085901e-07, + "logits/chosen": -1.8824446201324463, + "logits/rejected": -1.8468780517578125, + "logps/chosen": -192.41334533691406, + "logps/rejected": -240.2723388671875, + "loss": 0.571, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3423736095428467, + "rewards/margins": 0.5303239226341248, + "rewards/rejected": -1.8726978302001953, + "step": 6190 + }, + { + "epoch": 1.0682288077188147, + "grad_norm": 18.83368682861328, + "learning_rate": 1.6264857522403906e-07, + "logits/chosen": -1.7882111072540283, + "logits/rejected": -1.7458820343017578, + "logps/chosen": -180.8072967529297, + "logps/rejected": -237.64926147460938, + "loss": 0.5446, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2629281282424927, + "rewards/margins": 0.5611923933029175, + "rewards/rejected": -1.824120283126831, + "step": 6200 + }, + { + "epoch": 1.0699517574086836, + "grad_norm": 22.740522384643555, + "learning_rate": 1.6249218518731623e-07, + "logits/chosen": -1.8761451244354248, + "logits/rejected": -1.8383567333221436, + "logps/chosen": -190.98704528808594, + "logps/rejected": -237.24948120117188, + "loss": 0.5727, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3616178035736084, + "rewards/margins": 0.481916606426239, + "rewards/rejected": -1.843534231185913, + "step": 6210 + }, + { + "epoch": 1.0716747070985528, + "grad_norm": 19.40971565246582, + "learning_rate": 1.6233554396928515e-07, + "logits/chosen": -1.8758461475372314, + "logits/rejected": -1.8481992483139038, + "logps/chosen": -179.4745330810547, + "logps/rejected": -222.5402069091797, + "loss": 0.5813, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2733550071716309, + "rewards/margins": 0.4265773892402649, + "rewards/rejected": -1.699932336807251, + "step": 6220 + }, + { + "epoch": 1.0733976567884218, + "grad_norm": 18.235477447509766, + "learning_rate": 1.6217865219955008e-07, + "logits/chosen": -2.0155832767486572, + "logits/rejected": -1.972914457321167, + "logps/chosen": -177.2640838623047, + "logps/rejected": -251.619873046875, + "loss": 0.4886, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1843140125274658, + "rewards/margins": 0.7663615942001343, + "rewards/rejected": -1.950675368309021, + "step": 6230 + }, + { + "epoch": 1.0751206064782908, + "grad_norm": 20.55992317199707, + "learning_rate": 1.6202151050872242e-07, + "logits/chosen": -1.8177560567855835, + "logits/rejected": -1.7749807834625244, + "logps/chosen": -177.9635009765625, + "logps/rejected": -224.63125610351562, + "loss": 0.574, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2157669067382812, + "rewards/margins": 0.49352017045021057, + "rewards/rejected": -1.7092870473861694, + "step": 6240 + }, + { + "epoch": 1.07684355616816, + "grad_norm": 14.043688774108887, + "learning_rate": 1.618641195284179e-07, + "logits/chosen": -1.9259369373321533, + "logits/rejected": -1.894171953201294, + "logps/chosen": -178.1907958984375, + "logps/rejected": -209.7275848388672, + "loss": 0.6409, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2246299982070923, + "rewards/margins": 0.33370086550712585, + "rewards/rejected": -1.5583306550979614, + "step": 6250 + }, + { + "epoch": 1.078566505858029, + "grad_norm": 21.307506561279297, + "learning_rate": 1.6170647989125455e-07, + "logits/chosen": -1.8041244745254517, + "logits/rejected": -1.777611494064331, + "logps/chosen": -180.18528747558594, + "logps/rejected": -208.56021118164062, + "loss": 0.6312, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2626087665557861, + "rewards/margins": 0.31918349862098694, + "rewards/rejected": -1.5817922353744507, + "step": 6260 + }, + { + "epoch": 1.080289455547898, + "grad_norm": 16.19000816345215, + "learning_rate": 1.6154859223084953e-07, + "logits/chosen": -2.079878807067871, + "logits/rejected": -2.0693135261535645, + "logps/chosen": -171.21466064453125, + "logps/rejected": -208.9565887451172, + "loss": 0.6056, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1831395626068115, + "rewards/margins": 0.36004436016082764, + "rewards/rejected": -1.5431840419769287, + "step": 6270 + }, + { + "epoch": 1.082012405237767, + "grad_norm": 19.230445861816406, + "learning_rate": 1.613904571818171e-07, + "logits/chosen": -1.7808287143707275, + "logits/rejected": -1.7513635158538818, + "logps/chosen": -161.0174102783203, + "logps/rejected": -203.79562377929688, + "loss": 0.5709, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.066051721572876, + "rewards/margins": 0.44547238945961, + "rewards/rejected": -1.5115240812301636, + "step": 6280 + }, + { + "epoch": 1.083735354927636, + "grad_norm": 16.056062698364258, + "learning_rate": 1.6123207537976588e-07, + "logits/chosen": -1.8716799020767212, + "logits/rejected": -1.835736870765686, + "logps/chosen": -161.36973571777344, + "logps/rejected": -206.4115753173828, + "loss": 0.5785, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.0999131202697754, + "rewards/margins": 0.4404403269290924, + "rewards/rejected": -1.5403534173965454, + "step": 6290 + }, + { + "epoch": 1.0854583046175053, + "grad_norm": 23.695697784423828, + "learning_rate": 1.6107344746129622e-07, + "logits/chosen": -1.9100465774536133, + "logits/rejected": -1.8787978887557983, + "logps/chosen": -172.82638549804688, + "logps/rejected": -209.90701293945312, + "loss": 0.6107, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2039012908935547, + "rewards/margins": 0.3747173845767975, + "rewards/rejected": -1.5786187648773193, + "step": 6300 + }, + { + "epoch": 1.0871812543073742, + "grad_norm": 19.707185745239258, + "learning_rate": 1.609145740639977e-07, + "logits/chosen": -1.9088573455810547, + "logits/rejected": -1.8732635974884033, + "logps/chosen": -156.1880645751953, + "logps/rejected": -189.70230102539062, + "loss": 0.6169, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.0573114156723022, + "rewards/margins": 0.3241928219795227, + "rewards/rejected": -1.3815044164657593, + "step": 6310 + }, + { + "epoch": 1.0889042039972432, + "grad_norm": 18.144315719604492, + "learning_rate": 1.6075545582644663e-07, + "logits/chosen": -1.895162582397461, + "logits/rejected": -1.8665847778320312, + "logps/chosen": -159.209228515625, + "logps/rejected": -199.00975036621094, + "loss": 0.5928, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.044872760772705, + "rewards/margins": 0.41198864579200745, + "rewards/rejected": -1.4568613767623901, + "step": 6320 + }, + { + "epoch": 1.0906271536871124, + "grad_norm": 18.30056381225586, + "learning_rate": 1.6059609338820342e-07, + "logits/chosen": -1.9051834344863892, + "logits/rejected": -1.8644979000091553, + "logps/chosen": -154.34178161621094, + "logps/rejected": -213.75088500976562, + "loss": 0.5157, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0105177164077759, + "rewards/margins": 0.590909481048584, + "rewards/rejected": -1.6014270782470703, + "step": 6330 + }, + { + "epoch": 1.0923501033769814, + "grad_norm": 17.638629913330078, + "learning_rate": 1.6043648738981e-07, + "logits/chosen": -1.9253466129302979, + "logits/rejected": -1.8944011926651, + "logps/chosen": -162.9455108642578, + "logps/rejected": -203.7376251220703, + "loss": 0.5806, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0735816955566406, + "rewards/margins": 0.43518805503845215, + "rewards/rejected": -1.5087696313858032, + "step": 6340 + }, + { + "epoch": 1.0940730530668505, + "grad_norm": 24.398740768432617, + "learning_rate": 1.6027663847278725e-07, + "logits/chosen": -1.848207712173462, + "logits/rejected": -1.8280870914459229, + "logps/chosen": -172.33726501464844, + "logps/rejected": -217.809814453125, + "loss": 0.5685, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.168224573135376, + "rewards/margins": 0.4468640387058258, + "rewards/rejected": -1.615088701248169, + "step": 6350 + }, + { + "epoch": 1.0957960027567195, + "grad_norm": 18.46108627319336, + "learning_rate": 1.6011654727963252e-07, + "logits/chosen": -1.8542156219482422, + "logits/rejected": -1.8329334259033203, + "logps/chosen": -169.8433380126953, + "logps/rejected": -220.50668334960938, + "loss": 0.5608, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1637474298477173, + "rewards/margins": 0.4954819679260254, + "rewards/rejected": -1.6592292785644531, + "step": 6360 + }, + { + "epoch": 1.0975189524465885, + "grad_norm": 18.699954986572266, + "learning_rate": 1.599562144538169e-07, + "logits/chosen": -1.8681213855743408, + "logits/rejected": -1.8547241687774658, + "logps/chosen": -174.8009490966797, + "logps/rejected": -216.6031036376953, + "loss": 0.6111, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2165663242340088, + "rewards/margins": 0.4050436019897461, + "rewards/rejected": -1.6216099262237549, + "step": 6370 + }, + { + "epoch": 1.0992419021364577, + "grad_norm": 22.507064819335938, + "learning_rate": 1.597956406397827e-07, + "logits/chosen": -1.9227724075317383, + "logits/rejected": -1.8930885791778564, + "logps/chosen": -175.95718383789062, + "logps/rejected": -230.57723999023438, + "loss": 0.5499, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2322263717651367, + "rewards/margins": 0.5083431005477905, + "rewards/rejected": -1.7405694723129272, + "step": 6380 + }, + { + "epoch": 1.1009648518263266, + "grad_norm": 18.604408264160156, + "learning_rate": 1.5963482648294085e-07, + "logits/chosen": -1.9683860540390015, + "logits/rejected": -1.9260038137435913, + "logps/chosen": -171.10850524902344, + "logps/rejected": -217.3498992919922, + "loss": 0.5586, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1502472162246704, + "rewards/margins": 0.5132304430007935, + "rewards/rejected": -1.6634775400161743, + "step": 6390 + }, + { + "epoch": 1.1026878015161956, + "grad_norm": 21.85063362121582, + "learning_rate": 1.5947377262966842e-07, + "logits/chosen": -1.9116064310073853, + "logits/rejected": -1.8747844696044922, + "logps/chosen": -167.57177734375, + "logps/rejected": -214.62210083007812, + "loss": 0.5557, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0987837314605713, + "rewards/margins": 0.48204049468040466, + "rewards/rejected": -1.5808241367340088, + "step": 6400 + }, + { + "epoch": 1.1026878015161956, + "eval_logits/chosen": -1.9953449964523315, + "eval_logits/rejected": -1.977708101272583, + "eval_logps/chosen": -161.66815185546875, + "eval_logps/rejected": -191.02622985839844, + "eval_loss": 0.6357540488243103, + "eval_rewards/accuracies": 0.6322026252746582, + "eval_rewards/chosen": -1.0295625925064087, + "eval_rewards/margins": 0.248898446559906, + "eval_rewards/rejected": -1.27846097946167, + "eval_runtime": 383.9272, + "eval_samples_per_second": 11.21, + "eval_steps_per_second": 1.401, + "step": 6400 + }, + { + "epoch": 1.1044107512060648, + "grad_norm": 29.006738662719727, + "learning_rate": 1.5931247972730572e-07, + "logits/chosen": -1.966294288635254, + "logits/rejected": -1.9305827617645264, + "logps/chosen": -180.78768920898438, + "logps/rejected": -228.24093627929688, + "loss": 0.5943, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2642327547073364, + "rewards/margins": 0.5007797479629517, + "rewards/rejected": -1.765012502670288, + "step": 6410 + }, + { + "epoch": 1.1061337008959338, + "grad_norm": 22.37037467956543, + "learning_rate": 1.591509484241541e-07, + "logits/chosen": -1.8983854055404663, + "logits/rejected": -1.8700834512710571, + "logps/chosen": -185.94509887695312, + "logps/rejected": -225.1170196533203, + "loss": 0.6176, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3115462064743042, + "rewards/margins": 0.40037521719932556, + "rewards/rejected": -1.7119213342666626, + "step": 6420 + }, + { + "epoch": 1.107856650585803, + "grad_norm": 23.627182006835938, + "learning_rate": 1.5898917936947297e-07, + "logits/chosen": -1.8346779346466064, + "logits/rejected": -1.81487238407135, + "logps/chosen": -162.68484497070312, + "logps/rejected": -192.02468872070312, + "loss": 0.6214, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1229327917099, + "rewards/margins": 0.31496235728263855, + "rewards/rejected": -1.4378952980041504, + "step": 6430 + }, + { + "epoch": 1.109579600275672, + "grad_norm": 20.39024543762207, + "learning_rate": 1.5882717321347752e-07, + "logits/chosen": -1.8643989562988281, + "logits/rejected": -1.8361200094223022, + "logps/chosen": -172.6422882080078, + "logps/rejected": -217.25473022460938, + "loss": 0.5813, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1938320398330688, + "rewards/margins": 0.4467882513999939, + "rewards/rejected": -1.640620470046997, + "step": 6440 + }, + { + "epoch": 1.111302549965541, + "grad_norm": 16.047563552856445, + "learning_rate": 1.5866493060733576e-07, + "logits/chosen": -1.8755505084991455, + "logits/rejected": -1.8299802541732788, + "logps/chosen": -165.16580200195312, + "logps/rejected": -213.8719940185547, + "loss": 0.5564, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.088411569595337, + "rewards/margins": 0.5280265212059021, + "rewards/rejected": -1.6164381504058838, + "step": 6450 + }, + { + "epoch": 1.11302549965541, + "grad_norm": 25.144058227539062, + "learning_rate": 1.585024522031663e-07, + "logits/chosen": -1.818200707435608, + "logits/rejected": -1.801749587059021, + "logps/chosen": -164.95494079589844, + "logps/rejected": -234.9093780517578, + "loss": 0.5135, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1307909488677979, + "rewards/margins": 0.6390718221664429, + "rewards/rejected": -1.7698627710342407, + "step": 6460 + }, + { + "epoch": 1.114748449345279, + "grad_norm": 16.80597686767578, + "learning_rate": 1.5833973865403533e-07, + "logits/chosen": -1.8051636219024658, + "logits/rejected": -1.7716310024261475, + "logps/chosen": -155.55227661132812, + "logps/rejected": -197.94503784179688, + "loss": 0.5708, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.027890682220459, + "rewards/margins": 0.43396225571632385, + "rewards/rejected": -1.46185302734375, + "step": 6470 + }, + { + "epoch": 1.1164713990351482, + "grad_norm": 24.668655395507812, + "learning_rate": 1.5817679061395426e-07, + "logits/chosen": -1.8865272998809814, + "logits/rejected": -1.8435176610946655, + "logps/chosen": -164.46475219726562, + "logps/rejected": -199.07980346679688, + "loss": 0.584, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0789591073989868, + "rewards/margins": 0.40726250410079956, + "rewards/rejected": -1.4862215518951416, + "step": 6480 + }, + { + "epoch": 1.1181943487250172, + "grad_norm": 18.045318603515625, + "learning_rate": 1.5801360873787704e-07, + "logits/chosen": -2.040658473968506, + "logits/rejected": -2.015576124191284, + "logps/chosen": -172.48171997070312, + "logps/rejected": -218.50021362304688, + "loss": 0.5734, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1886144876480103, + "rewards/margins": 0.45509299635887146, + "rewards/rejected": -1.643707513809204, + "step": 6490 + }, + { + "epoch": 1.1199172984148862, + "grad_norm": 21.18434715270996, + "learning_rate": 1.5785019368169748e-07, + "logits/chosen": -1.8881008625030518, + "logits/rejected": -1.8674548864364624, + "logps/chosen": -169.26629638671875, + "logps/rejected": -207.85452270507812, + "loss": 0.5721, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1305954456329346, + "rewards/margins": 0.4208308160305023, + "rewards/rejected": -1.5514262914657593, + "step": 6500 + }, + { + "epoch": 1.1216402481047554, + "grad_norm": 18.61454963684082, + "learning_rate": 1.5768654610224664e-07, + "logits/chosen": -1.8763364553451538, + "logits/rejected": -1.8204272985458374, + "logps/chosen": -173.53945922851562, + "logps/rejected": -221.5675506591797, + "loss": 0.5626, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1675516366958618, + "rewards/margins": 0.5180361866950989, + "rewards/rejected": -1.6855876445770264, + "step": 6510 + }, + { + "epoch": 1.1233631977946243, + "grad_norm": 20.92111587524414, + "learning_rate": 1.575226666572901e-07, + "logits/chosen": -1.8613507747650146, + "logits/rejected": -1.829530954360962, + "logps/chosen": -166.28549194335938, + "logps/rejected": -207.47384643554688, + "loss": 0.5742, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.126740574836731, + "rewards/margins": 0.43342143297195435, + "rewards/rejected": -1.560161828994751, + "step": 6520 + }, + { + "epoch": 1.1250861474844935, + "grad_norm": 22.111543655395508, + "learning_rate": 1.573585560055256e-07, + "logits/chosen": -1.809640645980835, + "logits/rejected": -1.7645851373672485, + "logps/chosen": -166.86082458496094, + "logps/rejected": -224.10556030273438, + "loss": 0.5233, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1323686838150024, + "rewards/margins": 0.5942455530166626, + "rewards/rejected": -1.726613998413086, + "step": 6530 + }, + { + "epoch": 1.1268090971743625, + "grad_norm": 19.58144760131836, + "learning_rate": 1.5719421480657996e-07, + "logits/chosen": -1.8357940912246704, + "logits/rejected": -1.8000843524932861, + "logps/chosen": -185.57937622070312, + "logps/rejected": -224.1998291015625, + "loss": 0.6105, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2820502519607544, + "rewards/margins": 0.39689844846725464, + "rewards/rejected": -1.6789487600326538, + "step": 6540 + }, + { + "epoch": 1.1285320468642315, + "grad_norm": 30.742877960205078, + "learning_rate": 1.570296437210068e-07, + "logits/chosen": -1.7698867321014404, + "logits/rejected": -1.7366396188735962, + "logps/chosen": -173.15988159179688, + "logps/rejected": -211.2397003173828, + "loss": 0.62, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.216139316558838, + "rewards/margins": 0.37928685545921326, + "rewards/rejected": -1.595426321029663, + "step": 6550 + }, + { + "epoch": 1.1302549965541007, + "grad_norm": 20.883285522460938, + "learning_rate": 1.5686484341028374e-07, + "logits/chosen": -1.884711503982544, + "logits/rejected": -1.8411281108856201, + "logps/chosen": -167.2931671142578, + "logps/rejected": -211.8631134033203, + "loss": 0.5624, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1128895282745361, + "rewards/margins": 0.48710498213768005, + "rewards/rejected": -1.599994421005249, + "step": 6560 + }, + { + "epoch": 1.1319779462439696, + "grad_norm": 18.167383193969727, + "learning_rate": 1.566998145368097e-07, + "logits/chosen": -1.8972285985946655, + "logits/rejected": -1.8509814739227295, + "logps/chosen": -163.0162353515625, + "logps/rejected": -214.9351043701172, + "loss": 0.5346, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0945223569869995, + "rewards/margins": 0.5278555154800415, + "rewards/rejected": -1.6223779916763306, + "step": 6570 + }, + { + "epoch": 1.1337008959338388, + "grad_norm": 18.919946670532227, + "learning_rate": 1.5653455776390235e-07, + "logits/chosen": -1.892338514328003, + "logits/rejected": -1.849704384803772, + "logps/chosen": -170.286865234375, + "logps/rejected": -201.15330505371094, + "loss": 0.6021, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1525856256484985, + "rewards/margins": 0.35892918705940247, + "rewards/rejected": -1.5115145444869995, + "step": 6580 + }, + { + "epoch": 1.1354238456237078, + "grad_norm": 19.923118591308594, + "learning_rate": 1.563690737557953e-07, + "logits/chosen": -1.8471940755844116, + "logits/rejected": -1.8094072341918945, + "logps/chosen": -164.0819549560547, + "logps/rejected": -214.5155792236328, + "loss": 0.5563, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1124200820922852, + "rewards/margins": 0.4979260563850403, + "rewards/rejected": -1.6103461980819702, + "step": 6590 + }, + { + "epoch": 1.1371467953135768, + "grad_norm": 26.111726760864258, + "learning_rate": 1.562033631776356e-07, + "logits/chosen": -1.9286270141601562, + "logits/rejected": -1.8974510431289673, + "logps/chosen": -170.0607147216797, + "logps/rejected": -214.9732666015625, + "loss": 0.5814, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1442835330963135, + "rewards/margins": 0.46957072615623474, + "rewards/rejected": -1.613854169845581, + "step": 6600 + }, + { + "epoch": 1.138869745003446, + "grad_norm": 16.342317581176758, + "learning_rate": 1.560374266954809e-07, + "logits/chosen": -1.8816630840301514, + "logits/rejected": -1.8479684591293335, + "logps/chosen": -169.02830505371094, + "logps/rejected": -223.63723754882812, + "loss": 0.5361, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1706207990646362, + "rewards/margins": 0.5482100248336792, + "rewards/rejected": -1.7188308238983154, + "step": 6610 + }, + { + "epoch": 1.140592694693315, + "grad_norm": 23.36205291748047, + "learning_rate": 1.5587126497629686e-07, + "logits/chosen": -1.8299745321273804, + "logits/rejected": -1.7964084148406982, + "logps/chosen": -183.80242919921875, + "logps/rejected": -225.1468505859375, + "loss": 0.6006, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3141157627105713, + "rewards/margins": 0.41576558351516724, + "rewards/rejected": -1.7298812866210938, + "step": 6620 + }, + { + "epoch": 1.1423156443831841, + "grad_norm": 20.460121154785156, + "learning_rate": 1.557048786879545e-07, + "logits/chosen": -1.803312063217163, + "logits/rejected": -1.768874168395996, + "logps/chosen": -168.17320251464844, + "logps/rejected": -201.20533752441406, + "loss": 0.5978, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1486170291900635, + "rewards/margins": 0.37151283025741577, + "rewards/rejected": -1.5201297998428345, + "step": 6630 + }, + { + "epoch": 1.144038594073053, + "grad_norm": 24.55524253845215, + "learning_rate": 1.5553826849922747e-07, + "logits/chosen": -1.8844077587127686, + "logits/rejected": -1.839284896850586, + "logps/chosen": -170.33155822753906, + "logps/rejected": -212.1918182373047, + "loss": 0.5843, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1722571849822998, + "rewards/margins": 0.4536797106266022, + "rewards/rejected": -1.62593674659729, + "step": 6640 + }, + { + "epoch": 1.145761543762922, + "grad_norm": 21.957923889160156, + "learning_rate": 1.553714350797893e-07, + "logits/chosen": -1.930938959121704, + "logits/rejected": -1.894940972328186, + "logps/chosen": -166.409423828125, + "logps/rejected": -220.80581665039062, + "loss": 0.5335, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1429588794708252, + "rewards/margins": 0.5315641164779663, + "rewards/rejected": -1.6745229959487915, + "step": 6650 + }, + { + "epoch": 1.1474844934527912, + "grad_norm": 18.168514251708984, + "learning_rate": 1.5520437910021084e-07, + "logits/chosen": -1.9338651895523071, + "logits/rejected": -1.9020391702651978, + "logps/chosen": -162.00460815429688, + "logps/rejected": -216.5800018310547, + "loss": 0.5419, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.102439045906067, + "rewards/margins": 0.5217219591140747, + "rewards/rejected": -1.6241611242294312, + "step": 6660 + }, + { + "epoch": 1.1492074431426602, + "grad_norm": 22.49798011779785, + "learning_rate": 1.550371012319575e-07, + "logits/chosen": -1.8237295150756836, + "logits/rejected": -1.798282265663147, + "logps/chosen": -174.2585906982422, + "logps/rejected": -247.76937866210938, + "loss": 0.5025, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2435170412063599, + "rewards/margins": 0.6812580227851868, + "rewards/rejected": -1.9247751235961914, + "step": 6670 + }, + { + "epoch": 1.1509303928325294, + "grad_norm": 23.629316329956055, + "learning_rate": 1.5486960214738648e-07, + "logits/chosen": -1.8039436340332031, + "logits/rejected": -1.7624622583389282, + "logps/chosen": -183.01832580566406, + "logps/rejected": -225.6625518798828, + "loss": 0.5919, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2919310331344604, + "rewards/margins": 0.44403982162475586, + "rewards/rejected": -1.7359708547592163, + "step": 6680 + }, + { + "epoch": 1.1526533425223984, + "grad_norm": 18.57203483581543, + "learning_rate": 1.547018825197443e-07, + "logits/chosen": -1.8125450611114502, + "logits/rejected": -1.776877760887146, + "logps/chosen": -175.30438232421875, + "logps/rejected": -237.8402099609375, + "loss": 0.5092, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1814472675323486, + "rewards/margins": 0.6386057138442993, + "rewards/rejected": -1.8200527429580688, + "step": 6690 + }, + { + "epoch": 1.1543762922122673, + "grad_norm": 33.86186599731445, + "learning_rate": 1.5453394302316366e-07, + "logits/chosen": -1.775186538696289, + "logits/rejected": -1.7512357234954834, + "logps/chosen": -197.62759399414062, + "logps/rejected": -245.69482421875, + "loss": 0.5945, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4149936437606812, + "rewards/margins": 0.4829793870449066, + "rewards/rejected": -1.8979730606079102, + "step": 6700 + }, + { + "epoch": 1.1560992419021365, + "grad_norm": 17.650436401367188, + "learning_rate": 1.5436578433266126e-07, + "logits/chosen": -1.7831552028656006, + "logits/rejected": -1.735656499862671, + "logps/chosen": -192.37847900390625, + "logps/rejected": -256.8363952636719, + "loss": 0.5364, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3545328378677368, + "rewards/margins": 0.6783474683761597, + "rewards/rejected": -2.0328803062438965, + "step": 6710 + }, + { + "epoch": 1.1578221915920055, + "grad_norm": 19.204593658447266, + "learning_rate": 1.5419740712413472e-07, + "logits/chosen": -1.7737830877304077, + "logits/rejected": -1.7366644144058228, + "logps/chosen": -181.94137573242188, + "logps/rejected": -244.99801635742188, + "loss": 0.5295, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3052650690078735, + "rewards/margins": 0.6292239427566528, + "rewards/rejected": -1.9344890117645264, + "step": 6720 + }, + { + "epoch": 1.1595451412818747, + "grad_norm": 29.945493698120117, + "learning_rate": 1.5402881207436e-07, + "logits/chosen": -1.725109338760376, + "logits/rejected": -1.6990764141082764, + "logps/chosen": -194.0322265625, + "logps/rejected": -235.67431640625, + "loss": 0.6083, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.373313546180725, + "rewards/margins": 0.4306615889072418, + "rewards/rejected": -1.8039751052856445, + "step": 6730 + }, + { + "epoch": 1.1612680909717437, + "grad_norm": 27.677576065063477, + "learning_rate": 1.5385999986098858e-07, + "logits/chosen": -1.7985633611679077, + "logits/rejected": -1.774356484413147, + "logps/chosen": -179.78250122070312, + "logps/rejected": -237.5058135986328, + "loss": 0.5401, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.253493070602417, + "rewards/margins": 0.5606546401977539, + "rewards/rejected": -1.814147710800171, + "step": 6740 + }, + { + "epoch": 1.1629910406616126, + "grad_norm": 19.740476608276367, + "learning_rate": 1.5369097116254493e-07, + "logits/chosen": -1.8279622793197632, + "logits/rejected": -1.795508623123169, + "logps/chosen": -190.2114715576172, + "logps/rejected": -249.172119140625, + "loss": 0.555, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.377532720565796, + "rewards/margins": 0.577313244342804, + "rewards/rejected": -1.9548461437225342, + "step": 6750 + }, + { + "epoch": 1.1647139903514818, + "grad_norm": 34.197540283203125, + "learning_rate": 1.5352172665842351e-07, + "logits/chosen": -1.7786098718643188, + "logits/rejected": -1.7375552654266357, + "logps/chosen": -183.84373474121094, + "logps/rejected": -226.8520050048828, + "loss": 0.611, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3179031610488892, + "rewards/margins": 0.4632466435432434, + "rewards/rejected": -1.7811496257781982, + "step": 6760 + }, + { + "epoch": 1.1664369400413508, + "grad_norm": 29.12902069091797, + "learning_rate": 1.5335226702888636e-07, + "logits/chosen": -1.814850091934204, + "logits/rejected": -1.790254831314087, + "logps/chosen": -175.25399780273438, + "logps/rejected": -229.5296630859375, + "loss": 0.557, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2141906023025513, + "rewards/margins": 0.5248024463653564, + "rewards/rejected": -1.7389930486679077, + "step": 6770 + }, + { + "epoch": 1.1681598897312198, + "grad_norm": 23.56708526611328, + "learning_rate": 1.5318259295506004e-07, + "logits/chosen": -1.8106752634048462, + "logits/rejected": -1.7647393941879272, + "logps/chosen": -184.96701049804688, + "logps/rejected": -229.0535125732422, + "loss": 0.5885, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2846510410308838, + "rewards/margins": 0.4726200997829437, + "rewards/rejected": -1.75727117061615, + "step": 6780 + }, + { + "epoch": 1.169882839421089, + "grad_norm": 18.515710830688477, + "learning_rate": 1.5301270511893315e-07, + "logits/chosen": -1.8454278707504272, + "logits/rejected": -1.8067982196807861, + "logps/chosen": -164.33692932128906, + "logps/rejected": -224.3966827392578, + "loss": 0.5411, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1381299495697021, + "rewards/margins": 0.5746427774429321, + "rewards/rejected": -1.7127727270126343, + "step": 6790 + }, + { + "epoch": 1.171605789110958, + "grad_norm": 14.323341369628906, + "learning_rate": 1.5284260420335345e-07, + "logits/chosen": -1.7431209087371826, + "logits/rejected": -1.698785424232483, + "logps/chosen": -167.23757934570312, + "logps/rejected": -224.02554321289062, + "loss": 0.5292, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1481921672821045, + "rewards/margins": 0.5911040306091309, + "rewards/rejected": -1.7392959594726562, + "step": 6800 + }, + { + "epoch": 1.171605789110958, + "eval_logits/chosen": -1.9159184694290161, + "eval_logits/rejected": -1.8969097137451172, + "eval_logps/chosen": -167.49000549316406, + "eval_logps/rejected": -198.10008239746094, + "eval_loss": 0.6332610249519348, + "eval_rewards/accuracies": 0.6312732100486755, + "eval_rewards/chosen": -1.0877811908721924, + "eval_rewards/margins": 0.2614184319972992, + "eval_rewards/rejected": -1.3491995334625244, + "eval_runtime": 384.0244, + "eval_samples_per_second": 11.208, + "eval_steps_per_second": 1.401, + "step": 6800 + }, + { + "epoch": 1.173328738800827, + "grad_norm": 20.453310012817383, + "learning_rate": 1.5267229089202514e-07, + "logits/chosen": -1.7973759174346924, + "logits/rejected": -1.76103937625885, + "logps/chosen": -181.02047729492188, + "logps/rejected": -228.26077270507812, + "loss": 0.5642, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.253167748451233, + "rewards/margins": 0.5119723677635193, + "rewards/rejected": -1.765140175819397, + "step": 6810 + }, + { + "epoch": 1.175051688490696, + "grad_norm": 24.659183502197266, + "learning_rate": 1.5250176586950615e-07, + "logits/chosen": -1.8925281763076782, + "logits/rejected": -1.8536806106567383, + "logps/chosen": -181.89085388183594, + "logps/rejected": -227.59896850585938, + "loss": 0.5735, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2686735391616821, + "rewards/margins": 0.47714129090309143, + "rewards/rejected": -1.7458146810531616, + "step": 6820 + }, + { + "epoch": 1.176774638180565, + "grad_norm": 21.895891189575195, + "learning_rate": 1.523310298212054e-07, + "logits/chosen": -1.9211881160736084, + "logits/rejected": -1.894505262374878, + "logps/chosen": -168.66549682617188, + "logps/rejected": -219.8590850830078, + "loss": 0.5657, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1476205587387085, + "rewards/margins": 0.5208449959754944, + "rewards/rejected": -1.668465256690979, + "step": 6830 + }, + { + "epoch": 1.1784975878704342, + "grad_norm": 19.686376571655273, + "learning_rate": 1.5216008343337987e-07, + "logits/chosen": -1.863774061203003, + "logits/rejected": -1.8285160064697266, + "logps/chosen": -182.8833770751953, + "logps/rejected": -227.8834991455078, + "loss": 0.6113, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2595279216766357, + "rewards/margins": 0.46531790494918823, + "rewards/rejected": -1.7248455286026, + "step": 6840 + }, + { + "epoch": 1.1802205375603032, + "grad_norm": 19.36058807373047, + "learning_rate": 1.5198892739313216e-07, + "logits/chosen": -1.753993034362793, + "logits/rejected": -1.7107868194580078, + "logps/chosen": -165.95858764648438, + "logps/rejected": -216.3078155517578, + "loss": 0.5649, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1179423332214355, + "rewards/margins": 0.5087266564369202, + "rewards/rejected": -1.626668930053711, + "step": 6850 + }, + { + "epoch": 1.1819434872501722, + "grad_norm": 19.93489646911621, + "learning_rate": 1.518175623884074e-07, + "logits/chosen": -1.8617608547210693, + "logits/rejected": -1.8141686916351318, + "logps/chosen": -180.40768432617188, + "logps/rejected": -219.93545532226562, + "loss": 0.573, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2129299640655518, + "rewards/margins": 0.4632079005241394, + "rewards/rejected": -1.676137924194336, + "step": 6860 + }, + { + "epoch": 1.1836664369400414, + "grad_norm": 29.238536834716797, + "learning_rate": 1.516459891079907e-07, + "logits/chosen": -1.7542908191680908, + "logits/rejected": -1.7299522161483765, + "logps/chosen": -177.2001495361328, + "logps/rejected": -225.78271484375, + "loss": 0.5667, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2611467838287354, + "rewards/margins": 0.4789832532405853, + "rewards/rejected": -1.740130066871643, + "step": 6870 + }, + { + "epoch": 1.1853893866299103, + "grad_norm": 17.341331481933594, + "learning_rate": 1.5147420824150435e-07, + "logits/chosen": -1.8384802341461182, + "logits/rejected": -1.7911484241485596, + "logps/chosen": -177.47169494628906, + "logps/rejected": -230.1194305419922, + "loss": 0.5446, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2320877313613892, + "rewards/margins": 0.5574935078620911, + "rewards/rejected": -1.789581298828125, + "step": 6880 + }, + { + "epoch": 1.1871123363197795, + "grad_norm": 20.969749450683594, + "learning_rate": 1.5130222047940492e-07, + "logits/chosen": -1.7615314722061157, + "logits/rejected": -1.7270469665527344, + "logps/chosen": -173.20687866210938, + "logps/rejected": -231.8846435546875, + "loss": 0.5449, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2089654207229614, + "rewards/margins": 0.5859875679016113, + "rewards/rejected": -1.7949531078338623, + "step": 6890 + }, + { + "epoch": 1.1888352860096485, + "grad_norm": 25.91954231262207, + "learning_rate": 1.5113002651298062e-07, + "logits/chosen": -1.8158423900604248, + "logits/rejected": -1.7804663181304932, + "logps/chosen": -184.11569213867188, + "logps/rejected": -229.144775390625, + "loss": 0.5881, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3056111335754395, + "rewards/margins": 0.4514707028865814, + "rewards/rejected": -1.7570817470550537, + "step": 6900 + }, + { + "epoch": 1.1905582356995175, + "grad_norm": 17.826147079467773, + "learning_rate": 1.509576270343485e-07, + "logits/chosen": -1.84814453125, + "logits/rejected": -1.8017213344573975, + "logps/chosen": -183.55357360839844, + "logps/rejected": -244.83389282226562, + "loss": 0.5295, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.266675353050232, + "rewards/margins": 0.6295980215072632, + "rewards/rejected": -1.8962733745574951, + "step": 6910 + }, + { + "epoch": 1.1922811853893867, + "grad_norm": 20.14065170288086, + "learning_rate": 1.5078502273645164e-07, + "logits/chosen": -1.9001047611236572, + "logits/rejected": -1.8588473796844482, + "logps/chosen": -191.18313598632812, + "logps/rejected": -231.6808624267578, + "loss": 0.6016, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3507914543151855, + "rewards/margins": 0.4283156991004944, + "rewards/rejected": -1.779106855392456, + "step": 6920 + }, + { + "epoch": 1.1940041350792556, + "grad_norm": 22.96848487854004, + "learning_rate": 1.5061221431305632e-07, + "logits/chosen": -1.7364161014556885, + "logits/rejected": -1.6910498142242432, + "logps/chosen": -178.3186492919922, + "logps/rejected": -237.7119140625, + "loss": 0.5287, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.206847906112671, + "rewards/margins": 0.6277521252632141, + "rewards/rejected": -1.8345998525619507, + "step": 6930 + }, + { + "epoch": 1.1957270847691248, + "grad_norm": 25.747020721435547, + "learning_rate": 1.5043920245874937e-07, + "logits/chosen": -1.7463403940200806, + "logits/rejected": -1.6873855590820312, + "logps/chosen": -172.3845672607422, + "logps/rejected": -225.8962860107422, + "loss": 0.5383, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1442162990570068, + "rewards/margins": 0.6088359951972961, + "rewards/rejected": -1.7530523538589478, + "step": 6940 + }, + { + "epoch": 1.1974500344589938, + "grad_norm": 19.827890396118164, + "learning_rate": 1.5026598786893522e-07, + "logits/chosen": -1.7630767822265625, + "logits/rejected": -1.7269541025161743, + "logps/chosen": -183.65208435058594, + "logps/rejected": -248.32949829101562, + "loss": 0.5261, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2979530096054077, + "rewards/margins": 0.655959963798523, + "rewards/rejected": -1.9539129734039307, + "step": 6950 + }, + { + "epoch": 1.1991729841488628, + "grad_norm": 20.81599998474121, + "learning_rate": 1.5009257123983322e-07, + "logits/chosen": -1.9348329305648804, + "logits/rejected": -1.8880043029785156, + "logps/chosen": -195.40646362304688, + "logps/rejected": -229.8538055419922, + "loss": 0.6051, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4008803367614746, + "rewards/margins": 0.41985026001930237, + "rewards/rejected": -1.8207308053970337, + "step": 6960 + }, + { + "epoch": 1.200895933838732, + "grad_norm": 22.377382278442383, + "learning_rate": 1.499189532684747e-07, + "logits/chosen": -1.8455537557601929, + "logits/rejected": -1.8025840520858765, + "logps/chosen": -182.77159118652344, + "logps/rejected": -233.11825561523438, + "loss": 0.557, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2822978496551514, + "rewards/margins": 0.5449867248535156, + "rewards/rejected": -1.827284574508667, + "step": 6970 + }, + { + "epoch": 1.202618883528601, + "grad_norm": 23.947980880737305, + "learning_rate": 1.4974513465270049e-07, + "logits/chosen": -1.773627519607544, + "logits/rejected": -1.7299044132232666, + "logps/chosen": -184.4539337158203, + "logps/rejected": -245.153076171875, + "loss": 0.5341, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3022854328155518, + "rewards/margins": 0.6191726326942444, + "rewards/rejected": -1.9214584827423096, + "step": 6980 + }, + { + "epoch": 1.20434183321847, + "grad_norm": 24.419055938720703, + "learning_rate": 1.4957111609115761e-07, + "logits/chosen": -1.7043527364730835, + "logits/rejected": -1.6770222187042236, + "logps/chosen": -191.4741973876953, + "logps/rejected": -235.8734588623047, + "loss": 0.6066, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3449076414108276, + "rewards/margins": 0.4712006449699402, + "rewards/rejected": -1.8161083459854126, + "step": 6990 + }, + { + "epoch": 1.206064782908339, + "grad_norm": 19.808265686035156, + "learning_rate": 1.4939689828329694e-07, + "logits/chosen": -1.980425238609314, + "logits/rejected": -1.9331204891204834, + "logps/chosen": -198.2283172607422, + "logps/rejected": -255.0526123046875, + "loss": 0.5316, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4193394184112549, + "rewards/margins": 0.5842828154563904, + "rewards/rejected": -2.003622531890869, + "step": 7000 + }, + { + "epoch": 1.207787732598208, + "grad_norm": 21.359458923339844, + "learning_rate": 1.492224819293701e-07, + "logits/chosen": -1.826668381690979, + "logits/rejected": -1.7854959964752197, + "logps/chosen": -186.2130889892578, + "logps/rejected": -239.1843719482422, + "loss": 0.5612, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2978785037994385, + "rewards/margins": 0.5653413534164429, + "rewards/rejected": -1.8632196187973022, + "step": 7010 + }, + { + "epoch": 1.2095106822880772, + "grad_norm": 26.118690490722656, + "learning_rate": 1.490478677304268e-07, + "logits/chosen": -1.7860314846038818, + "logits/rejected": -1.748482346534729, + "logps/chosen": -181.08990478515625, + "logps/rejected": -227.72579956054688, + "loss": 0.5799, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.297550916671753, + "rewards/margins": 0.45612120628356934, + "rewards/rejected": -1.7536720037460327, + "step": 7020 + }, + { + "epoch": 1.2112336319779462, + "grad_norm": 41.83209991455078, + "learning_rate": 1.4887305638831207e-07, + "logits/chosen": -1.8422186374664307, + "logits/rejected": -1.808428168296814, + "logps/chosen": -184.91195678710938, + "logps/rejected": -236.88973999023438, + "loss": 0.5843, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.318117618560791, + "rewards/margins": 0.49964022636413574, + "rewards/rejected": -1.8177579641342163, + "step": 7030 + }, + { + "epoch": 1.2129565816678154, + "grad_norm": 36.24333572387695, + "learning_rate": 1.486980486056631e-07, + "logits/chosen": -1.759803056716919, + "logits/rejected": -1.7243820428848267, + "logps/chosen": -194.1247100830078, + "logps/rejected": -251.89547729492188, + "loss": 0.5415, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.380479335784912, + "rewards/margins": 0.5735012292861938, + "rewards/rejected": -1.9539806842803955, + "step": 7040 + }, + { + "epoch": 1.2146795313576844, + "grad_norm": 26.811819076538086, + "learning_rate": 1.4852284508590686e-07, + "logits/chosen": -1.7498468160629272, + "logits/rejected": -1.719957709312439, + "logps/chosen": -180.6525115966797, + "logps/rejected": -231.7786102294922, + "loss": 0.5777, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2919676303863525, + "rewards/margins": 0.4920363426208496, + "rewards/rejected": -1.7840036153793335, + "step": 7050 + }, + { + "epoch": 1.2164024810475533, + "grad_norm": 24.70426368713379, + "learning_rate": 1.483474465332569e-07, + "logits/chosen": -1.882567048072815, + "logits/rejected": -1.8586457967758179, + "logps/chosen": -184.4115447998047, + "logps/rejected": -223.13162231445312, + "loss": 0.6135, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3120129108428955, + "rewards/margins": 0.3901762366294861, + "rewards/rejected": -1.7021892070770264, + "step": 7060 + }, + { + "epoch": 1.2181254307374225, + "grad_norm": 38.69207000732422, + "learning_rate": 1.4817185365271092e-07, + "logits/chosen": -1.841565489768982, + "logits/rejected": -1.804574966430664, + "logps/chosen": -179.31002807617188, + "logps/rejected": -210.82839965820312, + "loss": 0.6612, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.247070074081421, + "rewards/margins": 0.32873469591140747, + "rewards/rejected": -1.5758049488067627, + "step": 7070 + }, + { + "epoch": 1.2198483804272915, + "grad_norm": 16.222440719604492, + "learning_rate": 1.4799606715004744e-07, + "logits/chosen": -1.9310624599456787, + "logits/rejected": -1.8868637084960938, + "logps/chosen": -163.29208374023438, + "logps/rejected": -211.12222290039062, + "loss": 0.554, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0982394218444824, + "rewards/margins": 0.47333258390426636, + "rewards/rejected": -1.5715720653533936, + "step": 7080 + }, + { + "epoch": 1.2215713301171607, + "grad_norm": 17.030590057373047, + "learning_rate": 1.4782008773182342e-07, + "logits/chosen": -1.9744294881820679, + "logits/rejected": -1.9437084197998047, + "logps/chosen": -166.2434539794922, + "logps/rejected": -219.3644561767578, + "loss": 0.5454, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1272560358047485, + "rewards/margins": 0.5442889332771301, + "rewards/rejected": -1.6715450286865234, + "step": 7090 + }, + { + "epoch": 1.2232942798070296, + "grad_norm": 19.815698623657227, + "learning_rate": 1.476439161053711e-07, + "logits/chosen": -1.8406782150268555, + "logits/rejected": -1.80866277217865, + "logps/chosen": -167.04959106445312, + "logps/rejected": -208.95480346679688, + "loss": 0.5922, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0997631549835205, + "rewards/margins": 0.4213009476661682, + "rewards/rejected": -1.521064043045044, + "step": 7100 + }, + { + "epoch": 1.2250172294968986, + "grad_norm": 16.055950164794922, + "learning_rate": 1.4746755297879535e-07, + "logits/chosen": -1.8774404525756836, + "logits/rejected": -1.834254264831543, + "logps/chosen": -158.6421356201172, + "logps/rejected": -191.69210815429688, + "loss": 0.6052, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0516932010650635, + "rewards/margins": 0.355400413274765, + "rewards/rejected": -1.4070935249328613, + "step": 7110 + }, + { + "epoch": 1.2267401791867678, + "grad_norm": 29.036243438720703, + "learning_rate": 1.4729099906097074e-07, + "logits/chosen": -1.8762614727020264, + "logits/rejected": -1.8309389352798462, + "logps/chosen": -168.1267852783203, + "logps/rejected": -191.68255615234375, + "loss": 0.6143, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0614054203033447, + "rewards/margins": 0.32982301712036133, + "rewards/rejected": -1.391228437423706, + "step": 7120 + }, + { + "epoch": 1.2284631288766368, + "grad_norm": 29.026830673217773, + "learning_rate": 1.4711425506153872e-07, + "logits/chosen": -1.775841474533081, + "logits/rejected": -1.7425835132598877, + "logps/chosen": -155.8795928955078, + "logps/rejected": -197.4680938720703, + "loss": 0.5775, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9953833818435669, + "rewards/margins": 0.44751301407814026, + "rewards/rejected": -1.4428964853286743, + "step": 7130 + }, + { + "epoch": 1.230186078566506, + "grad_norm": 18.57213592529297, + "learning_rate": 1.4693732169090472e-07, + "logits/chosen": -1.8745845556259155, + "logits/rejected": -1.8567358255386353, + "logps/chosen": -151.21255493164062, + "logps/rejected": -193.65370178222656, + "loss": 0.5888, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9989484548568726, + "rewards/margins": 0.41814273595809937, + "rewards/rejected": -1.4170913696289062, + "step": 7140 + }, + { + "epoch": 1.231909028256375, + "grad_norm": 19.837411880493164, + "learning_rate": 1.4676019966023537e-07, + "logits/chosen": -1.9492120742797852, + "logits/rejected": -1.9174597263336182, + "logps/chosen": -176.21762084960938, + "logps/rejected": -215.0535430908203, + "loss": 0.6, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.198301076889038, + "rewards/margins": 0.4013351500034332, + "rewards/rejected": -1.599636435508728, + "step": 7150 + }, + { + "epoch": 1.233631977946244, + "grad_norm": 29.906728744506836, + "learning_rate": 1.4658288968145556e-07, + "logits/chosen": -1.858672857284546, + "logits/rejected": -1.8087135553359985, + "logps/chosen": -156.7094268798828, + "logps/rejected": -208.208740234375, + "loss": 0.553, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0304765701293945, + "rewards/margins": 0.5108339190483093, + "rewards/rejected": -1.5413105487823486, + "step": 7160 + }, + { + "epoch": 1.235354927636113, + "grad_norm": 23.859086990356445, + "learning_rate": 1.4640539246724565e-07, + "logits/chosen": -1.8171465396881104, + "logits/rejected": -1.7792974710464478, + "logps/chosen": -167.0326385498047, + "logps/rejected": -208.37393188476562, + "loss": 0.5998, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1146633625030518, + "rewards/margins": 0.43098729848861694, + "rewards/rejected": -1.545650601387024, + "step": 7170 + }, + { + "epoch": 1.237077877325982, + "grad_norm": 19.75602912902832, + "learning_rate": 1.4622770873103857e-07, + "logits/chosen": -1.9519678354263306, + "logits/rejected": -1.9202091693878174, + "logps/chosen": -157.49551391601562, + "logps/rejected": -206.5701446533203, + "loss": 0.5476, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.0219330787658691, + "rewards/margins": 0.49904584884643555, + "rewards/rejected": -1.5209788084030151, + "step": 7180 + }, + { + "epoch": 1.2388008270158513, + "grad_norm": 20.737136840820312, + "learning_rate": 1.4604983918701692e-07, + "logits/chosen": -1.777673363685608, + "logits/rejected": -1.73529851436615, + "logps/chosen": -161.501220703125, + "logps/rejected": -214.7451934814453, + "loss": 0.5442, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0883067846298218, + "rewards/margins": 0.5229275226593018, + "rewards/rejected": -1.6112343072891235, + "step": 7190 + }, + { + "epoch": 1.2405237767057202, + "grad_norm": 27.85216522216797, + "learning_rate": 1.4587178455011021e-07, + "logits/chosen": -1.7661443948745728, + "logits/rejected": -1.7274892330169678, + "logps/chosen": -173.51914978027344, + "logps/rejected": -233.92068481445312, + "loss": 0.5473, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2042179107666016, + "rewards/margins": 0.5732482671737671, + "rewards/rejected": -1.7774661779403687, + "step": 7200 + }, + { + "epoch": 1.2405237767057202, + "eval_logits/chosen": -1.9226493835449219, + "eval_logits/rejected": -1.904362678527832, + "eval_logps/chosen": -163.50006103515625, + "eval_logps/rejected": -192.75973510742188, + "eval_loss": 0.635442316532135, + "eval_rewards/accuracies": 0.6261616945266724, + "eval_rewards/chosen": -1.0478816032409668, + "eval_rewards/margins": 0.24791431427001953, + "eval_rewards/rejected": -1.2957961559295654, + "eval_runtime": 384.2418, + "eval_samples_per_second": 11.201, + "eval_steps_per_second": 1.4, + "step": 7200 + }, + { + "epoch": 1.2422467263955892, + "grad_norm": 30.415618896484375, + "learning_rate": 1.4569354553599186e-07, + "logits/chosen": -1.8696123361587524, + "logits/rejected": -1.8457527160644531, + "logps/chosen": -190.6040802001953, + "logps/rejected": -217.7646942138672, + "loss": 0.6321, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3357537984848022, + "rewards/margins": 0.31159737706184387, + "rewards/rejected": -1.6473512649536133, + "step": 7210 + }, + { + "epoch": 1.2439696760854584, + "grad_norm": 27.164396286010742, + "learning_rate": 1.4551512286107642e-07, + "logits/chosen": -1.7603811025619507, + "logits/rejected": -1.7127151489257812, + "logps/chosen": -172.73391723632812, + "logps/rejected": -218.4431915283203, + "loss": 0.583, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1820390224456787, + "rewards/margins": 0.4857490658760071, + "rewards/rejected": -1.6677881479263306, + "step": 7220 + }, + { + "epoch": 1.2456926257753274, + "grad_norm": 19.279682159423828, + "learning_rate": 1.4533651724251654e-07, + "logits/chosen": -1.8027547597885132, + "logits/rejected": -1.768843650817871, + "logps/chosen": -172.1409912109375, + "logps/rejected": -215.8574981689453, + "loss": 0.5679, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1791961193084717, + "rewards/margins": 0.4495778977870941, + "rewards/rejected": -1.6287740468978882, + "step": 7230 + }, + { + "epoch": 1.2474155754651963, + "grad_norm": 19.966888427734375, + "learning_rate": 1.4515772939820036e-07, + "logits/chosen": -1.8357681035995483, + "logits/rejected": -1.8127696514129639, + "logps/chosen": -173.6414337158203, + "logps/rejected": -216.55142211914062, + "loss": 0.5735, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.161946415901184, + "rewards/margins": 0.45585179328918457, + "rewards/rejected": -1.6177982091903687, + "step": 7240 + }, + { + "epoch": 1.2491385251550655, + "grad_norm": 21.268489837646484, + "learning_rate": 1.4497876004674824e-07, + "logits/chosen": -1.883754014968872, + "logits/rejected": -1.8425041437149048, + "logps/chosen": -168.72447204589844, + "logps/rejected": -211.64157104492188, + "loss": 0.5734, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1415681838989258, + "rewards/margins": 0.44630199670791626, + "rewards/rejected": -1.5878701210021973, + "step": 7250 + }, + { + "epoch": 1.2508614748449345, + "grad_norm": 24.966110229492188, + "learning_rate": 1.4479960990751037e-07, + "logits/chosen": -1.870134949684143, + "logits/rejected": -1.8283237218856812, + "logps/chosen": -167.22454833984375, + "logps/rejected": -217.4507598876953, + "loss": 0.5489, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.135387897491455, + "rewards/margins": 0.515394926071167, + "rewards/rejected": -1.650782823562622, + "step": 7260 + }, + { + "epoch": 1.2525844245348035, + "grad_norm": 21.891530990600586, + "learning_rate": 1.4462027970056336e-07, + "logits/chosen": -1.8796777725219727, + "logits/rejected": -1.8357088565826416, + "logps/chosen": -165.61715698242188, + "logps/rejected": -210.3502655029297, + "loss": 0.5759, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1130149364471436, + "rewards/margins": 0.46413689851760864, + "rewards/rejected": -1.5771516561508179, + "step": 7270 + }, + { + "epoch": 1.2543073742246726, + "grad_norm": 21.23567008972168, + "learning_rate": 1.4444077014670767e-07, + "logits/chosen": -1.9080616235733032, + "logits/rejected": -1.859018087387085, + "logps/chosen": -169.698486328125, + "logps/rejected": -222.4978790283203, + "loss": 0.5285, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1312874555587769, + "rewards/margins": 0.5645879507064819, + "rewards/rejected": -1.6958754062652588, + "step": 7280 + }, + { + "epoch": 1.2560303239145416, + "grad_norm": 28.257923126220703, + "learning_rate": 1.4426108196746465e-07, + "logits/chosen": -1.6983267068862915, + "logits/rejected": -1.675920844078064, + "logps/chosen": -183.1871337890625, + "logps/rejected": -221.4242706298828, + "loss": 0.6076, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2839581966400146, + "rewards/margins": 0.37165379524230957, + "rewards/rejected": -1.6556117534637451, + "step": 7290 + }, + { + "epoch": 1.2577532736044108, + "grad_norm": 21.172204971313477, + "learning_rate": 1.4408121588507358e-07, + "logits/chosen": -1.6503427028656006, + "logits/rejected": -1.6238577365875244, + "logps/chosen": -173.98719787597656, + "logps/rejected": -218.13619995117188, + "loss": 0.6133, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2372645139694214, + "rewards/margins": 0.424022912979126, + "rewards/rejected": -1.661287546157837, + "step": 7300 + }, + { + "epoch": 1.2594762232942798, + "grad_norm": 19.044803619384766, + "learning_rate": 1.4390117262248886e-07, + "logits/chosen": -1.8345047235488892, + "logits/rejected": -1.7920280694961548, + "logps/chosen": -179.25234985351562, + "logps/rejected": -234.50521850585938, + "loss": 0.5557, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2419331073760986, + "rewards/margins": 0.5842617750167847, + "rewards/rejected": -1.8261950016021729, + "step": 7310 + }, + { + "epoch": 1.2611991729841487, + "grad_norm": 18.871639251708984, + "learning_rate": 1.4372095290337697e-07, + "logits/chosen": -1.876389503479004, + "logits/rejected": -1.8411738872528076, + "logps/chosen": -164.28530883789062, + "logps/rejected": -204.193359375, + "loss": 0.5911, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0993173122406006, + "rewards/margins": 0.4310569167137146, + "rewards/rejected": -1.53037428855896, + "step": 7320 + }, + { + "epoch": 1.262922122674018, + "grad_norm": 29.29106330871582, + "learning_rate": 1.4354055745211372e-07, + "logits/chosen": -1.7525503635406494, + "logits/rejected": -1.7058188915252686, + "logps/chosen": -164.0477752685547, + "logps/rejected": -222.71859741210938, + "loss": 0.5294, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1098519563674927, + "rewards/margins": 0.6016186475753784, + "rewards/rejected": -1.711470365524292, + "step": 7330 + }, + { + "epoch": 1.264645072363887, + "grad_norm": 19.067596435546875, + "learning_rate": 1.4335998699378123e-07, + "logits/chosen": -1.8152233362197876, + "logits/rejected": -1.7801440954208374, + "logps/chosen": -185.6068572998047, + "logps/rejected": -224.10043334960938, + "loss": 0.6139, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.302257776260376, + "rewards/margins": 0.3995913863182068, + "rewards/rejected": -1.7018489837646484, + "step": 7340 + }, + { + "epoch": 1.266368022053756, + "grad_norm": 21.694255828857422, + "learning_rate": 1.4317924225416493e-07, + "logits/chosen": -1.901818871498108, + "logits/rejected": -1.8575118780136108, + "logps/chosen": -161.57791137695312, + "logps/rejected": -211.4538116455078, + "loss": 0.5496, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0465854406356812, + "rewards/margins": 0.550262451171875, + "rewards/rejected": -1.5968478918075562, + "step": 7350 + }, + { + "epoch": 1.268090971743625, + "grad_norm": 26.77164077758789, + "learning_rate": 1.42998323959751e-07, + "logits/chosen": -1.795769453048706, + "logits/rejected": -1.7647924423217773, + "logps/chosen": -173.7707977294922, + "logps/rejected": -217.08798217773438, + "loss": 0.5753, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1947288513183594, + "rewards/margins": 0.4542997479438782, + "rewards/rejected": -1.6490285396575928, + "step": 7360 + }, + { + "epoch": 1.269813921433494, + "grad_norm": 26.229225158691406, + "learning_rate": 1.4281723283772297e-07, + "logits/chosen": -1.7144826650619507, + "logits/rejected": -1.681382417678833, + "logps/chosen": -166.12722778320312, + "logps/rejected": -211.6783447265625, + "loss": 0.5726, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.133207082748413, + "rewards/margins": 0.4711657464504242, + "rewards/rejected": -1.6043727397918701, + "step": 7370 + }, + { + "epoch": 1.2715368711233632, + "grad_norm": 25.314279556274414, + "learning_rate": 1.4263596961595913e-07, + "logits/chosen": -1.7785885334014893, + "logits/rejected": -1.7429101467132568, + "logps/chosen": -173.8869171142578, + "logps/rejected": -227.0846710205078, + "loss": 0.5579, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2071913480758667, + "rewards/margins": 0.5267850160598755, + "rewards/rejected": -1.7339763641357422, + "step": 7380 + }, + { + "epoch": 1.2732598208132322, + "grad_norm": 38.555118560791016, + "learning_rate": 1.424545350230296e-07, + "logits/chosen": -1.7544944286346436, + "logits/rejected": -1.7176777124404907, + "logps/chosen": -171.19810485839844, + "logps/rejected": -223.80068969726562, + "loss": 0.559, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.189843773841858, + "rewards/margins": 0.5179719924926758, + "rewards/rejected": -1.7078157663345337, + "step": 7390 + }, + { + "epoch": 1.2749827705031014, + "grad_norm": 20.990966796875, + "learning_rate": 1.422729297881931e-07, + "logits/chosen": -1.7361913919448853, + "logits/rejected": -1.678663969039917, + "logps/chosen": -187.804443359375, + "logps/rejected": -238.89242553710938, + "loss": 0.5431, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3235337734222412, + "rewards/margins": 0.5596829056739807, + "rewards/rejected": -1.8832166194915771, + "step": 7400 + }, + { + "epoch": 1.2767057201929704, + "grad_norm": 32.962459564208984, + "learning_rate": 1.4209115464139445e-07, + "logits/chosen": -1.7074429988861084, + "logits/rejected": -1.659502625465393, + "logps/chosen": -181.55169677734375, + "logps/rejected": -234.9263916015625, + "loss": 0.5702, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2779533863067627, + "rewards/margins": 0.5410931706428528, + "rewards/rejected": -1.8190467357635498, + "step": 7410 + }, + { + "epoch": 1.2784286698828393, + "grad_norm": 25.36784553527832, + "learning_rate": 1.419092103132612e-07, + "logits/chosen": -1.6498420238494873, + "logits/rejected": -1.617962121963501, + "logps/chosen": -178.5230712890625, + "logps/rejected": -221.35165405273438, + "loss": 0.5868, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2235511541366577, + "rewards/margins": 0.45928892493247986, + "rewards/rejected": -1.68284010887146, + "step": 7420 + }, + { + "epoch": 1.2801516195727085, + "grad_norm": 25.063386917114258, + "learning_rate": 1.4172709753510117e-07, + "logits/chosen": -1.6935100555419922, + "logits/rejected": -1.6461021900177002, + "logps/chosen": -192.5091094970703, + "logps/rejected": -249.29428100585938, + "loss": 0.5618, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3910486698150635, + "rewards/margins": 0.5831217765808105, + "rewards/rejected": -1.9741703271865845, + "step": 7430 + }, + { + "epoch": 1.2818745692625775, + "grad_norm": 24.478483200073242, + "learning_rate": 1.41544817038899e-07, + "logits/chosen": -1.7873668670654297, + "logits/rejected": -1.7440226078033447, + "logps/chosen": -201.0021209716797, + "logps/rejected": -244.41189575195312, + "loss": 0.6088, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4499728679656982, + "rewards/margins": 0.46050310134887695, + "rewards/rejected": -1.9104760885238647, + "step": 7440 + }, + { + "epoch": 1.2835975189524467, + "grad_norm": 24.15538215637207, + "learning_rate": 1.4136236955731354e-07, + "logits/chosen": -1.9057048559188843, + "logits/rejected": -1.8608615398406982, + "logps/chosen": -178.9813232421875, + "logps/rejected": -223.97671508789062, + "loss": 0.5758, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2305552959442139, + "rewards/margins": 0.4831448197364807, + "rewards/rejected": -1.7137000560760498, + "step": 7450 + }, + { + "epoch": 1.2853204686423156, + "grad_norm": 19.96843910217285, + "learning_rate": 1.4117975582367488e-07, + "logits/chosen": -1.7194932699203491, + "logits/rejected": -1.701974630355835, + "logps/chosen": -170.04483032226562, + "logps/rejected": -225.9224853515625, + "loss": 0.5591, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1988255977630615, + "rewards/margins": 0.5305085182189941, + "rewards/rejected": -1.7293342351913452, + "step": 7460 + }, + { + "epoch": 1.2870434183321846, + "grad_norm": 26.922449111938477, + "learning_rate": 1.4099697657198128e-07, + "logits/chosen": -1.8264611959457397, + "logits/rejected": -1.8030723333358765, + "logps/chosen": -172.75778198242188, + "logps/rejected": -204.11093139648438, + "loss": 0.6341, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1810314655303955, + "rewards/margins": 0.34694355726242065, + "rewards/rejected": -1.5279749631881714, + "step": 7470 + }, + { + "epoch": 1.2887663680220538, + "grad_norm": 24.537500381469727, + "learning_rate": 1.4081403253689638e-07, + "logits/chosen": -1.7876904010772705, + "logits/rejected": -1.7532583475112915, + "logps/chosen": -161.22964477539062, + "logps/rejected": -189.12184143066406, + "loss": 0.6182, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0659732818603516, + "rewards/margins": 0.3332677483558655, + "rewards/rejected": -1.3992409706115723, + "step": 7480 + }, + { + "epoch": 1.2904893177119228, + "grad_norm": 17.613664627075195, + "learning_rate": 1.4063092445374591e-07, + "logits/chosen": -1.7610737085342407, + "logits/rejected": -1.7352359294891357, + "logps/chosen": -169.47122192382812, + "logps/rejected": -211.7993927001953, + "loss": 0.5807, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.16910719871521, + "rewards/margins": 0.4174097180366516, + "rewards/rejected": -1.5865167379379272, + "step": 7490 + }, + { + "epoch": 1.292212267401792, + "grad_norm": 18.988906860351562, + "learning_rate": 1.404476530585153e-07, + "logits/chosen": -1.7912099361419678, + "logits/rejected": -1.7529321908950806, + "logps/chosen": -160.61988830566406, + "logps/rejected": -204.53854370117188, + "loss": 0.5962, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0914565324783325, + "rewards/margins": 0.43414705991744995, + "rewards/rejected": -1.5256035327911377, + "step": 7500 + }, + { + "epoch": 1.293935217091661, + "grad_norm": 22.593177795410156, + "learning_rate": 1.402642190878462e-07, + "logits/chosen": -1.8172283172607422, + "logits/rejected": -1.7795774936676025, + "logps/chosen": -170.80709838867188, + "logps/rejected": -204.64862060546875, + "loss": 0.5827, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1354600191116333, + "rewards/margins": 0.3988160192966461, + "rewards/rejected": -1.534276008605957, + "step": 7510 + }, + { + "epoch": 1.29565816678153, + "grad_norm": 25.542572021484375, + "learning_rate": 1.4008062327903373e-07, + "logits/chosen": -1.785715103149414, + "logits/rejected": -1.754105806350708, + "logps/chosen": -163.73199462890625, + "logps/rejected": -216.9349822998047, + "loss": 0.5409, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1080421209335327, + "rewards/margins": 0.5227325558662415, + "rewards/rejected": -1.630774736404419, + "step": 7520 + }, + { + "epoch": 1.297381116471399, + "grad_norm": 19.270915985107422, + "learning_rate": 1.398968663700235e-07, + "logits/chosen": -1.7185955047607422, + "logits/rejected": -1.6924537420272827, + "logps/chosen": -163.96920776367188, + "logps/rejected": -218.0501251220703, + "loss": 0.5497, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1175190210342407, + "rewards/margins": 0.528626561164856, + "rewards/rejected": -1.646145224571228, + "step": 7530 + }, + { + "epoch": 1.299104066161268, + "grad_norm": 21.197391510009766, + "learning_rate": 1.3971294909940872e-07, + "logits/chosen": -1.8468563556671143, + "logits/rejected": -1.8190243244171143, + "logps/chosen": -165.5609588623047, + "logps/rejected": -226.9778289794922, + "loss": 0.5295, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1402043104171753, + "rewards/margins": 0.6028168797492981, + "rewards/rejected": -1.743021011352539, + "step": 7540 + }, + { + "epoch": 1.3008270158511372, + "grad_norm": 22.384174346923828, + "learning_rate": 1.395288722064271e-07, + "logits/chosen": -1.7478249073028564, + "logits/rejected": -1.715561866760254, + "logps/chosen": -174.3957061767578, + "logps/rejected": -232.8987579345703, + "loss": 0.5479, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2260493040084839, + "rewards/margins": 0.5848062634468079, + "rewards/rejected": -1.810855507850647, + "step": 7550 + }, + { + "epoch": 1.3025499655410062, + "grad_norm": 24.416648864746094, + "learning_rate": 1.39344636430958e-07, + "logits/chosen": -1.85236394405365, + "logits/rejected": -1.8107448816299438, + "logps/chosen": -167.5206298828125, + "logps/rejected": -227.94570922851562, + "loss": 0.5487, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1494567394256592, + "rewards/margins": 0.5763713717460632, + "rewards/rejected": -1.7258281707763672, + "step": 7560 + }, + { + "epoch": 1.3042729152308752, + "grad_norm": 23.83544921875, + "learning_rate": 1.3916024251351922e-07, + "logits/chosen": -1.8106807470321655, + "logits/rejected": -1.772073745727539, + "logps/chosen": -189.89389038085938, + "logps/rejected": -243.23342895507812, + "loss": 0.5413, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3309684991836548, + "rewards/margins": 0.5788313150405884, + "rewards/rejected": -1.9097998142242432, + "step": 7570 + }, + { + "epoch": 1.3059958649207444, + "grad_norm": 27.942703247070312, + "learning_rate": 1.3897569119526442e-07, + "logits/chosen": -1.755277395248413, + "logits/rejected": -1.7244822978973389, + "logps/chosen": -180.81710815429688, + "logps/rejected": -233.8452606201172, + "loss": 0.5506, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2480337619781494, + "rewards/margins": 0.5438328981399536, + "rewards/rejected": -1.791866660118103, + "step": 7580 + }, + { + "epoch": 1.3077188146106133, + "grad_norm": 19.884885787963867, + "learning_rate": 1.387909832179798e-07, + "logits/chosen": -1.7783870697021484, + "logits/rejected": -1.7140800952911377, + "logps/chosen": -197.75082397460938, + "logps/rejected": -261.7915954589844, + "loss": 0.5158, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.3942798376083374, + "rewards/margins": 0.7095158696174622, + "rewards/rejected": -2.1037955284118652, + "step": 7590 + }, + { + "epoch": 1.3094417643004825, + "grad_norm": 37.443603515625, + "learning_rate": 1.3860611932408118e-07, + "logits/chosen": -1.7441831827163696, + "logits/rejected": -1.708203673362732, + "logps/chosen": -204.8780975341797, + "logps/rejected": -242.55419921875, + "loss": 0.6231, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5234088897705078, + "rewards/margins": 0.4161691665649414, + "rewards/rejected": -1.9395780563354492, + "step": 7600 + }, + { + "epoch": 1.3094417643004825, + "eval_logits/chosen": -1.8558425903320312, + "eval_logits/rejected": -1.8355497121810913, + "eval_logps/chosen": -180.55352783203125, + "eval_logps/rejected": -212.970458984375, + "eval_loss": 0.6345874071121216, + "eval_rewards/accuracies": 0.6289498209953308, + "eval_rewards/chosen": -1.2184159755706787, + "eval_rewards/margins": 0.27948716282844543, + "eval_rewards/rejected": -1.4979033470153809, + "eval_runtime": 383.3126, + "eval_samples_per_second": 11.228, + "eval_steps_per_second": 1.404, + "step": 7600 + }, + { + "epoch": 1.3111647139903515, + "grad_norm": 19.79640769958496, + "learning_rate": 1.3842110025661126e-07, + "logits/chosen": -1.7169148921966553, + "logits/rejected": -1.6761764287948608, + "logps/chosen": -183.58102416992188, + "logps/rejected": -238.1451873779297, + "loss": 0.5547, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.306696891784668, + "rewards/margins": 0.5725280046463013, + "rewards/rejected": -1.8792247772216797, + "step": 7610 + }, + { + "epoch": 1.3128876636802205, + "grad_norm": 24.631587982177734, + "learning_rate": 1.3823592675923625e-07, + "logits/chosen": -1.7882808446884155, + "logits/rejected": -1.7510807514190674, + "logps/chosen": -181.6546173095703, + "logps/rejected": -224.89871215820312, + "loss": 0.5813, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2414124011993408, + "rewards/margins": 0.4696447253227234, + "rewards/rejected": -1.7110570669174194, + "step": 7620 + }, + { + "epoch": 1.3146106133700897, + "grad_norm": 19.975292205810547, + "learning_rate": 1.3805059957624318e-07, + "logits/chosen": -1.7191193103790283, + "logits/rejected": -1.6984857320785522, + "logps/chosen": -168.42410278320312, + "logps/rejected": -231.50979614257812, + "loss": 0.529, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1859110593795776, + "rewards/margins": 0.5888235569000244, + "rewards/rejected": -1.7747347354888916, + "step": 7630 + }, + { + "epoch": 1.3163335630599586, + "grad_norm": 32.9302864074707, + "learning_rate": 1.3786511945253675e-07, + "logits/chosen": -1.6463232040405273, + "logits/rejected": -1.6050297021865845, + "logps/chosen": -203.4332275390625, + "logps/rejected": -259.23309326171875, + "loss": 0.5606, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4551560878753662, + "rewards/margins": 0.6130838990211487, + "rewards/rejected": -2.068239688873291, + "step": 7640 + }, + { + "epoch": 1.3180565127498278, + "grad_norm": 25.096200942993164, + "learning_rate": 1.3767948713363646e-07, + "logits/chosen": -1.7451419830322266, + "logits/rejected": -1.7046140432357788, + "logps/chosen": -219.556884765625, + "logps/rejected": -272.8857421875, + "loss": 0.5864, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6116056442260742, + "rewards/margins": 0.5594407320022583, + "rewards/rejected": -2.171046257019043, + "step": 7650 + }, + { + "epoch": 1.3197794624396968, + "grad_norm": 26.969646453857422, + "learning_rate": 1.374937033656735e-07, + "logits/chosen": -1.7444837093353271, + "logits/rejected": -1.693585753440857, + "logps/chosen": -201.104736328125, + "logps/rejected": -262.66473388671875, + "loss": 0.5437, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4407439231872559, + "rewards/margins": 0.6381824016571045, + "rewards/rejected": -2.0789265632629395, + "step": 7660 + }, + { + "epoch": 1.3215024121295658, + "grad_norm": 25.12961196899414, + "learning_rate": 1.3730776889538776e-07, + "logits/chosen": -1.7061430215835571, + "logits/rejected": -1.6663553714752197, + "logps/chosen": -192.5684814453125, + "logps/rejected": -237.89523315429688, + "loss": 0.5899, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3502416610717773, + "rewards/margins": 0.49933284521102905, + "rewards/rejected": -1.8495744466781616, + "step": 7670 + }, + { + "epoch": 1.323225361819435, + "grad_norm": 18.33002471923828, + "learning_rate": 1.3712168447012493e-07, + "logits/chosen": -1.7825815677642822, + "logits/rejected": -1.7495508193969727, + "logps/chosen": -182.09512329101562, + "logps/rejected": -229.66903686523438, + "loss": 0.5568, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3145878314971924, + "rewards/margins": 0.48502451181411743, + "rewards/rejected": -1.799612283706665, + "step": 7680 + }, + { + "epoch": 1.324948311509304, + "grad_norm": 20.360734939575195, + "learning_rate": 1.369354508378334e-07, + "logits/chosen": -1.8760316371917725, + "logits/rejected": -1.8178390264511108, + "logps/chosen": -170.9095001220703, + "logps/rejected": -221.64004516601562, + "loss": 0.5396, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1592501401901245, + "rewards/margins": 0.5665987133979797, + "rewards/rejected": -1.725848913192749, + "step": 7690 + }, + { + "epoch": 1.3266712611991731, + "grad_norm": 16.414636611938477, + "learning_rate": 1.3674906874706129e-07, + "logits/chosen": -1.7452309131622314, + "logits/rejected": -1.6976512670516968, + "logps/chosen": -181.38619995117188, + "logps/rejected": -235.8791046142578, + "loss": 0.5379, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2767575979232788, + "rewards/margins": 0.5450109839439392, + "rewards/rejected": -1.8217687606811523, + "step": 7700 + }, + { + "epoch": 1.328394210889042, + "grad_norm": 22.02670669555664, + "learning_rate": 1.365625389469534e-07, + "logits/chosen": -1.7657630443572998, + "logits/rejected": -1.7363771200180054, + "logps/chosen": -197.82945251464844, + "logps/rejected": -243.3042755126953, + "loss": 0.5976, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.462568998336792, + "rewards/margins": 0.4612101912498474, + "rewards/rejected": -1.9237792491912842, + "step": 7710 + }, + { + "epoch": 1.330117160578911, + "grad_norm": 30.34868049621582, + "learning_rate": 1.363758621872483e-07, + "logits/chosen": -1.7491880655288696, + "logits/rejected": -1.7047643661499023, + "logps/chosen": -195.3718719482422, + "logps/rejected": -243.65029907226562, + "loss": 0.5547, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3922932147979736, + "rewards/margins": 0.5360931158065796, + "rewards/rejected": -1.9283863306045532, + "step": 7720 + }, + { + "epoch": 1.33184011026878, + "grad_norm": 22.87053108215332, + "learning_rate": 1.361890392182752e-07, + "logits/chosen": -1.7148549556732178, + "logits/rejected": -1.6829760074615479, + "logps/chosen": -180.99148559570312, + "logps/rejected": -230.822509765625, + "loss": 0.5567, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.270691156387329, + "rewards/margins": 0.5148378610610962, + "rewards/rejected": -1.7855291366577148, + "step": 7730 + }, + { + "epoch": 1.3335630599586492, + "grad_norm": 22.639476776123047, + "learning_rate": 1.3600207079095097e-07, + "logits/chosen": -1.715233564376831, + "logits/rejected": -1.6812883615493774, + "logps/chosen": -183.4813232421875, + "logps/rejected": -249.8083953857422, + "loss": 0.5269, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3177640438079834, + "rewards/margins": 0.6425153613090515, + "rewards/rejected": -1.9602794647216797, + "step": 7740 + }, + { + "epoch": 1.3352860096485184, + "grad_norm": 21.633207321166992, + "learning_rate": 1.3581495765677718e-07, + "logits/chosen": -1.724340796470642, + "logits/rejected": -1.675937294960022, + "logps/chosen": -197.2760009765625, + "logps/rejected": -261.220947265625, + "loss": 0.5474, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.433332085609436, + "rewards/margins": 0.6569215655326843, + "rewards/rejected": -2.0902533531188965, + "step": 7750 + }, + { + "epoch": 1.3370089593383874, + "grad_norm": 28.828248977661133, + "learning_rate": 1.3562770056783702e-07, + "logits/chosen": -1.622764229774475, + "logits/rejected": -1.5804797410964966, + "logps/chosen": -179.65103149414062, + "logps/rejected": -246.61685180664062, + "loss": 0.5334, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.291443109512329, + "rewards/margins": 0.6578264236450195, + "rewards/rejected": -1.9492695331573486, + "step": 7760 + }, + { + "epoch": 1.3387319090282563, + "grad_norm": 22.768138885498047, + "learning_rate": 1.3544030027679232e-07, + "logits/chosen": -1.6900676488876343, + "logits/rejected": -1.646816611289978, + "logps/chosen": -185.71786499023438, + "logps/rejected": -239.11392211914062, + "loss": 0.5586, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3360167741775513, + "rewards/margins": 0.5299959182739258, + "rewards/rejected": -1.8660128116607666, + "step": 7770 + }, + { + "epoch": 1.3404548587181253, + "grad_norm": 27.77067756652832, + "learning_rate": 1.3525275753688042e-07, + "logits/chosen": -1.8046811819076538, + "logits/rejected": -1.779531717300415, + "logps/chosen": -201.95101928710938, + "logps/rejected": -256.29058837890625, + "loss": 0.5928, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5041205883026123, + "rewards/margins": 0.5199225544929504, + "rewards/rejected": -2.024043321609497, + "step": 7780 + }, + { + "epoch": 1.3421778084079945, + "grad_norm": 27.013347625732422, + "learning_rate": 1.350650731019113e-07, + "logits/chosen": -1.7433545589447021, + "logits/rejected": -1.7085548639297485, + "logps/chosen": -195.16302490234375, + "logps/rejected": -261.06683349609375, + "loss": 0.5275, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4359261989593506, + "rewards/margins": 0.6557725071907043, + "rewards/rejected": -2.0916988849639893, + "step": 7790 + }, + { + "epoch": 1.3439007580978635, + "grad_norm": 26.0377140045166, + "learning_rate": 1.3487724772626439e-07, + "logits/chosen": -1.7717100381851196, + "logits/rejected": -1.7416073083877563, + "logps/chosen": -196.32034301757812, + "logps/rejected": -263.3025817871094, + "loss": 0.5472, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4233986139297485, + "rewards/margins": 0.6623138785362244, + "rewards/rejected": -2.0857126712799072, + "step": 7800 + }, + { + "epoch": 1.3456237077877327, + "grad_norm": 42.655330657958984, + "learning_rate": 1.346892821648857e-07, + "logits/chosen": -1.7774578332901, + "logits/rejected": -1.7309677600860596, + "logps/chosen": -198.44158935546875, + "logps/rejected": -244.404296875, + "loss": 0.5825, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.394239902496338, + "rewards/margins": 0.5022423267364502, + "rewards/rejected": -1.8964821100234985, + "step": 7810 + }, + { + "epoch": 1.3473466574776016, + "grad_norm": 26.43970489501953, + "learning_rate": 1.3450117717328468e-07, + "logits/chosen": -1.764257788658142, + "logits/rejected": -1.7166850566864014, + "logps/chosen": -190.58175659179688, + "logps/rejected": -257.0692443847656, + "loss": 0.5496, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.342162847518921, + "rewards/margins": 0.6754031777381897, + "rewards/rejected": -2.017565965652466, + "step": 7820 + }, + { + "epoch": 1.3490696071674706, + "grad_norm": 29.67559051513672, + "learning_rate": 1.3431293350753115e-07, + "logits/chosen": -1.704307198524475, + "logits/rejected": -1.672844648361206, + "logps/chosen": -186.94482421875, + "logps/rejected": -250.39633178710938, + "loss": 0.551, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3714474439620972, + "rewards/margins": 0.5982364416122437, + "rewards/rejected": -1.9696840047836304, + "step": 7830 + }, + { + "epoch": 1.3507925568573398, + "grad_norm": 23.183025360107422, + "learning_rate": 1.341245519242524e-07, + "logits/chosen": -1.6737979650497437, + "logits/rejected": -1.6364225149154663, + "logps/chosen": -184.36581420898438, + "logps/rejected": -234.9649658203125, + "loss": 0.571, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3079830408096313, + "rewards/margins": 0.499339759349823, + "rewards/rejected": -1.8073227405548096, + "step": 7840 + }, + { + "epoch": 1.3525155065472088, + "grad_norm": 23.90206527709961, + "learning_rate": 1.3393603318063e-07, + "logits/chosen": -1.6684505939483643, + "logits/rejected": -1.6141875982284546, + "logps/chosen": -190.3370819091797, + "logps/rejected": -236.37905883789062, + "loss": 0.5752, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.347599983215332, + "rewards/margins": 0.5324249267578125, + "rewards/rejected": -1.8800249099731445, + "step": 7850 + }, + { + "epoch": 1.354238456237078, + "grad_norm": 23.96225357055664, + "learning_rate": 1.3374737803439685e-07, + "logits/chosen": -1.694443941116333, + "logits/rejected": -1.6376886367797852, + "logps/chosen": -188.59872436523438, + "logps/rejected": -254.9580841064453, + "loss": 0.5265, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.357022762298584, + "rewards/margins": 0.6660300493240356, + "rewards/rejected": -2.02305269241333, + "step": 7860 + }, + { + "epoch": 1.355961405926947, + "grad_norm": 35.446407318115234, + "learning_rate": 1.3355858724383415e-07, + "logits/chosen": -1.708349585533142, + "logits/rejected": -1.6715790033340454, + "logps/chosen": -196.78343200683594, + "logps/rejected": -252.73013305664062, + "loss": 0.5663, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4110945463180542, + "rewards/margins": 0.5725733041763306, + "rewards/rejected": -1.9836680889129639, + "step": 7870 + }, + { + "epoch": 1.3576843556168159, + "grad_norm": 25.436092376708984, + "learning_rate": 1.3336966156776822e-07, + "logits/chosen": -1.7855867147445679, + "logits/rejected": -1.7552499771118164, + "logps/chosen": -191.6211395263672, + "logps/rejected": -239.4647674560547, + "loss": 0.5904, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.385075569152832, + "rewards/margins": 0.4844435155391693, + "rewards/rejected": -1.8695189952850342, + "step": 7880 + }, + { + "epoch": 1.359407305306685, + "grad_norm": 22.131240844726562, + "learning_rate": 1.3318060176556756e-07, + "logits/chosen": -1.7470848560333252, + "logits/rejected": -1.7071688175201416, + "logps/chosen": -184.20083618164062, + "logps/rejected": -242.040771484375, + "loss": 0.5498, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2982476949691772, + "rewards/margins": 0.5696505308151245, + "rewards/rejected": -1.8678982257843018, + "step": 7890 + }, + { + "epoch": 1.361130254996554, + "grad_norm": 23.41493797302246, + "learning_rate": 1.3299140859713983e-07, + "logits/chosen": -1.7388805150985718, + "logits/rejected": -1.7148288488388062, + "logps/chosen": -182.5740203857422, + "logps/rejected": -238.2061004638672, + "loss": 0.5593, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.303978443145752, + "rewards/margins": 0.5415924787521362, + "rewards/rejected": -1.8455708026885986, + "step": 7900 + }, + { + "epoch": 1.3628532046864232, + "grad_norm": 27.00302505493164, + "learning_rate": 1.3280208282292878e-07, + "logits/chosen": -1.7452290058135986, + "logits/rejected": -1.703344702720642, + "logps/chosen": -188.55166625976562, + "logps/rejected": -232.04141235351562, + "loss": 0.5916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3172823190689087, + "rewards/margins": 0.466630220413208, + "rewards/rejected": -1.7839124202728271, + "step": 7910 + }, + { + "epoch": 1.3645761543762922, + "grad_norm": 26.739322662353516, + "learning_rate": 1.3261262520391097e-07, + "logits/chosen": -1.7465155124664307, + "logits/rejected": -1.7103631496429443, + "logps/chosen": -181.1072998046875, + "logps/rejected": -227.1074981689453, + "loss": 0.577, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2628659009933472, + "rewards/margins": 0.493598073720932, + "rewards/rejected": -1.7564637660980225, + "step": 7920 + }, + { + "epoch": 1.3662991040661612, + "grad_norm": 30.61216926574707, + "learning_rate": 1.3242303650159313e-07, + "logits/chosen": -1.7928102016448975, + "logits/rejected": -1.7448190450668335, + "logps/chosen": -187.6417694091797, + "logps/rejected": -238.43606567382812, + "loss": 0.5541, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.293071985244751, + "rewards/margins": 0.567300021648407, + "rewards/rejected": -1.8603721857070923, + "step": 7930 + }, + { + "epoch": 1.3680220537560304, + "grad_norm": 29.23872947692871, + "learning_rate": 1.3223331747800867e-07, + "logits/chosen": -1.7208061218261719, + "logits/rejected": -1.6839271783828735, + "logps/chosen": -185.5218048095703, + "logps/rejected": -242.7389678955078, + "loss": 0.5544, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2696150541305542, + "rewards/margins": 0.6055715680122375, + "rewards/rejected": -1.8751866817474365, + "step": 7940 + }, + { + "epoch": 1.3697450034458993, + "grad_norm": 17.264358520507812, + "learning_rate": 1.3204346889571494e-07, + "logits/chosen": -1.6770089864730835, + "logits/rejected": -1.6513347625732422, + "logps/chosen": -165.56146240234375, + "logps/rejected": -223.57083129882812, + "loss": 0.5687, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1405541896820068, + "rewards/margins": 0.5624696016311646, + "rewards/rejected": -1.7030236721038818, + "step": 7950 + }, + { + "epoch": 1.3714679531357685, + "grad_norm": 21.592418670654297, + "learning_rate": 1.3185349151779e-07, + "logits/chosen": -1.705940842628479, + "logits/rejected": -1.6647335290908813, + "logps/chosen": -175.36734008789062, + "logps/rejected": -224.5946807861328, + "loss": 0.5805, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2488831281661987, + "rewards/margins": 0.47221383452415466, + "rewards/rejected": -1.7210969924926758, + "step": 7960 + }, + { + "epoch": 1.3731909028256375, + "grad_norm": 20.810039520263672, + "learning_rate": 1.3166338610782957e-07, + "logits/chosen": -1.7742341756820679, + "logits/rejected": -1.7324060201644897, + "logps/chosen": -189.1077117919922, + "logps/rejected": -235.3397979736328, + "loss": 0.588, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3371036052703857, + "rewards/margins": 0.4955763816833496, + "rewards/rejected": -1.8326799869537354, + "step": 7970 + }, + { + "epoch": 1.3749138525155065, + "grad_norm": 24.57638168334961, + "learning_rate": 1.31473153429944e-07, + "logits/chosen": -1.8342373371124268, + "logits/rejected": -1.8030681610107422, + "logps/chosen": -182.3204345703125, + "logps/rejected": -236.19168090820312, + "loss": 0.5656, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.254408359527588, + "rewards/margins": 0.5464503765106201, + "rewards/rejected": -1.8008590936660767, + "step": 7980 + }, + { + "epoch": 1.3766368022053757, + "grad_norm": 21.95906639099121, + "learning_rate": 1.3128279424875523e-07, + "logits/chosen": -1.839450478553772, + "logits/rejected": -1.8062585592269897, + "logps/chosen": -177.2168731689453, + "logps/rejected": -235.6778106689453, + "loss": 0.5459, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2654964923858643, + "rewards/margins": 0.5746921896934509, + "rewards/rejected": -1.84018874168396, + "step": 7990 + }, + { + "epoch": 1.3783597518952446, + "grad_norm": 19.835283279418945, + "learning_rate": 1.3109230932939354e-07, + "logits/chosen": -1.6997854709625244, + "logits/rejected": -1.6718209981918335, + "logps/chosen": -175.2350311279297, + "logps/rejected": -238.880859375, + "loss": 0.5403, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2071750164031982, + "rewards/margins": 0.6192091703414917, + "rewards/rejected": -1.82638418674469, + "step": 8000 + }, + { + "epoch": 1.3783597518952446, + "eval_logits/chosen": -1.8848055601119995, + "eval_logits/rejected": -1.864662766456604, + "eval_logps/chosen": -173.08419799804688, + "eval_logps/rejected": -204.28668212890625, + "eval_loss": 0.6338950991630554, + "eval_rewards/accuracies": 0.6263940334320068, + "eval_rewards/chosen": -1.1437228918075562, + "eval_rewards/margins": 0.2673425078392029, + "eval_rewards/rejected": -1.4110653400421143, + "eval_runtime": 383.1441, + "eval_samples_per_second": 11.233, + "eval_steps_per_second": 1.404, + "step": 8000 + }, + { + "epoch": 1.3800827015851138, + "grad_norm": 29.697978973388672, + "learning_rate": 1.3090169943749475e-07, + "logits/chosen": -1.7395660877227783, + "logits/rejected": -1.6922681331634521, + "logps/chosen": -180.59262084960938, + "logps/rejected": -233.02597045898438, + "loss": 0.5438, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.234327793121338, + "rewards/margins": 0.5716985464096069, + "rewards/rejected": -1.8060262203216553, + "step": 8010 + }, + { + "epoch": 1.3818056512749828, + "grad_norm": 30.644569396972656, + "learning_rate": 1.307109653391969e-07, + "logits/chosen": -1.7602342367172241, + "logits/rejected": -1.7022281885147095, + "logps/chosen": -199.9853973388672, + "logps/rejected": -250.7794952392578, + "loss": 0.548, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3974756002426147, + "rewards/margins": 0.5628911852836609, + "rewards/rejected": -1.9603666067123413, + "step": 8020 + }, + { + "epoch": 1.3835286009648518, + "grad_norm": 27.904361724853516, + "learning_rate": 1.3052010780113726e-07, + "logits/chosen": -1.7528932094573975, + "logits/rejected": -1.7155113220214844, + "logps/chosen": -174.95889282226562, + "logps/rejected": -240.86398315429688, + "loss": 0.5346, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2381556034088135, + "rewards/margins": 0.6363126635551453, + "rewards/rejected": -1.874468207359314, + "step": 8030 + }, + { + "epoch": 1.385251550654721, + "grad_norm": 20.0029239654541, + "learning_rate": 1.3032912759044937e-07, + "logits/chosen": -1.7023286819458008, + "logits/rejected": -1.6535085439682007, + "logps/chosen": -193.00807189941406, + "logps/rejected": -251.18673706054688, + "loss": 0.5622, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3858143091201782, + "rewards/margins": 0.6057716608047485, + "rewards/rejected": -1.9915859699249268, + "step": 8040 + }, + { + "epoch": 1.38697450034459, + "grad_norm": 17.56081771850586, + "learning_rate": 1.301380254747597e-07, + "logits/chosen": -1.7698986530303955, + "logits/rejected": -1.7235695123672485, + "logps/chosen": -182.3731689453125, + "logps/rejected": -253.9183349609375, + "loss": 0.5107, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2726563215255737, + "rewards/margins": 0.7080651521682739, + "rewards/rejected": -1.9807217121124268, + "step": 8050 + }, + { + "epoch": 1.388697450034459, + "grad_norm": 27.260448455810547, + "learning_rate": 1.2994680222218478e-07, + "logits/chosen": -1.7978088855743408, + "logits/rejected": -1.7510201930999756, + "logps/chosen": -194.02896118164062, + "logps/rejected": -236.45883178710938, + "loss": 0.6043, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3869035243988037, + "rewards/margins": 0.47681206464767456, + "rewards/rejected": -1.8637157678604126, + "step": 8060 + }, + { + "epoch": 1.390420399724328, + "grad_norm": 33.41956329345703, + "learning_rate": 1.29755458601328e-07, + "logits/chosen": -1.7090890407562256, + "logits/rejected": -1.6601349115371704, + "logps/chosen": -193.67401123046875, + "logps/rejected": -243.97958374023438, + "loss": 0.5677, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4021570682525635, + "rewards/margins": 0.5379910469055176, + "rewards/rejected": -1.9401483535766602, + "step": 8070 + }, + { + "epoch": 1.392143349414197, + "grad_norm": 24.615610122680664, + "learning_rate": 1.2956399538127665e-07, + "logits/chosen": -1.717283844947815, + "logits/rejected": -1.688805341720581, + "logps/chosen": -174.60430908203125, + "logps/rejected": -242.5743865966797, + "loss": 0.5228, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2161786556243896, + "rewards/margins": 0.6299420595169067, + "rewards/rejected": -1.8461205959320068, + "step": 8080 + }, + { + "epoch": 1.3938662991040662, + "grad_norm": 20.54537582397461, + "learning_rate": 1.2937241333159854e-07, + "logits/chosen": -1.6753578186035156, + "logits/rejected": -1.6328798532485962, + "logps/chosen": -182.584716796875, + "logps/rejected": -242.11477661132812, + "loss": 0.5507, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2946635484695435, + "rewards/margins": 0.611949622631073, + "rewards/rejected": -1.9066131114959717, + "step": 8090 + }, + { + "epoch": 1.3955892487939352, + "grad_norm": 25.101362228393555, + "learning_rate": 1.2918071322233933e-07, + "logits/chosen": -1.7218458652496338, + "logits/rejected": -1.7032239437103271, + "logps/chosen": -193.86203002929688, + "logps/rejected": -240.07717895507812, + "loss": 0.5975, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3978016376495361, + "rewards/margins": 0.42204397916793823, + "rewards/rejected": -1.8198457956314087, + "step": 8100 + }, + { + "epoch": 1.3973121984838044, + "grad_norm": 29.090593338012695, + "learning_rate": 1.2898889582401912e-07, + "logits/chosen": -1.756087303161621, + "logits/rejected": -1.7036664485931396, + "logps/chosen": -185.19163513183594, + "logps/rejected": -235.901123046875, + "loss": 0.5544, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2913007736206055, + "rewards/margins": 0.5572675466537476, + "rewards/rejected": -1.848568320274353, + "step": 8110 + }, + { + "epoch": 1.3990351481736734, + "grad_norm": 28.600011825561523, + "learning_rate": 1.287969619076294e-07, + "logits/chosen": -1.7284362316131592, + "logits/rejected": -1.692164659500122, + "logps/chosen": -177.6201934814453, + "logps/rejected": -226.9577178955078, + "loss": 0.5815, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.235177755355835, + "rewards/margins": 0.5055698156356812, + "rewards/rejected": -1.7407476902008057, + "step": 8120 + }, + { + "epoch": 1.4007580978635423, + "grad_norm": 25.980693817138672, + "learning_rate": 1.2860491224463003e-07, + "logits/chosen": -1.7584434747695923, + "logits/rejected": -1.727425217628479, + "logps/chosen": -173.2230682373047, + "logps/rejected": -219.60348510742188, + "loss": 0.5641, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1840296983718872, + "rewards/margins": 0.5059834122657776, + "rewards/rejected": -1.6900132894515991, + "step": 8130 + }, + { + "epoch": 1.4024810475534115, + "grad_norm": 23.344661712646484, + "learning_rate": 1.2841274760694607e-07, + "logits/chosen": -1.722978949546814, + "logits/rejected": -1.6852538585662842, + "logps/chosen": -175.82997131347656, + "logps/rejected": -232.1393585205078, + "loss": 0.5572, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2310302257537842, + "rewards/margins": 0.5622873902320862, + "rewards/rejected": -1.7933177947998047, + "step": 8140 + }, + { + "epoch": 1.4042039972432805, + "grad_norm": 17.72876739501953, + "learning_rate": 1.282204687669648e-07, + "logits/chosen": -1.7822058200836182, + "logits/rejected": -1.7442264556884766, + "logps/chosen": -184.4099884033203, + "logps/rejected": -238.8332977294922, + "loss": 0.5873, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3226220607757568, + "rewards/margins": 0.5349725484848022, + "rewards/rejected": -1.8575942516326904, + "step": 8150 + }, + { + "epoch": 1.4059269469331497, + "grad_norm": 18.7353515625, + "learning_rate": 1.280280764975324e-07, + "logits/chosen": -1.7413051128387451, + "logits/rejected": -1.6966259479522705, + "logps/chosen": -173.74789428710938, + "logps/rejected": -239.2137908935547, + "loss": 0.5069, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.177348017692566, + "rewards/margins": 0.6919088363647461, + "rewards/rejected": -1.8692569732666016, + "step": 8160 + }, + { + "epoch": 1.4076498966230186, + "grad_norm": 28.141429901123047, + "learning_rate": 1.278355715719511e-07, + "logits/chosen": -1.7984817028045654, + "logits/rejected": -1.7529537677764893, + "logps/chosen": -178.94229125976562, + "logps/rejected": -222.255859375, + "loss": 0.5441, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1923143863677979, + "rewards/margins": 0.5107215642929077, + "rewards/rejected": -1.7030360698699951, + "step": 8170 + }, + { + "epoch": 1.4093728463128876, + "grad_norm": 30.230098724365234, + "learning_rate": 1.276429547639758e-07, + "logits/chosen": -1.7686141729354858, + "logits/rejected": -1.7348991632461548, + "logps/chosen": -186.8579864501953, + "logps/rejected": -231.6160430908203, + "loss": 0.5838, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3056573867797852, + "rewards/margins": 0.46039143204689026, + "rewards/rejected": -1.7660486698150635, + "step": 8180 + }, + { + "epoch": 1.4110957960027566, + "grad_norm": 25.567800521850586, + "learning_rate": 1.274502268478112e-07, + "logits/chosen": -1.721076250076294, + "logits/rejected": -1.6785032749176025, + "logps/chosen": -186.8155517578125, + "logps/rejected": -235.377197265625, + "loss": 0.5803, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2883694171905518, + "rewards/margins": 0.5267794728279114, + "rewards/rejected": -1.815148949623108, + "step": 8190 + }, + { + "epoch": 1.4128187456926258, + "grad_norm": 34.265472412109375, + "learning_rate": 1.2725738859810862e-07, + "logits/chosen": -1.7358614206314087, + "logits/rejected": -1.6915502548217773, + "logps/chosen": -194.3486328125, + "logps/rejected": -238.12918090820312, + "loss": 0.5919, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3935348987579346, + "rewards/margins": 0.4622439742088318, + "rewards/rejected": -1.8557790517807007, + "step": 8200 + }, + { + "epoch": 1.414541695382495, + "grad_norm": 22.01947593688965, + "learning_rate": 1.270644407899627e-07, + "logits/chosen": -1.6316165924072266, + "logits/rejected": -1.596050500869751, + "logps/chosen": -184.8388214111328, + "logps/rejected": -238.46627807617188, + "loss": 0.5665, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2973389625549316, + "rewards/margins": 0.525777280330658, + "rewards/rejected": -1.8231165409088135, + "step": 8210 + }, + { + "epoch": 1.416264645072364, + "grad_norm": 21.25282859802246, + "learning_rate": 1.2687138419890863e-07, + "logits/chosen": -1.7528893947601318, + "logits/rejected": -1.7010984420776367, + "logps/chosen": -177.33926391601562, + "logps/rejected": -236.0788116455078, + "loss": 0.539, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2424618005752563, + "rewards/margins": 0.5878540277481079, + "rewards/rejected": -1.8303155899047852, + "step": 8220 + }, + { + "epoch": 1.417987594762233, + "grad_norm": 21.485868453979492, + "learning_rate": 1.2667821960091865e-07, + "logits/chosen": -1.7280648946762085, + "logits/rejected": -1.6949926614761353, + "logps/chosen": -184.8418426513672, + "logps/rejected": -234.5526885986328, + "loss": 0.5588, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.295932412147522, + "rewards/margins": 0.5362016558647156, + "rewards/rejected": -1.8321340084075928, + "step": 8230 + }, + { + "epoch": 1.4197105444521019, + "grad_norm": 36.89909744262695, + "learning_rate": 1.2648494777239934e-07, + "logits/chosen": -1.8095436096191406, + "logits/rejected": -1.7656471729278564, + "logps/chosen": -191.72720336914062, + "logps/rejected": -241.11117553710938, + "loss": 0.5616, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.355535626411438, + "rewards/margins": 0.5371559858322144, + "rewards/rejected": -1.8926916122436523, + "step": 8240 + }, + { + "epoch": 1.421433494141971, + "grad_norm": 21.338768005371094, + "learning_rate": 1.2629156949018805e-07, + "logits/chosen": -1.7479426860809326, + "logits/rejected": -1.711358666419983, + "logps/chosen": -180.8011474609375, + "logps/rejected": -240.23995971679688, + "loss": 0.535, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2315177917480469, + "rewards/margins": 0.627647340297699, + "rewards/rejected": -1.8591649532318115, + "step": 8250 + }, + { + "epoch": 1.42315644383184, + "grad_norm": 25.411182403564453, + "learning_rate": 1.260980855315502e-07, + "logits/chosen": -1.8247654438018799, + "logits/rejected": -1.787501573562622, + "logps/chosen": -184.41542053222656, + "logps/rejected": -249.8496856689453, + "loss": 0.5518, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2800081968307495, + "rewards/margins": 0.6656675934791565, + "rewards/rejected": -1.9456758499145508, + "step": 8260 + }, + { + "epoch": 1.4248793935217092, + "grad_norm": 24.6607608795166, + "learning_rate": 1.2590449667417585e-07, + "logits/chosen": -1.8074318170547485, + "logits/rejected": -1.7805984020233154, + "logps/chosen": -183.58489990234375, + "logps/rejected": -243.6260986328125, + "loss": 0.5597, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3018925189971924, + "rewards/margins": 0.5806858539581299, + "rewards/rejected": -1.8825784921646118, + "step": 8270 + }, + { + "epoch": 1.4266023432115782, + "grad_norm": 24.326169967651367, + "learning_rate": 1.2571080369617673e-07, + "logits/chosen": -1.7513822317123413, + "logits/rejected": -1.7262996435165405, + "logps/chosen": -184.77059936523438, + "logps/rejected": -232.02096557617188, + "loss": 0.5994, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3239353895187378, + "rewards/margins": 0.4675242304801941, + "rewards/rejected": -1.7914596796035767, + "step": 8280 + }, + { + "epoch": 1.4283252929014472, + "grad_norm": 19.493026733398438, + "learning_rate": 1.2551700737608313e-07, + "logits/chosen": -1.7309757471084595, + "logits/rejected": -1.6682205200195312, + "logps/chosen": -179.61337280273438, + "logps/rejected": -220.5546875, + "loss": 0.5686, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1967713832855225, + "rewards/margins": 0.4668409824371338, + "rewards/rejected": -1.6636121273040771, + "step": 8290 + }, + { + "epoch": 1.4300482425913164, + "grad_norm": 18.092647552490234, + "learning_rate": 1.253231084928406e-07, + "logits/chosen": -1.8834679126739502, + "logits/rejected": -1.8440923690795898, + "logps/chosen": -184.91363525390625, + "logps/rejected": -235.0916748046875, + "loss": 0.5922, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2733423709869385, + "rewards/margins": 0.5322745442390442, + "rewards/rejected": -1.8056169748306274, + "step": 8300 + }, + { + "epoch": 1.4317711922811853, + "grad_norm": 26.49702262878418, + "learning_rate": 1.2512910782580704e-07, + "logits/chosen": -1.6803478002548218, + "logits/rejected": -1.6344587802886963, + "logps/chosen": -173.33175659179688, + "logps/rejected": -220.29006958007812, + "loss": 0.5545, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1706262826919556, + "rewards/margins": 0.5207440853118896, + "rewards/rejected": -1.6913702487945557, + "step": 8310 + }, + { + "epoch": 1.4334941419710545, + "grad_norm": 23.054729461669922, + "learning_rate": 1.2493500615474937e-07, + "logits/chosen": -1.7400583028793335, + "logits/rejected": -1.7084745168685913, + "logps/chosen": -171.78407287597656, + "logps/rejected": -231.578369140625, + "loss": 0.5482, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.19270658493042, + "rewards/margins": 0.5825310945510864, + "rewards/rejected": -1.7752374410629272, + "step": 8320 + }, + { + "epoch": 1.4352170916609235, + "grad_norm": 25.612323760986328, + "learning_rate": 1.2474080425984056e-07, + "logits/chosen": -1.7509899139404297, + "logits/rejected": -1.7259149551391602, + "logps/chosen": -183.274169921875, + "logps/rejected": -233.5478057861328, + "loss": 0.5994, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3465931415557861, + "rewards/margins": 0.45025119185447693, + "rewards/rejected": -1.796844244003296, + "step": 8330 + }, + { + "epoch": 1.4369400413507925, + "grad_norm": 25.223346710205078, + "learning_rate": 1.2454650292165634e-07, + "logits/chosen": -1.8134841918945312, + "logits/rejected": -1.7871147394180298, + "logps/chosen": -183.5989990234375, + "logps/rejected": -228.4562225341797, + "loss": 0.5981, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.291554570198059, + "rewards/margins": 0.47639140486717224, + "rewards/rejected": -1.7679458856582642, + "step": 8340 + }, + { + "epoch": 1.4386629910406616, + "grad_norm": 21.51726531982422, + "learning_rate": 1.2435210292117223e-07, + "logits/chosen": -1.6630241870880127, + "logits/rejected": -1.6343367099761963, + "logps/chosen": -182.60397338867188, + "logps/rejected": -225.54019165039062, + "loss": 0.5842, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2943296432495117, + "rewards/margins": 0.4351266920566559, + "rewards/rejected": -1.7294561862945557, + "step": 8350 + }, + { + "epoch": 1.4403859407305306, + "grad_norm": 28.087337493896484, + "learning_rate": 1.2415760503976027e-07, + "logits/chosen": -1.6792490482330322, + "logits/rejected": -1.622058629989624, + "logps/chosen": -163.5045166015625, + "logps/rejected": -222.25582885742188, + "loss": 0.5219, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0895042419433594, + "rewards/margins": 0.6140426397323608, + "rewards/rejected": -1.7035468816757202, + "step": 8360 + }, + { + "epoch": 1.4421088904203998, + "grad_norm": 28.05508041381836, + "learning_rate": 1.2396301005918592e-07, + "logits/chosen": -1.7516257762908936, + "logits/rejected": -1.712021827697754, + "logps/chosen": -174.34732055664062, + "logps/rejected": -233.7310791015625, + "loss": 0.5609, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.232910394668579, + "rewards/margins": 0.5682687759399414, + "rewards/rejected": -1.8011791706085205, + "step": 8370 + }, + { + "epoch": 1.4438318401102688, + "grad_norm": 18.621915817260742, + "learning_rate": 1.2376831876160493e-07, + "logits/chosen": -1.8677845001220703, + "logits/rejected": -1.8293062448501587, + "logps/chosen": -157.7315673828125, + "logps/rejected": -216.45401000976562, + "loss": 0.5486, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0649034976959229, + "rewards/margins": 0.5581703186035156, + "rewards/rejected": -1.6230738162994385, + "step": 8380 + }, + { + "epoch": 1.4455547898001377, + "grad_norm": 19.196773529052734, + "learning_rate": 1.2357353192956015e-07, + "logits/chosen": -1.837416648864746, + "logits/rejected": -1.786107063293457, + "logps/chosen": -167.60275268554688, + "logps/rejected": -239.2596435546875, + "loss": 0.487, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.109065294265747, + "rewards/margins": 0.7224443554878235, + "rewards/rejected": -1.8315098285675049, + "step": 8390 + }, + { + "epoch": 1.447277739490007, + "grad_norm": 19.819705963134766, + "learning_rate": 1.2337865034597853e-07, + "logits/chosen": -1.7919927835464478, + "logits/rejected": -1.7521469593048096, + "logps/chosen": -157.2268524169922, + "logps/rejected": -214.87722778320312, + "loss": 0.5444, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.069208025932312, + "rewards/margins": 0.571751594543457, + "rewards/rejected": -1.6409597396850586, + "step": 8400 + }, + { + "epoch": 1.447277739490007, + "eval_logits/chosen": -1.8767586946487427, + "eval_logits/rejected": -1.8567559719085693, + "eval_logps/chosen": -165.97645568847656, + "eval_logps/rejected": -196.28274536132812, + "eval_loss": 0.6338884234428406, + "eval_rewards/accuracies": 0.6287174820899963, + "eval_rewards/chosen": -1.072645664215088, + "eval_rewards/margins": 0.25838038325309753, + "eval_rewards/rejected": -1.3310261964797974, + "eval_runtime": 384.2633, + "eval_samples_per_second": 11.201, + "eval_steps_per_second": 1.4, + "step": 8400 + }, + { + "epoch": 1.449000689179876, + "grad_norm": 18.899368286132812, + "learning_rate": 1.2318367479416772e-07, + "logits/chosen": -1.7205651998519897, + "logits/rejected": -1.6706392765045166, + "logps/chosen": -180.8457794189453, + "logps/rejected": -242.57177734375, + "loss": 0.5331, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2318532466888428, + "rewards/margins": 0.6757212281227112, + "rewards/rejected": -1.9075744152069092, + "step": 8410 + }, + { + "epoch": 1.450723638869745, + "grad_norm": 23.191862106323242, + "learning_rate": 1.2298860605781317e-07, + "logits/chosen": -1.61892831325531, + "logits/rejected": -1.5845869779586792, + "logps/chosen": -162.18161010742188, + "logps/rejected": -229.1549530029297, + "loss": 0.5114, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1052982807159424, + "rewards/margins": 0.6736155152320862, + "rewards/rejected": -1.7789137363433838, + "step": 8420 + }, + { + "epoch": 1.452446588559614, + "grad_norm": 27.311832427978516, + "learning_rate": 1.2279344492097482e-07, + "logits/chosen": -1.7439725399017334, + "logits/rejected": -1.7103437185287476, + "logps/chosen": -173.8157501220703, + "logps/rejected": -229.6221160888672, + "loss": 0.5683, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2032434940338135, + "rewards/margins": 0.5264207720756531, + "rewards/rejected": -1.7296644449234009, + "step": 8430 + }, + { + "epoch": 1.454169538249483, + "grad_norm": 26.43974494934082, + "learning_rate": 1.2259819216808406e-07, + "logits/chosen": -1.76885187625885, + "logits/rejected": -1.7441856861114502, + "logps/chosen": -179.97201538085938, + "logps/rejected": -224.7045440673828, + "loss": 0.5869, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2517940998077393, + "rewards/margins": 0.47994908690452576, + "rewards/rejected": -1.7317432165145874, + "step": 8440 + }, + { + "epoch": 1.4558924879393522, + "grad_norm": 21.743513107299805, + "learning_rate": 1.2240284858394048e-07, + "logits/chosen": -1.640924096107483, + "logits/rejected": -1.6128562688827515, + "logps/chosen": -175.72862243652344, + "logps/rejected": -240.687744140625, + "loss": 0.5499, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2771676778793335, + "rewards/margins": 0.6006568670272827, + "rewards/rejected": -1.8778244256973267, + "step": 8450 + }, + { + "epoch": 1.4576154376292212, + "grad_norm": 21.22736167907715, + "learning_rate": 1.2220741495370875e-07, + "logits/chosen": -1.7674133777618408, + "logits/rejected": -1.7252048254013062, + "logps/chosen": -178.5194854736328, + "logps/rejected": -238.6125946044922, + "loss": 0.5199, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2571282386779785, + "rewards/margins": 0.5916931629180908, + "rewards/rejected": -1.8488212823867798, + "step": 8460 + }, + { + "epoch": 1.4593383873190904, + "grad_norm": 40.24439239501953, + "learning_rate": 1.220118920629155e-07, + "logits/chosen": -1.720158338546753, + "logits/rejected": -1.683738112449646, + "logps/chosen": -201.67807006835938, + "logps/rejected": -249.63784790039062, + "loss": 0.6055, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.456115961074829, + "rewards/margins": 0.5088047385215759, + "rewards/rejected": -1.9649207592010498, + "step": 8470 + }, + { + "epoch": 1.4610613370089593, + "grad_norm": 27.819427490234375, + "learning_rate": 1.2181628069744613e-07, + "logits/chosen": -1.7302448749542236, + "logits/rejected": -1.6805111169815063, + "logps/chosen": -181.5083465576172, + "logps/rejected": -242.5039520263672, + "loss": 0.555, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.284711480140686, + "rewards/margins": 0.6244471669197083, + "rewards/rejected": -1.90915846824646, + "step": 8480 + }, + { + "epoch": 1.4627842866988283, + "grad_norm": 18.368925094604492, + "learning_rate": 1.216205816435416e-07, + "logits/chosen": -1.8100181818008423, + "logits/rejected": -1.7824761867523193, + "logps/chosen": -175.33273315429688, + "logps/rejected": -233.98095703125, + "loss": 0.5516, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2059272527694702, + "rewards/margins": 0.5618650317192078, + "rewards/rejected": -1.7677921056747437, + "step": 8490 + }, + { + "epoch": 1.4645072363886975, + "grad_norm": 25.975460052490234, + "learning_rate": 1.2142479568779545e-07, + "logits/chosen": -1.6658073663711548, + "logits/rejected": -1.6362226009368896, + "logps/chosen": -173.92318725585938, + "logps/rejected": -230.8607635498047, + "loss": 0.5476, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1961640119552612, + "rewards/margins": 0.5779889225959778, + "rewards/rejected": -1.7741529941558838, + "step": 8500 + }, + { + "epoch": 1.4662301860785665, + "grad_norm": 27.53252601623535, + "learning_rate": 1.2122892361715042e-07, + "logits/chosen": -1.7083604335784912, + "logits/rejected": -1.6629486083984375, + "logps/chosen": -184.6015167236328, + "logps/rejected": -244.2220001220703, + "loss": 0.5126, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2675223350524902, + "rewards/margins": 0.6594316363334656, + "rewards/rejected": -1.9269540309906006, + "step": 8510 + }, + { + "epoch": 1.4679531357684357, + "grad_norm": 21.84817886352539, + "learning_rate": 1.2103296621889531e-07, + "logits/chosen": -1.6953309774398804, + "logits/rejected": -1.6540002822875977, + "logps/chosen": -186.7661590576172, + "logps/rejected": -238.6102752685547, + "loss": 0.565, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3319623470306396, + "rewards/margins": 0.5671849250793457, + "rewards/rejected": -1.899147391319275, + "step": 8520 + }, + { + "epoch": 1.4696760854583046, + "grad_norm": 26.40680503845215, + "learning_rate": 1.2083692428066207e-07, + "logits/chosen": -1.6670316457748413, + "logits/rejected": -1.6319210529327393, + "logps/chosen": -188.43687438964844, + "logps/rejected": -238.4445037841797, + "loss": 0.5843, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.345499038696289, + "rewards/margins": 0.5246737003326416, + "rewards/rejected": -1.8701727390289307, + "step": 8530 + }, + { + "epoch": 1.4713990351481736, + "grad_norm": 21.45933723449707, + "learning_rate": 1.2064079859042237e-07, + "logits/chosen": -1.8144257068634033, + "logits/rejected": -1.791337251663208, + "logps/chosen": -183.89779663085938, + "logps/rejected": -226.5663299560547, + "loss": 0.6059, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2920362949371338, + "rewards/margins": 0.4292899966239929, + "rewards/rejected": -1.721326470375061, + "step": 8540 + }, + { + "epoch": 1.4731219848380428, + "grad_norm": 21.429597854614258, + "learning_rate": 1.204445899364844e-07, + "logits/chosen": -1.7466418743133545, + "logits/rejected": -1.7070821523666382, + "logps/chosen": -183.11488342285156, + "logps/rejected": -248.70767211914062, + "loss": 0.5287, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2927693128585815, + "rewards/margins": 0.6410099267959595, + "rewards/rejected": -1.9337793588638306, + "step": 8550 + }, + { + "epoch": 1.4748449345279118, + "grad_norm": 19.853425979614258, + "learning_rate": 1.2024829910749e-07, + "logits/chosen": -1.8415504693984985, + "logits/rejected": -1.7988510131835938, + "logps/chosen": -182.31796264648438, + "logps/rejected": -244.3562469482422, + "loss": 0.5303, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.281106948852539, + "rewards/margins": 0.6391115188598633, + "rewards/rejected": -1.9202184677124023, + "step": 8560 + }, + { + "epoch": 1.476567884217781, + "grad_norm": 26.883686065673828, + "learning_rate": 1.2005192689241111e-07, + "logits/chosen": -1.7047302722930908, + "logits/rejected": -1.6662166118621826, + "logps/chosen": -178.93783569335938, + "logps/rejected": -230.54995727539062, + "loss": 0.5444, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2458077669143677, + "rewards/margins": 0.5393422842025757, + "rewards/rejected": -1.7851499319076538, + "step": 8570 + }, + { + "epoch": 1.47829083390765, + "grad_norm": 22.551843643188477, + "learning_rate": 1.1985547408054707e-07, + "logits/chosen": -1.786651372909546, + "logits/rejected": -1.7402511835098267, + "logps/chosen": -171.06886291503906, + "logps/rejected": -240.85791015625, + "loss": 0.4924, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1693259477615356, + "rewards/margins": 0.6937976479530334, + "rewards/rejected": -1.8631235361099243, + "step": 8580 + }, + { + "epoch": 1.480013783597519, + "grad_norm": 22.17210578918457, + "learning_rate": 1.1965894146152083e-07, + "logits/chosen": -1.7574405670166016, + "logits/rejected": -1.704829454421997, + "logps/chosen": -180.36509704589844, + "logps/rejected": -240.45803833007812, + "loss": 0.5201, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2337052822113037, + "rewards/margins": 0.6370530724525452, + "rewards/rejected": -1.8707586526870728, + "step": 8590 + }, + { + "epoch": 1.481736733287388, + "grad_norm": 29.228626251220703, + "learning_rate": 1.1946232982527637e-07, + "logits/chosen": -1.677152395248413, + "logits/rejected": -1.6436678171157837, + "logps/chosen": -199.14393615722656, + "logps/rejected": -238.5497589111328, + "loss": 0.6175, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4570565223693848, + "rewards/margins": 0.41096407175064087, + "rewards/rejected": -1.8680204153060913, + "step": 8600 + }, + { + "epoch": 1.483459682977257, + "grad_norm": 35.54603958129883, + "learning_rate": 1.1926563996207518e-07, + "logits/chosen": -1.7128098011016846, + "logits/rejected": -1.6729986667633057, + "logps/chosen": -189.81747436523438, + "logps/rejected": -245.4787139892578, + "loss": 0.5656, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3567256927490234, + "rewards/margins": 0.5924991369247437, + "rewards/rejected": -1.9492250680923462, + "step": 8610 + }, + { + "epoch": 1.4851826326671262, + "grad_norm": 22.30665397644043, + "learning_rate": 1.1906887266249317e-07, + "logits/chosen": -1.6407105922698975, + "logits/rejected": -1.6167869567871094, + "logps/chosen": -191.15216064453125, + "logps/rejected": -230.05224609375, + "loss": 0.6086, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.36378014087677, + "rewards/margins": 0.4242352545261383, + "rewards/rejected": -1.788015365600586, + "step": 8620 + }, + { + "epoch": 1.4869055823569952, + "grad_norm": 24.072481155395508, + "learning_rate": 1.1887202871741757e-07, + "logits/chosen": -1.6276092529296875, + "logits/rejected": -1.5934003591537476, + "logps/chosen": -168.8276824951172, + "logps/rejected": -233.3299102783203, + "loss": 0.5156, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.1712262630462646, + "rewards/margins": 0.6414912939071655, + "rewards/rejected": -1.8127176761627197, + "step": 8630 + }, + { + "epoch": 1.4886285320468642, + "grad_norm": 22.44603157043457, + "learning_rate": 1.1867510891804353e-07, + "logits/chosen": -1.769152045249939, + "logits/rejected": -1.7334001064300537, + "logps/chosen": -189.02745056152344, + "logps/rejected": -233.2024383544922, + "loss": 0.5991, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3328588008880615, + "rewards/margins": 0.47136467695236206, + "rewards/rejected": -1.804223656654358, + "step": 8640 + }, + { + "epoch": 1.4903514817367332, + "grad_norm": 30.211387634277344, + "learning_rate": 1.1847811405587127e-07, + "logits/chosen": -1.6871166229248047, + "logits/rejected": -1.6457468271255493, + "logps/chosen": -183.58999633789062, + "logps/rejected": -237.5922393798828, + "loss": 0.5813, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2863670587539673, + "rewards/margins": 0.5488260984420776, + "rewards/rejected": -1.8351930379867554, + "step": 8650 + }, + { + "epoch": 1.4920744314266023, + "grad_norm": 26.927658081054688, + "learning_rate": 1.1828104492270254e-07, + "logits/chosen": -1.7137506008148193, + "logits/rejected": -1.6803970336914062, + "logps/chosen": -181.31094360351562, + "logps/rejected": -238.88528442382812, + "loss": 0.5594, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2794899940490723, + "rewards/margins": 0.5921354293823242, + "rewards/rejected": -1.871625542640686, + "step": 8660 + }, + { + "epoch": 1.4937973811164715, + "grad_norm": 21.608163833618164, + "learning_rate": 1.1808390231063783e-07, + "logits/chosen": -1.81405770778656, + "logits/rejected": -1.7714502811431885, + "logps/chosen": -175.96788024902344, + "logps/rejected": -238.8325653076172, + "loss": 0.5383, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.239720106124878, + "rewards/margins": 0.6207379698753357, + "rewards/rejected": -1.8604580163955688, + "step": 8670 + }, + { + "epoch": 1.4955203308063405, + "grad_norm": 30.645854949951172, + "learning_rate": 1.1788668701207274e-07, + "logits/chosen": -1.7290817499160767, + "logits/rejected": -1.7106482982635498, + "logps/chosen": -183.47903442382812, + "logps/rejected": -224.8456268310547, + "loss": 0.6189, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.328007459640503, + "rewards/margins": 0.3926061689853668, + "rewards/rejected": -1.7206134796142578, + "step": 8680 + }, + { + "epoch": 1.4972432804962095, + "grad_norm": 22.075122833251953, + "learning_rate": 1.1768939981969515e-07, + "logits/chosen": -1.7659361362457275, + "logits/rejected": -1.7287206649780273, + "logps/chosen": -184.3316650390625, + "logps/rejected": -222.2124481201172, + "loss": 0.6251, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2879536151885986, + "rewards/margins": 0.4510854184627533, + "rewards/rejected": -1.7390391826629639, + "step": 8690 + }, + { + "epoch": 1.4989662301860784, + "grad_norm": 24.525606155395508, + "learning_rate": 1.1749204152648191e-07, + "logits/chosen": -1.7778053283691406, + "logits/rejected": -1.7342383861541748, + "logps/chosen": -188.58360290527344, + "logps/rejected": -223.57955932617188, + "loss": 0.6054, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3208611011505127, + "rewards/margins": 0.38850995898246765, + "rewards/rejected": -1.7093709707260132, + "step": 8700 + }, + { + "epoch": 1.5006891798759476, + "grad_norm": 32.69189453125, + "learning_rate": 1.1729461292569563e-07, + "logits/chosen": -1.7515008449554443, + "logits/rejected": -1.735420823097229, + "logps/chosen": -177.72052001953125, + "logps/rejected": -207.87039184570312, + "loss": 0.6566, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2488489151000977, + "rewards/margins": 0.298393189907074, + "rewards/rejected": -1.5472421646118164, + "step": 8710 + }, + { + "epoch": 1.5024121295658168, + "grad_norm": 26.830543518066406, + "learning_rate": 1.1709711481088156e-07, + "logits/chosen": -1.8059642314910889, + "logits/rejected": -1.7639391422271729, + "logps/chosen": -162.46495056152344, + "logps/rejected": -211.879638671875, + "loss": 0.5446, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0611735582351685, + "rewards/margins": 0.5130646824836731, + "rewards/rejected": -1.5742381811141968, + "step": 8720 + }, + { + "epoch": 1.5041350792556858, + "grad_norm": 20.756858825683594, + "learning_rate": 1.1689954797586422e-07, + "logits/chosen": -1.789809226989746, + "logits/rejected": -1.7468286752700806, + "logps/chosen": -163.58743286132812, + "logps/rejected": -215.0264892578125, + "loss": 0.5693, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.113161325454712, + "rewards/margins": 0.5186198949813843, + "rewards/rejected": -1.6317813396453857, + "step": 8730 + }, + { + "epoch": 1.5058580289455548, + "grad_norm": 32.3831901550293, + "learning_rate": 1.1670191321474457e-07, + "logits/chosen": -1.7614628076553345, + "logits/rejected": -1.7365093231201172, + "logps/chosen": -167.097412109375, + "logps/rejected": -226.8600616455078, + "loss": 0.5446, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1569335460662842, + "rewards/margins": 0.5816691517829895, + "rewards/rejected": -1.738602638244629, + "step": 8740 + }, + { + "epoch": 1.5075809786354237, + "grad_norm": 24.4361572265625, + "learning_rate": 1.1650421132189634e-07, + "logits/chosen": -1.799372673034668, + "logits/rejected": -1.7554528713226318, + "logps/chosen": -168.343017578125, + "logps/rejected": -229.10745239257812, + "loss": 0.5329, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1464492082595825, + "rewards/margins": 0.6126116514205933, + "rewards/rejected": -1.7590608596801758, + "step": 8750 + }, + { + "epoch": 1.509303928325293, + "grad_norm": 18.993227005004883, + "learning_rate": 1.1630644309196327e-07, + "logits/chosen": -1.725738525390625, + "logits/rejected": -1.7081897258758545, + "logps/chosen": -167.4300079345703, + "logps/rejected": -217.34671020507812, + "loss": 0.5799, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1593310832977295, + "rewards/margins": 0.4624145030975342, + "rewards/rejected": -1.6217454671859741, + "step": 8760 + }, + { + "epoch": 1.5110268780151621, + "grad_norm": 21.166257858276367, + "learning_rate": 1.1610860931985566e-07, + "logits/chosen": -1.7743898630142212, + "logits/rejected": -1.730316400527954, + "logps/chosen": -176.2876434326172, + "logps/rejected": -224.82296752929688, + "loss": 0.569, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2121226787567139, + "rewards/margins": 0.5385057926177979, + "rewards/rejected": -1.7506287097930908, + "step": 8770 + }, + { + "epoch": 1.512749827705031, + "grad_norm": 27.59946632385254, + "learning_rate": 1.1591071080074727e-07, + "logits/chosen": -1.799107551574707, + "logits/rejected": -1.7788976430892944, + "logps/chosen": -169.95645141601562, + "logps/rejected": -227.0201873779297, + "loss": 0.552, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.208627462387085, + "rewards/margins": 0.5270792245864868, + "rewards/rejected": -1.7357066869735718, + "step": 8780 + }, + { + "epoch": 1.5144727773949, + "grad_norm": 26.69658088684082, + "learning_rate": 1.1571274833007214e-07, + "logits/chosen": -1.8081270456314087, + "logits/rejected": -1.7647250890731812, + "logps/chosen": -174.61740112304688, + "logps/rejected": -226.5430145263672, + "loss": 0.5601, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.196936011314392, + "rewards/margins": 0.5481966137886047, + "rewards/rejected": -1.7451328039169312, + "step": 8790 + }, + { + "epoch": 1.516195727084769, + "grad_norm": 27.48969268798828, + "learning_rate": 1.1551472270352125e-07, + "logits/chosen": -1.7258638143539429, + "logits/rejected": -1.6805026531219482, + "logps/chosen": -168.3254852294922, + "logps/rejected": -213.35421752929688, + "loss": 0.5766, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1332638263702393, + "rewards/margins": 0.4753327965736389, + "rewards/rejected": -1.6085964441299438, + "step": 8800 + }, + { + "epoch": 1.516195727084769, + "eval_logits/chosen": -1.9008620977401733, + "eval_logits/rejected": -1.8818625211715698, + "eval_logps/chosen": -162.34829711914062, + "eval_logps/rejected": -191.97488403320312, + "eval_loss": 0.6328625082969666, + "eval_rewards/accuracies": 0.633596658706665, + "eval_rewards/chosen": -1.036363959312439, + "eval_rewards/margins": 0.2515837252140045, + "eval_rewards/rejected": -1.287947654724121, + "eval_runtime": 384.3016, + "eval_samples_per_second": 11.2, + "eval_steps_per_second": 1.4, + "step": 8800 + }, + { + "epoch": 1.5179186767746382, + "grad_norm": 23.977811813354492, + "learning_rate": 1.1531663471703956e-07, + "logits/chosen": -1.8277661800384521, + "logits/rejected": -1.7861217260360718, + "logps/chosen": -168.30447387695312, + "logps/rejected": -230.71499633789062, + "loss": 0.5309, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.17344069480896, + "rewards/margins": 0.6076480150222778, + "rewards/rejected": -1.7810888290405273, + "step": 8810 + }, + { + "epoch": 1.5196416264645074, + "grad_norm": 31.468040466308594, + "learning_rate": 1.1511848516682257e-07, + "logits/chosen": -1.8036655187606812, + "logits/rejected": -1.7669929265975952, + "logps/chosen": -164.54380798339844, + "logps/rejected": -223.2119903564453, + "loss": 0.5289, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.1411962509155273, + "rewards/margins": 0.5930995345115662, + "rewards/rejected": -1.7342956066131592, + "step": 8820 + }, + { + "epoch": 1.5213645761543764, + "grad_norm": 26.091304779052734, + "learning_rate": 1.149202748493133e-07, + "logits/chosen": -1.651025414466858, + "logits/rejected": -1.6094402074813843, + "logps/chosen": -171.48646545410156, + "logps/rejected": -223.22842407226562, + "loss": 0.5773, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1689049005508423, + "rewards/margins": 0.5359553098678589, + "rewards/rejected": -1.7048600912094116, + "step": 8830 + }, + { + "epoch": 1.5230875258442453, + "grad_norm": 23.064006805419922, + "learning_rate": 1.1472200456119901e-07, + "logits/chosen": -1.6805092096328735, + "logits/rejected": -1.6452114582061768, + "logps/chosen": -164.11929321289062, + "logps/rejected": -228.6237335205078, + "loss": 0.5354, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1177093982696533, + "rewards/margins": 0.6305631995201111, + "rewards/rejected": -1.7482725381851196, + "step": 8840 + }, + { + "epoch": 1.5248104755341143, + "grad_norm": 27.771137237548828, + "learning_rate": 1.1452367509940794e-07, + "logits/chosen": -1.841334581375122, + "logits/rejected": -1.8025280237197876, + "logps/chosen": -162.31332397460938, + "logps/rejected": -228.25985717773438, + "loss": 0.5362, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1006293296813965, + "rewards/margins": 0.6390389204025269, + "rewards/rejected": -1.7396682500839233, + "step": 8850 + }, + { + "epoch": 1.5265334252239835, + "grad_norm": 19.63231086730957, + "learning_rate": 1.1432528726110628e-07, + "logits/chosen": -1.7424705028533936, + "logits/rejected": -1.699713110923767, + "logps/chosen": -175.4546661376953, + "logps/rejected": -235.0323028564453, + "loss": 0.5317, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2227356433868408, + "rewards/margins": 0.5989742875099182, + "rewards/rejected": -1.8217099905014038, + "step": 8860 + }, + { + "epoch": 1.5282563749138525, + "grad_norm": 21.968769073486328, + "learning_rate": 1.1412684184369478e-07, + "logits/chosen": -1.8747831583023071, + "logits/rejected": -1.8180469274520874, + "logps/chosen": -182.103271484375, + "logps/rejected": -244.1374053955078, + "loss": 0.5301, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.253333568572998, + "rewards/margins": 0.6377841234207153, + "rewards/rejected": -1.8911176919937134, + "step": 8870 + }, + { + "epoch": 1.5299793246037217, + "grad_norm": 34.719356536865234, + "learning_rate": 1.1392833964480564e-07, + "logits/chosen": -1.6653770208358765, + "logits/rejected": -1.6219761371612549, + "logps/chosen": -183.3114776611328, + "logps/rejected": -243.77294921875, + "loss": 0.5712, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2986027002334595, + "rewards/margins": 0.6236371397972107, + "rewards/rejected": -1.922239899635315, + "step": 8880 + }, + { + "epoch": 1.5317022742935906, + "grad_norm": 24.384660720825195, + "learning_rate": 1.137297814622993e-07, + "logits/chosen": -1.6986202001571655, + "logits/rejected": -1.6509069204330444, + "logps/chosen": -186.1614227294922, + "logps/rejected": -248.8933868408203, + "loss": 0.5196, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.316670536994934, + "rewards/margins": 0.6645750999450684, + "rewards/rejected": -1.9812456369400024, + "step": 8890 + }, + { + "epoch": 1.5334252239834596, + "grad_norm": 22.604028701782227, + "learning_rate": 1.1353116809426121e-07, + "logits/chosen": -1.7376683950424194, + "logits/rejected": -1.6882107257843018, + "logps/chosen": -189.76602172851562, + "logps/rejected": -251.9395294189453, + "loss": 0.5366, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3487989902496338, + "rewards/margins": 0.6406348347663879, + "rewards/rejected": -1.9894338846206665, + "step": 8900 + }, + { + "epoch": 1.5351481736733288, + "grad_norm": 31.242036819458008, + "learning_rate": 1.1333250033899867e-07, + "logits/chosen": -1.6938707828521729, + "logits/rejected": -1.6581497192382812, + "logps/chosen": -200.32672119140625, + "logps/rejected": -262.11102294921875, + "loss": 0.5484, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4465105533599854, + "rewards/margins": 0.6440070867538452, + "rewards/rejected": -2.09051775932312, + "step": 8910 + }, + { + "epoch": 1.5368711233631978, + "grad_norm": 22.39651107788086, + "learning_rate": 1.131337789950375e-07, + "logits/chosen": -1.8044532537460327, + "logits/rejected": -1.744747519493103, + "logps/chosen": -197.68798828125, + "logps/rejected": -268.2427673339844, + "loss": 0.5196, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.406423807144165, + "rewards/margins": 0.7548932433128357, + "rewards/rejected": -2.1613171100616455, + "step": 8920 + }, + { + "epoch": 1.538594073053067, + "grad_norm": 20.711666107177734, + "learning_rate": 1.12935004861119e-07, + "logits/chosen": -1.7355296611785889, + "logits/rejected": -1.6860911846160889, + "logps/chosen": -199.24159240722656, + "logps/rejected": -260.26495361328125, + "loss": 0.5508, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4455238580703735, + "rewards/margins": 0.6514202952384949, + "rewards/rejected": -2.0969443321228027, + "step": 8930 + }, + { + "epoch": 1.540317022742936, + "grad_norm": 29.11483383178711, + "learning_rate": 1.1273617873619663e-07, + "logits/chosen": -1.720091462135315, + "logits/rejected": -1.6819565296173096, + "logps/chosen": -192.368896484375, + "logps/rejected": -250.0859832763672, + "loss": 0.5668, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3673410415649414, + "rewards/margins": 0.575615644454956, + "rewards/rejected": -1.9429569244384766, + "step": 8940 + }, + { + "epoch": 1.5420399724328049, + "grad_norm": 34.54115676879883, + "learning_rate": 1.1253730141943276e-07, + "logits/chosen": -1.5915120840072632, + "logits/rejected": -1.5745168924331665, + "logps/chosen": -190.63565063476562, + "logps/rejected": -244.02066040039062, + "loss": 0.5821, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3653391599655151, + "rewards/margins": 0.5031275749206543, + "rewards/rejected": -1.8684667348861694, + "step": 8950 + }, + { + "epoch": 1.5437629221226739, + "grad_norm": 29.218130111694336, + "learning_rate": 1.1233837371019566e-07, + "logits/chosen": -1.7079054117202759, + "logits/rejected": -1.6546837091445923, + "logps/chosen": -209.74807739257812, + "logps/rejected": -284.8275146484375, + "loss": 0.5267, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5534542798995972, + "rewards/margins": 0.7775411605834961, + "rewards/rejected": -2.330995559692383, + "step": 8960 + }, + { + "epoch": 1.545485871812543, + "grad_norm": 26.836565017700195, + "learning_rate": 1.1213939640805594e-07, + "logits/chosen": -1.6795122623443604, + "logits/rejected": -1.6285619735717773, + "logps/chosen": -200.71646118164062, + "logps/rejected": -264.31719970703125, + "loss": 0.5252, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.474325180053711, + "rewards/margins": 0.6578629612922668, + "rewards/rejected": -2.132188081741333, + "step": 8970 + }, + { + "epoch": 1.5472088215024122, + "grad_norm": 31.637868881225586, + "learning_rate": 1.1194037031278378e-07, + "logits/chosen": -1.7559791803359985, + "logits/rejected": -1.720070481300354, + "logps/chosen": -221.94912719726562, + "logps/rejected": -261.1572570800781, + "loss": 0.6517, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.674683928489685, + "rewards/margins": 0.4098898768424988, + "rewards/rejected": -2.084573984146118, + "step": 8980 + }, + { + "epoch": 1.5489317711922812, + "grad_norm": 27.263280868530273, + "learning_rate": 1.1174129622434531e-07, + "logits/chosen": -1.6646140813827515, + "logits/rejected": -1.6244380474090576, + "logps/chosen": -190.9231414794922, + "logps/rejected": -256.2034606933594, + "loss": 0.5111, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3597346544265747, + "rewards/margins": 0.6620964407920837, + "rewards/rejected": -2.0218310356140137, + "step": 8990 + }, + { + "epoch": 1.5506547208821502, + "grad_norm": 31.816787719726562, + "learning_rate": 1.1154217494289966e-07, + "logits/chosen": -1.7356491088867188, + "logits/rejected": -1.6923658847808838, + "logps/chosen": -204.17465209960938, + "logps/rejected": -258.885986328125, + "loss": 0.6097, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4676018953323364, + "rewards/margins": 0.5954405069351196, + "rewards/rejected": -2.063042163848877, + "step": 9000 + }, + { + "epoch": 1.5523776705720191, + "grad_norm": 23.65191650390625, + "learning_rate": 1.1134300726879557e-07, + "logits/chosen": -1.7225637435913086, + "logits/rejected": -1.6865419149398804, + "logps/chosen": -185.95620727539062, + "logps/rejected": -229.1414031982422, + "loss": 0.599, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3031692504882812, + "rewards/margins": 0.4596996307373047, + "rewards/rejected": -1.762868881225586, + "step": 9010 + }, + { + "epoch": 1.5541006202618883, + "grad_norm": 26.136945724487305, + "learning_rate": 1.1114379400256828e-07, + "logits/chosen": -1.6853822469711304, + "logits/rejected": -1.6423343420028687, + "logps/chosen": -167.6372833251953, + "logps/rejected": -240.2078094482422, + "loss": 0.5002, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1371324062347412, + "rewards/margins": 0.7557451725006104, + "rewards/rejected": -1.8928775787353516, + "step": 9020 + }, + { + "epoch": 1.5558235699517575, + "grad_norm": 28.88298225402832, + "learning_rate": 1.1094453594493634e-07, + "logits/chosen": -1.7078361511230469, + "logits/rejected": -1.695569396018982, + "logps/chosen": -168.51095581054688, + "logps/rejected": -224.92550659179688, + "loss": 0.553, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1735893487930298, + "rewards/margins": 0.5375136137008667, + "rewards/rejected": -1.711102843284607, + "step": 9030 + }, + { + "epoch": 1.5575465196416265, + "grad_norm": 30.364782333374023, + "learning_rate": 1.107452338967982e-07, + "logits/chosen": -1.7026069164276123, + "logits/rejected": -1.673125982284546, + "logps/chosen": -173.69056701660156, + "logps/rejected": -216.798095703125, + "loss": 0.5959, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2252165079116821, + "rewards/margins": 0.42275819182395935, + "rewards/rejected": -1.6479747295379639, + "step": 9040 + }, + { + "epoch": 1.5592694693314955, + "grad_norm": 37.99259948730469, + "learning_rate": 1.1054588865922931e-07, + "logits/chosen": -1.7668405771255493, + "logits/rejected": -1.7353569269180298, + "logps/chosen": -182.21829223632812, + "logps/rejected": -229.96426391601562, + "loss": 0.6042, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2846075296401978, + "rewards/margins": 0.49061161279678345, + "rewards/rejected": -1.7752189636230469, + "step": 9050 + }, + { + "epoch": 1.5609924190213644, + "grad_norm": 26.010162353515625, + "learning_rate": 1.1034650103347856e-07, + "logits/chosen": -1.7860959768295288, + "logits/rejected": -1.7449796199798584, + "logps/chosen": -171.65902709960938, + "logps/rejected": -220.1482391357422, + "loss": 0.5695, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1694601774215698, + "rewards/margins": 0.4999922811985016, + "rewards/rejected": -1.6694523096084595, + "step": 9060 + }, + { + "epoch": 1.5627153687112336, + "grad_norm": 20.758052825927734, + "learning_rate": 1.1014707182096525e-07, + "logits/chosen": -1.7433793544769287, + "logits/rejected": -1.7031667232513428, + "logps/chosen": -162.1527862548828, + "logps/rejected": -229.52175903320312, + "loss": 0.5027, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.0758721828460693, + "rewards/margins": 0.6713493466377258, + "rewards/rejected": -1.7472217082977295, + "step": 9070 + }, + { + "epoch": 1.5644383184011028, + "grad_norm": 18.145647048950195, + "learning_rate": 1.0994760182327593e-07, + "logits/chosen": -1.764984130859375, + "logits/rejected": -1.74074387550354, + "logps/chosen": -156.34017944335938, + "logps/rejected": -213.22970581054688, + "loss": 0.5387, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.0435948371887207, + "rewards/margins": 0.5344707369804382, + "rewards/rejected": -1.5780656337738037, + "step": 9080 + }, + { + "epoch": 1.5661612680909718, + "grad_norm": 20.801746368408203, + "learning_rate": 1.0974809184216094e-07, + "logits/chosen": -1.7175157070159912, + "logits/rejected": -1.6671241521835327, + "logps/chosen": -172.3372344970703, + "logps/rejected": -223.4744415283203, + "loss": 0.53, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1706864833831787, + "rewards/margins": 0.5599038600921631, + "rewards/rejected": -1.7305902242660522, + "step": 9090 + }, + { + "epoch": 1.5678842177808407, + "grad_norm": 23.008760452270508, + "learning_rate": 1.0954854267953146e-07, + "logits/chosen": -1.7476812601089478, + "logits/rejected": -1.7103242874145508, + "logps/chosen": -192.20645141601562, + "logps/rejected": -228.8697509765625, + "loss": 0.6045, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.354092001914978, + "rewards/margins": 0.3995038866996765, + "rewards/rejected": -1.7535959482192993, + "step": 9100 + }, + { + "epoch": 1.5696071674707097, + "grad_norm": 20.0081844329834, + "learning_rate": 1.0934895513745603e-07, + "logits/chosen": -1.7460222244262695, + "logits/rejected": -1.7097656726837158, + "logps/chosen": -184.85659790039062, + "logps/rejected": -237.32064819335938, + "loss": 0.5706, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2960811853408813, + "rewards/margins": 0.5511730909347534, + "rewards/rejected": -1.8472541570663452, + "step": 9110 + }, + { + "epoch": 1.571330117160579, + "grad_norm": 31.030187606811523, + "learning_rate": 1.0914933001815754e-07, + "logits/chosen": -1.7750955820083618, + "logits/rejected": -1.7259628772735596, + "logps/chosen": -182.80413818359375, + "logps/rejected": -236.4357147216797, + "loss": 0.5427, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2252854108810425, + "rewards/margins": 0.5880699157714844, + "rewards/rejected": -1.8133554458618164, + "step": 9120 + }, + { + "epoch": 1.573053066850448, + "grad_norm": 22.988636016845703, + "learning_rate": 1.0894966812400992e-07, + "logits/chosen": -1.7008718252182007, + "logits/rejected": -1.662156343460083, + "logps/chosen": -183.43167114257812, + "logps/rejected": -231.0472412109375, + "loss": 0.5892, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2841278314590454, + "rewards/margins": 0.48533502221107483, + "rewards/rejected": -1.7694628238677979, + "step": 9130 + }, + { + "epoch": 1.574776016540317, + "grad_norm": 19.436880111694336, + "learning_rate": 1.0874997025753482e-07, + "logits/chosen": -1.7695186138153076, + "logits/rejected": -1.700583815574646, + "logps/chosen": -172.9995574951172, + "logps/rejected": -237.1913604736328, + "loss": 0.5026, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.157179832458496, + "rewards/margins": 0.7229998707771301, + "rewards/rejected": -1.880179762840271, + "step": 9140 + }, + { + "epoch": 1.576498966230186, + "grad_norm": 28.90360450744629, + "learning_rate": 1.0855023722139864e-07, + "logits/chosen": -1.7517004013061523, + "logits/rejected": -1.702845573425293, + "logps/chosen": -190.75489807128906, + "logps/rejected": -255.5432586669922, + "loss": 0.5388, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3242889642715454, + "rewards/margins": 0.686375081539154, + "rewards/rejected": -2.0106639862060547, + "step": 9150 + }, + { + "epoch": 1.578221915920055, + "grad_norm": 24.89302635192871, + "learning_rate": 1.0835046981840896e-07, + "logits/chosen": -1.763664960861206, + "logits/rejected": -1.7383177280426025, + "logps/chosen": -169.76939392089844, + "logps/rejected": -232.401611328125, + "loss": 0.5413, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2061946392059326, + "rewards/margins": 0.5869729518890381, + "rewards/rejected": -1.7931678295135498, + "step": 9160 + }, + { + "epoch": 1.5799448656099242, + "grad_norm": 24.991596221923828, + "learning_rate": 1.0815066885151165e-07, + "logits/chosen": -1.7933380603790283, + "logits/rejected": -1.7359075546264648, + "logps/chosen": -181.2700958251953, + "logps/rejected": -244.0440673828125, + "loss": 0.525, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.264285683631897, + "rewards/margins": 0.6814316511154175, + "rewards/rejected": -1.9457175731658936, + "step": 9170 + }, + { + "epoch": 1.5816678152997934, + "grad_norm": 21.60300636291504, + "learning_rate": 1.0795083512378738e-07, + "logits/chosen": -1.7519088983535767, + "logits/rejected": -1.7141910791397095, + "logps/chosen": -182.85592651367188, + "logps/rejected": -233.93008422851562, + "loss": 0.5742, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2812501192092896, + "rewards/margins": 0.5364077091217041, + "rewards/rejected": -1.8176578283309937, + "step": 9180 + }, + { + "epoch": 1.5833907649896624, + "grad_norm": 21.416772842407227, + "learning_rate": 1.077509694384485e-07, + "logits/chosen": -1.8482236862182617, + "logits/rejected": -1.8232104778289795, + "logps/chosen": -186.87045288085938, + "logps/rejected": -251.0176544189453, + "loss": 0.5146, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.327132225036621, + "rewards/margins": 0.6137700080871582, + "rewards/rejected": -1.9409021139144897, + "step": 9190 + }, + { + "epoch": 1.5851137146795313, + "grad_norm": 17.748327255249023, + "learning_rate": 1.0755107259883591e-07, + "logits/chosen": -1.7413246631622314, + "logits/rejected": -1.696459412574768, + "logps/chosen": -184.2054443359375, + "logps/rejected": -251.5696563720703, + "loss": 0.525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3135349750518799, + "rewards/margins": 0.6860525012016296, + "rewards/rejected": -1.9995874166488647, + "step": 9200 + }, + { + "epoch": 1.5851137146795313, + "eval_logits/chosen": -1.8325300216674805, + "eval_logits/rejected": -1.812217116355896, + "eval_logps/chosen": -177.4160919189453, + "eval_logps/rejected": -209.28688049316406, + "eval_loss": 0.6320397853851318, + "eval_rewards/accuracies": 0.636617124080658, + "eval_rewards/chosen": -1.187041997909546, + "eval_rewards/margins": 0.27402570843696594, + "eval_rewards/rejected": -1.461067795753479, + "eval_runtime": 384.2682, + "eval_samples_per_second": 11.201, + "eval_steps_per_second": 1.4, + "step": 9200 + }, + { + "epoch": 1.5868366643694003, + "grad_norm": 30.787199020385742, + "learning_rate": 1.0735114540841565e-07, + "logits/chosen": -1.614463210105896, + "logits/rejected": -1.5715653896331787, + "logps/chosen": -194.19384765625, + "logps/rejected": -247.31533813476562, + "loss": 0.566, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3834686279296875, + "rewards/margins": 0.5717785954475403, + "rewards/rejected": -1.9552472829818726, + "step": 9210 + }, + { + "epoch": 1.5885596140592695, + "grad_norm": 49.48377990722656, + "learning_rate": 1.0715118867077575e-07, + "logits/chosen": -1.7034069299697876, + "logits/rejected": -1.6654075384140015, + "logps/chosen": -182.36663818359375, + "logps/rejected": -239.0763397216797, + "loss": 0.5678, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3235313892364502, + "rewards/margins": 0.5717422962188721, + "rewards/rejected": -1.8952735662460327, + "step": 9220 + }, + { + "epoch": 1.5902825637491387, + "grad_norm": 28.817829132080078, + "learning_rate": 1.0695120318962305e-07, + "logits/chosen": -1.6265385150909424, + "logits/rejected": -1.5868934392929077, + "logps/chosen": -178.04183959960938, + "logps/rejected": -243.2806396484375, + "loss": 0.526, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.287351369857788, + "rewards/margins": 0.6169094443321228, + "rewards/rejected": -1.9042608737945557, + "step": 9230 + }, + { + "epoch": 1.5920055134390076, + "grad_norm": 30.303749084472656, + "learning_rate": 1.0675118976877989e-07, + "logits/chosen": -1.6701663732528687, + "logits/rejected": -1.6437671184539795, + "logps/chosen": -196.56332397460938, + "logps/rejected": -260.31170654296875, + "loss": 0.5495, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.404131293296814, + "rewards/margins": 0.6530481576919556, + "rewards/rejected": -2.0571796894073486, + "step": 9240 + }, + { + "epoch": 1.5937284631288766, + "grad_norm": 18.34560775756836, + "learning_rate": 1.0655114921218086e-07, + "logits/chosen": -1.665238618850708, + "logits/rejected": -1.621919870376587, + "logps/chosen": -181.21426391601562, + "logps/rejected": -240.7842559814453, + "loss": 0.5579, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.264282464981079, + "rewards/margins": 0.5974792242050171, + "rewards/rejected": -1.8617616891860962, + "step": 9250 + }, + { + "epoch": 1.5954514128187456, + "grad_norm": 24.00996208190918, + "learning_rate": 1.0635108232386976e-07, + "logits/chosen": -1.6609874963760376, + "logits/rejected": -1.6281172037124634, + "logps/chosen": -189.6760711669922, + "logps/rejected": -246.7325897216797, + "loss": 0.5674, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.416132926940918, + "rewards/margins": 0.5612865090370178, + "rewards/rejected": -1.9774194955825806, + "step": 9260 + }, + { + "epoch": 1.5971743625086148, + "grad_norm": 20.443849563598633, + "learning_rate": 1.0615098990799607e-07, + "logits/chosen": -1.7815415859222412, + "logits/rejected": -1.7329866886138916, + "logps/chosen": -193.226806640625, + "logps/rejected": -249.8080291748047, + "loss": 0.538, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3833353519439697, + "rewards/margins": 0.5966004133224487, + "rewards/rejected": -1.979935646057129, + "step": 9270 + }, + { + "epoch": 1.598897312198484, + "grad_norm": 22.499149322509766, + "learning_rate": 1.05950872768812e-07, + "logits/chosen": -1.7292639017105103, + "logits/rejected": -1.6948598623275757, + "logps/chosen": -181.84854125976562, + "logps/rejected": -237.987548828125, + "loss": 0.5637, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2827826738357544, + "rewards/margins": 0.5591589212417603, + "rewards/rejected": -1.8419415950775146, + "step": 9280 + }, + { + "epoch": 1.600620261888353, + "grad_norm": 25.416688919067383, + "learning_rate": 1.0575073171066906e-07, + "logits/chosen": -1.6332378387451172, + "logits/rejected": -1.6004530191421509, + "logps/chosen": -177.553955078125, + "logps/rejected": -220.517822265625, + "loss": 0.5932, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2289899587631226, + "rewards/margins": 0.4675261080265045, + "rewards/rejected": -1.6965160369873047, + "step": 9290 + }, + { + "epoch": 1.602343211578222, + "grad_norm": 35.59848403930664, + "learning_rate": 1.0555056753801493e-07, + "logits/chosen": -1.694219946861267, + "logits/rejected": -1.6638238430023193, + "logps/chosen": -190.98104858398438, + "logps/rejected": -259.08929443359375, + "loss": 0.5266, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3729770183563232, + "rewards/margins": 0.6592379808425903, + "rewards/rejected": -2.032215118408203, + "step": 9300 + }, + { + "epoch": 1.6040661612680909, + "grad_norm": 21.09294891357422, + "learning_rate": 1.0535038105539014e-07, + "logits/chosen": -1.713945746421814, + "logits/rejected": -1.6748058795928955, + "logps/chosen": -183.74774169921875, + "logps/rejected": -238.228515625, + "loss": 0.5547, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3000211715698242, + "rewards/margins": 0.5785013437271118, + "rewards/rejected": -1.8785226345062256, + "step": 9310 + }, + { + "epoch": 1.60578911095796, + "grad_norm": 26.233116149902344, + "learning_rate": 1.0515017306742504e-07, + "logits/chosen": -1.7640018463134766, + "logits/rejected": -1.7215381860733032, + "logps/chosen": -193.1175079345703, + "logps/rejected": -259.2463073730469, + "loss": 0.532, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4228532314300537, + "rewards/margins": 0.6254833936691284, + "rewards/rejected": -2.0483365058898926, + "step": 9320 + }, + { + "epoch": 1.607512060647829, + "grad_norm": 31.612598419189453, + "learning_rate": 1.0494994437883619e-07, + "logits/chosen": -1.740483283996582, + "logits/rejected": -1.6970884799957275, + "logps/chosen": -192.74623107910156, + "logps/rejected": -248.35781860351562, + "loss": 0.5737, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3616818189620972, + "rewards/margins": 0.5712953209877014, + "rewards/rejected": -1.932977318763733, + "step": 9330 + }, + { + "epoch": 1.6092350103376982, + "grad_norm": 17.047866821289062, + "learning_rate": 1.0474969579442356e-07, + "logits/chosen": -1.65829598903656, + "logits/rejected": -1.6235958337783813, + "logps/chosen": -190.55990600585938, + "logps/rejected": -253.1373748779297, + "loss": 0.5435, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3875072002410889, + "rewards/margins": 0.627979040145874, + "rewards/rejected": -2.015486240386963, + "step": 9340 + }, + { + "epoch": 1.6109579600275672, + "grad_norm": 42.927005767822266, + "learning_rate": 1.0454942811906703e-07, + "logits/chosen": -1.735147476196289, + "logits/rejected": -1.6905243396759033, + "logps/chosen": -175.52894592285156, + "logps/rejected": -230.59426879882812, + "loss": 0.547, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.229788899421692, + "rewards/margins": 0.5569666028022766, + "rewards/rejected": -1.7867555618286133, + "step": 9350 + }, + { + "epoch": 1.6126809097174362, + "grad_norm": 24.976329803466797, + "learning_rate": 1.0434914215772318e-07, + "logits/chosen": -1.7689049243927002, + "logits/rejected": -1.7112462520599365, + "logps/chosen": -185.17877197265625, + "logps/rejected": -254.5945281982422, + "loss": 0.5023, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.2849639654159546, + "rewards/margins": 0.7352157831192017, + "rewards/rejected": -2.0201797485351562, + "step": 9360 + }, + { + "epoch": 1.6144038594073054, + "grad_norm": 27.026674270629883, + "learning_rate": 1.0414883871542208e-07, + "logits/chosen": -1.7741403579711914, + "logits/rejected": -1.712407112121582, + "logps/chosen": -188.32443237304688, + "logps/rejected": -253.3127899169922, + "loss": 0.5304, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3448514938354492, + "rewards/margins": 0.6742678880691528, + "rewards/rejected": -2.0191197395324707, + "step": 9370 + }, + { + "epoch": 1.6161268090971743, + "grad_norm": 27.5206356048584, + "learning_rate": 1.0394851859726408e-07, + "logits/chosen": -1.7602388858795166, + "logits/rejected": -1.7300840616226196, + "logps/chosen": -187.4597625732422, + "logps/rejected": -241.29885864257812, + "loss": 0.5922, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.330510139465332, + "rewards/margins": 0.5449169278144836, + "rewards/rejected": -1.8754268884658813, + "step": 9380 + }, + { + "epoch": 1.6178497587870435, + "grad_norm": 32.495243072509766, + "learning_rate": 1.0374818260841663e-07, + "logits/chosen": -1.6052782535552979, + "logits/rejected": -1.5658015012741089, + "logps/chosen": -188.24819946289062, + "logps/rejected": -255.52261352539062, + "loss": 0.5263, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3514819145202637, + "rewards/margins": 0.6556193232536316, + "rewards/rejected": -2.007101535797119, + "step": 9390 + }, + { + "epoch": 1.6195727084769125, + "grad_norm": 22.966014862060547, + "learning_rate": 1.035478315541108e-07, + "logits/chosen": -1.6737964153289795, + "logits/rejected": -1.6404335498809814, + "logps/chosen": -181.5256805419922, + "logps/rejected": -230.23092651367188, + "loss": 0.5898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2745777368545532, + "rewards/margins": 0.4850188195705414, + "rewards/rejected": -1.759596586227417, + "step": 9400 + }, + { + "epoch": 1.6212956581667815, + "grad_norm": 27.858112335205078, + "learning_rate": 1.0334746623963843e-07, + "logits/chosen": -1.6823469400405884, + "logits/rejected": -1.6463171243667603, + "logps/chosen": -185.46388244628906, + "logps/rejected": -244.6988525390625, + "loss": 0.5469, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.315901517868042, + "rewards/margins": 0.6097933650016785, + "rewards/rejected": -1.9256948232650757, + "step": 9410 + }, + { + "epoch": 1.6230186078566504, + "grad_norm": 34.802608489990234, + "learning_rate": 1.031470874703485e-07, + "logits/chosen": -1.714709997177124, + "logits/rejected": -1.6808593273162842, + "logps/chosen": -196.0220184326172, + "logps/rejected": -241.30258178710938, + "loss": 0.5762, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4098308086395264, + "rewards/margins": 0.48128652572631836, + "rewards/rejected": -1.8911174535751343, + "step": 9420 + }, + { + "epoch": 1.6247415575465196, + "grad_norm": 23.209945678710938, + "learning_rate": 1.0294669605164417e-07, + "logits/chosen": -1.6820461750030518, + "logits/rejected": -1.6425174474716187, + "logps/chosen": -186.3541259765625, + "logps/rejected": -246.5086212158203, + "loss": 0.5807, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3392202854156494, + "rewards/margins": 0.6365295648574829, + "rewards/rejected": -1.9757496118545532, + "step": 9430 + }, + { + "epoch": 1.6264645072363888, + "grad_norm": 29.137161254882812, + "learning_rate": 1.0274629278897941e-07, + "logits/chosen": -1.6789467334747314, + "logits/rejected": -1.6548280715942383, + "logps/chosen": -176.45706176757812, + "logps/rejected": -227.3600616455078, + "loss": 0.5697, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2682231664657593, + "rewards/margins": 0.4807654917240143, + "rewards/rejected": -1.7489887475967407, + "step": 9440 + }, + { + "epoch": 1.6281874569262578, + "grad_norm": 22.850292205810547, + "learning_rate": 1.0254587848785574e-07, + "logits/chosen": -1.8444761037826538, + "logits/rejected": -1.8191983699798584, + "logps/chosen": -186.48324584960938, + "logps/rejected": -228.24008178710938, + "loss": 0.5955, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3309428691864014, + "rewards/margins": 0.44507789611816406, + "rewards/rejected": -1.7760207653045654, + "step": 9450 + }, + { + "epoch": 1.6299104066161267, + "grad_norm": 20.438674926757812, + "learning_rate": 1.0234545395381922e-07, + "logits/chosen": -1.7006847858428955, + "logits/rejected": -1.6650491952896118, + "logps/chosen": -164.76817321777344, + "logps/rejected": -252.7565460205078, + "loss": 0.4427, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.1251728534698486, + "rewards/margins": 0.8621363639831543, + "rewards/rejected": -1.987309217453003, + "step": 9460 + }, + { + "epoch": 1.6316333563059957, + "grad_norm": 33.695003509521484, + "learning_rate": 1.021450199924568e-07, + "logits/chosen": -1.5540118217468262, + "logits/rejected": -1.5089060068130493, + "logps/chosen": -181.3240203857422, + "logps/rejected": -224.87307739257812, + "loss": 0.5968, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.286789059638977, + "rewards/margins": 0.4572557806968689, + "rewards/rejected": -1.7440446615219116, + "step": 9470 + }, + { + "epoch": 1.633356305995865, + "grad_norm": 26.688095092773438, + "learning_rate": 1.0194457740939353e-07, + "logits/chosen": -1.6772072315216064, + "logits/rejected": -1.6208477020263672, + "logps/chosen": -186.79238891601562, + "logps/rejected": -247.66085815429688, + "loss": 0.5373, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3279988765716553, + "rewards/margins": 0.622045636177063, + "rewards/rejected": -1.9500446319580078, + "step": 9480 + }, + { + "epoch": 1.635079255685734, + "grad_norm": 25.796838760375977, + "learning_rate": 1.0174412701028899e-07, + "logits/chosen": -1.6382777690887451, + "logits/rejected": -1.6029354333877563, + "logps/chosen": -189.98422241210938, + "logps/rejected": -245.23324584960938, + "loss": 0.5523, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.358717679977417, + "rewards/margins": 0.5708493590354919, + "rewards/rejected": -1.9295669794082642, + "step": 9490 + }, + { + "epoch": 1.636802205375603, + "grad_norm": 21.41780662536621, + "learning_rate": 1.0154366960083422e-07, + "logits/chosen": -1.6924152374267578, + "logits/rejected": -1.6522552967071533, + "logps/chosen": -184.74864196777344, + "logps/rejected": -259.6523132324219, + "loss": 0.4885, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.3110222816467285, + "rewards/margins": 0.7347725629806519, + "rewards/rejected": -2.04579496383667, + "step": 9500 + }, + { + "epoch": 1.638525155065472, + "grad_norm": 22.247453689575195, + "learning_rate": 1.0134320598674846e-07, + "logits/chosen": -1.5799000263214111, + "logits/rejected": -1.5400402545928955, + "logps/chosen": -191.2025909423828, + "logps/rejected": -265.931884765625, + "loss": 0.5267, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4031956195831299, + "rewards/margins": 0.7167037129402161, + "rewards/rejected": -2.1198995113372803, + "step": 9510 + }, + { + "epoch": 1.640248104755341, + "grad_norm": 19.012386322021484, + "learning_rate": 1.0114273697377583e-07, + "logits/chosen": -1.7850033044815063, + "logits/rejected": -1.7514865398406982, + "logps/chosen": -206.412109375, + "logps/rejected": -278.20819091796875, + "loss": 0.5608, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.540743112564087, + "rewards/margins": 0.6628342270851135, + "rewards/rejected": -2.2035775184631348, + "step": 9520 + }, + { + "epoch": 1.6419710544452102, + "grad_norm": 36.910980224609375, + "learning_rate": 1.0094226336768224e-07, + "logits/chosen": -1.6955476999282837, + "logits/rejected": -1.6423709392547607, + "logps/chosen": -193.9436492919922, + "logps/rejected": -268.7176818847656, + "loss": 0.5012, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.410476565361023, + "rewards/margins": 0.7678488492965698, + "rewards/rejected": -2.178325653076172, + "step": 9530 + }, + { + "epoch": 1.6436940041350794, + "grad_norm": 26.08768653869629, + "learning_rate": 1.0074178597425194e-07, + "logits/chosen": -1.605597734451294, + "logits/rejected": -1.5660779476165771, + "logps/chosen": -198.6016082763672, + "logps/rejected": -266.4854736328125, + "loss": 0.514, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4448301792144775, + "rewards/margins": 0.6859620213508606, + "rewards/rejected": -2.1307921409606934, + "step": 9540 + }, + { + "epoch": 1.6454169538249483, + "grad_norm": 22.80348777770996, + "learning_rate": 1.0054130559928451e-07, + "logits/chosen": -1.6636934280395508, + "logits/rejected": -1.6406666040420532, + "logps/chosen": -190.80972290039062, + "logps/rejected": -261.4796447753906, + "loss": 0.5443, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3758639097213745, + "rewards/margins": 0.6709795594215393, + "rewards/rejected": -2.0468432903289795, + "step": 9550 + }, + { + "epoch": 1.6471399035148173, + "grad_norm": 29.717512130737305, + "learning_rate": 1.0034082304859144e-07, + "logits/chosen": -1.7326301336288452, + "logits/rejected": -1.7004493474960327, + "logps/chosen": -198.12722778320312, + "logps/rejected": -260.0203552246094, + "loss": 0.5488, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4450441598892212, + "rewards/margins": 0.6245887279510498, + "rewards/rejected": -2.0696330070495605, + "step": 9560 + }, + { + "epoch": 1.6488628532046863, + "grad_norm": 27.482389450073242, + "learning_rate": 1.00140339127993e-07, + "logits/chosen": -1.620723009109497, + "logits/rejected": -1.591686725616455, + "logps/chosen": -193.84017944335938, + "logps/rejected": -257.40740966796875, + "loss": 0.5577, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.435881495475769, + "rewards/margins": 0.6315909624099731, + "rewards/rejected": -2.067472457885742, + "step": 9570 + }, + { + "epoch": 1.6505858028945555, + "grad_norm": 34.06911087036133, + "learning_rate": 9.9939854643315e-08, + "logits/chosen": -1.7072927951812744, + "logits/rejected": -1.6677078008651733, + "logps/chosen": -198.4580841064453, + "logps/rejected": -262.29534912109375, + "loss": 0.5489, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4397019147872925, + "rewards/margins": 0.6138035655021667, + "rewards/rejected": -2.0535054206848145, + "step": 9580 + }, + { + "epoch": 1.6523087525844247, + "grad_norm": 24.818511962890625, + "learning_rate": 9.973937040038544e-08, + "logits/chosen": -1.796342134475708, + "logits/rejected": -1.7624422311782837, + "logps/chosen": -194.32196044921875, + "logps/rejected": -249.00344848632812, + "loss": 0.5681, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3822523355484009, + "rewards/margins": 0.5446712374687195, + "rewards/rejected": -1.9269235134124756, + "step": 9590 + }, + { + "epoch": 1.6540317022742936, + "grad_norm": 29.23993682861328, + "learning_rate": 9.953888720503145e-08, + "logits/chosen": -1.6400010585784912, + "logits/rejected": -1.5827388763427734, + "logps/chosen": -192.125732421875, + "logps/rejected": -262.93731689453125, + "loss": 0.5174, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3706529140472412, + "rewards/margins": 0.7350988984107971, + "rewards/rejected": -2.1057517528533936, + "step": 9600 + }, + { + "epoch": 1.6540317022742936, + "eval_logits/chosen": -1.7810282707214355, + "eval_logits/rejected": -1.7596746683120728, + "eval_logps/chosen": -185.33482360839844, + "eval_logps/rejected": -219.24375915527344, + "eval_loss": 0.631009042263031, + "eval_rewards/accuracies": 0.6375464797019958, + "eval_rewards/chosen": -1.266229271888733, + "eval_rewards/margins": 0.2944071292877197, + "eval_rewards/rejected": -1.5606361627578735, + "eval_runtime": 384.4232, + "eval_samples_per_second": 11.196, + "eval_steps_per_second": 1.399, + "step": 9600 + }, + { + "epoch": 1.6557546519641626, + "grad_norm": 25.812196731567383, + "learning_rate": 9.933840586307579e-08, + "logits/chosen": -1.646094560623169, + "logits/rejected": -1.6073650121688843, + "logps/chosen": -182.9886474609375, + "logps/rejected": -252.9163360595703, + "loss": 0.5064, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.266803503036499, + "rewards/margins": 0.7127724885940552, + "rewards/rejected": -1.9795758724212646, + "step": 9610 + }, + { + "epoch": 1.6574776016540316, + "grad_norm": 33.57108688354492, + "learning_rate": 9.913792718033396e-08, + "logits/chosen": -1.7659132480621338, + "logits/rejected": -1.7371089458465576, + "logps/chosen": -196.85321044921875, + "logps/rejected": -243.35916137695312, + "loss": 0.607, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4226679801940918, + "rewards/margins": 0.4569169580936432, + "rewards/rejected": -1.8795849084854126, + "step": 9620 + }, + { + "epoch": 1.6592005513439008, + "grad_norm": 32.600406646728516, + "learning_rate": 9.893745196261062e-08, + "logits/chosen": -1.6798865795135498, + "logits/rejected": -1.6302530765533447, + "logps/chosen": -204.86288452148438, + "logps/rejected": -260.3321838378906, + "loss": 0.5815, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4940460920333862, + "rewards/margins": 0.5789082050323486, + "rewards/rejected": -2.0729544162750244, + "step": 9630 + }, + { + "epoch": 1.66092350103377, + "grad_norm": 27.517127990722656, + "learning_rate": 9.873698101569657e-08, + "logits/chosen": -1.7465555667877197, + "logits/rejected": -1.706447958946228, + "logps/chosen": -190.49761962890625, + "logps/rejected": -239.1614227294922, + "loss": 0.5647, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.345007300376892, + "rewards/margins": 0.5413269400596619, + "rewards/rejected": -1.8863341808319092, + "step": 9640 + }, + { + "epoch": 1.662646450723639, + "grad_norm": 27.598669052124023, + "learning_rate": 9.853651514536552e-08, + "logits/chosen": -1.6575853824615479, + "logits/rejected": -1.6214959621429443, + "logps/chosen": -188.8939666748047, + "logps/rejected": -233.7520294189453, + "loss": 0.5979, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.334161400794983, + "rewards/margins": 0.46343547105789185, + "rewards/rejected": -1.7975966930389404, + "step": 9650 + }, + { + "epoch": 1.664369400413508, + "grad_norm": 25.109174728393555, + "learning_rate": 9.833605515737058e-08, + "logits/chosen": -1.669508934020996, + "logits/rejected": -1.642592430114746, + "logps/chosen": -172.75863647460938, + "logps/rejected": -231.64077758789062, + "loss": 0.5652, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2217415571212769, + "rewards/margins": 0.558169960975647, + "rewards/rejected": -1.7799113988876343, + "step": 9660 + }, + { + "epoch": 1.6660923501033769, + "grad_norm": 28.79549217224121, + "learning_rate": 9.813560185744138e-08, + "logits/chosen": -1.755334496498108, + "logits/rejected": -1.706169843673706, + "logps/chosen": -177.9940643310547, + "logps/rejected": -247.0950469970703, + "loss": 0.5132, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.218774676322937, + "rewards/margins": 0.7187453508377075, + "rewards/rejected": -1.9375197887420654, + "step": 9670 + }, + { + "epoch": 1.667815299793246, + "grad_norm": 29.618350982666016, + "learning_rate": 9.79351560512806e-08, + "logits/chosen": -1.6942167282104492, + "logits/rejected": -1.6706167459487915, + "logps/chosen": -180.63156127929688, + "logps/rejected": -221.5099334716797, + "loss": 0.6191, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2616262435913086, + "rewards/margins": 0.4315156936645508, + "rewards/rejected": -1.6931419372558594, + "step": 9680 + }, + { + "epoch": 1.6695382494831152, + "grad_norm": 26.86222267150879, + "learning_rate": 9.773471854456087e-08, + "logits/chosen": -1.6257823705673218, + "logits/rejected": -1.593782901763916, + "logps/chosen": -181.58224487304688, + "logps/rejected": -229.3589324951172, + "loss": 0.5738, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2575411796569824, + "rewards/margins": 0.48993149399757385, + "rewards/rejected": -1.7474727630615234, + "step": 9690 + }, + { + "epoch": 1.6712611991729842, + "grad_norm": 26.250883102416992, + "learning_rate": 9.753429014292132e-08, + "logits/chosen": -1.676490068435669, + "logits/rejected": -1.633374571800232, + "logps/chosen": -175.6143798828125, + "logps/rejected": -227.7067108154297, + "loss": 0.5811, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2290782928466797, + "rewards/margins": 0.5252577066421509, + "rewards/rejected": -1.7543357610702515, + "step": 9700 + }, + { + "epoch": 1.6729841488628532, + "grad_norm": 25.921573638916016, + "learning_rate": 9.73338716519646e-08, + "logits/chosen": -1.5699330568313599, + "logits/rejected": -1.5422900915145874, + "logps/chosen": -176.53219604492188, + "logps/rejected": -222.19589233398438, + "loss": 0.5927, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2290360927581787, + "rewards/margins": 0.47371214628219604, + "rewards/rejected": -1.7027482986450195, + "step": 9710 + }, + { + "epoch": 1.6747070985527222, + "grad_norm": 23.1956787109375, + "learning_rate": 9.713346387725355e-08, + "logits/chosen": -1.7528129816055298, + "logits/rejected": -1.7244980335235596, + "logps/chosen": -172.8015594482422, + "logps/rejected": -218.1025390625, + "loss": 0.5783, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1944303512573242, + "rewards/margins": 0.4731059968471527, + "rewards/rejected": -1.6675363779067993, + "step": 9720 + }, + { + "epoch": 1.6764300482425913, + "grad_norm": 22.841461181640625, + "learning_rate": 9.693306762430782e-08, + "logits/chosen": -1.7413135766983032, + "logits/rejected": -1.7077674865722656, + "logps/chosen": -162.9296875, + "logps/rejected": -226.3254852294922, + "loss": 0.5319, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.091930627822876, + "rewards/margins": 0.6269403696060181, + "rewards/rejected": -1.7188708782196045, + "step": 9730 + }, + { + "epoch": 1.6781529979324605, + "grad_norm": 22.896677017211914, + "learning_rate": 9.673268369860086e-08, + "logits/chosen": -1.7438064813613892, + "logits/rejected": -1.7046003341674805, + "logps/chosen": -173.13571166992188, + "logps/rejected": -228.1545867919922, + "loss": 0.5517, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1866159439086914, + "rewards/margins": 0.5622804164886475, + "rewards/rejected": -1.7488963603973389, + "step": 9740 + }, + { + "epoch": 1.6798759476223295, + "grad_norm": 17.43238067626953, + "learning_rate": 9.653231290555647e-08, + "logits/chosen": -1.7511183023452759, + "logits/rejected": -1.688676118850708, + "logps/chosen": -173.98178100585938, + "logps/rejected": -228.1248779296875, + "loss": 0.5351, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.188538908958435, + "rewards/margins": 0.5991718769073486, + "rewards/rejected": -1.7877107858657837, + "step": 9750 + }, + { + "epoch": 1.6815988973121985, + "grad_norm": 18.74418067932129, + "learning_rate": 9.633195605054573e-08, + "logits/chosen": -1.7443416118621826, + "logits/rejected": -1.698015570640564, + "logps/chosen": -180.7736358642578, + "logps/rejected": -236.44351196289062, + "loss": 0.5572, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2759031057357788, + "rewards/margins": 0.5757189989089966, + "rewards/rejected": -1.8516219854354858, + "step": 9760 + }, + { + "epoch": 1.6833218470020674, + "grad_norm": 21.315763473510742, + "learning_rate": 9.613161393888372e-08, + "logits/chosen": -1.6301326751708984, + "logits/rejected": -1.5908677577972412, + "logps/chosen": -178.78009033203125, + "logps/rejected": -235.99789428710938, + "loss": 0.552, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2467206716537476, + "rewards/margins": 0.5737360715866089, + "rewards/rejected": -1.8204567432403564, + "step": 9770 + }, + { + "epoch": 1.6850447966919366, + "grad_norm": 20.305156707763672, + "learning_rate": 9.593128737582623e-08, + "logits/chosen": -1.7256050109863281, + "logits/rejected": -1.6650545597076416, + "logps/chosen": -186.75709533691406, + "logps/rejected": -243.6442413330078, + "loss": 0.5472, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2587451934814453, + "rewards/margins": 0.6616265177726746, + "rewards/rejected": -1.920371651649475, + "step": 9780 + }, + { + "epoch": 1.6867677463818056, + "grad_norm": 25.12137794494629, + "learning_rate": 9.57309771665665e-08, + "logits/chosen": -1.6799179315567017, + "logits/rejected": -1.6594957113265991, + "logps/chosen": -192.07211303710938, + "logps/rejected": -258.8779296875, + "loss": 0.566, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3866052627563477, + "rewards/margins": 0.6263383030891418, + "rewards/rejected": -2.012943744659424, + "step": 9790 + }, + { + "epoch": 1.6884906960716748, + "grad_norm": 32.45927810668945, + "learning_rate": 9.553068411623211e-08, + "logits/chosen": -1.7195956707000732, + "logits/rejected": -1.6630808115005493, + "logps/chosen": -191.19000244140625, + "logps/rejected": -257.57318115234375, + "loss": 0.5365, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3438773155212402, + "rewards/margins": 0.6948517560958862, + "rewards/rejected": -2.038729190826416, + "step": 9800 + }, + { + "epoch": 1.6902136457615438, + "grad_norm": 24.958833694458008, + "learning_rate": 9.533040902988164e-08, + "logits/chosen": -1.6515401601791382, + "logits/rejected": -1.6001765727996826, + "logps/chosen": -191.26797485351562, + "logps/rejected": -253.69271850585938, + "loss": 0.57, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3584883213043213, + "rewards/margins": 0.6395347714424133, + "rewards/rejected": -1.9980227947235107, + "step": 9810 + }, + { + "epoch": 1.6919365954514127, + "grad_norm": 24.197202682495117, + "learning_rate": 9.51301527125015e-08, + "logits/chosen": -1.6761776208877563, + "logits/rejected": -1.6309750080108643, + "logps/chosen": -185.36248779296875, + "logps/rejected": -252.4319305419922, + "loss": 0.5254, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3124070167541504, + "rewards/margins": 0.6722649335861206, + "rewards/rejected": -1.98467218875885, + "step": 9820 + }, + { + "epoch": 1.693659545141282, + "grad_norm": 28.450220108032227, + "learning_rate": 9.492991596900265e-08, + "logits/chosen": -1.7091443538665771, + "logits/rejected": -1.6807483434677124, + "logps/chosen": -200.43643188476562, + "logps/rejected": -254.0392303466797, + "loss": 0.6132, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4244849681854248, + "rewards/margins": 0.5424485206604004, + "rewards/rejected": -1.9669336080551147, + "step": 9830 + }, + { + "epoch": 1.6953824948311509, + "grad_norm": 21.36317253112793, + "learning_rate": 9.47296996042173e-08, + "logits/chosen": -1.6725046634674072, + "logits/rejected": -1.6177743673324585, + "logps/chosen": -187.1649627685547, + "logps/rejected": -257.66131591796875, + "loss": 0.5377, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3128710985183716, + "rewards/margins": 0.7233718633651733, + "rewards/rejected": -2.036242961883545, + "step": 9840 + }, + { + "epoch": 1.69710544452102, + "grad_norm": 24.398290634155273, + "learning_rate": 9.452950442289582e-08, + "logits/chosen": -1.6904996633529663, + "logits/rejected": -1.6662309169769287, + "logps/chosen": -180.9543914794922, + "logps/rejected": -229.668212890625, + "loss": 0.566, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2635273933410645, + "rewards/margins": 0.5165113210678101, + "rewards/rejected": -1.780038833618164, + "step": 9850 + }, + { + "epoch": 1.698828394210889, + "grad_norm": 25.69253921508789, + "learning_rate": 9.432933122970347e-08, + "logits/chosen": -1.7612245082855225, + "logits/rejected": -1.734895944595337, + "logps/chosen": -197.70860290527344, + "logps/rejected": -244.4144744873047, + "loss": 0.6001, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4138227701187134, + "rewards/margins": 0.4914781451225281, + "rewards/rejected": -1.9053010940551758, + "step": 9860 + }, + { + "epoch": 1.700551343900758, + "grad_norm": 21.9511661529541, + "learning_rate": 9.412918082921706e-08, + "logits/chosen": -1.720068335533142, + "logits/rejected": -1.6764943599700928, + "logps/chosen": -183.56893920898438, + "logps/rejected": -238.9677734375, + "loss": 0.5642, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3083776235580444, + "rewards/margins": 0.572445809841156, + "rewards/rejected": -1.8808234930038452, + "step": 9870 + }, + { + "epoch": 1.7022742935906272, + "grad_norm": 19.187746047973633, + "learning_rate": 9.39290540259218e-08, + "logits/chosen": -1.6332343816757202, + "logits/rejected": -1.6056232452392578, + "logps/chosen": -183.8154754638672, + "logps/rejected": -251.66244506835938, + "loss": 0.528, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3174991607666016, + "rewards/margins": 0.6446079015731812, + "rewards/rejected": -1.9621070623397827, + "step": 9880 + }, + { + "epoch": 1.7039972432804962, + "grad_norm": 23.748544692993164, + "learning_rate": 9.372895162420808e-08, + "logits/chosen": -1.6163705587387085, + "logits/rejected": -1.5838301181793213, + "logps/chosen": -187.1710205078125, + "logps/rejected": -254.9395751953125, + "loss": 0.5086, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.307533621788025, + "rewards/margins": 0.7030706405639648, + "rewards/rejected": -2.0106043815612793, + "step": 9890 + }, + { + "epoch": 1.7057201929703654, + "grad_norm": 28.16010093688965, + "learning_rate": 9.352887442836816e-08, + "logits/chosen": -1.6951067447662354, + "logits/rejected": -1.6399049758911133, + "logps/chosen": -177.525634765625, + "logps/rejected": -260.8652038574219, + "loss": 0.4753, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2581908702850342, + "rewards/margins": 0.8525069952011108, + "rewards/rejected": -2.1106979846954346, + "step": 9900 + }, + { + "epoch": 1.7074431426602343, + "grad_norm": 19.513193130493164, + "learning_rate": 9.332882324259306e-08, + "logits/chosen": -1.7475383281707764, + "logits/rejected": -1.6854770183563232, + "logps/chosen": -193.81378173828125, + "logps/rejected": -252.58413696289062, + "loss": 0.5561, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3624550104141235, + "rewards/margins": 0.6568828821182251, + "rewards/rejected": -2.0193378925323486, + "step": 9910 + }, + { + "epoch": 1.7091660923501033, + "grad_norm": 25.817169189453125, + "learning_rate": 9.312879887096923e-08, + "logits/chosen": -1.7400646209716797, + "logits/rejected": -1.6931641101837158, + "logps/chosen": -187.50587463378906, + "logps/rejected": -254.0473175048828, + "loss": 0.5126, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3481698036193848, + "rewards/margins": 0.6990029811859131, + "rewards/rejected": -2.047173023223877, + "step": 9920 + }, + { + "epoch": 1.7108890420399723, + "grad_norm": 30.18265151977539, + "learning_rate": 9.292880211747528e-08, + "logits/chosen": -1.6739486455917358, + "logits/rejected": -1.644385576248169, + "logps/chosen": -180.66278076171875, + "logps/rejected": -250.3004913330078, + "loss": 0.5329, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.287561297416687, + "rewards/margins": 0.6668421030044556, + "rewards/rejected": -1.954403281211853, + "step": 9930 + }, + { + "epoch": 1.7126119917298415, + "grad_norm": 20.01820182800293, + "learning_rate": 9.27288337859789e-08, + "logits/chosen": -1.7534430027008057, + "logits/rejected": -1.7249292135238647, + "logps/chosen": -198.48370361328125, + "logps/rejected": -257.41363525390625, + "loss": 0.573, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4342199563980103, + "rewards/margins": 0.5325092077255249, + "rewards/rejected": -1.966728925704956, + "step": 9940 + }, + { + "epoch": 1.7143349414197107, + "grad_norm": 43.556434631347656, + "learning_rate": 9.252889468023348e-08, + "logits/chosen": -1.7074737548828125, + "logits/rejected": -1.6514174938201904, + "logps/chosen": -194.29640197753906, + "logps/rejected": -261.7345886230469, + "loss": 0.5279, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4033207893371582, + "rewards/margins": 0.7007296085357666, + "rewards/rejected": -2.104050397872925, + "step": 9950 + }, + { + "epoch": 1.7160578911095796, + "grad_norm": 39.64878845214844, + "learning_rate": 9.232898560387503e-08, + "logits/chosen": -1.7442843914031982, + "logits/rejected": -1.7110795974731445, + "logps/chosen": -207.5657196044922, + "logps/rejected": -256.70965576171875, + "loss": 0.6004, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5412859916687012, + "rewards/margins": 0.4797201156616211, + "rewards/rejected": -2.0210063457489014, + "step": 9960 + }, + { + "epoch": 1.7177808407994486, + "grad_norm": 26.249374389648438, + "learning_rate": 9.212910736041868e-08, + "logits/chosen": -1.7053654193878174, + "logits/rejected": -1.6709177494049072, + "logps/chosen": -192.18594360351562, + "logps/rejected": -255.30746459960938, + "loss": 0.5629, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4082581996917725, + "rewards/margins": 0.6271167993545532, + "rewards/rejected": -2.0353751182556152, + "step": 9970 + }, + { + "epoch": 1.7195037904893176, + "grad_norm": 27.637008666992188, + "learning_rate": 9.19292607532558e-08, + "logits/chosen": -1.6315752267837524, + "logits/rejected": -1.595414400100708, + "logps/chosen": -198.72315979003906, + "logps/rejected": -270.51312255859375, + "loss": 0.5359, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4592723846435547, + "rewards/margins": 0.7141053080558777, + "rewards/rejected": -2.1733779907226562, + "step": 9980 + }, + { + "epoch": 1.7212267401791868, + "grad_norm": 40.08787155151367, + "learning_rate": 9.172944658565057e-08, + "logits/chosen": -1.7109102010726929, + "logits/rejected": -1.6611464023590088, + "logps/chosen": -204.422119140625, + "logps/rejected": -250.1943817138672, + "loss": 0.5955, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5068941116333008, + "rewards/margins": 0.49588704109191895, + "rewards/rejected": -2.002781391143799, + "step": 9990 + }, + { + "epoch": 1.722949689869056, + "grad_norm": 21.382776260375977, + "learning_rate": 9.15296656607367e-08, + "logits/chosen": -1.7126632928848267, + "logits/rejected": -1.6783558130264282, + "logps/chosen": -196.45445251464844, + "logps/rejected": -258.8180847167969, + "loss": 0.5312, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4070589542388916, + "rewards/margins": 0.6114452481269836, + "rewards/rejected": -2.0185041427612305, + "step": 10000 + }, + { + "epoch": 1.722949689869056, + "eval_logits/chosen": -1.7847836017608643, + "eval_logits/rejected": -1.762927532196045, + "eval_logps/chosen": -188.505615234375, + "eval_logps/rejected": -223.30810546875, + "eval_loss": 0.6312531232833862, + "eval_rewards/accuracies": 0.6359200477600098, + "eval_rewards/chosen": -1.2979371547698975, + "eval_rewards/margins": 0.3033425211906433, + "eval_rewards/rejected": -1.601279616355896, + "eval_runtime": 384.1037, + "eval_samples_per_second": 11.205, + "eval_steps_per_second": 1.401, + "step": 10000 + }, + { + "epoch": 1.724672639558925, + "grad_norm": 17.818395614624023, + "learning_rate": 9.132991878151444e-08, + "logits/chosen": -1.7125742435455322, + "logits/rejected": -1.6681439876556396, + "logps/chosen": -187.5587158203125, + "logps/rejected": -259.1612854003906, + "loss": 0.5015, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3517935276031494, + "rewards/margins": 0.7120456099510193, + "rewards/rejected": -2.0638391971588135, + "step": 10010 + }, + { + "epoch": 1.7263955892487939, + "grad_norm": 31.448246002197266, + "learning_rate": 9.113020675084693e-08, + "logits/chosen": -1.6409164667129517, + "logits/rejected": -1.590031385421753, + "logps/chosen": -197.2374267578125, + "logps/rejected": -255.70834350585938, + "loss": 0.544, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4356483221054077, + "rewards/margins": 0.6116400957107544, + "rewards/rejected": -2.047288417816162, + "step": 10020 + }, + { + "epoch": 1.7281185389386629, + "grad_norm": 43.61131286621094, + "learning_rate": 9.093053037145756e-08, + "logits/chosen": -1.6513713598251343, + "logits/rejected": -1.6045444011688232, + "logps/chosen": -202.1380615234375, + "logps/rejected": -255.1143341064453, + "loss": 0.5476, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4474685192108154, + "rewards/margins": 0.5823658108711243, + "rewards/rejected": -2.029834270477295, + "step": 10030 + }, + { + "epoch": 1.729841488628532, + "grad_norm": 18.31475067138672, + "learning_rate": 9.073089044592619e-08, + "logits/chosen": -1.8138395547866821, + "logits/rejected": -1.7728168964385986, + "logps/chosen": -197.71688842773438, + "logps/rejected": -262.2447204589844, + "loss": 0.5454, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4287588596343994, + "rewards/margins": 0.6493293046951294, + "rewards/rejected": -2.0780882835388184, + "step": 10040 + }, + { + "epoch": 1.7315644383184012, + "grad_norm": 26.210664749145508, + "learning_rate": 9.053128777668629e-08, + "logits/chosen": -1.6510467529296875, + "logits/rejected": -1.629809021949768, + "logps/chosen": -202.47584533691406, + "logps/rejected": -253.1685333251953, + "loss": 0.5772, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4850410223007202, + "rewards/margins": 0.4842820167541504, + "rewards/rejected": -1.969322919845581, + "step": 10050 + }, + { + "epoch": 1.7332873880082702, + "grad_norm": 33.081329345703125, + "learning_rate": 9.033172316602148e-08, + "logits/chosen": -1.6137077808380127, + "logits/rejected": -1.5819133520126343, + "logps/chosen": -189.3477325439453, + "logps/rejected": -262.2599182128906, + "loss": 0.5484, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.406105875968933, + "rewards/margins": 0.6926298141479492, + "rewards/rejected": -2.0987353324890137, + "step": 10060 + }, + { + "epoch": 1.7350103376981392, + "grad_norm": 24.314748764038086, + "learning_rate": 9.013219741606244e-08, + "logits/chosen": -1.5984914302825928, + "logits/rejected": -1.5605851411819458, + "logps/chosen": -187.48721313476562, + "logps/rejected": -260.6324462890625, + "loss": 0.5132, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3503230810165405, + "rewards/margins": 0.7312024831771851, + "rewards/rejected": -2.0815258026123047, + "step": 10070 + }, + { + "epoch": 1.7367332873880081, + "grad_norm": 34.41969299316406, + "learning_rate": 8.993271132878371e-08, + "logits/chosen": -1.7018215656280518, + "logits/rejected": -1.6646702289581299, + "logps/chosen": -197.0696258544922, + "logps/rejected": -255.782958984375, + "loss": 0.5736, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.405316948890686, + "rewards/margins": 0.6016537547111511, + "rewards/rejected": -2.0069708824157715, + "step": 10080 + }, + { + "epoch": 1.7384562370778773, + "grad_norm": 20.763137817382812, + "learning_rate": 8.973326570600038e-08, + "logits/chosen": -1.728520393371582, + "logits/rejected": -1.681694746017456, + "logps/chosen": -189.59129333496094, + "logps/rejected": -262.61431884765625, + "loss": 0.5276, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3521802425384521, + "rewards/margins": 0.6968034505844116, + "rewards/rejected": -2.0489840507507324, + "step": 10090 + }, + { + "epoch": 1.7401791867677465, + "grad_norm": 36.174373626708984, + "learning_rate": 8.953386134936489e-08, + "logits/chosen": -1.6898266077041626, + "logits/rejected": -1.6627204418182373, + "logps/chosen": -190.21826171875, + "logps/rejected": -250.46133422851562, + "loss": 0.562, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.352323293685913, + "rewards/margins": 0.6072344183921814, + "rewards/rejected": -1.9595577716827393, + "step": 10100 + }, + { + "epoch": 1.7419021364576155, + "grad_norm": 34.1463737487793, + "learning_rate": 8.933449906036373e-08, + "logits/chosen": -1.7423661947250366, + "logits/rejected": -1.7222964763641357, + "logps/chosen": -194.2482147216797, + "logps/rejected": -260.20965576171875, + "loss": 0.5566, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4367506504058838, + "rewards/margins": 0.6188918352127075, + "rewards/rejected": -2.0556423664093018, + "step": 10110 + }, + { + "epoch": 1.7436250861474845, + "grad_norm": 20.162803649902344, + "learning_rate": 8.913517964031447e-08, + "logits/chosen": -1.7082332372665405, + "logits/rejected": -1.6626908779144287, + "logps/chosen": -187.05923461914062, + "logps/rejected": -255.07669067382812, + "loss": 0.5193, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3333557844161987, + "rewards/margins": 0.7113418579101562, + "rewards/rejected": -2.0446975231170654, + "step": 10120 + }, + { + "epoch": 1.7453480358373534, + "grad_norm": 27.919374465942383, + "learning_rate": 8.893590389036226e-08, + "logits/chosen": -1.722672462463379, + "logits/rejected": -1.6737346649169922, + "logps/chosen": -183.01498413085938, + "logps/rejected": -258.7102966308594, + "loss": 0.5057, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2966862916946411, + "rewards/margins": 0.7539176344871521, + "rewards/rejected": -2.0506038665771484, + "step": 10130 + }, + { + "epoch": 1.7470709855272226, + "grad_norm": 28.644804000854492, + "learning_rate": 8.873667261147673e-08, + "logits/chosen": -1.68411123752594, + "logits/rejected": -1.6239120960235596, + "logps/chosen": -205.28250122070312, + "logps/rejected": -262.96441650390625, + "loss": 0.5664, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4485670328140259, + "rewards/margins": 0.6414085030555725, + "rewards/rejected": -2.089975357055664, + "step": 10140 + }, + { + "epoch": 1.7487939352170918, + "grad_norm": 24.981163024902344, + "learning_rate": 8.853748660444881e-08, + "logits/chosen": -1.631771445274353, + "logits/rejected": -1.582779049873352, + "logps/chosen": -190.84922790527344, + "logps/rejected": -261.8558044433594, + "loss": 0.5279, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.354757308959961, + "rewards/margins": 0.7097658514976501, + "rewards/rejected": -2.064523220062256, + "step": 10150 + }, + { + "epoch": 1.7505168849069608, + "grad_norm": 23.57000732421875, + "learning_rate": 8.833834666988738e-08, + "logits/chosen": -1.65898859500885, + "logits/rejected": -1.6106218099594116, + "logps/chosen": -187.1697235107422, + "logps/rejected": -263.8644714355469, + "loss": 0.5142, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3509787321090698, + "rewards/margins": 0.7656160593032837, + "rewards/rejected": -2.1165947914123535, + "step": 10160 + }, + { + "epoch": 1.7522398345968297, + "grad_norm": 22.85065269470215, + "learning_rate": 8.813925360821624e-08, + "logits/chosen": -1.6529548168182373, + "logits/rejected": -1.6138883829116821, + "logps/chosen": -196.12030029296875, + "logps/rejected": -268.4806213378906, + "loss": 0.5297, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4121040105819702, + "rewards/margins": 0.7286542057991028, + "rewards/rejected": -2.140758514404297, + "step": 10170 + }, + { + "epoch": 1.7539627842866987, + "grad_norm": 25.705286026000977, + "learning_rate": 8.794020821967075e-08, + "logits/chosen": -1.5729020833969116, + "logits/rejected": -1.514883041381836, + "logps/chosen": -197.0308837890625, + "logps/rejected": -276.1694641113281, + "loss": 0.512, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4458892345428467, + "rewards/margins": 0.7827070951461792, + "rewards/rejected": -2.2285962104797363, + "step": 10180 + }, + { + "epoch": 1.755685733976568, + "grad_norm": 40.89603805541992, + "learning_rate": 8.774121130429464e-08, + "logits/chosen": -1.6025034189224243, + "logits/rejected": -1.5513317584991455, + "logps/chosen": -190.3941650390625, + "logps/rejected": -263.5399169921875, + "loss": 0.5224, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3943601846694946, + "rewards/margins": 0.7269679307937622, + "rewards/rejected": -2.121328115463257, + "step": 10190 + }, + { + "epoch": 1.757408683666437, + "grad_norm": 23.062108993530273, + "learning_rate": 8.754226366193677e-08, + "logits/chosen": -1.6597881317138672, + "logits/rejected": -1.6177161931991577, + "logps/chosen": -200.96263122558594, + "logps/rejected": -270.7159118652344, + "loss": 0.5379, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4854190349578857, + "rewards/margins": 0.7124817371368408, + "rewards/rejected": -2.1979007720947266, + "step": 10200 + }, + { + "epoch": 1.759131633356306, + "grad_norm": 45.000587463378906, + "learning_rate": 8.734336609224798e-08, + "logits/chosen": -1.6575438976287842, + "logits/rejected": -1.6288673877716064, + "logps/chosen": -218.801513671875, + "logps/rejected": -302.0950622558594, + "loss": 0.5392, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6478855609893799, + "rewards/margins": 0.8018059730529785, + "rewards/rejected": -2.4496912956237793, + "step": 10210 + }, + { + "epoch": 1.760854583046175, + "grad_norm": 31.537446975708008, + "learning_rate": 8.714451939467793e-08, + "logits/chosen": -1.5889657735824585, + "logits/rejected": -1.5587692260742188, + "logps/chosen": -202.49703979492188, + "logps/rejected": -272.3978576660156, + "loss": 0.5276, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5411155223846436, + "rewards/margins": 0.6716324090957642, + "rewards/rejected": -2.212747573852539, + "step": 10220 + }, + { + "epoch": 1.762577532736044, + "grad_norm": 22.315608978271484, + "learning_rate": 8.69457243684717e-08, + "logits/chosen": -1.5495671033859253, + "logits/rejected": -1.5024573802947998, + "logps/chosen": -207.17160034179688, + "logps/rejected": -268.1032409667969, + "loss": 0.5932, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.537306547164917, + "rewards/margins": 0.6251362562179565, + "rewards/rejected": -2.162442684173584, + "step": 10230 + }, + { + "epoch": 1.7643004824259132, + "grad_norm": 19.749860763549805, + "learning_rate": 8.67469818126667e-08, + "logits/chosen": -1.619339942932129, + "logits/rejected": -1.5651066303253174, + "logps/chosen": -204.11798095703125, + "logps/rejected": -297.8780517578125, + "loss": 0.5045, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5266096591949463, + "rewards/margins": 0.928512454032898, + "rewards/rejected": -2.4551219940185547, + "step": 10240 + }, + { + "epoch": 1.7660234321157822, + "grad_norm": 50.124698638916016, + "learning_rate": 8.654829252608947e-08, + "logits/chosen": -1.6453462839126587, + "logits/rejected": -1.5897960662841797, + "logps/chosen": -208.56918334960938, + "logps/rejected": -266.9615173339844, + "loss": 0.5292, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5179365873336792, + "rewards/margins": 0.6548843383789062, + "rewards/rejected": -2.172820568084717, + "step": 10250 + }, + { + "epoch": 1.7677463818056514, + "grad_norm": 29.26951789855957, + "learning_rate": 8.634965730735238e-08, + "logits/chosen": -1.6260560750961304, + "logits/rejected": -1.6039345264434814, + "logps/chosen": -197.04734802246094, + "logps/rejected": -271.06866455078125, + "loss": 0.5231, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4378256797790527, + "rewards/margins": 0.7041373252868652, + "rewards/rejected": -2.141962766647339, + "step": 10260 + }, + { + "epoch": 1.7694693314955203, + "grad_norm": 30.01377296447754, + "learning_rate": 8.615107695485059e-08, + "logits/chosen": -1.6363704204559326, + "logits/rejected": -1.5963335037231445, + "logps/chosen": -202.7434539794922, + "logps/rejected": -271.08990478515625, + "loss": 0.5355, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.473127007484436, + "rewards/margins": 0.6813068389892578, + "rewards/rejected": -2.1544339656829834, + "step": 10270 + }, + { + "epoch": 1.7711922811853893, + "grad_norm": 27.83128547668457, + "learning_rate": 8.595255226675867e-08, + "logits/chosen": -1.6075325012207031, + "logits/rejected": -1.578054428100586, + "logps/chosen": -211.5869598388672, + "logps/rejected": -255.16415405273438, + "loss": 0.6148, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5725781917572021, + "rewards/margins": 0.44869309663772583, + "rewards/rejected": -2.0212714672088623, + "step": 10280 + }, + { + "epoch": 1.7729152308752585, + "grad_norm": 20.72820472717285, + "learning_rate": 8.575408404102739e-08, + "logits/chosen": -1.5998598337173462, + "logits/rejected": -1.5654428005218506, + "logps/chosen": -184.80645751953125, + "logps/rejected": -267.18109130859375, + "loss": 0.5129, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.333701729774475, + "rewards/margins": 0.7832706570625305, + "rewards/rejected": -2.1169724464416504, + "step": 10290 + }, + { + "epoch": 1.7746381805651275, + "grad_norm": 25.413875579833984, + "learning_rate": 8.555567307538067e-08, + "logits/chosen": -1.6682040691375732, + "logits/rejected": -1.6348912715911865, + "logps/chosen": -195.970947265625, + "logps/rejected": -248.9465789794922, + "loss": 0.5872, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4132314920425415, + "rewards/margins": 0.531366229057312, + "rewards/rejected": -1.944597601890564, + "step": 10300 + }, + { + "epoch": 1.7763611302549966, + "grad_norm": 22.22412872314453, + "learning_rate": 8.53573201673122e-08, + "logits/chosen": -1.6083345413208008, + "logits/rejected": -1.566024899482727, + "logps/chosen": -199.2248077392578, + "logps/rejected": -263.5713195800781, + "loss": 0.5186, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4296208620071411, + "rewards/margins": 0.6915571093559265, + "rewards/rejected": -2.121178150177002, + "step": 10310 + }, + { + "epoch": 1.7780840799448656, + "grad_norm": 26.842947006225586, + "learning_rate": 8.515902611408245e-08, + "logits/chosen": -1.659576416015625, + "logits/rejected": -1.6150563955307007, + "logps/chosen": -199.6532440185547, + "logps/rejected": -253.9457550048828, + "loss": 0.601, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.428916096687317, + "rewards/margins": 0.5753315687179565, + "rewards/rejected": -2.0042474269866943, + "step": 10320 + }, + { + "epoch": 1.7798070296347346, + "grad_norm": 45.35322570800781, + "learning_rate": 8.496079171271512e-08, + "logits/chosen": -1.6947290897369385, + "logits/rejected": -1.6549278497695923, + "logps/chosen": -187.60887145996094, + "logps/rejected": -251.35610961914062, + "loss": 0.5305, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3546451330184937, + "rewards/margins": 0.6137539148330688, + "rewards/rejected": -1.9683990478515625, + "step": 10330 + }, + { + "epoch": 1.7815299793246038, + "grad_norm": 25.27505111694336, + "learning_rate": 8.476261775999432e-08, + "logits/chosen": -1.7075321674346924, + "logits/rejected": -1.6534080505371094, + "logps/chosen": -185.04373168945312, + "logps/rejected": -261.29937744140625, + "loss": 0.5162, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2895625829696655, + "rewards/margins": 0.7777486443519592, + "rewards/rejected": -2.0673112869262695, + "step": 10340 + }, + { + "epoch": 1.7832529290144727, + "grad_norm": 28.808042526245117, + "learning_rate": 8.45645050524611e-08, + "logits/chosen": -1.7838119268417358, + "logits/rejected": -1.7406063079833984, + "logps/chosen": -186.5411834716797, + "logps/rejected": -245.5423126220703, + "loss": 0.5485, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3245922327041626, + "rewards/margins": 0.6202830672264099, + "rewards/rejected": -1.9448751211166382, + "step": 10350 + }, + { + "epoch": 1.784975878704342, + "grad_norm": 30.061277389526367, + "learning_rate": 8.436645438641038e-08, + "logits/chosen": -1.6315284967422485, + "logits/rejected": -1.5859811305999756, + "logps/chosen": -192.25270080566406, + "logps/rejected": -238.0504150390625, + "loss": 0.5994, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3630584478378296, + "rewards/margins": 0.48939284682273865, + "rewards/rejected": -1.8524513244628906, + "step": 10360 + }, + { + "epoch": 1.786698828394211, + "grad_norm": 31.28719711303711, + "learning_rate": 8.416846655788774e-08, + "logits/chosen": -1.526947021484375, + "logits/rejected": -1.4788224697113037, + "logps/chosen": -178.3394012451172, + "logps/rejected": -239.5234375, + "loss": 0.5311, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2636200189590454, + "rewards/margins": 0.602812647819519, + "rewards/rejected": -1.866432785987854, + "step": 10370 + }, + { + "epoch": 1.7884217780840799, + "grad_norm": 36.3031120300293, + "learning_rate": 8.397054236268611e-08, + "logits/chosen": -1.669136643409729, + "logits/rejected": -1.6464998722076416, + "logps/chosen": -196.32110595703125, + "logps/rejected": -231.94418334960938, + "loss": 0.6529, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4444220066070557, + "rewards/margins": 0.35197263956069946, + "rewards/rejected": -1.7963947057724, + "step": 10380 + }, + { + "epoch": 1.7901447277739488, + "grad_norm": 19.98095703125, + "learning_rate": 8.37726825963427e-08, + "logits/chosen": -1.7154592275619507, + "logits/rejected": -1.666921854019165, + "logps/chosen": -180.02867126464844, + "logps/rejected": -242.6608123779297, + "loss": 0.5435, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2421947717666626, + "rewards/margins": 0.6435031890869141, + "rewards/rejected": -1.8856979608535767, + "step": 10390 + }, + { + "epoch": 1.791867677463818, + "grad_norm": 25.061344146728516, + "learning_rate": 8.357488805413576e-08, + "logits/chosen": -1.6696021556854248, + "logits/rejected": -1.631299376487732, + "logps/chosen": -188.3632354736328, + "logps/rejected": -265.38037109375, + "loss": 0.4923, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3622297048568726, + "rewards/margins": 0.7384769320487976, + "rewards/rejected": -2.1007065773010254, + "step": 10400 + }, + { + "epoch": 1.791867677463818, + "eval_logits/chosen": -1.796593189239502, + "eval_logits/rejected": -1.7754329442977905, + "eval_logps/chosen": -174.6746063232422, + "eval_logps/rejected": -207.29551696777344, + "eval_loss": 0.6312093734741211, + "eval_rewards/accuracies": 0.6333643198013306, + "eval_rewards/chosen": -1.1596269607543945, + "eval_rewards/margins": 0.28152674436569214, + "eval_rewards/rejected": -1.441153883934021, + "eval_runtime": 384.1355, + "eval_samples_per_second": 11.204, + "eval_steps_per_second": 1.401, + "step": 10400 + }, + { + "epoch": 1.7935906271536872, + "grad_norm": 27.34046173095703, + "learning_rate": 8.337715953108133e-08, + "logits/chosen": -1.644966721534729, + "logits/rejected": -1.5948392152786255, + "logps/chosen": -190.6090545654297, + "logps/rejected": -245.26416015625, + "loss": 0.5639, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3409154415130615, + "rewards/margins": 0.580964207649231, + "rewards/rejected": -1.921879529953003, + "step": 10410 + }, + { + "epoch": 1.7953135768435562, + "grad_norm": 38.71763229370117, + "learning_rate": 8.317949782193021e-08, + "logits/chosen": -1.6703846454620361, + "logits/rejected": -1.6184704303741455, + "logps/chosen": -185.3733367919922, + "logps/rejected": -248.4432830810547, + "loss": 0.5412, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3125276565551758, + "rewards/margins": 0.6306508183479309, + "rewards/rejected": -1.9431785345077515, + "step": 10420 + }, + { + "epoch": 1.7970365265334252, + "grad_norm": 24.359373092651367, + "learning_rate": 8.298190372116449e-08, + "logits/chosen": -1.747746229171753, + "logits/rejected": -1.7087604999542236, + "logps/chosen": -185.07826232910156, + "logps/rejected": -240.5188446044922, + "loss": 0.5752, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.302076816558838, + "rewards/margins": 0.57256019115448, + "rewards/rejected": -1.874637246131897, + "step": 10430 + }, + { + "epoch": 1.7987594762232941, + "grad_norm": 25.13093376159668, + "learning_rate": 8.278437802299462e-08, + "logits/chosen": -1.7765858173370361, + "logits/rejected": -1.7487947940826416, + "logps/chosen": -191.94027709960938, + "logps/rejected": -241.29458618164062, + "loss": 0.579, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3407024145126343, + "rewards/margins": 0.4993131160736084, + "rewards/rejected": -1.8400154113769531, + "step": 10440 + }, + { + "epoch": 1.8004824259131633, + "grad_norm": 25.234697341918945, + "learning_rate": 8.258692152135605e-08, + "logits/chosen": -1.6926829814910889, + "logits/rejected": -1.665917158126831, + "logps/chosen": -191.83380126953125, + "logps/rejected": -258.0459899902344, + "loss": 0.5337, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3821178674697876, + "rewards/margins": 0.6219611763954163, + "rewards/rejected": -2.0040788650512695, + "step": 10450 + }, + { + "epoch": 1.8022053756030325, + "grad_norm": 21.314926147460938, + "learning_rate": 8.238953500990624e-08, + "logits/chosen": -1.6903746128082275, + "logits/rejected": -1.644550085067749, + "logps/chosen": -183.9580078125, + "logps/rejected": -237.87936401367188, + "loss": 0.5601, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.293830156326294, + "rewards/margins": 0.5546851754188538, + "rewards/rejected": -1.848515272140503, + "step": 10460 + }, + { + "epoch": 1.8039283252929015, + "grad_norm": 40.113525390625, + "learning_rate": 8.219221928202108e-08, + "logits/chosen": -1.5459710359573364, + "logits/rejected": -1.506653904914856, + "logps/chosen": -181.66893005371094, + "logps/rejected": -239.0261688232422, + "loss": 0.5778, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.297624945640564, + "rewards/margins": 0.5848358273506165, + "rewards/rejected": -1.882460594177246, + "step": 10470 + }, + { + "epoch": 1.8056512749827704, + "grad_norm": 24.86050033569336, + "learning_rate": 8.199497513079219e-08, + "logits/chosen": -1.6869032382965088, + "logits/rejected": -1.6301963329315186, + "logps/chosen": -185.90304565429688, + "logps/rejected": -254.4204559326172, + "loss": 0.5409, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2993216514587402, + "rewards/margins": 0.720859944820404, + "rewards/rejected": -2.020181655883789, + "step": 10480 + }, + { + "epoch": 1.8073742246726394, + "grad_norm": 22.12966537475586, + "learning_rate": 8.179780334902338e-08, + "logits/chosen": -1.6784626245498657, + "logits/rejected": -1.6307865381240845, + "logps/chosen": -171.83340454101562, + "logps/rejected": -238.9564666748047, + "loss": 0.524, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.180229663848877, + "rewards/margins": 0.6838089823722839, + "rewards/rejected": -1.8640388250350952, + "step": 10490 + }, + { + "epoch": 1.8090971743625086, + "grad_norm": 24.905609130859375, + "learning_rate": 8.16007047292276e-08, + "logits/chosen": -1.6610034704208374, + "logits/rejected": -1.6133527755737305, + "logps/chosen": -187.67446899414062, + "logps/rejected": -266.53692626953125, + "loss": 0.5196, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.34975004196167, + "rewards/margins": 0.7645037770271301, + "rewards/rejected": -2.1142537593841553, + "step": 10500 + }, + { + "epoch": 1.8108201240523778, + "grad_norm": 26.100194931030273, + "learning_rate": 8.140368006362378e-08, + "logits/chosen": -1.6718677282333374, + "logits/rejected": -1.6218767166137695, + "logps/chosen": -185.58583068847656, + "logps/rejected": -251.987060546875, + "loss": 0.5291, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2999773025512695, + "rewards/margins": 0.6881439089775085, + "rewards/rejected": -1.9881212711334229, + "step": 10510 + }, + { + "epoch": 1.8125430737422468, + "grad_norm": 38.36790084838867, + "learning_rate": 8.120673014413346e-08, + "logits/chosen": -1.7166264057159424, + "logits/rejected": -1.685956597328186, + "logps/chosen": -188.8399658203125, + "logps/rejected": -272.0633850097656, + "loss": 0.5197, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3835443258285522, + "rewards/margins": 0.7916911244392395, + "rewards/rejected": -2.1752355098724365, + "step": 10520 + }, + { + "epoch": 1.8142660234321157, + "grad_norm": 24.703962326049805, + "learning_rate": 8.100985576237789e-08, + "logits/chosen": -1.6094785928726196, + "logits/rejected": -1.5705649852752686, + "logps/chosen": -196.53831481933594, + "logps/rejected": -254.22103881835938, + "loss": 0.5587, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4319744110107422, + "rewards/margins": 0.5889819860458374, + "rewards/rejected": -2.020956516265869, + "step": 10530 + }, + { + "epoch": 1.8159889731219847, + "grad_norm": 20.878108978271484, + "learning_rate": 8.081305770967466e-08, + "logits/chosen": -1.5575045347213745, + "logits/rejected": -1.5127557516098022, + "logps/chosen": -193.12832641601562, + "logps/rejected": -258.6274719238281, + "loss": 0.5252, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4079763889312744, + "rewards/margins": 0.6843532919883728, + "rewards/rejected": -2.092329502105713, + "step": 10540 + }, + { + "epoch": 1.817711922811854, + "grad_norm": 30.245561599731445, + "learning_rate": 8.061633677703457e-08, + "logits/chosen": -1.7191566228866577, + "logits/rejected": -1.6862128973007202, + "logps/chosen": -210.8649444580078, + "logps/rejected": -280.70245361328125, + "loss": 0.5475, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5711400508880615, + "rewards/margins": 0.6780726313591003, + "rewards/rejected": -2.2492125034332275, + "step": 10550 + }, + { + "epoch": 1.819434872501723, + "grad_norm": 32.06334686279297, + "learning_rate": 8.041969375515835e-08, + "logits/chosen": -1.596172571182251, + "logits/rejected": -1.5496784448623657, + "logps/chosen": -197.6623992919922, + "logps/rejected": -273.310546875, + "loss": 0.5502, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4203016757965088, + "rewards/margins": 0.7665907740592957, + "rewards/rejected": -2.18689227104187, + "step": 10560 + }, + { + "epoch": 1.821157822191592, + "grad_norm": 34.21003723144531, + "learning_rate": 8.022312943443369e-08, + "logits/chosen": -1.6808770895004272, + "logits/rejected": -1.6430126428604126, + "logps/chosen": -201.29342651367188, + "logps/rejected": -276.6784362792969, + "loss": 0.5171, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4534662961959839, + "rewards/margins": 0.7456766366958618, + "rewards/rejected": -2.1991429328918457, + "step": 10570 + }, + { + "epoch": 1.822880771881461, + "grad_norm": 24.851125717163086, + "learning_rate": 8.002664460493194e-08, + "logits/chosen": -1.705336570739746, + "logits/rejected": -1.6608549356460571, + "logps/chosen": -184.4915008544922, + "logps/rejected": -252.7461395263672, + "loss": 0.514, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3112952709197998, + "rewards/margins": 0.6880284547805786, + "rewards/rejected": -1.9993234872817993, + "step": 10580 + }, + { + "epoch": 1.82460372157133, + "grad_norm": 20.24405288696289, + "learning_rate": 7.983024005640487e-08, + "logits/chosen": -1.6469793319702148, + "logits/rejected": -1.5926092863082886, + "logps/chosen": -190.94544982910156, + "logps/rejected": -249.9911651611328, + "loss": 0.5273, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3504091501235962, + "rewards/margins": 0.6501537561416626, + "rewards/rejected": -2.000562906265259, + "step": 10590 + }, + { + "epoch": 1.8263266712611992, + "grad_norm": 26.51520538330078, + "learning_rate": 7.963391657828167e-08, + "logits/chosen": -1.7193012237548828, + "logits/rejected": -1.6881153583526611, + "logps/chosen": -183.8968505859375, + "logps/rejected": -246.1622772216797, + "loss": 0.5715, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3101425170898438, + "rewards/margins": 0.6046009063720703, + "rewards/rejected": -1.9147436618804932, + "step": 10600 + }, + { + "epoch": 1.8280496209510684, + "grad_norm": 24.79945182800293, + "learning_rate": 7.943767495966556e-08, + "logits/chosen": -1.6367594003677368, + "logits/rejected": -1.6014162302017212, + "logps/chosen": -187.56796264648438, + "logps/rejected": -251.03634643554688, + "loss": 0.5404, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3650391101837158, + "rewards/margins": 0.6266623139381409, + "rewards/rejected": -1.991701364517212, + "step": 10610 + }, + { + "epoch": 1.8297725706409373, + "grad_norm": 30.169219970703125, + "learning_rate": 7.924151598933077e-08, + "logits/chosen": -1.5777955055236816, + "logits/rejected": -1.5336124897003174, + "logps/chosen": -192.7977294921875, + "logps/rejected": -256.5873107910156, + "loss": 0.5313, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3899385929107666, + "rewards/margins": 0.6526475548744202, + "rewards/rejected": -2.042586326599121, + "step": 10620 + }, + { + "epoch": 1.8314955203308063, + "grad_norm": 17.994773864746094, + "learning_rate": 7.904544045571942e-08, + "logits/chosen": -1.6816425323486328, + "logits/rejected": -1.6320663690567017, + "logps/chosen": -186.03292846679688, + "logps/rejected": -254.05209350585938, + "loss": 0.5574, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3177438974380493, + "rewards/margins": 0.6810862421989441, + "rewards/rejected": -1.9988301992416382, + "step": 10630 + }, + { + "epoch": 1.8332184700206753, + "grad_norm": 29.066410064697266, + "learning_rate": 7.884944914693819e-08, + "logits/chosen": -1.6614630222320557, + "logits/rejected": -1.6166385412216187, + "logps/chosen": -193.18630981445312, + "logps/rejected": -257.2984924316406, + "loss": 0.5533, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3812034130096436, + "rewards/margins": 0.6659941673278809, + "rewards/rejected": -2.0471975803375244, + "step": 10640 + }, + { + "epoch": 1.8349414197105445, + "grad_norm": 22.816099166870117, + "learning_rate": 7.865354285075517e-08, + "logits/chosen": -1.7261043787002563, + "logits/rejected": -1.6789547204971313, + "logps/chosen": -193.98812866210938, + "logps/rejected": -275.81634521484375, + "loss": 0.515, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4036664962768555, + "rewards/margins": 0.8163207173347473, + "rewards/rejected": -2.219987154006958, + "step": 10650 + }, + { + "epoch": 1.8366643694004137, + "grad_norm": 25.88532066345215, + "learning_rate": 7.845772235459687e-08, + "logits/chosen": -1.6032822132110596, + "logits/rejected": -1.5658022165298462, + "logps/chosen": -201.2757568359375, + "logps/rejected": -263.31585693359375, + "loss": 0.5442, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4657762050628662, + "rewards/margins": 0.6155818700790405, + "rewards/rejected": -2.0813581943511963, + "step": 10660 + }, + { + "epoch": 1.8383873190902826, + "grad_norm": 27.669984817504883, + "learning_rate": 7.826198844554484e-08, + "logits/chosen": -1.6221199035644531, + "logits/rejected": -1.5766185522079468, + "logps/chosen": -202.876708984375, + "logps/rejected": -271.6616516113281, + "loss": 0.5582, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5073853731155396, + "rewards/margins": 0.6935917735099792, + "rewards/rejected": -2.200977325439453, + "step": 10670 + }, + { + "epoch": 1.8401102687801516, + "grad_norm": 25.442867279052734, + "learning_rate": 7.806634191033268e-08, + "logits/chosen": -1.6795568466186523, + "logits/rejected": -1.6216316223144531, + "logps/chosen": -189.34262084960938, + "logps/rejected": -251.6562042236328, + "loss": 0.5285, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3305351734161377, + "rewards/margins": 0.6782180070877075, + "rewards/rejected": -2.0087532997131348, + "step": 10680 + }, + { + "epoch": 1.8418332184700206, + "grad_norm": 25.80929183959961, + "learning_rate": 7.787078353534276e-08, + "logits/chosen": -1.6476482152938843, + "logits/rejected": -1.609718918800354, + "logps/chosen": -190.88961791992188, + "logps/rejected": -267.1067199707031, + "loss": 0.5332, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3713963031768799, + "rewards/margins": 0.7359045147895813, + "rewards/rejected": -2.1073009967803955, + "step": 10690 + }, + { + "epoch": 1.8435561681598898, + "grad_norm": 34.373382568359375, + "learning_rate": 7.767531410660307e-08, + "logits/chosen": -1.701225996017456, + "logits/rejected": -1.6454471349716187, + "logps/chosen": -188.1253204345703, + "logps/rejected": -243.43905639648438, + "loss": 0.5365, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3195775747299194, + "rewards/margins": 0.6114214658737183, + "rewards/rejected": -1.9309990406036377, + "step": 10700 + }, + { + "epoch": 1.8452791178497587, + "grad_norm": 29.572166442871094, + "learning_rate": 7.74799344097841e-08, + "logits/chosen": -1.6249849796295166, + "logits/rejected": -1.5775136947631836, + "logps/chosen": -185.059326171875, + "logps/rejected": -244.96792602539062, + "loss": 0.5475, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3161468505859375, + "rewards/margins": 0.6391556262969971, + "rewards/rejected": -1.9553024768829346, + "step": 10710 + }, + { + "epoch": 1.847002067539628, + "grad_norm": 47.28426742553711, + "learning_rate": 7.728464523019574e-08, + "logits/chosen": -1.652116060256958, + "logits/rejected": -1.6033544540405273, + "logps/chosen": -194.1964569091797, + "logps/rejected": -249.8228759765625, + "loss": 0.5972, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4160946607589722, + "rewards/margins": 0.5593221187591553, + "rewards/rejected": -1.9754167795181274, + "step": 10720 + }, + { + "epoch": 1.848725017229497, + "grad_norm": 22.565967559814453, + "learning_rate": 7.7089447352784e-08, + "logits/chosen": -1.7549766302108765, + "logits/rejected": -1.7027647495269775, + "logps/chosen": -186.93838500976562, + "logps/rejected": -244.3561553955078, + "loss": 0.529, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3297926187515259, + "rewards/margins": 0.614985466003418, + "rewards/rejected": -1.9447780847549438, + "step": 10730 + }, + { + "epoch": 1.8504479669193659, + "grad_norm": 33.7943229675293, + "learning_rate": 7.689434156212788e-08, + "logits/chosen": -1.6176128387451172, + "logits/rejected": -1.579614281654358, + "logps/chosen": -198.7300567626953, + "logps/rejected": -254.4537811279297, + "loss": 0.5909, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.440397024154663, + "rewards/margins": 0.5526378154754639, + "rewards/rejected": -1.9930353164672852, + "step": 10740 + }, + { + "epoch": 1.852170916609235, + "grad_norm": 21.347455978393555, + "learning_rate": 7.669932864243627e-08, + "logits/chosen": -1.612450361251831, + "logits/rejected": -1.5564180612564087, + "logps/chosen": -184.0469207763672, + "logps/rejected": -260.59716796875, + "loss": 0.5242, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3198864459991455, + "rewards/margins": 0.7594377398490906, + "rewards/rejected": -2.079324245452881, + "step": 10750 + }, + { + "epoch": 1.853893866299104, + "grad_norm": 29.009899139404297, + "learning_rate": 7.65044093775448e-08, + "logits/chosen": -1.6315386295318604, + "logits/rejected": -1.6006014347076416, + "logps/chosen": -186.20626831054688, + "logps/rejected": -243.677978515625, + "loss": 0.5665, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3252546787261963, + "rewards/margins": 0.5836648941040039, + "rewards/rejected": -1.9089195728302002, + "step": 10760 + }, + { + "epoch": 1.8556168159889732, + "grad_norm": 24.822834014892578, + "learning_rate": 7.630958455091266e-08, + "logits/chosen": -1.6193573474884033, + "logits/rejected": -1.590057611465454, + "logps/chosen": -188.15383911132812, + "logps/rejected": -246.31069946289062, + "loss": 0.5482, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3108214139938354, + "rewards/margins": 0.6063941121101379, + "rewards/rejected": -1.9172155857086182, + "step": 10770 + }, + { + "epoch": 1.8573397656788422, + "grad_norm": 25.640043258666992, + "learning_rate": 7.611485494561947e-08, + "logits/chosen": -1.7544190883636475, + "logits/rejected": -1.7139012813568115, + "logps/chosen": -186.97499084472656, + "logps/rejected": -251.82894897460938, + "loss": 0.5634, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3380534648895264, + "rewards/margins": 0.6293826699256897, + "rewards/rejected": -1.96743643283844, + "step": 10780 + }, + { + "epoch": 1.8590627153687111, + "grad_norm": 29.79625701904297, + "learning_rate": 7.592022134436201e-08, + "logits/chosen": -1.787848711013794, + "logits/rejected": -1.7382673025131226, + "logps/chosen": -173.20809936523438, + "logps/rejected": -243.8647003173828, + "loss": 0.5089, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.208669900894165, + "rewards/margins": 0.6959220767021179, + "rewards/rejected": -1.9045919179916382, + "step": 10790 + }, + { + "epoch": 1.8607856650585803, + "grad_norm": 23.23958969116211, + "learning_rate": 7.57256845294513e-08, + "logits/chosen": -1.768815279006958, + "logits/rejected": -1.7291584014892578, + "logps/chosen": -181.27464294433594, + "logps/rejected": -250.6145477294922, + "loss": 0.5386, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2931530475616455, + "rewards/margins": 0.673789381980896, + "rewards/rejected": -1.9669424295425415, + "step": 10800 + }, + { + "epoch": 1.8607856650585803, + "eval_logits/chosen": -1.7721792459487915, + "eval_logits/rejected": -1.7500107288360596, + "eval_logps/chosen": -185.76849365234375, + "eval_logps/rejected": -220.52793884277344, + "eval_loss": 0.6304465532302856, + "eval_rewards/accuracies": 0.6373141407966614, + "eval_rewards/chosen": -1.2705661058425903, + "eval_rewards/margins": 0.3029119074344635, + "eval_rewards/rejected": -1.573478102684021, + "eval_runtime": 383.7392, + "eval_samples_per_second": 11.216, + "eval_steps_per_second": 1.402, + "step": 10800 + }, + { + "epoch": 1.8625086147484493, + "grad_norm": 17.646251678466797, + "learning_rate": 7.553124528280928e-08, + "logits/chosen": -1.6951982975006104, + "logits/rejected": -1.6370052099227905, + "logps/chosen": -196.0543670654297, + "logps/rejected": -254.51473999023438, + "loss": 0.5238, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.386745572090149, + "rewards/margins": 0.6572982668876648, + "rewards/rejected": -2.044044017791748, + "step": 10810 + }, + { + "epoch": 1.8642315644383185, + "grad_norm": 28.62160873413086, + "learning_rate": 7.533690438596583e-08, + "logits/chosen": -1.622422218322754, + "logits/rejected": -1.5795962810516357, + "logps/chosen": -177.42306518554688, + "logps/rejected": -246.3397979736328, + "loss": 0.526, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2432811260223389, + "rewards/margins": 0.7060755491256714, + "rewards/rejected": -1.9493564367294312, + "step": 10820 + }, + { + "epoch": 1.8659545141281875, + "grad_norm": 43.903541564941406, + "learning_rate": 7.514266262005528e-08, + "logits/chosen": -1.6371924877166748, + "logits/rejected": -1.5897127389907837, + "logps/chosen": -200.0693359375, + "logps/rejected": -261.50897216796875, + "loss": 0.5597, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4825317859649658, + "rewards/margins": 0.5994998216629028, + "rewards/rejected": -2.082031726837158, + "step": 10830 + }, + { + "epoch": 1.8676774638180564, + "grad_norm": 27.840375900268555, + "learning_rate": 7.494852076581377e-08, + "logits/chosen": -1.680965781211853, + "logits/rejected": -1.6448853015899658, + "logps/chosen": -187.4963836669922, + "logps/rejected": -240.0545654296875, + "loss": 0.584, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3450555801391602, + "rewards/margins": 0.5464226603507996, + "rewards/rejected": -1.8914783000946045, + "step": 10840 + }, + { + "epoch": 1.8694004135079254, + "grad_norm": 26.34052085876465, + "learning_rate": 7.475447960357572e-08, + "logits/chosen": -1.6020797491073608, + "logits/rejected": -1.5666046142578125, + "logps/chosen": -180.9534912109375, + "logps/rejected": -240.6409912109375, + "loss": 0.5594, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.307927131652832, + "rewards/margins": 0.5539305210113525, + "rewards/rejected": -1.8618577718734741, + "step": 10850 + }, + { + "epoch": 1.8711233631977946, + "grad_norm": 29.617908477783203, + "learning_rate": 7.456053991327083e-08, + "logits/chosen": -1.7373720407485962, + "logits/rejected": -1.6805378198623657, + "logps/chosen": -185.09268188476562, + "logps/rejected": -254.7001495361328, + "loss": 0.5303, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.298338770866394, + "rewards/margins": 0.7245272397994995, + "rewards/rejected": -2.0228660106658936, + "step": 10860 + }, + { + "epoch": 1.8728463128876638, + "grad_norm": 19.6914119720459, + "learning_rate": 7.436670247442107e-08, + "logits/chosen": -1.5922746658325195, + "logits/rejected": -1.5555670261383057, + "logps/chosen": -169.83578491210938, + "logps/rejected": -254.55313110351562, + "loss": 0.4986, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.1935006380081177, + "rewards/margins": 0.7914150357246399, + "rewards/rejected": -1.9849159717559814, + "step": 10870 + }, + { + "epoch": 1.8745692625775328, + "grad_norm": 25.92415428161621, + "learning_rate": 7.417296806613718e-08, + "logits/chosen": -1.6392488479614258, + "logits/rejected": -1.5973610877990723, + "logps/chosen": -194.64828491210938, + "logps/rejected": -260.9826354980469, + "loss": 0.5367, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4043915271759033, + "rewards/margins": 0.6784892678260803, + "rewards/rejected": -2.082880973815918, + "step": 10880 + }, + { + "epoch": 1.8762922122674017, + "grad_norm": 25.264381408691406, + "learning_rate": 7.397933746711603e-08, + "logits/chosen": -1.681231141090393, + "logits/rejected": -1.642806053161621, + "logps/chosen": -192.63475036621094, + "logps/rejected": -264.21405029296875, + "loss": 0.5312, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3947433233261108, + "rewards/margins": 0.6991499662399292, + "rewards/rejected": -2.09389328956604, + "step": 10890 + }, + { + "epoch": 1.8780151619572707, + "grad_norm": 29.97552490234375, + "learning_rate": 7.378581145563709e-08, + "logits/chosen": -1.7367448806762695, + "logits/rejected": -1.678744912147522, + "logps/chosen": -190.0194549560547, + "logps/rejected": -254.99465942382812, + "loss": 0.5247, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3502709865570068, + "rewards/margins": 0.6936368346214294, + "rewards/rejected": -2.043907880783081, + "step": 10900 + }, + { + "epoch": 1.8797381116471399, + "grad_norm": 30.631378173828125, + "learning_rate": 7.35923908095595e-08, + "logits/chosen": -1.6269598007202148, + "logits/rejected": -1.5821008682250977, + "logps/chosen": -195.05043029785156, + "logps/rejected": -251.58560180664062, + "loss": 0.5723, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.393839955329895, + "rewards/margins": 0.5802146792411804, + "rewards/rejected": -1.9740545749664307, + "step": 10910 + }, + { + "epoch": 1.881461061337009, + "grad_norm": 35.906612396240234, + "learning_rate": 7.339907630631886e-08, + "logits/chosen": -1.6025956869125366, + "logits/rejected": -1.5546543598175049, + "logps/chosen": -201.29962158203125, + "logps/rejected": -268.4058837890625, + "loss": 0.5154, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4546197652816772, + "rewards/margins": 0.7002333402633667, + "rewards/rejected": -2.154853105545044, + "step": 10920 + }, + { + "epoch": 1.883184011026878, + "grad_norm": 30.438188552856445, + "learning_rate": 7.320586872292413e-08, + "logits/chosen": -1.559808373451233, + "logits/rejected": -1.5186632871627808, + "logps/chosen": -199.2595977783203, + "logps/rejected": -255.65878295898438, + "loss": 0.5548, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.450476050376892, + "rewards/margins": 0.5912362933158875, + "rewards/rejected": -2.041712522506714, + "step": 10930 + }, + { + "epoch": 1.884906960716747, + "grad_norm": 45.64797592163086, + "learning_rate": 7.301276883595463e-08, + "logits/chosen": -1.551182508468628, + "logits/rejected": -1.5193589925765991, + "logps/chosen": -204.142822265625, + "logps/rejected": -257.69952392578125, + "loss": 0.5659, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4768362045288086, + "rewards/margins": 0.5484408736228943, + "rewards/rejected": -2.0252771377563477, + "step": 10940 + }, + { + "epoch": 1.886629910406616, + "grad_norm": 33.39741897583008, + "learning_rate": 7.281977742155669e-08, + "logits/chosen": -1.6167634725570679, + "logits/rejected": -1.5732940435409546, + "logps/chosen": -201.14071655273438, + "logps/rejected": -268.1167907714844, + "loss": 0.5262, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4653427600860596, + "rewards/margins": 0.6688799262046814, + "rewards/rejected": -2.1342225074768066, + "step": 10950 + }, + { + "epoch": 1.8883528600964852, + "grad_norm": 35.17470169067383, + "learning_rate": 7.262689525544067e-08, + "logits/chosen": -1.6693687438964844, + "logits/rejected": -1.6231985092163086, + "logps/chosen": -196.91688537597656, + "logps/rejected": -267.0567932128906, + "loss": 0.5512, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4310643672943115, + "rewards/margins": 0.7137660980224609, + "rewards/rejected": -2.1448302268981934, + "step": 10960 + }, + { + "epoch": 1.8900758097863544, + "grad_norm": 36.11570739746094, + "learning_rate": 7.243412311287782e-08, + "logits/chosen": -1.6791913509368896, + "logits/rejected": -1.6281063556671143, + "logps/chosen": -193.53982543945312, + "logps/rejected": -258.5443420410156, + "loss": 0.5791, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.366768717765808, + "rewards/margins": 0.6775639653205872, + "rewards/rejected": -2.044332504272461, + "step": 10970 + }, + { + "epoch": 1.8917987594762233, + "grad_norm": 28.242664337158203, + "learning_rate": 7.224146176869717e-08, + "logits/chosen": -1.6848160028457642, + "logits/rejected": -1.6459327936172485, + "logps/chosen": -201.38186645507812, + "logps/rejected": -272.21429443359375, + "loss": 0.5731, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4680874347686768, + "rewards/margins": 0.6922693252563477, + "rewards/rejected": -2.1603565216064453, + "step": 10980 + }, + { + "epoch": 1.8935217091660923, + "grad_norm": 24.009624481201172, + "learning_rate": 7.204891199728241e-08, + "logits/chosen": -1.6162869930267334, + "logits/rejected": -1.576981782913208, + "logps/chosen": -179.42196655273438, + "logps/rejected": -247.52578735351562, + "loss": 0.5222, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2693443298339844, + "rewards/margins": 0.6809569597244263, + "rewards/rejected": -1.9503014087677002, + "step": 10990 + }, + { + "epoch": 1.8952446588559613, + "grad_norm": 22.96497917175293, + "learning_rate": 7.185647457256879e-08, + "logits/chosen": -1.7962024211883545, + "logits/rejected": -1.7659142017364502, + "logps/chosen": -193.4381866455078, + "logps/rejected": -259.83184814453125, + "loss": 0.5676, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4054908752441406, + "rewards/margins": 0.6229832768440247, + "rewards/rejected": -2.0284743309020996, + "step": 11000 + }, + { + "epoch": 1.8969676085458305, + "grad_norm": 26.895023345947266, + "learning_rate": 7.166415026803991e-08, + "logits/chosen": -1.7166029214859009, + "logits/rejected": -1.6600326299667358, + "logps/chosen": -186.55538940429688, + "logps/rejected": -261.6690673828125, + "loss": 0.4973, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3334908485412598, + "rewards/margins": 0.7684970498085022, + "rewards/rejected": -2.101987600326538, + "step": 11010 + }, + { + "epoch": 1.8986905582356997, + "grad_norm": 19.827083587646484, + "learning_rate": 7.147193985672477e-08, + "logits/chosen": -1.7519909143447876, + "logits/rejected": -1.7119252681732178, + "logps/chosen": -187.62266540527344, + "logps/rejected": -248.4136505126953, + "loss": 0.5336, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3261867761611938, + "rewards/margins": 0.6275477409362793, + "rewards/rejected": -1.9537343978881836, + "step": 11020 + }, + { + "epoch": 1.9004135079255686, + "grad_norm": 23.33296012878418, + "learning_rate": 7.127984411119461e-08, + "logits/chosen": -1.7048231363296509, + "logits/rejected": -1.675737738609314, + "logps/chosen": -193.9524688720703, + "logps/rejected": -260.84259033203125, + "loss": 0.5546, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3916425704956055, + "rewards/margins": 0.6718882322311401, + "rewards/rejected": -2.063530921936035, + "step": 11030 + }, + { + "epoch": 1.9021364576154376, + "grad_norm": 22.681320190429688, + "learning_rate": 7.108786380355971e-08, + "logits/chosen": -1.736576795578003, + "logits/rejected": -1.6962627172470093, + "logps/chosen": -196.87417602539062, + "logps/rejected": -259.28131103515625, + "loss": 0.5495, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.40407395362854, + "rewards/margins": 0.6333260536193848, + "rewards/rejected": -2.037400007247925, + "step": 11040 + }, + { + "epoch": 1.9038594073053066, + "grad_norm": 25.510026931762695, + "learning_rate": 7.089599970546642e-08, + "logits/chosen": -1.6437184810638428, + "logits/rejected": -1.5956861972808838, + "logps/chosen": -179.71234130859375, + "logps/rejected": -259.95782470703125, + "loss": 0.4912, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.277482032775879, + "rewards/margins": 0.7694389224052429, + "rewards/rejected": -2.0469212532043457, + "step": 11050 + }, + { + "epoch": 1.9055823569951758, + "grad_norm": 28.625423431396484, + "learning_rate": 7.070425258809394e-08, + "logits/chosen": -1.6748847961425781, + "logits/rejected": -1.605231523513794, + "logps/chosen": -192.73068237304688, + "logps/rejected": -267.40386962890625, + "loss": 0.499, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.384558081626892, + "rewards/margins": 0.7721072435379028, + "rewards/rejected": -2.156665325164795, + "step": 11060 + }, + { + "epoch": 1.907305306685045, + "grad_norm": 20.10926055908203, + "learning_rate": 7.051262322215128e-08, + "logits/chosen": -1.6803903579711914, + "logits/rejected": -1.6113357543945312, + "logps/chosen": -185.80337524414062, + "logps/rejected": -265.29815673828125, + "loss": 0.4953, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3262794017791748, + "rewards/margins": 0.8310495615005493, + "rewards/rejected": -2.1573286056518555, + "step": 11070 + }, + { + "epoch": 1.909028256374914, + "grad_norm": 22.918128967285156, + "learning_rate": 7.032111237787424e-08, + "logits/chosen": -1.672593355178833, + "logits/rejected": -1.626828908920288, + "logps/chosen": -202.33993530273438, + "logps/rejected": -269.45245361328125, + "loss": 0.5286, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4683647155761719, + "rewards/margins": 0.7012184858322144, + "rewards/rejected": -2.169583320617676, + "step": 11080 + }, + { + "epoch": 1.9107512060647829, + "grad_norm": 29.926124572753906, + "learning_rate": 7.01297208250222e-08, + "logits/chosen": -1.6541608572006226, + "logits/rejected": -1.6142412424087524, + "logps/chosen": -204.04806518554688, + "logps/rejected": -259.0303955078125, + "loss": 0.5442, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4622454643249512, + "rewards/margins": 0.6007913947105408, + "rewards/rejected": -2.0630366802215576, + "step": 11090 + }, + { + "epoch": 1.9124741557546519, + "grad_norm": 26.798297882080078, + "learning_rate": 6.993844933287496e-08, + "logits/chosen": -1.5548121929168701, + "logits/rejected": -1.518174171447754, + "logps/chosen": -181.76849365234375, + "logps/rejected": -250.3037567138672, + "loss": 0.5323, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3089042901992798, + "rewards/margins": 0.6539050340652466, + "rewards/rejected": -1.9628093242645264, + "step": 11100 + }, + { + "epoch": 1.914197105444521, + "grad_norm": 25.093984603881836, + "learning_rate": 6.974729867022989e-08, + "logits/chosen": -1.6413952112197876, + "logits/rejected": -1.587660789489746, + "logps/chosen": -214.73489379882812, + "logps/rejected": -304.73065185546875, + "loss": 0.5347, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6083320379257202, + "rewards/margins": 0.9037650227546692, + "rewards/rejected": -2.512097120285034, + "step": 11110 + }, + { + "epoch": 1.9159200551343902, + "grad_norm": 33.20296096801758, + "learning_rate": 6.955626960539855e-08, + "logits/chosen": -1.7690269947052002, + "logits/rejected": -1.7244031429290771, + "logps/chosen": -202.90762329101562, + "logps/rejected": -263.71588134765625, + "loss": 0.5506, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4339269399642944, + "rewards/margins": 0.6664804220199585, + "rewards/rejected": -2.100407600402832, + "step": 11120 + }, + { + "epoch": 1.9176430048242592, + "grad_norm": 34.29365921020508, + "learning_rate": 6.936536290620393e-08, + "logits/chosen": -1.5904325246810913, + "logits/rejected": -1.5409746170043945, + "logps/chosen": -205.6435089111328, + "logps/rejected": -265.16888427734375, + "loss": 0.5487, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4976742267608643, + "rewards/margins": 0.6221583485603333, + "rewards/rejected": -2.119832754135132, + "step": 11130 + }, + { + "epoch": 1.9193659545141282, + "grad_norm": 23.378873825073242, + "learning_rate": 6.917457933997706e-08, + "logits/chosen": -1.5747454166412354, + "logits/rejected": -1.536361813545227, + "logps/chosen": -187.07211303710938, + "logps/rejected": -245.0606689453125, + "loss": 0.5473, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3325872421264648, + "rewards/margins": 0.6022460460662842, + "rewards/rejected": -1.9348335266113281, + "step": 11140 + }, + { + "epoch": 1.9210889042039971, + "grad_norm": 29.120763778686523, + "learning_rate": 6.898391967355405e-08, + "logits/chosen": -1.6491073369979858, + "logits/rejected": -1.6144596338272095, + "logps/chosen": -204.37586975097656, + "logps/rejected": -264.35272216796875, + "loss": 0.5523, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4628231525421143, + "rewards/margins": 0.6155986785888672, + "rewards/rejected": -2.0784218311309814, + "step": 11150 + }, + { + "epoch": 1.9228118538938663, + "grad_norm": 29.497880935668945, + "learning_rate": 6.879338467327302e-08, + "logits/chosen": -1.7183611392974854, + "logits/rejected": -1.6814098358154297, + "logps/chosen": -186.4435272216797, + "logps/rejected": -240.2172393798828, + "loss": 0.5788, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3432486057281494, + "rewards/margins": 0.54877108335495, + "rewards/rejected": -1.8920198678970337, + "step": 11160 + }, + { + "epoch": 1.9245348035837355, + "grad_norm": 34.64222717285156, + "learning_rate": 6.860297510497104e-08, + "logits/chosen": -1.5618165731430054, + "logits/rejected": -1.5183881521224976, + "logps/chosen": -196.0279998779297, + "logps/rejected": -236.39517211914062, + "loss": 0.5921, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3886585235595703, + "rewards/margins": 0.4850701689720154, + "rewards/rejected": -1.8737287521362305, + "step": 11170 + }, + { + "epoch": 1.9262577532736045, + "grad_norm": 24.274341583251953, + "learning_rate": 6.841269173398107e-08, + "logits/chosen": -1.7048511505126953, + "logits/rejected": -1.6627804040908813, + "logps/chosen": -198.62307739257812, + "logps/rejected": -262.12396240234375, + "loss": 0.5452, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.446572184562683, + "rewards/margins": 0.666427493095398, + "rewards/rejected": -2.112999677658081, + "step": 11180 + }, + { + "epoch": 1.9279807029634735, + "grad_norm": 26.194883346557617, + "learning_rate": 6.82225353251286e-08, + "logits/chosen": -1.7092431783676147, + "logits/rejected": -1.6481502056121826, + "logps/chosen": -178.11065673828125, + "logps/rejected": -257.3678283691406, + "loss": 0.4607, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.232288122177124, + "rewards/margins": 0.8250058889389038, + "rewards/rejected": -2.0572941303253174, + "step": 11190 + }, + { + "epoch": 1.9297036526533424, + "grad_norm": 32.451969146728516, + "learning_rate": 6.80325066427291e-08, + "logits/chosen": -1.7004636526107788, + "logits/rejected": -1.6555591821670532, + "logps/chosen": -179.03671264648438, + "logps/rejected": -248.1265869140625, + "loss": 0.5178, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.257236123085022, + "rewards/margins": 0.6865662336349487, + "rewards/rejected": -1.9438022375106812, + "step": 11200 + }, + { + "epoch": 1.9297036526533424, + "eval_logits/chosen": -1.7501182556152344, + "eval_logits/rejected": -1.7272056341171265, + "eval_logps/chosen": -187.30357360839844, + "eval_logps/rejected": -223.25985717773438, + "eval_loss": 0.629496157169342, + "eval_rewards/accuracies": 0.6442843675613403, + "eval_rewards/chosen": -1.2859166860580444, + "eval_rewards/margins": 0.31488049030303955, + "eval_rewards/rejected": -1.600797176361084, + "eval_runtime": 384.6652, + "eval_samples_per_second": 11.189, + "eval_steps_per_second": 1.399, + "step": 11200 + }, + { + "epoch": 1.9314266023432116, + "grad_norm": 28.927274703979492, + "learning_rate": 6.784260645058445e-08, + "logits/chosen": -1.5466166734695435, + "logits/rejected": -1.500182867050171, + "logps/chosen": -189.25881958007812, + "logps/rejected": -272.9542541503906, + "loss": 0.5235, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3884804248809814, + "rewards/margins": 0.8016069531440735, + "rewards/rejected": -2.19008731842041, + "step": 11210 + }, + { + "epoch": 1.9331495520330806, + "grad_norm": 31.34682273864746, + "learning_rate": 6.765283551198016e-08, + "logits/chosen": -1.6393709182739258, + "logits/rejected": -1.601914644241333, + "logps/chosen": -192.19395446777344, + "logps/rejected": -259.9654541015625, + "loss": 0.5799, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4014427661895752, + "rewards/margins": 0.6413668990135193, + "rewards/rejected": -2.04280948638916, + "step": 11220 + }, + { + "epoch": 1.9348725017229498, + "grad_norm": 26.979076385498047, + "learning_rate": 6.746319458968226e-08, + "logits/chosen": -1.6130883693695068, + "logits/rejected": -1.5812774896621704, + "logps/chosen": -206.37557983398438, + "logps/rejected": -262.9778747558594, + "loss": 0.5763, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.506066083908081, + "rewards/margins": 0.6084599494934082, + "rewards/rejected": -2.1145262718200684, + "step": 11230 + }, + { + "epoch": 1.9365954514128187, + "grad_norm": 19.683889389038086, + "learning_rate": 6.727368444593408e-08, + "logits/chosen": -1.627913475036621, + "logits/rejected": -1.5929877758026123, + "logps/chosen": -191.50289916992188, + "logps/rejected": -249.98178100585938, + "loss": 0.5739, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3685283660888672, + "rewards/margins": 0.5968562960624695, + "rewards/rejected": -1.9653844833374023, + "step": 11240 + }, + { + "epoch": 1.9383184011026877, + "grad_norm": 21.013734817504883, + "learning_rate": 6.708430584245337e-08, + "logits/chosen": -1.6611820459365845, + "logits/rejected": -1.6172800064086914, + "logps/chosen": -184.84738159179688, + "logps/rejected": -252.0414276123047, + "loss": 0.5245, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3055918216705322, + "rewards/margins": 0.6785596609115601, + "rewards/rejected": -1.9841514825820923, + "step": 11250 + }, + { + "epoch": 1.940041350792557, + "grad_norm": 25.888309478759766, + "learning_rate": 6.689505954042913e-08, + "logits/chosen": -1.611316442489624, + "logits/rejected": -1.561462640762329, + "logps/chosen": -182.65574645996094, + "logps/rejected": -233.82638549804688, + "loss": 0.5611, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2482223510742188, + "rewards/margins": 0.5744415521621704, + "rewards/rejected": -1.8226640224456787, + "step": 11260 + }, + { + "epoch": 1.9417643004824259, + "grad_norm": 28.195512771606445, + "learning_rate": 6.67059463005187e-08, + "logits/chosen": -1.6652758121490479, + "logits/rejected": -1.6094785928726196, + "logps/chosen": -178.05380249023438, + "logps/rejected": -245.34066772460938, + "loss": 0.495, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2248725891113281, + "rewards/margins": 0.7181330919265747, + "rewards/rejected": -1.9430058002471924, + "step": 11270 + }, + { + "epoch": 1.943487250172295, + "grad_norm": 24.995092391967773, + "learning_rate": 6.651696688284438e-08, + "logits/chosen": -1.680368185043335, + "logits/rejected": -1.633082628250122, + "logps/chosen": -196.35501098632812, + "logps/rejected": -246.5548095703125, + "loss": 0.5885, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4004287719726562, + "rewards/margins": 0.541008710861206, + "rewards/rejected": -1.9414373636245728, + "step": 11280 + }, + { + "epoch": 1.945210199862164, + "grad_norm": 27.881084442138672, + "learning_rate": 6.632812204699077e-08, + "logits/chosen": -1.6912187337875366, + "logits/rejected": -1.6491819620132446, + "logps/chosen": -184.676025390625, + "logps/rejected": -253.5763397216797, + "loss": 0.5159, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.2471721172332764, + "rewards/margins": 0.7271748781204224, + "rewards/rejected": -1.9743473529815674, + "step": 11290 + }, + { + "epoch": 1.946933149552033, + "grad_norm": 27.88582420349121, + "learning_rate": 6.613941255200147e-08, + "logits/chosen": -1.6142715215682983, + "logits/rejected": -1.5784027576446533, + "logps/chosen": -208.334228515625, + "logps/rejected": -247.870849609375, + "loss": 0.634, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.49410080909729, + "rewards/margins": 0.4474979043006897, + "rewards/rejected": -1.941598892211914, + "step": 11300 + }, + { + "epoch": 1.948656099241902, + "grad_norm": 33.243408203125, + "learning_rate": 6.595083915637602e-08, + "logits/chosen": -1.7283128499984741, + "logits/rejected": -1.6947336196899414, + "logps/chosen": -181.50262451171875, + "logps/rejected": -259.31378173828125, + "loss": 0.4967, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3103947639465332, + "rewards/margins": 0.7675163149833679, + "rewards/rejected": -2.077910900115967, + "step": 11310 + }, + { + "epoch": 1.9503790489317712, + "grad_norm": 17.72072982788086, + "learning_rate": 6.576240261806711e-08, + "logits/chosen": -1.6511691808700562, + "logits/rejected": -1.5994830131530762, + "logps/chosen": -189.26026916503906, + "logps/rejected": -262.7272033691406, + "loss": 0.5067, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3433868885040283, + "rewards/margins": 0.7447927594184875, + "rewards/rejected": -2.0881800651550293, + "step": 11320 + }, + { + "epoch": 1.9521019986216404, + "grad_norm": 34.079185485839844, + "learning_rate": 6.557410369447712e-08, + "logits/chosen": -1.643885612487793, + "logits/rejected": -1.5888869762420654, + "logps/chosen": -198.7957305908203, + "logps/rejected": -277.14239501953125, + "loss": 0.4892, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.4623702764511108, + "rewards/margins": 0.8277085423469543, + "rewards/rejected": -2.2900784015655518, + "step": 11330 + }, + { + "epoch": 1.9538249483115093, + "grad_norm": 38.53839874267578, + "learning_rate": 6.538594314245541e-08, + "logits/chosen": -1.5357722043991089, + "logits/rejected": -1.4847605228424072, + "logps/chosen": -210.06088256835938, + "logps/rejected": -287.0927734375, + "loss": 0.5374, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5468144416809082, + "rewards/margins": 0.7765219807624817, + "rewards/rejected": -2.323336124420166, + "step": 11340 + }, + { + "epoch": 1.9555478980013783, + "grad_norm": 21.716381072998047, + "learning_rate": 6.51979217182952e-08, + "logits/chosen": -1.6712570190429688, + "logits/rejected": -1.631908655166626, + "logps/chosen": -218.09591674804688, + "logps/rejected": -276.07403564453125, + "loss": 0.5825, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6118927001953125, + "rewards/margins": 0.5883620977401733, + "rewards/rejected": -2.2002549171447754, + "step": 11350 + }, + { + "epoch": 1.9572708476912473, + "grad_norm": 25.573238372802734, + "learning_rate": 6.501004017773049e-08, + "logits/chosen": -1.6209056377410889, + "logits/rejected": -1.5758745670318604, + "logps/chosen": -205.62026977539062, + "logps/rejected": -271.5562744140625, + "loss": 0.5405, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5336554050445557, + "rewards/margins": 0.6963235139846802, + "rewards/rejected": -2.2299787998199463, + "step": 11360 + }, + { + "epoch": 1.9589937973811165, + "grad_norm": 22.5719051361084, + "learning_rate": 6.482229927593292e-08, + "logits/chosen": -1.6068344116210938, + "logits/rejected": -1.5686242580413818, + "logps/chosen": -202.43106079101562, + "logps/rejected": -264.10528564453125, + "loss": 0.5468, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.461557149887085, + "rewards/margins": 0.6432181596755981, + "rewards/rejected": -2.1047754287719727, + "step": 11370 + }, + { + "epoch": 1.9607167470709856, + "grad_norm": 38.594482421875, + "learning_rate": 6.463469976750894e-08, + "logits/chosen": -1.56108558177948, + "logits/rejected": -1.512900710105896, + "logps/chosen": -202.19317626953125, + "logps/rejected": -269.82177734375, + "loss": 0.5556, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5035988092422485, + "rewards/margins": 0.6939393877983093, + "rewards/rejected": -2.197537899017334, + "step": 11380 + }, + { + "epoch": 1.9624396967608546, + "grad_norm": 27.154815673828125, + "learning_rate": 6.444724240649674e-08, + "logits/chosen": -1.7020747661590576, + "logits/rejected": -1.6445642709732056, + "logps/chosen": -185.58892822265625, + "logps/rejected": -272.3781433105469, + "loss": 0.5032, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3326423168182373, + "rewards/margins": 0.8379791378974915, + "rewards/rejected": -2.170621395111084, + "step": 11390 + }, + { + "epoch": 1.9641626464507236, + "grad_norm": 27.39109230041504, + "learning_rate": 6.425992794636305e-08, + "logits/chosen": -1.6527000665664673, + "logits/rejected": -1.6063038110733032, + "logps/chosen": -188.4795684814453, + "logps/rejected": -253.7546844482422, + "loss": 0.5426, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3256714344024658, + "rewards/margins": 0.6787670850753784, + "rewards/rejected": -2.0044384002685547, + "step": 11400 + }, + { + "epoch": 1.9658855961405926, + "grad_norm": 28.39838981628418, + "learning_rate": 6.407275714000029e-08, + "logits/chosen": -1.645401954650879, + "logits/rejected": -1.5962473154067993, + "logps/chosen": -192.2163543701172, + "logps/rejected": -256.794677734375, + "loss": 0.5371, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3653062582015991, + "rewards/margins": 0.6519485712051392, + "rewards/rejected": -2.0172548294067383, + "step": 11410 + }, + { + "epoch": 1.9676085458304617, + "grad_norm": 26.793954849243164, + "learning_rate": 6.388573073972341e-08, + "logits/chosen": -1.6396955251693726, + "logits/rejected": -1.6010195016860962, + "logps/chosen": -196.24127197265625, + "logps/rejected": -244.29257202148438, + "loss": 0.5792, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.425563097000122, + "rewards/margins": 0.514776349067688, + "rewards/rejected": -1.9403393268585205, + "step": 11420 + }, + { + "epoch": 1.969331495520331, + "grad_norm": 26.12976837158203, + "learning_rate": 6.3698849497267e-08, + "logits/chosen": -1.5891997814178467, + "logits/rejected": -1.5519187450408936, + "logps/chosen": -193.51866149902344, + "logps/rejected": -244.13540649414062, + "loss": 0.5918, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3854833841323853, + "rewards/margins": 0.529236376285553, + "rewards/rejected": -1.9147199392318726, + "step": 11430 + }, + { + "epoch": 1.9710544452102, + "grad_norm": 27.010652542114258, + "learning_rate": 6.351211416378221e-08, + "logits/chosen": -1.6656444072723389, + "logits/rejected": -1.6361305713653564, + "logps/chosen": -188.60604858398438, + "logps/rejected": -236.7830810546875, + "loss": 0.58, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.360559105873108, + "rewards/margins": 0.48112016916275024, + "rewards/rejected": -1.8416792154312134, + "step": 11440 + }, + { + "epoch": 1.9727773949000689, + "grad_norm": 28.18216323852539, + "learning_rate": 6.332552548983368e-08, + "logits/chosen": -1.623583436012268, + "logits/rejected": -1.5726052522659302, + "logps/chosen": -183.00462341308594, + "logps/rejected": -252.4422149658203, + "loss": 0.5145, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.302964687347412, + "rewards/margins": 0.704452395439148, + "rewards/rejected": -2.0074172019958496, + "step": 11450 + }, + { + "epoch": 1.9745003445899378, + "grad_norm": 26.0721492767334, + "learning_rate": 6.313908422539656e-08, + "logits/chosen": -1.6321194171905518, + "logits/rejected": -1.5928866863250732, + "logps/chosen": -181.80203247070312, + "logps/rejected": -252.208740234375, + "loss": 0.5153, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2626416683197021, + "rewards/margins": 0.7160847187042236, + "rewards/rejected": -1.9787263870239258, + "step": 11460 + }, + { + "epoch": 1.976223294279807, + "grad_norm": 17.767175674438477, + "learning_rate": 6.295279111985354e-08, + "logits/chosen": -1.6835839748382568, + "logits/rejected": -1.6219911575317383, + "logps/chosen": -193.47625732421875, + "logps/rejected": -266.53326416015625, + "loss": 0.4831, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3663376569747925, + "rewards/margins": 0.7892099618911743, + "rewards/rejected": -2.1555473804473877, + "step": 11470 + }, + { + "epoch": 1.9779462439696762, + "grad_norm": 25.063413619995117, + "learning_rate": 6.276664692199175e-08, + "logits/chosen": -1.7087328433990479, + "logits/rejected": -1.6501919031143188, + "logps/chosen": -179.31600952148438, + "logps/rejected": -239.3920440673828, + "loss": 0.5349, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2322156429290771, + "rewards/margins": 0.657407820224762, + "rewards/rejected": -1.8896234035491943, + "step": 11480 + }, + { + "epoch": 1.9796691936595452, + "grad_norm": 23.074682235717773, + "learning_rate": 6.258065237999988e-08, + "logits/chosen": -1.6238977909088135, + "logits/rejected": -1.5825841426849365, + "logps/chosen": -195.80282592773438, + "logps/rejected": -246.5172119140625, + "loss": 0.5983, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4111154079437256, + "rewards/margins": 0.5428776741027832, + "rewards/rejected": -1.9539932012557983, + "step": 11490 + }, + { + "epoch": 1.9813921433494142, + "grad_norm": 30.888504028320312, + "learning_rate": 6.239480824146503e-08, + "logits/chosen": -1.6757621765136719, + "logits/rejected": -1.6433537006378174, + "logps/chosen": -181.2732696533203, + "logps/rejected": -229.3321075439453, + "loss": 0.58, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3098499774932861, + "rewards/margins": 0.47558125853538513, + "rewards/rejected": -1.785431146621704, + "step": 11500 + }, + { + "epoch": 1.9831150930392831, + "grad_norm": 31.92888069152832, + "learning_rate": 6.220911525336977e-08, + "logits/chosen": -1.5900300741195679, + "logits/rejected": -1.556713342666626, + "logps/chosen": -194.94851684570312, + "logps/rejected": -261.088623046875, + "loss": 0.5596, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.394680380821228, + "rewards/margins": 0.6835174560546875, + "rewards/rejected": -2.078198194503784, + "step": 11510 + }, + { + "epoch": 1.9848380427291523, + "grad_norm": 24.679962158203125, + "learning_rate": 6.202357416208911e-08, + "logits/chosen": -1.6697218418121338, + "logits/rejected": -1.595521330833435, + "logps/chosen": -186.24783325195312, + "logps/rejected": -250.603515625, + "loss": 0.4994, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2644917964935303, + "rewards/margins": 0.7417808771133423, + "rewards/rejected": -2.006272792816162, + "step": 11520 + }, + { + "epoch": 1.9865609924190215, + "grad_norm": 19.900346755981445, + "learning_rate": 6.183818571338766e-08, + "logits/chosen": -1.661076545715332, + "logits/rejected": -1.6121562719345093, + "logps/chosen": -179.8099822998047, + "logps/rejected": -233.569580078125, + "loss": 0.5344, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2634823322296143, + "rewards/margins": 0.5821090340614319, + "rewards/rejected": -1.8455913066864014, + "step": 11530 + }, + { + "epoch": 1.9882839421088905, + "grad_norm": 27.895814895629883, + "learning_rate": 6.165295065241633e-08, + "logits/chosen": -1.6979827880859375, + "logits/rejected": -1.6585216522216797, + "logps/chosen": -180.02334594726562, + "logps/rejected": -265.24761962890625, + "loss": 0.4854, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.2927592992782593, + "rewards/margins": 0.8074501156806946, + "rewards/rejected": -2.1002094745635986, + "step": 11540 + }, + { + "epoch": 1.9900068917987594, + "grad_norm": 24.11145782470703, + "learning_rate": 6.146786972370959e-08, + "logits/chosen": -1.625811219215393, + "logits/rejected": -1.5916721820831299, + "logps/chosen": -190.41238403320312, + "logps/rejected": -249.06112670898438, + "loss": 0.5833, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.39082932472229, + "rewards/margins": 0.5714894533157349, + "rewards/rejected": -1.962318778038025, + "step": 11550 + }, + { + "epoch": 1.9917298414886284, + "grad_norm": 24.088064193725586, + "learning_rate": 6.128294367118237e-08, + "logits/chosen": -1.6487737894058228, + "logits/rejected": -1.5956158638000488, + "logps/chosen": -194.94357299804688, + "logps/rejected": -259.94940185546875, + "loss": 0.5307, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3996059894561768, + "rewards/margins": 0.6820842027664185, + "rewards/rejected": -2.0816903114318848, + "step": 11560 + }, + { + "epoch": 1.9934527911784976, + "grad_norm": 33.55830383300781, + "learning_rate": 6.109817323812706e-08, + "logits/chosen": -1.641147255897522, + "logits/rejected": -1.599494218826294, + "logps/chosen": -205.94955444335938, + "logps/rejected": -271.0900573730469, + "loss": 0.5525, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.518385410308838, + "rewards/margins": 0.635517954826355, + "rewards/rejected": -2.1539034843444824, + "step": 11570 + }, + { + "epoch": 1.9951757408683668, + "grad_norm": 27.222890853881836, + "learning_rate": 6.091355916721064e-08, + "logits/chosen": -1.7054399251937866, + "logits/rejected": -1.6661192178726196, + "logps/chosen": -194.77606201171875, + "logps/rejected": -263.6825866699219, + "loss": 0.5325, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.385178565979004, + "rewards/margins": 0.6949024796485901, + "rewards/rejected": -2.0800812244415283, + "step": 11580 + }, + { + "epoch": 1.9968986905582358, + "grad_norm": 28.632381439208984, + "learning_rate": 6.072910220047159e-08, + "logits/chosen": -1.5894039869308472, + "logits/rejected": -1.5394232273101807, + "logps/chosen": -193.7826385498047, + "logps/rejected": -244.916748046875, + "loss": 0.5385, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.342811942100525, + "rewards/margins": 0.5961362719535828, + "rewards/rejected": -1.938948392868042, + "step": 11590 + }, + { + "epoch": 1.9986216402481047, + "grad_norm": 28.379253387451172, + "learning_rate": 6.054480307931678e-08, + "logits/chosen": -1.657523512840271, + "logits/rejected": -1.622125267982483, + "logps/chosen": -178.28390502929688, + "logps/rejected": -239.18045043945312, + "loss": 0.5556, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2783455848693848, + "rewards/margins": 0.5756335258483887, + "rewards/rejected": -1.8539791107177734, + "step": 11600 + }, + { + "epoch": 1.9986216402481047, + "eval_logits/chosen": -1.7579905986785889, + "eval_logits/rejected": -1.7355717420578003, + "eval_logps/chosen": -185.22940063476562, + "eval_logps/rejected": -220.3213653564453, + "eval_loss": 0.6294985413551331, + "eval_rewards/accuracies": 0.6361523866653442, + "eval_rewards/chosen": -1.2651748657226562, + "eval_rewards/margins": 0.3062376379966736, + "eval_rewards/rejected": -1.571412444114685, + "eval_runtime": 384.4299, + "eval_samples_per_second": 11.196, + "eval_steps_per_second": 1.399, + "step": 11600 + }, + { + "epoch": 2.0003445899379737, + "grad_norm": 24.472761154174805, + "learning_rate": 6.036066254451881e-08, + "logits/chosen": -1.646522879600525, + "logits/rejected": -1.6024043560028076, + "logps/chosen": -181.7415313720703, + "logps/rejected": -251.5871124267578, + "loss": 0.5436, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3067373037338257, + "rewards/margins": 0.6961520314216614, + "rewards/rejected": -2.0028891563415527, + "step": 11610 + }, + { + "epoch": 2.0020675396278427, + "grad_norm": 29.493276596069336, + "learning_rate": 6.017668133621275e-08, + "logits/chosen": -1.649038314819336, + "logits/rejected": -1.606300950050354, + "logps/chosen": -189.8376007080078, + "logps/rejected": -259.1862487792969, + "loss": 0.5138, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3303192853927612, + "rewards/margins": 0.6951020956039429, + "rewards/rejected": -2.025421142578125, + "step": 11620 + }, + { + "epoch": 2.003790489317712, + "grad_norm": 20.440237045288086, + "learning_rate": 5.999286019389342e-08, + "logits/chosen": -1.6871811151504517, + "logits/rejected": -1.6313413381576538, + "logps/chosen": -183.3996124267578, + "logps/rejected": -255.48953247070312, + "loss": 0.4822, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2532579898834229, + "rewards/margins": 0.7730938196182251, + "rewards/rejected": -2.0263514518737793, + "step": 11630 + }, + { + "epoch": 2.005513439007581, + "grad_norm": 22.95535659790039, + "learning_rate": 5.980919985641202e-08, + "logits/chosen": -1.697274923324585, + "logits/rejected": -1.6537383794784546, + "logps/chosen": -180.3485870361328, + "logps/rejected": -251.100830078125, + "loss": 0.4927, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2791554927825928, + "rewards/margins": 0.7304551601409912, + "rewards/rejected": -2.009610652923584, + "step": 11640 + }, + { + "epoch": 2.00723638869745, + "grad_norm": 26.56332778930664, + "learning_rate": 5.962570106197364e-08, + "logits/chosen": -1.5988438129425049, + "logits/rejected": -1.5431615114212036, + "logps/chosen": -177.05831909179688, + "logps/rejected": -246.145751953125, + "loss": 0.5147, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2243263721466064, + "rewards/margins": 0.725864052772522, + "rewards/rejected": -1.9501903057098389, + "step": 11650 + }, + { + "epoch": 2.008959338387319, + "grad_norm": 21.601728439331055, + "learning_rate": 5.944236454813396e-08, + "logits/chosen": -1.7152572870254517, + "logits/rejected": -1.6425163745880127, + "logps/chosen": -194.8033447265625, + "logps/rejected": -266.7611389160156, + "loss": 0.5118, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3984410762786865, + "rewards/margins": 0.7663156390190125, + "rewards/rejected": -2.1647565364837646, + "step": 11660 + }, + { + "epoch": 2.010682288077188, + "grad_norm": 24.720033645629883, + "learning_rate": 5.9259191051796375e-08, + "logits/chosen": -1.6166601181030273, + "logits/rejected": -1.5559755563735962, + "logps/chosen": -198.079833984375, + "logps/rejected": -292.06951904296875, + "loss": 0.4613, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4261969327926636, + "rewards/margins": 0.9553655385971069, + "rewards/rejected": -2.3815624713897705, + "step": 11670 + }, + { + "epoch": 2.0124052377670574, + "grad_norm": 31.20380973815918, + "learning_rate": 5.907618130920919e-08, + "logits/chosen": -1.5339603424072266, + "logits/rejected": -1.4910167455673218, + "logps/chosen": -205.57272338867188, + "logps/rejected": -289.71307373046875, + "loss": 0.5066, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4919240474700928, + "rewards/margins": 0.8347158432006836, + "rewards/rejected": -2.3266398906707764, + "step": 11680 + }, + { + "epoch": 2.0141281874569263, + "grad_norm": 41.9987678527832, + "learning_rate": 5.8893336055962254e-08, + "logits/chosen": -1.6653470993041992, + "logits/rejected": -1.61074697971344, + "logps/chosen": -206.09677124023438, + "logps/rejected": -269.8291931152344, + "loss": 0.5325, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4608094692230225, + "rewards/margins": 0.695176362991333, + "rewards/rejected": -2.1559858322143555, + "step": 11690 + }, + { + "epoch": 2.0158511371467953, + "grad_norm": 24.3171443939209, + "learning_rate": 5.871065602698451e-08, + "logits/chosen": -1.722753882408142, + "logits/rejected": -1.6735206842422485, + "logps/chosen": -190.55506896972656, + "logps/rejected": -254.53768920898438, + "loss": 0.5476, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.337207555770874, + "rewards/margins": 0.6379891633987427, + "rewards/rejected": -1.9751968383789062, + "step": 11700 + }, + { + "epoch": 2.0175740868366643, + "grad_norm": 39.6287727355957, + "learning_rate": 5.852814195654068e-08, + "logits/chosen": -1.6527093648910522, + "logits/rejected": -1.6069351434707642, + "logps/chosen": -199.5983123779297, + "logps/rejected": -264.60858154296875, + "loss": 0.5308, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4391547441482544, + "rewards/margins": 0.6757218241691589, + "rewards/rejected": -2.1148767471313477, + "step": 11710 + }, + { + "epoch": 2.0192970365265333, + "grad_norm": 26.280908584594727, + "learning_rate": 5.834579457822848e-08, + "logits/chosen": -1.6911094188690186, + "logits/rejected": -1.649491548538208, + "logps/chosen": -200.91928100585938, + "logps/rejected": -268.92919921875, + "loss": 0.5565, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.473656415939331, + "rewards/margins": 0.6882593035697937, + "rewards/rejected": -2.1619155406951904, + "step": 11720 + }, + { + "epoch": 2.0210199862164027, + "grad_norm": 20.197315216064453, + "learning_rate": 5.81636146249756e-08, + "logits/chosen": -1.6321799755096436, + "logits/rejected": -1.5866422653198242, + "logps/chosen": -190.90988159179688, + "logps/rejected": -278.8709716796875, + "loss": 0.4776, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3572702407836914, + "rewards/margins": 0.8569647669792175, + "rewards/rejected": -2.2142350673675537, + "step": 11730 + }, + { + "epoch": 2.0227429359062716, + "grad_norm": 22.812088012695312, + "learning_rate": 5.798160282903672e-08, + "logits/chosen": -1.701157808303833, + "logits/rejected": -1.6628563404083252, + "logps/chosen": -185.42343139648438, + "logps/rejected": -252.4224090576172, + "loss": 0.5333, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3248282670974731, + "rewards/margins": 0.6645230650901794, + "rewards/rejected": -1.989351511001587, + "step": 11740 + }, + { + "epoch": 2.0244658855961406, + "grad_norm": 21.9329891204834, + "learning_rate": 5.779975992199075e-08, + "logits/chosen": -1.6952497959136963, + "logits/rejected": -1.661820411682129, + "logps/chosen": -179.97988891601562, + "logps/rejected": -272.5703125, + "loss": 0.4591, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.3170697689056396, + "rewards/margins": 0.8572857975959778, + "rewards/rejected": -2.1743557453155518, + "step": 11750 + }, + { + "epoch": 2.0261888352860096, + "grad_norm": 24.972156524658203, + "learning_rate": 5.761808663473775e-08, + "logits/chosen": -1.6464498043060303, + "logits/rejected": -1.6037495136260986, + "logps/chosen": -195.52725219726562, + "logps/rejected": -272.06707763671875, + "loss": 0.5244, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.407350778579712, + "rewards/margins": 0.7783458828926086, + "rewards/rejected": -2.185697078704834, + "step": 11760 + }, + { + "epoch": 2.0279117849758785, + "grad_norm": 23.84781265258789, + "learning_rate": 5.74365836974959e-08, + "logits/chosen": -1.5857475996017456, + "logits/rejected": -1.5295172929763794, + "logps/chosen": -193.87759399414062, + "logps/rejected": -268.18048095703125, + "loss": 0.5116, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3709337711334229, + "rewards/margins": 0.7918566465377808, + "rewards/rejected": -2.162790536880493, + "step": 11770 + }, + { + "epoch": 2.029634734665748, + "grad_norm": 35.31039047241211, + "learning_rate": 5.7255251839798726e-08, + "logits/chosen": -1.5392212867736816, + "logits/rejected": -1.4999496936798096, + "logps/chosen": -199.56455993652344, + "logps/rejected": -269.37847900390625, + "loss": 0.543, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4918761253356934, + "rewards/margins": 0.6816176176071167, + "rewards/rejected": -2.1734938621520996, + "step": 11780 + }, + { + "epoch": 2.031357684355617, + "grad_norm": 31.171165466308594, + "learning_rate": 5.7074091790492206e-08, + "logits/chosen": -1.6171038150787354, + "logits/rejected": -1.5785691738128662, + "logps/chosen": -181.83547973632812, + "logps/rejected": -295.6371765136719, + "loss": 0.4036, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.3244999647140503, + "rewards/margins": 1.0919300317764282, + "rewards/rejected": -2.4164299964904785, + "step": 11790 + }, + { + "epoch": 2.033080634045486, + "grad_norm": 21.47528648376465, + "learning_rate": 5.6893104277731594e-08, + "logits/chosen": -1.6073768138885498, + "logits/rejected": -1.5689818859100342, + "logps/chosen": -214.50344848632812, + "logps/rejected": -277.74688720703125, + "loss": 0.5375, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5738976001739502, + "rewards/margins": 0.6701092720031738, + "rewards/rejected": -2.244006633758545, + "step": 11800 + }, + { + "epoch": 2.034803583735355, + "grad_norm": 32.091861724853516, + "learning_rate": 5.6712290028978815e-08, + "logits/chosen": -1.567014217376709, + "logits/rejected": -1.528381109237671, + "logps/chosen": -220.6244659423828, + "logps/rejected": -298.44293212890625, + "loss": 0.5138, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6461851596832275, + "rewards/margins": 0.810315728187561, + "rewards/rejected": -2.456500768661499, + "step": 11810 + }, + { + "epoch": 2.036526533425224, + "grad_norm": 25.720264434814453, + "learning_rate": 5.653164977099921e-08, + "logits/chosen": -1.5711066722869873, + "logits/rejected": -1.5318381786346436, + "logps/chosen": -213.1315460205078, + "logps/rejected": -294.34356689453125, + "loss": 0.5246, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.632672905921936, + "rewards/margins": 0.7945891618728638, + "rewards/rejected": -2.4272620677948, + "step": 11820 + }, + { + "epoch": 2.0382494831150932, + "grad_norm": 35.8724479675293, + "learning_rate": 5.635118422985896e-08, + "logits/chosen": -1.5579307079315186, + "logits/rejected": -1.5190536975860596, + "logps/chosen": -197.6388702392578, + "logps/rejected": -288.42901611328125, + "loss": 0.4808, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4660093784332275, + "rewards/margins": 0.8728569149971008, + "rewards/rejected": -2.3388664722442627, + "step": 11830 + }, + { + "epoch": 2.039972432804962, + "grad_norm": 28.59819793701172, + "learning_rate": 5.61708941309218e-08, + "logits/chosen": -1.5615540742874146, + "logits/rejected": -1.5118497610092163, + "logps/chosen": -211.1688995361328, + "logps/rejected": -299.4646301269531, + "loss": 0.4795, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5894633531570435, + "rewards/margins": 0.8861982226371765, + "rewards/rejected": -2.4756617546081543, + "step": 11840 + }, + { + "epoch": 2.041695382494831, + "grad_norm": 33.376678466796875, + "learning_rate": 5.5990780198846435e-08, + "logits/chosen": -1.4968514442443848, + "logits/rejected": -1.4367352724075317, + "logps/chosen": -224.6299285888672, + "logps/rejected": -293.9331970214844, + "loss": 0.5316, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6985681056976318, + "rewards/margins": 0.7514779567718506, + "rewards/rejected": -2.4500460624694824, + "step": 11850 + }, + { + "epoch": 2.0434183321847, + "grad_norm": 37.74618911743164, + "learning_rate": 5.581084315758351e-08, + "logits/chosen": -1.6089210510253906, + "logits/rejected": -1.580612301826477, + "logps/chosen": -235.47769165039062, + "logps/rejected": -290.8262939453125, + "loss": 0.606, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.8060928583145142, + "rewards/margins": 0.5267369747161865, + "rewards/rejected": -2.332829236984253, + "step": 11860 + }, + { + "epoch": 2.045141281874569, + "grad_norm": 27.94704246520996, + "learning_rate": 5.563108373037243e-08, + "logits/chosen": -1.544832468032837, + "logits/rejected": -1.501721978187561, + "logps/chosen": -203.64990234375, + "logps/rejected": -291.85693359375, + "loss": 0.5009, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5535991191864014, + "rewards/margins": 0.8287463188171387, + "rewards/rejected": -2.38234543800354, + "step": 11870 + }, + { + "epoch": 2.0468642315644385, + "grad_norm": 46.3896598815918, + "learning_rate": 5.545150263973897e-08, + "logits/chosen": -1.568872332572937, + "logits/rejected": -1.5166070461273193, + "logps/chosen": -212.12838745117188, + "logps/rejected": -299.6388244628906, + "loss": 0.465, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5504481792449951, + "rewards/margins": 0.8933519124984741, + "rewards/rejected": -2.443800449371338, + "step": 11880 + }, + { + "epoch": 2.0485871812543075, + "grad_norm": 28.92313575744629, + "learning_rate": 5.527210060749201e-08, + "logits/chosen": -1.5919010639190674, + "logits/rejected": -1.5410182476043701, + "logps/chosen": -220.18118286132812, + "logps/rejected": -303.7860107421875, + "loss": 0.495, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6413625478744507, + "rewards/margins": 0.8713878393173218, + "rewards/rejected": -2.5127501487731934, + "step": 11890 + }, + { + "epoch": 2.0503101309441765, + "grad_norm": 34.28177261352539, + "learning_rate": 5.509287835472067e-08, + "logits/chosen": -1.4587892293930054, + "logits/rejected": -1.4046921730041504, + "logps/chosen": -217.0598602294922, + "logps/rejected": -312.3211364746094, + "loss": 0.4782, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6380027532577515, + "rewards/margins": 0.9663599729537964, + "rewards/rejected": -2.604362726211548, + "step": 11900 + }, + { + "epoch": 2.0520330806340454, + "grad_norm": 32.117855072021484, + "learning_rate": 5.4913836601791497e-08, + "logits/chosen": -1.6434059143066406, + "logits/rejected": -1.6171928644180298, + "logps/chosen": -219.04653930664062, + "logps/rejected": -309.12176513671875, + "loss": 0.5203, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.670095443725586, + "rewards/margins": 0.8413150906562805, + "rewards/rejected": -2.511410713195801, + "step": 11910 + }, + { + "epoch": 2.0537560303239144, + "grad_norm": 29.40235137939453, + "learning_rate": 5.473497606834554e-08, + "logits/chosen": -1.5855402946472168, + "logits/rejected": -1.5456552505493164, + "logps/chosen": -226.11276245117188, + "logps/rejected": -300.3972473144531, + "loss": 0.5177, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6782705783843994, + "rewards/margins": 0.7569664716720581, + "rewards/rejected": -2.435236692428589, + "step": 11920 + }, + { + "epoch": 2.055478980013784, + "grad_norm": 23.78101348876953, + "learning_rate": 5.4556297473295496e-08, + "logits/chosen": -1.5875284671783447, + "logits/rejected": -1.541799783706665, + "logps/chosen": -206.77352905273438, + "logps/rejected": -298.059326171875, + "loss": 0.4563, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5398340225219727, + "rewards/margins": 0.8825027346611023, + "rewards/rejected": -2.4223365783691406, + "step": 11930 + }, + { + "epoch": 2.057201929703653, + "grad_norm": 25.40555763244629, + "learning_rate": 5.4377801534822676e-08, + "logits/chosen": -1.5478023290634155, + "logits/rejected": -1.4946033954620361, + "logps/chosen": -205.4107208251953, + "logps/rejected": -288.2338562011719, + "loss": 0.4869, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5127077102661133, + "rewards/margins": 0.8377426266670227, + "rewards/rejected": -2.350450277328491, + "step": 11940 + }, + { + "epoch": 2.0589248793935218, + "grad_norm": 36.173519134521484, + "learning_rate": 5.419948897037436e-08, + "logits/chosen": -1.655509352684021, + "logits/rejected": -1.6048641204833984, + "logps/chosen": -219.49755859375, + "logps/rejected": -291.3073425292969, + "loss": 0.5625, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6178226470947266, + "rewards/margins": 0.7313393354415894, + "rewards/rejected": -2.3491616249084473, + "step": 11950 + }, + { + "epoch": 2.0606478290833907, + "grad_norm": 39.75484848022461, + "learning_rate": 5.4021360496660614e-08, + "logits/chosen": -1.5307501554489136, + "logits/rejected": -1.4881916046142578, + "logps/chosen": -209.718505859375, + "logps/rejected": -300.6219787597656, + "loss": 0.4911, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5740060806274414, + "rewards/margins": 0.904276967048645, + "rewards/rejected": -2.478283166885376, + "step": 11960 + }, + { + "epoch": 2.0623707787732597, + "grad_norm": 23.72635841369629, + "learning_rate": 5.3843416829651713e-08, + "logits/chosen": -1.6101815700531006, + "logits/rejected": -1.5637496709823608, + "logps/chosen": -210.59402465820312, + "logps/rejected": -288.64813232421875, + "loss": 0.494, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5444914102554321, + "rewards/margins": 0.8065498471260071, + "rewards/rejected": -2.351041078567505, + "step": 11970 + }, + { + "epoch": 2.0640937284631287, + "grad_norm": 26.937618255615234, + "learning_rate": 5.3665658684574975e-08, + "logits/chosen": -1.612941026687622, + "logits/rejected": -1.5580946207046509, + "logps/chosen": -199.7908172607422, + "logps/rejected": -284.44891357421875, + "loss": 0.4494, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4498227834701538, + "rewards/margins": 0.8510910868644714, + "rewards/rejected": -2.3009140491485596, + "step": 11980 + }, + { + "epoch": 2.065816678152998, + "grad_norm": 28.87058448791504, + "learning_rate": 5.348808677591222e-08, + "logits/chosen": -1.6719602346420288, + "logits/rejected": -1.6104381084442139, + "logps/chosen": -204.49407958984375, + "logps/rejected": -271.0589294433594, + "loss": 0.5225, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.468535304069519, + "rewards/margins": 0.7241191864013672, + "rewards/rejected": -2.1926543712615967, + "step": 11990 + }, + { + "epoch": 2.067539627842867, + "grad_norm": 27.164878845214844, + "learning_rate": 5.331070181739654e-08, + "logits/chosen": -1.5682309865951538, + "logits/rejected": -1.5209156274795532, + "logps/chosen": -198.87789916992188, + "logps/rejected": -287.53662109375, + "loss": 0.4901, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.482013463973999, + "rewards/margins": 0.8468286395072937, + "rewards/rejected": -2.3288419246673584, + "step": 12000 + }, + { + "epoch": 2.067539627842867, + "eval_logits/chosen": -1.6928431987762451, + "eval_logits/rejected": -1.6688482761383057, + "eval_logps/chosen": -206.20091247558594, + "eval_logps/rejected": -245.6420440673828, + "eval_loss": 0.6303159594535828, + "eval_rewards/accuracies": 0.6447490453720093, + "eval_rewards/chosen": -1.4748902320861816, + "eval_rewards/margins": 0.34972894191741943, + "eval_rewards/rejected": -1.8246192932128906, + "eval_runtime": 384.922, + "eval_samples_per_second": 11.181, + "eval_steps_per_second": 1.398, + "step": 12000 + }, + { + "epoch": 2.069262577532736, + "grad_norm": 24.481056213378906, + "learning_rate": 5.313350452200962e-08, + "logits/chosen": -1.6024847030639648, + "logits/rejected": -1.5504302978515625, + "logps/chosen": -218.79531860351562, + "logps/rejected": -306.15643310546875, + "loss": 0.4969, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.638654112815857, + "rewards/margins": 0.8658208847045898, + "rewards/rejected": -2.5044751167297363, + "step": 12010 + }, + { + "epoch": 2.070985527222605, + "grad_norm": 32.93242263793945, + "learning_rate": 5.295649560197895e-08, + "logits/chosen": -1.6048576831817627, + "logits/rejected": -1.5665851831436157, + "logps/chosen": -201.8533172607422, + "logps/rejected": -278.32830810546875, + "loss": 0.5261, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4912683963775635, + "rewards/margins": 0.7643145322799683, + "rewards/rejected": -2.255582809448242, + "step": 12020 + }, + { + "epoch": 2.072708476912474, + "grad_norm": 30.427339553833008, + "learning_rate": 5.27796757687748e-08, + "logits/chosen": -1.71212899684906, + "logits/rejected": -1.6749742031097412, + "logps/chosen": -188.22976684570312, + "logps/rejected": -254.19546508789062, + "loss": 0.5499, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3920142650604248, + "rewards/margins": 0.6375147700309753, + "rewards/rejected": -2.029529094696045, + "step": 12030 + }, + { + "epoch": 2.0744314266023434, + "grad_norm": 26.58029556274414, + "learning_rate": 5.260304573310743e-08, + "logits/chosen": -1.5901178121566772, + "logits/rejected": -1.539693832397461, + "logps/chosen": -204.1492156982422, + "logps/rejected": -282.22393798828125, + "loss": 0.5055, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.484749674797058, + "rewards/margins": 0.826759934425354, + "rewards/rejected": -2.311509609222412, + "step": 12040 + }, + { + "epoch": 2.0761543762922123, + "grad_norm": 32.788917541503906, + "learning_rate": 5.242660620492416e-08, + "logits/chosen": -1.6245148181915283, + "logits/rejected": -1.5556128025054932, + "logps/chosen": -203.12457275390625, + "logps/rejected": -294.1376953125, + "loss": 0.4289, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.4886772632598877, + "rewards/margins": 0.940477728843689, + "rewards/rejected": -2.429154872894287, + "step": 12050 + }, + { + "epoch": 2.0778773259820813, + "grad_norm": 28.661020278930664, + "learning_rate": 5.2250357893406703e-08, + "logits/chosen": -1.5994634628295898, + "logits/rejected": -1.5617071390151978, + "logps/chosen": -203.6561737060547, + "logps/rejected": -275.8608703613281, + "loss": 0.5248, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4781253337860107, + "rewards/margins": 0.7392922639846802, + "rewards/rejected": -2.2174174785614014, + "step": 12060 + }, + { + "epoch": 2.0796002756719503, + "grad_norm": 27.835433959960938, + "learning_rate": 5.2074301506968165e-08, + "logits/chosen": -1.5991908311843872, + "logits/rejected": -1.5303010940551758, + "logps/chosen": -199.74252319335938, + "logps/rejected": -295.2624206542969, + "loss": 0.4498, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.4509713649749756, + "rewards/margins": 1.0048578977584839, + "rewards/rejected": -2.45582914352417, + "step": 12070 + }, + { + "epoch": 2.0813232253618192, + "grad_norm": 41.754215240478516, + "learning_rate": 5.189843775325018e-08, + "logits/chosen": -1.492857575416565, + "logits/rejected": -1.4475312232971191, + "logps/chosen": -198.62515258789062, + "logps/rejected": -295.5875244140625, + "loss": 0.4505, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4303429126739502, + "rewards/margins": 0.9779750108718872, + "rewards/rejected": -2.408318042755127, + "step": 12080 + }, + { + "epoch": 2.0830461750516887, + "grad_norm": 39.57783508300781, + "learning_rate": 5.172276733912009e-08, + "logits/chosen": -1.5722321271896362, + "logits/rejected": -1.5255016088485718, + "logps/chosen": -215.2079620361328, + "logps/rejected": -291.56878662109375, + "loss": 0.5355, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5862327814102173, + "rewards/margins": 0.7993678450584412, + "rewards/rejected": -2.3856005668640137, + "step": 12090 + }, + { + "epoch": 2.0847691247415576, + "grad_norm": 51.323238372802734, + "learning_rate": 5.1547290970668243e-08, + "logits/chosen": -1.5735671520233154, + "logits/rejected": -1.5332248210906982, + "logps/chosen": -205.8524169921875, + "logps/rejected": -281.3876953125, + "loss": 0.5143, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4781602621078491, + "rewards/margins": 0.813062310218811, + "rewards/rejected": -2.29122257232666, + "step": 12100 + }, + { + "epoch": 2.0864920744314266, + "grad_norm": 31.750356674194336, + "learning_rate": 5.13720093532049e-08, + "logits/chosen": -1.6060386896133423, + "logits/rejected": -1.557785153388977, + "logps/chosen": -228.09848022460938, + "logps/rejected": -296.2030029296875, + "loss": 0.5508, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7111345529556274, + "rewards/margins": 0.7136415243148804, + "rewards/rejected": -2.424776077270508, + "step": 12110 + }, + { + "epoch": 2.0882150241212956, + "grad_norm": 37.41475296020508, + "learning_rate": 5.1196923191257654e-08, + "logits/chosen": -1.5627740621566772, + "logits/rejected": -1.516533613204956, + "logps/chosen": -216.0414276123047, + "logps/rejected": -286.8224792480469, + "loss": 0.5469, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6007273197174072, + "rewards/margins": 0.7471935749053955, + "rewards/rejected": -2.3479208946228027, + "step": 12120 + }, + { + "epoch": 2.0899379738111645, + "grad_norm": 20.90497589111328, + "learning_rate": 5.102203318856847e-08, + "logits/chosen": -1.5501728057861328, + "logits/rejected": -1.491492509841919, + "logps/chosen": -205.28103637695312, + "logps/rejected": -298.6039123535156, + "loss": 0.4356, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5086464881896973, + "rewards/margins": 0.9435533285140991, + "rewards/rejected": -2.4521994590759277, + "step": 12130 + }, + { + "epoch": 2.091660923501034, + "grad_norm": 40.72807312011719, + "learning_rate": 5.084734004809079e-08, + "logits/chosen": -1.5945508480072021, + "logits/rejected": -1.5598722696304321, + "logps/chosen": -214.08544921875, + "logps/rejected": -284.52252197265625, + "loss": 0.5566, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6211093664169312, + "rewards/margins": 0.6779786944389343, + "rewards/rejected": -2.2990882396698, + "step": 12140 + }, + { + "epoch": 2.093383873190903, + "grad_norm": 85.24677276611328, + "learning_rate": 5.0672844471986806e-08, + "logits/chosen": -1.580863118171692, + "logits/rejected": -1.538017988204956, + "logps/chosen": -198.13323974609375, + "logps/rejected": -282.21923828125, + "loss": 0.5177, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.447094440460205, + "rewards/margins": 0.8471297025680542, + "rewards/rejected": -2.294224262237549, + "step": 12150 + }, + { + "epoch": 2.095106822880772, + "grad_norm": 46.52803039550781, + "learning_rate": 5.049854716162469e-08, + "logits/chosen": -1.527610421180725, + "logits/rejected": -1.4756158590316772, + "logps/chosen": -205.2031707763672, + "logps/rejected": -281.08294677734375, + "loss": 0.5246, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4923702478408813, + "rewards/margins": 0.7814701199531555, + "rewards/rejected": -2.2738404273986816, + "step": 12160 + }, + { + "epoch": 2.096829772570641, + "grad_norm": 25.297910690307617, + "learning_rate": 5.032444881757575e-08, + "logits/chosen": -1.5589890480041504, + "logits/rejected": -1.5120322704315186, + "logps/chosen": -207.8610076904297, + "logps/rejected": -290.50433349609375, + "loss": 0.5286, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5336462259292603, + "rewards/margins": 0.8126686215400696, + "rewards/rejected": -2.3463149070739746, + "step": 12170 + }, + { + "epoch": 2.09855272226051, + "grad_norm": 24.670923233032227, + "learning_rate": 5.015055013961129e-08, + "logits/chosen": -1.5066965818405151, + "logits/rejected": -1.46164870262146, + "logps/chosen": -208.98617553710938, + "logps/rejected": -316.40887451171875, + "loss": 0.4464, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6038252115249634, + "rewards/margins": 0.9800359010696411, + "rewards/rejected": -2.5838611125946045, + "step": 12180 + }, + { + "epoch": 2.1002756719503792, + "grad_norm": 27.564498901367188, + "learning_rate": 4.9976851826700385e-08, + "logits/chosen": -1.5501229763031006, + "logits/rejected": -1.4985682964324951, + "logps/chosen": -211.6484375, + "logps/rejected": -279.1171875, + "loss": 0.532, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5453133583068848, + "rewards/margins": 0.7092097997665405, + "rewards/rejected": -2.254523277282715, + "step": 12190 + }, + { + "epoch": 2.101998621640248, + "grad_norm": 27.813444137573242, + "learning_rate": 4.980335457700665e-08, + "logits/chosen": -1.642297387123108, + "logits/rejected": -1.5962289571762085, + "logps/chosen": -208.8214111328125, + "logps/rejected": -290.8011169433594, + "loss": 0.48, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.522281289100647, + "rewards/margins": 0.843533992767334, + "rewards/rejected": -2.3658149242401123, + "step": 12200 + }, + { + "epoch": 2.103721571330117, + "grad_norm": 30.052356719970703, + "learning_rate": 4.963005908788547e-08, + "logits/chosen": -1.615196943283081, + "logits/rejected": -1.5772802829742432, + "logps/chosen": -208.3212432861328, + "logps/rejected": -278.3284606933594, + "loss": 0.5273, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5461933612823486, + "rewards/margins": 0.719495415687561, + "rewards/rejected": -2.26568865776062, + "step": 12210 + }, + { + "epoch": 2.105444521019986, + "grad_norm": 43.54231262207031, + "learning_rate": 4.945696605588143e-08, + "logits/chosen": -1.5061839818954468, + "logits/rejected": -1.4668266773223877, + "logps/chosen": -211.716552734375, + "logps/rejected": -289.04095458984375, + "loss": 0.5089, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5821346044540405, + "rewards/margins": 0.7677549123764038, + "rewards/rejected": -2.3498897552490234, + "step": 12220 + }, + { + "epoch": 2.107167470709855, + "grad_norm": 56.09331512451172, + "learning_rate": 4.928407617672519e-08, + "logits/chosen": -1.5421305894851685, + "logits/rejected": -1.5023183822631836, + "logps/chosen": -217.19921875, + "logps/rejected": -291.5745849609375, + "loss": 0.556, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6528059244155884, + "rewards/margins": 0.7489153742790222, + "rewards/rejected": -2.401721477508545, + "step": 12230 + }, + { + "epoch": 2.1088904203997245, + "grad_norm": 41.50090026855469, + "learning_rate": 4.911139014533099e-08, + "logits/chosen": -1.513869047164917, + "logits/rejected": -1.4628936052322388, + "logps/chosen": -205.1170196533203, + "logps/rejected": -282.4010925292969, + "loss": 0.5169, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5244477987289429, + "rewards/margins": 0.7733960747718811, + "rewards/rejected": -2.2978439331054688, + "step": 12240 + }, + { + "epoch": 2.1106133700895935, + "grad_norm": 39.326995849609375, + "learning_rate": 4.893890865579362e-08, + "logits/chosen": -1.5844281911849976, + "logits/rejected": -1.535041093826294, + "logps/chosen": -204.37889099121094, + "logps/rejected": -270.86981201171875, + "loss": 0.5651, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5194447040557861, + "rewards/margins": 0.6731320023536682, + "rewards/rejected": -2.1925766468048096, + "step": 12250 + }, + { + "epoch": 2.1123363197794625, + "grad_norm": 28.982343673706055, + "learning_rate": 4.8766632401385856e-08, + "logits/chosen": -1.5679948329925537, + "logits/rejected": -1.5272144079208374, + "logps/chosen": -193.6581573486328, + "logps/rejected": -274.4389343261719, + "loss": 0.5026, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.410930871963501, + "rewards/margins": 0.7730732560157776, + "rewards/rejected": -2.184004306793213, + "step": 12260 + }, + { + "epoch": 2.1140592694693314, + "grad_norm": 28.518871307373047, + "learning_rate": 4.859456207455539e-08, + "logits/chosen": -1.6385746002197266, + "logits/rejected": -1.5881214141845703, + "logps/chosen": -211.4015350341797, + "logps/rejected": -290.3092956542969, + "loss": 0.5262, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5662710666656494, + "rewards/margins": 0.7988941669464111, + "rewards/rejected": -2.3651652336120605, + "step": 12270 + }, + { + "epoch": 2.1157822191592004, + "grad_norm": 25.8980712890625, + "learning_rate": 4.842269836692239e-08, + "logits/chosen": -1.5845524072647095, + "logits/rejected": -1.5420753955841064, + "logps/chosen": -204.1547393798828, + "logps/rejected": -280.9200744628906, + "loss": 0.5048, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4760124683380127, + "rewards/margins": 0.7984156608581543, + "rewards/rejected": -2.274428129196167, + "step": 12280 + }, + { + "epoch": 2.11750516884907, + "grad_norm": 32.773284912109375, + "learning_rate": 4.8251041969276355e-08, + "logits/chosen": -1.5275884866714478, + "logits/rejected": -1.4831234216690063, + "logps/chosen": -200.70103454589844, + "logps/rejected": -266.9785461425781, + "loss": 0.5278, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4691832065582275, + "rewards/margins": 0.6694160103797913, + "rewards/rejected": -2.138599157333374, + "step": 12290 + }, + { + "epoch": 2.1192281185389388, + "grad_norm": 32.303794860839844, + "learning_rate": 4.8079593571573654e-08, + "logits/chosen": -1.6137571334838867, + "logits/rejected": -1.5665721893310547, + "logps/chosen": -201.3102569580078, + "logps/rejected": -274.618896484375, + "loss": 0.5023, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4663169384002686, + "rewards/margins": 0.7571481466293335, + "rewards/rejected": -2.2234652042388916, + "step": 12300 + }, + { + "epoch": 2.1209510682288077, + "grad_norm": 23.450542449951172, + "learning_rate": 4.7908353862934645e-08, + "logits/chosen": -1.5943067073822021, + "logits/rejected": -1.5404088497161865, + "logps/chosen": -192.87353515625, + "logps/rejected": -279.0466613769531, + "loss": 0.4782, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3797800540924072, + "rewards/margins": 0.8839829564094543, + "rewards/rejected": -2.263762950897217, + "step": 12310 + }, + { + "epoch": 2.1226740179186767, + "grad_norm": 22.52594757080078, + "learning_rate": 4.773732353164069e-08, + "logits/chosen": -1.5985201597213745, + "logits/rejected": -1.5497382879257202, + "logps/chosen": -187.78636169433594, + "logps/rejected": -280.4857177734375, + "loss": 0.4772, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3348400592803955, + "rewards/margins": 0.9259307980537415, + "rewards/rejected": -2.2607710361480713, + "step": 12320 + }, + { + "epoch": 2.1243969676085457, + "grad_norm": 26.383817672729492, + "learning_rate": 4.756650326513175e-08, + "logits/chosen": -1.6028234958648682, + "logits/rejected": -1.568605661392212, + "logps/chosen": -202.93075561523438, + "logps/rejected": -274.3039245605469, + "loss": 0.5205, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4667823314666748, + "rewards/margins": 0.7258719205856323, + "rewards/rejected": -2.1926541328430176, + "step": 12330 + }, + { + "epoch": 2.126119917298415, + "grad_norm": 26.518596649169922, + "learning_rate": 4.739589375000345e-08, + "logits/chosen": -1.621058702468872, + "logits/rejected": -1.562735676765442, + "logps/chosen": -202.68588256835938, + "logps/rejected": -301.7408142089844, + "loss": 0.4497, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4908440113067627, + "rewards/margins": 0.9897588491439819, + "rewards/rejected": -2.480602741241455, + "step": 12340 + }, + { + "epoch": 2.127842866988284, + "grad_norm": 27.536848068237305, + "learning_rate": 4.722549567200423e-08, + "logits/chosen": -1.5056012868881226, + "logits/rejected": -1.4400701522827148, + "logps/chosen": -217.3277130126953, + "logps/rejected": -291.8016052246094, + "loss": 0.5163, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6065528392791748, + "rewards/margins": 0.8035901784896851, + "rewards/rejected": -2.4101428985595703, + "step": 12350 + }, + { + "epoch": 2.129565816678153, + "grad_norm": 35.610374450683594, + "learning_rate": 4.70553097160327e-08, + "logits/chosen": -1.5891592502593994, + "logits/rejected": -1.5435259342193604, + "logps/chosen": -198.75253295898438, + "logps/rejected": -294.31427001953125, + "loss": 0.4536, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4582741260528564, + "rewards/margins": 0.9680150747299194, + "rewards/rejected": -2.4262890815734863, + "step": 12360 + }, + { + "epoch": 2.131288766368022, + "grad_norm": 21.72165870666504, + "learning_rate": 4.6885336566134905e-08, + "logits/chosen": -1.5432701110839844, + "logits/rejected": -1.4837857484817505, + "logps/chosen": -210.87631225585938, + "logps/rejected": -301.46527099609375, + "loss": 0.4864, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5829124450683594, + "rewards/margins": 0.9285858273506165, + "rewards/rejected": -2.511498212814331, + "step": 12370 + }, + { + "epoch": 2.133011716057891, + "grad_norm": 32.40623474121094, + "learning_rate": 4.671557690550158e-08, + "logits/chosen": -1.557513952255249, + "logits/rejected": -1.524306058883667, + "logps/chosen": -203.4970245361328, + "logps/rejected": -304.2638244628906, + "loss": 0.4518, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.5309957265853882, + "rewards/margins": 0.9441516995429993, + "rewards/rejected": -2.4751474857330322, + "step": 12380 + }, + { + "epoch": 2.13473466574776, + "grad_norm": 28.967927932739258, + "learning_rate": 4.65460314164652e-08, + "logits/chosen": -1.661972999572754, + "logits/rejected": -1.6170930862426758, + "logps/chosen": -209.74435424804688, + "logps/rejected": -305.64923095703125, + "loss": 0.4876, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5648424625396729, + "rewards/margins": 0.9469350576400757, + "rewards/rejected": -2.511777400970459, + "step": 12390 + }, + { + "epoch": 2.1364576154376294, + "grad_norm": 26.420259475708008, + "learning_rate": 4.637670078049759e-08, + "logits/chosen": -1.5792243480682373, + "logits/rejected": -1.5416433811187744, + "logps/chosen": -224.37405395507812, + "logps/rejected": -317.53863525390625, + "loss": 0.4713, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6922643184661865, + "rewards/margins": 0.9153863191604614, + "rewards/rejected": -2.6076505184173584, + "step": 12400 + }, + { + "epoch": 2.1364576154376294, + "eval_logits/chosen": -1.6644082069396973, + "eval_logits/rejected": -1.6396886110305786, + "eval_logps/chosen": -221.01467895507812, + "eval_logps/rejected": -263.3477783203125, + "eval_loss": 0.6302575469017029, + "eval_rewards/accuracies": 0.6470724940299988, + "eval_rewards/chosen": -1.6230275630950928, + "eval_rewards/margins": 0.37864890694618225, + "eval_rewards/rejected": -2.001676559448242, + "eval_runtime": 384.8844, + "eval_samples_per_second": 11.183, + "eval_steps_per_second": 1.398, + "step": 12400 + }, + { + "epoch": 2.1381805651274983, + "grad_norm": 33.982757568359375, + "learning_rate": 4.620758567820686e-08, + "logits/chosen": -1.5089236497879028, + "logits/rejected": -1.4480992555618286, + "logps/chosen": -215.35238647460938, + "logps/rejected": -295.104736328125, + "loss": 0.4985, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6145118474960327, + "rewards/margins": 0.8170326352119446, + "rewards/rejected": -2.431544542312622, + "step": 12410 + }, + { + "epoch": 2.1399035148173673, + "grad_norm": 62.16181564331055, + "learning_rate": 4.60386867893348e-08, + "logits/chosen": -1.3992679119110107, + "logits/rejected": -1.353809118270874, + "logps/chosen": -225.4574432373047, + "logps/rejected": -309.0572814941406, + "loss": 0.5194, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7003612518310547, + "rewards/margins": 0.8596186637878418, + "rewards/rejected": -2.5599799156188965, + "step": 12420 + }, + { + "epoch": 2.1416264645072363, + "grad_norm": 39.4116325378418, + "learning_rate": 4.5870004792754257e-08, + "logits/chosen": -1.5263465642929077, + "logits/rejected": -1.4820783138275146, + "logps/chosen": -227.6816864013672, + "logps/rejected": -318.7822265625, + "loss": 0.5033, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7275323867797852, + "rewards/margins": 0.9142645001411438, + "rewards/rejected": -2.641796588897705, + "step": 12430 + }, + { + "epoch": 2.1433494141971057, + "grad_norm": 30.580076217651367, + "learning_rate": 4.570154036646625e-08, + "logits/chosen": -1.4799082279205322, + "logits/rejected": -1.4441767930984497, + "logps/chosen": -215.04409790039062, + "logps/rejected": -296.3481750488281, + "loss": 0.5079, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.617315649986267, + "rewards/margins": 0.8194793462753296, + "rewards/rejected": -2.4367949962615967, + "step": 12440 + }, + { + "epoch": 2.1450723638869746, + "grad_norm": 35.00650405883789, + "learning_rate": 4.553329418759726e-08, + "logits/chosen": -1.5489532947540283, + "logits/rejected": -1.507712960243225, + "logps/chosen": -209.34255981445312, + "logps/rejected": -290.799560546875, + "loss": 0.5195, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5875122547149658, + "rewards/margins": 0.8105241656303406, + "rewards/rejected": -2.398036479949951, + "step": 12450 + }, + { + "epoch": 2.1467953135768436, + "grad_norm": 34.579010009765625, + "learning_rate": 4.5365266932396526e-08, + "logits/chosen": -1.6123673915863037, + "logits/rejected": -1.5568170547485352, + "logps/chosen": -220.16244506835938, + "logps/rejected": -316.6996154785156, + "loss": 0.484, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6562690734863281, + "rewards/margins": 0.9824492335319519, + "rewards/rejected": -2.6387181282043457, + "step": 12460 + }, + { + "epoch": 2.1485182632667126, + "grad_norm": 36.69334411621094, + "learning_rate": 4.519745927623344e-08, + "logits/chosen": -1.618233323097229, + "logits/rejected": -1.5750727653503418, + "logps/chosen": -231.67236328125, + "logps/rejected": -328.7120056152344, + "loss": 0.4771, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7639925479888916, + "rewards/margins": 0.9653264880180359, + "rewards/rejected": -2.7293190956115723, + "step": 12470 + }, + { + "epoch": 2.1502412129565815, + "grad_norm": 32.25230407714844, + "learning_rate": 4.5029871893594695e-08, + "logits/chosen": -1.6398899555206299, + "logits/rejected": -1.5991530418395996, + "logps/chosen": -223.4516143798828, + "logps/rejected": -302.0787353515625, + "loss": 0.5225, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6901556253433228, + "rewards/margins": 0.7845726013183594, + "rewards/rejected": -2.4747283458709717, + "step": 12480 + }, + { + "epoch": 2.1519641626464505, + "grad_norm": 32.58366012573242, + "learning_rate": 4.486250545808159e-08, + "logits/chosen": -1.5485055446624756, + "logits/rejected": -1.5052379369735718, + "logps/chosen": -208.1356658935547, + "logps/rejected": -298.0610656738281, + "loss": 0.498, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.536603569984436, + "rewards/margins": 0.9074599146842957, + "rewards/rejected": -2.444063425064087, + "step": 12490 + }, + { + "epoch": 2.15368711233632, + "grad_norm": 36.965362548828125, + "learning_rate": 4.469536064240731e-08, + "logits/chosen": -1.5669692754745483, + "logits/rejected": -1.521559476852417, + "logps/chosen": -222.0422821044922, + "logps/rejected": -315.29248046875, + "loss": 0.4904, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.686669945716858, + "rewards/margins": 0.9324058294296265, + "rewards/rejected": -2.6190757751464844, + "step": 12500 + }, + { + "epoch": 2.155410062026189, + "grad_norm": 25.924415588378906, + "learning_rate": 4.452843811839435e-08, + "logits/chosen": -1.6050430536270142, + "logits/rejected": -1.56070077419281, + "logps/chosen": -210.5176239013672, + "logps/rejected": -285.3177185058594, + "loss": 0.5205, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5747045278549194, + "rewards/margins": 0.7555389404296875, + "rewards/rejected": -2.3302433490753174, + "step": 12510 + }, + { + "epoch": 2.157133011716058, + "grad_norm": 30.48223114013672, + "learning_rate": 4.436173855697174e-08, + "logits/chosen": -1.5401922464370728, + "logits/rejected": -1.496222734451294, + "logps/chosen": -215.2317352294922, + "logps/rejected": -298.5047302246094, + "loss": 0.5345, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6176135540008545, + "rewards/margins": 0.8186357617378235, + "rewards/rejected": -2.436249256134033, + "step": 12520 + }, + { + "epoch": 2.158855961405927, + "grad_norm": 38.309688568115234, + "learning_rate": 4.4195262628172224e-08, + "logits/chosen": -1.5570924282073975, + "logits/rejected": -1.5224055051803589, + "logps/chosen": -224.19107055664062, + "logps/rejected": -312.5212707519531, + "loss": 0.4784, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6979620456695557, + "rewards/margins": 0.8683120608329773, + "rewards/rejected": -2.5662739276885986, + "step": 12530 + }, + { + "epoch": 2.160578911095796, + "grad_norm": 33.87547302246094, + "learning_rate": 4.402901100112972e-08, + "logits/chosen": -1.5645685195922852, + "logits/rejected": -1.5179166793823242, + "logps/chosen": -211.88320922851562, + "logps/rejected": -289.5576171875, + "loss": 0.5615, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5813262462615967, + "rewards/margins": 0.77818363904953, + "rewards/rejected": -2.3595099449157715, + "step": 12540 + }, + { + "epoch": 2.162301860785665, + "grad_norm": 40.345947265625, + "learning_rate": 4.386298434407666e-08, + "logits/chosen": -1.6636282205581665, + "logits/rejected": -1.629463791847229, + "logps/chosen": -205.6763916015625, + "logps/rejected": -274.09100341796875, + "loss": 0.5371, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5461241006851196, + "rewards/margins": 0.6756289601325989, + "rewards/rejected": -2.2217533588409424, + "step": 12550 + }, + { + "epoch": 2.164024810475534, + "grad_norm": 29.747833251953125, + "learning_rate": 4.369718332434109e-08, + "logits/chosen": -1.6467325687408447, + "logits/rejected": -1.5891529321670532, + "logps/chosen": -210.7559051513672, + "logps/rejected": -276.63140869140625, + "loss": 0.5436, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5353169441223145, + "rewards/margins": 0.7327998280525208, + "rewards/rejected": -2.2681167125701904, + "step": 12560 + }, + { + "epoch": 2.165747760165403, + "grad_norm": 29.418228149414062, + "learning_rate": 4.3531608608344274e-08, + "logits/chosen": -1.5815799236297607, + "logits/rejected": -1.524218201637268, + "logps/chosen": -196.69119262695312, + "logps/rejected": -272.20849609375, + "loss": 0.4979, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4489413499832153, + "rewards/margins": 0.7680349349975586, + "rewards/rejected": -2.2169766426086426, + "step": 12570 + }, + { + "epoch": 2.167470709855272, + "grad_norm": 42.948368072509766, + "learning_rate": 4.3366260861597814e-08, + "logits/chosen": -1.6030486822128296, + "logits/rejected": -1.5377169847488403, + "logps/chosen": -213.7222900390625, + "logps/rejected": -295.1755065917969, + "loss": 0.4841, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5307261943817139, + "rewards/margins": 0.8973187208175659, + "rewards/rejected": -2.4280447959899902, + "step": 12580 + }, + { + "epoch": 2.169193659545141, + "grad_norm": 27.345455169677734, + "learning_rate": 4.3201140748701e-08, + "logits/chosen": -1.5837843418121338, + "logits/rejected": -1.5203263759613037, + "logps/chosen": -196.2156524658203, + "logps/rejected": -289.7723693847656, + "loss": 0.4401, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4074121713638306, + "rewards/margins": 0.9618197679519653, + "rewards/rejected": -2.369231700897217, + "step": 12590 + }, + { + "epoch": 2.1709166092350105, + "grad_norm": 24.87310028076172, + "learning_rate": 4.303624893333816e-08, + "logits/chosen": -1.5408040285110474, + "logits/rejected": -1.4929521083831787, + "logps/chosen": -210.43441772460938, + "logps/rejected": -285.0773620605469, + "loss": 0.507, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5213069915771484, + "rewards/margins": 0.7937949895858765, + "rewards/rejected": -2.3151021003723145, + "step": 12600 + }, + { + "epoch": 2.1726395589248795, + "grad_norm": 27.389739990234375, + "learning_rate": 4.287158607827607e-08, + "logits/chosen": -1.6728298664093018, + "logits/rejected": -1.6453278064727783, + "logps/chosen": -197.07711791992188, + "logps/rejected": -278.75213623046875, + "loss": 0.4727, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.4301495552062988, + "rewards/margins": 0.8101191520690918, + "rewards/rejected": -2.2402689456939697, + "step": 12610 + }, + { + "epoch": 2.1743625086147484, + "grad_norm": 34.71573257446289, + "learning_rate": 4.270715284536124e-08, + "logits/chosen": -1.7020797729492188, + "logits/rejected": -1.6502811908721924, + "logps/chosen": -224.19589233398438, + "logps/rejected": -309.0333557128906, + "loss": 0.5049, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.688309669494629, + "rewards/margins": 0.8674631118774414, + "rewards/rejected": -2.5557727813720703, + "step": 12620 + }, + { + "epoch": 2.1760854583046174, + "grad_norm": 29.679231643676758, + "learning_rate": 4.2542949895517066e-08, + "logits/chosen": -1.629212737083435, + "logits/rejected": -1.6008615493774414, + "logps/chosen": -204.96914672851562, + "logps/rejected": -289.08575439453125, + "loss": 0.5328, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5458189249038696, + "rewards/margins": 0.8251399993896484, + "rewards/rejected": -2.3709590435028076, + "step": 12630 + }, + { + "epoch": 2.1778084079944864, + "grad_norm": 25.9903621673584, + "learning_rate": 4.2378977888741506e-08, + "logits/chosen": -1.6058130264282227, + "logits/rejected": -1.5600433349609375, + "logps/chosen": -207.5745849609375, + "logps/rejected": -292.06396484375, + "loss": 0.473, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5348149538040161, + "rewards/margins": 0.8506088256835938, + "rewards/rejected": -2.3854236602783203, + "step": 12640 + }, + { + "epoch": 2.179531357684356, + "grad_norm": 35.16606903076172, + "learning_rate": 4.221523748410428e-08, + "logits/chosen": -1.5772018432617188, + "logits/rejected": -1.5374476909637451, + "logps/chosen": -202.94577026367188, + "logps/rejected": -267.78656005859375, + "loss": 0.5463, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4989752769470215, + "rewards/margins": 0.6631990671157837, + "rewards/rejected": -2.1621744632720947, + "step": 12650 + }, + { + "epoch": 2.1812543073742248, + "grad_norm": 29.193828582763672, + "learning_rate": 4.2051729339744056e-08, + "logits/chosen": -1.4867627620697021, + "logits/rejected": -1.451336145401001, + "logps/chosen": -214.2347412109375, + "logps/rejected": -283.4859924316406, + "loss": 0.5661, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.621567964553833, + "rewards/margins": 0.6800750494003296, + "rewards/rejected": -2.301642894744873, + "step": 12660 + }, + { + "epoch": 2.1829772570640937, + "grad_norm": 41.727237701416016, + "learning_rate": 4.1888454112866125e-08, + "logits/chosen": -1.779306173324585, + "logits/rejected": -1.7223072052001953, + "logps/chosen": -202.23556518554688, + "logps/rejected": -287.9600524902344, + "loss": 0.515, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.490797758102417, + "rewards/margins": 0.8758557438850403, + "rewards/rejected": -2.3666536808013916, + "step": 12670 + }, + { + "epoch": 2.1847002067539627, + "grad_norm": 23.595829010009766, + "learning_rate": 4.172541245973943e-08, + "logits/chosen": -1.544134497642517, + "logits/rejected": -1.5028505325317383, + "logps/chosen": -203.92660522460938, + "logps/rejected": -282.19879150390625, + "loss": 0.5063, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5235766172409058, + "rewards/margins": 0.7782987356185913, + "rewards/rejected": -2.301875591278076, + "step": 12680 + }, + { + "epoch": 2.1864231564438317, + "grad_norm": 22.84954261779785, + "learning_rate": 4.156260503569423e-08, + "logits/chosen": -1.557756781578064, + "logits/rejected": -1.5162785053253174, + "logps/chosen": -210.11917114257812, + "logps/rejected": -289.6104431152344, + "loss": 0.5725, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5822913646697998, + "rewards/margins": 0.7867759466171265, + "rewards/rejected": -2.369067430496216, + "step": 12690 + }, + { + "epoch": 2.188146106133701, + "grad_norm": 24.481889724731445, + "learning_rate": 4.1400032495119183e-08, + "logits/chosen": -1.5616981983184814, + "logits/rejected": -1.511423110961914, + "logps/chosen": -201.97335815429688, + "logps/rejected": -289.2737121582031, + "loss": 0.4704, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4522186517715454, + "rewards/margins": 0.8924834132194519, + "rewards/rejected": -2.3447022438049316, + "step": 12700 + }, + { + "epoch": 2.18986905582357, + "grad_norm": 40.27986145019531, + "learning_rate": 4.123769549145901e-08, + "logits/chosen": -1.6570045948028564, + "logits/rejected": -1.6221774816513062, + "logps/chosen": -206.52645874023438, + "logps/rejected": -290.4212951660156, + "loss": 0.506, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5287845134735107, + "rewards/margins": 0.7977313995361328, + "rewards/rejected": -2.3265159130096436, + "step": 12710 + }, + { + "epoch": 2.191592005513439, + "grad_norm": 23.55912208557129, + "learning_rate": 4.10755946772116e-08, + "logits/chosen": -1.670885682106018, + "logits/rejected": -1.601375937461853, + "logps/chosen": -192.55203247070312, + "logps/rejected": -279.6814880371094, + "loss": 0.4654, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3597242832183838, + "rewards/margins": 0.9234593510627747, + "rewards/rejected": -2.2831835746765137, + "step": 12720 + }, + { + "epoch": 2.193314955203308, + "grad_norm": 37.51332092285156, + "learning_rate": 4.0913730703925485e-08, + "logits/chosen": -1.5183833837509155, + "logits/rejected": -1.4677681922912598, + "logps/chosen": -201.53402709960938, + "logps/rejected": -281.51397705078125, + "loss": 0.4951, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5104000568389893, + "rewards/margins": 0.8004547953605652, + "rewards/rejected": -2.31085467338562, + "step": 12730 + }, + { + "epoch": 2.195037904893177, + "grad_norm": 23.36768341064453, + "learning_rate": 4.075210422219732e-08, + "logits/chosen": -1.5590684413909912, + "logits/rejected": -1.5175398588180542, + "logps/chosen": -221.9892578125, + "logps/rejected": -299.72515869140625, + "loss": 0.546, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6943871974945068, + "rewards/margins": 0.7802594304084778, + "rewards/rejected": -2.474646806716919, + "step": 12740 + }, + { + "epoch": 2.1967608545830464, + "grad_norm": 27.73143196105957, + "learning_rate": 4.059071588166921e-08, + "logits/chosen": -1.5354615449905396, + "logits/rejected": -1.4796323776245117, + "logps/chosen": -196.08566284179688, + "logps/rejected": -288.60552978515625, + "loss": 0.4454, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4249557256698608, + "rewards/margins": 0.914918065071106, + "rewards/rejected": -2.3398735523223877, + "step": 12750 + }, + { + "epoch": 2.1984838042729153, + "grad_norm": 26.64198112487793, + "learning_rate": 4.042956633102597e-08, + "logits/chosen": -1.5401995182037354, + "logits/rejected": -1.5017931461334229, + "logps/chosen": -212.65982055664062, + "logps/rejected": -293.1643371582031, + "loss": 0.5606, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6058902740478516, + "rewards/margins": 0.7976308465003967, + "rewards/rejected": -2.4035212993621826, + "step": 12760 + }, + { + "epoch": 2.2002067539627843, + "grad_norm": 28.978290557861328, + "learning_rate": 4.0268656217992615e-08, + "logits/chosen": -1.6174194812774658, + "logits/rejected": -1.5647356510162354, + "logps/chosen": -213.73715209960938, + "logps/rejected": -280.42755126953125, + "loss": 0.544, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.557701587677002, + "rewards/margins": 0.7231639623641968, + "rewards/rejected": -2.280865430831909, + "step": 12770 + }, + { + "epoch": 2.2019297036526533, + "grad_norm": 26.79999542236328, + "learning_rate": 4.0107986189331875e-08, + "logits/chosen": -1.6159842014312744, + "logits/rejected": -1.5725295543670654, + "logps/chosen": -208.47622680664062, + "logps/rejected": -286.48138427734375, + "loss": 0.5424, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5654840469360352, + "rewards/margins": 0.7603294253349304, + "rewards/rejected": -2.3258132934570312, + "step": 12780 + }, + { + "epoch": 2.2036526533425222, + "grad_norm": 42.5976676940918, + "learning_rate": 3.9947556890841464e-08, + "logits/chosen": -1.5539737939834595, + "logits/rejected": -1.507218599319458, + "logps/chosen": -216.44656372070312, + "logps/rejected": -306.5636291503906, + "loss": 0.4856, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.620617151260376, + "rewards/margins": 0.9096146821975708, + "rewards/rejected": -2.5302317142486572, + "step": 12790 + }, + { + "epoch": 2.205375603032391, + "grad_norm": 27.104745864868164, + "learning_rate": 3.978736896735141e-08, + "logits/chosen": -1.5875989198684692, + "logits/rejected": -1.5486294031143188, + "logps/chosen": -215.0325927734375, + "logps/rejected": -291.66351318359375, + "loss": 0.5188, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5896577835083008, + "rewards/margins": 0.7712696194648743, + "rewards/rejected": -2.360927104949951, + "step": 12800 + }, + { + "epoch": 2.205375603032391, + "eval_logits/chosen": -1.7011349201202393, + "eval_logits/rejected": -1.6775678396224976, + "eval_logps/chosen": -204.64537048339844, + "eval_logps/rejected": -243.69786071777344, + "eval_loss": 0.6304866075515747, + "eval_rewards/accuracies": 0.6407992839813232, + "eval_rewards/chosen": -1.4593344926834106, + "eval_rewards/margins": 0.3458428680896759, + "eval_rewards/rejected": -1.8051774501800537, + "eval_runtime": 384.5524, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 12800 + }, + { + "epoch": 2.2070985527222606, + "grad_norm": 29.665178298950195, + "learning_rate": 3.96274230627216e-08, + "logits/chosen": -1.647985816001892, + "logits/rejected": -1.6106626987457275, + "logps/chosen": -204.36851501464844, + "logps/rejected": -287.9172058105469, + "loss": 0.5028, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5377624034881592, + "rewards/margins": 0.7883270382881165, + "rewards/rejected": -2.326089382171631, + "step": 12810 + }, + { + "epoch": 2.2088215024121296, + "grad_norm": 31.872037887573242, + "learning_rate": 3.9467719819839186e-08, + "logits/chosen": -1.5224041938781738, + "logits/rejected": -1.4853503704071045, + "logps/chosen": -201.1040496826172, + "logps/rejected": -281.5457458496094, + "loss": 0.5131, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.480491042137146, + "rewards/margins": 0.7895897626876831, + "rewards/rejected": -2.270081043243408, + "step": 12820 + }, + { + "epoch": 2.2105444521019986, + "grad_norm": 30.092105865478516, + "learning_rate": 3.930825988061599e-08, + "logits/chosen": -1.530009150505066, + "logits/rejected": -1.4921272993087769, + "logps/chosen": -212.3929443359375, + "logps/rejected": -279.1653137207031, + "loss": 0.5613, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6196510791778564, + "rewards/margins": 0.6495185494422913, + "rewards/rejected": -2.269169330596924, + "step": 12830 + }, + { + "epoch": 2.2122674017918675, + "grad_norm": 24.13979148864746, + "learning_rate": 3.914904388598577e-08, + "logits/chosen": -1.614890456199646, + "logits/rejected": -1.5737498998641968, + "logps/chosen": -224.39254760742188, + "logps/rejected": -313.24951171875, + "loss": 0.4871, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6918249130249023, + "rewards/margins": 0.8942005038261414, + "rewards/rejected": -2.5860252380371094, + "step": 12840 + }, + { + "epoch": 2.213990351481737, + "grad_norm": 26.58709716796875, + "learning_rate": 3.899007247590191e-08, + "logits/chosen": -1.6306241750717163, + "logits/rejected": -1.582698106765747, + "logps/chosen": -207.58724975585938, + "logps/rejected": -283.46636962890625, + "loss": 0.507, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5449132919311523, + "rewards/margins": 0.7790514230728149, + "rewards/rejected": -2.3239645957946777, + "step": 12850 + }, + { + "epoch": 2.215713301171606, + "grad_norm": 30.544235229492188, + "learning_rate": 3.883134628933465e-08, + "logits/chosen": -1.5273045301437378, + "logits/rejected": -1.4818195104599, + "logps/chosen": -220.94357299804688, + "logps/rejected": -302.7528991699219, + "loss": 0.5044, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6472495794296265, + "rewards/margins": 0.8625710606575012, + "rewards/rejected": -2.5098206996917725, + "step": 12860 + }, + { + "epoch": 2.217436250861475, + "grad_norm": 37.73915100097656, + "learning_rate": 3.867286596426853e-08, + "logits/chosen": -1.6090524196624756, + "logits/rejected": -1.571989893913269, + "logps/chosen": -208.7991180419922, + "logps/rejected": -278.4769592285156, + "loss": 0.5741, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5641460418701172, + "rewards/margins": 0.7138841152191162, + "rewards/rejected": -2.2780303955078125, + "step": 12870 + }, + { + "epoch": 2.219159200551344, + "grad_norm": 36.86833572387695, + "learning_rate": 3.851463213769996e-08, + "logits/chosen": -1.590273141860962, + "logits/rejected": -1.5490907430648804, + "logps/chosen": -202.5039825439453, + "logps/rejected": -280.5902404785156, + "loss": 0.5116, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4931889772415161, + "rewards/margins": 0.7568543553352356, + "rewards/rejected": -2.2500433921813965, + "step": 12880 + }, + { + "epoch": 2.220882150241213, + "grad_norm": 22.201927185058594, + "learning_rate": 3.8356645445634575e-08, + "logits/chosen": -1.6446106433868408, + "logits/rejected": -1.6043132543563843, + "logps/chosen": -202.05697631835938, + "logps/rejected": -271.5600891113281, + "loss": 0.5292, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.451763391494751, + "rewards/margins": 0.7237731218338013, + "rewards/rejected": -2.1755363941192627, + "step": 12890 + }, + { + "epoch": 2.222605099931082, + "grad_norm": 31.349964141845703, + "learning_rate": 3.8198906523084594e-08, + "logits/chosen": -1.5713380575180054, + "logits/rejected": -1.516449213027954, + "logps/chosen": -213.3640594482422, + "logps/rejected": -311.71624755859375, + "loss": 0.441, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5946190357208252, + "rewards/margins": 0.9784560203552246, + "rewards/rejected": -2.57307505607605, + "step": 12900 + }, + { + "epoch": 2.224328049620951, + "grad_norm": 30.3024959564209, + "learning_rate": 3.8041416004066364e-08, + "logits/chosen": -1.671606421470642, + "logits/rejected": -1.6348520517349243, + "logps/chosen": -189.18295288085938, + "logps/rejected": -270.96551513671875, + "loss": 0.4925, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.385255217552185, + "rewards/margins": 0.7706804275512695, + "rewards/rejected": -2.155935764312744, + "step": 12910 + }, + { + "epoch": 2.22605099931082, + "grad_norm": 22.79236602783203, + "learning_rate": 3.7884174521597866e-08, + "logits/chosen": -1.5699918270111084, + "logits/rejected": -1.526435136795044, + "logps/chosen": -205.57717895507812, + "logps/rejected": -291.1944274902344, + "loss": 0.5002, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.529378056526184, + "rewards/margins": 0.8503366708755493, + "rewards/rejected": -2.3797144889831543, + "step": 12920 + }, + { + "epoch": 2.227773949000689, + "grad_norm": 26.476848602294922, + "learning_rate": 3.77271827076961e-08, + "logits/chosen": -1.6100099086761475, + "logits/rejected": -1.5733484029769897, + "logps/chosen": -203.7718963623047, + "logps/rejected": -272.2909240722656, + "loss": 0.5721, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5427305698394775, + "rewards/margins": 0.6530159711837769, + "rewards/rejected": -2.195746421813965, + "step": 12930 + }, + { + "epoch": 2.229496898690558, + "grad_norm": 32.81683349609375, + "learning_rate": 3.757044119337449e-08, + "logits/chosen": -1.544937014579773, + "logits/rejected": -1.4977126121520996, + "logps/chosen": -205.29995727539062, + "logps/rejected": -283.4554748535156, + "loss": 0.5148, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5327036380767822, + "rewards/margins": 0.8081310391426086, + "rewards/rejected": -2.340834617614746, + "step": 12940 + }, + { + "epoch": 2.231219848380427, + "grad_norm": 39.6876106262207, + "learning_rate": 3.741395060864038e-08, + "logits/chosen": -1.5567517280578613, + "logits/rejected": -1.5120809078216553, + "logps/chosen": -207.96401977539062, + "logps/rejected": -302.92889404296875, + "loss": 0.4579, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5375335216522217, + "rewards/margins": 0.9502789378166199, + "rewards/rejected": -2.4878125190734863, + "step": 12950 + }, + { + "epoch": 2.2329427980702965, + "grad_norm": 27.241886138916016, + "learning_rate": 3.7257711582492645e-08, + "logits/chosen": -1.5266079902648926, + "logits/rejected": -1.4739742279052734, + "logps/chosen": -215.13552856445312, + "logps/rejected": -283.886962890625, + "loss": 0.5142, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5669468641281128, + "rewards/margins": 0.741981565952301, + "rewards/rejected": -2.3089284896850586, + "step": 12960 + }, + { + "epoch": 2.2346657477601655, + "grad_norm": 26.52547264099121, + "learning_rate": 3.7101724742918915e-08, + "logits/chosen": -1.4905484914779663, + "logits/rejected": -1.436008095741272, + "logps/chosen": -198.09500122070312, + "logps/rejected": -292.9523620605469, + "loss": 0.4494, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4538332223892212, + "rewards/margins": 0.989672064781189, + "rewards/rejected": -2.4435055255889893, + "step": 12970 + }, + { + "epoch": 2.2363886974500344, + "grad_norm": 32.57485580444336, + "learning_rate": 3.694599071689329e-08, + "logits/chosen": -1.5237276554107666, + "logits/rejected": -1.4900181293487549, + "logps/chosen": -203.85452270507812, + "logps/rejected": -285.77557373046875, + "loss": 0.4753, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5306895971298218, + "rewards/margins": 0.7909470796585083, + "rewards/rejected": -2.32163667678833, + "step": 12980 + }, + { + "epoch": 2.2381116471399034, + "grad_norm": 25.511980056762695, + "learning_rate": 3.679051013037361e-08, + "logits/chosen": -1.625741720199585, + "logits/rejected": -1.5625674724578857, + "logps/chosen": -222.39999389648438, + "logps/rejected": -308.61663818359375, + "loss": 0.4847, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6366920471191406, + "rewards/margins": 0.919256329536438, + "rewards/rejected": -2.555948257446289, + "step": 12990 + }, + { + "epoch": 2.2398345968297724, + "grad_norm": 35.11985397338867, + "learning_rate": 3.663528360829915e-08, + "logits/chosen": -1.5546085834503174, + "logits/rejected": -1.5087900161743164, + "logps/chosen": -215.45242309570312, + "logps/rejected": -302.48028564453125, + "loss": 0.474, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.620145559310913, + "rewards/margins": 0.8866864442825317, + "rewards/rejected": -2.5068321228027344, + "step": 13000 + }, + { + "epoch": 2.241557546519642, + "grad_norm": 27.722003936767578, + "learning_rate": 3.6480311774587877e-08, + "logits/chosen": -1.5375518798828125, + "logits/rejected": -1.4975337982177734, + "logps/chosen": -218.1388702392578, + "logps/rejected": -289.1412658691406, + "loss": 0.5443, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6785621643066406, + "rewards/margins": 0.7160916328430176, + "rewards/rejected": -2.394653558731079, + "step": 13010 + }, + { + "epoch": 2.2432804962095108, + "grad_norm": 36.15455627441406, + "learning_rate": 3.6325595252134144e-08, + "logits/chosen": -1.535723090171814, + "logits/rejected": -1.4892303943634033, + "logps/chosen": -203.90689086914062, + "logps/rejected": -283.9646911621094, + "loss": 0.5007, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5217381715774536, + "rewards/margins": 0.818256676197052, + "rewards/rejected": -2.3399949073791504, + "step": 13020 + }, + { + "epoch": 2.2450034458993797, + "grad_norm": 30.270002365112305, + "learning_rate": 3.617113466280612e-08, + "logits/chosen": -1.6230676174163818, + "logits/rejected": -1.5725562572479248, + "logps/chosen": -211.6242218017578, + "logps/rejected": -279.1073303222656, + "loss": 0.5389, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5698533058166504, + "rewards/margins": 0.6981257200241089, + "rewards/rejected": -2.267979145050049, + "step": 13030 + }, + { + "epoch": 2.2467263955892487, + "grad_norm": 36.30577087402344, + "learning_rate": 3.601693062744322e-08, + "logits/chosen": -1.6449174880981445, + "logits/rejected": -1.5922715663909912, + "logps/chosen": -201.2418670654297, + "logps/rejected": -302.2711486816406, + "loss": 0.4626, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4769412279129028, + "rewards/margins": 0.9876639246940613, + "rewards/rejected": -2.4646053314208984, + "step": 13040 + }, + { + "epoch": 2.2484493452791177, + "grad_norm": 29.94109344482422, + "learning_rate": 3.586298376585363e-08, + "logits/chosen": -1.6053701639175415, + "logits/rejected": -1.5708749294281006, + "logps/chosen": -215.24267578125, + "logps/rejected": -298.4056701660156, + "loss": 0.5218, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.634469747543335, + "rewards/margins": 0.810254693031311, + "rewards/rejected": -2.4447245597839355, + "step": 13050 + }, + { + "epoch": 2.250172294968987, + "grad_norm": 37.369834899902344, + "learning_rate": 3.5709294696811985e-08, + "logits/chosen": -1.6182520389556885, + "logits/rejected": -1.577430248260498, + "logps/chosen": -204.96432495117188, + "logps/rejected": -295.6147155761719, + "loss": 0.4989, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5216501951217651, + "rewards/margins": 0.8875336647033691, + "rewards/rejected": -2.4091837406158447, + "step": 13060 + }, + { + "epoch": 2.251895244658856, + "grad_norm": 62.851322174072266, + "learning_rate": 3.555586403805663e-08, + "logits/chosen": -1.5494712591171265, + "logits/rejected": -1.4969944953918457, + "logps/chosen": -205.6003875732422, + "logps/rejected": -283.4789123535156, + "loss": 0.5259, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4812865257263184, + "rewards/margins": 0.8190127611160278, + "rewards/rejected": -2.300299644470215, + "step": 13070 + }, + { + "epoch": 2.253618194348725, + "grad_norm": 24.81918716430664, + "learning_rate": 3.540269240628726e-08, + "logits/chosen": -1.5176604986190796, + "logits/rejected": -1.4786919355392456, + "logps/chosen": -208.3052978515625, + "logps/rejected": -295.0064392089844, + "loss": 0.486, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.576119065284729, + "rewards/margins": 0.8568998575210571, + "rewards/rejected": -2.433018684387207, + "step": 13080 + }, + { + "epoch": 2.255341144038594, + "grad_norm": 29.975021362304688, + "learning_rate": 3.52497804171625e-08, + "logits/chosen": -1.6407638788223267, + "logits/rejected": -1.5833606719970703, + "logps/chosen": -214.4311065673828, + "logps/rejected": -297.3269348144531, + "loss": 0.4981, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6044986248016357, + "rewards/margins": 0.8463436961174011, + "rewards/rejected": -2.4508423805236816, + "step": 13090 + }, + { + "epoch": 2.257064093728463, + "grad_norm": 23.319719314575195, + "learning_rate": 3.509712868529738e-08, + "logits/chosen": -1.682140588760376, + "logits/rejected": -1.623839020729065, + "logps/chosen": -207.14645385742188, + "logps/rejected": -294.4388122558594, + "loss": 0.4902, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5284478664398193, + "rewards/margins": 0.9110603332519531, + "rewards/rejected": -2.4395084381103516, + "step": 13100 + }, + { + "epoch": 2.2587870434183324, + "grad_norm": 25.40921401977539, + "learning_rate": 3.494473782426073e-08, + "logits/chosen": -1.535962462425232, + "logits/rejected": -1.4818295240402222, + "logps/chosen": -212.125244140625, + "logps/rejected": -296.21575927734375, + "loss": 0.5267, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5828105211257935, + "rewards/margins": 0.863786518573761, + "rewards/rejected": -2.44659686088562, + "step": 13110 + }, + { + "epoch": 2.2605099931082013, + "grad_norm": 26.571462631225586, + "learning_rate": 3.479260844657297e-08, + "logits/chosen": -1.6702839136123657, + "logits/rejected": -1.635602593421936, + "logps/chosen": -209.6977996826172, + "logps/rejected": -287.2583312988281, + "loss": 0.5404, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5520085096359253, + "rewards/margins": 0.7901738882064819, + "rewards/rejected": -2.3421826362609863, + "step": 13120 + }, + { + "epoch": 2.2622329427980703, + "grad_norm": 26.704050064086914, + "learning_rate": 3.46407411637034e-08, + "logits/chosen": -1.6938931941986084, + "logits/rejected": -1.6417863368988037, + "logps/chosen": -199.1559600830078, + "logps/rejected": -306.63714599609375, + "loss": 0.4476, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4523357152938843, + "rewards/margins": 1.0319113731384277, + "rewards/rejected": -2.4842472076416016, + "step": 13130 + }, + { + "epoch": 2.2639558924879393, + "grad_norm": 23.49358367919922, + "learning_rate": 3.448913658606798e-08, + "logits/chosen": -1.526761531829834, + "logits/rejected": -1.4799644947052002, + "logps/chosen": -202.72506713867188, + "logps/rejected": -284.6557922363281, + "loss": 0.4739, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.490138292312622, + "rewards/margins": 0.8108283281326294, + "rewards/rejected": -2.300966501235962, + "step": 13140 + }, + { + "epoch": 2.2656788421778082, + "grad_norm": 30.544790267944336, + "learning_rate": 3.43377953230266e-08, + "logits/chosen": -1.5474731922149658, + "logits/rejected": -1.4954991340637207, + "logps/chosen": -201.0833282470703, + "logps/rejected": -301.4603576660156, + "loss": 0.4487, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4589382410049438, + "rewards/margins": 1.0210484266281128, + "rewards/rejected": -2.4799866676330566, + "step": 13150 + }, + { + "epoch": 2.2674017918676777, + "grad_norm": 34.979034423828125, + "learning_rate": 3.418671798288093e-08, + "logits/chosen": -1.5324780941009521, + "logits/rejected": -1.4817928075790405, + "logps/chosen": -228.0107421875, + "logps/rejected": -306.8648681640625, + "loss": 0.511, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7148380279541016, + "rewards/margins": 0.8141125440597534, + "rewards/rejected": -2.5289502143859863, + "step": 13160 + }, + { + "epoch": 2.2691247415575466, + "grad_norm": 35.73521423339844, + "learning_rate": 3.403590517287175e-08, + "logits/chosen": -1.5787086486816406, + "logits/rejected": -1.5398588180541992, + "logps/chosen": -215.3478546142578, + "logps/rejected": -292.5067443847656, + "loss": 0.4972, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6153271198272705, + "rewards/margins": 0.7852949500083923, + "rewards/rejected": -2.4006218910217285, + "step": 13170 + }, + { + "epoch": 2.2708476912474156, + "grad_norm": 39.83544921875, + "learning_rate": 3.388535749917653e-08, + "logits/chosen": -1.5618702173233032, + "logits/rejected": -1.5051721334457397, + "logps/chosen": -207.89285278320312, + "logps/rejected": -304.93505859375, + "loss": 0.436, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5455347299575806, + "rewards/margins": 0.9910475015640259, + "rewards/rejected": -2.5365822315216064, + "step": 13180 + }, + { + "epoch": 2.2725706409372846, + "grad_norm": 36.779273986816406, + "learning_rate": 3.373507556690718e-08, + "logits/chosen": -1.5543924570083618, + "logits/rejected": -1.5112448930740356, + "logps/chosen": -221.789306640625, + "logps/rejected": -290.9019775390625, + "loss": 0.5493, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6608854532241821, + "rewards/margins": 0.7263485193252563, + "rewards/rejected": -2.3872339725494385, + "step": 13190 + }, + { + "epoch": 2.2742935906271535, + "grad_norm": 32.65153121948242, + "learning_rate": 3.358505998010743e-08, + "logits/chosen": -1.5592164993286133, + "logits/rejected": -1.5155131816864014, + "logps/chosen": -212.46090698242188, + "logps/rejected": -281.14141845703125, + "loss": 0.5395, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5759353637695312, + "rewards/margins": 0.7066248655319214, + "rewards/rejected": -2.282560110092163, + "step": 13200 + }, + { + "epoch": 2.2742935906271535, + "eval_logits/chosen": -1.6833596229553223, + "eval_logits/rejected": -1.6590664386749268, + "eval_logps/chosen": -212.43768310546875, + "eval_logps/rejected": -253.689208984375, + "eval_loss": 0.6315019726753235, + "eval_rewards/accuracies": 0.6428903341293335, + "eval_rewards/chosen": -1.5372580289840698, + "eval_rewards/margins": 0.36783286929130554, + "eval_rewards/rejected": -1.9050910472869873, + "eval_runtime": 384.8229, + "eval_samples_per_second": 11.184, + "eval_steps_per_second": 1.398, + "step": 13200 + }, + { + "epoch": 2.2760165403170225, + "grad_norm": 31.827877044677734, + "learning_rate": 3.343531134175046e-08, + "logits/chosen": -1.6485798358917236, + "logits/rejected": -1.6175024509429932, + "logps/chosen": -204.543701171875, + "logps/rejected": -279.55572509765625, + "loss": 0.5524, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5241453647613525, + "rewards/margins": 0.7401344180107117, + "rewards/rejected": -2.264279842376709, + "step": 13210 + }, + { + "epoch": 2.277739490006892, + "grad_norm": 24.60109519958496, + "learning_rate": 3.3285830253736405e-08, + "logits/chosen": -1.5928890705108643, + "logits/rejected": -1.5579743385314941, + "logps/chosen": -206.6677703857422, + "logps/rejected": -278.94769287109375, + "loss": 0.5106, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5167338848114014, + "rewards/margins": 0.712703287601471, + "rewards/rejected": -2.2294368743896484, + "step": 13220 + }, + { + "epoch": 2.279462439696761, + "grad_norm": 31.586509704589844, + "learning_rate": 3.313661731689013e-08, + "logits/chosen": -1.5826867818832397, + "logits/rejected": -1.5373554229736328, + "logps/chosen": -194.94052124023438, + "logps/rejected": -287.5800476074219, + "loss": 0.4702, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.440435528755188, + "rewards/margins": 0.9033792614936829, + "rewards/rejected": -2.3438143730163574, + "step": 13230 + }, + { + "epoch": 2.28118538938663, + "grad_norm": 27.885540008544922, + "learning_rate": 3.298767313095865e-08, + "logits/chosen": -1.5880454778671265, + "logits/rejected": -1.5458592176437378, + "logps/chosen": -219.48892211914062, + "logps/rejected": -303.6644287109375, + "loss": 0.4621, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6815032958984375, + "rewards/margins": 0.8260301351547241, + "rewards/rejected": -2.507533311843872, + "step": 13240 + }, + { + "epoch": 2.282908339076499, + "grad_norm": 29.93614387512207, + "learning_rate": 3.283899829460873e-08, + "logits/chosen": -1.5705511569976807, + "logits/rejected": -1.52997624874115, + "logps/chosen": -207.859375, + "logps/rejected": -308.9605712890625, + "loss": 0.477, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.561885952949524, + "rewards/margins": 0.9680153131484985, + "rewards/rejected": -2.5299010276794434, + "step": 13250 + }, + { + "epoch": 2.2846312887663682, + "grad_norm": 37.0043830871582, + "learning_rate": 3.269059340542448e-08, + "logits/chosen": -1.6618578433990479, + "logits/rejected": -1.616093635559082, + "logps/chosen": -216.92294311523438, + "logps/rejected": -316.55963134765625, + "loss": 0.4876, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6888090372085571, + "rewards/margins": 0.9507328867912292, + "rewards/rejected": -2.6395421028137207, + "step": 13260 + }, + { + "epoch": 2.286354238456237, + "grad_norm": 27.53391456604004, + "learning_rate": 3.2542459059905127e-08, + "logits/chosen": -1.5043879747390747, + "logits/rejected": -1.465038537979126, + "logps/chosen": -221.95388793945312, + "logps/rejected": -315.71649169921875, + "loss": 0.4788, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6740529537200928, + "rewards/margins": 0.9358172416687012, + "rewards/rejected": -2.609870195388794, + "step": 13270 + }, + { + "epoch": 2.288077188146106, + "grad_norm": 31.898828506469727, + "learning_rate": 3.239459585346228e-08, + "logits/chosen": -1.5886051654815674, + "logits/rejected": -1.5323841571807861, + "logps/chosen": -212.6425018310547, + "logps/rejected": -292.26690673828125, + "loss": 0.5453, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.589463233947754, + "rewards/margins": 0.8199914693832397, + "rewards/rejected": -2.409454584121704, + "step": 13280 + }, + { + "epoch": 2.289800137835975, + "grad_norm": 41.60102844238281, + "learning_rate": 3.224700438041789e-08, + "logits/chosen": -1.5710105895996094, + "logits/rejected": -1.522861361503601, + "logps/chosen": -207.9844970703125, + "logps/rejected": -298.4026794433594, + "loss": 0.4795, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5416791439056396, + "rewards/margins": 0.9125336408615112, + "rewards/rejected": -2.4542126655578613, + "step": 13290 + }, + { + "epoch": 2.291523087525844, + "grad_norm": 48.986732482910156, + "learning_rate": 3.209968523400165e-08, + "logits/chosen": -1.5890921354293823, + "logits/rejected": -1.5477979183197021, + "logps/chosen": -223.21560668945312, + "logps/rejected": -297.00799560546875, + "loss": 0.5494, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7203028202056885, + "rewards/margins": 0.734029233455658, + "rewards/rejected": -2.454331874847412, + "step": 13300 + }, + { + "epoch": 2.293246037215713, + "grad_norm": 40.81411361694336, + "learning_rate": 3.195263900634863e-08, + "logits/chosen": -1.5560508966445923, + "logits/rejected": -1.5148818492889404, + "logps/chosen": -222.29904174804688, + "logps/rejected": -302.53082275390625, + "loss": 0.5463, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6516530513763428, + "rewards/margins": 0.8341676592826843, + "rewards/rejected": -2.485820770263672, + "step": 13310 + }, + { + "epoch": 2.2949689869055825, + "grad_norm": 54.13987350463867, + "learning_rate": 3.180586628849692e-08, + "logits/chosen": -1.644132375717163, + "logits/rejected": -1.5857830047607422, + "logps/chosen": -218.32791137695312, + "logps/rejected": -278.406982421875, + "loss": 0.5784, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6245272159576416, + "rewards/margins": 0.6808105111122131, + "rewards/rejected": -2.305337429046631, + "step": 13320 + }, + { + "epoch": 2.2966919365954515, + "grad_norm": 37.29862594604492, + "learning_rate": 3.165936767038534e-08, + "logits/chosen": -1.5331171751022339, + "logits/rejected": -1.4829689264297485, + "logps/chosen": -198.60708618164062, + "logps/rejected": -291.91204833984375, + "loss": 0.4975, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4252647161483765, + "rewards/margins": 0.9944926500320435, + "rewards/rejected": -2.41975736618042, + "step": 13330 + }, + { + "epoch": 2.2984148862853204, + "grad_norm": 36.688751220703125, + "learning_rate": 3.151314374085097e-08, + "logits/chosen": -1.6717342138290405, + "logits/rejected": -1.6321557760238647, + "logps/chosen": -210.4832000732422, + "logps/rejected": -285.9844970703125, + "loss": 0.5545, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5588440895080566, + "rewards/margins": 0.7591500878334045, + "rewards/rejected": -2.3179941177368164, + "step": 13340 + }, + { + "epoch": 2.3001378359751894, + "grad_norm": 36.74166488647461, + "learning_rate": 3.136719508762674e-08, + "logits/chosen": -1.6364288330078125, + "logits/rejected": -1.571699857711792, + "logps/chosen": -200.09927368164062, + "logps/rejected": -281.8629150390625, + "loss": 0.4931, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4316269159317017, + "rewards/margins": 0.8701766133308411, + "rewards/rejected": -2.3018035888671875, + "step": 13350 + }, + { + "epoch": 2.301860785665059, + "grad_norm": 27.395763397216797, + "learning_rate": 3.1221522297339177e-08, + "logits/chosen": -1.6450027227401733, + "logits/rejected": -1.6021854877471924, + "logps/chosen": -205.87466430664062, + "logps/rejected": -290.9104919433594, + "loss": 0.4869, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4858473539352417, + "rewards/margins": 0.8877309560775757, + "rewards/rejected": -2.3735783100128174, + "step": 13360 + }, + { + "epoch": 2.3035837353549278, + "grad_norm": 17.683246612548828, + "learning_rate": 3.1076125955506015e-08, + "logits/chosen": -1.5695432424545288, + "logits/rejected": -1.5173529386520386, + "logps/chosen": -202.6400604248047, + "logps/rejected": -292.83282470703125, + "loss": 0.514, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4840691089630127, + "rewards/margins": 0.9159029722213745, + "rewards/rejected": -2.3999722003936768, + "step": 13370 + }, + { + "epoch": 2.3053066850447967, + "grad_norm": 39.179649353027344, + "learning_rate": 3.0931006646533866e-08, + "logits/chosen": -1.5399872064590454, + "logits/rejected": -1.4884560108184814, + "logps/chosen": -212.501953125, + "logps/rejected": -280.26556396484375, + "loss": 0.5347, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5561041831970215, + "rewards/margins": 0.7119619846343994, + "rewards/rejected": -2.26806640625, + "step": 13380 + }, + { + "epoch": 2.3070296347346657, + "grad_norm": 32.08159255981445, + "learning_rate": 3.078616495371574e-08, + "logits/chosen": -1.523012399673462, + "logits/rejected": -1.4631705284118652, + "logps/chosen": -196.67645263671875, + "logps/rejected": -275.2851257324219, + "loss": 0.5165, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3953605890274048, + "rewards/margins": 0.8383074998855591, + "rewards/rejected": -2.2336678504943848, + "step": 13390 + }, + { + "epoch": 2.3087525844245347, + "grad_norm": 19.68545913696289, + "learning_rate": 3.064160145922884e-08, + "logits/chosen": -1.6055904626846313, + "logits/rejected": -1.551365613937378, + "logps/chosen": -186.95303344726562, + "logps/rejected": -281.7250061035156, + "loss": 0.4451, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3198156356811523, + "rewards/margins": 0.9908710718154907, + "rewards/rejected": -2.3106865882873535, + "step": 13400 + }, + { + "epoch": 2.3104755341144037, + "grad_norm": 17.97285270690918, + "learning_rate": 3.0497316744132215e-08, + "logits/chosen": -1.6288455724716187, + "logits/rejected": -1.5841569900512695, + "logps/chosen": -210.47897338867188, + "logps/rejected": -289.45245361328125, + "loss": 0.5404, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5649375915527344, + "rewards/margins": 0.8170774579048157, + "rewards/rejected": -2.3820149898529053, + "step": 13410 + }, + { + "epoch": 2.312198483804273, + "grad_norm": 33.42015838623047, + "learning_rate": 3.035331138836431e-08, + "logits/chosen": -1.6192560195922852, + "logits/rejected": -1.5792990922927856, + "logps/chosen": -205.99609375, + "logps/rejected": -300.41070556640625, + "loss": 0.4713, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.5238395929336548, + "rewards/margins": 0.9164296984672546, + "rewards/rejected": -2.4402692317962646, + "step": 13420 + }, + { + "epoch": 2.313921433494142, + "grad_norm": 33.72605514526367, + "learning_rate": 3.020958597074081e-08, + "logits/chosen": -1.6985355615615845, + "logits/rejected": -1.6434152126312256, + "logps/chosen": -207.11843872070312, + "logps/rejected": -298.39898681640625, + "loss": 0.4918, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.515989065170288, + "rewards/margins": 0.8947912454605103, + "rewards/rejected": -2.4107799530029297, + "step": 13430 + }, + { + "epoch": 2.315644383184011, + "grad_norm": 27.87947654724121, + "learning_rate": 3.006614106895211e-08, + "logits/chosen": -1.551805853843689, + "logits/rejected": -1.5149122476577759, + "logps/chosen": -189.2252197265625, + "logps/rejected": -256.5485534667969, + "loss": 0.5483, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3701688051223755, + "rewards/margins": 0.6807070970535278, + "rewards/rejected": -2.0508759021759033, + "step": 13440 + }, + { + "epoch": 2.31736733287388, + "grad_norm": 28.670364379882812, + "learning_rate": 2.992297725956121e-08, + "logits/chosen": -1.5825433731079102, + "logits/rejected": -1.537448763847351, + "logps/chosen": -195.5888214111328, + "logps/rejected": -270.58636474609375, + "loss": 0.5146, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4518344402313232, + "rewards/margins": 0.7301121950149536, + "rewards/rejected": -2.1819469928741455, + "step": 13450 + }, + { + "epoch": 2.3190902825637494, + "grad_norm": 34.71762466430664, + "learning_rate": 2.978009511800116e-08, + "logits/chosen": -1.6074050664901733, + "logits/rejected": -1.5511646270751953, + "logps/chosen": -187.51683044433594, + "logps/rejected": -280.5148010253906, + "loss": 0.4486, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.334885597229004, + "rewards/margins": 0.9614629745483398, + "rewards/rejected": -2.2963485717773438, + "step": 13460 + }, + { + "epoch": 2.3208132322536184, + "grad_norm": 30.473466873168945, + "learning_rate": 2.9637495218572972e-08, + "logits/chosen": -1.4833455085754395, + "logits/rejected": -1.4295669794082642, + "logps/chosen": -211.89492797851562, + "logps/rejected": -290.8565368652344, + "loss": 0.5145, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5683119297027588, + "rewards/margins": 0.8340962529182434, + "rewards/rejected": -2.4024083614349365, + "step": 13470 + }, + { + "epoch": 2.3225361819434873, + "grad_norm": 33.9401969909668, + "learning_rate": 2.9495178134443254e-08, + "logits/chosen": -1.6398298740386963, + "logits/rejected": -1.5832942724227905, + "logps/chosen": -198.39564514160156, + "logps/rejected": -279.4765930175781, + "loss": 0.4923, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.455733299255371, + "rewards/margins": 0.8250287175178528, + "rewards/rejected": -2.280762195587158, + "step": 13480 + }, + { + "epoch": 2.3242591316333563, + "grad_norm": 24.341140747070312, + "learning_rate": 2.9353144437641662e-08, + "logits/chosen": -1.6076714992523193, + "logits/rejected": -1.5592044591903687, + "logps/chosen": -218.0586395263672, + "logps/rejected": -297.05633544921875, + "loss": 0.515, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.641355276107788, + "rewards/margins": 0.79271399974823, + "rewards/rejected": -2.4340693950653076, + "step": 13490 + }, + { + "epoch": 2.3259820813232253, + "grad_norm": 31.230804443359375, + "learning_rate": 2.9211394699058987e-08, + "logits/chosen": -1.6254823207855225, + "logits/rejected": -1.5697517395019531, + "logps/chosen": -204.39346313476562, + "logps/rejected": -297.74896240234375, + "loss": 0.4478, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4872136116027832, + "rewards/margins": 0.9720531702041626, + "rewards/rejected": -2.4592669010162354, + "step": 13500 + }, + { + "epoch": 2.3277050310130942, + "grad_norm": 19.279212951660156, + "learning_rate": 2.9069929488444678e-08, + "logits/chosen": -1.5557644367218018, + "logits/rejected": -1.5264142751693726, + "logps/chosen": -196.2863311767578, + "logps/rejected": -276.21881103515625, + "loss": 0.5231, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.469422459602356, + "rewards/margins": 0.7577401399612427, + "rewards/rejected": -2.2271625995635986, + "step": 13510 + }, + { + "epoch": 2.3294279807029636, + "grad_norm": 26.38505744934082, + "learning_rate": 2.8928749374404448e-08, + "logits/chosen": -1.4847887754440308, + "logits/rejected": -1.440119981765747, + "logps/chosen": -201.7764434814453, + "logps/rejected": -298.28509521484375, + "loss": 0.4588, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5093433856964111, + "rewards/margins": 0.9552731513977051, + "rewards/rejected": -2.464616298675537, + "step": 13520 + }, + { + "epoch": 2.3311509303928326, + "grad_norm": 37.105743408203125, + "learning_rate": 2.8787854924398123e-08, + "logits/chosen": -1.5994256734848022, + "logits/rejected": -1.552588939666748, + "logps/chosen": -210.1918182373047, + "logps/rejected": -270.26116943359375, + "loss": 0.5475, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.552570104598999, + "rewards/margins": 0.6298242807388306, + "rewards/rejected": -2.182394504547119, + "step": 13530 + }, + { + "epoch": 2.3328738800827016, + "grad_norm": 53.39751434326172, + "learning_rate": 2.8647246704737382e-08, + "logits/chosen": -1.51864755153656, + "logits/rejected": -1.4642963409423828, + "logps/chosen": -211.7984619140625, + "logps/rejected": -294.93060302734375, + "loss": 0.492, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5604420900344849, + "rewards/margins": 0.885561466217041, + "rewards/rejected": -2.4460034370422363, + "step": 13540 + }, + { + "epoch": 2.3345968297725705, + "grad_norm": 31.251556396484375, + "learning_rate": 2.8506925280583417e-08, + "logits/chosen": -1.5718047618865967, + "logits/rejected": -1.5317022800445557, + "logps/chosen": -213.5127716064453, + "logps/rejected": -282.591796875, + "loss": 0.5498, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6243717670440674, + "rewards/margins": 0.6968979835510254, + "rewards/rejected": -2.321269989013672, + "step": 13550 + }, + { + "epoch": 2.3363197794624395, + "grad_norm": 41.41878890991211, + "learning_rate": 2.8366891215944598e-08, + "logits/chosen": -1.6352542638778687, + "logits/rejected": -1.6042063236236572, + "logps/chosen": -200.40451049804688, + "logps/rejected": -274.9106750488281, + "loss": 0.5705, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.486896276473999, + "rewards/margins": 0.7334551811218262, + "rewards/rejected": -2.220351457595825, + "step": 13560 + }, + { + "epoch": 2.338042729152309, + "grad_norm": 29.753414154052734, + "learning_rate": 2.8227145073674385e-08, + "logits/chosen": -1.5187408924102783, + "logits/rejected": -1.4685065746307373, + "logps/chosen": -203.2227783203125, + "logps/rejected": -302.477294921875, + "loss": 0.4416, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.52272367477417, + "rewards/margins": 0.9911115765571594, + "rewards/rejected": -2.5138351917266846, + "step": 13570 + }, + { + "epoch": 2.339765678842178, + "grad_norm": 24.401880264282227, + "learning_rate": 2.8087687415468896e-08, + "logits/chosen": -1.5765650272369385, + "logits/rejected": -1.5453118085861206, + "logps/chosen": -192.10269165039062, + "logps/rejected": -274.36383056640625, + "loss": 0.4839, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4052332639694214, + "rewards/margins": 0.8032985925674438, + "rewards/rejected": -2.2085318565368652, + "step": 13580 + }, + { + "epoch": 2.341488628532047, + "grad_norm": 28.540246963500977, + "learning_rate": 2.7948518801864697e-08, + "logits/chosen": -1.5397393703460693, + "logits/rejected": -1.5157146453857422, + "logps/chosen": -213.9230194091797, + "logps/rejected": -296.255859375, + "loss": 0.5176, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6143159866333008, + "rewards/margins": 0.8101215362548828, + "rewards/rejected": -2.4244375228881836, + "step": 13590 + }, + { + "epoch": 2.343211578221916, + "grad_norm": 47.43010330200195, + "learning_rate": 2.780963979223663e-08, + "logits/chosen": -1.5442464351654053, + "logits/rejected": -1.4999010562896729, + "logps/chosen": -220.0675506591797, + "logps/rejected": -299.0097351074219, + "loss": 0.5059, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6141239404678345, + "rewards/margins": 0.8025999069213867, + "rewards/rejected": -2.4167237281799316, + "step": 13600 + }, + { + "epoch": 2.343211578221916, + "eval_logits/chosen": -1.7051271200180054, + "eval_logits/rejected": -1.6811729669570923, + "eval_logps/chosen": -206.69924926757812, + "eval_logps/rejected": -246.98841857910156, + "eval_loss": 0.631769061088562, + "eval_rewards/accuracies": 0.643122673034668, + "eval_rewards/chosen": -1.479873538017273, + "eval_rewards/margins": 0.35820940136909485, + "eval_rewards/rejected": -1.8380827903747559, + "eval_runtime": 384.7838, + "eval_samples_per_second": 11.186, + "eval_steps_per_second": 1.398, + "step": 13600 + }, + { + "epoch": 2.344934527911785, + "grad_norm": 39.63872528076172, + "learning_rate": 2.7671050944795494e-08, + "logits/chosen": -1.7334047555923462, + "logits/rejected": -1.6934597492218018, + "logps/chosen": -201.57763671875, + "logps/rejected": -271.024658203125, + "loss": 0.5537, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4877710342407227, + "rewards/margins": 0.6896355152130127, + "rewards/rejected": -2.1774067878723145, + "step": 13610 + }, + { + "epoch": 2.346657477601654, + "grad_norm": 21.51317024230957, + "learning_rate": 2.753275281658578e-08, + "logits/chosen": -1.6060326099395752, + "logits/rejected": -1.5415525436401367, + "logps/chosen": -202.09640502929688, + "logps/rejected": -286.78485107421875, + "loss": 0.4627, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4595098495483398, + "rewards/margins": 0.891812801361084, + "rewards/rejected": -2.351322650909424, + "step": 13620 + }, + { + "epoch": 2.348380427291523, + "grad_norm": 26.84193992614746, + "learning_rate": 2.7394745963483414e-08, + "logits/chosen": -1.5497287511825562, + "logits/rejected": -1.484168291091919, + "logps/chosen": -208.8119659423828, + "logps/rejected": -305.40679931640625, + "loss": 0.4444, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.5409910678863525, + "rewards/margins": 1.0113693475723267, + "rewards/rejected": -2.5523600578308105, + "step": 13630 + }, + { + "epoch": 2.350103376981392, + "grad_norm": 26.73301887512207, + "learning_rate": 2.725703094019368e-08, + "logits/chosen": -1.5552833080291748, + "logits/rejected": -1.510765790939331, + "logps/chosen": -205.0259246826172, + "logps/rejected": -292.72039794921875, + "loss": 0.4904, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5017683506011963, + "rewards/margins": 0.88092440366745, + "rewards/rejected": -2.382692813873291, + "step": 13640 + }, + { + "epoch": 2.351826326671261, + "grad_norm": 31.266645431518555, + "learning_rate": 2.7119608300248842e-08, + "logits/chosen": -1.6274423599243164, + "logits/rejected": -1.5821449756622314, + "logps/chosen": -216.30654907226562, + "logps/rejected": -301.15802001953125, + "loss": 0.5018, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6229690313339233, + "rewards/margins": 0.8716036677360535, + "rewards/rejected": -2.494572639465332, + "step": 13650 + }, + { + "epoch": 2.35354927636113, + "grad_norm": 35.00014877319336, + "learning_rate": 2.698247859600591e-08, + "logits/chosen": -1.5113948583602905, + "logits/rejected": -1.4610074758529663, + "logps/chosen": -200.38381958007812, + "logps/rejected": -281.7772521972656, + "loss": 0.4975, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4734694957733154, + "rewards/margins": 0.8053814172744751, + "rewards/rejected": -2.278850793838501, + "step": 13660 + }, + { + "epoch": 2.3552722260509995, + "grad_norm": 33.406166076660156, + "learning_rate": 2.6845642378644463e-08, + "logits/chosen": -1.6255537271499634, + "logits/rejected": -1.5784156322479248, + "logps/chosen": -210.26931762695312, + "logps/rejected": -292.46734619140625, + "loss": 0.501, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5554783344268799, + "rewards/margins": 0.8226876258850098, + "rewards/rejected": -2.3781659603118896, + "step": 13670 + }, + { + "epoch": 2.3569951757408685, + "grad_norm": 29.371341705322266, + "learning_rate": 2.6709100198164513e-08, + "logits/chosen": -1.6458193063735962, + "logits/rejected": -1.5945569276809692, + "logps/chosen": -196.690185546875, + "logps/rejected": -265.1627502441406, + "loss": 0.5321, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4204047918319702, + "rewards/margins": 0.7230075597763062, + "rewards/rejected": -2.1434123516082764, + "step": 13680 + }, + { + "epoch": 2.3587181254307374, + "grad_norm": 30.44771385192871, + "learning_rate": 2.657285260338421e-08, + "logits/chosen": -1.6045669317245483, + "logits/rejected": -1.548130989074707, + "logps/chosen": -203.29916381835938, + "logps/rejected": -295.7070617675781, + "loss": 0.5, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4948320388793945, + "rewards/margins": 0.9141039848327637, + "rewards/rejected": -2.408936023712158, + "step": 13690 + }, + { + "epoch": 2.3604410751206064, + "grad_norm": 38.34294128417969, + "learning_rate": 2.643690014193758e-08, + "logits/chosen": -1.602710485458374, + "logits/rejected": -1.5532397031784058, + "logps/chosen": -212.09945678710938, + "logps/rejected": -274.312744140625, + "loss": 0.5696, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.530841588973999, + "rewards/margins": 0.6810584664344788, + "rewards/rejected": -2.211899995803833, + "step": 13700 + }, + { + "epoch": 2.3621640248104754, + "grad_norm": 45.795570373535156, + "learning_rate": 2.6301243360272394e-08, + "logits/chosen": -1.566232442855835, + "logits/rejected": -1.5085102319717407, + "logps/chosen": -197.57620239257812, + "logps/rejected": -284.8357849121094, + "loss": 0.4789, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.440800428390503, + "rewards/margins": 0.868538498878479, + "rewards/rejected": -2.3093388080596924, + "step": 13710 + }, + { + "epoch": 2.3638869745003444, + "grad_norm": 40.264739990234375, + "learning_rate": 2.6165882803648055e-08, + "logits/chosen": -1.5383836030960083, + "logits/rejected": -1.4786090850830078, + "logps/chosen": -197.35031127929688, + "logps/rejected": -275.8687438964844, + "loss": 0.5075, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4295049905776978, + "rewards/margins": 0.8047353029251099, + "rewards/rejected": -2.2342402935028076, + "step": 13720 + }, + { + "epoch": 2.3656099241902138, + "grad_norm": 52.54778289794922, + "learning_rate": 2.60308190161332e-08, + "logits/chosen": -1.693058967590332, + "logits/rejected": -1.6434071063995361, + "logps/chosen": -199.19119262695312, + "logps/rejected": -299.73699951171875, + "loss": 0.4603, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4446357488632202, + "rewards/margins": 0.9695494771003723, + "rewards/rejected": -2.4141855239868164, + "step": 13730 + }, + { + "epoch": 2.3673328738800827, + "grad_norm": 20.35848045349121, + "learning_rate": 2.5896052540603706e-08, + "logits/chosen": -1.6675994396209717, + "logits/rejected": -1.6155967712402344, + "logps/chosen": -201.94395446777344, + "logps/rejected": -291.0851745605469, + "loss": 0.4777, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4425544738769531, + "rewards/margins": 0.9335956573486328, + "rewards/rejected": -2.376150369644165, + "step": 13740 + }, + { + "epoch": 2.3690558235699517, + "grad_norm": 39.45704650878906, + "learning_rate": 2.576158391874047e-08, + "logits/chosen": -1.6140620708465576, + "logits/rejected": -1.5652718544006348, + "logps/chosen": -216.77197265625, + "logps/rejected": -310.7403564453125, + "loss": 0.4879, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.60541570186615, + "rewards/margins": 0.9550965428352356, + "rewards/rejected": -2.5605123043060303, + "step": 13750 + }, + { + "epoch": 2.3707787732598207, + "grad_norm": 49.456642150878906, + "learning_rate": 2.562741369102711e-08, + "logits/chosen": -1.6827857494354248, + "logits/rejected": -1.6329879760742188, + "logps/chosen": -201.86753845214844, + "logps/rejected": -278.7059326171875, + "loss": 0.5225, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4746357202529907, + "rewards/margins": 0.8057149648666382, + "rewards/rejected": -2.280350685119629, + "step": 13760 + }, + { + "epoch": 2.37250172294969, + "grad_norm": 43.56124496459961, + "learning_rate": 2.549354239674786e-08, + "logits/chosen": -1.6783527135849, + "logits/rejected": -1.6421291828155518, + "logps/chosen": -206.2716064453125, + "logps/rejected": -290.7262268066406, + "loss": 0.5038, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.555018424987793, + "rewards/margins": 0.8067394495010376, + "rewards/rejected": -2.361757755279541, + "step": 13770 + }, + { + "epoch": 2.374224672639559, + "grad_norm": 22.799421310424805, + "learning_rate": 2.5359970573985524e-08, + "logits/chosen": -1.7318445444107056, + "logits/rejected": -1.6665557622909546, + "logps/chosen": -214.9160919189453, + "logps/rejected": -292.8385009765625, + "loss": 0.4964, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5610339641571045, + "rewards/margins": 0.8489457964897156, + "rewards/rejected": -2.409980058670044, + "step": 13780 + }, + { + "epoch": 2.375947622329428, + "grad_norm": 23.744430541992188, + "learning_rate": 2.522669875961919e-08, + "logits/chosen": -1.620179533958435, + "logits/rejected": -1.5787187814712524, + "logps/chosen": -200.7430877685547, + "logps/rejected": -281.3035583496094, + "loss": 0.5127, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4429785013198853, + "rewards/margins": 0.8367716073989868, + "rewards/rejected": -2.279750108718872, + "step": 13790 + }, + { + "epoch": 2.377670572019297, + "grad_norm": 50.123878479003906, + "learning_rate": 2.509372748932195e-08, + "logits/chosen": -1.6756843328475952, + "logits/rejected": -1.6160857677459717, + "logps/chosen": -209.7655487060547, + "logps/rejected": -296.96343994140625, + "loss": 0.4796, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5245649814605713, + "rewards/margins": 0.9218786358833313, + "rewards/rejected": -2.446443796157837, + "step": 13800 + }, + { + "epoch": 2.379393521709166, + "grad_norm": 33.61479187011719, + "learning_rate": 2.4961057297559064e-08, + "logits/chosen": -1.5745052099227905, + "logits/rejected": -1.534232497215271, + "logps/chosen": -195.0514373779297, + "logps/rejected": -282.4466247558594, + "loss": 0.5002, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.428135633468628, + "rewards/margins": 0.8580183982849121, + "rewards/rejected": -2.286154270172119, + "step": 13810 + }, + { + "epoch": 2.381116471399035, + "grad_norm": 39.281761169433594, + "learning_rate": 2.4828688717585567e-08, + "logits/chosen": -1.6656768321990967, + "logits/rejected": -1.6113700866699219, + "logps/chosen": -215.00863647460938, + "logps/rejected": -290.1063537597656, + "loss": 0.5251, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5446885824203491, + "rewards/margins": 0.8189195394515991, + "rewards/rejected": -2.3636081218719482, + "step": 13820 + }, + { + "epoch": 2.3828394210889043, + "grad_norm": 26.61870574951172, + "learning_rate": 2.4696622281444158e-08, + "logits/chosen": -1.6838382482528687, + "logits/rejected": -1.6528728008270264, + "logps/chosen": -200.13461303710938, + "logps/rejected": -268.8764953613281, + "loss": 0.516, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4510828256607056, + "rewards/margins": 0.7129031419754028, + "rewards/rejected": -2.1639862060546875, + "step": 13830 + }, + { + "epoch": 2.3845623707787733, + "grad_norm": 35.59844207763672, + "learning_rate": 2.4564858519963195e-08, + "logits/chosen": -1.660980463027954, + "logits/rejected": -1.6128597259521484, + "logps/chosen": -200.73452758789062, + "logps/rejected": -267.2124328613281, + "loss": 0.5397, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4402029514312744, + "rewards/margins": 0.7149636745452881, + "rewards/rejected": -2.1551668643951416, + "step": 13840 + }, + { + "epoch": 2.3862853204686423, + "grad_norm": 35.738765716552734, + "learning_rate": 2.443339796275432e-08, + "logits/chosen": -1.565151572227478, + "logits/rejected": -1.5192712545394897, + "logps/chosen": -204.0678253173828, + "logps/rejected": -276.42156982421875, + "loss": 0.5454, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4998000860214233, + "rewards/margins": 0.7670835256576538, + "rewards/rejected": -2.266883611679077, + "step": 13850 + }, + { + "epoch": 2.3880082701585112, + "grad_norm": 25.851102828979492, + "learning_rate": 2.4302241138210633e-08, + "logits/chosen": -1.5774033069610596, + "logits/rejected": -1.5349401235580444, + "logps/chosen": -203.85476684570312, + "logps/rejected": -286.1795654296875, + "loss": 0.4733, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5228554010391235, + "rewards/margins": 0.8183981776237488, + "rewards/rejected": -2.3412535190582275, + "step": 13860 + }, + { + "epoch": 2.3897312198483807, + "grad_norm": 26.764244079589844, + "learning_rate": 2.417138857350428e-08, + "logits/chosen": -1.6568161249160767, + "logits/rejected": -1.613242745399475, + "logps/chosen": -210.5281219482422, + "logps/rejected": -302.59844970703125, + "loss": 0.4892, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5569124221801758, + "rewards/margins": 0.9423718452453613, + "rewards/rejected": -2.499284267425537, + "step": 13870 + }, + { + "epoch": 2.3914541695382496, + "grad_norm": 39.235816955566406, + "learning_rate": 2.404084079458457e-08, + "logits/chosen": -1.5871946811676025, + "logits/rejected": -1.5298856496810913, + "logps/chosen": -214.61550903320312, + "logps/rejected": -282.5399475097656, + "loss": 0.5843, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.601596474647522, + "rewards/margins": 0.7003291845321655, + "rewards/rejected": -2.3019254207611084, + "step": 13880 + }, + { + "epoch": 2.3931771192281186, + "grad_norm": 24.032691955566406, + "learning_rate": 2.3910598326175635e-08, + "logits/chosen": -1.6235158443450928, + "logits/rejected": -1.5832910537719727, + "logps/chosen": -204.44700622558594, + "logps/rejected": -277.77716064453125, + "loss": 0.4993, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4882326126098633, + "rewards/margins": 0.7519177198410034, + "rewards/rejected": -2.2401506900787354, + "step": 13890 + }, + { + "epoch": 2.3949000689179876, + "grad_norm": 23.762805938720703, + "learning_rate": 2.3780661691774585e-08, + "logits/chosen": -1.5637362003326416, + "logits/rejected": -1.5174527168273926, + "logps/chosen": -195.5287628173828, + "logps/rejected": -269.56658935546875, + "loss": 0.5077, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.420799732208252, + "rewards/margins": 0.7775405049324036, + "rewards/rejected": -2.1983401775360107, + "step": 13900 + }, + { + "epoch": 2.3966230186078565, + "grad_norm": 37.73569107055664, + "learning_rate": 2.3651031413649127e-08, + "logits/chosen": -1.5958714485168457, + "logits/rejected": -1.552513837814331, + "logps/chosen": -190.7194061279297, + "logps/rejected": -259.73394775390625, + "loss": 0.5196, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.340254783630371, + "rewards/margins": 0.7284151315689087, + "rewards/rejected": -2.0686697959899902, + "step": 13910 + }, + { + "epoch": 2.3983459682977255, + "grad_norm": 25.303503036499023, + "learning_rate": 2.3521708012835696e-08, + "logits/chosen": -1.6298589706420898, + "logits/rejected": -1.5667641162872314, + "logps/chosen": -205.85693359375, + "logps/rejected": -285.18780517578125, + "loss": 0.4789, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.456417441368103, + "rewards/margins": 0.8772147297859192, + "rewards/rejected": -2.333632469177246, + "step": 13920 + }, + { + "epoch": 2.400068917987595, + "grad_norm": 28.09389305114746, + "learning_rate": 2.3392692009137193e-08, + "logits/chosen": -1.6002569198608398, + "logits/rejected": -1.5511232614517212, + "logps/chosen": -190.23448181152344, + "logps/rejected": -257.18218994140625, + "loss": 0.55, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3869836330413818, + "rewards/margins": 0.6741336584091187, + "rewards/rejected": -2.061117649078369, + "step": 13930 + }, + { + "epoch": 2.401791867677464, + "grad_norm": 31.90937042236328, + "learning_rate": 2.3263983921120987e-08, + "logits/chosen": -1.5503696203231812, + "logits/rejected": -1.5013597011566162, + "logps/chosen": -186.52639770507812, + "logps/rejected": -286.6660461425781, + "loss": 0.4748, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3336960077285767, + "rewards/margins": 0.9724694490432739, + "rewards/rejected": -2.3061652183532715, + "step": 13940 + }, + { + "epoch": 2.403514817367333, + "grad_norm": 45.16349792480469, + "learning_rate": 2.3135584266116837e-08, + "logits/chosen": -1.643180251121521, + "logits/rejected": -1.5967227220535278, + "logps/chosen": -201.06625366210938, + "logps/rejected": -273.56024169921875, + "loss": 0.5508, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4699921607971191, + "rewards/margins": 0.7421053647994995, + "rewards/rejected": -2.212097644805908, + "step": 13950 + }, + { + "epoch": 2.405237767057202, + "grad_norm": 33.21641540527344, + "learning_rate": 2.3007493560214787e-08, + "logits/chosen": -1.4977080821990967, + "logits/rejected": -1.477624773979187, + "logps/chosen": -202.22499084472656, + "logps/rejected": -258.01995849609375, + "loss": 0.5666, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4879155158996582, + "rewards/margins": 0.5633623003959656, + "rewards/rejected": -2.0512776374816895, + "step": 13960 + }, + { + "epoch": 2.406960716747071, + "grad_norm": 18.644105911254883, + "learning_rate": 2.2879712318263056e-08, + "logits/chosen": -1.6170930862426758, + "logits/rejected": -1.5662033557891846, + "logps/chosen": -198.0435791015625, + "logps/rejected": -275.56024169921875, + "loss": 0.5217, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.395129680633545, + "rewards/margins": 0.8305740356445312, + "rewards/rejected": -2.225703477859497, + "step": 13970 + }, + { + "epoch": 2.40868366643694, + "grad_norm": 27.033361434936523, + "learning_rate": 2.2752241053865973e-08, + "logits/chosen": -1.58198082447052, + "logits/rejected": -1.5406694412231445, + "logps/chosen": -191.66148376464844, + "logps/rejected": -282.66217041015625, + "loss": 0.4813, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.3707060813903809, + "rewards/margins": 0.9057755470275879, + "rewards/rejected": -2.2764816284179688, + "step": 13980 + }, + { + "epoch": 2.410406616126809, + "grad_norm": 48.245643615722656, + "learning_rate": 2.2625080279382024e-08, + "logits/chosen": -1.6190292835235596, + "logits/rejected": -1.5692462921142578, + "logps/chosen": -197.7570343017578, + "logps/rejected": -270.0393371582031, + "loss": 0.5232, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4068973064422607, + "rewards/margins": 0.7630517482757568, + "rewards/rejected": -2.1699490547180176, + "step": 13990 + }, + { + "epoch": 2.412129565816678, + "grad_norm": 32.1939582824707, + "learning_rate": 2.249823050592169e-08, + "logits/chosen": -1.5385420322418213, + "logits/rejected": -1.4891750812530518, + "logps/chosen": -198.2667236328125, + "logps/rejected": -287.6027526855469, + "loss": 0.4543, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4719661474227905, + "rewards/margins": 0.8670805096626282, + "rewards/rejected": -2.3390464782714844, + "step": 14000 + }, + { + "epoch": 2.412129565816678, + "eval_logits/chosen": -1.7365682125091553, + "eval_logits/rejected": -1.7133723497390747, + "eval_logps/chosen": -195.87925720214844, + "eval_logps/rejected": -234.26934814453125, + "eval_loss": 0.6318486332893372, + "eval_rewards/accuracies": 0.6459107995033264, + "eval_rewards/chosen": -1.3716734647750854, + "eval_rewards/margins": 0.33921846747398376, + "eval_rewards/rejected": -1.710891842842102, + "eval_runtime": 384.7361, + "eval_samples_per_second": 11.187, + "eval_steps_per_second": 1.398, + "step": 14000 + }, + { + "epoch": 2.413852515506547, + "grad_norm": 30.94611358642578, + "learning_rate": 2.2371692243345354e-08, + "logits/chosen": -1.574171781539917, + "logits/rejected": -1.5344856977462769, + "logps/chosen": -197.61508178710938, + "logps/rejected": -269.8860778808594, + "loss": 0.5565, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.42230224609375, + "rewards/margins": 0.7458277940750122, + "rewards/rejected": -2.1681301593780518, + "step": 14010 + }, + { + "epoch": 2.415575465196416, + "grad_norm": 33.24098205566406, + "learning_rate": 2.2245466000261394e-08, + "logits/chosen": -1.5797890424728394, + "logits/rejected": -1.5521284341812134, + "logps/chosen": -204.57333374023438, + "logps/rejected": -268.26678466796875, + "loss": 0.5693, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5004533529281616, + "rewards/margins": 0.6720166802406311, + "rewards/rejected": -2.1724698543548584, + "step": 14020 + }, + { + "epoch": 2.4172984148862855, + "grad_norm": 30.121356964111328, + "learning_rate": 2.211955228402399e-08, + "logits/chosen": -1.5755398273468018, + "logits/rejected": -1.528684377670288, + "logps/chosen": -204.1021270751953, + "logps/rejected": -272.84130859375, + "loss": 0.5418, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5018281936645508, + "rewards/margins": 0.7337289452552795, + "rewards/rejected": -2.2355570793151855, + "step": 14030 + }, + { + "epoch": 2.4190213645761545, + "grad_norm": 42.259281158447266, + "learning_rate": 2.1993951600731154e-08, + "logits/chosen": -1.569271445274353, + "logits/rejected": -1.4973461627960205, + "logps/chosen": -198.60647583007812, + "logps/rejected": -280.525634765625, + "loss": 0.4824, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4193156957626343, + "rewards/margins": 0.8868446350097656, + "rewards/rejected": -2.3061604499816895, + "step": 14040 + }, + { + "epoch": 2.4207443142660234, + "grad_norm": 24.36745262145996, + "learning_rate": 2.186866445522273e-08, + "logits/chosen": -1.6486291885375977, + "logits/rejected": -1.5946848392486572, + "logps/chosen": -185.94216918945312, + "logps/rejected": -250.9385223388672, + "loss": 0.5323, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.309471845626831, + "rewards/margins": 0.694402813911438, + "rewards/rejected": -2.0038745403289795, + "step": 14050 + }, + { + "epoch": 2.4224672639558924, + "grad_norm": 21.555404663085938, + "learning_rate": 2.1743691351078332e-08, + "logits/chosen": -1.6609976291656494, + "logits/rejected": -1.6012214422225952, + "logps/chosen": -188.33303833007812, + "logps/rejected": -285.00128173828125, + "loss": 0.4531, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3309261798858643, + "rewards/margins": 1.0017203092575073, + "rewards/rejected": -2.332646369934082, + "step": 14060 + }, + { + "epoch": 2.4241902136457614, + "grad_norm": 33.299034118652344, + "learning_rate": 2.161903279061529e-08, + "logits/chosen": -1.6014869213104248, + "logits/rejected": -1.549005389213562, + "logps/chosen": -203.32826232910156, + "logps/rejected": -292.0400390625, + "loss": 0.4812, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4871909618377686, + "rewards/margins": 0.8758770823478699, + "rewards/rejected": -2.363068103790283, + "step": 14070 + }, + { + "epoch": 2.425913163335631, + "grad_norm": 25.26913070678711, + "learning_rate": 2.14946892748866e-08, + "logits/chosen": -1.5732542276382446, + "logits/rejected": -1.5227702856063843, + "logps/chosen": -215.99417114257812, + "logps/rejected": -286.9879455566406, + "loss": 0.5608, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.573691725730896, + "rewards/margins": 0.7653611898422241, + "rewards/rejected": -2.339052677154541, + "step": 14080 + }, + { + "epoch": 2.4276361130254998, + "grad_norm": 28.92695426940918, + "learning_rate": 2.1370661303679084e-08, + "logits/chosen": -1.5760879516601562, + "logits/rejected": -1.5211198329925537, + "logps/chosen": -203.30577087402344, + "logps/rejected": -268.3055419921875, + "loss": 0.5498, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.474204182624817, + "rewards/margins": 0.6897687911987305, + "rewards/rejected": -2.163972854614258, + "step": 14090 + }, + { + "epoch": 2.4293590627153687, + "grad_norm": 22.257734298706055, + "learning_rate": 2.1246949375511214e-08, + "logits/chosen": -1.6646171808242798, + "logits/rejected": -1.6052793264389038, + "logps/chosen": -197.08328247070312, + "logps/rejected": -284.4249572753906, + "loss": 0.4527, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.385813593864441, + "rewards/margins": 0.9123753309249878, + "rewards/rejected": -2.2981889247894287, + "step": 14100 + }, + { + "epoch": 2.4310820124052377, + "grad_norm": 50.557247161865234, + "learning_rate": 2.1123553987631126e-08, + "logits/chosen": -1.6229591369628906, + "logits/rejected": -1.5879002809524536, + "logps/chosen": -197.52047729492188, + "logps/rejected": -273.00579833984375, + "loss": 0.533, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4846248626708984, + "rewards/margins": 0.7303065061569214, + "rewards/rejected": -2.2149314880371094, + "step": 14110 + }, + { + "epoch": 2.4328049620951067, + "grad_norm": 41.365875244140625, + "learning_rate": 2.1000475636014635e-08, + "logits/chosen": -1.6093180179595947, + "logits/rejected": -1.5649349689483643, + "logps/chosen": -205.98617553710938, + "logps/rejected": -277.5599365234375, + "loss": 0.5421, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5368276834487915, + "rewards/margins": 0.7141960859298706, + "rewards/rejected": -2.251023769378662, + "step": 14120 + }, + { + "epoch": 2.4345279117849756, + "grad_norm": 37.92250442504883, + "learning_rate": 2.0877714815363366e-08, + "logits/chosen": -1.6542012691497803, + "logits/rejected": -1.6128876209259033, + "logps/chosen": -189.68319702148438, + "logps/rejected": -254.00405883789062, + "loss": 0.5245, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3541688919067383, + "rewards/margins": 0.6648411750793457, + "rewards/rejected": -2.019010066986084, + "step": 14130 + }, + { + "epoch": 2.436250861474845, + "grad_norm": 23.785160064697266, + "learning_rate": 2.0755272019102542e-08, + "logits/chosen": -1.706735372543335, + "logits/rejected": -1.6625268459320068, + "logps/chosen": -206.888916015625, + "logps/rejected": -287.9200134277344, + "loss": 0.5175, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5433664321899414, + "rewards/margins": 0.8240596055984497, + "rewards/rejected": -2.3674259185791016, + "step": 14140 + }, + { + "epoch": 2.437973811164714, + "grad_norm": 37.91211700439453, + "learning_rate": 2.063314773937921e-08, + "logits/chosen": -1.6717586517333984, + "logits/rejected": -1.6323515176773071, + "logps/chosen": -199.97064208984375, + "logps/rejected": -282.82415771484375, + "loss": 0.5092, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4601263999938965, + "rewards/margins": 0.8106845617294312, + "rewards/rejected": -2.270810842514038, + "step": 14150 + }, + { + "epoch": 2.439696760854583, + "grad_norm": 27.181774139404297, + "learning_rate": 2.051134246706008e-08, + "logits/chosen": -1.6129558086395264, + "logits/rejected": -1.571523666381836, + "logps/chosen": -197.92718505859375, + "logps/rejected": -272.5576171875, + "loss": 0.5285, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4462512731552124, + "rewards/margins": 0.7152296900749207, + "rewards/rejected": -2.1614811420440674, + "step": 14160 + }, + { + "epoch": 2.441419710544452, + "grad_norm": 35.80717086791992, + "learning_rate": 2.0389856691729734e-08, + "logits/chosen": -1.541624665260315, + "logits/rejected": -1.4962389469146729, + "logps/chosen": -206.609130859375, + "logps/rejected": -269.45013427734375, + "loss": 0.5747, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.510289192199707, + "rewards/margins": 0.6496440172195435, + "rewards/rejected": -2.159933090209961, + "step": 14170 + }, + { + "epoch": 2.4431426602343214, + "grad_norm": 41.601417541503906, + "learning_rate": 2.026869090168849e-08, + "logits/chosen": -1.609832525253296, + "logits/rejected": -1.5575616359710693, + "logps/chosen": -209.0862274169922, + "logps/rejected": -269.556884765625, + "loss": 0.554, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5163252353668213, + "rewards/margins": 0.664095938205719, + "rewards/rejected": -2.1804213523864746, + "step": 14180 + }, + { + "epoch": 2.4448656099241903, + "grad_norm": 33.28386306762695, + "learning_rate": 2.0147845583950552e-08, + "logits/chosen": -1.6766802072525024, + "logits/rejected": -1.6355340480804443, + "logps/chosen": -206.7571258544922, + "logps/rejected": -266.9535217285156, + "loss": 0.5501, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4788180589675903, + "rewards/margins": 0.6512070894241333, + "rewards/rejected": -2.1300249099731445, + "step": 14190 + }, + { + "epoch": 2.4465885596140593, + "grad_norm": 32.834320068359375, + "learning_rate": 2.0027321224242067e-08, + "logits/chosen": -1.518226146697998, + "logits/rejected": -1.4756324291229248, + "logps/chosen": -183.6091766357422, + "logps/rejected": -269.2956237792969, + "loss": 0.4708, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.29989492893219, + "rewards/margins": 0.8477428555488586, + "rewards/rejected": -2.1476378440856934, + "step": 14200 + }, + { + "epoch": 2.4483115093039283, + "grad_norm": 33.95294952392578, + "learning_rate": 1.9907118306999017e-08, + "logits/chosen": -1.6634937524795532, + "logits/rejected": -1.6183143854141235, + "logps/chosen": -197.2618865966797, + "logps/rejected": -266.377197265625, + "loss": 0.5297, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4434157609939575, + "rewards/margins": 0.6969861388206482, + "rewards/rejected": -2.140401840209961, + "step": 14210 + }, + { + "epoch": 2.4500344589937972, + "grad_norm": 28.349445343017578, + "learning_rate": 1.9787237315365424e-08, + "logits/chosen": -1.7196086645126343, + "logits/rejected": -1.6710901260375977, + "logps/chosen": -197.17166137695312, + "logps/rejected": -271.1381530761719, + "loss": 0.509, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3899104595184326, + "rewards/margins": 0.7698068618774414, + "rewards/rejected": -2.159717321395874, + "step": 14220 + }, + { + "epoch": 2.451757408683666, + "grad_norm": 27.328739166259766, + "learning_rate": 1.9667678731191373e-08, + "logits/chosen": -1.555780053138733, + "logits/rejected": -1.4922558069229126, + "logps/chosen": -192.60348510742188, + "logps/rejected": -280.90203857421875, + "loss": 0.4816, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3687331676483154, + "rewards/margins": 0.9207097887992859, + "rewards/rejected": -2.289443016052246, + "step": 14230 + }, + { + "epoch": 2.4534803583735356, + "grad_norm": 28.51763343811035, + "learning_rate": 1.9548443035031125e-08, + "logits/chosen": -1.5582568645477295, + "logits/rejected": -1.5182602405548096, + "logps/chosen": -197.07421875, + "logps/rejected": -287.04376220703125, + "loss": 0.5135, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4305438995361328, + "rewards/margins": 0.8921090364456177, + "rewards/rejected": -2.322652816772461, + "step": 14240 + }, + { + "epoch": 2.4552033080634046, + "grad_norm": 34.41429901123047, + "learning_rate": 1.942953070614094e-08, + "logits/chosen": -1.557747483253479, + "logits/rejected": -1.5177257061004639, + "logps/chosen": -198.07131958007812, + "logps/rejected": -260.2146911621094, + "loss": 0.539, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.43120539188385, + "rewards/margins": 0.6406923532485962, + "rewards/rejected": -2.0718979835510254, + "step": 14250 + }, + { + "epoch": 2.4569262577532736, + "grad_norm": 48.78285217285156, + "learning_rate": 1.93109422224775e-08, + "logits/chosen": -1.6656968593597412, + "logits/rejected": -1.6043860912322998, + "logps/chosen": -202.08387756347656, + "logps/rejected": -270.35601806640625, + "loss": 0.542, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4505724906921387, + "rewards/margins": 0.7518132925033569, + "rewards/rejected": -2.202385902404785, + "step": 14260 + }, + { + "epoch": 2.4586492074431425, + "grad_norm": 28.05662727355957, + "learning_rate": 1.9192678060695812e-08, + "logits/chosen": -1.6255466938018799, + "logits/rejected": -1.5749638080596924, + "logps/chosen": -191.5539093017578, + "logps/rejected": -275.77142333984375, + "loss": 0.4911, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3733799457550049, + "rewards/margins": 0.8732690811157227, + "rewards/rejected": -2.2466492652893066, + "step": 14270 + }, + { + "epoch": 2.460372157133012, + "grad_norm": 26.489620208740234, + "learning_rate": 1.9074738696147196e-08, + "logits/chosen": -1.522174596786499, + "logits/rejected": -1.4888569116592407, + "logps/chosen": -200.908203125, + "logps/rejected": -266.4534606933594, + "loss": 0.5756, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4906336069107056, + "rewards/margins": 0.6441228985786438, + "rewards/rejected": -2.134756565093994, + "step": 14280 + }, + { + "epoch": 2.462095106822881, + "grad_norm": 22.64809799194336, + "learning_rate": 1.8957124602877618e-08, + "logits/chosen": -1.5703630447387695, + "logits/rejected": -1.5144102573394775, + "logps/chosen": -200.39471435546875, + "logps/rejected": -273.7787780761719, + "loss": 0.5235, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4013175964355469, + "rewards/margins": 0.8190463185310364, + "rewards/rejected": -2.2203638553619385, + "step": 14290 + }, + { + "epoch": 2.46381805651275, + "grad_norm": 26.21112060546875, + "learning_rate": 1.8839836253625496e-08, + "logits/chosen": -1.7248750925064087, + "logits/rejected": -1.6809425354003906, + "logps/chosen": -180.10690307617188, + "logps/rejected": -273.7902526855469, + "loss": 0.4575, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.2610968351364136, + "rewards/margins": 0.910190761089325, + "rewards/rejected": -2.171287775039673, + "step": 14300 + }, + { + "epoch": 2.465541006202619, + "grad_norm": 26.50727081298828, + "learning_rate": 1.872287411982011e-08, + "logits/chosen": -1.5628302097320557, + "logits/rejected": -1.5138075351715088, + "logps/chosen": -195.84312438964844, + "logps/rejected": -272.01519775390625, + "loss": 0.5121, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4113115072250366, + "rewards/margins": 0.7861210107803345, + "rewards/rejected": -2.197432518005371, + "step": 14310 + }, + { + "epoch": 2.467263955892488, + "grad_norm": 22.044160842895508, + "learning_rate": 1.860623867157941e-08, + "logits/chosen": -1.605048418045044, + "logits/rejected": -1.5548312664031982, + "logps/chosen": -176.4993896484375, + "logps/rejected": -260.81231689453125, + "loss": 0.4771, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.209468126296997, + "rewards/margins": 0.8460773229598999, + "rewards/rejected": -2.0555453300476074, + "step": 14320 + }, + { + "epoch": 2.468986905582357, + "grad_norm": 28.703174591064453, + "learning_rate": 1.8489930377708372e-08, + "logits/chosen": -1.7636277675628662, + "logits/rejected": -1.6995710134506226, + "logps/chosen": -198.1187744140625, + "logps/rejected": -292.19293212890625, + "loss": 0.466, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.4311959743499756, + "rewards/margins": 0.9489741325378418, + "rewards/rejected": -2.3801698684692383, + "step": 14330 + }, + { + "epoch": 2.470709855272226, + "grad_norm": 27.461868286132812, + "learning_rate": 1.8373949705696934e-08, + "logits/chosen": -1.606766700744629, + "logits/rejected": -1.5721355676651, + "logps/chosen": -199.48684692382812, + "logps/rejected": -288.28839111328125, + "loss": 0.5039, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4460034370422363, + "rewards/margins": 0.8741511106491089, + "rewards/rejected": -2.3201546669006348, + "step": 14340 + }, + { + "epoch": 2.472432804962095, + "grad_norm": 29.6517276763916, + "learning_rate": 1.8258297121718204e-08, + "logits/chosen": -1.6252644062042236, + "logits/rejected": -1.5800553560256958, + "logps/chosen": -200.2357940673828, + "logps/rejected": -272.8708190917969, + "loss": 0.5294, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4642345905303955, + "rewards/margins": 0.7204502820968628, + "rewards/rejected": -2.1846847534179688, + "step": 14350 + }, + { + "epoch": 2.474155754651964, + "grad_norm": 40.31829833984375, + "learning_rate": 1.81429730906266e-08, + "logits/chosen": -1.6263774633407593, + "logits/rejected": -1.5871617794036865, + "logps/chosen": -203.97201538085938, + "logps/rejected": -262.5541076660156, + "loss": 0.5591, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4734426736831665, + "rewards/margins": 0.6249311566352844, + "rewards/rejected": -2.0983736515045166, + "step": 14360 + }, + { + "epoch": 2.475878704341833, + "grad_norm": 34.94589614868164, + "learning_rate": 1.8027978075955953e-08, + "logits/chosen": -1.614682912826538, + "logits/rejected": -1.5609450340270996, + "logps/chosen": -202.69973754882812, + "logps/rejected": -275.20806884765625, + "loss": 0.5116, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4726990461349487, + "rewards/margins": 0.7752552628517151, + "rewards/rejected": -2.2479541301727295, + "step": 14370 + }, + { + "epoch": 2.4776016540317025, + "grad_norm": 29.386205673217773, + "learning_rate": 1.7913312539917624e-08, + "logits/chosen": -1.7429168224334717, + "logits/rejected": -1.6956888437271118, + "logps/chosen": -190.35641479492188, + "logps/rejected": -274.8843994140625, + "loss": 0.4653, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.383814811706543, + "rewards/margins": 0.8452625274658203, + "rewards/rejected": -2.2290773391723633, + "step": 14380 + }, + { + "epoch": 2.4793246037215715, + "grad_norm": 28.56850814819336, + "learning_rate": 1.7798976943398623e-08, + "logits/chosen": -1.6107604503631592, + "logits/rejected": -1.5593178272247314, + "logps/chosen": -189.3840789794922, + "logps/rejected": -281.2845458984375, + "loss": 0.4584, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3737777471542358, + "rewards/margins": 0.9282294511795044, + "rewards/rejected": -2.3020071983337402, + "step": 14390 + }, + { + "epoch": 2.4810475534114405, + "grad_norm": 37.50496292114258, + "learning_rate": 1.7684971745959887e-08, + "logits/chosen": -1.651484489440918, + "logits/rejected": -1.596335768699646, + "logps/chosen": -200.45687866210938, + "logps/rejected": -281.0902099609375, + "loss": 0.5121, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4362685680389404, + "rewards/margins": 0.8774330019950867, + "rewards/rejected": -2.313701629638672, + "step": 14400 + }, + { + "epoch": 2.4810475534114405, + "eval_logits/chosen": -1.7252347469329834, + "eval_logits/rejected": -1.7016128301620483, + "eval_logps/chosen": -200.76998901367188, + "eval_logps/rejected": -240.53890991210938, + "eval_loss": 0.6308444738388062, + "eval_rewards/accuracies": 0.6447490453720093, + "eval_rewards/chosen": -1.4205809831619263, + "eval_rewards/margins": 0.3530069589614868, + "eval_rewards/rejected": -1.7735878229141235, + "eval_runtime": 384.6003, + "eval_samples_per_second": 11.191, + "eval_steps_per_second": 1.399, + "step": 14400 + }, + { + "epoch": 2.4827705031013094, + "grad_norm": 36.72150421142578, + "learning_rate": 1.7571297405834328e-08, + "logits/chosen": -1.6439129114151, + "logits/rejected": -1.5948227643966675, + "logps/chosen": -195.40127563476562, + "logps/rejected": -278.58148193359375, + "loss": 0.5121, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4424898624420166, + "rewards/margins": 0.8148199915885925, + "rewards/rejected": -2.257310152053833, + "step": 14410 + }, + { + "epoch": 2.4844934527911784, + "grad_norm": 23.456363677978516, + "learning_rate": 1.7457954379924967e-08, + "logits/chosen": -1.6720870733261108, + "logits/rejected": -1.6245301961898804, + "logps/chosen": -199.8477020263672, + "logps/rejected": -280.53765869140625, + "loss": 0.5153, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4815994501113892, + "rewards/margins": 0.8005410432815552, + "rewards/rejected": -2.2821404933929443, + "step": 14420 + }, + { + "epoch": 2.4862164024810474, + "grad_norm": 53.96633529663086, + "learning_rate": 1.7344943123803126e-08, + "logits/chosen": -1.6086504459381104, + "logits/rejected": -1.5626440048217773, + "logps/chosen": -196.92874145507812, + "logps/rejected": -284.5268249511719, + "loss": 0.5059, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.446078896522522, + "rewards/margins": 0.8915165066719055, + "rewards/rejected": -2.3375954627990723, + "step": 14430 + }, + { + "epoch": 2.4879393521709168, + "grad_norm": 35.11296844482422, + "learning_rate": 1.7232264091706682e-08, + "logits/chosen": -1.5991556644439697, + "logits/rejected": -1.5401017665863037, + "logps/chosen": -193.55262756347656, + "logps/rejected": -288.7968444824219, + "loss": 0.4585, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3911709785461426, + "rewards/margins": 0.9518829584121704, + "rewards/rejected": -2.3430540561676025, + "step": 14440 + }, + { + "epoch": 2.4896623018607857, + "grad_norm": 27.75967025756836, + "learning_rate": 1.7119917736538115e-08, + "logits/chosen": -1.5744739770889282, + "logits/rejected": -1.5309337377548218, + "logps/chosen": -212.39810180664062, + "logps/rejected": -281.50128173828125, + "loss": 0.5227, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5675699710845947, + "rewards/margins": 0.7253061532974243, + "rewards/rejected": -2.2928760051727295, + "step": 14450 + }, + { + "epoch": 2.4913852515506547, + "grad_norm": 26.898860931396484, + "learning_rate": 1.700790450986276e-08, + "logits/chosen": -1.5727896690368652, + "logits/rejected": -1.531781554222107, + "logps/chosen": -201.531005859375, + "logps/rejected": -279.4584045410156, + "loss": 0.5034, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4766613245010376, + "rewards/margins": 0.7838097810745239, + "rewards/rejected": -2.2604711055755615, + "step": 14460 + }, + { + "epoch": 2.4931082012405237, + "grad_norm": 33.584259033203125, + "learning_rate": 1.6896224861907004e-08, + "logits/chosen": -1.7253456115722656, + "logits/rejected": -1.6681970357894897, + "logps/chosen": -214.53030395507812, + "logps/rejected": -292.70928955078125, + "loss": 0.4774, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5555566549301147, + "rewards/margins": 0.8685323596000671, + "rewards/rejected": -2.424088716506958, + "step": 14470 + }, + { + "epoch": 2.4948311509303926, + "grad_norm": 28.306344985961914, + "learning_rate": 1.6784879241556395e-08, + "logits/chosen": -1.617222785949707, + "logits/rejected": -1.5918428897857666, + "logps/chosen": -213.22500610351562, + "logps/rejected": -294.64129638671875, + "loss": 0.5175, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5985944271087646, + "rewards/margins": 0.7967004776000977, + "rewards/rejected": -2.3952949047088623, + "step": 14480 + }, + { + "epoch": 2.496554100620262, + "grad_norm": 27.062841415405273, + "learning_rate": 1.667386809635387e-08, + "logits/chosen": -1.5785030126571655, + "logits/rejected": -1.539458990097046, + "logps/chosen": -203.4307861328125, + "logps/rejected": -288.9940490722656, + "loss": 0.4953, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.52878737449646, + "rewards/margins": 0.8457359075546265, + "rewards/rejected": -2.374523401260376, + "step": 14490 + }, + { + "epoch": 2.498277050310131, + "grad_norm": 20.653133392333984, + "learning_rate": 1.6563191872498062e-08, + "logits/chosen": -1.5909979343414307, + "logits/rejected": -1.5225660800933838, + "logps/chosen": -201.03378295898438, + "logps/rejected": -291.93817138671875, + "loss": 0.4718, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4409816265106201, + "rewards/margins": 0.9384520649909973, + "rewards/rejected": -2.3794338703155518, + "step": 14500 + }, + { + "epoch": 2.5, + "grad_norm": 40.51236343383789, + "learning_rate": 1.6452851014841374e-08, + "logits/chosen": -1.6336681842803955, + "logits/rejected": -1.5917491912841797, + "logps/chosen": -217.73507690429688, + "logps/rejected": -279.9249572753906, + "loss": 0.5713, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6216713190078735, + "rewards/margins": 0.6607409119606018, + "rewards/rejected": -2.28241229057312, + "step": 14510 + }, + { + "epoch": 2.501722949689869, + "grad_norm": 35.526451110839844, + "learning_rate": 1.634284596688823e-08, + "logits/chosen": -1.609086036682129, + "logits/rejected": -1.5602099895477295, + "logps/chosen": -214.7878875732422, + "logps/rejected": -291.4499816894531, + "loss": 0.5584, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5922327041625977, + "rewards/margins": 0.7702957987785339, + "rewards/rejected": -2.3625285625457764, + "step": 14520 + }, + { + "epoch": 2.503445899379738, + "grad_norm": 33.11463928222656, + "learning_rate": 1.623317717079328e-08, + "logits/chosen": -1.6266065835952759, + "logits/rejected": -1.5786244869232178, + "logps/chosen": -216.61978149414062, + "logps/rejected": -298.9439697265625, + "loss": 0.5044, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5895724296569824, + "rewards/margins": 0.8475859761238098, + "rewards/rejected": -2.4371583461761475, + "step": 14530 + }, + { + "epoch": 2.505168849069607, + "grad_norm": 27.04874610900879, + "learning_rate": 1.6123845067359676e-08, + "logits/chosen": -1.606018304824829, + "logits/rejected": -1.5490694046020508, + "logps/chosen": -201.17922973632812, + "logps/rejected": -293.52813720703125, + "loss": 0.4742, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4720717668533325, + "rewards/margins": 0.9397362470626831, + "rewards/rejected": -2.4118080139160156, + "step": 14540 + }, + { + "epoch": 2.5068917987594763, + "grad_norm": 22.950489044189453, + "learning_rate": 1.6014850096037304e-08, + "logits/chosen": -1.5970135927200317, + "logits/rejected": -1.5471218824386597, + "logps/chosen": -195.12013244628906, + "logps/rejected": -274.5163879394531, + "loss": 0.5116, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3837738037109375, + "rewards/margins": 0.829186737537384, + "rewards/rejected": -2.2129604816436768, + "step": 14550 + }, + { + "epoch": 2.5086147484493453, + "grad_norm": 27.70438575744629, + "learning_rate": 1.5906192694920883e-08, + "logits/chosen": -1.592337965965271, + "logits/rejected": -1.5386199951171875, + "logps/chosen": -208.83316040039062, + "logps/rejected": -304.3836975097656, + "loss": 0.4948, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5584619045257568, + "rewards/margins": 0.9380912780761719, + "rewards/rejected": -2.496553421020508, + "step": 14560 + }, + { + "epoch": 2.5103376981392143, + "grad_norm": 33.47995376586914, + "learning_rate": 1.5797873300748355e-08, + "logits/chosen": -1.5123052597045898, + "logits/rejected": -1.4835093021392822, + "logps/chosen": -198.01016235351562, + "logps/rejected": -279.16314697265625, + "loss": 0.5221, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4571415185928345, + "rewards/margins": 0.7968717813491821, + "rewards/rejected": -2.2540130615234375, + "step": 14570 + }, + { + "epoch": 2.5120606478290832, + "grad_norm": 33.63603973388672, + "learning_rate": 1.5689892348899103e-08, + "logits/chosen": -1.6348133087158203, + "logits/rejected": -1.5937086343765259, + "logps/chosen": -198.91029357910156, + "logps/rejected": -273.56414794921875, + "loss": 0.5254, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4940311908721924, + "rewards/margins": 0.7310426831245422, + "rewards/rejected": -2.22507381439209, + "step": 14580 + }, + { + "epoch": 2.5137835975189526, + "grad_norm": 34.1225700378418, + "learning_rate": 1.5582250273392107e-08, + "logits/chosen": -1.577149748802185, + "logits/rejected": -1.5392792224884033, + "logps/chosen": -192.0406951904297, + "logps/rejected": -273.19183349609375, + "loss": 0.5107, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3957537412643433, + "rewards/margins": 0.8084688186645508, + "rewards/rejected": -2.2042224407196045, + "step": 14590 + }, + { + "epoch": 2.5155065472088216, + "grad_norm": 39.473167419433594, + "learning_rate": 1.547494750688435e-08, + "logits/chosen": -1.582103967666626, + "logits/rejected": -1.5162458419799805, + "logps/chosen": -204.1102294921875, + "logps/rejected": -292.2998046875, + "loss": 0.4573, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.493896722793579, + "rewards/margins": 0.9231041073799133, + "rewards/rejected": -2.4170007705688477, + "step": 14600 + }, + { + "epoch": 2.5172294968986906, + "grad_norm": 39.95032501220703, + "learning_rate": 1.5367984480668884e-08, + "logits/chosen": -1.578777551651001, + "logits/rejected": -1.5251179933547974, + "logps/chosen": -201.05909729003906, + "logps/rejected": -271.13177490234375, + "loss": 0.503, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4469878673553467, + "rewards/margins": 0.7682374715805054, + "rewards/rejected": -2.2152252197265625, + "step": 14610 + }, + { + "epoch": 2.5189524465885595, + "grad_norm": 33.64727020263672, + "learning_rate": 1.526136162467333e-08, + "logits/chosen": -1.5359634160995483, + "logits/rejected": -1.5078798532485962, + "logps/chosen": -215.8882598876953, + "logps/rejected": -297.89947509765625, + "loss": 0.5483, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6469218730926514, + "rewards/margins": 0.8006730079650879, + "rewards/rejected": -2.4475948810577393, + "step": 14620 + }, + { + "epoch": 2.5206753962784285, + "grad_norm": 27.750146865844727, + "learning_rate": 1.5155079367457925e-08, + "logits/chosen": -1.5164506435394287, + "logits/rejected": -1.474473237991333, + "logps/chosen": -200.38941955566406, + "logps/rejected": -277.80364990234375, + "loss": 0.5291, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4730513095855713, + "rewards/margins": 0.7772534489631653, + "rewards/rejected": -2.250304698944092, + "step": 14630 + }, + { + "epoch": 2.5223983459682975, + "grad_norm": 21.659421920776367, + "learning_rate": 1.5049138136213968e-08, + "logits/chosen": -1.5548603534698486, + "logits/rejected": -1.51322340965271, + "logps/chosen": -197.6011199951172, + "logps/rejected": -283.56121826171875, + "loss": 0.5249, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4577686786651611, + "rewards/margins": 0.8481302261352539, + "rewards/rejected": -2.305898666381836, + "step": 14640 + }, + { + "epoch": 2.524121295658167, + "grad_norm": 70.42359161376953, + "learning_rate": 1.4943538356762065e-08, + "logits/chosen": -1.6177898645401, + "logits/rejected": -1.5802927017211914, + "logps/chosen": -221.6212921142578, + "logps/rejected": -277.2975158691406, + "loss": 0.6121, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.673858404159546, + "rewards/margins": 0.602361798286438, + "rewards/rejected": -2.2762205600738525, + "step": 14650 + }, + { + "epoch": 2.525844245348036, + "grad_norm": 38.56734848022461, + "learning_rate": 1.4838280453550234e-08, + "logits/chosen": -1.59617018699646, + "logits/rejected": -1.5251657962799072, + "logps/chosen": -195.82003784179688, + "logps/rejected": -288.07806396484375, + "loss": 0.4355, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.428985357284546, + "rewards/margins": 0.9305770993232727, + "rewards/rejected": -2.359562635421753, + "step": 14660 + }, + { + "epoch": 2.527567195037905, + "grad_norm": 27.20901107788086, + "learning_rate": 1.4733364849652518e-08, + "logits/chosen": -1.523749589920044, + "logits/rejected": -1.4803264141082764, + "logps/chosen": -189.8150634765625, + "logps/rejected": -276.70794677734375, + "loss": 0.4607, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3930755853652954, + "rewards/margins": 0.8602371215820312, + "rewards/rejected": -2.253312826156616, + "step": 14670 + }, + { + "epoch": 2.529290144727774, + "grad_norm": 36.2165412902832, + "learning_rate": 1.4628791966767095e-08, + "logits/chosen": -1.6154565811157227, + "logits/rejected": -1.5678001642227173, + "logps/chosen": -195.2696990966797, + "logps/rejected": -271.886474609375, + "loss": 0.5179, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4701576232910156, + "rewards/margins": 0.7252773642539978, + "rewards/rejected": -2.195435047149658, + "step": 14680 + }, + { + "epoch": 2.531013094417643, + "grad_norm": 42.185359954833984, + "learning_rate": 1.4524562225214532e-08, + "logits/chosen": -1.594913125038147, + "logits/rejected": -1.5524381399154663, + "logps/chosen": -210.17886352539062, + "logps/rejected": -305.15618896484375, + "loss": 0.4961, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5879902839660645, + "rewards/margins": 0.9257308840751648, + "rewards/rejected": -2.513720989227295, + "step": 14690 + }, + { + "epoch": 2.532736044107512, + "grad_norm": 31.354543685913086, + "learning_rate": 1.4420676043936198e-08, + "logits/chosen": -1.6699739694595337, + "logits/rejected": -1.6193714141845703, + "logps/chosen": -223.212890625, + "logps/rejected": -327.3179931640625, + "loss": 0.4993, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6847846508026123, + "rewards/margins": 1.0284368991851807, + "rewards/rejected": -2.713221311569214, + "step": 14700 + }, + { + "epoch": 2.534458993797381, + "grad_norm": 32.18533706665039, + "learning_rate": 1.4317133840492612e-08, + "logits/chosen": -1.6072320938110352, + "logits/rejected": -1.5672547817230225, + "logps/chosen": -199.07467651367188, + "logps/rejected": -282.31182861328125, + "loss": 0.5002, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4560476541519165, + "rewards/margins": 0.8347952961921692, + "rewards/rejected": -2.2908430099487305, + "step": 14710 + }, + { + "epoch": 2.53618194348725, + "grad_norm": 41.4571533203125, + "learning_rate": 1.4213936031061691e-08, + "logits/chosen": -1.5518696308135986, + "logits/rejected": -1.4998043775558472, + "logps/chosen": -218.3801727294922, + "logps/rejected": -297.88433837890625, + "loss": 0.5296, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6070804595947266, + "rewards/margins": 0.8339778184890747, + "rewards/rejected": -2.44105863571167, + "step": 14720 + }, + { + "epoch": 2.537904893177119, + "grad_norm": 36.418697357177734, + "learning_rate": 1.411108303043701e-08, + "logits/chosen": -1.6525733470916748, + "logits/rejected": -1.598491907119751, + "logps/chosen": -200.5663604736328, + "logps/rejected": -287.1551513671875, + "loss": 0.5137, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4741586446762085, + "rewards/margins": 0.8913132548332214, + "rewards/rejected": -2.3654720783233643, + "step": 14730 + }, + { + "epoch": 2.539627842866988, + "grad_norm": 30.349742889404297, + "learning_rate": 1.4008575252026334e-08, + "logits/chosen": -1.5415337085723877, + "logits/rejected": -1.5147979259490967, + "logps/chosen": -219.85714721679688, + "logps/rejected": -309.5752868652344, + "loss": 0.4906, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6472421884536743, + "rewards/margins": 0.897518515586853, + "rewards/rejected": -2.5447609424591064, + "step": 14740 + }, + { + "epoch": 2.5413507925568575, + "grad_norm": 41.62550735473633, + "learning_rate": 1.3906413107849757e-08, + "logits/chosen": -1.5927969217300415, + "logits/rejected": -1.5406432151794434, + "logps/chosen": -207.52273559570312, + "logps/rejected": -286.8734436035156, + "loss": 0.495, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.547933578491211, + "rewards/margins": 0.8301254510879517, + "rewards/rejected": -2.378058910369873, + "step": 14750 + }, + { + "epoch": 2.5430737422467264, + "grad_norm": 28.98420524597168, + "learning_rate": 1.3804597008538177e-08, + "logits/chosen": -1.623771071434021, + "logits/rejected": -1.577034831047058, + "logps/chosen": -202.60971069335938, + "logps/rejected": -285.4917907714844, + "loss": 0.4974, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.497370958328247, + "rewards/margins": 0.8208389282226562, + "rewards/rejected": -2.3182098865509033, + "step": 14760 + }, + { + "epoch": 2.5447966919365954, + "grad_norm": 25.282695770263672, + "learning_rate": 1.3703127363331556e-08, + "logits/chosen": -1.6262471675872803, + "logits/rejected": -1.5848948955535889, + "logps/chosen": -212.24789428710938, + "logps/rejected": -291.30816650390625, + "loss": 0.5426, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5611674785614014, + "rewards/margins": 0.7933932542800903, + "rewards/rejected": -2.354560375213623, + "step": 14770 + }, + { + "epoch": 2.5465196416264644, + "grad_norm": 40.03992462158203, + "learning_rate": 1.3602004580077375e-08, + "logits/chosen": -1.558164119720459, + "logits/rejected": -1.5256003141403198, + "logps/chosen": -205.63125610351562, + "logps/rejected": -281.8812561035156, + "loss": 0.5609, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.563673973083496, + "rewards/margins": 0.7622488141059875, + "rewards/rejected": -2.325922727584839, + "step": 14780 + }, + { + "epoch": 2.548242591316334, + "grad_norm": 27.992399215698242, + "learning_rate": 1.3501229065228892e-08, + "logits/chosen": -1.64004647731781, + "logits/rejected": -1.5912551879882812, + "logps/chosen": -222.78921508789062, + "logps/rejected": -298.9998779296875, + "loss": 0.5625, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6930469274520874, + "rewards/margins": 0.7830427289009094, + "rewards/rejected": -2.4760897159576416, + "step": 14790 + }, + { + "epoch": 2.5499655410062028, + "grad_norm": 43.494083404541016, + "learning_rate": 1.3400801223843539e-08, + "logits/chosen": -1.5810552835464478, + "logits/rejected": -1.5414950847625732, + "logps/chosen": -213.85580444335938, + "logps/rejected": -310.27008056640625, + "loss": 0.4847, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5871661901474, + "rewards/margins": 0.9443961977958679, + "rewards/rejected": -2.531562328338623, + "step": 14800 + }, + { + "epoch": 2.5499655410062028, + "eval_logits/chosen": -1.7152843475341797, + "eval_logits/rejected": -1.6912128925323486, + "eval_logps/chosen": -206.8795623779297, + "eval_logps/rejected": -248.1588592529297, + "eval_loss": 0.6304101347923279, + "eval_rewards/accuracies": 0.6442843675613403, + "eval_rewards/chosen": -1.4816765785217285, + "eval_rewards/margins": 0.3681107759475708, + "eval_rewards/rejected": -1.8497872352600098, + "eval_runtime": 384.7869, + "eval_samples_per_second": 11.185, + "eval_steps_per_second": 1.398, + "step": 14800 + }, + { + "epoch": 2.5516884906960717, + "grad_norm": 24.059415817260742, + "learning_rate": 1.3300721459581355e-08, + "logits/chosen": -1.619269609451294, + "logits/rejected": -1.5576200485229492, + "logps/chosen": -222.851806640625, + "logps/rejected": -297.45196533203125, + "loss": 0.5204, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6409496068954468, + "rewards/margins": 0.8452986478805542, + "rewards/rejected": -2.48624849319458, + "step": 14810 + }, + { + "epoch": 2.5534114403859407, + "grad_norm": 43.30677032470703, + "learning_rate": 1.3200990174703308e-08, + "logits/chosen": -1.784624695777893, + "logits/rejected": -1.7225615978240967, + "logps/chosen": -204.2576904296875, + "logps/rejected": -300.54107666015625, + "loss": 0.4428, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.5080598592758179, + "rewards/margins": 0.9750812649726868, + "rewards/rejected": -2.4831411838531494, + "step": 14820 + }, + { + "epoch": 2.5551343900758097, + "grad_norm": 27.601030349731445, + "learning_rate": 1.3101607770069667e-08, + "logits/chosen": -1.5990216732025146, + "logits/rejected": -1.5478788614273071, + "logps/chosen": -207.07339477539062, + "logps/rejected": -295.07421875, + "loss": 0.5039, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5224343538284302, + "rewards/margins": 0.9161651730537415, + "rewards/rejected": -2.4385993480682373, + "step": 14830 + }, + { + "epoch": 2.5568573397656786, + "grad_norm": 26.436817169189453, + "learning_rate": 1.3002574645138375e-08, + "logits/chosen": -1.6385233402252197, + "logits/rejected": -1.5847870111465454, + "logps/chosen": -213.31503295898438, + "logps/rejected": -308.9871520996094, + "loss": 0.4656, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5726587772369385, + "rewards/margins": 0.952702522277832, + "rewards/rejected": -2.5253615379333496, + "step": 14840 + }, + { + "epoch": 2.558580289455548, + "grad_norm": 22.54994773864746, + "learning_rate": 1.2903891197963568e-08, + "logits/chosen": -1.581339955329895, + "logits/rejected": -1.5261919498443604, + "logps/chosen": -217.6318817138672, + "logps/rejected": -309.2292175292969, + "loss": 0.4894, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.643764853477478, + "rewards/margins": 0.9066557884216309, + "rewards/rejected": -2.5504205226898193, + "step": 14850 + }, + { + "epoch": 2.560303239145417, + "grad_norm": 30.38620948791504, + "learning_rate": 1.2805557825193857e-08, + "logits/chosen": -1.571519374847412, + "logits/rejected": -1.5293896198272705, + "logps/chosen": -199.9291534423828, + "logps/rejected": -292.7502746582031, + "loss": 0.5276, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4561383724212646, + "rewards/margins": 0.9495272636413574, + "rewards/rejected": -2.405665636062622, + "step": 14860 + }, + { + "epoch": 2.562026188835286, + "grad_norm": 26.666837692260742, + "learning_rate": 1.2707574922070708e-08, + "logits/chosen": -1.6525710821151733, + "logits/rejected": -1.6034488677978516, + "logps/chosen": -205.05966186523438, + "logps/rejected": -281.1788330078125, + "loss": 0.5579, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4710729122161865, + "rewards/margins": 0.812432587146759, + "rewards/rejected": -2.283505439758301, + "step": 14870 + }, + { + "epoch": 2.563749138525155, + "grad_norm": 34.56159591674805, + "learning_rate": 1.2609942882426938e-08, + "logits/chosen": -1.5512102842330933, + "logits/rejected": -1.516695499420166, + "logps/chosen": -198.02145385742188, + "logps/rejected": -282.66790771484375, + "loss": 0.5085, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.447977900505066, + "rewards/margins": 0.8416982889175415, + "rewards/rejected": -2.2896761894226074, + "step": 14880 + }, + { + "epoch": 2.5654720882150244, + "grad_norm": 39.69472122192383, + "learning_rate": 1.2512662098685144e-08, + "logits/chosen": -1.5551344156265259, + "logits/rejected": -1.5232659578323364, + "logps/chosen": -211.43896484375, + "logps/rejected": -283.1412048339844, + "loss": 0.5285, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5428087711334229, + "rewards/margins": 0.7138066291809082, + "rewards/rejected": -2.256615400314331, + "step": 14890 + }, + { + "epoch": 2.5671950379048933, + "grad_norm": 25.781578063964844, + "learning_rate": 1.2415732961856006e-08, + "logits/chosen": -1.4778220653533936, + "logits/rejected": -1.4218331575393677, + "logps/chosen": -199.333740234375, + "logps/rejected": -285.76885986328125, + "loss": 0.5085, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4639042615890503, + "rewards/margins": 0.8805838823318481, + "rewards/rejected": -2.3444881439208984, + "step": 14900 + }, + { + "epoch": 2.5689179875947623, + "grad_norm": 28.18239974975586, + "learning_rate": 1.2319155861536867e-08, + "logits/chosen": -1.6057764291763306, + "logits/rejected": -1.5677419900894165, + "logps/chosen": -195.48504638671875, + "logps/rejected": -280.6958923339844, + "loss": 0.4928, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4490611553192139, + "rewards/margins": 0.8212985992431641, + "rewards/rejected": -2.270359754562378, + "step": 14910 + }, + { + "epoch": 2.5706409372846313, + "grad_norm": 18.990493774414062, + "learning_rate": 1.222293118591008e-08, + "logits/chosen": -1.5674563646316528, + "logits/rejected": -1.5234514474868774, + "logps/chosen": -208.8411102294922, + "logps/rejected": -310.14605712890625, + "loss": 0.4895, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.560349702835083, + "rewards/margins": 0.9858556985855103, + "rewards/rejected": -2.5462050437927246, + "step": 14920 + }, + { + "epoch": 2.5723638869745002, + "grad_norm": 36.361183166503906, + "learning_rate": 1.2127059321741417e-08, + "logits/chosen": -1.717429757118225, + "logits/rejected": -1.6656608581542969, + "logps/chosen": -193.1423797607422, + "logps/rejected": -294.0716247558594, + "loss": 0.4604, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4025070667266846, + "rewards/margins": 1.0081617832183838, + "rewards/rejected": -2.4106688499450684, + "step": 14930 + }, + { + "epoch": 2.574086836664369, + "grad_norm": 44.2594108581543, + "learning_rate": 1.203154065437857e-08, + "logits/chosen": -1.6347858905792236, + "logits/rejected": -1.581321120262146, + "logps/chosen": -197.11672973632812, + "logps/rejected": -276.7389221191406, + "loss": 0.5117, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.444643259048462, + "rewards/margins": 0.825234055519104, + "rewards/rejected": -2.2698774337768555, + "step": 14940 + }, + { + "epoch": 2.575809786354238, + "grad_norm": 39.82154846191406, + "learning_rate": 1.1936375567749612e-08, + "logits/chosen": -1.7075817584991455, + "logits/rejected": -1.654722809791565, + "logps/chosen": -212.85348510742188, + "logps/rejected": -278.1226501464844, + "loss": 0.5634, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5699443817138672, + "rewards/margins": 0.7073904275894165, + "rewards/rejected": -2.277334690093994, + "step": 14950 + }, + { + "epoch": 2.5775327360441076, + "grad_norm": 34.315711975097656, + "learning_rate": 1.1841564444361496e-08, + "logits/chosen": -1.5428823232650757, + "logits/rejected": -1.4934934377670288, + "logps/chosen": -212.505859375, + "logps/rejected": -296.00543212890625, + "loss": 0.5128, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5668630599975586, + "rewards/margins": 0.8439539670944214, + "rewards/rejected": -2.4108169078826904, + "step": 14960 + }, + { + "epoch": 2.5792556857339766, + "grad_norm": 37.772743225097656, + "learning_rate": 1.1747107665298273e-08, + "logits/chosen": -1.623822569847107, + "logits/rejected": -1.5672670602798462, + "logps/chosen": -201.41970825195312, + "logps/rejected": -292.71148681640625, + "loss": 0.5005, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4750289916992188, + "rewards/margins": 0.9137970805168152, + "rewards/rejected": -2.3888261318206787, + "step": 14970 + }, + { + "epoch": 2.5809786354238455, + "grad_norm": 32.4010124206543, + "learning_rate": 1.1653005610219913e-08, + "logits/chosen": -1.6380878686904907, + "logits/rejected": -1.56973135471344, + "logps/chosen": -209.4724884033203, + "logps/rejected": -311.81256103515625, + "loss": 0.4614, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5535662174224854, + "rewards/margins": 1.0467643737792969, + "rewards/rejected": -2.6003308296203613, + "step": 14980 + }, + { + "epoch": 2.582701585113715, + "grad_norm": 23.00949478149414, + "learning_rate": 1.155925865736055e-08, + "logits/chosen": -1.6744308471679688, + "logits/rejected": -1.6327850818634033, + "logps/chosen": -198.4084930419922, + "logps/rejected": -300.02435302734375, + "loss": 0.4787, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4701337814331055, + "rewards/margins": 1.0028178691864014, + "rewards/rejected": -2.472951650619507, + "step": 14990 + }, + { + "epoch": 2.584424534803584, + "grad_norm": 44.53309631347656, + "learning_rate": 1.146586718352699e-08, + "logits/chosen": -1.6657359600067139, + "logits/rejected": -1.624629020690918, + "logps/chosen": -212.0545196533203, + "logps/rejected": -299.84539794921875, + "loss": 0.4727, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.559799075126648, + "rewards/margins": 0.9001957774162292, + "rewards/rejected": -2.4599947929382324, + "step": 15000 + }, + { + "epoch": 2.586147484493453, + "grad_norm": 41.07697677612305, + "learning_rate": 1.1372831564097286e-08, + "logits/chosen": -1.6813380718231201, + "logits/rejected": -1.6410796642303467, + "logps/chosen": -212.0048828125, + "logps/rejected": -279.15179443359375, + "loss": 0.6014, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5703686475753784, + "rewards/margins": 0.6888116598129272, + "rewards/rejected": -2.2591805458068848, + "step": 15010 + }, + { + "epoch": 2.587870434183322, + "grad_norm": 35.199371337890625, + "learning_rate": 1.1280152173019075e-08, + "logits/chosen": -1.5767467021942139, + "logits/rejected": -1.5493143796920776, + "logps/chosen": -198.18643188476562, + "logps/rejected": -283.5013427734375, + "loss": 0.5113, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4780842065811157, + "rewards/margins": 0.82579505443573, + "rewards/rejected": -2.3038792610168457, + "step": 15020 + }, + { + "epoch": 2.589593383873191, + "grad_norm": 32.93767166137695, + "learning_rate": 1.118782938280829e-08, + "logits/chosen": -1.5512254238128662, + "logits/rejected": -1.5099573135375977, + "logps/chosen": -210.15042114257812, + "logps/rejected": -285.9954833984375, + "loss": 0.5788, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5646612644195557, + "rewards/margins": 0.7441153526306152, + "rewards/rejected": -2.308776378631592, + "step": 15030 + }, + { + "epoch": 2.59131633356306, + "grad_norm": 38.56374740600586, + "learning_rate": 1.1095863564547436e-08, + "logits/chosen": -1.6254005432128906, + "logits/rejected": -1.5820324420928955, + "logps/chosen": -201.81747436523438, + "logps/rejected": -277.28643798828125, + "loss": 0.519, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4783657789230347, + "rewards/margins": 0.7753399610519409, + "rewards/rejected": -2.2537059783935547, + "step": 15040 + }, + { + "epoch": 2.5930392832529288, + "grad_norm": 28.777212142944336, + "learning_rate": 1.1004255087884273e-08, + "logits/chosen": -1.6261613368988037, + "logits/rejected": -1.5694060325622559, + "logps/chosen": -204.78195190429688, + "logps/rejected": -285.9472961425781, + "loss": 0.4937, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5048892498016357, + "rewards/margins": 0.864482045173645, + "rewards/rejected": -2.369370937347412, + "step": 15050 + }, + { + "epoch": 2.594762232942798, + "grad_norm": 36.000404357910156, + "learning_rate": 1.0913004321030195e-08, + "logits/chosen": -1.6201896667480469, + "logits/rejected": -1.5766347646713257, + "logps/chosen": -193.09442138671875, + "logps/rejected": -274.0902404785156, + "loss": 0.5425, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.388741135597229, + "rewards/margins": 0.808066189289093, + "rewards/rejected": -2.196807384490967, + "step": 15060 + }, + { + "epoch": 2.596485182632667, + "grad_norm": 32.70420455932617, + "learning_rate": 1.0822111630758901e-08, + "logits/chosen": -1.6917825937271118, + "logits/rejected": -1.6308343410491943, + "logps/chosen": -199.3158416748047, + "logps/rejected": -266.4201354980469, + "loss": 0.5367, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4221495389938354, + "rewards/margins": 0.727179229259491, + "rewards/rejected": -2.1493287086486816, + "step": 15070 + }, + { + "epoch": 2.598208132322536, + "grad_norm": 41.88467788696289, + "learning_rate": 1.0731577382404744e-08, + "logits/chosen": -1.6529849767684937, + "logits/rejected": -1.5970804691314697, + "logps/chosen": -196.10508728027344, + "logps/rejected": -292.7115478515625, + "loss": 0.4646, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.4301972389221191, + "rewards/margins": 0.9582144021987915, + "rewards/rejected": -2.388411283493042, + "step": 15080 + }, + { + "epoch": 2.599931082012405, + "grad_norm": 29.691938400268555, + "learning_rate": 1.0641401939861417e-08, + "logits/chosen": -1.6426990032196045, + "logits/rejected": -1.5919253826141357, + "logps/chosen": -199.49057006835938, + "logps/rejected": -275.2064514160156, + "loss": 0.5308, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4478983879089355, + "rewards/margins": 0.7710817456245422, + "rewards/rejected": -2.218980073928833, + "step": 15090 + }, + { + "epoch": 2.6016540317022745, + "grad_norm": 38.88888931274414, + "learning_rate": 1.0551585665580465e-08, + "logits/chosen": -1.5898182392120361, + "logits/rejected": -1.5544463396072388, + "logps/chosen": -201.89639282226562, + "logps/rejected": -270.3229064941406, + "loss": 0.5411, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.509132981300354, + "rewards/margins": 0.6869701743125916, + "rewards/rejected": -2.19610333442688, + "step": 15100 + }, + { + "epoch": 2.6033769813921435, + "grad_norm": 39.20042037963867, + "learning_rate": 1.0462128920569635e-08, + "logits/chosen": -1.6190464496612549, + "logits/rejected": -1.5810520648956299, + "logps/chosen": -207.54110717773438, + "logps/rejected": -276.8843994140625, + "loss": 0.5685, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.545078158378601, + "rewards/margins": 0.7065862417221069, + "rewards/rejected": -2.251664638519287, + "step": 15110 + }, + { + "epoch": 2.6050999310820124, + "grad_norm": 34.681983947753906, + "learning_rate": 1.0373032064391729e-08, + "logits/chosen": -1.6022884845733643, + "logits/rejected": -1.5622366666793823, + "logps/chosen": -213.93154907226562, + "logps/rejected": -290.47772216796875, + "loss": 0.5159, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.5572923421859741, + "rewards/margins": 0.7834555506706238, + "rewards/rejected": -2.340747833251953, + "step": 15120 + }, + { + "epoch": 2.6068228807718814, + "grad_norm": 30.58574867248535, + "learning_rate": 1.0284295455162995e-08, + "logits/chosen": -1.5416325330734253, + "logits/rejected": -1.4799580574035645, + "logps/chosen": -194.72889709472656, + "logps/rejected": -278.31378173828125, + "loss": 0.4966, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4176585674285889, + "rewards/margins": 0.8688579797744751, + "rewards/rejected": -2.2865166664123535, + "step": 15130 + }, + { + "epoch": 2.6085458304617504, + "grad_norm": 23.715627670288086, + "learning_rate": 1.0195919449551637e-08, + "logits/chosen": -1.6180346012115479, + "logits/rejected": -1.56999933719635, + "logps/chosen": -209.47787475585938, + "logps/rejected": -301.15570068359375, + "loss": 0.4746, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5380821228027344, + "rewards/margins": 0.9100778698921204, + "rewards/rejected": -2.44815993309021, + "step": 15140 + }, + { + "epoch": 2.6102687801516193, + "grad_norm": 44.19576644897461, + "learning_rate": 1.0107904402776468e-08, + "logits/chosen": -1.7632591724395752, + "logits/rejected": -1.707419991493225, + "logps/chosen": -197.52490234375, + "logps/rejected": -273.18719482421875, + "loss": 0.5293, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4399224519729614, + "rewards/margins": 0.7631253004074097, + "rewards/rejected": -2.203047752380371, + "step": 15150 + }, + { + "epoch": 2.6119917298414888, + "grad_norm": 18.194103240966797, + "learning_rate": 1.002025066860549e-08, + "logits/chosen": -1.5769050121307373, + "logits/rejected": -1.540554165840149, + "logps/chosen": -198.5594482421875, + "logps/rejected": -292.1970520019531, + "loss": 0.4724, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.442894697189331, + "rewards/margins": 0.927615761756897, + "rewards/rejected": -2.3705103397369385, + "step": 15160 + }, + { + "epoch": 2.6137146795313577, + "grad_norm": 38.291282653808594, + "learning_rate": 9.932958599354457e-09, + "logits/chosen": -1.5755432844161987, + "logits/rejected": -1.5298999547958374, + "logps/chosen": -190.64517211914062, + "logps/rejected": -277.8343200683594, + "loss": 0.5209, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.376599669456482, + "rewards/margins": 0.863747239112854, + "rewards/rejected": -2.240346670150757, + "step": 15170 + }, + { + "epoch": 2.6154376292212267, + "grad_norm": 23.653581619262695, + "learning_rate": 9.846028545885376e-09, + "logits/chosen": -1.6484168767929077, + "logits/rejected": -1.611707091331482, + "logps/chosen": -209.3505401611328, + "logps/rejected": -297.0075988769531, + "loss": 0.5176, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5349688529968262, + "rewards/margins": 0.8758178949356079, + "rewards/rejected": -2.4107866287231445, + "step": 15180 + }, + { + "epoch": 2.6171605789110957, + "grad_norm": 30.477632522583008, + "learning_rate": 9.75946085760524e-09, + "logits/chosen": -1.5608577728271484, + "logits/rejected": -1.5307698249816895, + "logps/chosen": -197.48294067382812, + "logps/rejected": -272.7490234375, + "loss": 0.5087, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4481613636016846, + "rewards/margins": 0.755672037601471, + "rewards/rejected": -2.2038331031799316, + "step": 15190 + }, + { + "epoch": 2.618883528600965, + "grad_norm": 18.196041107177734, + "learning_rate": 9.673255882464504e-09, + "logits/chosen": -1.6353098154067993, + "logits/rejected": -1.5825080871582031, + "logps/chosen": -202.96937561035156, + "logps/rejected": -289.22760009765625, + "loss": 0.4701, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4741370677947998, + "rewards/margins": 0.8842340707778931, + "rewards/rejected": -2.3583710193634033, + "step": 15200 + }, + { + "epoch": 2.618883528600965, + "eval_logits/chosen": -1.732445240020752, + "eval_logits/rejected": -1.7090314626693726, + "eval_logps/chosen": -200.1665496826172, + "eval_logps/rejected": -239.77322387695312, + "eval_loss": 0.6306227445602417, + "eval_rewards/accuracies": 0.6445167064666748, + "eval_rewards/chosen": -1.4145464897155762, + "eval_rewards/margins": 0.3513844311237335, + "eval_rewards/rejected": -1.7659310102462769, + "eval_runtime": 384.213, + "eval_samples_per_second": 11.202, + "eval_steps_per_second": 1.4, + "step": 15200 + }, + { + "epoch": 2.620606478290834, + "grad_norm": 34.808109283447266, + "learning_rate": 9.587413966955737e-09, + "logits/chosen": -1.5300031900405884, + "logits/rejected": -1.4728825092315674, + "logps/chosen": -209.06027221679688, + "logps/rejected": -287.71697998046875, + "loss": 0.5237, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.533287763595581, + "rewards/margins": 0.8262912034988403, + "rewards/rejected": -2.359578847885132, + "step": 15210 + }, + { + "epoch": 2.622329427980703, + "grad_norm": 39.4126091003418, + "learning_rate": 9.501935456112254e-09, + "logits/chosen": -1.5830267667770386, + "logits/rejected": -1.5235631465911865, + "logps/chosen": -188.13392639160156, + "logps/rejected": -269.7197570800781, + "loss": 0.4734, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3591792583465576, + "rewards/margins": 0.8418970108032227, + "rewards/rejected": -2.2010762691497803, + "step": 15220 + }, + { + "epoch": 2.624052377670572, + "grad_norm": 22.56211280822754, + "learning_rate": 9.416820693506677e-09, + "logits/chosen": -1.5887609720230103, + "logits/rejected": -1.5454912185668945, + "logps/chosen": -204.38143920898438, + "logps/rejected": -287.50439453125, + "loss": 0.5086, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.494003176689148, + "rewards/margins": 0.8419147729873657, + "rewards/rejected": -2.3359179496765137, + "step": 15230 + }, + { + "epoch": 2.625775327360441, + "grad_norm": 29.985095977783203, + "learning_rate": 9.332070021249595e-09, + "logits/chosen": -1.578778862953186, + "logits/rejected": -1.524937629699707, + "logps/chosen": -205.76651000976562, + "logps/rejected": -284.15264892578125, + "loss": 0.4948, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.467347502708435, + "rewards/margins": 0.8353103399276733, + "rewards/rejected": -2.3026576042175293, + "step": 15240 + }, + { + "epoch": 2.62749827705031, + "grad_norm": 37.65681076049805, + "learning_rate": 9.247683779988113e-09, + "logits/chosen": -1.5963375568389893, + "logits/rejected": -1.5480496883392334, + "logps/chosen": -192.90602111816406, + "logps/rejected": -280.87103271484375, + "loss": 0.5067, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4011046886444092, + "rewards/margins": 0.8682317733764648, + "rewards/rejected": -2.269336223602295, + "step": 15250 + }, + { + "epoch": 2.6292212267401793, + "grad_norm": 32.64960479736328, + "learning_rate": 9.163662308904608e-09, + "logits/chosen": -1.5931379795074463, + "logits/rejected": -1.5530445575714111, + "logps/chosen": -202.58602905273438, + "logps/rejected": -269.53875732421875, + "loss": 0.5357, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4842182397842407, + "rewards/margins": 0.7100173830986023, + "rewards/rejected": -2.194235324859619, + "step": 15260 + }, + { + "epoch": 2.6309441764300483, + "grad_norm": 24.36440086364746, + "learning_rate": 9.080005945715307e-09, + "logits/chosen": -1.6525249481201172, + "logits/rejected": -1.5804433822631836, + "logps/chosen": -206.2389678955078, + "logps/rejected": -297.2176208496094, + "loss": 0.4859, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.512003779411316, + "rewards/margins": 0.9533550143241882, + "rewards/rejected": -2.4653584957122803, + "step": 15270 + }, + { + "epoch": 2.6326671261199173, + "grad_norm": 28.877296447753906, + "learning_rate": 8.996715026668867e-09, + "logits/chosen": -1.7158002853393555, + "logits/rejected": -1.671775460243225, + "logps/chosen": -196.52822875976562, + "logps/rejected": -288.4090576171875, + "loss": 0.4447, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4435256719589233, + "rewards/margins": 0.8979686498641968, + "rewards/rejected": -2.34149432182312, + "step": 15280 + }, + { + "epoch": 2.6343900758097862, + "grad_norm": 24.51552963256836, + "learning_rate": 8.913789886545064e-09, + "logits/chosen": -1.6201568841934204, + "logits/rejected": -1.5507919788360596, + "logps/chosen": -204.2157440185547, + "logps/rejected": -295.75091552734375, + "loss": 0.4992, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.494683027267456, + "rewards/margins": 0.9509680867195129, + "rewards/rejected": -2.445650815963745, + "step": 15290 + }, + { + "epoch": 2.6361130254996556, + "grad_norm": 22.843257904052734, + "learning_rate": 8.831230858653538e-09, + "logits/chosen": -1.5143969058990479, + "logits/rejected": -1.4569345712661743, + "logps/chosen": -199.69515991210938, + "logps/rejected": -293.4846496582031, + "loss": 0.508, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.477146029472351, + "rewards/margins": 0.9601956605911255, + "rewards/rejected": -2.4373416900634766, + "step": 15300 + }, + { + "epoch": 2.6378359751895246, + "grad_norm": 26.822872161865234, + "learning_rate": 8.749038274832343e-09, + "logits/chosen": -1.6880228519439697, + "logits/rejected": -1.6334871053695679, + "logps/chosen": -199.8614501953125, + "logps/rejected": -290.93609619140625, + "loss": 0.4692, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.4571337699890137, + "rewards/margins": 0.9172709584236145, + "rewards/rejected": -2.3744044303894043, + "step": 15310 + }, + { + "epoch": 2.6395589248793936, + "grad_norm": 40.18165588378906, + "learning_rate": 8.667212465446617e-09, + "logits/chosen": -1.5984302759170532, + "logits/rejected": -1.5583341121673584, + "logps/chosen": -202.61941528320312, + "logps/rejected": -285.7087707519531, + "loss": 0.5183, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5141230821609497, + "rewards/margins": 0.8147637248039246, + "rewards/rejected": -2.3288865089416504, + "step": 15320 + }, + { + "epoch": 2.6412818745692626, + "grad_norm": 39.94595718383789, + "learning_rate": 8.585753759387292e-09, + "logits/chosen": -1.600642442703247, + "logits/rejected": -1.5455397367477417, + "logps/chosen": -206.8169403076172, + "logps/rejected": -294.69873046875, + "loss": 0.4851, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5080393552780151, + "rewards/margins": 0.8920080065727234, + "rewards/rejected": -2.4000473022460938, + "step": 15330 + }, + { + "epoch": 2.6430048242591315, + "grad_norm": 28.69721031188965, + "learning_rate": 8.504662484069824e-09, + "logits/chosen": -1.5988765954971313, + "logits/rejected": -1.5588778257369995, + "logps/chosen": -204.5358428955078, + "logps/rejected": -289.7530822753906, + "loss": 0.5067, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.49817955493927, + "rewards/margins": 0.8466202616691589, + "rewards/rejected": -2.344799757003784, + "step": 15340 + }, + { + "epoch": 2.6447277739490005, + "grad_norm": 40.40675354003906, + "learning_rate": 8.423938965432708e-09, + "logits/chosen": -1.4938302040100098, + "logits/rejected": -1.4552674293518066, + "logps/chosen": -207.461669921875, + "logps/rejected": -299.3456115722656, + "loss": 0.4925, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5299715995788574, + "rewards/margins": 0.9237580299377441, + "rewards/rejected": -2.4537301063537598, + "step": 15350 + }, + { + "epoch": 2.64645072363887, + "grad_norm": 30.506824493408203, + "learning_rate": 8.343583527936382e-09, + "logits/chosen": -1.6167596578598022, + "logits/rejected": -1.5800806283950806, + "logps/chosen": -202.3173065185547, + "logps/rejected": -288.4965515136719, + "loss": 0.5202, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5074043273925781, + "rewards/margins": 0.8156595230102539, + "rewards/rejected": -2.323063850402832, + "step": 15360 + }, + { + "epoch": 2.648173673328739, + "grad_norm": 36.25282287597656, + "learning_rate": 8.263596494561765e-09, + "logits/chosen": -1.646561861038208, + "logits/rejected": -1.5939476490020752, + "logps/chosen": -211.3754119873047, + "logps/rejected": -285.7420654296875, + "loss": 0.528, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5526502132415771, + "rewards/margins": 0.7893895506858826, + "rewards/rejected": -2.3420395851135254, + "step": 15370 + }, + { + "epoch": 2.649896623018608, + "grad_norm": 31.11897850036621, + "learning_rate": 8.183978186809026e-09, + "logits/chosen": -1.6422828435897827, + "logits/rejected": -1.59429132938385, + "logps/chosen": -207.92440795898438, + "logps/rejected": -287.43621826171875, + "loss": 0.5049, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5371867418289185, + "rewards/margins": 0.8130367398262024, + "rewards/rejected": -2.3502230644226074, + "step": 15380 + }, + { + "epoch": 2.651619572708477, + "grad_norm": 23.215652465820312, + "learning_rate": 8.104728924696237e-09, + "logits/chosen": -1.6959645748138428, + "logits/rejected": -1.651354432106018, + "logps/chosen": -203.30502319335938, + "logps/rejected": -292.2448425292969, + "loss": 0.5014, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4707846641540527, + "rewards/margins": 0.8943023681640625, + "rewards/rejected": -2.3650870323181152, + "step": 15390 + }, + { + "epoch": 2.6533425223983462, + "grad_norm": 22.313430786132812, + "learning_rate": 8.02584902675818e-09, + "logits/chosen": -1.638462781906128, + "logits/rejected": -1.5840169191360474, + "logps/chosen": -217.51998901367188, + "logps/rejected": -277.68402099609375, + "loss": 0.5732, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5877636671066284, + "rewards/margins": 0.6705672740936279, + "rewards/rejected": -2.258330821990967, + "step": 15400 + }, + { + "epoch": 2.655065472088215, + "grad_norm": 25.50736427307129, + "learning_rate": 7.947338810045035e-09, + "logits/chosen": -1.6300718784332275, + "logits/rejected": -1.5662131309509277, + "logps/chosen": -216.51229858398438, + "logps/rejected": -276.90521240234375, + "loss": 0.5589, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5693070888519287, + "rewards/margins": 0.6847835779190063, + "rewards/rejected": -2.2540907859802246, + "step": 15410 + }, + { + "epoch": 2.656788421778084, + "grad_norm": 41.6089973449707, + "learning_rate": 7.869198590120962e-09, + "logits/chosen": -1.5842618942260742, + "logits/rejected": -1.5435030460357666, + "logps/chosen": -202.87576293945312, + "logps/rejected": -299.6231384277344, + "loss": 0.4889, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.504333257675171, + "rewards/margins": 0.955193042755127, + "rewards/rejected": -2.459526538848877, + "step": 15420 + }, + { + "epoch": 2.658511371467953, + "grad_norm": 21.34117889404297, + "learning_rate": 7.791428681063084e-09, + "logits/chosen": -1.738438367843628, + "logits/rejected": -1.6839052438735962, + "logps/chosen": -210.18295288085938, + "logps/rejected": -294.5263977050781, + "loss": 0.4797, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5176045894622803, + "rewards/margins": 0.8913189172744751, + "rewards/rejected": -2.408923625946045, + "step": 15430 + }, + { + "epoch": 2.660234321157822, + "grad_norm": 29.64019203186035, + "learning_rate": 7.714029395460054e-09, + "logits/chosen": -1.7610969543457031, + "logits/rejected": -1.7188001871109009, + "logps/chosen": -202.62686157226562, + "logps/rejected": -272.075927734375, + "loss": 0.5267, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4838390350341797, + "rewards/margins": 0.7032070755958557, + "rewards/rejected": -2.1870460510253906, + "step": 15440 + }, + { + "epoch": 2.661957270847691, + "grad_norm": 30.35210609436035, + "learning_rate": 7.637001044410784e-09, + "logits/chosen": -1.4572802782058716, + "logits/rejected": -1.4177887439727783, + "logps/chosen": -201.9217529296875, + "logps/rejected": -273.5601806640625, + "loss": 0.5322, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4936710596084595, + "rewards/margins": 0.7113657593727112, + "rewards/rejected": -2.2050366401672363, + "step": 15450 + }, + { + "epoch": 2.66368022053756, + "grad_norm": 32.20455551147461, + "learning_rate": 7.560343937523361e-09, + "logits/chosen": -1.6862075328826904, + "logits/rejected": -1.6477339267730713, + "logps/chosen": -198.38482666015625, + "logps/rejected": -276.956298828125, + "loss": 0.5051, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3834189176559448, + "rewards/margins": 0.8093355298042297, + "rewards/rejected": -2.1927542686462402, + "step": 15460 + }, + { + "epoch": 2.6654031702274295, + "grad_norm": 33.88570022583008, + "learning_rate": 7.484058382913583e-09, + "logits/chosen": -1.6792227029800415, + "logits/rejected": -1.6291191577911377, + "logps/chosen": -215.4248046875, + "logps/rejected": -297.33941650390625, + "loss": 0.521, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5585341453552246, + "rewards/margins": 0.8606500625610352, + "rewards/rejected": -2.419184446334839, + "step": 15470 + }, + { + "epoch": 2.6671261199172984, + "grad_norm": 22.154661178588867, + "learning_rate": 7.40814468720391e-09, + "logits/chosen": -1.6908563375473022, + "logits/rejected": -1.6321758031845093, + "logps/chosen": -191.12051391601562, + "logps/rejected": -273.941650390625, + "loss": 0.5031, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3590161800384521, + "rewards/margins": 0.8672925233840942, + "rewards/rejected": -2.226309061050415, + "step": 15480 + }, + { + "epoch": 2.6688490696071674, + "grad_norm": 26.001617431640625, + "learning_rate": 7.332603155522066e-09, + "logits/chosen": -1.644089937210083, + "logits/rejected": -1.6148462295532227, + "logps/chosen": -208.5677947998047, + "logps/rejected": -270.153564453125, + "loss": 0.5564, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5410025119781494, + "rewards/margins": 0.6423856616020203, + "rewards/rejected": -2.1833882331848145, + "step": 15490 + }, + { + "epoch": 2.670572019297037, + "grad_norm": 45.97538757324219, + "learning_rate": 7.257434091500014e-09, + "logits/chosen": -1.621949553489685, + "logits/rejected": -1.594242811203003, + "logps/chosen": -222.2292938232422, + "logps/rejected": -283.860107421875, + "loss": 0.5922, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6816177368164062, + "rewards/margins": 0.6131669282913208, + "rewards/rejected": -2.2947845458984375, + "step": 15500 + }, + { + "epoch": 2.6722949689869058, + "grad_norm": 30.593849182128906, + "learning_rate": 7.182637797272506e-09, + "logits/chosen": -1.5719969272613525, + "logits/rejected": -1.5192309617996216, + "logps/chosen": -206.0696258544922, + "logps/rejected": -281.87786865234375, + "loss": 0.5382, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.477163314819336, + "rewards/margins": 0.8039218187332153, + "rewards/rejected": -2.2810850143432617, + "step": 15510 + }, + { + "epoch": 2.6740179186767747, + "grad_norm": 32.63137435913086, + "learning_rate": 7.108214573476035e-09, + "logits/chosen": -1.500911831855774, + "logits/rejected": -1.4573032855987549, + "logps/chosen": -202.20135498046875, + "logps/rejected": -273.4117126464844, + "loss": 0.5347, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4793106317520142, + "rewards/margins": 0.7624852657318115, + "rewards/rejected": -2.2417960166931152, + "step": 15520 + }, + { + "epoch": 2.6757408683666437, + "grad_norm": 25.706695556640625, + "learning_rate": 7.0341647192475704e-09, + "logits/chosen": -1.5359480381011963, + "logits/rejected": -1.4948341846466064, + "logps/chosen": -190.4417266845703, + "logps/rejected": -271.4991455078125, + "loss": 0.494, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3646535873413086, + "rewards/margins": 0.8184051513671875, + "rewards/rejected": -2.183058261871338, + "step": 15530 + }, + { + "epoch": 2.6774638180565127, + "grad_norm": 31.82769775390625, + "learning_rate": 6.960488532223374e-09, + "logits/chosen": -1.5960495471954346, + "logits/rejected": -1.548031210899353, + "logps/chosen": -205.06552124023438, + "logps/rejected": -281.83062744140625, + "loss": 0.5418, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.527539849281311, + "rewards/margins": 0.7699499726295471, + "rewards/rejected": -2.297489643096924, + "step": 15540 + }, + { + "epoch": 2.6791867677463816, + "grad_norm": 38.12778091430664, + "learning_rate": 6.887186308537763e-09, + "logits/chosen": -1.6880691051483154, + "logits/rejected": -1.6379327774047852, + "logps/chosen": -213.2513885498047, + "logps/rejected": -289.7648010253906, + "loss": 0.5181, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.579298734664917, + "rewards/margins": 0.7917166948318481, + "rewards/rejected": -2.3710153102874756, + "step": 15550 + }, + { + "epoch": 2.6809097174362506, + "grad_norm": 32.01270294189453, + "learning_rate": 6.814258342821932e-09, + "logits/chosen": -1.6256210803985596, + "logits/rejected": -1.5979753732681274, + "logps/chosen": -202.05966186523438, + "logps/rejected": -276.04132080078125, + "loss": 0.5253, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5108765363693237, + "rewards/margins": 0.7135692834854126, + "rewards/rejected": -2.2244458198547363, + "step": 15560 + }, + { + "epoch": 2.68263266712612, + "grad_norm": 29.39885711669922, + "learning_rate": 6.741704928202807e-09, + "logits/chosen": -1.6568377017974854, + "logits/rejected": -1.6141636371612549, + "logps/chosen": -205.1513214111328, + "logps/rejected": -288.5816955566406, + "loss": 0.5194, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5274347066879272, + "rewards/margins": 0.8483465313911438, + "rewards/rejected": -2.3757810592651367, + "step": 15570 + }, + { + "epoch": 2.684355616815989, + "grad_norm": 30.44341278076172, + "learning_rate": 6.669526356301869e-09, + "logits/chosen": -1.6794891357421875, + "logits/rejected": -1.6423813104629517, + "logps/chosen": -204.3780975341797, + "logps/rejected": -287.107421875, + "loss": 0.5083, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4868040084838867, + "rewards/margins": 0.8067952394485474, + "rewards/rejected": -2.2935991287231445, + "step": 15580 + }, + { + "epoch": 2.686078566505858, + "grad_norm": 40.704795837402344, + "learning_rate": 6.597722917233894e-09, + "logits/chosen": -1.6327779293060303, + "logits/rejected": -1.5902550220489502, + "logps/chosen": -189.83932495117188, + "logps/rejected": -263.6131286621094, + "loss": 0.5163, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3530352115631104, + "rewards/margins": 0.7421896457672119, + "rewards/rejected": -2.0952248573303223, + "step": 15590 + }, + { + "epoch": 2.687801516195727, + "grad_norm": 29.427024841308594, + "learning_rate": 6.526294899605878e-09, + "logits/chosen": -1.6281979084014893, + "logits/rejected": -1.5792288780212402, + "logps/chosen": -198.7489776611328, + "logps/rejected": -291.39019775390625, + "loss": 0.5011, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4688153266906738, + "rewards/margins": 0.9297361373901367, + "rewards/rejected": -2.3985512256622314, + "step": 15600 + }, + { + "epoch": 2.687801516195727, + "eval_logits/chosen": -1.7368842363357544, + "eval_logits/rejected": -1.7134876251220703, + "eval_logps/chosen": -199.51185607910156, + "eval_logps/rejected": -238.93487548828125, + "eval_loss": 0.6304026246070862, + "eval_rewards/accuracies": 0.6433550119400024, + "eval_rewards/chosen": -1.4079997539520264, + "eval_rewards/margins": 0.3495476245880127, + "eval_rewards/rejected": -1.7575472593307495, + "eval_runtime": 384.5525, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 15600 + }, + { + "epoch": 2.6895244658855963, + "grad_norm": 31.768056869506836, + "learning_rate": 6.455242590515842e-09, + "logits/chosen": -1.7051315307617188, + "logits/rejected": -1.6663964986801147, + "logps/chosen": -202.0169677734375, + "logps/rejected": -275.9730529785156, + "loss": 0.5375, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4596657752990723, + "rewards/margins": 0.7587411403656006, + "rewards/rejected": -2.218406915664673, + "step": 15610 + }, + { + "epoch": 2.6912474155754653, + "grad_norm": 40.55280303955078, + "learning_rate": 6.384566275551717e-09, + "logits/chosen": -1.5791552066802979, + "logits/rejected": -1.5466220378875732, + "logps/chosen": -186.74722290039062, + "logps/rejected": -279.75335693359375, + "loss": 0.476, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3374412059783936, + "rewards/margins": 0.9051253199577332, + "rewards/rejected": -2.2425665855407715, + "step": 15620 + }, + { + "epoch": 2.6929703652653343, + "grad_norm": 40.39755630493164, + "learning_rate": 6.314266238790089e-09, + "logits/chosen": -1.620631217956543, + "logits/rejected": -1.5486552715301514, + "logps/chosen": -205.1244354248047, + "logps/rejected": -295.75579833984375, + "loss": 0.4499, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.499033808708191, + "rewards/margins": 0.9564018249511719, + "rewards/rejected": -2.4554355144500732, + "step": 15630 + }, + { + "epoch": 2.6946933149552033, + "grad_norm": 33.87063980102539, + "learning_rate": 6.244342762795207e-09, + "logits/chosen": -1.6153934001922607, + "logits/rejected": -1.566318392753601, + "logps/chosen": -202.0092315673828, + "logps/rejected": -296.7959289550781, + "loss": 0.465, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4862453937530518, + "rewards/margins": 0.9477277994155884, + "rewards/rejected": -2.433973550796509, + "step": 15640 + }, + { + "epoch": 2.6964162646450722, + "grad_norm": 49.840118408203125, + "learning_rate": 6.1747961286177205e-09, + "logits/chosen": -1.6278855800628662, + "logits/rejected": -1.5861130952835083, + "logps/chosen": -197.191650390625, + "logps/rejected": -273.5580139160156, + "loss": 0.5397, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4384510517120361, + "rewards/margins": 0.7653406858444214, + "rewards/rejected": -2.203791856765747, + "step": 15650 + }, + { + "epoch": 2.698139214334941, + "grad_norm": 28.98862075805664, + "learning_rate": 6.105626615793602e-09, + "logits/chosen": -1.682408094406128, + "logits/rejected": -1.6390424966812134, + "logps/chosen": -197.5651397705078, + "logps/rejected": -286.9451904296875, + "loss": 0.4832, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4492084980010986, + "rewards/margins": 0.8780032396316528, + "rewards/rejected": -2.327211856842041, + "step": 15660 + }, + { + "epoch": 2.6998621640248106, + "grad_norm": 26.868104934692383, + "learning_rate": 6.036834502343058e-09, + "logits/chosen": -1.5527050495147705, + "logits/rejected": -1.4929711818695068, + "logps/chosen": -196.7747039794922, + "logps/rejected": -273.2555236816406, + "loss": 0.4775, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4153461456298828, + "rewards/margins": 0.8030064702033997, + "rewards/rejected": -2.218352794647217, + "step": 15670 + }, + { + "epoch": 2.7015851137146796, + "grad_norm": 27.163881301879883, + "learning_rate": 5.968420064769342e-09, + "logits/chosen": -1.5822670459747314, + "logits/rejected": -1.5396705865859985, + "logps/chosen": -219.1418914794922, + "logps/rejected": -301.70843505859375, + "loss": 0.5029, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6476762294769287, + "rewards/margins": 0.8261844515800476, + "rewards/rejected": -2.473860263824463, + "step": 15680 + }, + { + "epoch": 2.7033080634045485, + "grad_norm": 22.595386505126953, + "learning_rate": 5.9003835780576774e-09, + "logits/chosen": -1.6135778427124023, + "logits/rejected": -1.5688902139663696, + "logps/chosen": -195.43197631835938, + "logps/rejected": -283.2395324707031, + "loss": 0.4875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4298336505889893, + "rewards/margins": 0.8611891865730286, + "rewards/rejected": -2.291022777557373, + "step": 15690 + }, + { + "epoch": 2.7050310130944175, + "grad_norm": 24.350574493408203, + "learning_rate": 5.832725315674147e-09, + "logits/chosen": -1.6457306146621704, + "logits/rejected": -1.5960192680358887, + "logps/chosen": -206.0160675048828, + "logps/rejected": -292.96295166015625, + "loss": 0.4965, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.539738416671753, + "rewards/margins": 0.8633926510810852, + "rewards/rejected": -2.4031310081481934, + "step": 15700 + }, + { + "epoch": 2.706753962784287, + "grad_norm": 31.922224044799805, + "learning_rate": 5.76544554956463e-09, + "logits/chosen": -1.6166181564331055, + "logits/rejected": -1.5642515420913696, + "logps/chosen": -209.6160430908203, + "logps/rejected": -300.5673828125, + "loss": 0.4717, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5211459398269653, + "rewards/margins": 0.9228206872940063, + "rewards/rejected": -2.4439666271209717, + "step": 15710 + }, + { + "epoch": 2.708476912474156, + "grad_norm": 30.285572052001953, + "learning_rate": 5.698544550153661e-09, + "logits/chosen": -1.64878249168396, + "logits/rejected": -1.613804817199707, + "logps/chosen": -204.9235076904297, + "logps/rejected": -276.12042236328125, + "loss": 0.5199, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5211926698684692, + "rewards/margins": 0.71763014793396, + "rewards/rejected": -2.2388229370117188, + "step": 15720 + }, + { + "epoch": 2.710199862164025, + "grad_norm": 23.428640365600586, + "learning_rate": 5.632022586343333e-09, + "logits/chosen": -1.718743085861206, + "logits/rejected": -1.6677840948104858, + "logps/chosen": -201.1768798828125, + "logps/rejected": -291.433349609375, + "loss": 0.488, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.487426519393921, + "rewards/margins": 0.8930130004882812, + "rewards/rejected": -2.380439281463623, + "step": 15730 + }, + { + "epoch": 2.711922811853894, + "grad_norm": 29.324687957763672, + "learning_rate": 5.565879925512252e-09, + "logits/chosen": -1.6265957355499268, + "logits/rejected": -1.5733954906463623, + "logps/chosen": -205.82687377929688, + "logps/rejected": -276.8540954589844, + "loss": 0.5602, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.527649164199829, + "rewards/margins": 0.7465609312057495, + "rewards/rejected": -2.274210214614868, + "step": 15740 + }, + { + "epoch": 2.713645761543763, + "grad_norm": 40.81529998779297, + "learning_rate": 5.50011683351449e-09, + "logits/chosen": -1.624692678451538, + "logits/rejected": -1.571045160293579, + "logps/chosen": -222.0780487060547, + "logps/rejected": -313.7108459472656, + "loss": 0.4624, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6946592330932617, + "rewards/margins": 0.8892682194709778, + "rewards/rejected": -2.583927631378174, + "step": 15750 + }, + { + "epoch": 2.7153687112336318, + "grad_norm": 24.664043426513672, + "learning_rate": 5.434733574678418e-09, + "logits/chosen": -1.5452042818069458, + "logits/rejected": -1.5038487911224365, + "logps/chosen": -198.94723510742188, + "logps/rejected": -272.55206298828125, + "loss": 0.5557, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4844261407852173, + "rewards/margins": 0.7378342747688293, + "rewards/rejected": -2.2222602367401123, + "step": 15760 + }, + { + "epoch": 2.717091660923501, + "grad_norm": 23.349504470825195, + "learning_rate": 5.369730411805762e-09, + "logits/chosen": -1.6020376682281494, + "logits/rejected": -1.5571599006652832, + "logps/chosen": -187.35842895507812, + "logps/rejected": -284.2367248535156, + "loss": 0.4332, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.3557246923446655, + "rewards/margins": 0.9550348520278931, + "rewards/rejected": -2.3107595443725586, + "step": 15770 + }, + { + "epoch": 2.71881461061337, + "grad_norm": 29.679035186767578, + "learning_rate": 5.3051076061704445e-09, + "logits/chosen": -1.743186593055725, + "logits/rejected": -1.7102177143096924, + "logps/chosen": -215.1242218017578, + "logps/rejected": -272.3497619628906, + "loss": 0.5729, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.607500672340393, + "rewards/margins": 0.5912182331085205, + "rewards/rejected": -2.1987192630767822, + "step": 15780 + }, + { + "epoch": 2.720537560303239, + "grad_norm": 24.34191131591797, + "learning_rate": 5.240865417517604e-09, + "logits/chosen": -1.5436227321624756, + "logits/rejected": -1.5051971673965454, + "logps/chosen": -208.3878173828125, + "logps/rejected": -282.0325622558594, + "loss": 0.5088, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5434716939926147, + "rewards/margins": 0.7497387528419495, + "rewards/rejected": -2.293210506439209, + "step": 15790 + }, + { + "epoch": 2.722260509993108, + "grad_norm": 25.835796356201172, + "learning_rate": 5.177004104062521e-09, + "logits/chosen": -1.6853927373886108, + "logits/rejected": -1.6165730953216553, + "logps/chosen": -198.42984008789062, + "logps/rejected": -282.55657958984375, + "loss": 0.4685, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4018248319625854, + "rewards/margins": 0.9172322154045105, + "rewards/rejected": -2.319056987762451, + "step": 15800 + }, + { + "epoch": 2.7239834596829775, + "grad_norm": 47.76166534423828, + "learning_rate": 5.113523922489571e-09, + "logits/chosen": -1.678775429725647, + "logits/rejected": -1.6477625370025635, + "logps/chosen": -203.407958984375, + "logps/rejected": -277.275390625, + "loss": 0.5445, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5462872982025146, + "rewards/margins": 0.7036650776863098, + "rewards/rejected": -2.249952554702759, + "step": 15810 + }, + { + "epoch": 2.7257064093728465, + "grad_norm": 33.46269607543945, + "learning_rate": 5.0504251279512415e-09, + "logits/chosen": -1.5308297872543335, + "logits/rejected": -1.4846632480621338, + "logps/chosen": -209.490234375, + "logps/rejected": -292.7735595703125, + "loss": 0.5237, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5785003900527954, + "rewards/margins": 0.8312740325927734, + "rewards/rejected": -2.4097743034362793, + "step": 15820 + }, + { + "epoch": 2.7274293590627154, + "grad_norm": 31.780555725097656, + "learning_rate": 4.987707974067046e-09, + "logits/chosen": -1.6617063283920288, + "logits/rejected": -1.6254870891571045, + "logps/chosen": -199.11273193359375, + "logps/rejected": -274.83306884765625, + "loss": 0.5586, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4930975437164307, + "rewards/margins": 0.7451966404914856, + "rewards/rejected": -2.2382943630218506, + "step": 15830 + }, + { + "epoch": 2.7291523087525844, + "grad_norm": 38.7813606262207, + "learning_rate": 4.9253727129224934e-09, + "logits/chosen": -1.6716371774673462, + "logits/rejected": -1.6329853534698486, + "logps/chosen": -222.65463256835938, + "logps/rejected": -313.23272705078125, + "loss": 0.5259, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6936982870101929, + "rewards/margins": 0.882973849773407, + "rewards/rejected": -2.576672077178955, + "step": 15840 + }, + { + "epoch": 2.7308752584424534, + "grad_norm": 36.57883071899414, + "learning_rate": 4.863419595068197e-09, + "logits/chosen": -1.6573702096939087, + "logits/rejected": -1.609955072402954, + "logps/chosen": -194.7985076904297, + "logps/rejected": -280.5198669433594, + "loss": 0.5139, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4152289628982544, + "rewards/margins": 0.8435710668563843, + "rewards/rejected": -2.2588000297546387, + "step": 15850 + }, + { + "epoch": 2.7325982081323223, + "grad_norm": 27.437578201293945, + "learning_rate": 4.801848869518721e-09, + "logits/chosen": -1.6103976964950562, + "logits/rejected": -1.5662734508514404, + "logps/chosen": -205.65383911132812, + "logps/rejected": -264.7209167480469, + "loss": 0.5863, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5253267288208008, + "rewards/margins": 0.6179733276367188, + "rewards/rejected": -2.1433000564575195, + "step": 15860 + }, + { + "epoch": 2.7343211578221913, + "grad_norm": 23.60274887084961, + "learning_rate": 4.740660783751638e-09, + "logits/chosen": -1.628843903541565, + "logits/rejected": -1.5704540014266968, + "logps/chosen": -211.80722045898438, + "logps/rejected": -304.2124938964844, + "loss": 0.4875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.565833330154419, + "rewards/margins": 0.9499748945236206, + "rewards/rejected": -2.515808343887329, + "step": 15870 + }, + { + "epoch": 2.7360441075120607, + "grad_norm": 31.625076293945312, + "learning_rate": 4.679855583706571e-09, + "logits/chosen": -1.5856298208236694, + "logits/rejected": -1.5474889278411865, + "logps/chosen": -198.1217041015625, + "logps/rejected": -290.2977294921875, + "loss": 0.4763, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.462418794631958, + "rewards/margins": 0.9268255233764648, + "rewards/rejected": -2.3892440795898438, + "step": 15880 + }, + { + "epoch": 2.7377670572019297, + "grad_norm": 30.186508178710938, + "learning_rate": 4.619433513784166e-09, + "logits/chosen": -1.6528924703598022, + "logits/rejected": -1.5979883670806885, + "logps/chosen": -201.7820281982422, + "logps/rejected": -274.7981262207031, + "loss": 0.5298, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4912642240524292, + "rewards/margins": 0.7730330228805542, + "rewards/rejected": -2.2642972469329834, + "step": 15890 + }, + { + "epoch": 2.7394900068917987, + "grad_norm": 33.469242095947266, + "learning_rate": 4.559394816845075e-09, + "logits/chosen": -1.6455352306365967, + "logits/rejected": -1.5722758769989014, + "logps/chosen": -216.37216186523438, + "logps/rejected": -298.2059020996094, + "loss": 0.4937, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5656859874725342, + "rewards/margins": 0.8968521356582642, + "rewards/rejected": -2.462538003921509, + "step": 15900 + }, + { + "epoch": 2.741212956581668, + "grad_norm": 30.317359924316406, + "learning_rate": 4.499739734209074e-09, + "logits/chosen": -1.5669691562652588, + "logits/rejected": -1.5114362239837646, + "logps/chosen": -192.9110870361328, + "logps/rejected": -276.04083251953125, + "loss": 0.5079, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3991148471832275, + "rewards/margins": 0.8640725016593933, + "rewards/rejected": -2.2631874084472656, + "step": 15910 + }, + { + "epoch": 2.742935906271537, + "grad_norm": 35.19529342651367, + "learning_rate": 4.440468505653982e-09, + "logits/chosen": -1.5765380859375, + "logits/rejected": -1.5384092330932617, + "logps/chosen": -215.87570190429688, + "logps/rejected": -299.07562255859375, + "loss": 0.5069, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6332439184188843, + "rewards/margins": 0.8158270120620728, + "rewards/rejected": -2.449071168899536, + "step": 15920 + }, + { + "epoch": 2.744658855961406, + "grad_norm": 30.254179000854492, + "learning_rate": 4.381581369414822e-09, + "logits/chosen": -1.510341763496399, + "logits/rejected": -1.4571070671081543, + "logps/chosen": -192.68174743652344, + "logps/rejected": -274.46771240234375, + "loss": 0.4713, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3748412132263184, + "rewards/margins": 0.8734277486801147, + "rewards/rejected": -2.2482690811157227, + "step": 15930 + }, + { + "epoch": 2.746381805651275, + "grad_norm": 22.98348617553711, + "learning_rate": 4.323078562182702e-09, + "logits/chosen": -1.60586678981781, + "logits/rejected": -1.5448791980743408, + "logps/chosen": -201.58407592773438, + "logps/rejected": -300.69329833984375, + "loss": 0.4591, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.4241517782211304, + "rewards/margins": 0.9938325881958008, + "rewards/rejected": -2.4179844856262207, + "step": 15940 + }, + { + "epoch": 2.748104755341144, + "grad_norm": 37.307708740234375, + "learning_rate": 4.2649603191040715e-09, + "logits/chosen": -1.6714118719100952, + "logits/rejected": -1.6232936382293701, + "logps/chosen": -197.21530151367188, + "logps/rejected": -277.82293701171875, + "loss": 0.4869, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3933839797973633, + "rewards/margins": 0.8573741912841797, + "rewards/rejected": -2.250758171081543, + "step": 15950 + }, + { + "epoch": 2.749827705031013, + "grad_norm": 18.794313430786133, + "learning_rate": 4.207226873779557e-09, + "logits/chosen": -1.6430130004882812, + "logits/rejected": -1.5901249647140503, + "logps/chosen": -202.3380126953125, + "logps/rejected": -289.830810546875, + "loss": 0.4925, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4588900804519653, + "rewards/margins": 0.9070623517036438, + "rewards/rejected": -2.365952253341675, + "step": 15960 + }, + { + "epoch": 2.751550654720882, + "grad_norm": 35.651344299316406, + "learning_rate": 4.149878458263179e-09, + "logits/chosen": -1.62399423122406, + "logits/rejected": -1.5761115550994873, + "logps/chosen": -201.8713836669922, + "logps/rejected": -293.91705322265625, + "loss": 0.4755, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4667004346847534, + "rewards/margins": 0.9146144986152649, + "rewards/rejected": -2.381314754486084, + "step": 15970 + }, + { + "epoch": 2.7532736044107513, + "grad_norm": 34.21991729736328, + "learning_rate": 4.092915303061372e-09, + "logits/chosen": -1.59518563747406, + "logits/rejected": -1.5532639026641846, + "logps/chosen": -209.7283477783203, + "logps/rejected": -278.43890380859375, + "loss": 0.5247, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5316201448440552, + "rewards/margins": 0.6966302990913391, + "rewards/rejected": -2.22825026512146, + "step": 15980 + }, + { + "epoch": 2.7549965541006203, + "grad_norm": 30.082977294921875, + "learning_rate": 4.0363376371320366e-09, + "logits/chosen": -1.7484419345855713, + "logits/rejected": -1.7310978174209595, + "logps/chosen": -204.43405151367188, + "logps/rejected": -275.6298828125, + "loss": 0.5516, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4974677562713623, + "rewards/margins": 0.7014755606651306, + "rewards/rejected": -2.1989433765411377, + "step": 15990 + }, + { + "epoch": 2.7567195037904892, + "grad_norm": 29.51515769958496, + "learning_rate": 3.98014568788364e-09, + "logits/chosen": -1.5735108852386475, + "logits/rejected": -1.5224225521087646, + "logps/chosen": -201.62033081054688, + "logps/rejected": -282.30157470703125, + "loss": 0.4936, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4778904914855957, + "rewards/margins": 0.8260573148727417, + "rewards/rejected": -2.303947925567627, + "step": 16000 + }, + { + "epoch": 2.7567195037904892, + "eval_logits/chosen": -1.724768042564392, + "eval_logits/rejected": -1.7010468244552612, + "eval_logps/chosen": -203.6143341064453, + "eval_logps/rejected": -244.05953979492188, + "eval_loss": 0.6303529143333435, + "eval_rewards/accuracies": 0.6435873508453369, + "eval_rewards/chosen": -1.4490246772766113, + "eval_rewards/margins": 0.3597696125507355, + "eval_rewards/rejected": -1.8087942600250244, + "eval_runtime": 384.5273, + "eval_samples_per_second": 11.193, + "eval_steps_per_second": 1.399, + "step": 16000 + }, + { + "epoch": 2.758442453480358, + "grad_norm": 46.15574645996094, + "learning_rate": 3.924339681174293e-09, + "logits/chosen": -1.6742016077041626, + "logits/rejected": -1.640960454940796, + "logps/chosen": -217.3235626220703, + "logps/rejected": -288.28643798828125, + "loss": 0.5747, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6208339929580688, + "rewards/margins": 0.7245651483535767, + "rewards/rejected": -2.3453993797302246, + "step": 16010 + }, + { + "epoch": 2.7601654031702276, + "grad_norm": 20.357431411743164, + "learning_rate": 3.868919841310858e-09, + "logits/chosen": -1.7137123346328735, + "logits/rejected": -1.6709020137786865, + "logps/chosen": -208.59683227539062, + "logps/rejected": -290.96697998046875, + "loss": 0.5329, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5549236536026, + "rewards/margins": 0.8437626957893372, + "rewards/rejected": -2.398686170578003, + "step": 16020 + }, + { + "epoch": 2.7618883528600966, + "grad_norm": 32.056190490722656, + "learning_rate": 3.81388639104806e-09, + "logits/chosen": -1.7042852640151978, + "logits/rejected": -1.659950852394104, + "logps/chosen": -208.49032592773438, + "logps/rejected": -288.7596740722656, + "loss": 0.5151, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5546530485153198, + "rewards/margins": 0.8066908717155457, + "rewards/rejected": -2.3613438606262207, + "step": 16030 + }, + { + "epoch": 2.7636113025499656, + "grad_norm": 45.362579345703125, + "learning_rate": 3.759239551587512e-09, + "logits/chosen": -1.6354458332061768, + "logits/rejected": -1.5884088277816772, + "logps/chosen": -208.31723022460938, + "logps/rejected": -302.8401794433594, + "loss": 0.4802, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5453264713287354, + "rewards/margins": 0.9585615396499634, + "rewards/rejected": -2.50388765335083, + "step": 16040 + }, + { + "epoch": 2.7653342522398345, + "grad_norm": 23.472545623779297, + "learning_rate": 3.7049795425769027e-09, + "logits/chosen": -1.6082700490951538, + "logits/rejected": -1.5660271644592285, + "logps/chosen": -194.77102661132812, + "logps/rejected": -287.49371337890625, + "loss": 0.4379, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.4037247896194458, + "rewards/margins": 0.9330763816833496, + "rewards/rejected": -2.336801052093506, + "step": 16050 + }, + { + "epoch": 2.7670572019297035, + "grad_norm": 27.753414154052734, + "learning_rate": 3.6511065821091314e-09, + "logits/chosen": -1.6502840518951416, + "logits/rejected": -1.6107053756713867, + "logps/chosen": -196.86233520507812, + "logps/rejected": -278.1816101074219, + "loss": 0.497, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4114711284637451, + "rewards/margins": 0.819719135761261, + "rewards/rejected": -2.2311902046203613, + "step": 16060 + }, + { + "epoch": 2.7687801516195725, + "grad_norm": 27.338544845581055, + "learning_rate": 3.597620886721342e-09, + "logits/chosen": -1.5682957172393799, + "logits/rejected": -1.5182629823684692, + "logps/chosen": -199.6961212158203, + "logps/rejected": -282.4446105957031, + "loss": 0.478, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4262382984161377, + "rewards/margins": 0.8562216758728027, + "rewards/rejected": -2.2824597358703613, + "step": 16070 + }, + { + "epoch": 2.770503101309442, + "grad_norm": 29.13926124572754, + "learning_rate": 3.5445226713941457e-09, + "logits/chosen": -1.6415565013885498, + "logits/rejected": -1.5768063068389893, + "logps/chosen": -211.66091918945312, + "logps/rejected": -297.61773681640625, + "loss": 0.4922, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5455818176269531, + "rewards/margins": 0.914897084236145, + "rewards/rejected": -2.4604787826538086, + "step": 16080 + }, + { + "epoch": 2.772226050999311, + "grad_norm": 42.47835922241211, + "learning_rate": 3.491812149550688e-09, + "logits/chosen": -1.6251806020736694, + "logits/rejected": -1.57179856300354, + "logps/chosen": -195.59005737304688, + "logps/rejected": -282.0909118652344, + "loss": 0.4861, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4269095659255981, + "rewards/margins": 0.8857747316360474, + "rewards/rejected": -2.3126845359802246, + "step": 16090 + }, + { + "epoch": 2.77394900068918, + "grad_norm": 32.899444580078125, + "learning_rate": 3.4394895330558284e-09, + "logits/chosen": -1.6690833568572998, + "logits/rejected": -1.6237341165542603, + "logps/chosen": -193.7056427001953, + "logps/rejected": -303.7026672363281, + "loss": 0.4389, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.414886236190796, + "rewards/margins": 1.076838731765747, + "rewards/rejected": -2.491725206375122, + "step": 16100 + }, + { + "epoch": 2.775671950379049, + "grad_norm": 36.35084915161133, + "learning_rate": 3.3875550322152503e-09, + "logits/chosen": -1.5023618936538696, + "logits/rejected": -1.4406607151031494, + "logps/chosen": -200.91744995117188, + "logps/rejected": -295.1684265136719, + "loss": 0.4929, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4381941556930542, + "rewards/margins": 0.9750955700874329, + "rewards/rejected": -2.4132895469665527, + "step": 16110 + }, + { + "epoch": 2.777394900068918, + "grad_norm": 37.906803131103516, + "learning_rate": 3.3360088557746856e-09, + "logits/chosen": -1.5955053567886353, + "logits/rejected": -1.5656285285949707, + "logps/chosen": -195.3086700439453, + "logps/rejected": -264.56658935546875, + "loss": 0.548, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4471538066864014, + "rewards/margins": 0.6703465580940247, + "rewards/rejected": -2.1175003051757812, + "step": 16120 + }, + { + "epoch": 2.779117849758787, + "grad_norm": 30.308712005615234, + "learning_rate": 3.2848512109190375e-09, + "logits/chosen": -1.6084339618682861, + "logits/rejected": -1.5634233951568604, + "logps/chosen": -203.5264892578125, + "logps/rejected": -282.7039794921875, + "loss": 0.5127, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5111953020095825, + "rewards/margins": 0.7994351983070374, + "rewards/rejected": -2.3106303215026855, + "step": 16130 + }, + { + "epoch": 2.780840799448656, + "grad_norm": 28.674461364746094, + "learning_rate": 3.2340823032715125e-09, + "logits/chosen": -1.722328782081604, + "logits/rejected": -1.672581672668457, + "logps/chosen": -196.27162170410156, + "logps/rejected": -284.0904235839844, + "loss": 0.4844, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.435058355331421, + "rewards/margins": 0.8832355737686157, + "rewards/rejected": -2.318293809890747, + "step": 16140 + }, + { + "epoch": 2.782563749138525, + "grad_norm": 30.645694732666016, + "learning_rate": 3.1837023368928017e-09, + "logits/chosen": -1.6766061782836914, + "logits/rejected": -1.6359221935272217, + "logps/chosen": -211.22470092773438, + "logps/rejected": -286.9867858886719, + "loss": 0.525, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5677381753921509, + "rewards/margins": 0.7787638902664185, + "rewards/rejected": -2.3465020656585693, + "step": 16150 + }, + { + "epoch": 2.784286698828394, + "grad_norm": 31.85881996154785, + "learning_rate": 3.133711514280357e-09, + "logits/chosen": -1.685119867324829, + "logits/rejected": -1.6261520385742188, + "logps/chosen": -196.38050842285156, + "logps/rejected": -295.7599792480469, + "loss": 0.4314, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.42711341381073, + "rewards/margins": 0.9946566820144653, + "rewards/rejected": -2.4217700958251953, + "step": 16160 + }, + { + "epoch": 2.786009648518263, + "grad_norm": 23.079660415649414, + "learning_rate": 3.084110036367449e-09, + "logits/chosen": -1.5471277236938477, + "logits/rejected": -1.499712586402893, + "logps/chosen": -210.12258911132812, + "logps/rejected": -288.97479248046875, + "loss": 0.5042, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5617667436599731, + "rewards/margins": 0.7905761003494263, + "rewards/rejected": -2.3523428440093994, + "step": 16170 + }, + { + "epoch": 2.7877325982081325, + "grad_norm": 40.84262466430664, + "learning_rate": 3.034898102522454e-09, + "logits/chosen": -1.5986557006835938, + "logits/rejected": -1.5298030376434326, + "logps/chosen": -221.5828399658203, + "logps/rejected": -308.65576171875, + "loss": 0.5076, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.627313256263733, + "rewards/margins": 0.9360467195510864, + "rewards/rejected": -2.5633597373962402, + "step": 16180 + }, + { + "epoch": 2.7894555478980014, + "grad_norm": 30.877662658691406, + "learning_rate": 2.9860759105479582e-09, + "logits/chosen": -1.6073898077011108, + "logits/rejected": -1.5686115026474, + "logps/chosen": -209.8683624267578, + "logps/rejected": -286.5576171875, + "loss": 0.5461, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5662927627563477, + "rewards/margins": 0.7508216500282288, + "rewards/rejected": -2.3171145915985107, + "step": 16190 + }, + { + "epoch": 2.7911784975878704, + "grad_norm": 30.29778480529785, + "learning_rate": 2.9376436566800667e-09, + "logits/chosen": -1.558695673942566, + "logits/rejected": -1.4981310367584229, + "logps/chosen": -203.7272491455078, + "logps/rejected": -279.0574645996094, + "loss": 0.5288, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4882779121398926, + "rewards/margins": 0.8002364039421082, + "rewards/rejected": -2.2885146141052246, + "step": 16200 + }, + { + "epoch": 2.7929014472777394, + "grad_norm": 42.685707092285156, + "learning_rate": 2.8896015355875492e-09, + "logits/chosen": -1.5141369104385376, + "logits/rejected": -1.4735002517700195, + "logps/chosen": -201.10067749023438, + "logps/rejected": -282.42083740234375, + "loss": 0.508, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.491695523262024, + "rewards/margins": 0.7959169149398804, + "rewards/rejected": -2.2876124382019043, + "step": 16210 + }, + { + "epoch": 2.794624396967609, + "grad_norm": 23.677894592285156, + "learning_rate": 2.841949740371086e-09, + "logits/chosen": -1.5899922847747803, + "logits/rejected": -1.5405527353286743, + "logps/chosen": -198.48683166503906, + "logps/rejected": -298.21026611328125, + "loss": 0.481, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4205026626586914, + "rewards/margins": 1.0091335773468018, + "rewards/rejected": -2.4296364784240723, + "step": 16220 + }, + { + "epoch": 2.7963473466574778, + "grad_norm": 20.0400447845459, + "learning_rate": 2.7946884625624556e-09, + "logits/chosen": -1.6403640508651733, + "logits/rejected": -1.589577078819275, + "logps/chosen": -198.72225952148438, + "logps/rejected": -291.4017639160156, + "loss": 0.4779, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4717034101486206, + "rewards/margins": 0.9280723333358765, + "rewards/rejected": -2.399775981903076, + "step": 16230 + }, + { + "epoch": 2.7980702963473467, + "grad_norm": 33.879493713378906, + "learning_rate": 2.747817892123816e-09, + "logits/chosen": -1.5945513248443604, + "logits/rejected": -1.5502314567565918, + "logps/chosen": -212.589599609375, + "logps/rejected": -299.84051513671875, + "loss": 0.5071, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6102514266967773, + "rewards/margins": 0.8573668599128723, + "rewards/rejected": -2.467618227005005, + "step": 16240 + }, + { + "epoch": 2.7997932460372157, + "grad_norm": 27.390350341796875, + "learning_rate": 2.7013382174468914e-09, + "logits/chosen": -1.6416511535644531, + "logits/rejected": -1.5986303091049194, + "logps/chosen": -207.4696502685547, + "logps/rejected": -279.26654052734375, + "loss": 0.5334, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4999459981918335, + "rewards/margins": 0.7439717054367065, + "rewards/rejected": -2.243917942047119, + "step": 16250 + }, + { + "epoch": 2.8015161957270847, + "grad_norm": 33.779144287109375, + "learning_rate": 2.6552496253522518e-09, + "logits/chosen": -1.6039321422576904, + "logits/rejected": -1.5535815954208374, + "logps/chosen": -209.19546508789062, + "logps/rejected": -307.2878112792969, + "loss": 0.4992, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5651133060455322, + "rewards/margins": 0.9539181590080261, + "rewards/rejected": -2.519031047821045, + "step": 16260 + }, + { + "epoch": 2.8032391454169536, + "grad_norm": 25.58933448791504, + "learning_rate": 2.609552301088558e-09, + "logits/chosen": -1.6411972045898438, + "logits/rejected": -1.598671555519104, + "logps/chosen": -211.4327392578125, + "logps/rejected": -287.0559997558594, + "loss": 0.5453, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5608909130096436, + "rewards/margins": 0.775378942489624, + "rewards/rejected": -2.3362698554992676, + "step": 16270 + }, + { + "epoch": 2.804962095106823, + "grad_norm": 37.81081771850586, + "learning_rate": 2.5642464283317733e-09, + "logits/chosen": -1.7280786037445068, + "logits/rejected": -1.6881927251815796, + "logps/chosen": -210.5127716064453, + "logps/rejected": -286.2266540527344, + "loss": 0.525, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5488097667694092, + "rewards/margins": 0.7880204916000366, + "rewards/rejected": -2.3368306159973145, + "step": 16280 + }, + { + "epoch": 2.806685044796692, + "grad_norm": 21.164901733398438, + "learning_rate": 2.5193321891844866e-09, + "logits/chosen": -1.6742864847183228, + "logits/rejected": -1.6348425149917603, + "logps/chosen": -201.8052215576172, + "logps/rejected": -290.950927734375, + "loss": 0.5032, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4995896816253662, + "rewards/margins": 0.8645893931388855, + "rewards/rejected": -2.3641791343688965, + "step": 16290 + }, + { + "epoch": 2.808407994486561, + "grad_norm": 30.592893600463867, + "learning_rate": 2.4748097641751787e-09, + "logits/chosen": -1.6843475103378296, + "logits/rejected": -1.6307868957519531, + "logps/chosen": -211.84707641601562, + "logps/rejected": -290.2081604003906, + "loss": 0.5295, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5801098346710205, + "rewards/margins": 0.807979941368103, + "rewards/rejected": -2.388089656829834, + "step": 16300 + }, + { + "epoch": 2.81013094417643, + "grad_norm": 35.39067077636719, + "learning_rate": 2.4306793322574014e-09, + "logits/chosen": -1.577487826347351, + "logits/rejected": -1.5398495197296143, + "logps/chosen": -210.31735229492188, + "logps/rejected": -289.4979248046875, + "loss": 0.5332, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5722347497940063, + "rewards/margins": 0.7788803577423096, + "rewards/rejected": -2.3511152267456055, + "step": 16310 + }, + { + "epoch": 2.8118538938662994, + "grad_norm": 37.08136749267578, + "learning_rate": 2.3869410708091787e-09, + "logits/chosen": -1.6066745519638062, + "logits/rejected": -1.5592342615127563, + "logps/chosen": -207.6752471923828, + "logps/rejected": -298.2518310546875, + "loss": 0.4863, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5351449251174927, + "rewards/margins": 0.8967401385307312, + "rewards/rejected": -2.431885242462158, + "step": 16320 + }, + { + "epoch": 2.8135768435561683, + "grad_norm": 20.299909591674805, + "learning_rate": 2.3435951556322386e-09, + "logits/chosen": -1.6628811359405518, + "logits/rejected": -1.6167129278182983, + "logps/chosen": -200.5544891357422, + "logps/rejected": -271.65625, + "loss": 0.5158, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4268579483032227, + "rewards/margins": 0.7481969594955444, + "rewards/rejected": -2.1750550270080566, + "step": 16330 + }, + { + "epoch": 2.8152997932460373, + "grad_norm": 30.356172561645508, + "learning_rate": 2.3006417609513053e-09, + "logits/chosen": -1.531884789466858, + "logits/rejected": -1.496141791343689, + "logps/chosen": -196.16905212402344, + "logps/rejected": -286.60443115234375, + "loss": 0.4647, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4173381328582764, + "rewards/margins": 0.9166313409805298, + "rewards/rejected": -2.3339695930480957, + "step": 16340 + }, + { + "epoch": 2.8170227429359063, + "grad_norm": 27.82748794555664, + "learning_rate": 2.258081059413397e-09, + "logits/chosen": -1.7319214344024658, + "logits/rejected": -1.6799014806747437, + "logps/chosen": -209.30575561523438, + "logps/rejected": -284.62188720703125, + "loss": 0.5076, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5352132320404053, + "rewards/margins": 0.7685121297836304, + "rewards/rejected": -2.303725242614746, + "step": 16350 + }, + { + "epoch": 2.8187456926257752, + "grad_norm": 39.15815353393555, + "learning_rate": 2.2159132220871623e-09, + "logits/chosen": -1.6009477376937866, + "logits/rejected": -1.5514745712280273, + "logps/chosen": -199.82054138183594, + "logps/rejected": -291.30218505859375, + "loss": 0.4756, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.4737600088119507, + "rewards/margins": 0.9305243492126465, + "rewards/rejected": -2.404284715652466, + "step": 16360 + }, + { + "epoch": 2.820468642315644, + "grad_norm": 38.97136306762695, + "learning_rate": 2.174138418462135e-09, + "logits/chosen": -1.5708199739456177, + "logits/rejected": -1.5296893119812012, + "logps/chosen": -204.8894805908203, + "logps/rejected": -271.0523986816406, + "loss": 0.5688, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5094388723373413, + "rewards/margins": 0.6989701986312866, + "rewards/rejected": -2.208408832550049, + "step": 16370 + }, + { + "epoch": 2.822191592005513, + "grad_norm": 43.11518478393555, + "learning_rate": 2.132756816448111e-09, + "logits/chosen": -1.6394847631454468, + "logits/rejected": -1.6021696329116821, + "logps/chosen": -198.41256713867188, + "logps/rejected": -283.98065185546875, + "loss": 0.4834, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.441158413887024, + "rewards/margins": 0.8487105369567871, + "rewards/rejected": -2.2898690700531006, + "step": 16380 + }, + { + "epoch": 2.8239145416953826, + "grad_norm": 24.530704498291016, + "learning_rate": 2.0917685823744426e-09, + "logits/chosen": -1.5554282665252686, + "logits/rejected": -1.5096482038497925, + "logps/chosen": -193.83892822265625, + "logps/rejected": -281.29248046875, + "loss": 0.5268, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4140729904174805, + "rewards/margins": 0.876665472984314, + "rewards/rejected": -2.290738582611084, + "step": 16390 + }, + { + "epoch": 2.8256374913852516, + "grad_norm": 34.35622787475586, + "learning_rate": 2.0511738809894097e-09, + "logits/chosen": -1.5805435180664062, + "logits/rejected": -1.538383960723877, + "logps/chosen": -198.9628448486328, + "logps/rejected": -282.85791015625, + "loss": 0.4952, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4447994232177734, + "rewards/margins": 0.8364402651786804, + "rewards/rejected": -2.2812397480010986, + "step": 16400 + }, + { + "epoch": 2.8256374913852516, + "eval_logits/chosen": -1.7279173135757446, + "eval_logits/rejected": -1.704264760017395, + "eval_logps/chosen": -203.5388946533203, + "eval_logps/rejected": -243.77938842773438, + "eval_loss": 0.6312301158905029, + "eval_rewards/accuracies": 0.6438196897506714, + "eval_rewards/chosen": -1.4482700824737549, + "eval_rewards/margins": 0.3577224314212799, + "eval_rewards/rejected": -1.8059924840927124, + "eval_runtime": 384.8549, + "eval_samples_per_second": 11.183, + "eval_steps_per_second": 1.398, + "step": 16400 + }, + { + "epoch": 2.8273604410751205, + "grad_norm": 32.33806228637695, + "learning_rate": 2.0109728754594713e-09, + "logits/chosen": -1.6441633701324463, + "logits/rejected": -1.5825875997543335, + "logps/chosen": -215.2042999267578, + "logps/rejected": -294.2441711425781, + "loss": 0.4817, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5997352600097656, + "rewards/margins": 0.8535000085830688, + "rewards/rejected": -2.453235149383545, + "step": 16410 + }, + { + "epoch": 2.82908339076499, + "grad_norm": 36.8965950012207, + "learning_rate": 1.9711657273686844e-09, + "logits/chosen": -1.5145912170410156, + "logits/rejected": -1.4785223007202148, + "logps/chosen": -207.9575653076172, + "logps/rejected": -281.3797912597656, + "loss": 0.5481, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5369865894317627, + "rewards/margins": 0.7095264792442322, + "rewards/rejected": -2.2465128898620605, + "step": 16420 + }, + { + "epoch": 2.830806340454859, + "grad_norm": 47.45038604736328, + "learning_rate": 1.93175259671805e-09, + "logits/chosen": -1.5235130786895752, + "logits/rejected": -1.4861727952957153, + "logps/chosen": -211.5547332763672, + "logps/rejected": -287.1858825683594, + "loss": 0.5383, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.575508713722229, + "rewards/margins": 0.7593789100646973, + "rewards/rejected": -2.3348875045776367, + "step": 16430 + }, + { + "epoch": 2.832529290144728, + "grad_norm": 23.88429832458496, + "learning_rate": 1.8927336419248596e-09, + "logits/chosen": -1.6044429540634155, + "logits/rejected": -1.5553555488586426, + "logps/chosen": -198.04505920410156, + "logps/rejected": -294.0986633300781, + "loss": 0.4818, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.468567132949829, + "rewards/margins": 0.954384982585907, + "rewards/rejected": -2.422952175140381, + "step": 16440 + }, + { + "epoch": 2.834252239834597, + "grad_norm": 33.20879364013672, + "learning_rate": 1.8541090198220144e-09, + "logits/chosen": -1.6844584941864014, + "logits/rejected": -1.629431962966919, + "logps/chosen": -202.9315185546875, + "logps/rejected": -305.4527282714844, + "loss": 0.4223, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4656628370285034, + "rewards/margins": 1.037574052810669, + "rewards/rejected": -2.503237009048462, + "step": 16450 + }, + { + "epoch": 2.835975189524466, + "grad_norm": 29.70263671875, + "learning_rate": 1.8158788856574624e-09, + "logits/chosen": -1.4988144636154175, + "logits/rejected": -1.461374282836914, + "logps/chosen": -189.18653869628906, + "logps/rejected": -283.64410400390625, + "loss": 0.4796, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3982244729995728, + "rewards/margins": 0.9302972555160522, + "rewards/rejected": -2.328521966934204, + "step": 16460 + }, + { + "epoch": 2.837698139214335, + "grad_norm": 39.5974235534668, + "learning_rate": 1.7780433930935312e-09, + "logits/chosen": -1.60671865940094, + "logits/rejected": -1.5581773519515991, + "logps/chosen": -194.26234436035156, + "logps/rejected": -291.94342041015625, + "loss": 0.4924, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4329856634140015, + "rewards/margins": 0.9630458950996399, + "rewards/rejected": -2.396031618118286, + "step": 16470 + }, + { + "epoch": 2.8394210889042037, + "grad_norm": 30.109529495239258, + "learning_rate": 1.74060269420635e-09, + "logits/chosen": -1.5139915943145752, + "logits/rejected": -1.465388536453247, + "logps/chosen": -195.1603546142578, + "logps/rejected": -292.16363525390625, + "loss": 0.4479, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4241392612457275, + "rewards/margins": 0.9566676020622253, + "rewards/rejected": -2.3808071613311768, + "step": 16480 + }, + { + "epoch": 2.841144038594073, + "grad_norm": 30.47223663330078, + "learning_rate": 1.7035569394851955e-09, + "logits/chosen": -1.6082112789154053, + "logits/rejected": -1.570446252822876, + "logps/chosen": -199.833984375, + "logps/rejected": -267.7076721191406, + "loss": 0.5319, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.477171540260315, + "rewards/margins": 0.6900812387466431, + "rewards/rejected": -2.167252779006958, + "step": 16490 + }, + { + "epoch": 2.842866988283942, + "grad_norm": 21.7894287109375, + "learning_rate": 1.6669062778318698e-09, + "logits/chosen": -1.649877905845642, + "logits/rejected": -1.5828702449798584, + "logps/chosen": -198.849365234375, + "logps/rejected": -262.10211181640625, + "loss": 0.5217, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4084839820861816, + "rewards/margins": 0.7134264707565308, + "rewards/rejected": -2.121910572052002, + "step": 16500 + }, + { + "epoch": 2.844589937973811, + "grad_norm": 36.085540771484375, + "learning_rate": 1.6306508565602228e-09, + "logits/chosen": -1.6468229293823242, + "logits/rejected": -1.607865571975708, + "logps/chosen": -201.9567108154297, + "logps/rejected": -278.9889221191406, + "loss": 0.5053, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4504163265228271, + "rewards/margins": 0.7838245034217834, + "rewards/rejected": -2.234240770339966, + "step": 16510 + }, + { + "epoch": 2.84631288766368, + "grad_norm": 32.19239807128906, + "learning_rate": 1.5947908213953753e-09, + "logits/chosen": -1.7317917346954346, + "logits/rejected": -1.6700223684310913, + "logps/chosen": -210.5098114013672, + "logps/rejected": -311.977783203125, + "loss": 0.4399, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.544738531112671, + "rewards/margins": 1.038854956626892, + "rewards/rejected": -2.5835938453674316, + "step": 16520 + }, + { + "epoch": 2.8480358373535495, + "grad_norm": 43.740455627441406, + "learning_rate": 1.5593263164732972e-09, + "logits/chosen": -1.563025712966919, + "logits/rejected": -1.519561767578125, + "logps/chosen": -209.86190795898438, + "logps/rejected": -267.63372802734375, + "loss": 0.5561, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5369549989700317, + "rewards/margins": 0.6146339178085327, + "rewards/rejected": -2.1515889167785645, + "step": 16530 + }, + { + "epoch": 2.8497587870434185, + "grad_norm": 29.081884384155273, + "learning_rate": 1.5242574843401524e-09, + "logits/chosen": -1.608612298965454, + "logits/rejected": -1.5656510591506958, + "logps/chosen": -207.4720001220703, + "logps/rejected": -281.773193359375, + "loss": 0.5244, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5441038608551025, + "rewards/margins": 0.7779989838600159, + "rewards/rejected": -2.3221025466918945, + "step": 16540 + }, + { + "epoch": 2.8514817367332874, + "grad_norm": 33.823883056640625, + "learning_rate": 1.489584465951721e-09, + "logits/chosen": -1.5540907382965088, + "logits/rejected": -1.5084948539733887, + "logps/chosen": -210.51278686523438, + "logps/rejected": -310.1302185058594, + "loss": 0.4586, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5683314800262451, + "rewards/margins": 0.985478401184082, + "rewards/rejected": -2.553809642791748, + "step": 16550 + }, + { + "epoch": 2.8532046864231564, + "grad_norm": 31.734004974365234, + "learning_rate": 1.455307400672845e-09, + "logits/chosen": -1.6425979137420654, + "logits/rejected": -1.5943950414657593, + "logps/chosen": -197.49183654785156, + "logps/rejected": -283.3627624511719, + "loss": 0.489, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4330264329910278, + "rewards/margins": 0.8302183151245117, + "rewards/rejected": -2.263244152069092, + "step": 16560 + }, + { + "epoch": 2.8549276361130254, + "grad_norm": 26.054500579833984, + "learning_rate": 1.421426426276895e-09, + "logits/chosen": -1.691537618637085, + "logits/rejected": -1.6329768896102905, + "logps/chosen": -201.1520233154297, + "logps/rejected": -289.4585876464844, + "loss": 0.4781, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4781467914581299, + "rewards/margins": 0.9025301933288574, + "rewards/rejected": -2.380676746368408, + "step": 16570 + }, + { + "epoch": 2.8566505858028943, + "grad_norm": 35.0569953918457, + "learning_rate": 1.3879416789451815e-09, + "logits/chosen": -1.5840961933135986, + "logits/rejected": -1.5487277507781982, + "logps/chosen": -207.152099609375, + "logps/rejected": -277.3089904785156, + "loss": 0.5871, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5626401901245117, + "rewards/margins": 0.6830005049705505, + "rewards/rejected": -2.245640754699707, + "step": 16580 + }, + { + "epoch": 2.8583735354927637, + "grad_norm": 52.74144744873047, + "learning_rate": 1.3548532932663891e-09, + "logits/chosen": -1.5398337841033936, + "logits/rejected": -1.4927947521209717, + "logps/chosen": -215.7863006591797, + "logps/rejected": -278.1468200683594, + "loss": 0.5743, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6190001964569092, + "rewards/margins": 0.6568753123283386, + "rewards/rejected": -2.2758755683898926, + "step": 16590 + }, + { + "epoch": 2.8600964851826327, + "grad_norm": 35.544647216796875, + "learning_rate": 1.3221614022361105e-09, + "logits/chosen": -1.610530138015747, + "logits/rejected": -1.5752967596054077, + "logps/chosen": -213.9752197265625, + "logps/rejected": -285.7201232910156, + "loss": 0.536, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5679616928100586, + "rewards/margins": 0.7312275767326355, + "rewards/rejected": -2.2991890907287598, + "step": 16600 + }, + { + "epoch": 2.8618194348725017, + "grad_norm": 32.12127685546875, + "learning_rate": 1.289866137256257e-09, + "logits/chosen": -1.5730324983596802, + "logits/rejected": -1.5143897533416748, + "logps/chosen": -218.294189453125, + "logps/rejected": -306.3438720703125, + "loss": 0.5074, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6378536224365234, + "rewards/margins": 0.8943823575973511, + "rewards/rejected": -2.532235622406006, + "step": 16610 + }, + { + "epoch": 2.8635423845623706, + "grad_norm": 24.77232551574707, + "learning_rate": 1.2579676281345042e-09, + "logits/chosen": -1.5598160028457642, + "logits/rejected": -1.5061019659042358, + "logps/chosen": -201.98837280273438, + "logps/rejected": -283.95306396484375, + "loss": 0.5158, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.448168396949768, + "rewards/margins": 0.8431085348129272, + "rewards/rejected": -2.291276693344116, + "step": 16620 + }, + { + "epoch": 2.86526533425224, + "grad_norm": 62.111820220947266, + "learning_rate": 1.2264660030838592e-09, + "logits/chosen": -1.550568699836731, + "logits/rejected": -1.4934922456741333, + "logps/chosen": -213.20541381835938, + "logps/rejected": -285.7199401855469, + "loss": 0.535, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5055577754974365, + "rewards/margins": 0.8336159586906433, + "rewards/rejected": -2.3391735553741455, + "step": 16630 + }, + { + "epoch": 2.866988283942109, + "grad_norm": 46.97129821777344, + "learning_rate": 1.195361388722038e-09, + "logits/chosen": -1.6127160787582397, + "logits/rejected": -1.5536266565322876, + "logps/chosen": -221.4212188720703, + "logps/rejected": -303.21820068359375, + "loss": 0.5219, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6521766185760498, + "rewards/margins": 0.8641535043716431, + "rewards/rejected": -2.5163302421569824, + "step": 16640 + }, + { + "epoch": 2.868711233631978, + "grad_norm": 33.028594970703125, + "learning_rate": 1.1646539100710562e-09, + "logits/chosen": -1.5643160343170166, + "logits/rejected": -1.5265575647354126, + "logps/chosen": -190.27337646484375, + "logps/rejected": -276.12451171875, + "loss": 0.477, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3563764095306396, + "rewards/margins": 0.8850062489509583, + "rewards/rejected": -2.2413830757141113, + "step": 16650 + }, + { + "epoch": 2.870434183321847, + "grad_norm": 30.924272537231445, + "learning_rate": 1.1343436905566495e-09, + "logits/chosen": -1.6248385906219482, + "logits/rejected": -1.5855129957199097, + "logps/chosen": -205.64205932617188, + "logps/rejected": -295.14398193359375, + "loss": 0.4859, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5187439918518066, + "rewards/margins": 0.8884479403495789, + "rewards/rejected": -2.407191753387451, + "step": 16660 + }, + { + "epoch": 2.872157133011716, + "grad_norm": 28.749929428100586, + "learning_rate": 1.1044308520078316e-09, + "logits/chosen": -1.5990875959396362, + "logits/rejected": -1.5556060075759888, + "logps/chosen": -194.87875366210938, + "logps/rejected": -295.0609130859375, + "loss": 0.4493, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.434263825416565, + "rewards/margins": 0.9827602505683899, + "rewards/rejected": -2.4170238971710205, + "step": 16670 + }, + { + "epoch": 2.873880082701585, + "grad_norm": 49.628292083740234, + "learning_rate": 1.0749155146563493e-09, + "logits/chosen": -1.5153428316116333, + "logits/rejected": -1.4865665435791016, + "logps/chosen": -206.2247314453125, + "logps/rejected": -273.4560852050781, + "loss": 0.5717, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5732797384262085, + "rewards/margins": 0.673078179359436, + "rewards/rejected": -2.2463576793670654, + "step": 16680 + }, + { + "epoch": 2.8756030323914543, + "grad_norm": 31.62681007385254, + "learning_rate": 1.0457977971362831e-09, + "logits/chosen": -1.652151107788086, + "logits/rejected": -1.6131757497787476, + "logps/chosen": -202.0923614501953, + "logps/rejected": -270.86669921875, + "loss": 0.5374, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4801733493804932, + "rewards/margins": 0.692054033279419, + "rewards/rejected": -2.172227382659912, + "step": 16690 + }, + { + "epoch": 2.8773259820813233, + "grad_norm": 44.19800567626953, + "learning_rate": 1.0170778164834581e-09, + "logits/chosen": -1.728794813156128, + "logits/rejected": -1.6931133270263672, + "logps/chosen": -209.34469604492188, + "logps/rejected": -271.6888427734375, + "loss": 0.5925, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5546461343765259, + "rewards/margins": 0.6288726329803467, + "rewards/rejected": -2.183518648147583, + "step": 16700 + }, + { + "epoch": 2.8790489317711923, + "grad_norm": 34.03745651245117, + "learning_rate": 9.887556881350901e-10, + "logits/chosen": -1.6251108646392822, + "logits/rejected": -1.5769708156585693, + "logps/chosen": -209.3572998046875, + "logps/rejected": -299.29962158203125, + "loss": 0.5208, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5848195552825928, + "rewards/margins": 0.8981760740280151, + "rewards/rejected": -2.4829957485198975, + "step": 16710 + }, + { + "epoch": 2.8807718814610612, + "grad_norm": 41.1098747253418, + "learning_rate": 9.608315259292288e-10, + "logits/chosen": -1.5905859470367432, + "logits/rejected": -1.5348126888275146, + "logps/chosen": -210.2429656982422, + "logps/rejected": -285.15679931640625, + "loss": 0.5151, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5592763423919678, + "rewards/margins": 0.8024843335151672, + "rewards/rejected": -2.3617606163024902, + "step": 16720 + }, + { + "epoch": 2.8824948311509306, + "grad_norm": 38.210693359375, + "learning_rate": 9.333054421043484e-10, + "logits/chosen": -1.576797604560852, + "logits/rejected": -1.5253446102142334, + "logps/chosen": -192.76980590820312, + "logps/rejected": -279.8339538574219, + "loss": 0.5093, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4076998233795166, + "rewards/margins": 0.8691729307174683, + "rewards/rejected": -2.2768726348876953, + "step": 16730 + }, + { + "epoch": 2.8842177808407996, + "grad_norm": 37.603309631347656, + "learning_rate": 9.06177547298892e-10, + "logits/chosen": -1.5681672096252441, + "logits/rejected": -1.5280646085739136, + "logps/chosen": -204.23959350585938, + "logps/rejected": -271.69366455078125, + "loss": 0.5554, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4879558086395264, + "rewards/margins": 0.6997446417808533, + "rewards/rejected": -2.1877007484436035, + "step": 16740 + }, + { + "epoch": 2.8859407305306686, + "grad_norm": 31.856191635131836, + "learning_rate": 8.794479505508268e-10, + "logits/chosen": -1.6667163372039795, + "logits/rejected": -1.6266586780548096, + "logps/chosen": -197.18392944335938, + "logps/rejected": -294.75921630859375, + "loss": 0.4606, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4334996938705444, + "rewards/margins": 0.9352189898490906, + "rewards/rejected": -2.3687186241149902, + "step": 16750 + }, + { + "epoch": 2.8876636802205375, + "grad_norm": 26.064311981201172, + "learning_rate": 8.531167592971566e-10, + "logits/chosen": -1.5521303415298462, + "logits/rejected": -1.509796380996704, + "logps/chosen": -211.45907592773438, + "logps/rejected": -292.41741943359375, + "loss": 0.5033, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5620230436325073, + "rewards/margins": 0.8341752886772156, + "rewards/rejected": -2.396198272705078, + "step": 16760 + }, + { + "epoch": 2.8893866299104065, + "grad_norm": 27.30735969543457, + "learning_rate": 8.271840793735884e-10, + "logits/chosen": -1.6859509944915771, + "logits/rejected": -1.6200199127197266, + "logps/chosen": -213.5120391845703, + "logps/rejected": -292.6845703125, + "loss": 0.5537, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5698108673095703, + "rewards/margins": 0.8569631576538086, + "rewards/rejected": -2.4267737865448, + "step": 16770 + }, + { + "epoch": 2.8911095796002755, + "grad_norm": 41.18815231323242, + "learning_rate": 8.016500150140215e-10, + "logits/chosen": -1.6033258438110352, + "logits/rejected": -1.5633761882781982, + "logps/chosen": -202.39730834960938, + "logps/rejected": -287.2729797363281, + "loss": 0.5114, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4809529781341553, + "rewards/margins": 0.8144389986991882, + "rewards/rejected": -2.2953920364379883, + "step": 16780 + }, + { + "epoch": 2.892832529290145, + "grad_norm": 27.463346481323242, + "learning_rate": 7.765146688501589e-10, + "logits/chosen": -1.6069574356079102, + "logits/rejected": -1.5416642427444458, + "logps/chosen": -205.8412628173828, + "logps/rejected": -273.67486572265625, + "loss": 0.5541, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4828940629959106, + "rewards/margins": 0.7489983439445496, + "rewards/rejected": -2.2318923473358154, + "step": 16790 + }, + { + "epoch": 2.894555478980014, + "grad_norm": 33.9183235168457, + "learning_rate": 7.51778141911108e-10, + "logits/chosen": -1.6445128917694092, + "logits/rejected": -1.6061605215072632, + "logps/chosen": -202.0955810546875, + "logps/rejected": -287.78717041015625, + "loss": 0.5024, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.530413269996643, + "rewards/margins": 0.8305818438529968, + "rewards/rejected": -2.360995292663574, + "step": 16800 + }, + { + "epoch": 2.894555478980014, + "eval_logits/chosen": -1.7274123430252075, + "eval_logits/rejected": -1.703658938407898, + "eval_logps/chosen": -203.63075256347656, + "eval_logps/rejected": -244.1201171875, + "eval_loss": 0.6304458379745483, + "eval_rewards/accuracies": 0.6428903341293335, + "eval_rewards/chosen": -1.4491883516311646, + "eval_rewards/margins": 0.3602113127708435, + "eval_rewards/rejected": -1.8093998432159424, + "eval_runtime": 384.5599, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 16800 + }, + { + "epoch": 2.896278428669883, + "grad_norm": 29.492122650146484, + "learning_rate": 7.274405336229361e-10, + "logits/chosen": -1.5904494524002075, + "logits/rejected": -1.5278266668319702, + "logps/chosen": -187.41946411132812, + "logps/rejected": -273.6427917480469, + "loss": 0.4848, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3402408361434937, + "rewards/margins": 0.9011996388435364, + "rewards/rejected": -2.241440534591675, + "step": 16810 + }, + { + "epoch": 2.898001378359752, + "grad_norm": 30.319656372070312, + "learning_rate": 7.035019418083376e-10, + "logits/chosen": -1.65801203250885, + "logits/rejected": -1.610608696937561, + "logps/chosen": -205.1194305419922, + "logps/rejected": -271.00213623046875, + "loss": 0.5268, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.499645471572876, + "rewards/margins": 0.689041018486023, + "rewards/rejected": -2.1886868476867676, + "step": 16820 + }, + { + "epoch": 2.899724328049621, + "grad_norm": 33.45829772949219, + "learning_rate": 6.799624626861456e-10, + "logits/chosen": -1.7170978784561157, + "logits/rejected": -1.6595051288604736, + "logps/chosen": -221.271728515625, + "logps/rejected": -323.4475402832031, + "loss": 0.4892, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6707754135131836, + "rewards/margins": 1.010047197341919, + "rewards/rejected": -2.6808226108551025, + "step": 16830 + }, + { + "epoch": 2.90144727773949, + "grad_norm": 43.579429626464844, + "learning_rate": 6.568221908710314e-10, + "logits/chosen": -1.548855185508728, + "logits/rejected": -1.5003758668899536, + "logps/chosen": -199.71316528320312, + "logps/rejected": -278.1864013671875, + "loss": 0.4935, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4677910804748535, + "rewards/margins": 0.8026076555252075, + "rewards/rejected": -2.2703986167907715, + "step": 16840 + }, + { + "epoch": 2.903170227429359, + "grad_norm": 41.76044845581055, + "learning_rate": 6.340812193730949e-10, + "logits/chosen": -1.6095333099365234, + "logits/rejected": -1.5596811771392822, + "logps/chosen": -218.263427734375, + "logps/rejected": -290.07415771484375, + "loss": 0.4987, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6301006078720093, + "rewards/margins": 0.7804665565490723, + "rewards/rejected": -2.410567283630371, + "step": 16850 + }, + { + "epoch": 2.904893177119228, + "grad_norm": 27.137157440185547, + "learning_rate": 6.117396395974749e-10, + "logits/chosen": -1.6121982336044312, + "logits/rejected": -1.5542309284210205, + "logps/chosen": -211.4744873046875, + "logps/rejected": -285.07366943359375, + "loss": 0.5225, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5618984699249268, + "rewards/margins": 0.7801223993301392, + "rewards/rejected": -2.3420207500457764, + "step": 16860 + }, + { + "epoch": 2.906616126809097, + "grad_norm": 52.22926330566406, + "learning_rate": 5.897975413439837e-10, + "logits/chosen": -1.6715095043182373, + "logits/rejected": -1.6253564357757568, + "logps/chosen": -204.73321533203125, + "logps/rejected": -276.33856201171875, + "loss": 0.5613, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5167036056518555, + "rewards/margins": 0.738450288772583, + "rewards/rejected": -2.2551538944244385, + "step": 16870 + }, + { + "epoch": 2.908339076498966, + "grad_norm": 36.053855895996094, + "learning_rate": 5.682550128067731e-10, + "logits/chosen": -1.6771320104599, + "logits/rejected": -1.628893494606018, + "logps/chosen": -200.6133575439453, + "logps/rejected": -296.2360534667969, + "loss": 0.4839, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5162712335586548, + "rewards/margins": 0.9246581196784973, + "rewards/rejected": -2.440929412841797, + "step": 16880 + }, + { + "epoch": 2.910062026188835, + "grad_norm": 39.545101165771484, + "learning_rate": 5.471121405739687e-10, + "logits/chosen": -1.5920666456222534, + "logits/rejected": -1.5557562112808228, + "logps/chosen": -213.32894897460938, + "logps/rejected": -285.2569274902344, + "loss": 0.5177, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5840203762054443, + "rewards/margins": 0.7276771664619446, + "rewards/rejected": -2.311697483062744, + "step": 16890 + }, + { + "epoch": 2.9117849758787044, + "grad_norm": 46.50544357299805, + "learning_rate": 5.263690096273033e-10, + "logits/chosen": -1.664858102798462, + "logits/rejected": -1.6239697933197021, + "logps/chosen": -207.4394989013672, + "logps/rejected": -278.8927001953125, + "loss": 0.5072, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5422718524932861, + "rewards/margins": 0.7455980181694031, + "rewards/rejected": -2.287869930267334, + "step": 16900 + }, + { + "epoch": 2.9135079255685734, + "grad_norm": 24.831132888793945, + "learning_rate": 5.060257033417725e-10, + "logits/chosen": -1.6837489604949951, + "logits/rejected": -1.640244722366333, + "logps/chosen": -203.83706665039062, + "logps/rejected": -283.9046325683594, + "loss": 0.5361, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5224006175994873, + "rewards/margins": 0.7805654406547546, + "rewards/rejected": -2.3029661178588867, + "step": 16910 + }, + { + "epoch": 2.9152308752584424, + "grad_norm": 26.15068817138672, + "learning_rate": 4.860823034853468e-10, + "logits/chosen": -1.5878162384033203, + "logits/rejected": -1.5520457029342651, + "logps/chosen": -203.78700256347656, + "logps/rejected": -264.70123291015625, + "loss": 0.5722, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.490502953529358, + "rewards/margins": 0.6168953776359558, + "rewards/rejected": -2.107398271560669, + "step": 16920 + }, + { + "epoch": 2.9169538249483113, + "grad_norm": 29.456026077270508, + "learning_rate": 4.66538890218593e-10, + "logits/chosen": -1.6750223636627197, + "logits/rejected": -1.6385116577148438, + "logps/chosen": -193.2615203857422, + "logps/rejected": -263.14617919921875, + "loss": 0.5374, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3699278831481934, + "rewards/margins": 0.7440475225448608, + "rewards/rejected": -2.1139755249023438, + "step": 16930 + }, + { + "epoch": 2.9186767746381808, + "grad_norm": 25.665470123291016, + "learning_rate": 4.4739554209437536e-10, + "logits/chosen": -1.646510124206543, + "logits/rejected": -1.607150673866272, + "logps/chosen": -201.77035522460938, + "logps/rejected": -275.34112548828125, + "loss": 0.5214, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4379570484161377, + "rewards/margins": 0.7657677531242371, + "rewards/rejected": -2.2037246227264404, + "step": 16940 + }, + { + "epoch": 2.9203997243280497, + "grad_norm": 29.399169921875, + "learning_rate": 4.286523360575334e-10, + "logits/chosen": -1.585037112236023, + "logits/rejected": -1.55281662940979, + "logps/chosen": -205.02139282226562, + "logps/rejected": -292.2770690917969, + "loss": 0.5452, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5497676134109497, + "rewards/margins": 0.8234025239944458, + "rewards/rejected": -2.3731701374053955, + "step": 16950 + }, + { + "epoch": 2.9221226740179187, + "grad_norm": 31.07048988342285, + "learning_rate": 4.103093474445818e-10, + "logits/chosen": -1.623678207397461, + "logits/rejected": -1.5673491954803467, + "logps/chosen": -203.4460906982422, + "logps/rejected": -308.8279113769531, + "loss": 0.4582, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.483617901802063, + "rewards/margins": 1.052154541015625, + "rewards/rejected": -2.5357725620269775, + "step": 16960 + }, + { + "epoch": 2.9238456237077877, + "grad_norm": 33.71870422363281, + "learning_rate": 3.9236664998338885e-10, + "logits/chosen": -1.696584939956665, + "logits/rejected": -1.6441404819488525, + "logps/chosen": -213.5078582763672, + "logps/rejected": -295.99932861328125, + "loss": 0.5039, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.577401041984558, + "rewards/margins": 0.8441799283027649, + "rewards/rejected": -2.4215807914733887, + "step": 16970 + }, + { + "epoch": 2.9255685733976566, + "grad_norm": 31.795190811157227, + "learning_rate": 3.7482431579289873e-10, + "logits/chosen": -1.687766432762146, + "logits/rejected": -1.6378850936889648, + "logps/chosen": -198.27952575683594, + "logps/rejected": -275.29119873046875, + "loss": 0.493, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4469237327575684, + "rewards/margins": 0.7793303728103638, + "rewards/rejected": -2.2262539863586426, + "step": 16980 + }, + { + "epoch": 2.9272915230875256, + "grad_norm": 26.28533172607422, + "learning_rate": 3.5768241538282064e-10, + "logits/chosen": -1.7197927236557007, + "logits/rejected": -1.6632732152938843, + "logps/chosen": -184.9141387939453, + "logps/rejected": -280.8218688964844, + "loss": 0.4643, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.3191200494766235, + "rewards/margins": 0.9638843536376953, + "rewards/rejected": -2.2830045223236084, + "step": 16990 + }, + { + "epoch": 2.929014472777395, + "grad_norm": 47.96595001220703, + "learning_rate": 3.4094101765338446e-10, + "logits/chosen": -1.6959985494613647, + "logits/rejected": -1.6554844379425049, + "logps/chosen": -206.0132293701172, + "logps/rejected": -269.5046081542969, + "loss": 0.5616, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5034517049789429, + "rewards/margins": 0.6630902290344238, + "rewards/rejected": -2.1665420532226562, + "step": 17000 + }, + { + "epoch": 2.930737422467264, + "grad_norm": 26.475934982299805, + "learning_rate": 3.24600189895019e-10, + "logits/chosen": -1.6051123142242432, + "logits/rejected": -1.5607998371124268, + "logps/chosen": -209.7019805908203, + "logps/rejected": -293.5786437988281, + "loss": 0.5056, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5677287578582764, + "rewards/margins": 0.8212777376174927, + "rewards/rejected": -2.3890066146850586, + "step": 17010 + }, + { + "epoch": 2.932460372157133, + "grad_norm": 39.69713592529297, + "learning_rate": 3.086599977880855e-10, + "logits/chosen": -1.5805712938308716, + "logits/rejected": -1.5696442127227783, + "logps/chosen": -215.4978790283203, + "logps/rejected": -271.7910461425781, + "loss": 0.6024, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.621180534362793, + "rewards/margins": 0.5600312352180481, + "rewards/rejected": -2.1812117099761963, + "step": 17020 + }, + { + "epoch": 2.934183321847002, + "grad_norm": 35.4920539855957, + "learning_rate": 2.931205054026775e-10, + "logits/chosen": -1.6319687366485596, + "logits/rejected": -1.586881160736084, + "logps/chosen": -210.61434936523438, + "logps/rejected": -277.21807861328125, + "loss": 0.5437, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5412108898162842, + "rewards/margins": 0.7014018893241882, + "rewards/rejected": -2.242612838745117, + "step": 17030 + }, + { + "epoch": 2.9359062715368713, + "grad_norm": 27.13518714904785, + "learning_rate": 2.7798177519826605e-10, + "logits/chosen": -1.6823952198028564, + "logits/rejected": -1.6340644359588623, + "logps/chosen": -220.3932342529297, + "logps/rejected": -293.453369140625, + "loss": 0.512, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6454026699066162, + "rewards/margins": 0.7773019075393677, + "rewards/rejected": -2.4227046966552734, + "step": 17040 + }, + { + "epoch": 2.9376292212267403, + "grad_norm": 29.53461456298828, + "learning_rate": 2.632438680235216e-10, + "logits/chosen": -1.6405704021453857, + "logits/rejected": -1.6034597158432007, + "logps/chosen": -218.55917358398438, + "logps/rejected": -279.91717529296875, + "loss": 0.5688, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6312099695205688, + "rewards/margins": 0.6329344511032104, + "rewards/rejected": -2.2641444206237793, + "step": 17050 + }, + { + "epoch": 2.9393521709166093, + "grad_norm": 40.60226821899414, + "learning_rate": 2.4890684311603683e-10, + "logits/chosen": -1.6730968952178955, + "logits/rejected": -1.6203582286834717, + "logps/chosen": -212.64511108398438, + "logps/rejected": -283.8674011230469, + "loss": 0.5634, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5824534893035889, + "rewards/margins": 0.7069231271743774, + "rewards/rejected": -2.289376735687256, + "step": 17060 + }, + { + "epoch": 2.9410751206064782, + "grad_norm": 34.00909423828125, + "learning_rate": 2.3497075810210433e-10, + "logits/chosen": -1.6038997173309326, + "logits/rejected": -1.5644137859344482, + "logps/chosen": -219.3387908935547, + "logps/rejected": -276.1908264160156, + "loss": 0.6046, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6372060775756836, + "rewards/margins": 0.6086788177490234, + "rewards/rejected": -2.245885133743286, + "step": 17070 + }, + { + "epoch": 2.942798070296347, + "grad_norm": 27.89004135131836, + "learning_rate": 2.2143566899647248e-10, + "logits/chosen": -1.5212467908859253, + "logits/rejected": -1.4685356616973877, + "logps/chosen": -210.71841430664062, + "logps/rejected": -314.895751953125, + "loss": 0.4377, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5750494003295898, + "rewards/margins": 1.0299382209777832, + "rewards/rejected": -2.604987859725952, + "step": 17080 + }, + { + "epoch": 2.944521019986216, + "grad_norm": 28.67696762084961, + "learning_rate": 2.0830163020212344e-10, + "logits/chosen": -1.6012256145477295, + "logits/rejected": -1.553945779800415, + "logps/chosen": -202.9490509033203, + "logps/rejected": -297.46917724609375, + "loss": 0.4804, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4996212720870972, + "rewards/margins": 0.9452611207962036, + "rewards/rejected": -2.44488263130188, + "step": 17090 + }, + { + "epoch": 2.9462439696760856, + "grad_norm": 31.051746368408203, + "learning_rate": 1.955686945100621e-10, + "logits/chosen": -1.5455687046051025, + "logits/rejected": -1.499463677406311, + "logps/chosen": -213.981689453125, + "logps/rejected": -288.32861328125, + "loss": 0.5257, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5765442848205566, + "rewards/margins": 0.7793576121330261, + "rewards/rejected": -2.3559021949768066, + "step": 17100 + }, + { + "epoch": 2.9479669193659546, + "grad_norm": 35.02011489868164, + "learning_rate": 1.8323691309909407e-10, + "logits/chosen": -1.5887330770492554, + "logits/rejected": -1.5520597696304321, + "logps/chosen": -220.20315551757812, + "logps/rejected": -309.24505615234375, + "loss": 0.49, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6530447006225586, + "rewards/margins": 0.8980328440666199, + "rewards/rejected": -2.551077365875244, + "step": 17110 + }, + { + "epoch": 2.9496898690558235, + "grad_norm": 27.6148624420166, + "learning_rate": 1.7130633553561479e-10, + "logits/chosen": -1.6821790933609009, + "logits/rejected": -1.627488136291504, + "logps/chosen": -201.37051391601562, + "logps/rejected": -285.5783996582031, + "loss": 0.466, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.494118094444275, + "rewards/margins": 0.8731333613395691, + "rewards/rejected": -2.367251396179199, + "step": 17120 + }, + { + "epoch": 2.9514128187456925, + "grad_norm": 47.40973663330078, + "learning_rate": 1.597770097734541e-10, + "logits/chosen": -1.5655953884124756, + "logits/rejected": -1.5067824125289917, + "logps/chosen": -211.8823699951172, + "logps/rejected": -292.6919250488281, + "loss": 0.5046, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5727508068084717, + "rewards/margins": 0.8204353451728821, + "rewards/rejected": -2.393186092376709, + "step": 17130 + }, + { + "epoch": 2.953135768435562, + "grad_norm": 20.614322662353516, + "learning_rate": 1.4864898215359857e-10, + "logits/chosen": -1.5765509605407715, + "logits/rejected": -1.5283482074737549, + "logps/chosen": -192.6062469482422, + "logps/rejected": -282.79718017578125, + "loss": 0.4635, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3896197080612183, + "rewards/margins": 0.9155749082565308, + "rewards/rejected": -2.30519437789917, + "step": 17140 + }, + { + "epoch": 2.954858718125431, + "grad_norm": 43.842124938964844, + "learning_rate": 1.3792229740409166e-10, + "logits/chosen": -1.6786420345306396, + "logits/rejected": -1.6254281997680664, + "logps/chosen": -216.43814086914062, + "logps/rejected": -293.46466064453125, + "loss": 0.5554, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6223779916763306, + "rewards/margins": 0.7882981896400452, + "rewards/rejected": -2.4106762409210205, + "step": 17150 + }, + { + "epoch": 2.9565816678153, + "grad_norm": 26.545705795288086, + "learning_rate": 1.2759699863980067e-10, + "logits/chosen": -1.715623140335083, + "logits/rejected": -1.6613988876342773, + "logps/chosen": -190.52456665039062, + "logps/rejected": -308.39959716796875, + "loss": 0.4311, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.404829740524292, + "rewards/margins": 1.162705659866333, + "rewards/rejected": -2.567535400390625, + "step": 17160 + }, + { + "epoch": 2.958304617505169, + "grad_norm": 26.82274627685547, + "learning_rate": 1.1767312736228329e-10, + "logits/chosen": -1.7410396337509155, + "logits/rejected": -1.6930043697357178, + "logps/chosen": -230.6710205078125, + "logps/rejected": -296.1482849121094, + "loss": 0.5957, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7154839038848877, + "rewards/margins": 0.6694554090499878, + "rewards/rejected": -2.3849387168884277, + "step": 17170 + }, + { + "epoch": 2.960027567195038, + "grad_norm": 33.76654052734375, + "learning_rate": 1.0815072345957688e-10, + "logits/chosen": -1.6340773105621338, + "logits/rejected": -1.5861009359359741, + "logps/chosen": -210.44271850585938, + "logps/rejected": -292.2772216796875, + "loss": 0.5186, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5934721231460571, + "rewards/margins": 0.8232981562614441, + "rewards/rejected": -2.4167699813842773, + "step": 17180 + }, + { + "epoch": 2.9617505168849068, + "grad_norm": 27.789745330810547, + "learning_rate": 9.902982520605396e-11, + "logits/chosen": -1.6218135356903076, + "logits/rejected": -1.5870753526687622, + "logps/chosen": -186.03189086914062, + "logps/rejected": -263.25030517578125, + "loss": 0.4961, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3469972610473633, + "rewards/margins": 0.7608731985092163, + "rewards/rejected": -2.10787034034729, + "step": 17190 + }, + { + "epoch": 2.963473466574776, + "grad_norm": 34.324520111083984, + "learning_rate": 9.031046926230024e-11, + "logits/chosen": -1.629669189453125, + "logits/rejected": -1.6002832651138306, + "logps/chosen": -197.7524871826172, + "logps/rejected": -279.38873291015625, + "loss": 0.5054, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4329588413238525, + "rewards/margins": 0.8157581090927124, + "rewards/rejected": -2.2487170696258545, + "step": 17200 + }, + { + "epoch": 2.963473466574776, + "eval_logits/chosen": -1.7261788845062256, + "eval_logits/rejected": -1.7024034261703491, + "eval_logps/chosen": -203.55084228515625, + "eval_logps/rejected": -243.9775848388672, + "eval_loss": 0.6303371787071228, + "eval_rewards/accuracies": 0.6435873508453369, + "eval_rewards/chosen": -1.4483894109725952, + "eval_rewards/margins": 0.35958507657051086, + "eval_rewards/rejected": -1.8079745769500732, + "eval_runtime": 384.5784, + "eval_samples_per_second": 11.191, + "eval_steps_per_second": 1.399, + "step": 17200 + }, + { + "epoch": 2.965196416264645, + "grad_norm": 35.75117111206055, + "learning_rate": 8.199269067491466e-11, + "logits/chosen": -1.587419033050537, + "logits/rejected": -1.5489394664764404, + "logps/chosen": -213.60415649414062, + "logps/rejected": -301.45562744140625, + "loss": 0.5117, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6136400699615479, + "rewards/margins": 0.8584750294685364, + "rewards/rejected": -2.4721150398254395, + "step": 17210 + }, + { + "epoch": 2.966919365954514, + "grad_norm": 29.765380859375, + "learning_rate": 7.407652287640953e-11, + "logits/chosen": -1.5780322551727295, + "logits/rejected": -1.5399539470672607, + "logps/chosen": -216.28274536132812, + "logps/rejected": -317.44146728515625, + "loss": 0.4935, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6082651615142822, + "rewards/margins": 0.9990278482437134, + "rewards/rejected": -2.607292890548706, + "step": 17220 + }, + { + "epoch": 2.968642315644383, + "grad_norm": 23.371280670166016, + "learning_rate": 6.656199768505511e-11, + "logits/chosen": -1.6521990299224854, + "logits/rejected": -1.6265060901641846, + "logps/chosen": -206.3870086669922, + "logps/rejected": -290.2902526855469, + "loss": 0.4938, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5403478145599365, + "rewards/margins": 0.8099233508110046, + "rewards/rejected": -2.350271224975586, + "step": 17230 + }, + { + "epoch": 2.9703652653342525, + "grad_norm": 29.65461540222168, + "learning_rate": 5.944914530475742e-11, + "logits/chosen": -1.6170860528945923, + "logits/rejected": -1.5837604999542236, + "logps/chosen": -198.25338745117188, + "logps/rejected": -274.77642822265625, + "loss": 0.5417, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.443676233291626, + "rewards/margins": 0.7559821009635925, + "rewards/rejected": -2.199658155441284, + "step": 17240 + }, + { + "epoch": 2.9720882150241215, + "grad_norm": 33.59632110595703, + "learning_rate": 5.2737994324958403e-11, + "logits/chosen": -1.5519667863845825, + "logits/rejected": -1.512790560722351, + "logps/chosen": -204.31234741210938, + "logps/rejected": -287.47869873046875, + "loss": 0.4811, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5204417705535889, + "rewards/margins": 0.8285139799118042, + "rewards/rejected": -2.3489556312561035, + "step": 17250 + }, + { + "epoch": 2.9738111647139904, + "grad_norm": 27.488880157470703, + "learning_rate": 4.642857172045822e-11, + "logits/chosen": -1.5949409008026123, + "logits/rejected": -1.544858694076538, + "logps/chosen": -201.71656799316406, + "logps/rejected": -292.2054748535156, + "loss": 0.482, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.46438729763031, + "rewards/margins": 0.9317871332168579, + "rewards/rejected": -2.396174430847168, + "step": 17260 + }, + { + "epoch": 2.9755341144038594, + "grad_norm": 39.49324417114258, + "learning_rate": 4.052090285138199e-11, + "logits/chosen": -1.604092001914978, + "logits/rejected": -1.560024380683899, + "logps/chosen": -220.26065063476562, + "logps/rejected": -285.1950988769531, + "loss": 0.5878, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6459325551986694, + "rewards/margins": 0.6614850163459778, + "rewards/rejected": -2.307417631149292, + "step": 17270 + }, + { + "epoch": 2.9772570640937284, + "grad_norm": 25.67490577697754, + "learning_rate": 3.501501146304653e-11, + "logits/chosen": -1.5917980670928955, + "logits/rejected": -1.5360498428344727, + "logps/chosen": -196.49559020996094, + "logps/rejected": -287.30413818359375, + "loss": 0.4694, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4197368621826172, + "rewards/margins": 0.9354475140571594, + "rewards/rejected": -2.355184316635132, + "step": 17280 + }, + { + "epoch": 2.9789800137835973, + "grad_norm": 27.564085006713867, + "learning_rate": 2.991091968582715e-11, + "logits/chosen": -1.6404955387115479, + "logits/rejected": -1.596842885017395, + "logps/chosen": -212.32986450195312, + "logps/rejected": -295.46478271484375, + "loss": 0.4701, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5325970649719238, + "rewards/margins": 0.8477786183357239, + "rewards/rejected": -2.380375862121582, + "step": 17290 + }, + { + "epoch": 2.9807029634734663, + "grad_norm": 30.048521041870117, + "learning_rate": 2.5208648035146553e-11, + "logits/chosen": -1.658026099205017, + "logits/rejected": -1.6178169250488281, + "logps/chosen": -203.4453125, + "logps/rejected": -279.23419189453125, + "loss": 0.519, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4819501638412476, + "rewards/margins": 0.7884173393249512, + "rewards/rejected": -2.2703678607940674, + "step": 17300 + }, + { + "epoch": 2.9824259131633357, + "grad_norm": 36.83639907836914, + "learning_rate": 2.0908215411330477e-11, + "logits/chosen": -1.6300697326660156, + "logits/rejected": -1.5847969055175781, + "logps/chosen": -209.23526000976562, + "logps/rejected": -297.26318359375, + "loss": 0.4792, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5416957139968872, + "rewards/margins": 0.9089337587356567, + "rewards/rejected": -2.450629472732544, + "step": 17310 + }, + { + "epoch": 2.9841488628532047, + "grad_norm": 34.16779708862305, + "learning_rate": 1.7009639099541118e-11, + "logits/chosen": -1.6238996982574463, + "logits/rejected": -1.5776101350784302, + "logps/chosen": -212.78903198242188, + "logps/rejected": -294.0577087402344, + "loss": 0.513, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.587726354598999, + "rewards/margins": 0.8322674036026001, + "rewards/rejected": -2.4199938774108887, + "step": 17320 + }, + { + "epoch": 2.9858718125430737, + "grad_norm": 33.81337356567383, + "learning_rate": 1.35129347697438e-11, + "logits/chosen": -1.5871931314468384, + "logits/rejected": -1.5417367219924927, + "logps/chosen": -206.4938507080078, + "logps/rejected": -279.7281188964844, + "loss": 0.5444, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.512385606765747, + "rewards/margins": 0.7700924277305603, + "rewards/rejected": -2.282477855682373, + "step": 17330 + }, + { + "epoch": 2.987594762232943, + "grad_norm": 22.616756439208984, + "learning_rate": 1.0418116476584859e-11, + "logits/chosen": -1.687488317489624, + "logits/rejected": -1.6277202367782593, + "logps/chosen": -185.19866943359375, + "logps/rejected": -278.3511047363281, + "loss": 0.4499, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3271843194961548, + "rewards/margins": 0.938696563243866, + "rewards/rejected": -2.265880823135376, + "step": 17340 + }, + { + "epoch": 2.989317711922812, + "grad_norm": 33.05019760131836, + "learning_rate": 7.725196659413847e-12, + "logits/chosen": -1.6483237743377686, + "logits/rejected": -1.5888398885726929, + "logps/chosen": -191.623779296875, + "logps/rejected": -276.997802734375, + "loss": 0.488, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3805105686187744, + "rewards/margins": 0.8659530878067017, + "rewards/rejected": -2.2464635372161865, + "step": 17350 + }, + { + "epoch": 2.991040661612681, + "grad_norm": 39.97116470336914, + "learning_rate": 5.4341861421391965e-12, + "logits/chosen": -1.7078800201416016, + "logits/rejected": -1.6725330352783203, + "logps/chosen": -197.40243530273438, + "logps/rejected": -285.66595458984375, + "loss": 0.4817, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4299650192260742, + "rewards/margins": 0.8771953582763672, + "rewards/rejected": -2.3071603775024414, + "step": 17360 + }, + { + "epoch": 2.99276361130255, + "grad_norm": 40.23675537109375, + "learning_rate": 3.5450941332726415e-12, + "logits/chosen": -1.6248849630355835, + "logits/rejected": -1.5938169956207275, + "logps/chosen": -205.1375732421875, + "logps/rejected": -274.900146484375, + "loss": 0.5501, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5296798944473267, + "rewards/margins": 0.6712183356285095, + "rewards/rejected": -2.2008984088897705, + "step": 17370 + }, + { + "epoch": 2.994486560992419, + "grad_norm": 36.696414947509766, + "learning_rate": 2.0579282258292862e-12, + "logits/chosen": -1.6086667776107788, + "logits/rejected": -1.573327660560608, + "logps/chosen": -213.1057586669922, + "logps/rejected": -290.3735046386719, + "loss": 0.5333, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.601100206375122, + "rewards/margins": 0.7504446506500244, + "rewards/rejected": -2.3515448570251465, + "step": 17380 + }, + { + "epoch": 2.996209510682288, + "grad_norm": 56.76747512817383, + "learning_rate": 9.726943973387137e-13, + "logits/chosen": -1.6698760986328125, + "logits/rejected": -1.626619577407837, + "logps/chosen": -204.91049194335938, + "logps/rejected": -282.6231689453125, + "loss": 0.5274, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5097030401229858, + "rewards/margins": 0.782251238822937, + "rewards/rejected": -2.2919540405273438, + "step": 17390 + }, + { + "epoch": 2.997932460372157, + "grad_norm": 28.173402786254883, + "learning_rate": 2.8939700977836934e-13, + "logits/chosen": -1.6111886501312256, + "logits/rejected": -1.5667483806610107, + "logps/chosen": -220.0554656982422, + "logps/rejected": -298.34857177734375, + "loss": 0.5284, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6483074426651, + "rewards/margins": 0.8398798704147339, + "rewards/rejected": -2.488187313079834, + "step": 17400 + }, + { + "epoch": 2.9996554100620263, + "grad_norm": 41.602970123291016, + "learning_rate": 8.038809595767305e-15, + "logits/chosen": -1.5945218801498413, + "logits/rejected": -1.548925518989563, + "logps/chosen": -201.5911865234375, + "logps/rejected": -293.29644775390625, + "loss": 0.4551, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.475780725479126, + "rewards/margins": 0.907615065574646, + "rewards/rejected": -2.3833956718444824, + "step": 17410 + }, + { + "epoch": 3.0, + "step": 17412, + "total_flos": 0.0, + "train_loss": 0.5724969553024556, + "train_runtime": 86264.9537, + "train_samples_per_second": 3.229, + "train_steps_per_second": 0.202 + } + ], + "logging_steps": 10, + "max_steps": 17412, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}