{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.994601079784043, "eval_steps": 500, "global_step": 1248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02399520095980804, "grad_norm": 24.58741331565172, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -0.5075146555900574, "logits/rejected": -0.31934085488319397, "logps/chosen": -1.394007921218872, "logps/rejected": -1.3630257844924927, "loss": 1.3501, "odds_ratio_loss": 0.8239962458610535, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.06970040500164032, "rewards/margins": -0.0015491036465391517, "rewards/rejected": -0.06815129518508911, "sft_loss": 1.394007921218872, "step": 10 }, { "epoch": 0.04799040191961608, "grad_norm": 4.281683015852783, "learning_rate": 3.5e-06, "logits/chosen": 0.08614908158779144, "logits/rejected": 0.3013238310813904, "logps/chosen": -1.3080074787139893, "logps/rejected": -1.334457278251648, "loss": 1.2858, "odds_ratio_loss": 0.7804475426673889, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0654003769159317, "rewards/margins": 0.0013224859721958637, "rewards/rejected": -0.06672286242246628, "sft_loss": 1.3080074787139893, "step": 20 }, { "epoch": 0.07198560287942411, "grad_norm": 3.830958349381369, "learning_rate": 4.99986910314335e-06, "logits/chosen": 0.3485943675041199, "logits/rejected": 0.6042150855064392, "logps/chosen": -0.9540683627128601, "logps/rejected": -1.1750730276107788, "loss": 0.9904, "odds_ratio_loss": 0.6533687710762024, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.047703422605991364, "rewards/margins": 0.011050237342715263, "rewards/rejected": -0.05875365808606148, "sft_loss": 0.9540683627128601, "step": 30 }, { "epoch": 0.09598080383923216, "grad_norm": 3.6776666943951675, "learning_rate": 4.998396670920005e-06, "logits/chosen": 0.17601105570793152, "logits/rejected": 0.5272272229194641, "logps/chosen": -0.898045539855957, "logps/rejected": -1.0136868953704834, "loss": 0.9614, "odds_ratio_loss": 0.6860688328742981, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.04490227997303009, "rewards/margins": 0.005782057531177998, "rewards/rejected": -0.05068434029817581, "sft_loss": 0.898045539855957, "step": 40 }, { "epoch": 0.11997600479904019, "grad_norm": 2.636908991979515, "learning_rate": 4.995289152254744e-06, "logits/chosen": 0.2309066355228424, "logits/rejected": 0.22152824699878693, "logps/chosen": -0.9074997901916504, "logps/rejected": -1.0551084280014038, "loss": 0.9374, "odds_ratio_loss": 0.663613498210907, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.04537498578429222, "rewards/margins": 0.007380434311926365, "rewards/rejected": -0.05275542289018631, "sft_loss": 0.9074997901916504, "step": 50 }, { "epoch": 0.14397120575884823, "grad_norm": 1.8300107701302537, "learning_rate": 4.990548580876516e-06, "logits/chosen": 0.307407021522522, "logits/rejected": 0.37507694959640503, "logps/chosen": -0.9279610514640808, "logps/rejected": -0.986476719379425, "loss": 0.9464, "odds_ratio_loss": 0.7063499093055725, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04639805108308792, "rewards/margins": 0.00292578199878335, "rewards/rejected": -0.04932383447885513, "sft_loss": 0.9279610514640808, "step": 60 }, { "epoch": 0.16796640671865626, "grad_norm": 3.8157191209486507, "learning_rate": 4.9841780592726385e-06, "logits/chosen": 0.19509825110435486, "logits/rejected": 0.2650177776813507, "logps/chosen": -0.9848098754882812, "logps/rejected": -1.0149097442626953, "loss": 0.9578, "odds_ratio_loss": 0.726799488067627, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04924049228429794, "rewards/margins": 0.0015049913199618459, "rewards/rejected": -0.050745487213134766, "sft_loss": 0.9848098754882812, "step": 70 }, { "epoch": 0.19196160767846432, "grad_norm": 4.078587531391316, "learning_rate": 4.976181756658363e-06, "logits/chosen": 0.061622969806194305, "logits/rejected": 0.2444450408220291, "logps/chosen": -0.8894473910331726, "logps/rejected": -1.0614734888076782, "loss": 0.9675, "odds_ratio_loss": 0.6382969617843628, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04447237029671669, "rewards/margins": 0.008601305074989796, "rewards/rejected": -0.05307367444038391, "sft_loss": 0.8894473910331726, "step": 80 }, { "epoch": 0.21595680863827235, "grad_norm": 2.9874023740770363, "learning_rate": 4.9665649062483115e-06, "logits/chosen": 0.6337467432022095, "logits/rejected": 0.7902036905288696, "logps/chosen": -0.9439412951469421, "logps/rejected": -0.9588793516159058, "loss": 0.9635, "odds_ratio_loss": 0.7716476917266846, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.047197069972753525, "rewards/margins": 0.0007468975381925702, "rewards/rejected": -0.047943972051143646, "sft_loss": 0.9439412951469421, "step": 90 }, { "epoch": 0.23995200959808038, "grad_norm": 2.3029148332001745, "learning_rate": 4.955333801831578e-06, "logits/chosen": 0.49920982122421265, "logits/rejected": 0.6337569355964661, "logps/chosen": -0.8333128094673157, "logps/rejected": -1.059599757194519, "loss": 0.9453, "odds_ratio_loss": 0.6517213582992554, "rewards/accuracies": 0.5625, "rewards/chosen": -0.041665639728307724, "rewards/margins": 0.011314347386360168, "rewards/rejected": -0.05297998711466789, "sft_loss": 0.8333128094673157, "step": 100 }, { "epoch": 0.26394721055788845, "grad_norm": 2.8766587489414395, "learning_rate": 4.9424957936527295e-06, "logits/chosen": -0.28645992279052734, "logits/rejected": 0.04107431694865227, "logps/chosen": -0.9429195523262024, "logps/rejected": -0.9936224222183228, "loss": 0.9526, "odds_ratio_loss": 0.705885112285614, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.04714598134160042, "rewards/margins": 0.002535139676183462, "rewards/rejected": -0.04968111589550972, "sft_loss": 0.9429195523262024, "step": 110 }, { "epoch": 0.28794241151769645, "grad_norm": 2.1411106644617703, "learning_rate": 4.92805928360141e-06, "logits/chosen": -0.29608479142189026, "logits/rejected": -0.21111997961997986, "logps/chosen": -0.888851523399353, "logps/rejected": -1.0842912197113037, "loss": 0.8904, "odds_ratio_loss": 0.5968859195709229, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04444257169961929, "rewards/margins": 0.009771987795829773, "rewards/rejected": -0.054214559495449066, "sft_loss": 0.888851523399353, "step": 120 }, { "epoch": 0.3119376124775045, "grad_norm": 2.1891227152981347, "learning_rate": 4.912033719713687e-06, "logits/chosen": 0.49228960275650024, "logits/rejected": 0.5680336952209473, "logps/chosen": -0.9152839779853821, "logps/rejected": -1.0058788061141968, "loss": 0.9427, "odds_ratio_loss": 0.6943625807762146, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04576420038938522, "rewards/margins": 0.004529745317995548, "rewards/rejected": -0.0502939410507679, "sft_loss": 0.9152839779853821, "step": 130 }, { "epoch": 0.3359328134373125, "grad_norm": 2.5131225459939, "learning_rate": 4.894429589988739e-06, "logits/chosen": -1.2468726634979248, "logits/rejected": -1.0485397577285767, "logps/chosen": -1.0104249715805054, "logps/rejected": -1.0477244853973389, "loss": 0.949, "odds_ratio_loss": 0.7160865068435669, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.05052124708890915, "rewards/margins": 0.0018649749690666795, "rewards/rejected": -0.05238622426986694, "sft_loss": 1.0104249715805054, "step": 140 }, { "epoch": 0.3599280143971206, "grad_norm": 2.696319834123575, "learning_rate": 4.875258415524945e-06, "logits/chosen": 0.039508234709501266, "logits/rejected": 0.23594827950000763, "logps/chosen": -0.904223620891571, "logps/rejected": -1.032157063484192, "loss": 0.9533, "odds_ratio_loss": 0.6739581823348999, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04521118476986885, "rewards/margins": 0.0063966671004891396, "rewards/rejected": -0.051607854664325714, "sft_loss": 0.904223620891571, "step": 150 }, { "epoch": 0.38392321535692864, "grad_norm": 2.241170193835809, "learning_rate": 4.85453274297985e-06, "logits/chosen": 0.4507044851779938, "logits/rejected": 0.7088828682899475, "logps/chosen": -0.9252007603645325, "logps/rejected": -1.0105345249176025, "loss": 0.9187, "odds_ratio_loss": 0.6664329171180725, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0462600402534008, "rewards/margins": 0.004266692791134119, "rewards/rejected": -0.050526730716228485, "sft_loss": 0.9252007603645325, "step": 160 }, { "epoch": 0.40791841631673664, "grad_norm": 1.759854296483571, "learning_rate": 4.832266136358951e-06, "logits/chosen": -0.12876208126544952, "logits/rejected": 0.014335835352540016, "logps/chosen": -0.8540490865707397, "logps/rejected": -0.9863293766975403, "loss": 0.926, "odds_ratio_loss": 0.6714656352996826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04270245134830475, "rewards/margins": 0.006614011712372303, "rewards/rejected": -0.04931646212935448, "sft_loss": 0.8540490865707397, "step": 170 }, { "epoch": 0.4319136172765447, "grad_norm": 2.793191882203603, "learning_rate": 4.808473168138675e-06, "logits/chosen": 0.3617595136165619, "logits/rejected": 0.3396950364112854, "logps/chosen": -0.8613064885139465, "logps/rejected": -1.0067331790924072, "loss": 0.9162, "odds_ratio_loss": 0.6582903861999512, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.04306532442569733, "rewards/margins": 0.007271329872310162, "rewards/rejected": -0.050336651504039764, "sft_loss": 0.8613064885139465, "step": 180 }, { "epoch": 0.4559088182363527, "grad_norm": 1.7774141067161418, "learning_rate": 4.783169409729363e-06, "logits/chosen": 0.9685203433036804, "logits/rejected": 1.1009634733200073, "logps/chosen": -0.8521540760993958, "logps/rejected": -0.9150575399398804, "loss": 0.9004, "odds_ratio_loss": 0.7224193811416626, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.04260770231485367, "rewards/margins": 0.0031451724935323, "rewards/rejected": -0.0457528755068779, "sft_loss": 0.8521540760993958, "step": 190 }, { "epoch": 0.47990401919616077, "grad_norm": 2.052107783396207, "learning_rate": 4.756371421284482e-06, "logits/chosen": 0.33597105741500854, "logits/rejected": 0.44187426567077637, "logps/chosen": -0.8725342750549316, "logps/rejected": -0.9003400802612305, "loss": 0.919, "odds_ratio_loss": 0.7135496735572815, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04362671449780464, "rewards/margins": 0.0013902939390391111, "rewards/rejected": -0.04501700773835182, "sft_loss": 0.8725342750549316, "step": 200 }, { "epoch": 0.5038992201559688, "grad_norm": 2.3000145040966973, "learning_rate": 4.728096740862778e-06, "logits/chosen": 0.16287042200565338, "logits/rejected": 0.35098087787628174, "logps/chosen": -0.8514264822006226, "logps/rejected": -0.9913795590400696, "loss": 0.9096, "odds_ratio_loss": 0.6634506583213806, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.042571328580379486, "rewards/margins": 0.006997650023549795, "rewards/rejected": -0.04956897348165512, "sft_loss": 0.8514264822006226, "step": 210 }, { "epoch": 0.5278944211157769, "grad_norm": 1.581079267248328, "learning_rate": 4.698363872950406e-06, "logits/chosen": 0.298981636762619, "logits/rejected": 0.49268895387649536, "logps/chosen": -0.8895601034164429, "logps/rejected": -1.026539921760559, "loss": 0.8744, "odds_ratio_loss": 0.6685082316398621, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04447800666093826, "rewards/margins": 0.0068489923141896725, "rewards/rejected": -0.051326997578144073, "sft_loss": 0.8895601034164429, "step": 220 }, { "epoch": 0.5518896220755849, "grad_norm": 1.7094822098553022, "learning_rate": 4.6671922763505915e-06, "logits/chosen": 0.34609514474868774, "logits/rejected": 0.5052930116653442, "logps/chosen": -0.863084614276886, "logps/rejected": -0.9836879968643188, "loss": 0.8905, "odds_ratio_loss": 0.6813028454780579, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.043154239654541016, "rewards/margins": 0.006030158139765263, "rewards/rejected": -0.049184400588274, "sft_loss": 0.863084614276886, "step": 230 }, { "epoch": 0.5758848230353929, "grad_norm": 1.9367159826113498, "learning_rate": 4.634602351448738e-06, "logits/chosen": 0.286350816488266, "logits/rejected": 0.3788919448852539, "logps/chosen": -0.8919585943222046, "logps/rejected": -0.9452742338180542, "loss": 0.9133, "odds_ratio_loss": 0.6905114650726318, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04459793120622635, "rewards/margins": 0.0026657807175070047, "rewards/rejected": -0.04726371169090271, "sft_loss": 0.8919585943222046, "step": 240 }, { "epoch": 0.5998800239952009, "grad_norm": 2.0772847936555636, "learning_rate": 4.6006154268613015e-06, "logits/chosen": 0.4635019898414612, "logits/rejected": 0.5444530248641968, "logps/chosen": -0.8181222081184387, "logps/rejected": -0.9908831715583801, "loss": 0.8927, "odds_ratio_loss": 0.6295598149299622, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04090610891580582, "rewards/margins": 0.008638045750558376, "rewards/rejected": -0.04954415559768677, "sft_loss": 0.8181222081184387, "step": 250 }, { "epoch": 0.623875224955009, "grad_norm": 2.084215689408855, "learning_rate": 4.565253745477187e-06, "logits/chosen": 0.40253886580467224, "logits/rejected": 0.4625183045864105, "logps/chosen": -0.9301355481147766, "logps/rejected": -1.0306508541107178, "loss": 0.9162, "odds_ratio_loss": 0.6872043609619141, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.04650677740573883, "rewards/margins": 0.005025765858590603, "rewards/rejected": -0.05153254419565201, "sft_loss": 0.9301355481147766, "step": 260 }, { "epoch": 0.647870425914817, "grad_norm": 1.9031984888179019, "learning_rate": 4.528540449900799e-06, "logits/chosen": 0.4078219532966614, "logits/rejected": 0.6789823174476624, "logps/chosen": -0.8785255551338196, "logps/rejected": -0.9139087796211243, "loss": 0.9176, "odds_ratio_loss": 0.7333613038063049, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04392627626657486, "rewards/margins": 0.0017691642278805375, "rewards/rejected": -0.04569543898105621, "sft_loss": 0.8785255551338196, "step": 270 }, { "epoch": 0.671865626874625, "grad_norm": 2.3067419173621113, "learning_rate": 4.490499567306256e-06, "logits/chosen": 0.304252564907074, "logits/rejected": 0.5160123109817505, "logps/chosen": -0.8951358795166016, "logps/rejected": -0.9636558294296265, "loss": 0.8917, "odds_ratio_loss": 0.69621342420578, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.04475679248571396, "rewards/margins": 0.0034259993117302656, "rewards/rejected": -0.04818279296159744, "sft_loss": 0.8951358795166016, "step": 280 }, { "epoch": 0.6958608278344331, "grad_norm": 3.1297290877323003, "learning_rate": 4.451155993712711e-06, "logits/chosen": 0.25184166431427, "logits/rejected": 0.43299436569213867, "logps/chosen": -0.808620810508728, "logps/rejected": -0.9780584573745728, "loss": 0.9379, "odds_ratio_loss": 0.6151310205459595, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.04043104499578476, "rewards/margins": 0.008471880108118057, "rewards/rejected": -0.048902928829193115, "sft_loss": 0.808620810508728, "step": 290 }, { "epoch": 0.7198560287942412, "grad_norm": 2.001570442654457, "learning_rate": 4.410535477691041e-06, "logits/chosen": 0.6736063957214355, "logits/rejected": 0.8922637104988098, "logps/chosen": -0.8743098974227905, "logps/rejected": -1.0198915004730225, "loss": 0.8962, "odds_ratio_loss": 0.6545746326446533, "rewards/accuracies": 0.625, "rewards/chosen": -0.043715499341487885, "rewards/margins": 0.0072790831327438354, "rewards/rejected": -0.05099458247423172, "sft_loss": 0.8743098974227905, "step": 300 }, { "epoch": 0.7438512297540492, "grad_norm": 3.088640251108737, "learning_rate": 4.368664603512586e-06, "logits/chosen": -0.10074709355831146, "logits/rejected": 0.08682968467473984, "logps/chosen": -0.7929955720901489, "logps/rejected": -0.9449365735054016, "loss": 0.8789, "odds_ratio_loss": 0.6474851369857788, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03964977711439133, "rewards/margins": 0.007597046438604593, "rewards/rejected": -0.047246821224689484, "sft_loss": 0.7929955720901489, "step": 310 }, { "epoch": 0.7678464307138573, "grad_norm": 2.278875813822025, "learning_rate": 4.325570773750952e-06, "logits/chosen": -0.22130906581878662, "logits/rejected": -0.028980206698179245, "logps/chosen": -0.8826779127120972, "logps/rejected": -1.0213041305541992, "loss": 0.9204, "odds_ratio_loss": 0.6443883180618286, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04413389414548874, "rewards/margins": 0.006931307725608349, "rewards/rejected": -0.05106520652770996, "sft_loss": 0.8826779127120972, "step": 320 }, { "epoch": 0.7918416316736653, "grad_norm": 1.6952516043840655, "learning_rate": 4.281282191348289e-06, "logits/chosen": 0.45927032828330994, "logits/rejected": 0.6593443751335144, "logps/chosen": -0.8378440141677856, "logps/rejected": -0.9682254791259766, "loss": 0.8995, "odds_ratio_loss": 0.6620376110076904, "rewards/accuracies": 0.625, "rewards/chosen": -0.04189220070838928, "rewards/margins": 0.006519075483083725, "rewards/rejected": -0.04841126874089241, "sft_loss": 0.8378440141677856, "step": 330 }, { "epoch": 0.8158368326334733, "grad_norm": 2.4806806819218794, "learning_rate": 4.235827841157748e-06, "logits/chosen": 0.01970214769244194, "logits/rejected": 0.11670324951410294, "logps/chosen": -0.8856766819953918, "logps/rejected": -1.0817759037017822, "loss": 0.8834, "odds_ratio_loss": 0.6194185018539429, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04428383335471153, "rewards/margins": 0.009804959408938885, "rewards/rejected": -0.054088789969682693, "sft_loss": 0.8856766819953918, "step": 340 }, { "epoch": 0.8398320335932813, "grad_norm": 1.5265892877639438, "learning_rate": 4.1892374709742186e-06, "logits/chosen": -0.7483745813369751, "logits/rejected": -0.42045336961746216, "logps/chosen": -0.7948485016822815, "logps/rejected": -0.9918915033340454, "loss": 0.9474, "odds_ratio_loss": 0.5842909812927246, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03974242880940437, "rewards/margins": 0.009852146729826927, "rewards/rejected": -0.04959457367658615, "sft_loss": 0.7948485016822815, "step": 350 }, { "epoch": 0.8638272345530894, "grad_norm": 2.1051154185205543, "learning_rate": 4.141541572065762e-06, "logits/chosen": 0.41192498803138733, "logits/rejected": 0.5341157913208008, "logps/chosen": -0.7971394658088684, "logps/rejected": -0.9216561317443848, "loss": 0.8881, "odds_ratio_loss": 0.69920814037323, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.03985697776079178, "rewards/margins": 0.0062258280813694, "rewards/rejected": -0.04608280584216118, "sft_loss": 0.7971394658088684, "step": 360 }, { "epoch": 0.8878224355128974, "grad_norm": 2.049071087536336, "learning_rate": 4.092771359218462e-06, "logits/chosen": 0.2649831771850586, "logits/rejected": 0.45568495988845825, "logps/chosen": -0.8466150164604187, "logps/rejected": -1.0025365352630615, "loss": 0.9065, "odds_ratio_loss": 0.629971444606781, "rewards/accuracies": 0.625, "rewards/chosen": -0.042330749332904816, "rewards/margins": 0.007796071469783783, "rewards/rejected": -0.0501268208026886, "sft_loss": 0.8466150164604187, "step": 370 }, { "epoch": 0.9118176364727054, "grad_norm": 3.597524104140319, "learning_rate": 4.04295875030778e-06, "logits/chosen": -0.18752217292785645, "logits/rejected": 0.15378537774085999, "logps/chosen": -0.8704308271408081, "logps/rejected": -0.9513336420059204, "loss": 0.9014, "odds_ratio_loss": 0.6948253512382507, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.043521542102098465, "rewards/margins": 0.004045139066874981, "rewards/rejected": -0.04756668210029602, "sft_loss": 0.8704308271408081, "step": 380 }, { "epoch": 0.9358128374325135, "grad_norm": 3.1405630532603395, "learning_rate": 3.992136345409765e-06, "logits/chosen": -0.1735876053571701, "logits/rejected": -0.20124337077140808, "logps/chosen": -0.9253339767456055, "logps/rejected": -1.0305973291397095, "loss": 0.9111, "odds_ratio_loss": 0.6636070013046265, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04626670479774475, "rewards/margins": 0.005263164173811674, "rewards/rejected": -0.051529865711927414, "sft_loss": 0.9253339767456055, "step": 390 }, { "epoch": 0.9598080383923215, "grad_norm": 2.4716790122788983, "learning_rate": 3.940337405465786e-06, "logits/chosen": 0.26361703872680664, "logits/rejected": 0.44345617294311523, "logps/chosen": -0.8355854153633118, "logps/rejected": -1.0225704908370972, "loss": 0.9062, "odds_ratio_loss": 0.6545855402946472, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.04177927225828171, "rewards/margins": 0.009349259547889233, "rewards/rejected": -0.05112852901220322, "sft_loss": 0.8355854153633118, "step": 400 }, { "epoch": 0.9838032393521295, "grad_norm": 2.3985102639359406, "learning_rate": 3.887595830514775e-06, "logits/chosen": 0.21671700477600098, "logits/rejected": 0.29912179708480835, "logps/chosen": -0.809670090675354, "logps/rejected": -1.0107569694519043, "loss": 0.9029, "odds_ratio_loss": 0.6326887011528015, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0404835119843483, "rewards/margins": 0.010054344311356544, "rewards/rejected": -0.05053785443305969, "sft_loss": 0.809670090675354, "step": 410 }, { "epoch": 1.0077984403119375, "grad_norm": 1.6971594247197401, "learning_rate": 3.833946137507195e-06, "logits/chosen": 0.4990086555480957, "logits/rejected": 0.616361141204834, "logps/chosen": -0.8005359768867493, "logps/rejected": -0.9603840708732605, "loss": 0.8398, "odds_ratio_loss": 0.6354148387908936, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.040026795119047165, "rewards/margins": 0.007992411032319069, "rewards/rejected": -0.04801920801401138, "sft_loss": 0.8005359768867493, "step": 420 }, { "epoch": 1.0317936412717457, "grad_norm": 2.2002987962167904, "learning_rate": 3.779423437715274e-06, "logits/chosen": 0.7601526975631714, "logits/rejected": 0.8180352449417114, "logps/chosen": -0.6671024560928345, "logps/rejected": -0.9577730298042297, "loss": 0.7742, "odds_ratio_loss": 0.5807942152023315, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03335512429475784, "rewards/margins": 0.014533529989421368, "rewards/rejected": -0.047888655215501785, "sft_loss": 0.6671024560928345, "step": 430 }, { "epoch": 1.0557888422315538, "grad_norm": 1.5148819350515028, "learning_rate": 3.7240634137542864e-06, "logits/chosen": 0.19566980004310608, "logits/rejected": 0.3528198003768921, "logps/chosen": -0.6874720454216003, "logps/rejected": -1.0558958053588867, "loss": 0.7663, "odds_ratio_loss": 0.48211669921875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.034373603761196136, "rewards/margins": 0.01842118799686432, "rewards/rejected": -0.052794791758060455, "sft_loss": 0.6874720454216003, "step": 440 }, { "epoch": 1.0797840431913617, "grad_norm": 1.6130353172110996, "learning_rate": 3.6679022962299054e-06, "logits/chosen": 0.8750432133674622, "logits/rejected": 0.8553866147994995, "logps/chosen": -0.7515122890472412, "logps/rejected": -0.9563247561454773, "loss": 0.7745, "odds_ratio_loss": 0.5920617580413818, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.037575613707304, "rewards/margins": 0.010240620002150536, "rewards/rejected": -0.047816235572099686, "sft_loss": 0.7515122890472412, "step": 450 }, { "epoch": 1.1037792441511698, "grad_norm": 1.8444047185661667, "learning_rate": 3.6109768400269336e-06, "logits/chosen": 0.21664266288280487, "logits/rejected": 0.3455556333065033, "logps/chosen": -0.7820109128952026, "logps/rejected": -1.1722263097763062, "loss": 0.7949, "odds_ratio_loss": 0.5249099731445312, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.03910055011510849, "rewards/margins": 0.019510772079229355, "rewards/rejected": -0.05861131474375725, "sft_loss": 0.7820109128952026, "step": 460 }, { "epoch": 1.127774445110978, "grad_norm": 1.923809039800638, "learning_rate": 3.5533243002549044e-06, "logits/chosen": -0.051299355924129486, "logits/rejected": 0.12599964439868927, "logps/chosen": -0.6766480803489685, "logps/rejected": -0.9556339979171753, "loss": 0.769, "odds_ratio_loss": 0.5771059989929199, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03383240848779678, "rewards/margins": 0.013949294574558735, "rewards/rejected": -0.047781698405742645, "sft_loss": 0.6766480803489685, "step": 470 }, { "epoch": 1.1517696460707858, "grad_norm": 2.0416324249302593, "learning_rate": 3.4949824078663214e-06, "logits/chosen": 0.3260158598423004, "logits/rejected": 0.4627075791358948, "logps/chosen": -0.6955934762954712, "logps/rejected": -1.0405316352844238, "loss": 0.7744, "odds_ratio_loss": 0.5207543969154358, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.03477967530488968, "rewards/margins": 0.017246905714273453, "rewards/rejected": -0.05202658101916313, "sft_loss": 0.6955934762954712, "step": 480 }, { "epoch": 1.175764847030594, "grad_norm": 2.159701142475688, "learning_rate": 3.4359893449634713e-06, "logits/chosen": 0.10285909473896027, "logits/rejected": 0.18586108088493347, "logps/chosen": -0.7835036516189575, "logps/rejected": -0.9662873148918152, "loss": 0.7699, "odds_ratio_loss": 0.6257883310317993, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03917517885565758, "rewards/margins": 0.009139184840023518, "rewards/rejected": -0.04831436648964882, "sft_loss": 0.7835036516189575, "step": 490 }, { "epoch": 1.1997600479904018, "grad_norm": 1.905386181833648, "learning_rate": 3.3763837198099807e-06, "logits/chosen": 0.2618166208267212, "logits/rejected": 0.403994083404541, "logps/chosen": -0.7472913861274719, "logps/rejected": -0.9723391532897949, "loss": 0.8034, "odds_ratio_loss": 0.5758217573165894, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03736456483602524, "rewards/margins": 0.011252395808696747, "rewards/rejected": -0.048616960644721985, "sft_loss": 0.7472913861274719, "step": 500 }, { "epoch": 1.22375524895021, "grad_norm": 1.8483335773730425, "learning_rate": 3.3162045415634793e-06, "logits/chosen": -0.06936601549386978, "logits/rejected": 0.15932008624076843, "logps/chosen": -0.7298214435577393, "logps/rejected": -0.989848792552948, "loss": 0.764, "odds_ratio_loss": 0.5586143136024475, "rewards/accuracies": 0.6875, "rewards/chosen": -0.036491066217422485, "rewards/margins": 0.013001373037695885, "rewards/rejected": -0.04949244111776352, "sft_loss": 0.7298214435577393, "step": 510 }, { "epoch": 1.247750449910018, "grad_norm": 1.4105189905656275, "learning_rate": 3.255491194745878e-06, "logits/chosen": -0.0699717178940773, "logits/rejected": 0.11926586925983429, "logps/chosen": -0.7712666988372803, "logps/rejected": -1.0007984638214111, "loss": 0.7514, "odds_ratio_loss": 0.576269805431366, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.03856333717703819, "rewards/margins": 0.011476586572825909, "rewards/rejected": -0.050039924681186676, "sft_loss": 0.7712666988372803, "step": 520 }, { "epoch": 1.2717456508698262, "grad_norm": 1.5086406745902339, "learning_rate": 3.1942834134680123e-06, "logits/chosen": -0.4110763669013977, "logits/rejected": -0.197097510099411, "logps/chosen": -0.7337836027145386, "logps/rejected": -1.0581499338150024, "loss": 0.747, "odds_ratio_loss": 0.5731949806213379, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03668918460607529, "rewards/margins": 0.016218315809965134, "rewards/rejected": -0.05290750414133072, "sft_loss": 0.7337836027145386, "step": 530 }, { "epoch": 1.295740851829634, "grad_norm": 2.007767969966132, "learning_rate": 3.13262125542547e-06, "logits/chosen": 0.24464428424835205, "logits/rejected": 0.42607539892196655, "logps/chosen": -0.8008230328559875, "logps/rejected": -1.019913911819458, "loss": 0.7839, "odds_ratio_loss": 0.5772299766540527, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04004114866256714, "rewards/margins": 0.010954543016850948, "rewards/rejected": -0.05099569633603096, "sft_loss": 0.8008230328559875, "step": 540 }, { "epoch": 1.3197360527894422, "grad_norm": 2.031522996603775, "learning_rate": 3.0705450756826707e-06, "logits/chosen": -0.6761570572853088, "logits/rejected": -0.5336428880691528, "logps/chosen": -0.7791737914085388, "logps/rejected": -0.9758432507514954, "loss": 0.7734, "odds_ratio_loss": 0.5955380201339722, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03895869478583336, "rewards/margins": 0.009833470918238163, "rewards/rejected": -0.04879216477274895, "sft_loss": 0.7791737914085388, "step": 550 }, { "epoch": 1.34373125374925, "grad_norm": 1.8127230145286217, "learning_rate": 3.00809550026231e-06, "logits/chosen": 0.7122937440872192, "logits/rejected": 0.8374090194702148, "logps/chosen": -0.7448546290397644, "logps/rejected": -1.0183660984039307, "loss": 0.7313, "odds_ratio_loss": 0.5605376362800598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03724273294210434, "rewards/margins": 0.01367556769400835, "rewards/rejected": -0.050918303430080414, "sft_loss": 0.7448546290397644, "step": 560 }, { "epoch": 1.3677264547090582, "grad_norm": 1.6102410365866324, "learning_rate": 2.9453133995574955e-06, "logits/chosen": 0.1695878505706787, "logits/rejected": 0.34987810254096985, "logps/chosen": -0.7041548490524292, "logps/rejected": -1.1295292377471924, "loss": 0.7529, "odds_ratio_loss": 0.5541011095046997, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03520774096250534, "rewards/margins": 0.02126871421933174, "rewards/rejected": -0.05647646263241768, "sft_loss": 0.7041548490524292, "step": 570 }, { "epoch": 1.3917216556688663, "grad_norm": 2.0516481147792964, "learning_rate": 2.8822398615839337e-06, "logits/chosen": -0.15236589312553406, "logits/rejected": 0.005555987358093262, "logps/chosen": -0.7019264698028564, "logps/rejected": -0.9463084936141968, "loss": 0.7377, "odds_ratio_loss": 0.5546727180480957, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03509632498025894, "rewards/margins": 0.012219103053212166, "rewards/rejected": -0.04731542617082596, "sft_loss": 0.7019264698028564, "step": 580 }, { "epoch": 1.4157168566286742, "grad_norm": 2.5703275268486463, "learning_rate": 2.8189161650897045e-06, "logits/chosen": 0.09915417432785034, "logits/rejected": 0.2876579761505127, "logps/chosen": -0.7416352033615112, "logps/rejected": -0.9542354345321655, "loss": 0.7748, "odds_ratio_loss": 0.5765627026557922, "rewards/accuracies": 0.625, "rewards/chosen": -0.0370817631483078, "rewards/margins": 0.010630009695887566, "rewards/rejected": -0.04771176725625992, "sft_loss": 0.7416352033615112, "step": 590 }, { "epoch": 1.4397120575884823, "grad_norm": 1.6574957139548097, "learning_rate": 2.7553837525402095e-06, "logits/chosen": 0.14950448274612427, "logits/rejected": 0.14670611917972565, "logps/chosen": -0.7459922432899475, "logps/rejected": -0.9438718557357788, "loss": 0.764, "odds_ratio_loss": 0.6029990911483765, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.037299610674381256, "rewards/margins": 0.009893985465168953, "rewards/rejected": -0.04719359427690506, "sft_loss": 0.7459922432899475, "step": 600 }, { "epoch": 1.4637072585482904, "grad_norm": 1.5955732799355493, "learning_rate": 2.691684202995966e-06, "logits/chosen": 0.43530672788619995, "logits/rejected": 0.4994083344936371, "logps/chosen": -0.8142836689949036, "logps/rejected": -0.9706009030342102, "loss": 0.7559, "odds_ratio_loss": 0.7006958723068237, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04071418568491936, "rewards/margins": 0.007815859280526638, "rewards/rejected": -0.04853004962205887, "sft_loss": 0.8142836689949036, "step": 610 }, { "epoch": 1.4877024595080983, "grad_norm": 1.9589861397245603, "learning_rate": 2.6278592049010204e-06, "logits/chosen": -0.19675548374652863, "logits/rejected": -0.004504656884819269, "logps/chosen": -0.7537368535995483, "logps/rejected": -1.0135046243667603, "loss": 0.7741, "odds_ratio_loss": 0.5691729187965393, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03768684342503548, "rewards/margins": 0.012988388538360596, "rewards/rejected": -0.050675224512815475, "sft_loss": 0.7537368535995483, "step": 620 }, { "epoch": 1.5116976604679064, "grad_norm": 1.7255875955000524, "learning_rate": 2.5639505287997584e-06, "logits/chosen": 0.3145737051963806, "logits/rejected": 0.47394928336143494, "logps/chosen": -0.7314926385879517, "logps/rejected": -1.001952886581421, "loss": 0.7829, "odds_ratio_loss": 0.5629433393478394, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03657463565468788, "rewards/margins": 0.013523015193641186, "rewards/rejected": -0.050097644329071045, "sft_loss": 0.7314926385879517, "step": 630 }, { "epoch": 1.5356928614277146, "grad_norm": 2.504847023988975, "learning_rate": 2.5e-06, "logits/chosen": 0.2320265769958496, "logits/rejected": 0.3284027874469757, "logps/chosen": -0.7656562924385071, "logps/rejected": -1.076923131942749, "loss": 0.7503, "odds_ratio_loss": 0.584337592124939, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.038282815366983414, "rewards/margins": 0.015563338994979858, "rewards/rejected": -0.053846150636672974, "sft_loss": 0.7656562924385071, "step": 640 }, { "epoch": 1.5596880623875224, "grad_norm": 1.4394266237384084, "learning_rate": 2.436049471200242e-06, "logits/chosen": -0.5206400156021118, "logits/rejected": -0.38631540536880493, "logps/chosen": -0.8094362020492554, "logps/rejected": -0.9923938512802124, "loss": 0.7752, "odds_ratio_loss": 0.5967071056365967, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04047181457281113, "rewards/margins": 0.00914788618683815, "rewards/rejected": -0.04961969703435898, "sft_loss": 0.8094362020492554, "step": 650 }, { "epoch": 1.5836832633473306, "grad_norm": 1.7625452374002906, "learning_rate": 2.3721407950989804e-06, "logits/chosen": -0.24351301789283752, "logits/rejected": -0.07003232091665268, "logps/chosen": -0.6876959800720215, "logps/rejected": -0.9035342335700989, "loss": 0.7734, "odds_ratio_loss": 0.5917103290557861, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.034384798258543015, "rewards/margins": 0.010791914537549019, "rewards/rejected": -0.045176707208156586, "sft_loss": 0.6876959800720215, "step": 660 }, { "epoch": 1.6076784643071385, "grad_norm": 1.6046093499190943, "learning_rate": 2.3083157970040344e-06, "logits/chosen": 0.5633162260055542, "logits/rejected": 0.6462755799293518, "logps/chosen": -0.7524802684783936, "logps/rejected": -1.0558850765228271, "loss": 0.7563, "odds_ratio_loss": 0.552274227142334, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03762401267886162, "rewards/margins": 0.015170246362686157, "rewards/rejected": -0.05279426649212837, "sft_loss": 0.7524802684783936, "step": 670 }, { "epoch": 1.6316736652669466, "grad_norm": 2.117352018263469, "learning_rate": 2.2446162474597913e-06, "logits/chosen": 0.43944865465164185, "logits/rejected": 0.5002392530441284, "logps/chosen": -0.7501770257949829, "logps/rejected": -0.9691005945205688, "loss": 0.7699, "odds_ratio_loss": 0.5791727304458618, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.037508852779865265, "rewards/margins": 0.010946177877485752, "rewards/rejected": -0.04845503345131874, "sft_loss": 0.7501770257949829, "step": 680 }, { "epoch": 1.6556688662267547, "grad_norm": 1.6685249776962552, "learning_rate": 2.1810838349102963e-06, "logits/chosen": 0.16153453290462494, "logits/rejected": 0.20878514647483826, "logps/chosen": -0.7516240477561951, "logps/rejected": -1.0250643491744995, "loss": 0.7666, "odds_ratio_loss": 0.5872852206230164, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03758120536804199, "rewards/margins": 0.013672016561031342, "rewards/rejected": -0.051253218203783035, "sft_loss": 0.7516240477561951, "step": 690 }, { "epoch": 1.6796640671865628, "grad_norm": 2.782782057649718, "learning_rate": 2.117760138416067e-06, "logits/chosen": 0.24376201629638672, "logits/rejected": 0.44258540868759155, "logps/chosen": -0.6985687017440796, "logps/rejected": -1.0050299167633057, "loss": 0.7614, "odds_ratio_loss": 0.543103814125061, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.03492843732237816, "rewards/margins": 0.015323063358664513, "rewards/rejected": -0.05025150254368782, "sft_loss": 0.6985687017440796, "step": 700 }, { "epoch": 1.7036592681463707, "grad_norm": 1.5369658154698735, "learning_rate": 2.0546866004425053e-06, "logits/chosen": 0.3964254558086395, "logits/rejected": 0.4900701642036438, "logps/chosen": -0.7590494155883789, "logps/rejected": -1.2440413236618042, "loss": 0.7652, "odds_ratio_loss": 0.5372438430786133, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.037952471524477005, "rewards/margins": 0.024249596521258354, "rewards/rejected": -0.06220207363367081, "sft_loss": 0.7590494155883789, "step": 710 }, { "epoch": 1.7276544691061788, "grad_norm": 1.9970193945029362, "learning_rate": 1.9919044997376906e-06, "logits/chosen": 0.6031176447868347, "logits/rejected": 0.7783833742141724, "logps/chosen": -0.7290822267532349, "logps/rejected": -1.021554946899414, "loss": 0.7176, "odds_ratio_loss": 0.557815432548523, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03645411133766174, "rewards/margins": 0.014623639173805714, "rewards/rejected": -0.051077745854854584, "sft_loss": 0.7290822267532349, "step": 720 }, { "epoch": 1.7516496700659867, "grad_norm": 2.558147455560064, "learning_rate": 1.9294549243173306e-06, "logits/chosen": -0.027294237166643143, "logits/rejected": 0.11035363376140594, "logps/chosen": -0.7765438556671143, "logps/rejected": -1.0300321578979492, "loss": 0.7771, "odds_ratio_loss": 0.5954040884971619, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03882719203829765, "rewards/margins": 0.012674416415393353, "rewards/rejected": -0.05150160938501358, "sft_loss": 0.7765438556671143, "step": 730 }, { "epoch": 1.7756448710257948, "grad_norm": 2.346615273317464, "learning_rate": 1.8673787445745298e-06, "logits/chosen": -0.449845552444458, "logits/rejected": -0.3746832311153412, "logps/chosen": -0.7114017605781555, "logps/rejected": -0.928491473197937, "loss": 0.7699, "odds_ratio_loss": 0.5795110464096069, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.035570088773965836, "rewards/margins": 0.010854486376047134, "rewards/rejected": -0.04642457515001297, "sft_loss": 0.7114017605781555, "step": 740 }, { "epoch": 1.799640071985603, "grad_norm": 1.995371230537378, "learning_rate": 1.805716586531988e-06, "logits/chosen": -0.13443303108215332, "logits/rejected": 0.014731263741850853, "logps/chosen": -0.8079891204833984, "logps/rejected": -1.0810317993164062, "loss": 0.7825, "odds_ratio_loss": 0.6112096309661865, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0403994545340538, "rewards/margins": 0.013652140274643898, "rewards/rejected": -0.05405158922076225, "sft_loss": 0.8079891204833984, "step": 750 }, { "epoch": 1.823635272945411, "grad_norm": 1.8742057389590454, "learning_rate": 1.7445088052541218e-06, "logits/chosen": 0.046121031045913696, "logits/rejected": 0.1955467015504837, "logps/chosen": -0.7093559503555298, "logps/rejected": -1.0484099388122559, "loss": 0.7617, "odds_ratio_loss": 0.5657014846801758, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03546779602766037, "rewards/margins": 0.016952697187662125, "rewards/rejected": -0.05242049694061279, "sft_loss": 0.7093559503555298, "step": 760 }, { "epoch": 1.847630473905219, "grad_norm": 1.2680203881504901, "learning_rate": 1.6837954584365217e-06, "logits/chosen": 0.4459083080291748, "logits/rejected": 0.5636454224586487, "logps/chosen": -0.7526987195014954, "logps/rejected": -1.009804606437683, "loss": 0.7871, "odds_ratio_loss": 0.5556772947311401, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.03763493150472641, "rewards/margins": 0.012855296023190022, "rewards/rejected": -0.050490230321884155, "sft_loss": 0.7526987195014954, "step": 770 }, { "epoch": 1.8716256748650268, "grad_norm": 1.9254646582677224, "learning_rate": 1.6236162801900191e-06, "logits/chosen": -0.10451897233724594, "logits/rejected": 0.3060254156589508, "logps/chosen": -0.6585639715194702, "logps/rejected": -0.9869001507759094, "loss": 0.71, "odds_ratio_loss": 0.4942260682582855, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.03292820230126381, "rewards/margins": 0.016416804865002632, "rewards/rejected": -0.04934500530362129, "sft_loss": 0.6585639715194702, "step": 780 }, { "epoch": 1.895620875824835, "grad_norm": 1.9904836511656812, "learning_rate": 1.5640106550365298e-06, "logits/chosen": 0.11656351387500763, "logits/rejected": 0.29824742674827576, "logps/chosen": -0.7831540703773499, "logps/rejected": -1.0284688472747803, "loss": 0.7758, "odds_ratio_loss": 0.5839165449142456, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03915770351886749, "rewards/margins": 0.01226573996245861, "rewards/rejected": -0.051423441618680954, "sft_loss": 0.7831540703773499, "step": 790 }, { "epoch": 1.919616076784643, "grad_norm": 1.7061927534288226, "learning_rate": 1.5050175921336797e-06, "logits/chosen": 0.14354857802391052, "logits/rejected": 0.27334246039390564, "logps/chosen": -0.7474446892738342, "logps/rejected": -0.9480558633804321, "loss": 0.7575, "odds_ratio_loss": 0.6441240310668945, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03737223893404007, "rewards/margins": 0.010030550882220268, "rewards/rejected": -0.04740279167890549, "sft_loss": 0.7474446892738342, "step": 800 }, { "epoch": 1.9436112777444512, "grad_norm": 2.251879648695612, "learning_rate": 1.446675699745097e-06, "logits/chosen": 0.25183239579200745, "logits/rejected": 0.38326969742774963, "logps/chosen": -0.7823570966720581, "logps/rejected": -0.9946805238723755, "loss": 0.8037, "odds_ratio_loss": 0.6080455183982849, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03911786153912544, "rewards/margins": 0.010616169311106205, "rewards/rejected": -0.049734026193618774, "sft_loss": 0.7823570966720581, "step": 810 }, { "epoch": 1.9676064787042593, "grad_norm": 1.9391362449031262, "learning_rate": 1.3890231599730674e-06, "logits/chosen": 0.31725913286209106, "logits/rejected": 0.5106421709060669, "logps/chosen": -0.7221857309341431, "logps/rejected": -0.9829575419425964, "loss": 0.7904, "odds_ratio_loss": 0.5538625121116638, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.03610928729176521, "rewards/margins": 0.013038587756454945, "rewards/rejected": -0.049147870391607285, "sft_loss": 0.7221857309341431, "step": 820 }, { "epoch": 1.9916016796640672, "grad_norm": 1.5457295502049215, "learning_rate": 1.3320977037700952e-06, "logits/chosen": 0.8291665315628052, "logits/rejected": 1.1122350692749023, "logps/chosen": -0.6864774227142334, "logps/rejected": -1.0247427225112915, "loss": 0.7452, "odds_ratio_loss": 0.49447354674339294, "rewards/accuracies": 0.75, "rewards/chosen": -0.03432386741042137, "rewards/margins": 0.016913266852498055, "rewards/rejected": -0.051237136125564575, "sft_loss": 0.6864774227142334, "step": 830 }, { "epoch": 2.015596880623875, "grad_norm": 1.5016852289986733, "learning_rate": 1.2759365862457148e-06, "logits/chosen": -0.4956502318382263, "logits/rejected": -0.1621031016111374, "logps/chosen": -0.7308815717697144, "logps/rejected": -0.9828909039497375, "loss": 0.7173, "odds_ratio_loss": 0.5487710237503052, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0365440808236599, "rewards/margins": 0.012600463815033436, "rewards/rejected": -0.049144547432661057, "sft_loss": 0.7308815717697144, "step": 840 }, { "epoch": 2.039592081583683, "grad_norm": 1.622924065562837, "learning_rate": 1.2205765622847273e-06, "logits/chosen": -0.12397761642932892, "logits/rejected": 0.08023932576179504, "logps/chosen": -0.6277745962142944, "logps/rejected": -1.0955206155776978, "loss": 0.6995, "odds_ratio_loss": 0.4475070536136627, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.03138873726129532, "rewards/margins": 0.023387301713228226, "rewards/rejected": -0.054776035249233246, "sft_loss": 0.6277745962142944, "step": 850 }, { "epoch": 2.0635872825434913, "grad_norm": 1.4741935497367946, "learning_rate": 1.1660538624928062e-06, "logits/chosen": -0.3639386296272278, "logits/rejected": -0.2011258602142334, "logps/chosen": -0.6642920970916748, "logps/rejected": -1.0270217657089233, "loss": 0.7019, "odds_ratio_loss": 0.4971997141838074, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03321460261940956, "rewards/margins": 0.018136484548449516, "rewards/rejected": -0.05135108903050423, "sft_loss": 0.6642920970916748, "step": 860 }, { "epoch": 2.0875824835032994, "grad_norm": 1.7172174730539993, "learning_rate": 1.112404169485226e-06, "logits/chosen": -0.3923923075199127, "logits/rejected": -0.10327514261007309, "logps/chosen": -0.5645719766616821, "logps/rejected": -1.071115255355835, "loss": 0.6681, "odds_ratio_loss": 0.42052555084228516, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.028228599578142166, "rewards/margins": 0.025327179580926895, "rewards/rejected": -0.053555767983198166, "sft_loss": 0.5645719766616821, "step": 870 }, { "epoch": 2.1115776844631076, "grad_norm": 1.1474314844125568, "learning_rate": 1.0596625945342148e-06, "logits/chosen": -0.008033117279410362, "logits/rejected": 0.16419892013072968, "logps/chosen": -0.7100299000740051, "logps/rejected": -0.9733055233955383, "loss": 0.6813, "odds_ratio_loss": 0.5328400731086731, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.03550150245428085, "rewards/margins": 0.013163777068257332, "rewards/rejected": -0.048665277659893036, "sft_loss": 0.7100299000740051, "step": 880 }, { "epoch": 2.1355728854229152, "grad_norm": 2.1383619388719515, "learning_rate": 1.0078636545902363e-06, "logits/chosen": -0.4247666001319885, "logits/rejected": -0.17631380259990692, "logps/chosen": -0.6582883596420288, "logps/rejected": -1.0547147989273071, "loss": 0.6895, "odds_ratio_loss": 0.47398701310157776, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0329144187271595, "rewards/margins": 0.019821325317025185, "rewards/rejected": -0.05273573845624924, "sft_loss": 0.6582883596420288, "step": 890 }, { "epoch": 2.1595680863827234, "grad_norm": 1.5320300236939732, "learning_rate": 9.570412496922198e-07, "logits/chosen": -0.27953624725341797, "logits/rejected": -0.08715387433767319, "logps/chosen": -0.5965186357498169, "logps/rejected": -1.154284119606018, "loss": 0.6738, "odds_ratio_loss": 0.4240815043449402, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.029825935140252113, "rewards/margins": 0.02788827195763588, "rewards/rejected": -0.05771421268582344, "sft_loss": 0.5965186357498169, "step": 900 }, { "epoch": 2.1835632873425315, "grad_norm": 1.6204787225170885, "learning_rate": 9.07228640781539e-07, "logits/chosen": 0.368365079164505, "logits/rejected": 0.6101259589195251, "logps/chosen": -0.6893322467803955, "logps/rejected": -1.0903311967849731, "loss": 0.6791, "odds_ratio_loss": 0.4818887710571289, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.03446660935878754, "rewards/margins": 0.02004995197057724, "rewards/rejected": -0.054516565054655075, "sft_loss": 0.6893322467803955, "step": 910 }, { "epoch": 2.2075584883023396, "grad_norm": 1.290844558254926, "learning_rate": 8.584584279342392e-07, "logits/chosen": -0.16083380579948425, "logits/rejected": -0.10739579051733017, "logps/chosen": -0.6938862800598145, "logps/rejected": -0.9513536691665649, "loss": 0.6888, "odds_ratio_loss": 0.5428452491760254, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.034694310277700424, "rewards/margins": 0.012873371131718159, "rewards/rejected": -0.047567687928676605, "sft_loss": 0.6938862800598145, "step": 920 }, { "epoch": 2.2315536892621477, "grad_norm": 1.5229766148545818, "learning_rate": 8.10762529025782e-07, "logits/chosen": -0.4659739136695862, "logits/rejected": -0.4786594808101654, "logps/chosen": -0.6584521532058716, "logps/rejected": -0.8917843699455261, "loss": 0.65, "odds_ratio_loss": 0.5486137866973877, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03292260691523552, "rewards/margins": 0.011666612699627876, "rewards/rejected": -0.044589221477508545, "sft_loss": 0.6584521532058716, "step": 930 }, { "epoch": 2.255548890221956, "grad_norm": 1.7015940933867517, "learning_rate": 7.641721588422526e-07, "logits/chosen": -0.009342163801193237, "logits/rejected": 0.1280032843351364, "logps/chosen": -0.6387184262275696, "logps/rejected": -1.049140453338623, "loss": 0.687, "odds_ratio_loss": 0.4773840010166168, "rewards/accuracies": 0.75, "rewards/chosen": -0.0319359228014946, "rewards/margins": 0.020521100610494614, "rewards/rejected": -0.05245702341198921, "sft_loss": 0.6387184262275696, "step": 940 }, { "epoch": 2.2795440911817635, "grad_norm": 1.4203319350991257, "learning_rate": 7.187178086517116e-07, "logits/chosen": 0.14468683302402496, "logits/rejected": 0.2608656883239746, "logps/chosen": -0.6514204144477844, "logps/rejected": -1.2591578960418701, "loss": 0.6695, "odds_ratio_loss": 0.455849826335907, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.03257102146744728, "rewards/margins": 0.03038688376545906, "rewards/rejected": -0.06295789778232574, "sft_loss": 0.6514204144477844, "step": 950 }, { "epoch": 2.3035392921415716, "grad_norm": 1.7783791010197938, "learning_rate": 6.74429226249049e-07, "logits/chosen": 0.09898465871810913, "logits/rejected": 0.21373791992664337, "logps/chosen": -0.6381307244300842, "logps/rejected": -0.9742431640625, "loss": 0.6712, "odds_ratio_loss": 0.49530988931655884, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03190653771162033, "rewards/margins": 0.016805628314614296, "rewards/rejected": -0.04871216416358948, "sft_loss": 0.6381307244300842, "step": 960 }, { "epoch": 2.3275344931013797, "grad_norm": 1.6090454208525553, "learning_rate": 6.313353964874155e-07, "logits/chosen": 0.1333683431148529, "logits/rejected": 0.3417516350746155, "logps/chosen": -0.6887052655220032, "logps/rejected": -1.0016798973083496, "loss": 0.6673, "odds_ratio_loss": 0.5059822797775269, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.03443526476621628, "rewards/margins": 0.01564873196184635, "rewards/rejected": -0.05008399486541748, "sft_loss": 0.6887052655220032, "step": 970 }, { "epoch": 2.351529694061188, "grad_norm": 1.6382111002720514, "learning_rate": 5.894645223089584e-07, "logits/chosen": 0.7236309051513672, "logits/rejected": 0.8550646901130676, "logps/chosen": -0.6779772639274597, "logps/rejected": -1.2183148860931396, "loss": 0.6958, "odds_ratio_loss": 0.448292076587677, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.033898863941431046, "rewards/margins": 0.027016881853342056, "rewards/rejected": -0.0609157457947731, "sft_loss": 0.6779772639274597, "step": 980 }, { "epoch": 2.375524895020996, "grad_norm": 1.680992010239421, "learning_rate": 5.48844006287289e-07, "logits/chosen": 0.12925365567207336, "logits/rejected": 0.3167954981327057, "logps/chosen": -0.6692675352096558, "logps/rejected": -1.0140740871429443, "loss": 0.6691, "odds_ratio_loss": 0.4763975143432617, "rewards/accuracies": 0.75, "rewards/chosen": -0.033463381230831146, "rewards/margins": 0.01724032498896122, "rewards/rejected": -0.050703711807727814, "sft_loss": 0.6692675352096558, "step": 990 }, { "epoch": 2.3995200959808036, "grad_norm": 1.544720546176764, "learning_rate": 5.095004326937445e-07, "logits/chosen": -0.4231066107749939, "logits/rejected": -0.20230142772197723, "logps/chosen": -0.6737790107727051, "logps/rejected": -1.0810075998306274, "loss": 0.6744, "odds_ratio_loss": 0.4769432544708252, "rewards/accuracies": 0.75, "rewards/chosen": -0.033688947558403015, "rewards/margins": 0.02036142908036709, "rewards/rejected": -0.05405038595199585, "sft_loss": 0.6737790107727051, "step": 1000 }, { "epoch": 2.4235152969406117, "grad_norm": 1.7400382431256138, "learning_rate": 4.71459550099202e-07, "logits/chosen": 0.2943962812423706, "logits/rejected": 0.5343393087387085, "logps/chosen": -0.6686779856681824, "logps/rejected": -1.0820672512054443, "loss": 0.7078, "odds_ratio_loss": 0.5010559558868408, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.03343390300869942, "rewards/margins": 0.020669464021921158, "rewards/rejected": -0.054103363305330276, "sft_loss": 0.6686779856681824, "step": 1010 }, { "epoch": 2.44751049790042, "grad_norm": 1.548219424075948, "learning_rate": 4.347462545228134e-07, "logits/chosen": 0.13567771017551422, "logits/rejected": 0.31968480348587036, "logps/chosen": -0.6244124174118042, "logps/rejected": -1.05476975440979, "loss": 0.6563, "odds_ratio_loss": 0.4984089732170105, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03122062422335148, "rewards/margins": 0.021517863497138023, "rewards/rejected": -0.052738480269908905, "sft_loss": 0.6244124174118042, "step": 1020 }, { "epoch": 2.471505698860228, "grad_norm": 1.4610216249122747, "learning_rate": 3.9938457313869914e-07, "logits/chosen": -0.08544759452342987, "logits/rejected": 0.07162941992282867, "logps/chosen": -0.7579829096794128, "logps/rejected": -1.1255767345428467, "loss": 0.6864, "odds_ratio_loss": 0.547897458076477, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.03789914771914482, "rewards/margins": 0.01837969198822975, "rewards/rejected": -0.05627884343266487, "sft_loss": 0.7579829096794128, "step": 1030 }, { "epoch": 2.495500899820036, "grad_norm": 1.6006797776983446, "learning_rate": 3.6539764855126224e-07, "logits/chosen": -0.23340921103954315, "logits/rejected": -0.1814245879650116, "logps/chosen": -0.6439553499221802, "logps/rejected": -1.0276587009429932, "loss": 0.6617, "odds_ratio_loss": 0.5049816370010376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03219776228070259, "rewards/margins": 0.019185172393918037, "rewards/rejected": -0.05138293653726578, "sft_loss": 0.6439553499221802, "step": 1040 }, { "epoch": 2.519496100779844, "grad_norm": 2.318524117790848, "learning_rate": 3.328077236494087e-07, "logits/chosen": -0.12850667536258698, "logits/rejected": 0.07032374292612076, "logps/chosen": -0.5922039747238159, "logps/rejected": -1.0730435848236084, "loss": 0.6694, "odds_ratio_loss": 0.43941235542297363, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.029610196128487587, "rewards/margins": 0.024041980504989624, "rewards/rejected": -0.05365217477083206, "sft_loss": 0.5922039747238159, "step": 1050 }, { "epoch": 2.5434913017396523, "grad_norm": 1.8087989245838814, "learning_rate": 3.0163612704959486e-07, "logits/chosen": -0.6611061692237854, "logits/rejected": -0.5293869376182556, "logps/chosen": -0.6281863451004028, "logps/rejected": -0.9944284558296204, "loss": 0.6705, "odds_ratio_loss": 0.47698038816452026, "rewards/accuracies": 0.75, "rewards/chosen": -0.03140931576490402, "rewards/margins": 0.018312102183699608, "rewards/rejected": -0.04972142353653908, "sft_loss": 0.6281863451004028, "step": 1060 }, { "epoch": 2.56748650269946, "grad_norm": 1.5444353690364836, "learning_rate": 2.71903259137222e-07, "logits/chosen": 0.411745548248291, "logits/rejected": 0.4236873686313629, "logps/chosen": -0.611006498336792, "logps/rejected": -1.0047032833099365, "loss": 0.672, "odds_ratio_loss": 0.48614612221717834, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.03055032715201378, "rewards/margins": 0.019684839993715286, "rewards/rejected": -0.050235163420438766, "sft_loss": 0.611006498336792, "step": 1070 }, { "epoch": 2.591481703659268, "grad_norm": 2.593043127599419, "learning_rate": 2.436285787155185e-07, "logits/chosen": 0.316955029964447, "logits/rejected": 0.47285112738609314, "logps/chosen": -0.6786519885063171, "logps/rejected": -1.2019875049591064, "loss": 0.6881, "odds_ratio_loss": 0.4908427298069, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03393259644508362, "rewards/margins": 0.026166772469878197, "rewards/rejected": -0.060099370777606964, "sft_loss": 0.6786519885063171, "step": 1080 }, { "epoch": 2.6154769046190762, "grad_norm": 2.2050381193088207, "learning_rate": 2.168305902706383e-07, "logits/chosen": -0.4541945457458496, "logits/rejected": -0.18702273070812225, "logps/chosen": -0.7026795148849487, "logps/rejected": -0.962356448173523, "loss": 0.6583, "odds_ratio_loss": 0.5365189909934998, "rewards/accuracies": 0.75, "rewards/chosen": -0.035133976489305496, "rewards/margins": 0.012983846478164196, "rewards/rejected": -0.04811782017350197, "sft_loss": 0.7026795148849487, "step": 1090 }, { "epoch": 2.6394721055788843, "grad_norm": 1.6921175899136245, "learning_rate": 1.9152683186132476e-07, "logits/chosen": -0.4067768156528473, "logits/rejected": -0.3039708137512207, "logps/chosen": -0.6328436136245728, "logps/rejected": -1.12655770778656, "loss": 0.6919, "odds_ratio_loss": 0.4709090292453766, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.031642183661460876, "rewards/margins": 0.024685706943273544, "rewards/rejected": -0.05632789060473442, "sft_loss": 0.6328436136245728, "step": 1100 }, { "epoch": 2.663467306538692, "grad_norm": 1.5594348597838832, "learning_rate": 1.6773386364104972e-07, "logits/chosen": -0.1575368195772171, "logits/rejected": -0.003553843591362238, "logps/chosen": -0.6768941879272461, "logps/rejected": -1.032041072845459, "loss": 0.6913, "odds_ratio_loss": 0.50171959400177, "rewards/accuracies": 0.75, "rewards/chosen": -0.033844709396362305, "rewards/margins": 0.017757344990968704, "rewards/rejected": -0.05160205811262131, "sft_loss": 0.6768941879272461, "step": 1110 }, { "epoch": 2.6874625074985, "grad_norm": 1.2735811398241894, "learning_rate": 1.4546725702015096e-07, "logits/chosen": 0.004650235176086426, "logits/rejected": 0.1661575585603714, "logps/chosen": -0.6541981101036072, "logps/rejected": -1.1094247102737427, "loss": 0.6669, "odds_ratio_loss": 0.4492813050746918, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.03270990774035454, "rewards/margins": 0.022761326283216476, "rewards/rejected": -0.055471230298280716, "sft_loss": 0.6541981101036072, "step": 1120 }, { "epoch": 2.7114577084583082, "grad_norm": 2.2135398834819715, "learning_rate": 1.24741584475056e-07, "logits/chosen": -0.07907108962535858, "logits/rejected": 0.08474680036306381, "logps/chosen": -0.6154497861862183, "logps/rejected": -1.0710924863815308, "loss": 0.6491, "odds_ratio_loss": 0.4509805142879486, "rewards/accuracies": 0.8125, "rewards/chosen": -0.030772492289543152, "rewards/margins": 0.022782133892178535, "rewards/rejected": -0.05355461686849594, "sft_loss": 0.6154497861862183, "step": 1130 }, { "epoch": 2.7354529094181164, "grad_norm": 1.5137426741255027, "learning_rate": 1.0557041001126145e-07, "logits/chosen": 0.3702402710914612, "logits/rejected": 0.6300150156021118, "logps/chosen": -0.5984182357788086, "logps/rejected": -1.115179419517517, "loss": 0.6191, "odds_ratio_loss": 0.41762223839759827, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.0299209114164114, "rewards/margins": 0.025838062167167664, "rewards/rejected": -0.05575897544622421, "sft_loss": 0.5984182357788086, "step": 1140 }, { "epoch": 2.7594481103779245, "grad_norm": 1.565522436867544, "learning_rate": 8.796628028631321e-08, "logits/chosen": 0.17880654335021973, "logits/rejected": 0.1116660013794899, "logps/chosen": -0.6091745495796204, "logps/rejected": -1.0210378170013428, "loss": 0.6583, "odds_ratio_loss": 0.4544963836669922, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.030458729714155197, "rewards/margins": 0.02059316076338291, "rewards/rejected": -0.05105189234018326, "sft_loss": 0.6091745495796204, "step": 1150 }, { "epoch": 2.7834433113377326, "grad_norm": 1.604017358081912, "learning_rate": 7.19407163985894e-08, "logits/chosen": -0.04378344863653183, "logits/rejected": 0.18321049213409424, "logps/chosen": -0.6626521348953247, "logps/rejected": -1.1215763092041016, "loss": 0.666, "odds_ratio_loss": 0.4741577208042145, "rewards/accuracies": 0.75, "rewards/chosen": -0.033132605254650116, "rewards/margins": 0.022946210578083992, "rewards/rejected": -0.05607881397008896, "sft_loss": 0.6626521348953247, "step": 1160 }, { "epoch": 2.8074385122975407, "grad_norm": 1.4084206676302562, "learning_rate": 5.750420634727083e-08, "logits/chosen": -0.45710262656211853, "logits/rejected": -0.3050076961517334, "logps/chosen": -0.671418309211731, "logps/rejected": -1.1854102611541748, "loss": 0.6842, "odds_ratio_loss": 0.4368383288383484, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03357091173529625, "rewards/margins": 0.02569960430264473, "rewards/rejected": -0.05927051231265068, "sft_loss": 0.671418309211731, "step": 1170 }, { "epoch": 2.8314337132573484, "grad_norm": 1.3507137389822068, "learning_rate": 4.4666198168422656e-08, "logits/chosen": 0.33376216888427734, "logits/rejected": 0.41172194480895996, "logps/chosen": -0.6510582566261292, "logps/rejected": -1.0800405740737915, "loss": 0.6747, "odds_ratio_loss": 0.5277644395828247, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.032552916556596756, "rewards/margins": 0.021449116989970207, "rewards/rejected": -0.054002027958631516, "sft_loss": 0.6510582566261292, "step": 1180 }, { "epoch": 2.8554289142171565, "grad_norm": 1.6874037821147798, "learning_rate": 3.343509375168863e-08, "logits/chosen": 0.20301933586597443, "logits/rejected": 0.32382094860076904, "logps/chosen": -0.6405006647109985, "logps/rejected": -1.0241023302078247, "loss": 0.6718, "odds_ratio_loss": 0.48166948556900024, "rewards/accuracies": 0.75, "rewards/chosen": -0.03202503174543381, "rewards/margins": 0.019180091097950935, "rewards/rejected": -0.051205117255449295, "sft_loss": 0.6405006647109985, "step": 1190 }, { "epoch": 2.8794241151769646, "grad_norm": 1.6417139708130921, "learning_rate": 2.3818243341637293e-08, "logits/chosen": -0.3619822859764099, "logits/rejected": -0.15361133217811584, "logps/chosen": -0.6599988341331482, "logps/rejected": -1.098881483078003, "loss": 0.6565, "odds_ratio_loss": 0.456063449382782, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03299994021654129, "rewards/margins": 0.021944135427474976, "rewards/rejected": -0.054944075644016266, "sft_loss": 0.6599988341331482, "step": 1200 }, { "epoch": 2.9034193161367727, "grad_norm": 1.648932215503252, "learning_rate": 1.5821940727361874e-08, "logits/chosen": -0.7362561821937561, "logits/rejected": -0.4996170997619629, "logps/chosen": -0.6824958920478821, "logps/rejected": -0.9969790577888489, "loss": 0.7067, "odds_ratio_loss": 0.5307115316390991, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.034124795347452164, "rewards/margins": 0.01572415977716446, "rewards/rejected": -0.049848951399326324, "sft_loss": 0.6824958920478821, "step": 1210 }, { "epoch": 2.927414517096581, "grad_norm": 1.7678674281978446, "learning_rate": 9.451419123484573e-09, "logits/chosen": -0.15318191051483154, "logits/rejected": 0.047946538776159286, "logps/chosen": -0.6560810804367065, "logps/rejected": -1.0658347606658936, "loss": 0.6692, "odds_ratio_loss": 0.5046226382255554, "rewards/accuracies": 0.75, "rewards/chosen": -0.032804060727357864, "rewards/margins": 0.02048768661916256, "rewards/rejected": -0.053291745483875275, "sft_loss": 0.6560810804367065, "step": 1220 }, { "epoch": 2.9514097180563885, "grad_norm": 1.4413325593301094, "learning_rate": 4.710847745256209e-09, "logits/chosen": 0.12647075951099396, "logits/rejected": 0.2795228958129883, "logps/chosen": -0.6180914640426636, "logps/rejected": -1.0847346782684326, "loss": 0.6722, "odds_ratio_loss": 0.41623228788375854, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.030904576182365417, "rewards/margins": 0.02333216182887554, "rewards/rejected": -0.05423673242330551, "sft_loss": 0.6180914640426636, "step": 1230 }, { "epoch": 2.9754049190161966, "grad_norm": 1.5296676400661524, "learning_rate": 1.603329079994942e-09, "logits/chosen": -0.3425149619579315, "logits/rejected": -0.06856220215559006, "logps/chosen": -0.6569226980209351, "logps/rejected": -1.1020539999008179, "loss": 0.6649, "odds_ratio_loss": 0.4642546772956848, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.03284613788127899, "rewards/margins": 0.02225656434893608, "rewards/rejected": -0.055102698504924774, "sft_loss": 0.6569226980209351, "step": 1240 }, { "epoch": 2.994601079784043, "step": 1248, "total_flos": 132590267662336.0, "train_loss": 0.7937506708579186, "train_runtime": 49781.9259, "train_samples_per_second": 1.205, "train_steps_per_second": 0.025 } ], "logging_steps": 10, "max_steps": 1248, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100.0, "total_flos": 132590267662336.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }