phi3-4k-chinese-orpo / trainer_state.json
postitive666
orpo chinese phi3 4K
3c7b14a
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.994601079784043,
"eval_steps": 500,
"global_step": 1248,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02399520095980804,
"grad_norm": 24.58741331565172,
"learning_rate": 1.0000000000000002e-06,
"logits/chosen": -0.5075146555900574,
"logits/rejected": -0.31934085488319397,
"logps/chosen": -1.394007921218872,
"logps/rejected": -1.3630257844924927,
"loss": 1.3501,
"odds_ratio_loss": 0.8239962458610535,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.06970040500164032,
"rewards/margins": -0.0015491036465391517,
"rewards/rejected": -0.06815129518508911,
"sft_loss": 1.394007921218872,
"step": 10
},
{
"epoch": 0.04799040191961608,
"grad_norm": 4.281683015852783,
"learning_rate": 3.5e-06,
"logits/chosen": 0.08614908158779144,
"logits/rejected": 0.3013238310813904,
"logps/chosen": -1.3080074787139893,
"logps/rejected": -1.334457278251648,
"loss": 1.2858,
"odds_ratio_loss": 0.7804475426673889,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.0654003769159317,
"rewards/margins": 0.0013224859721958637,
"rewards/rejected": -0.06672286242246628,
"sft_loss": 1.3080074787139893,
"step": 20
},
{
"epoch": 0.07198560287942411,
"grad_norm": 3.830958349381369,
"learning_rate": 4.99986910314335e-06,
"logits/chosen": 0.3485943675041199,
"logits/rejected": 0.6042150855064392,
"logps/chosen": -0.9540683627128601,
"logps/rejected": -1.1750730276107788,
"loss": 0.9904,
"odds_ratio_loss": 0.6533687710762024,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.047703422605991364,
"rewards/margins": 0.011050237342715263,
"rewards/rejected": -0.05875365808606148,
"sft_loss": 0.9540683627128601,
"step": 30
},
{
"epoch": 0.09598080383923216,
"grad_norm": 3.6776666943951675,
"learning_rate": 4.998396670920005e-06,
"logits/chosen": 0.17601105570793152,
"logits/rejected": 0.5272272229194641,
"logps/chosen": -0.898045539855957,
"logps/rejected": -1.0136868953704834,
"loss": 0.9614,
"odds_ratio_loss": 0.6860688328742981,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.04490227997303009,
"rewards/margins": 0.005782057531177998,
"rewards/rejected": -0.05068434029817581,
"sft_loss": 0.898045539855957,
"step": 40
},
{
"epoch": 0.11997600479904019,
"grad_norm": 2.636908991979515,
"learning_rate": 4.995289152254744e-06,
"logits/chosen": 0.2309066355228424,
"logits/rejected": 0.22152824699878693,
"logps/chosen": -0.9074997901916504,
"logps/rejected": -1.0551084280014038,
"loss": 0.9374,
"odds_ratio_loss": 0.663613498210907,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.04537498578429222,
"rewards/margins": 0.007380434311926365,
"rewards/rejected": -0.05275542289018631,
"sft_loss": 0.9074997901916504,
"step": 50
},
{
"epoch": 0.14397120575884823,
"grad_norm": 1.8300107701302537,
"learning_rate": 4.990548580876516e-06,
"logits/chosen": 0.307407021522522,
"logits/rejected": 0.37507694959640503,
"logps/chosen": -0.9279610514640808,
"logps/rejected": -0.986476719379425,
"loss": 0.9464,
"odds_ratio_loss": 0.7063499093055725,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.04639805108308792,
"rewards/margins": 0.00292578199878335,
"rewards/rejected": -0.04932383447885513,
"sft_loss": 0.9279610514640808,
"step": 60
},
{
"epoch": 0.16796640671865626,
"grad_norm": 3.8157191209486507,
"learning_rate": 4.9841780592726385e-06,
"logits/chosen": 0.19509825110435486,
"logits/rejected": 0.2650177776813507,
"logps/chosen": -0.9848098754882812,
"logps/rejected": -1.0149097442626953,
"loss": 0.9578,
"odds_ratio_loss": 0.726799488067627,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.04924049228429794,
"rewards/margins": 0.0015049913199618459,
"rewards/rejected": -0.050745487213134766,
"sft_loss": 0.9848098754882812,
"step": 70
},
{
"epoch": 0.19196160767846432,
"grad_norm": 4.078587531391316,
"learning_rate": 4.976181756658363e-06,
"logits/chosen": 0.061622969806194305,
"logits/rejected": 0.2444450408220291,
"logps/chosen": -0.8894473910331726,
"logps/rejected": -1.0614734888076782,
"loss": 0.9675,
"odds_ratio_loss": 0.6382969617843628,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.04447237029671669,
"rewards/margins": 0.008601305074989796,
"rewards/rejected": -0.05307367444038391,
"sft_loss": 0.8894473910331726,
"step": 80
},
{
"epoch": 0.21595680863827235,
"grad_norm": 2.9874023740770363,
"learning_rate": 4.9665649062483115e-06,
"logits/chosen": 0.6337467432022095,
"logits/rejected": 0.7902036905288696,
"logps/chosen": -0.9439412951469421,
"logps/rejected": -0.9588793516159058,
"loss": 0.9635,
"odds_ratio_loss": 0.7716476917266846,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.047197069972753525,
"rewards/margins": 0.0007468975381925702,
"rewards/rejected": -0.047943972051143646,
"sft_loss": 0.9439412951469421,
"step": 90
},
{
"epoch": 0.23995200959808038,
"grad_norm": 2.3029148332001745,
"learning_rate": 4.955333801831578e-06,
"logits/chosen": 0.49920982122421265,
"logits/rejected": 0.6337569355964661,
"logps/chosen": -0.8333128094673157,
"logps/rejected": -1.059599757194519,
"loss": 0.9453,
"odds_ratio_loss": 0.6517213582992554,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.041665639728307724,
"rewards/margins": 0.011314347386360168,
"rewards/rejected": -0.05297998711466789,
"sft_loss": 0.8333128094673157,
"step": 100
},
{
"epoch": 0.26394721055788845,
"grad_norm": 2.8766587489414395,
"learning_rate": 4.9424957936527295e-06,
"logits/chosen": -0.28645992279052734,
"logits/rejected": 0.04107431694865227,
"logps/chosen": -0.9429195523262024,
"logps/rejected": -0.9936224222183228,
"loss": 0.9526,
"odds_ratio_loss": 0.705885112285614,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.04714598134160042,
"rewards/margins": 0.002535139676183462,
"rewards/rejected": -0.04968111589550972,
"sft_loss": 0.9429195523262024,
"step": 110
},
{
"epoch": 0.28794241151769645,
"grad_norm": 2.1411106644617703,
"learning_rate": 4.92805928360141e-06,
"logits/chosen": -0.29608479142189026,
"logits/rejected": -0.21111997961997986,
"logps/chosen": -0.888851523399353,
"logps/rejected": -1.0842912197113037,
"loss": 0.8904,
"odds_ratio_loss": 0.5968859195709229,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.04444257169961929,
"rewards/margins": 0.009771987795829773,
"rewards/rejected": -0.054214559495449066,
"sft_loss": 0.888851523399353,
"step": 120
},
{
"epoch": 0.3119376124775045,
"grad_norm": 2.1891227152981347,
"learning_rate": 4.912033719713687e-06,
"logits/chosen": 0.49228960275650024,
"logits/rejected": 0.5680336952209473,
"logps/chosen": -0.9152839779853821,
"logps/rejected": -1.0058788061141968,
"loss": 0.9427,
"odds_ratio_loss": 0.6943625807762146,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.04576420038938522,
"rewards/margins": 0.004529745317995548,
"rewards/rejected": -0.0502939410507679,
"sft_loss": 0.9152839779853821,
"step": 130
},
{
"epoch": 0.3359328134373125,
"grad_norm": 2.5131225459939,
"learning_rate": 4.894429589988739e-06,
"logits/chosen": -1.2468726634979248,
"logits/rejected": -1.0485397577285767,
"logps/chosen": -1.0104249715805054,
"logps/rejected": -1.0477244853973389,
"loss": 0.949,
"odds_ratio_loss": 0.7160865068435669,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.05052124708890915,
"rewards/margins": 0.0018649749690666795,
"rewards/rejected": -0.05238622426986694,
"sft_loss": 1.0104249715805054,
"step": 140
},
{
"epoch": 0.3599280143971206,
"grad_norm": 2.696319834123575,
"learning_rate": 4.875258415524945e-06,
"logits/chosen": 0.039508234709501266,
"logits/rejected": 0.23594827950000763,
"logps/chosen": -0.904223620891571,
"logps/rejected": -1.032157063484192,
"loss": 0.9533,
"odds_ratio_loss": 0.6739581823348999,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.04521118476986885,
"rewards/margins": 0.0063966671004891396,
"rewards/rejected": -0.051607854664325714,
"sft_loss": 0.904223620891571,
"step": 150
},
{
"epoch": 0.38392321535692864,
"grad_norm": 2.241170193835809,
"learning_rate": 4.85453274297985e-06,
"logits/chosen": 0.4507044851779938,
"logits/rejected": 0.7088828682899475,
"logps/chosen": -0.9252007603645325,
"logps/rejected": -1.0105345249176025,
"loss": 0.9187,
"odds_ratio_loss": 0.6664329171180725,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0462600402534008,
"rewards/margins": 0.004266692791134119,
"rewards/rejected": -0.050526730716228485,
"sft_loss": 0.9252007603645325,
"step": 160
},
{
"epoch": 0.40791841631673664,
"grad_norm": 1.759854296483571,
"learning_rate": 4.832266136358951e-06,
"logits/chosen": -0.12876208126544952,
"logits/rejected": 0.014335835352540016,
"logps/chosen": -0.8540490865707397,
"logps/rejected": -0.9863293766975403,
"loss": 0.926,
"odds_ratio_loss": 0.6714656352996826,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.04270245134830475,
"rewards/margins": 0.006614011712372303,
"rewards/rejected": -0.04931646212935448,
"sft_loss": 0.8540490865707397,
"step": 170
},
{
"epoch": 0.4319136172765447,
"grad_norm": 2.793191882203603,
"learning_rate": 4.808473168138675e-06,
"logits/chosen": 0.3617595136165619,
"logits/rejected": 0.3396950364112854,
"logps/chosen": -0.8613064885139465,
"logps/rejected": -1.0067331790924072,
"loss": 0.9162,
"odds_ratio_loss": 0.6582903861999512,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.04306532442569733,
"rewards/margins": 0.007271329872310162,
"rewards/rejected": -0.050336651504039764,
"sft_loss": 0.8613064885139465,
"step": 180
},
{
"epoch": 0.4559088182363527,
"grad_norm": 1.7774141067161418,
"learning_rate": 4.783169409729363e-06,
"logits/chosen": 0.9685203433036804,
"logits/rejected": 1.1009634733200073,
"logps/chosen": -0.8521540760993958,
"logps/rejected": -0.9150575399398804,
"loss": 0.9004,
"odds_ratio_loss": 0.7224193811416626,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.04260770231485367,
"rewards/margins": 0.0031451724935323,
"rewards/rejected": -0.0457528755068779,
"sft_loss": 0.8521540760993958,
"step": 190
},
{
"epoch": 0.47990401919616077,
"grad_norm": 2.052107783396207,
"learning_rate": 4.756371421284482e-06,
"logits/chosen": 0.33597105741500854,
"logits/rejected": 0.44187426567077637,
"logps/chosen": -0.8725342750549316,
"logps/rejected": -0.9003400802612305,
"loss": 0.919,
"odds_ratio_loss": 0.7135496735572815,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.04362671449780464,
"rewards/margins": 0.0013902939390391111,
"rewards/rejected": -0.04501700773835182,
"sft_loss": 0.8725342750549316,
"step": 200
},
{
"epoch": 0.5038992201559688,
"grad_norm": 2.3000145040966973,
"learning_rate": 4.728096740862778e-06,
"logits/chosen": 0.16287042200565338,
"logits/rejected": 0.35098087787628174,
"logps/chosen": -0.8514264822006226,
"logps/rejected": -0.9913795590400696,
"loss": 0.9096,
"odds_ratio_loss": 0.6634506583213806,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.042571328580379486,
"rewards/margins": 0.006997650023549795,
"rewards/rejected": -0.04956897348165512,
"sft_loss": 0.8514264822006226,
"step": 210
},
{
"epoch": 0.5278944211157769,
"grad_norm": 1.581079267248328,
"learning_rate": 4.698363872950406e-06,
"logits/chosen": 0.298981636762619,
"logits/rejected": 0.49268895387649536,
"logps/chosen": -0.8895601034164429,
"logps/rejected": -1.026539921760559,
"loss": 0.8744,
"odds_ratio_loss": 0.6685082316398621,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.04447800666093826,
"rewards/margins": 0.0068489923141896725,
"rewards/rejected": -0.051326997578144073,
"sft_loss": 0.8895601034164429,
"step": 220
},
{
"epoch": 0.5518896220755849,
"grad_norm": 1.7094822098553022,
"learning_rate": 4.6671922763505915e-06,
"logits/chosen": 0.34609514474868774,
"logits/rejected": 0.5052930116653442,
"logps/chosen": -0.863084614276886,
"logps/rejected": -0.9836879968643188,
"loss": 0.8905,
"odds_ratio_loss": 0.6813028454780579,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.043154239654541016,
"rewards/margins": 0.006030158139765263,
"rewards/rejected": -0.049184400588274,
"sft_loss": 0.863084614276886,
"step": 230
},
{
"epoch": 0.5758848230353929,
"grad_norm": 1.9367159826113498,
"learning_rate": 4.634602351448738e-06,
"logits/chosen": 0.286350816488266,
"logits/rejected": 0.3788919448852539,
"logps/chosen": -0.8919585943222046,
"logps/rejected": -0.9452742338180542,
"loss": 0.9133,
"odds_ratio_loss": 0.6905114650726318,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.04459793120622635,
"rewards/margins": 0.0026657807175070047,
"rewards/rejected": -0.04726371169090271,
"sft_loss": 0.8919585943222046,
"step": 240
},
{
"epoch": 0.5998800239952009,
"grad_norm": 2.0772847936555636,
"learning_rate": 4.6006154268613015e-06,
"logits/chosen": 0.4635019898414612,
"logits/rejected": 0.5444530248641968,
"logps/chosen": -0.8181222081184387,
"logps/rejected": -0.9908831715583801,
"loss": 0.8927,
"odds_ratio_loss": 0.6295598149299622,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.04090610891580582,
"rewards/margins": 0.008638045750558376,
"rewards/rejected": -0.04954415559768677,
"sft_loss": 0.8181222081184387,
"step": 250
},
{
"epoch": 0.623875224955009,
"grad_norm": 2.084215689408855,
"learning_rate": 4.565253745477187e-06,
"logits/chosen": 0.40253886580467224,
"logits/rejected": 0.4625183045864105,
"logps/chosen": -0.9301355481147766,
"logps/rejected": -1.0306508541107178,
"loss": 0.9162,
"odds_ratio_loss": 0.6872043609619141,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.04650677740573883,
"rewards/margins": 0.005025765858590603,
"rewards/rejected": -0.05153254419565201,
"sft_loss": 0.9301355481147766,
"step": 260
},
{
"epoch": 0.647870425914817,
"grad_norm": 1.9031984888179019,
"learning_rate": 4.528540449900799e-06,
"logits/chosen": 0.4078219532966614,
"logits/rejected": 0.6789823174476624,
"logps/chosen": -0.8785255551338196,
"logps/rejected": -0.9139087796211243,
"loss": 0.9176,
"odds_ratio_loss": 0.7333613038063049,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.04392627626657486,
"rewards/margins": 0.0017691642278805375,
"rewards/rejected": -0.04569543898105621,
"sft_loss": 0.8785255551338196,
"step": 270
},
{
"epoch": 0.671865626874625,
"grad_norm": 2.3067419173621113,
"learning_rate": 4.490499567306256e-06,
"logits/chosen": 0.304252564907074,
"logits/rejected": 0.5160123109817505,
"logps/chosen": -0.8951358795166016,
"logps/rejected": -0.9636558294296265,
"loss": 0.8917,
"odds_ratio_loss": 0.69621342420578,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.04475679248571396,
"rewards/margins": 0.0034259993117302656,
"rewards/rejected": -0.04818279296159744,
"sft_loss": 0.8951358795166016,
"step": 280
},
{
"epoch": 0.6958608278344331,
"grad_norm": 3.1297290877323003,
"learning_rate": 4.451155993712711e-06,
"logits/chosen": 0.25184166431427,
"logits/rejected": 0.43299436569213867,
"logps/chosen": -0.808620810508728,
"logps/rejected": -0.9780584573745728,
"loss": 0.9379,
"odds_ratio_loss": 0.6151310205459595,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.04043104499578476,
"rewards/margins": 0.008471880108118057,
"rewards/rejected": -0.048902928829193115,
"sft_loss": 0.808620810508728,
"step": 290
},
{
"epoch": 0.7198560287942412,
"grad_norm": 2.001570442654457,
"learning_rate": 4.410535477691041e-06,
"logits/chosen": 0.6736063957214355,
"logits/rejected": 0.8922637104988098,
"logps/chosen": -0.8743098974227905,
"logps/rejected": -1.0198915004730225,
"loss": 0.8962,
"odds_ratio_loss": 0.6545746326446533,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.043715499341487885,
"rewards/margins": 0.0072790831327438354,
"rewards/rejected": -0.05099458247423172,
"sft_loss": 0.8743098974227905,
"step": 300
},
{
"epoch": 0.7438512297540492,
"grad_norm": 3.088640251108737,
"learning_rate": 4.368664603512586e-06,
"logits/chosen": -0.10074709355831146,
"logits/rejected": 0.08682968467473984,
"logps/chosen": -0.7929955720901489,
"logps/rejected": -0.9449365735054016,
"loss": 0.8789,
"odds_ratio_loss": 0.6474851369857788,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.03964977711439133,
"rewards/margins": 0.007597046438604593,
"rewards/rejected": -0.047246821224689484,
"sft_loss": 0.7929955720901489,
"step": 310
},
{
"epoch": 0.7678464307138573,
"grad_norm": 2.278875813822025,
"learning_rate": 4.325570773750952e-06,
"logits/chosen": -0.22130906581878662,
"logits/rejected": -0.028980206698179245,
"logps/chosen": -0.8826779127120972,
"logps/rejected": -1.0213041305541992,
"loss": 0.9204,
"odds_ratio_loss": 0.6443883180618286,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.04413389414548874,
"rewards/margins": 0.006931307725608349,
"rewards/rejected": -0.05106520652770996,
"sft_loss": 0.8826779127120972,
"step": 320
},
{
"epoch": 0.7918416316736653,
"grad_norm": 1.6952516043840655,
"learning_rate": 4.281282191348289e-06,
"logits/chosen": 0.45927032828330994,
"logits/rejected": 0.6593443751335144,
"logps/chosen": -0.8378440141677856,
"logps/rejected": -0.9682254791259766,
"loss": 0.8995,
"odds_ratio_loss": 0.6620376110076904,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.04189220070838928,
"rewards/margins": 0.006519075483083725,
"rewards/rejected": -0.04841126874089241,
"sft_loss": 0.8378440141677856,
"step": 330
},
{
"epoch": 0.8158368326334733,
"grad_norm": 2.4806806819218794,
"learning_rate": 4.235827841157748e-06,
"logits/chosen": 0.01970214769244194,
"logits/rejected": 0.11670324951410294,
"logps/chosen": -0.8856766819953918,
"logps/rejected": -1.0817759037017822,
"loss": 0.8834,
"odds_ratio_loss": 0.6194185018539429,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.04428383335471153,
"rewards/margins": 0.009804959408938885,
"rewards/rejected": -0.054088789969682693,
"sft_loss": 0.8856766819953918,
"step": 340
},
{
"epoch": 0.8398320335932813,
"grad_norm": 1.5265892877639438,
"learning_rate": 4.1892374709742186e-06,
"logits/chosen": -0.7483745813369751,
"logits/rejected": -0.42045336961746216,
"logps/chosen": -0.7948485016822815,
"logps/rejected": -0.9918915033340454,
"loss": 0.9474,
"odds_ratio_loss": 0.5842909812927246,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.03974242880940437,
"rewards/margins": 0.009852146729826927,
"rewards/rejected": -0.04959457367658615,
"sft_loss": 0.7948485016822815,
"step": 350
},
{
"epoch": 0.8638272345530894,
"grad_norm": 2.1051154185205543,
"learning_rate": 4.141541572065762e-06,
"logits/chosen": 0.41192498803138733,
"logits/rejected": 0.5341157913208008,
"logps/chosen": -0.7971394658088684,
"logps/rejected": -0.9216561317443848,
"loss": 0.8881,
"odds_ratio_loss": 0.69920814037323,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.03985697776079178,
"rewards/margins": 0.0062258280813694,
"rewards/rejected": -0.04608280584216118,
"sft_loss": 0.7971394658088684,
"step": 360
},
{
"epoch": 0.8878224355128974,
"grad_norm": 2.049071087536336,
"learning_rate": 4.092771359218462e-06,
"logits/chosen": 0.2649831771850586,
"logits/rejected": 0.45568495988845825,
"logps/chosen": -0.8466150164604187,
"logps/rejected": -1.0025365352630615,
"loss": 0.9065,
"odds_ratio_loss": 0.629971444606781,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.042330749332904816,
"rewards/margins": 0.007796071469783783,
"rewards/rejected": -0.0501268208026886,
"sft_loss": 0.8466150164604187,
"step": 370
},
{
"epoch": 0.9118176364727054,
"grad_norm": 3.597524104140319,
"learning_rate": 4.04295875030778e-06,
"logits/chosen": -0.18752217292785645,
"logits/rejected": 0.15378537774085999,
"logps/chosen": -0.8704308271408081,
"logps/rejected": -0.9513336420059204,
"loss": 0.9014,
"odds_ratio_loss": 0.6948253512382507,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.043521542102098465,
"rewards/margins": 0.004045139066874981,
"rewards/rejected": -0.04756668210029602,
"sft_loss": 0.8704308271408081,
"step": 380
},
{
"epoch": 0.9358128374325135,
"grad_norm": 3.1405630532603395,
"learning_rate": 3.992136345409765e-06,
"logits/chosen": -0.1735876053571701,
"logits/rejected": -0.20124337077140808,
"logps/chosen": -0.9253339767456055,
"logps/rejected": -1.0305973291397095,
"loss": 0.9111,
"odds_ratio_loss": 0.6636070013046265,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.04626670479774475,
"rewards/margins": 0.005263164173811674,
"rewards/rejected": -0.051529865711927414,
"sft_loss": 0.9253339767456055,
"step": 390
},
{
"epoch": 0.9598080383923215,
"grad_norm": 2.4716790122788983,
"learning_rate": 3.940337405465786e-06,
"logits/chosen": 0.26361703872680664,
"logits/rejected": 0.44345617294311523,
"logps/chosen": -0.8355854153633118,
"logps/rejected": -1.0225704908370972,
"loss": 0.9062,
"odds_ratio_loss": 0.6545855402946472,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.04177927225828171,
"rewards/margins": 0.009349259547889233,
"rewards/rejected": -0.05112852901220322,
"sft_loss": 0.8355854153633118,
"step": 400
},
{
"epoch": 0.9838032393521295,
"grad_norm": 2.3985102639359406,
"learning_rate": 3.887595830514775e-06,
"logits/chosen": 0.21671700477600098,
"logits/rejected": 0.29912179708480835,
"logps/chosen": -0.809670090675354,
"logps/rejected": -1.0107569694519043,
"loss": 0.9029,
"odds_ratio_loss": 0.6326887011528015,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.0404835119843483,
"rewards/margins": 0.010054344311356544,
"rewards/rejected": -0.05053785443305969,
"sft_loss": 0.809670090675354,
"step": 410
},
{
"epoch": 1.0077984403119375,
"grad_norm": 1.6971594247197401,
"learning_rate": 3.833946137507195e-06,
"logits/chosen": 0.4990086555480957,
"logits/rejected": 0.616361141204834,
"logps/chosen": -0.8005359768867493,
"logps/rejected": -0.9603840708732605,
"loss": 0.8398,
"odds_ratio_loss": 0.6354148387908936,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.040026795119047165,
"rewards/margins": 0.007992411032319069,
"rewards/rejected": -0.04801920801401138,
"sft_loss": 0.8005359768867493,
"step": 420
},
{
"epoch": 1.0317936412717457,
"grad_norm": 2.2002987962167904,
"learning_rate": 3.779423437715274e-06,
"logits/chosen": 0.7601526975631714,
"logits/rejected": 0.8180352449417114,
"logps/chosen": -0.6671024560928345,
"logps/rejected": -0.9577730298042297,
"loss": 0.7742,
"odds_ratio_loss": 0.5807942152023315,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.03335512429475784,
"rewards/margins": 0.014533529989421368,
"rewards/rejected": -0.047888655215501785,
"sft_loss": 0.6671024560928345,
"step": 430
},
{
"epoch": 1.0557888422315538,
"grad_norm": 1.5148819350515028,
"learning_rate": 3.7240634137542864e-06,
"logits/chosen": 0.19566980004310608,
"logits/rejected": 0.3528198003768921,
"logps/chosen": -0.6874720454216003,
"logps/rejected": -1.0558958053588867,
"loss": 0.7663,
"odds_ratio_loss": 0.48211669921875,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.034373603761196136,
"rewards/margins": 0.01842118799686432,
"rewards/rejected": -0.052794791758060455,
"sft_loss": 0.6874720454216003,
"step": 440
},
{
"epoch": 1.0797840431913617,
"grad_norm": 1.6130353172110996,
"learning_rate": 3.6679022962299054e-06,
"logits/chosen": 0.8750432133674622,
"logits/rejected": 0.8553866147994995,
"logps/chosen": -0.7515122890472412,
"logps/rejected": -0.9563247561454773,
"loss": 0.7745,
"odds_ratio_loss": 0.5920617580413818,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.037575613707304,
"rewards/margins": 0.010240620002150536,
"rewards/rejected": -0.047816235572099686,
"sft_loss": 0.7515122890472412,
"step": 450
},
{
"epoch": 1.1037792441511698,
"grad_norm": 1.8444047185661667,
"learning_rate": 3.6109768400269336e-06,
"logits/chosen": 0.21664266288280487,
"logits/rejected": 0.3455556333065033,
"logps/chosen": -0.7820109128952026,
"logps/rejected": -1.1722263097763062,
"loss": 0.7949,
"odds_ratio_loss": 0.5249099731445312,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.03910055011510849,
"rewards/margins": 0.019510772079229355,
"rewards/rejected": -0.05861131474375725,
"sft_loss": 0.7820109128952026,
"step": 460
},
{
"epoch": 1.127774445110978,
"grad_norm": 1.923809039800638,
"learning_rate": 3.5533243002549044e-06,
"logits/chosen": -0.051299355924129486,
"logits/rejected": 0.12599964439868927,
"logps/chosen": -0.6766480803489685,
"logps/rejected": -0.9556339979171753,
"loss": 0.769,
"odds_ratio_loss": 0.5771059989929199,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.03383240848779678,
"rewards/margins": 0.013949294574558735,
"rewards/rejected": -0.047781698405742645,
"sft_loss": 0.6766480803489685,
"step": 470
},
{
"epoch": 1.1517696460707858,
"grad_norm": 2.0416324249302593,
"learning_rate": 3.4949824078663214e-06,
"logits/chosen": 0.3260158598423004,
"logits/rejected": 0.4627075791358948,
"logps/chosen": -0.6955934762954712,
"logps/rejected": -1.0405316352844238,
"loss": 0.7744,
"odds_ratio_loss": 0.5207543969154358,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.03477967530488968,
"rewards/margins": 0.017246905714273453,
"rewards/rejected": -0.05202658101916313,
"sft_loss": 0.6955934762954712,
"step": 480
},
{
"epoch": 1.175764847030594,
"grad_norm": 2.159701142475688,
"learning_rate": 3.4359893449634713e-06,
"logits/chosen": 0.10285909473896027,
"logits/rejected": 0.18586108088493347,
"logps/chosen": -0.7835036516189575,
"logps/rejected": -0.9662873148918152,
"loss": 0.7699,
"odds_ratio_loss": 0.6257883310317993,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.03917517885565758,
"rewards/margins": 0.009139184840023518,
"rewards/rejected": -0.04831436648964882,
"sft_loss": 0.7835036516189575,
"step": 490
},
{
"epoch": 1.1997600479904018,
"grad_norm": 1.905386181833648,
"learning_rate": 3.3763837198099807e-06,
"logits/chosen": 0.2618166208267212,
"logits/rejected": 0.403994083404541,
"logps/chosen": -0.7472913861274719,
"logps/rejected": -0.9723391532897949,
"loss": 0.8034,
"odds_ratio_loss": 0.5758217573165894,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.03736456483602524,
"rewards/margins": 0.011252395808696747,
"rewards/rejected": -0.048616960644721985,
"sft_loss": 0.7472913861274719,
"step": 500
},
{
"epoch": 1.22375524895021,
"grad_norm": 1.8483335773730425,
"learning_rate": 3.3162045415634793e-06,
"logits/chosen": -0.06936601549386978,
"logits/rejected": 0.15932008624076843,
"logps/chosen": -0.7298214435577393,
"logps/rejected": -0.989848792552948,
"loss": 0.764,
"odds_ratio_loss": 0.5586143136024475,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.036491066217422485,
"rewards/margins": 0.013001373037695885,
"rewards/rejected": -0.04949244111776352,
"sft_loss": 0.7298214435577393,
"step": 510
},
{
"epoch": 1.247750449910018,
"grad_norm": 1.4105189905656275,
"learning_rate": 3.255491194745878e-06,
"logits/chosen": -0.0699717178940773,
"logits/rejected": 0.11926586925983429,
"logps/chosen": -0.7712666988372803,
"logps/rejected": -1.0007984638214111,
"loss": 0.7514,
"odds_ratio_loss": 0.576269805431366,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.03856333717703819,
"rewards/margins": 0.011476586572825909,
"rewards/rejected": -0.050039924681186676,
"sft_loss": 0.7712666988372803,
"step": 520
},
{
"epoch": 1.2717456508698262,
"grad_norm": 1.5086406745902339,
"learning_rate": 3.1942834134680123e-06,
"logits/chosen": -0.4110763669013977,
"logits/rejected": -0.197097510099411,
"logps/chosen": -0.7337836027145386,
"logps/rejected": -1.0581499338150024,
"loss": 0.747,
"odds_ratio_loss": 0.5731949806213379,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.03668918460607529,
"rewards/margins": 0.016218315809965134,
"rewards/rejected": -0.05290750414133072,
"sft_loss": 0.7337836027145386,
"step": 530
},
{
"epoch": 1.295740851829634,
"grad_norm": 2.007767969966132,
"learning_rate": 3.13262125542547e-06,
"logits/chosen": 0.24464428424835205,
"logits/rejected": 0.42607539892196655,
"logps/chosen": -0.8008230328559875,
"logps/rejected": -1.019913911819458,
"loss": 0.7839,
"odds_ratio_loss": 0.5772299766540527,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.04004114866256714,
"rewards/margins": 0.010954543016850948,
"rewards/rejected": -0.05099569633603096,
"sft_loss": 0.8008230328559875,
"step": 540
},
{
"epoch": 1.3197360527894422,
"grad_norm": 2.031522996603775,
"learning_rate": 3.0705450756826707e-06,
"logits/chosen": -0.6761570572853088,
"logits/rejected": -0.5336428880691528,
"logps/chosen": -0.7791737914085388,
"logps/rejected": -0.9758432507514954,
"loss": 0.7734,
"odds_ratio_loss": 0.5955380201339722,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.03895869478583336,
"rewards/margins": 0.009833470918238163,
"rewards/rejected": -0.04879216477274895,
"sft_loss": 0.7791737914085388,
"step": 550
},
{
"epoch": 1.34373125374925,
"grad_norm": 1.8127230145286217,
"learning_rate": 3.00809550026231e-06,
"logits/chosen": 0.7122937440872192,
"logits/rejected": 0.8374090194702148,
"logps/chosen": -0.7448546290397644,
"logps/rejected": -1.0183660984039307,
"loss": 0.7313,
"odds_ratio_loss": 0.5605376362800598,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.03724273294210434,
"rewards/margins": 0.01367556769400835,
"rewards/rejected": -0.050918303430080414,
"sft_loss": 0.7448546290397644,
"step": 560
},
{
"epoch": 1.3677264547090582,
"grad_norm": 1.6102410365866324,
"learning_rate": 2.9453133995574955e-06,
"logits/chosen": 0.1695878505706787,
"logits/rejected": 0.34987810254096985,
"logps/chosen": -0.7041548490524292,
"logps/rejected": -1.1295292377471924,
"loss": 0.7529,
"odds_ratio_loss": 0.5541011095046997,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.03520774096250534,
"rewards/margins": 0.02126871421933174,
"rewards/rejected": -0.05647646263241768,
"sft_loss": 0.7041548490524292,
"step": 570
},
{
"epoch": 1.3917216556688663,
"grad_norm": 2.0516481147792964,
"learning_rate": 2.8822398615839337e-06,
"logits/chosen": -0.15236589312553406,
"logits/rejected": 0.005555987358093262,
"logps/chosen": -0.7019264698028564,
"logps/rejected": -0.9463084936141968,
"loss": 0.7377,
"odds_ratio_loss": 0.5546727180480957,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.03509632498025894,
"rewards/margins": 0.012219103053212166,
"rewards/rejected": -0.04731542617082596,
"sft_loss": 0.7019264698028564,
"step": 580
},
{
"epoch": 1.4157168566286742,
"grad_norm": 2.5703275268486463,
"learning_rate": 2.8189161650897045e-06,
"logits/chosen": 0.09915417432785034,
"logits/rejected": 0.2876579761505127,
"logps/chosen": -0.7416352033615112,
"logps/rejected": -0.9542354345321655,
"loss": 0.7748,
"odds_ratio_loss": 0.5765627026557922,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0370817631483078,
"rewards/margins": 0.010630009695887566,
"rewards/rejected": -0.04771176725625992,
"sft_loss": 0.7416352033615112,
"step": 590
},
{
"epoch": 1.4397120575884823,
"grad_norm": 1.6574957139548097,
"learning_rate": 2.7553837525402095e-06,
"logits/chosen": 0.14950448274612427,
"logits/rejected": 0.14670611917972565,
"logps/chosen": -0.7459922432899475,
"logps/rejected": -0.9438718557357788,
"loss": 0.764,
"odds_ratio_loss": 0.6029990911483765,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.037299610674381256,
"rewards/margins": 0.009893985465168953,
"rewards/rejected": -0.04719359427690506,
"sft_loss": 0.7459922432899475,
"step": 600
},
{
"epoch": 1.4637072585482904,
"grad_norm": 1.5955732799355493,
"learning_rate": 2.691684202995966e-06,
"logits/chosen": 0.43530672788619995,
"logits/rejected": 0.4994083344936371,
"logps/chosen": -0.8142836689949036,
"logps/rejected": -0.9706009030342102,
"loss": 0.7559,
"odds_ratio_loss": 0.7006958723068237,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.04071418568491936,
"rewards/margins": 0.007815859280526638,
"rewards/rejected": -0.04853004962205887,
"sft_loss": 0.8142836689949036,
"step": 610
},
{
"epoch": 1.4877024595080983,
"grad_norm": 1.9589861397245603,
"learning_rate": 2.6278592049010204e-06,
"logits/chosen": -0.19675548374652863,
"logits/rejected": -0.004504656884819269,
"logps/chosen": -0.7537368535995483,
"logps/rejected": -1.0135046243667603,
"loss": 0.7741,
"odds_ratio_loss": 0.5691729187965393,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.03768684342503548,
"rewards/margins": 0.012988388538360596,
"rewards/rejected": -0.050675224512815475,
"sft_loss": 0.7537368535995483,
"step": 620
},
{
"epoch": 1.5116976604679064,
"grad_norm": 1.7255875955000524,
"learning_rate": 2.5639505287997584e-06,
"logits/chosen": 0.3145737051963806,
"logits/rejected": 0.47394928336143494,
"logps/chosen": -0.7314926385879517,
"logps/rejected": -1.001952886581421,
"loss": 0.7829,
"odds_ratio_loss": 0.5629433393478394,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.03657463565468788,
"rewards/margins": 0.013523015193641186,
"rewards/rejected": -0.050097644329071045,
"sft_loss": 0.7314926385879517,
"step": 630
},
{
"epoch": 1.5356928614277146,
"grad_norm": 2.504847023988975,
"learning_rate": 2.5e-06,
"logits/chosen": 0.2320265769958496,
"logits/rejected": 0.3284027874469757,
"logps/chosen": -0.7656562924385071,
"logps/rejected": -1.076923131942749,
"loss": 0.7503,
"odds_ratio_loss": 0.584337592124939,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.038282815366983414,
"rewards/margins": 0.015563338994979858,
"rewards/rejected": -0.053846150636672974,
"sft_loss": 0.7656562924385071,
"step": 640
},
{
"epoch": 1.5596880623875224,
"grad_norm": 1.4394266237384084,
"learning_rate": 2.436049471200242e-06,
"logits/chosen": -0.5206400156021118,
"logits/rejected": -0.38631540536880493,
"logps/chosen": -0.8094362020492554,
"logps/rejected": -0.9923938512802124,
"loss": 0.7752,
"odds_ratio_loss": 0.5967071056365967,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.04047181457281113,
"rewards/margins": 0.00914788618683815,
"rewards/rejected": -0.04961969703435898,
"sft_loss": 0.8094362020492554,
"step": 650
},
{
"epoch": 1.5836832633473306,
"grad_norm": 1.7625452374002906,
"learning_rate": 2.3721407950989804e-06,
"logits/chosen": -0.24351301789283752,
"logits/rejected": -0.07003232091665268,
"logps/chosen": -0.6876959800720215,
"logps/rejected": -0.9035342335700989,
"loss": 0.7734,
"odds_ratio_loss": 0.5917103290557861,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.034384798258543015,
"rewards/margins": 0.010791914537549019,
"rewards/rejected": -0.045176707208156586,
"sft_loss": 0.6876959800720215,
"step": 660
},
{
"epoch": 1.6076784643071385,
"grad_norm": 1.6046093499190943,
"learning_rate": 2.3083157970040344e-06,
"logits/chosen": 0.5633162260055542,
"logits/rejected": 0.6462755799293518,
"logps/chosen": -0.7524802684783936,
"logps/rejected": -1.0558850765228271,
"loss": 0.7563,
"odds_ratio_loss": 0.552274227142334,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.03762401267886162,
"rewards/margins": 0.015170246362686157,
"rewards/rejected": -0.05279426649212837,
"sft_loss": 0.7524802684783936,
"step": 670
},
{
"epoch": 1.6316736652669466,
"grad_norm": 2.117352018263469,
"learning_rate": 2.2446162474597913e-06,
"logits/chosen": 0.43944865465164185,
"logits/rejected": 0.5002392530441284,
"logps/chosen": -0.7501770257949829,
"logps/rejected": -0.9691005945205688,
"loss": 0.7699,
"odds_ratio_loss": 0.5791727304458618,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.037508852779865265,
"rewards/margins": 0.010946177877485752,
"rewards/rejected": -0.04845503345131874,
"sft_loss": 0.7501770257949829,
"step": 680
},
{
"epoch": 1.6556688662267547,
"grad_norm": 1.6685249776962552,
"learning_rate": 2.1810838349102963e-06,
"logits/chosen": 0.16153453290462494,
"logits/rejected": 0.20878514647483826,
"logps/chosen": -0.7516240477561951,
"logps/rejected": -1.0250643491744995,
"loss": 0.7666,
"odds_ratio_loss": 0.5872852206230164,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.03758120536804199,
"rewards/margins": 0.013672016561031342,
"rewards/rejected": -0.051253218203783035,
"sft_loss": 0.7516240477561951,
"step": 690
},
{
"epoch": 1.6796640671865628,
"grad_norm": 2.782782057649718,
"learning_rate": 2.117760138416067e-06,
"logits/chosen": 0.24376201629638672,
"logits/rejected": 0.44258540868759155,
"logps/chosen": -0.6985687017440796,
"logps/rejected": -1.0050299167633057,
"loss": 0.7614,
"odds_ratio_loss": 0.543103814125061,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.03492843732237816,
"rewards/margins": 0.015323063358664513,
"rewards/rejected": -0.05025150254368782,
"sft_loss": 0.6985687017440796,
"step": 700
},
{
"epoch": 1.7036592681463707,
"grad_norm": 1.5369658154698735,
"learning_rate": 2.0546866004425053e-06,
"logits/chosen": 0.3964254558086395,
"logits/rejected": 0.4900701642036438,
"logps/chosen": -0.7590494155883789,
"logps/rejected": -1.2440413236618042,
"loss": 0.7652,
"odds_ratio_loss": 0.5372438430786133,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.037952471524477005,
"rewards/margins": 0.024249596521258354,
"rewards/rejected": -0.06220207363367081,
"sft_loss": 0.7590494155883789,
"step": 710
},
{
"epoch": 1.7276544691061788,
"grad_norm": 1.9970193945029362,
"learning_rate": 1.9919044997376906e-06,
"logits/chosen": 0.6031176447868347,
"logits/rejected": 0.7783833742141724,
"logps/chosen": -0.7290822267532349,
"logps/rejected": -1.021554946899414,
"loss": 0.7176,
"odds_ratio_loss": 0.557815432548523,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.03645411133766174,
"rewards/margins": 0.014623639173805714,
"rewards/rejected": -0.051077745854854584,
"sft_loss": 0.7290822267532349,
"step": 720
},
{
"epoch": 1.7516496700659867,
"grad_norm": 2.558147455560064,
"learning_rate": 1.9294549243173306e-06,
"logits/chosen": -0.027294237166643143,
"logits/rejected": 0.11035363376140594,
"logps/chosen": -0.7765438556671143,
"logps/rejected": -1.0300321578979492,
"loss": 0.7771,
"odds_ratio_loss": 0.5954040884971619,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.03882719203829765,
"rewards/margins": 0.012674416415393353,
"rewards/rejected": -0.05150160938501358,
"sft_loss": 0.7765438556671143,
"step": 730
},
{
"epoch": 1.7756448710257948,
"grad_norm": 2.346615273317464,
"learning_rate": 1.8673787445745298e-06,
"logits/chosen": -0.449845552444458,
"logits/rejected": -0.3746832311153412,
"logps/chosen": -0.7114017605781555,
"logps/rejected": -0.928491473197937,
"loss": 0.7699,
"odds_ratio_loss": 0.5795110464096069,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.035570088773965836,
"rewards/margins": 0.010854486376047134,
"rewards/rejected": -0.04642457515001297,
"sft_loss": 0.7114017605781555,
"step": 740
},
{
"epoch": 1.799640071985603,
"grad_norm": 1.995371230537378,
"learning_rate": 1.805716586531988e-06,
"logits/chosen": -0.13443303108215332,
"logits/rejected": 0.014731263741850853,
"logps/chosen": -0.8079891204833984,
"logps/rejected": -1.0810317993164062,
"loss": 0.7825,
"odds_ratio_loss": 0.6112096309661865,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0403994545340538,
"rewards/margins": 0.013652140274643898,
"rewards/rejected": -0.05405158922076225,
"sft_loss": 0.8079891204833984,
"step": 750
},
{
"epoch": 1.823635272945411,
"grad_norm": 1.8742057389590454,
"learning_rate": 1.7445088052541218e-06,
"logits/chosen": 0.046121031045913696,
"logits/rejected": 0.1955467015504837,
"logps/chosen": -0.7093559503555298,
"logps/rejected": -1.0484099388122559,
"loss": 0.7617,
"odds_ratio_loss": 0.5657014846801758,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.03546779602766037,
"rewards/margins": 0.016952697187662125,
"rewards/rejected": -0.05242049694061279,
"sft_loss": 0.7093559503555298,
"step": 760
},
{
"epoch": 1.847630473905219,
"grad_norm": 1.2680203881504901,
"learning_rate": 1.6837954584365217e-06,
"logits/chosen": 0.4459083080291748,
"logits/rejected": 0.5636454224586487,
"logps/chosen": -0.7526987195014954,
"logps/rejected": -1.009804606437683,
"loss": 0.7871,
"odds_ratio_loss": 0.5556772947311401,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.03763493150472641,
"rewards/margins": 0.012855296023190022,
"rewards/rejected": -0.050490230321884155,
"sft_loss": 0.7526987195014954,
"step": 770
},
{
"epoch": 1.8716256748650268,
"grad_norm": 1.9254646582677224,
"learning_rate": 1.6236162801900191e-06,
"logits/chosen": -0.10451897233724594,
"logits/rejected": 0.3060254156589508,
"logps/chosen": -0.6585639715194702,
"logps/rejected": -0.9869001507759094,
"loss": 0.71,
"odds_ratio_loss": 0.4942260682582855,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.03292820230126381,
"rewards/margins": 0.016416804865002632,
"rewards/rejected": -0.04934500530362129,
"sft_loss": 0.6585639715194702,
"step": 780
},
{
"epoch": 1.895620875824835,
"grad_norm": 1.9904836511656812,
"learning_rate": 1.5640106550365298e-06,
"logits/chosen": 0.11656351387500763,
"logits/rejected": 0.29824742674827576,
"logps/chosen": -0.7831540703773499,
"logps/rejected": -1.0284688472747803,
"loss": 0.7758,
"odds_ratio_loss": 0.5839165449142456,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.03915770351886749,
"rewards/margins": 0.01226573996245861,
"rewards/rejected": -0.051423441618680954,
"sft_loss": 0.7831540703773499,
"step": 790
},
{
"epoch": 1.919616076784643,
"grad_norm": 1.7061927534288226,
"learning_rate": 1.5050175921336797e-06,
"logits/chosen": 0.14354857802391052,
"logits/rejected": 0.27334246039390564,
"logps/chosen": -0.7474446892738342,
"logps/rejected": -0.9480558633804321,
"loss": 0.7575,
"odds_ratio_loss": 0.6441240310668945,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.03737223893404007,
"rewards/margins": 0.010030550882220268,
"rewards/rejected": -0.04740279167890549,
"sft_loss": 0.7474446892738342,
"step": 800
},
{
"epoch": 1.9436112777444512,
"grad_norm": 2.251879648695612,
"learning_rate": 1.446675699745097e-06,
"logits/chosen": 0.25183239579200745,
"logits/rejected": 0.38326969742774963,
"logps/chosen": -0.7823570966720581,
"logps/rejected": -0.9946805238723755,
"loss": 0.8037,
"odds_ratio_loss": 0.6080455183982849,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.03911786153912544,
"rewards/margins": 0.010616169311106205,
"rewards/rejected": -0.049734026193618774,
"sft_loss": 0.7823570966720581,
"step": 810
},
{
"epoch": 1.9676064787042593,
"grad_norm": 1.9391362449031262,
"learning_rate": 1.3890231599730674e-06,
"logits/chosen": 0.31725913286209106,
"logits/rejected": 0.5106421709060669,
"logps/chosen": -0.7221857309341431,
"logps/rejected": -0.9829575419425964,
"loss": 0.7904,
"odds_ratio_loss": 0.5538625121116638,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.03610928729176521,
"rewards/margins": 0.013038587756454945,
"rewards/rejected": -0.049147870391607285,
"sft_loss": 0.7221857309341431,
"step": 820
},
{
"epoch": 1.9916016796640672,
"grad_norm": 1.5457295502049215,
"learning_rate": 1.3320977037700952e-06,
"logits/chosen": 0.8291665315628052,
"logits/rejected": 1.1122350692749023,
"logps/chosen": -0.6864774227142334,
"logps/rejected": -1.0247427225112915,
"loss": 0.7452,
"odds_ratio_loss": 0.49447354674339294,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.03432386741042137,
"rewards/margins": 0.016913266852498055,
"rewards/rejected": -0.051237136125564575,
"sft_loss": 0.6864774227142334,
"step": 830
},
{
"epoch": 2.015596880623875,
"grad_norm": 1.5016852289986733,
"learning_rate": 1.2759365862457148e-06,
"logits/chosen": -0.4956502318382263,
"logits/rejected": -0.1621031016111374,
"logps/chosen": -0.7308815717697144,
"logps/rejected": -0.9828909039497375,
"loss": 0.7173,
"odds_ratio_loss": 0.5487710237503052,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.0365440808236599,
"rewards/margins": 0.012600463815033436,
"rewards/rejected": -0.049144547432661057,
"sft_loss": 0.7308815717697144,
"step": 840
},
{
"epoch": 2.039592081583683,
"grad_norm": 1.622924065562837,
"learning_rate": 1.2205765622847273e-06,
"logits/chosen": -0.12397761642932892,
"logits/rejected": 0.08023932576179504,
"logps/chosen": -0.6277745962142944,
"logps/rejected": -1.0955206155776978,
"loss": 0.6995,
"odds_ratio_loss": 0.4475070536136627,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.03138873726129532,
"rewards/margins": 0.023387301713228226,
"rewards/rejected": -0.054776035249233246,
"sft_loss": 0.6277745962142944,
"step": 850
},
{
"epoch": 2.0635872825434913,
"grad_norm": 1.4741935497367946,
"learning_rate": 1.1660538624928062e-06,
"logits/chosen": -0.3639386296272278,
"logits/rejected": -0.2011258602142334,
"logps/chosen": -0.6642920970916748,
"logps/rejected": -1.0270217657089233,
"loss": 0.7019,
"odds_ratio_loss": 0.4971997141838074,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.03321460261940956,
"rewards/margins": 0.018136484548449516,
"rewards/rejected": -0.05135108903050423,
"sft_loss": 0.6642920970916748,
"step": 860
},
{
"epoch": 2.0875824835032994,
"grad_norm": 1.7172174730539993,
"learning_rate": 1.112404169485226e-06,
"logits/chosen": -0.3923923075199127,
"logits/rejected": -0.10327514261007309,
"logps/chosen": -0.5645719766616821,
"logps/rejected": -1.071115255355835,
"loss": 0.6681,
"odds_ratio_loss": 0.42052555084228516,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.028228599578142166,
"rewards/margins": 0.025327179580926895,
"rewards/rejected": -0.053555767983198166,
"sft_loss": 0.5645719766616821,
"step": 870
},
{
"epoch": 2.1115776844631076,
"grad_norm": 1.1474314844125568,
"learning_rate": 1.0596625945342148e-06,
"logits/chosen": -0.008033117279410362,
"logits/rejected": 0.16419892013072968,
"logps/chosen": -0.7100299000740051,
"logps/rejected": -0.9733055233955383,
"loss": 0.6813,
"odds_ratio_loss": 0.5328400731086731,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.03550150245428085,
"rewards/margins": 0.013163777068257332,
"rewards/rejected": -0.048665277659893036,
"sft_loss": 0.7100299000740051,
"step": 880
},
{
"epoch": 2.1355728854229152,
"grad_norm": 2.1383619388719515,
"learning_rate": 1.0078636545902363e-06,
"logits/chosen": -0.4247666001319885,
"logits/rejected": -0.17631380259990692,
"logps/chosen": -0.6582883596420288,
"logps/rejected": -1.0547147989273071,
"loss": 0.6895,
"odds_ratio_loss": 0.47398701310157776,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.0329144187271595,
"rewards/margins": 0.019821325317025185,
"rewards/rejected": -0.05273573845624924,
"sft_loss": 0.6582883596420288,
"step": 890
},
{
"epoch": 2.1595680863827234,
"grad_norm": 1.5320300236939732,
"learning_rate": 9.570412496922198e-07,
"logits/chosen": -0.27953624725341797,
"logits/rejected": -0.08715387433767319,
"logps/chosen": -0.5965186357498169,
"logps/rejected": -1.154284119606018,
"loss": 0.6738,
"odds_ratio_loss": 0.4240815043449402,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.029825935140252113,
"rewards/margins": 0.02788827195763588,
"rewards/rejected": -0.05771421268582344,
"sft_loss": 0.5965186357498169,
"step": 900
},
{
"epoch": 2.1835632873425315,
"grad_norm": 1.6204787225170885,
"learning_rate": 9.07228640781539e-07,
"logits/chosen": 0.368365079164505,
"logits/rejected": 0.6101259589195251,
"logps/chosen": -0.6893322467803955,
"logps/rejected": -1.0903311967849731,
"loss": 0.6791,
"odds_ratio_loss": 0.4818887710571289,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.03446660935878754,
"rewards/margins": 0.02004995197057724,
"rewards/rejected": -0.054516565054655075,
"sft_loss": 0.6893322467803955,
"step": 910
},
{
"epoch": 2.2075584883023396,
"grad_norm": 1.290844558254926,
"learning_rate": 8.584584279342392e-07,
"logits/chosen": -0.16083380579948425,
"logits/rejected": -0.10739579051733017,
"logps/chosen": -0.6938862800598145,
"logps/rejected": -0.9513536691665649,
"loss": 0.6888,
"odds_ratio_loss": 0.5428452491760254,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.034694310277700424,
"rewards/margins": 0.012873371131718159,
"rewards/rejected": -0.047567687928676605,
"sft_loss": 0.6938862800598145,
"step": 920
},
{
"epoch": 2.2315536892621477,
"grad_norm": 1.5229766148545818,
"learning_rate": 8.10762529025782e-07,
"logits/chosen": -0.4659739136695862,
"logits/rejected": -0.4786594808101654,
"logps/chosen": -0.6584521532058716,
"logps/rejected": -0.8917843699455261,
"loss": 0.65,
"odds_ratio_loss": 0.5486137866973877,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.03292260691523552,
"rewards/margins": 0.011666612699627876,
"rewards/rejected": -0.044589221477508545,
"sft_loss": 0.6584521532058716,
"step": 930
},
{
"epoch": 2.255548890221956,
"grad_norm": 1.7015940933867517,
"learning_rate": 7.641721588422526e-07,
"logits/chosen": -0.009342163801193237,
"logits/rejected": 0.1280032843351364,
"logps/chosen": -0.6387184262275696,
"logps/rejected": -1.049140453338623,
"loss": 0.687,
"odds_ratio_loss": 0.4773840010166168,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0319359228014946,
"rewards/margins": 0.020521100610494614,
"rewards/rejected": -0.05245702341198921,
"sft_loss": 0.6387184262275696,
"step": 940
},
{
"epoch": 2.2795440911817635,
"grad_norm": 1.4203319350991257,
"learning_rate": 7.187178086517116e-07,
"logits/chosen": 0.14468683302402496,
"logits/rejected": 0.2608656883239746,
"logps/chosen": -0.6514204144477844,
"logps/rejected": -1.2591578960418701,
"loss": 0.6695,
"odds_ratio_loss": 0.455849826335907,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.03257102146744728,
"rewards/margins": 0.03038688376545906,
"rewards/rejected": -0.06295789778232574,
"sft_loss": 0.6514204144477844,
"step": 950
},
{
"epoch": 2.3035392921415716,
"grad_norm": 1.7783791010197938,
"learning_rate": 6.74429226249049e-07,
"logits/chosen": 0.09898465871810913,
"logits/rejected": 0.21373791992664337,
"logps/chosen": -0.6381307244300842,
"logps/rejected": -0.9742431640625,
"loss": 0.6712,
"odds_ratio_loss": 0.49530988931655884,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.03190653771162033,
"rewards/margins": 0.016805628314614296,
"rewards/rejected": -0.04871216416358948,
"sft_loss": 0.6381307244300842,
"step": 960
},
{
"epoch": 2.3275344931013797,
"grad_norm": 1.6090454208525553,
"learning_rate": 6.313353964874155e-07,
"logits/chosen": 0.1333683431148529,
"logits/rejected": 0.3417516350746155,
"logps/chosen": -0.6887052655220032,
"logps/rejected": -1.0016798973083496,
"loss": 0.6673,
"odds_ratio_loss": 0.5059822797775269,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.03443526476621628,
"rewards/margins": 0.01564873196184635,
"rewards/rejected": -0.05008399486541748,
"sft_loss": 0.6887052655220032,
"step": 970
},
{
"epoch": 2.351529694061188,
"grad_norm": 1.6382111002720514,
"learning_rate": 5.894645223089584e-07,
"logits/chosen": 0.7236309051513672,
"logits/rejected": 0.8550646901130676,
"logps/chosen": -0.6779772639274597,
"logps/rejected": -1.2183148860931396,
"loss": 0.6958,
"odds_ratio_loss": 0.448292076587677,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.033898863941431046,
"rewards/margins": 0.027016881853342056,
"rewards/rejected": -0.0609157457947731,
"sft_loss": 0.6779772639274597,
"step": 980
},
{
"epoch": 2.375524895020996,
"grad_norm": 1.680992010239421,
"learning_rate": 5.48844006287289e-07,
"logits/chosen": 0.12925365567207336,
"logits/rejected": 0.3167954981327057,
"logps/chosen": -0.6692675352096558,
"logps/rejected": -1.0140740871429443,
"loss": 0.6691,
"odds_ratio_loss": 0.4763975143432617,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.033463381230831146,
"rewards/margins": 0.01724032498896122,
"rewards/rejected": -0.050703711807727814,
"sft_loss": 0.6692675352096558,
"step": 990
},
{
"epoch": 2.3995200959808036,
"grad_norm": 1.544720546176764,
"learning_rate": 5.095004326937445e-07,
"logits/chosen": -0.4231066107749939,
"logits/rejected": -0.20230142772197723,
"logps/chosen": -0.6737790107727051,
"logps/rejected": -1.0810075998306274,
"loss": 0.6744,
"odds_ratio_loss": 0.4769432544708252,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.033688947558403015,
"rewards/margins": 0.02036142908036709,
"rewards/rejected": -0.05405038595199585,
"sft_loss": 0.6737790107727051,
"step": 1000
},
{
"epoch": 2.4235152969406117,
"grad_norm": 1.7400382431256138,
"learning_rate": 4.71459550099202e-07,
"logits/chosen": 0.2943962812423706,
"logits/rejected": 0.5343393087387085,
"logps/chosen": -0.6686779856681824,
"logps/rejected": -1.0820672512054443,
"loss": 0.7078,
"odds_ratio_loss": 0.5010559558868408,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.03343390300869942,
"rewards/margins": 0.020669464021921158,
"rewards/rejected": -0.054103363305330276,
"sft_loss": 0.6686779856681824,
"step": 1010
},
{
"epoch": 2.44751049790042,
"grad_norm": 1.548219424075948,
"learning_rate": 4.347462545228134e-07,
"logits/chosen": 0.13567771017551422,
"logits/rejected": 0.31968480348587036,
"logps/chosen": -0.6244124174118042,
"logps/rejected": -1.05476975440979,
"loss": 0.6563,
"odds_ratio_loss": 0.4984089732170105,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.03122062422335148,
"rewards/margins": 0.021517863497138023,
"rewards/rejected": -0.052738480269908905,
"sft_loss": 0.6244124174118042,
"step": 1020
},
{
"epoch": 2.471505698860228,
"grad_norm": 1.4610216249122747,
"learning_rate": 3.9938457313869914e-07,
"logits/chosen": -0.08544759452342987,
"logits/rejected": 0.07162941992282867,
"logps/chosen": -0.7579829096794128,
"logps/rejected": -1.1255767345428467,
"loss": 0.6864,
"odds_ratio_loss": 0.547897458076477,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.03789914771914482,
"rewards/margins": 0.01837969198822975,
"rewards/rejected": -0.05627884343266487,
"sft_loss": 0.7579829096794128,
"step": 1030
},
{
"epoch": 2.495500899820036,
"grad_norm": 1.6006797776983446,
"learning_rate": 3.6539764855126224e-07,
"logits/chosen": -0.23340921103954315,
"logits/rejected": -0.1814245879650116,
"logps/chosen": -0.6439553499221802,
"logps/rejected": -1.0276587009429932,
"loss": 0.6617,
"odds_ratio_loss": 0.5049816370010376,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.03219776228070259,
"rewards/margins": 0.019185172393918037,
"rewards/rejected": -0.05138293653726578,
"sft_loss": 0.6439553499221802,
"step": 1040
},
{
"epoch": 2.519496100779844,
"grad_norm": 2.318524117790848,
"learning_rate": 3.328077236494087e-07,
"logits/chosen": -0.12850667536258698,
"logits/rejected": 0.07032374292612076,
"logps/chosen": -0.5922039747238159,
"logps/rejected": -1.0730435848236084,
"loss": 0.6694,
"odds_ratio_loss": 0.43941235542297363,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.029610196128487587,
"rewards/margins": 0.024041980504989624,
"rewards/rejected": -0.05365217477083206,
"sft_loss": 0.5922039747238159,
"step": 1050
},
{
"epoch": 2.5434913017396523,
"grad_norm": 1.8087989245838814,
"learning_rate": 3.0163612704959486e-07,
"logits/chosen": -0.6611061692237854,
"logits/rejected": -0.5293869376182556,
"logps/chosen": -0.6281863451004028,
"logps/rejected": -0.9944284558296204,
"loss": 0.6705,
"odds_ratio_loss": 0.47698038816452026,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.03140931576490402,
"rewards/margins": 0.018312102183699608,
"rewards/rejected": -0.04972142353653908,
"sft_loss": 0.6281863451004028,
"step": 1060
},
{
"epoch": 2.56748650269946,
"grad_norm": 1.5444353690364836,
"learning_rate": 2.71903259137222e-07,
"logits/chosen": 0.411745548248291,
"logits/rejected": 0.4236873686313629,
"logps/chosen": -0.611006498336792,
"logps/rejected": -1.0047032833099365,
"loss": 0.672,
"odds_ratio_loss": 0.48614612221717834,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.03055032715201378,
"rewards/margins": 0.019684839993715286,
"rewards/rejected": -0.050235163420438766,
"sft_loss": 0.611006498336792,
"step": 1070
},
{
"epoch": 2.591481703659268,
"grad_norm": 2.593043127599419,
"learning_rate": 2.436285787155185e-07,
"logits/chosen": 0.316955029964447,
"logits/rejected": 0.47285112738609314,
"logps/chosen": -0.6786519885063171,
"logps/rejected": -1.2019875049591064,
"loss": 0.6881,
"odds_ratio_loss": 0.4908427298069,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.03393259644508362,
"rewards/margins": 0.026166772469878197,
"rewards/rejected": -0.060099370777606964,
"sft_loss": 0.6786519885063171,
"step": 1080
},
{
"epoch": 2.6154769046190762,
"grad_norm": 2.2050381193088207,
"learning_rate": 2.168305902706383e-07,
"logits/chosen": -0.4541945457458496,
"logits/rejected": -0.18702273070812225,
"logps/chosen": -0.7026795148849487,
"logps/rejected": -0.962356448173523,
"loss": 0.6583,
"odds_ratio_loss": 0.5365189909934998,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.035133976489305496,
"rewards/margins": 0.012983846478164196,
"rewards/rejected": -0.04811782017350197,
"sft_loss": 0.7026795148849487,
"step": 1090
},
{
"epoch": 2.6394721055788843,
"grad_norm": 1.6921175899136245,
"learning_rate": 1.9152683186132476e-07,
"logits/chosen": -0.4067768156528473,
"logits/rejected": -0.3039708137512207,
"logps/chosen": -0.6328436136245728,
"logps/rejected": -1.12655770778656,
"loss": 0.6919,
"odds_ratio_loss": 0.4709090292453766,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.031642183661460876,
"rewards/margins": 0.024685706943273544,
"rewards/rejected": -0.05632789060473442,
"sft_loss": 0.6328436136245728,
"step": 1100
},
{
"epoch": 2.663467306538692,
"grad_norm": 1.5594348597838832,
"learning_rate": 1.6773386364104972e-07,
"logits/chosen": -0.1575368195772171,
"logits/rejected": -0.003553843591362238,
"logps/chosen": -0.6768941879272461,
"logps/rejected": -1.032041072845459,
"loss": 0.6913,
"odds_ratio_loss": 0.50171959400177,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.033844709396362305,
"rewards/margins": 0.017757344990968704,
"rewards/rejected": -0.05160205811262131,
"sft_loss": 0.6768941879272461,
"step": 1110
},
{
"epoch": 2.6874625074985,
"grad_norm": 1.2735811398241894,
"learning_rate": 1.4546725702015096e-07,
"logits/chosen": 0.004650235176086426,
"logits/rejected": 0.1661575585603714,
"logps/chosen": -0.6541981101036072,
"logps/rejected": -1.1094247102737427,
"loss": 0.6669,
"odds_ratio_loss": 0.4492813050746918,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.03270990774035454,
"rewards/margins": 0.022761326283216476,
"rewards/rejected": -0.055471230298280716,
"sft_loss": 0.6541981101036072,
"step": 1120
},
{
"epoch": 2.7114577084583082,
"grad_norm": 2.2135398834819715,
"learning_rate": 1.24741584475056e-07,
"logits/chosen": -0.07907108962535858,
"logits/rejected": 0.08474680036306381,
"logps/chosen": -0.6154497861862183,
"logps/rejected": -1.0710924863815308,
"loss": 0.6491,
"odds_ratio_loss": 0.4509805142879486,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.030772492289543152,
"rewards/margins": 0.022782133892178535,
"rewards/rejected": -0.05355461686849594,
"sft_loss": 0.6154497861862183,
"step": 1130
},
{
"epoch": 2.7354529094181164,
"grad_norm": 1.5137426741255027,
"learning_rate": 1.0557041001126145e-07,
"logits/chosen": 0.3702402710914612,
"logits/rejected": 0.6300150156021118,
"logps/chosen": -0.5984182357788086,
"logps/rejected": -1.115179419517517,
"loss": 0.6191,
"odds_ratio_loss": 0.41762223839759827,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.0299209114164114,
"rewards/margins": 0.025838062167167664,
"rewards/rejected": -0.05575897544622421,
"sft_loss": 0.5984182357788086,
"step": 1140
},
{
"epoch": 2.7594481103779245,
"grad_norm": 1.565522436867544,
"learning_rate": 8.796628028631321e-08,
"logits/chosen": 0.17880654335021973,
"logits/rejected": 0.1116660013794899,
"logps/chosen": -0.6091745495796204,
"logps/rejected": -1.0210378170013428,
"loss": 0.6583,
"odds_ratio_loss": 0.4544963836669922,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.030458729714155197,
"rewards/margins": 0.02059316076338291,
"rewards/rejected": -0.05105189234018326,
"sft_loss": 0.6091745495796204,
"step": 1150
},
{
"epoch": 2.7834433113377326,
"grad_norm": 1.604017358081912,
"learning_rate": 7.19407163985894e-08,
"logits/chosen": -0.04378344863653183,
"logits/rejected": 0.18321049213409424,
"logps/chosen": -0.6626521348953247,
"logps/rejected": -1.1215763092041016,
"loss": 0.666,
"odds_ratio_loss": 0.4741577208042145,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.033132605254650116,
"rewards/margins": 0.022946210578083992,
"rewards/rejected": -0.05607881397008896,
"sft_loss": 0.6626521348953247,
"step": 1160
},
{
"epoch": 2.8074385122975407,
"grad_norm": 1.4084206676302562,
"learning_rate": 5.750420634727083e-08,
"logits/chosen": -0.45710262656211853,
"logits/rejected": -0.3050076961517334,
"logps/chosen": -0.671418309211731,
"logps/rejected": -1.1854102611541748,
"loss": 0.6842,
"odds_ratio_loss": 0.4368383288383484,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.03357091173529625,
"rewards/margins": 0.02569960430264473,
"rewards/rejected": -0.05927051231265068,
"sft_loss": 0.671418309211731,
"step": 1170
},
{
"epoch": 2.8314337132573484,
"grad_norm": 1.3507137389822068,
"learning_rate": 4.4666198168422656e-08,
"logits/chosen": 0.33376216888427734,
"logits/rejected": 0.41172194480895996,
"logps/chosen": -0.6510582566261292,
"logps/rejected": -1.0800405740737915,
"loss": 0.6747,
"odds_ratio_loss": 0.5277644395828247,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.032552916556596756,
"rewards/margins": 0.021449116989970207,
"rewards/rejected": -0.054002027958631516,
"sft_loss": 0.6510582566261292,
"step": 1180
},
{
"epoch": 2.8554289142171565,
"grad_norm": 1.6874037821147798,
"learning_rate": 3.343509375168863e-08,
"logits/chosen": 0.20301933586597443,
"logits/rejected": 0.32382094860076904,
"logps/chosen": -0.6405006647109985,
"logps/rejected": -1.0241023302078247,
"loss": 0.6718,
"odds_ratio_loss": 0.48166948556900024,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.03202503174543381,
"rewards/margins": 0.019180091097950935,
"rewards/rejected": -0.051205117255449295,
"sft_loss": 0.6405006647109985,
"step": 1190
},
{
"epoch": 2.8794241151769646,
"grad_norm": 1.6417139708130921,
"learning_rate": 2.3818243341637293e-08,
"logits/chosen": -0.3619822859764099,
"logits/rejected": -0.15361133217811584,
"logps/chosen": -0.6599988341331482,
"logps/rejected": -1.098881483078003,
"loss": 0.6565,
"odds_ratio_loss": 0.456063449382782,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.03299994021654129,
"rewards/margins": 0.021944135427474976,
"rewards/rejected": -0.054944075644016266,
"sft_loss": 0.6599988341331482,
"step": 1200
},
{
"epoch": 2.9034193161367727,
"grad_norm": 1.648932215503252,
"learning_rate": 1.5821940727361874e-08,
"logits/chosen": -0.7362561821937561,
"logits/rejected": -0.4996170997619629,
"logps/chosen": -0.6824958920478821,
"logps/rejected": -0.9969790577888489,
"loss": 0.7067,
"odds_ratio_loss": 0.5307115316390991,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.034124795347452164,
"rewards/margins": 0.01572415977716446,
"rewards/rejected": -0.049848951399326324,
"sft_loss": 0.6824958920478821,
"step": 1210
},
{
"epoch": 2.927414517096581,
"grad_norm": 1.7678674281978446,
"learning_rate": 9.451419123484573e-09,
"logits/chosen": -0.15318191051483154,
"logits/rejected": 0.047946538776159286,
"logps/chosen": -0.6560810804367065,
"logps/rejected": -1.0658347606658936,
"loss": 0.6692,
"odds_ratio_loss": 0.5046226382255554,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.032804060727357864,
"rewards/margins": 0.02048768661916256,
"rewards/rejected": -0.053291745483875275,
"sft_loss": 0.6560810804367065,
"step": 1220
},
{
"epoch": 2.9514097180563885,
"grad_norm": 1.4413325593301094,
"learning_rate": 4.710847745256209e-09,
"logits/chosen": 0.12647075951099396,
"logits/rejected": 0.2795228958129883,
"logps/chosen": -0.6180914640426636,
"logps/rejected": -1.0847346782684326,
"loss": 0.6722,
"odds_ratio_loss": 0.41623228788375854,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.030904576182365417,
"rewards/margins": 0.02333216182887554,
"rewards/rejected": -0.05423673242330551,
"sft_loss": 0.6180914640426636,
"step": 1230
},
{
"epoch": 2.9754049190161966,
"grad_norm": 1.5296676400661524,
"learning_rate": 1.603329079994942e-09,
"logits/chosen": -0.3425149619579315,
"logits/rejected": -0.06856220215559006,
"logps/chosen": -0.6569226980209351,
"logps/rejected": -1.1020539999008179,
"loss": 0.6649,
"odds_ratio_loss": 0.4642546772956848,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.03284613788127899,
"rewards/margins": 0.02225656434893608,
"rewards/rejected": -0.055102698504924774,
"sft_loss": 0.6569226980209351,
"step": 1240
},
{
"epoch": 2.994601079784043,
"step": 1248,
"total_flos": 132590267662336.0,
"train_loss": 0.7937506708579186,
"train_runtime": 49781.9259,
"train_samples_per_second": 1.205,
"train_steps_per_second": 0.025
}
],
"logging_steps": 10,
"max_steps": 1248,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100.0,
"total_flos": 132590267662336.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}