zephyr-7b-dpo-lora / trainer_state.json
Jan Majkutewicz
Model save
d601388
raw
history blame
228 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 3821,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00026171159382360636,
"grad_norm": 1.999703049659729,
"learning_rate": 1.3054830287206266e-09,
"logits/chosen": -2.9875593185424805,
"logits/rejected": -2.936753749847412,
"logps/chosen": -307.4898681640625,
"logps/rejected": -392.088623046875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0026171159382360636,
"grad_norm": 1.9285504817962646,
"learning_rate": 1.3054830287206264e-08,
"logits/chosen": -2.8448944091796875,
"logits/rejected": -2.83210825920105,
"logps/chosen": -299.1453857421875,
"logps/rejected": -260.9873352050781,
"loss": 0.693,
"rewards/accuracies": 0.4930555522441864,
"rewards/chosen": -0.00014580304559785873,
"rewards/margins": 0.0003282717370893806,
"rewards/rejected": -0.00047407473903149366,
"step": 10
},
{
"epoch": 0.005234231876472127,
"grad_norm": 2.234384775161743,
"learning_rate": 2.610966057441253e-08,
"logits/chosen": -2.861093044281006,
"logits/rejected": -2.826277732849121,
"logps/chosen": -325.42889404296875,
"logps/rejected": -252.72314453125,
"loss": 0.6928,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.00027085753390565515,
"rewards/margins": 0.0006726925494149327,
"rewards/rejected": -0.00040183504461310804,
"step": 20
},
{
"epoch": 0.007851347814708191,
"grad_norm": 2.5200695991516113,
"learning_rate": 3.91644908616188e-08,
"logits/chosen": -2.8650269508361816,
"logits/rejected": -2.839594841003418,
"logps/chosen": -269.79888916015625,
"logps/rejected": -268.51544189453125,
"loss": 0.6928,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0004993680049665272,
"rewards/margins": 0.0007416309672407806,
"rewards/rejected": -0.00024226296227425337,
"step": 30
},
{
"epoch": 0.010468463752944255,
"grad_norm": 1.6392391920089722,
"learning_rate": 5.221932114882506e-08,
"logits/chosen": -2.8317809104919434,
"logits/rejected": -2.8215935230255127,
"logps/chosen": -233.3176727294922,
"logps/rejected": -238.38671875,
"loss": 0.6929,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -3.822711369139142e-05,
"rewards/margins": 0.000457162968814373,
"rewards/rejected": -0.0004953901516273618,
"step": 40
},
{
"epoch": 0.01308557969118032,
"grad_norm": 1.624583125114441,
"learning_rate": 6.527415143603133e-08,
"logits/chosen": -2.865053176879883,
"logits/rejected": -2.852184295654297,
"logps/chosen": -290.0357360839844,
"logps/rejected": -253.96719360351562,
"loss": 0.6931,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.00021998901502229273,
"rewards/margins": 8.350692223757505e-05,
"rewards/rejected": -0.00030349590815603733,
"step": 50
},
{
"epoch": 0.015702695629416383,
"grad_norm": 1.7673835754394531,
"learning_rate": 7.83289817232376e-08,
"logits/chosen": -2.8233509063720703,
"logits/rejected": -2.809717893600464,
"logps/chosen": -273.7070617675781,
"logps/rejected": -246.9080352783203,
"loss": 0.6931,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.00012425810564309359,
"rewards/margins": 6.1127066146582365e-06,
"rewards/rejected": -0.00013037076860200614,
"step": 60
},
{
"epoch": 0.018319811567652448,
"grad_norm": 1.7462002038955688,
"learning_rate": 9.138381201044386e-08,
"logits/chosen": -2.8822834491729736,
"logits/rejected": -2.8470146656036377,
"logps/chosen": -293.1849060058594,
"logps/rejected": -266.12908935546875,
"loss": 0.6931,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.00014021484821569175,
"rewards/margins": 4.102182720089331e-05,
"rewards/rejected": -0.00018123674090020359,
"step": 70
},
{
"epoch": 0.02093692750588851,
"grad_norm": 2.281116008758545,
"learning_rate": 1.0443864229765012e-07,
"logits/chosen": -2.820223331451416,
"logits/rejected": -2.797712564468384,
"logps/chosen": -279.3045959472656,
"logps/rejected": -266.4049072265625,
"loss": 0.6932,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.00035889382706955075,
"rewards/margins": -3.7797075492562726e-05,
"rewards/rejected": -0.00032109676976688206,
"step": 80
},
{
"epoch": 0.023554043444124574,
"grad_norm": 1.8048748970031738,
"learning_rate": 1.174934725848564e-07,
"logits/chosen": -2.834364652633667,
"logits/rejected": -2.821197032928467,
"logps/chosen": -270.66107177734375,
"logps/rejected": -251.8137664794922,
"loss": 0.693,
"rewards/accuracies": 0.5,
"rewards/chosen": -2.5717377866385505e-05,
"rewards/margins": 0.00027994689298793674,
"rewards/rejected": -0.00030566431814804673,
"step": 90
},
{
"epoch": 0.02617115938236064,
"grad_norm": 1.8376109600067139,
"learning_rate": 1.3054830287206266e-07,
"logits/chosen": -2.8485753536224365,
"logits/rejected": -2.8414525985717773,
"logps/chosen": -267.0416259765625,
"logps/rejected": -248.66622924804688,
"loss": 0.6929,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.00016696630336809903,
"rewards/margins": 0.0004185012076050043,
"rewards/rejected": -0.0005854673800058663,
"step": 100
},
{
"epoch": 0.02617115938236064,
"eval_logits/chosen": -2.8661274909973145,
"eval_logits/rejected": -2.8388071060180664,
"eval_logps/chosen": -282.74957275390625,
"eval_logps/rejected": -261.47882080078125,
"eval_loss": 0.693004846572876,
"eval_rewards/accuracies": 0.5249999761581421,
"eval_rewards/chosen": -0.00011926326260436326,
"eval_rewards/margins": 0.0002895805810112506,
"eval_rewards/rejected": -0.00040884382906369865,
"eval_runtime": 692.2735,
"eval_samples_per_second": 2.889,
"eval_steps_per_second": 0.361,
"step": 100
},
{
"epoch": 0.028788275320596704,
"grad_norm": 2.015868663787842,
"learning_rate": 1.4360313315926893e-07,
"logits/chosen": -2.856309652328491,
"logits/rejected": -2.823089361190796,
"logps/chosen": -307.3843994140625,
"logps/rejected": -257.291015625,
"loss": 0.6932,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.0002652711991686374,
"rewards/margins": -0.00011966088641202077,
"rewards/rejected": -0.00014561018906533718,
"step": 110
},
{
"epoch": 0.031405391258832765,
"grad_norm": 1.7159242630004883,
"learning_rate": 1.566579634464752e-07,
"logits/chosen": -2.869659423828125,
"logits/rejected": -2.8464877605438232,
"logps/chosen": -310.60089111328125,
"logps/rejected": -287.7904357910156,
"loss": 0.6929,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.0001522126840427518,
"rewards/margins": 0.0004031356074847281,
"rewards/rejected": -0.00025092283613048494,
"step": 120
},
{
"epoch": 0.03402250719706883,
"grad_norm": 2.0958242416381836,
"learning_rate": 1.6971279373368143e-07,
"logits/chosen": -2.850337266921997,
"logits/rejected": -2.8188374042510986,
"logps/chosen": -271.6417236328125,
"logps/rejected": -269.60174560546875,
"loss": 0.6928,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.00013120910443831235,
"rewards/margins": 0.0006835443200543523,
"rewards/rejected": -0.0005523352883756161,
"step": 130
},
{
"epoch": 0.036639623135304895,
"grad_norm": 1.8925613164901733,
"learning_rate": 1.8276762402088773e-07,
"logits/chosen": -2.8673295974731445,
"logits/rejected": -2.8122167587280273,
"logps/chosen": -291.46307373046875,
"logps/rejected": -247.7669677734375,
"loss": 0.6927,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.00036148293293081224,
"rewards/margins": 0.0009279497899115086,
"rewards/rejected": -0.0005664670607075095,
"step": 140
},
{
"epoch": 0.03925673907354096,
"grad_norm": 1.9597433805465698,
"learning_rate": 1.95822454308094e-07,
"logits/chosen": -2.8569109439849854,
"logits/rejected": -2.837003707885742,
"logps/chosen": -298.9459228515625,
"logps/rejected": -256.0478515625,
"loss": 0.6927,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0006078753503970802,
"rewards/margins": 0.0009616016177460551,
"rewards/rejected": -0.00035372626734897494,
"step": 150
},
{
"epoch": 0.04187385501177702,
"grad_norm": 1.913694977760315,
"learning_rate": 2.0887728459530023e-07,
"logits/chosen": -2.864971876144409,
"logits/rejected": -2.8458945751190186,
"logps/chosen": -275.124755859375,
"logps/rejected": -275.0151062011719,
"loss": 0.6926,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.00012734555639326572,
"rewards/margins": 0.001163811655715108,
"rewards/rejected": -0.0010364660993218422,
"step": 160
},
{
"epoch": 0.04449097095001309,
"grad_norm": 2.1846537590026855,
"learning_rate": 2.2193211488250652e-07,
"logits/chosen": -2.822680950164795,
"logits/rejected": -2.8042876720428467,
"logps/chosen": -236.7074432373047,
"logps/rejected": -238.3466339111328,
"loss": 0.6927,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0003129563410766423,
"rewards/margins": 0.0008108107140287757,
"rewards/rejected": -0.0011237671133130789,
"step": 170
},
{
"epoch": 0.04710808688824915,
"grad_norm": 1.6035895347595215,
"learning_rate": 2.349869451697128e-07,
"logits/chosen": -2.850816249847412,
"logits/rejected": -2.823718309402466,
"logps/chosen": -276.2500915527344,
"logps/rejected": -259.9451904296875,
"loss": 0.6927,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.0001872165739769116,
"rewards/margins": 0.0008747532265260816,
"rewards/rejected": -0.001061969785951078,
"step": 180
},
{
"epoch": 0.04972520282648522,
"grad_norm": 3.182461738586426,
"learning_rate": 2.4804177545691903e-07,
"logits/chosen": -2.8869190216064453,
"logits/rejected": -2.8687491416931152,
"logps/chosen": -290.9490661621094,
"logps/rejected": -257.3797302246094,
"loss": 0.6927,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0006612293072976172,
"rewards/margins": 0.000992011046037078,
"rewards/rejected": -0.00033078185515478253,
"step": 190
},
{
"epoch": 0.05234231876472128,
"grad_norm": 1.8618322610855103,
"learning_rate": 2.610966057441253e-07,
"logits/chosen": -2.837772846221924,
"logits/rejected": -2.8276214599609375,
"logps/chosen": -267.96173095703125,
"logps/rejected": -225.5831756591797,
"loss": 0.6923,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0006539617897942662,
"rewards/margins": 0.0017792375292629004,
"rewards/rejected": -0.0011252757394686341,
"step": 200
},
{
"epoch": 0.05234231876472128,
"eval_logits/chosen": -2.8652713298797607,
"eval_logits/rejected": -2.837984800338745,
"eval_logps/chosen": -282.66241455078125,
"eval_logps/rejected": -261.5315856933594,
"eval_loss": 0.6923088431358337,
"eval_rewards/accuracies": 0.6050000190734863,
"eval_rewards/chosen": 0.0007522286614403129,
"eval_rewards/margins": 0.001688659773208201,
"eval_rewards/rejected": -0.0009364310535602272,
"eval_runtime": 693.0899,
"eval_samples_per_second": 2.886,
"eval_steps_per_second": 0.361,
"step": 200
},
{
"epoch": 0.05495943470295734,
"grad_norm": 1.7776113748550415,
"learning_rate": 2.7415143603133156e-07,
"logits/chosen": -2.8762500286102295,
"logits/rejected": -2.8429489135742188,
"logps/chosen": -275.98614501953125,
"logps/rejected": -245.2783660888672,
"loss": 0.6922,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.0009210329735651612,
"rewards/margins": 0.0018816586816683412,
"rewards/rejected": -0.0009606255334801972,
"step": 210
},
{
"epoch": 0.05757655064119341,
"grad_norm": 1.6921358108520508,
"learning_rate": 2.8720626631853785e-07,
"logits/chosen": -2.817211627960205,
"logits/rejected": -2.811617851257324,
"logps/chosen": -274.0498962402344,
"logps/rejected": -242.93923950195312,
"loss": 0.6919,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.0014726849040016532,
"rewards/margins": 0.0024847507011145353,
"rewards/rejected": -0.0010120656806975603,
"step": 220
},
{
"epoch": 0.06019366657942947,
"grad_norm": 2.0040206909179688,
"learning_rate": 3.002610966057441e-07,
"logits/chosen": -2.885439157485962,
"logits/rejected": -2.86034893989563,
"logps/chosen": -322.754150390625,
"logps/rejected": -285.758056640625,
"loss": 0.6922,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0018624020740389824,
"rewards/margins": 0.0018660586792975664,
"rewards/rejected": -3.6565586469805567e-06,
"step": 230
},
{
"epoch": 0.06281078251766553,
"grad_norm": 1.809605360031128,
"learning_rate": 3.133159268929504e-07,
"logits/chosen": -2.8532462120056152,
"logits/rejected": -2.8391811847686768,
"logps/chosen": -312.47088623046875,
"logps/rejected": -297.48907470703125,
"loss": 0.6921,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.0017323732608929276,
"rewards/margins": 0.0021942437160760164,
"rewards/rejected": -0.00046187033876776695,
"step": 240
},
{
"epoch": 0.06542789845590159,
"grad_norm": 1.6686596870422363,
"learning_rate": 3.263707571801567e-07,
"logits/chosen": -2.814990282058716,
"logits/rejected": -2.81905198097229,
"logps/chosen": -277.08941650390625,
"logps/rejected": -249.03414916992188,
"loss": 0.6915,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.001977517269551754,
"rewards/margins": 0.003367725061252713,
"rewards/rejected": -0.0013902074424549937,
"step": 250
},
{
"epoch": 0.06804501439413765,
"grad_norm": 1.5935229063034058,
"learning_rate": 3.3942558746736286e-07,
"logits/chosen": -2.8718338012695312,
"logits/rejected": -2.8251404762268066,
"logps/chosen": -297.3100280761719,
"logps/rejected": -277.9830017089844,
"loss": 0.6916,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.0025989424902945757,
"rewards/margins": 0.0032064050901681185,
"rewards/rejected": -0.00060746242525056,
"step": 260
},
{
"epoch": 0.07066213033237373,
"grad_norm": 1.4248483180999756,
"learning_rate": 3.5248041775456916e-07,
"logits/chosen": -2.8370590209960938,
"logits/rejected": -2.8248658180236816,
"logps/chosen": -281.2889709472656,
"logps/rejected": -245.48855590820312,
"loss": 0.6901,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.003083507064729929,
"rewards/margins": 0.006150919944047928,
"rewards/rejected": -0.003067413344979286,
"step": 270
},
{
"epoch": 0.07327924627060979,
"grad_norm": 1.725456714630127,
"learning_rate": 3.6553524804177545e-07,
"logits/chosen": -2.8781139850616455,
"logits/rejected": -2.8350632190704346,
"logps/chosen": -276.51568603515625,
"logps/rejected": -253.5542755126953,
"loss": 0.6906,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.0024674157612025738,
"rewards/margins": 0.005187267437577248,
"rewards/rejected": -0.00271985144354403,
"step": 280
},
{
"epoch": 0.07589636220884585,
"grad_norm": 1.9681357145309448,
"learning_rate": 3.785900783289817e-07,
"logits/chosen": -2.849203586578369,
"logits/rejected": -2.838613986968994,
"logps/chosen": -304.06463623046875,
"logps/rejected": -279.3326721191406,
"loss": 0.6901,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.0034332734066993,
"rewards/margins": 0.0062034172005951405,
"rewards/rejected": -0.0027701437938958406,
"step": 290
},
{
"epoch": 0.07851347814708191,
"grad_norm": 2.0513315200805664,
"learning_rate": 3.91644908616188e-07,
"logits/chosen": -2.8060500621795654,
"logits/rejected": -2.76236629486084,
"logps/chosen": -266.20794677734375,
"logps/rejected": -248.80886840820312,
"loss": 0.6898,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.0026042419485747814,
"rewards/margins": 0.006667142268270254,
"rewards/rejected": -0.004062901251018047,
"step": 300
},
{
"epoch": 0.07851347814708191,
"eval_logits/chosen": -2.8622689247131348,
"eval_logits/rejected": -2.834963321685791,
"eval_logps/chosen": -282.39178466796875,
"eval_logps/rejected": -261.6759948730469,
"eval_loss": 0.6902644038200378,
"eval_rewards/accuracies": 0.6639999747276306,
"eval_rewards/chosen": 0.0034584649838507175,
"eval_rewards/margins": 0.0058389026671648026,
"eval_rewards/rejected": -0.0023804374504834414,
"eval_runtime": 692.5367,
"eval_samples_per_second": 2.888,
"eval_steps_per_second": 0.361,
"step": 300
},
{
"epoch": 0.08113059408531798,
"grad_norm": 2.1205692291259766,
"learning_rate": 4.046997389033943e-07,
"logits/chosen": -2.893097400665283,
"logits/rejected": -2.87463641166687,
"logps/chosen": -306.21636962890625,
"logps/rejected": -250.2729949951172,
"loss": 0.6888,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.004871034994721413,
"rewards/margins": 0.008721152320504189,
"rewards/rejected": -0.003850117791444063,
"step": 310
},
{
"epoch": 0.08374771002355404,
"grad_norm": 1.7468680143356323,
"learning_rate": 4.1775456919060046e-07,
"logits/chosen": -2.873706817626953,
"logits/rejected": -2.8421998023986816,
"logps/chosen": -272.94659423828125,
"logps/rejected": -255.0898895263672,
"loss": 0.6904,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.00492675369605422,
"rewards/margins": 0.005588999018073082,
"rewards/rejected": -0.000662245147395879,
"step": 320
},
{
"epoch": 0.08636482596179011,
"grad_norm": 1.7784926891326904,
"learning_rate": 4.3080939947780675e-07,
"logits/chosen": -2.8389968872070312,
"logits/rejected": -2.8390631675720215,
"logps/chosen": -277.24652099609375,
"logps/rejected": -250.9720458984375,
"loss": 0.6892,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.007157427724450827,
"rewards/margins": 0.00795576348900795,
"rewards/rejected": -0.0007983351242728531,
"step": 330
},
{
"epoch": 0.08898194190002617,
"grad_norm": 2.0122432708740234,
"learning_rate": 4.4386422976501305e-07,
"logits/chosen": -2.868762254714966,
"logits/rejected": -2.8562684059143066,
"logps/chosen": -306.8142395019531,
"logps/rejected": -284.90679931640625,
"loss": 0.6886,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.00881933607161045,
"rewards/margins": 0.009326713159680367,
"rewards/rejected": -0.000507376913446933,
"step": 340
},
{
"epoch": 0.09159905783826224,
"grad_norm": 1.7484519481658936,
"learning_rate": 4.569190600522193e-07,
"logits/chosen": -2.824993848800659,
"logits/rejected": -2.797851085662842,
"logps/chosen": -309.11224365234375,
"logps/rejected": -296.3442687988281,
"loss": 0.6894,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.009017640724778175,
"rewards/margins": 0.00765979802235961,
"rewards/rejected": 0.0013578429352492094,
"step": 350
},
{
"epoch": 0.0942161737764983,
"grad_norm": 1.2647193670272827,
"learning_rate": 4.699738903394256e-07,
"logits/chosen": -2.8344480991363525,
"logits/rejected": -2.816068649291992,
"logps/chosen": -256.1959533691406,
"logps/rejected": -236.88818359375,
"loss": 0.6883,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.007074951194226742,
"rewards/margins": 0.009867229498922825,
"rewards/rejected": -0.0027922778390347958,
"step": 360
},
{
"epoch": 0.09683328971473436,
"grad_norm": 2.0885772705078125,
"learning_rate": 4.830287206266319e-07,
"logits/chosen": -2.8475875854492188,
"logits/rejected": -2.8186795711517334,
"logps/chosen": -295.1861572265625,
"logps/rejected": -251.5151824951172,
"loss": 0.6856,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.010460047982633114,
"rewards/margins": 0.015231410041451454,
"rewards/rejected": -0.004771359730511904,
"step": 370
},
{
"epoch": 0.09945040565297043,
"grad_norm": 1.8870456218719482,
"learning_rate": 4.960835509138381e-07,
"logits/chosen": -2.8488352298736572,
"logits/rejected": -2.7997212409973145,
"logps/chosen": -315.6346740722656,
"logps/rejected": -279.5706481933594,
"loss": 0.6871,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.011897383257746696,
"rewards/margins": 0.012441580183804035,
"rewards/rejected": -0.0005441965768113732,
"step": 380
},
{
"epoch": 0.1020675215912065,
"grad_norm": 2.3549890518188477,
"learning_rate": 4.999948856244767e-07,
"logits/chosen": -2.8280773162841797,
"logits/rejected": -2.8224241733551025,
"logps/chosen": -297.057373046875,
"logps/rejected": -278.00421142578125,
"loss": 0.6836,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.01873602904379368,
"rewards/margins": 0.01945691928267479,
"rewards/rejected": -0.0007208908209577203,
"step": 390
},
{
"epoch": 0.10468463752944256,
"grad_norm": 1.818867802619934,
"learning_rate": 4.999698361256577e-07,
"logits/chosen": -2.851010799407959,
"logits/rejected": -2.8151259422302246,
"logps/chosen": -279.1597900390625,
"logps/rejected": -237.5978546142578,
"loss": 0.6872,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.016593072563409805,
"rewards/margins": 0.012265140190720558,
"rewards/rejected": 0.004327933304011822,
"step": 400
},
{
"epoch": 0.10468463752944256,
"eval_logits/chosen": -2.857703685760498,
"eval_logits/rejected": -2.830756425857544,
"eval_logps/chosen": -281.0899963378906,
"eval_logps/rejected": -261.22564697265625,
"eval_loss": 0.6861628293991089,
"eval_rewards/accuracies": 0.6669999957084656,
"eval_rewards/chosen": 0.01647624559700489,
"eval_rewards/margins": 0.014353430829942226,
"eval_rewards/rejected": 0.002122814767062664,
"eval_runtime": 692.2781,
"eval_samples_per_second": 2.889,
"eval_steps_per_second": 0.361,
"step": 400
},
{
"epoch": 0.10730175346767862,
"grad_norm": 1.9545940160751343,
"learning_rate": 4.99923914217458e-07,
"logits/chosen": -2.818399667739868,
"logits/rejected": -2.802830457687378,
"logps/chosen": -256.24957275390625,
"logps/rejected": -256.09527587890625,
"loss": 0.6893,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.013771469704806805,
"rewards/margins": 0.008097216486930847,
"rewards/rejected": 0.005674251355230808,
"step": 410
},
{
"epoch": 0.10991886940591468,
"grad_norm": 4.077869415283203,
"learning_rate": 4.99857123734344e-07,
"logits/chosen": -2.8153655529022217,
"logits/rejected": -2.769317865371704,
"logps/chosen": -244.53890991210938,
"logps/rejected": -238.0004119873047,
"loss": 0.6855,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.015213017351925373,
"rewards/margins": 0.015682024881243706,
"rewards/rejected": -0.00046900735469534993,
"step": 420
},
{
"epoch": 0.11253598534415074,
"grad_norm": 2.243114471435547,
"learning_rate": 4.997694702533016e-07,
"logits/chosen": -2.837740182876587,
"logits/rejected": -2.806856870651245,
"logps/chosen": -293.7519836425781,
"logps/rejected": -272.25494384765625,
"loss": 0.6835,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.022876007482409477,
"rewards/margins": 0.019848225638270378,
"rewards/rejected": 0.0030277802143245935,
"step": 430
},
{
"epoch": 0.11515310128238682,
"grad_norm": 1.829640507698059,
"learning_rate": 4.996609610933712e-07,
"logits/chosen": -2.875370740890503,
"logits/rejected": -2.8540024757385254,
"logps/chosen": -285.1123962402344,
"logps/rejected": -256.6170654296875,
"loss": 0.6833,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.02266586944460869,
"rewards/margins": 0.020275097340345383,
"rewards/rejected": 0.0023907723370939493,
"step": 440
},
{
"epoch": 0.11777021722062288,
"grad_norm": 1.756147861480713,
"learning_rate": 4.995316053150366e-07,
"logits/chosen": -2.806842088699341,
"logits/rejected": -2.8101210594177246,
"logps/chosen": -288.1036376953125,
"logps/rejected": -259.46014404296875,
"loss": 0.6824,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.029574494808912277,
"rewards/margins": 0.022273657843470573,
"rewards/rejected": 0.007300837431102991,
"step": 450
},
{
"epoch": 0.12038733315885894,
"grad_norm": 3.1120874881744385,
"learning_rate": 4.99381413719468e-07,
"logits/chosen": -2.825704574584961,
"logits/rejected": -2.81204891204834,
"logps/chosen": -279.86334228515625,
"logps/rejected": -268.80755615234375,
"loss": 0.6796,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.029285842552781105,
"rewards/margins": 0.027944009751081467,
"rewards/rejected": 0.0013418343150988221,
"step": 460
},
{
"epoch": 0.123004449097095,
"grad_norm": 1.9212427139282227,
"learning_rate": 4.992103988476205e-07,
"logits/chosen": -2.83656644821167,
"logits/rejected": -2.810007333755493,
"logps/chosen": -257.7132873535156,
"logps/rejected": -245.3390655517578,
"loss": 0.6831,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.024322878569364548,
"rewards/margins": 0.020839061588048935,
"rewards/rejected": 0.003483818843960762,
"step": 470
},
{
"epoch": 0.12562156503533106,
"grad_norm": 2.0051708221435547,
"learning_rate": 4.990185749791864e-07,
"logits/chosen": -2.868682622909546,
"logits/rejected": -2.836199998855591,
"logps/chosen": -271.63922119140625,
"logps/rejected": -274.00189208984375,
"loss": 0.68,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.027854889631271362,
"rewards/margins": 0.0271223783493042,
"rewards/rejected": 0.0007325109909288585,
"step": 480
},
{
"epoch": 0.12823868097356714,
"grad_norm": 2.0355913639068604,
"learning_rate": 4.988059581314039e-07,
"logits/chosen": -2.8479950428009033,
"logits/rejected": -2.8285024166107178,
"logps/chosen": -305.7145690917969,
"logps/rejected": -269.5832214355469,
"loss": 0.6789,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.02704049088060856,
"rewards/margins": 0.029708972200751305,
"rewards/rejected": -0.002668480621650815,
"step": 490
},
{
"epoch": 0.13085579691180318,
"grad_norm": 1.996235966682434,
"learning_rate": 4.985725660577184e-07,
"logits/chosen": -2.8617165088653564,
"logits/rejected": -2.843017101287842,
"logps/chosen": -288.36846923828125,
"logps/rejected": -249.8210906982422,
"loss": 0.6783,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.023136448115110397,
"rewards/margins": 0.031000768765807152,
"rewards/rejected": -0.007864321582019329,
"step": 500
},
{
"epoch": 0.13085579691180318,
"eval_logits/chosen": -2.848633289337158,
"eval_logits/rejected": -2.8214972019195557,
"eval_logps/chosen": -280.6480712890625,
"eval_logps/rejected": -262.0230407714844,
"eval_loss": 0.6803756356239319,
"eval_rewards/accuracies": 0.6834999918937683,
"eval_rewards/chosen": 0.020895304158329964,
"eval_rewards/margins": 0.026746317744255066,
"eval_rewards/rejected": -0.005851015914231539,
"eval_runtime": 691.0122,
"eval_samples_per_second": 2.894,
"eval_steps_per_second": 0.362,
"step": 500
},
{
"epoch": 0.13347291285003926,
"grad_norm": 2.2953689098358154,
"learning_rate": 4.983184182463008e-07,
"logits/chosen": -2.83900785446167,
"logits/rejected": -2.8163068294525146,
"logps/chosen": -292.3056335449219,
"logps/rejected": -256.3818359375,
"loss": 0.6779,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.0237285066395998,
"rewards/margins": 0.03204946964979172,
"rewards/rejected": -0.008320963010191917,
"step": 510
},
{
"epoch": 0.1360900287882753,
"grad_norm": 2.152860164642334,
"learning_rate": 4.980435359184203e-07,
"logits/chosen": -2.8620104789733887,
"logits/rejected": -2.8637924194335938,
"logps/chosen": -285.1622314453125,
"logps/rejected": -270.9977722167969,
"loss": 0.6791,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.026320820674300194,
"rewards/margins": 0.029663830995559692,
"rewards/rejected": -0.0033430135808885098,
"step": 520
},
{
"epoch": 0.13870714472651138,
"grad_norm": 2.3760368824005127,
"learning_rate": 4.977479420266723e-07,
"logits/chosen": -2.8074328899383545,
"logits/rejected": -2.8127429485321045,
"logps/chosen": -278.2021484375,
"logps/rejected": -288.5596618652344,
"loss": 0.6792,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.02414657548069954,
"rewards/margins": 0.02932720258831978,
"rewards/rejected": -0.005180628038942814,
"step": 530
},
{
"epoch": 0.14132426066474746,
"grad_norm": 1.8068273067474365,
"learning_rate": 4.974316612530614e-07,
"logits/chosen": -2.799464464187622,
"logits/rejected": -2.781719446182251,
"logps/chosen": -296.43017578125,
"logps/rejected": -260.1778869628906,
"loss": 0.6685,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.03263556957244873,
"rewards/margins": 0.05155158042907715,
"rewards/rejected": -0.018916018307209015,
"step": 540
},
{
"epoch": 0.1439413766029835,
"grad_norm": 2.295518636703491,
"learning_rate": 4.970947200069415e-07,
"logits/chosen": -2.8136024475097656,
"logits/rejected": -2.8002548217773438,
"logps/chosen": -296.8650817871094,
"logps/rejected": -277.0992431640625,
"loss": 0.6793,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.026846662163734436,
"rewards/margins": 0.029769038781523705,
"rewards/rejected": -0.0029223733581602573,
"step": 550
},
{
"epoch": 0.14655849254121958,
"grad_norm": 1.8040831089019775,
"learning_rate": 4.967371464228095e-07,
"logits/chosen": -2.8747551441192627,
"logits/rejected": -2.8538835048675537,
"logps/chosen": -269.18994140625,
"logps/rejected": -272.37799072265625,
"loss": 0.6782,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.026889195665717125,
"rewards/margins": 0.03184649348258972,
"rewards/rejected": -0.004957299679517746,
"step": 560
},
{
"epoch": 0.14917560847945563,
"grad_norm": 2.131438970565796,
"learning_rate": 4.963589703579569e-07,
"logits/chosen": -2.899491310119629,
"logits/rejected": -2.8730692863464355,
"logps/chosen": -313.0187072753906,
"logps/rejected": -280.3568420410156,
"loss": 0.6752,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.028542449697852135,
"rewards/margins": 0.03851853683590889,
"rewards/rejected": -0.009976087138056755,
"step": 570
},
{
"epoch": 0.1517927244176917,
"grad_norm": 1.8194427490234375,
"learning_rate": 4.959602233899761e-07,
"logits/chosen": -2.892979621887207,
"logits/rejected": -2.8543694019317627,
"logps/chosen": -311.68353271484375,
"logps/rejected": -272.5694580078125,
"loss": 0.673,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.035731758922338486,
"rewards/margins": 0.04327362775802612,
"rewards/rejected": -0.007541867904365063,
"step": 580
},
{
"epoch": 0.15440984035592778,
"grad_norm": 2.1900675296783447,
"learning_rate": 4.955409388141243e-07,
"logits/chosen": -2.8265955448150635,
"logits/rejected": -2.8132894039154053,
"logps/chosen": -273.9072265625,
"logps/rejected": -251.5390167236328,
"loss": 0.6752,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.016455931589007378,
"rewards/margins": 0.03850039094686508,
"rewards/rejected": -0.022044459357857704,
"step": 590
},
{
"epoch": 0.15702695629416383,
"grad_norm": 1.8198952674865723,
"learning_rate": 4.951011516405429e-07,
"logits/chosen": -2.84102201461792,
"logits/rejected": -2.84004807472229,
"logps/chosen": -265.394775390625,
"logps/rejected": -252.8574676513672,
"loss": 0.6729,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.021321838721632957,
"rewards/margins": 0.04377777501940727,
"rewards/rejected": -0.022455941885709763,
"step": 600
},
{
"epoch": 0.15702695629416383,
"eval_logits/chosen": -2.8409736156463623,
"eval_logits/rejected": -2.813835382461548,
"eval_logps/chosen": -281.19580078125,
"eval_logps/rejected": -264.16082763671875,
"eval_loss": 0.6732848882675171,
"eval_rewards/accuracies": 0.6840000152587891,
"eval_rewards/chosen": 0.015417821705341339,
"eval_rewards/margins": 0.04264672100543976,
"eval_rewards/rejected": -0.02722889743745327,
"eval_runtime": 691.9111,
"eval_samples_per_second": 2.891,
"eval_steps_per_second": 0.361,
"step": 600
},
{
"epoch": 0.1596440722323999,
"grad_norm": 2.117947578430176,
"learning_rate": 4.946408985913344e-07,
"logits/chosen": -2.834245204925537,
"logits/rejected": -2.8125996589660645,
"logps/chosen": -262.54144287109375,
"logps/rejected": -246.34860229492188,
"loss": 0.6734,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.020137300714850426,
"rewards/margins": 0.04266170784831047,
"rewards/rejected": -0.022524405270814896,
"step": 610
},
{
"epoch": 0.16226118817063595,
"grad_norm": 2.218667507171631,
"learning_rate": 4.941602180974958e-07,
"logits/chosen": -2.8357930183410645,
"logits/rejected": -2.7973721027374268,
"logps/chosen": -303.65606689453125,
"logps/rejected": -245.33108520507812,
"loss": 0.6696,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.019601870328187943,
"rewards/margins": 0.049685824662446976,
"rewards/rejected": -0.030083950608968735,
"step": 620
},
{
"epoch": 0.16487830410887203,
"grad_norm": 1.9840420484542847,
"learning_rate": 4.936591502957101e-07,
"logits/chosen": -2.8378233909606934,
"logits/rejected": -2.8140475749969482,
"logps/chosen": -261.1944580078125,
"logps/rejected": -257.957763671875,
"loss": 0.6647,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.024741780012845993,
"rewards/margins": 0.06145521253347397,
"rewards/rejected": -0.036713436245918274,
"step": 630
},
{
"epoch": 0.16749542004710807,
"grad_norm": 2.034658432006836,
"learning_rate": 4.931377370249945e-07,
"logits/chosen": -2.845576763153076,
"logits/rejected": -2.78796124458313,
"logps/chosen": -281.0826110839844,
"logps/rejected": -263.23370361328125,
"loss": 0.6673,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -6.524250056827441e-05,
"rewards/margins": 0.05555204302072525,
"rewards/rejected": -0.055617284029722214,
"step": 640
},
{
"epoch": 0.17011253598534415,
"grad_norm": 2.102283239364624,
"learning_rate": 4.925960218232072e-07,
"logits/chosen": -2.8266994953155518,
"logits/rejected": -2.8046762943267822,
"logps/chosen": -269.2861633300781,
"logps/rejected": -264.4281005859375,
"loss": 0.6646,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.008663799613714218,
"rewards/margins": 0.06168809533119202,
"rewards/rejected": -0.0530242919921875,
"step": 650
},
{
"epoch": 0.17272965192358022,
"grad_norm": 3.1403772830963135,
"learning_rate": 4.920340499234116e-07,
"logits/chosen": -2.796461343765259,
"logits/rejected": -2.757336139678955,
"logps/chosen": -285.25445556640625,
"logps/rejected": -251.8562469482422,
"loss": 0.6684,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.010964155197143555,
"rewards/margins": 0.05367765575647354,
"rewards/rejected": -0.04271350055932999,
"step": 660
},
{
"epoch": 0.17534676786181627,
"grad_norm": 1.932573914527893,
"learning_rate": 4.914518682500995e-07,
"logits/chosen": -2.870535373687744,
"logits/rejected": -2.840186595916748,
"logps/chosen": -297.72967529296875,
"logps/rejected": -261.30780029296875,
"loss": 0.661,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0239148810505867,
"rewards/margins": 0.07002829760313034,
"rewards/rejected": -0.04611341655254364,
"step": 670
},
{
"epoch": 0.17796388380005235,
"grad_norm": 2.7643067836761475,
"learning_rate": 4.90849525415273e-07,
"logits/chosen": -2.830029249191284,
"logits/rejected": -2.8078887462615967,
"logps/chosen": -288.3429260253906,
"logps/rejected": -245.07369995117188,
"loss": 0.6589,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.02092517912387848,
"rewards/margins": 0.07483113557100296,
"rewards/rejected": -0.05390595644712448,
"step": 680
},
{
"epoch": 0.1805809997382884,
"grad_norm": 2.184591054916382,
"learning_rate": 4.902270717143858e-07,
"logits/chosen": -2.837787628173828,
"logits/rejected": -2.8210721015930176,
"logps/chosen": -255.417724609375,
"logps/rejected": -272.31591796875,
"loss": 0.6509,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.005492637865245342,
"rewards/margins": 0.09071613848209381,
"rewards/rejected": -0.0852234959602356,
"step": 690
},
{
"epoch": 0.18319811567652447,
"grad_norm": 2.2565648555755615,
"learning_rate": 4.895845591221426e-07,
"logits/chosen": -2.833556652069092,
"logits/rejected": -2.836822032928467,
"logps/chosen": -269.5510559082031,
"logps/rejected": -269.97686767578125,
"loss": 0.6665,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.003929516766220331,
"rewards/margins": 0.058883119374513626,
"rewards/rejected": -0.06281263381242752,
"step": 700
},
{
"epoch": 0.18319811567652447,
"eval_logits/chosen": -2.8327224254608154,
"eval_logits/rejected": -2.8060340881347656,
"eval_logps/chosen": -283.0862731933594,
"eval_logps/rejected": -268.32659912109375,
"eval_loss": 0.6637989282608032,
"eval_rewards/accuracies": 0.6754999756813049,
"eval_rewards/chosen": -0.0034864526242017746,
"eval_rewards/margins": 0.06540023535490036,
"eval_rewards/rejected": -0.06888668984174728,
"eval_runtime": 691.7822,
"eval_samples_per_second": 2.891,
"eval_steps_per_second": 0.361,
"step": 700
},
{
"epoch": 0.18581523161476055,
"grad_norm": 2.449979782104492,
"learning_rate": 4.8892204128816e-07,
"logits/chosen": -2.865187644958496,
"logits/rejected": -2.8416965007781982,
"logps/chosen": -281.83489990234375,
"logps/rejected": -273.02984619140625,
"loss": 0.6666,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0031673975754529238,
"rewards/margins": 0.059172265231609344,
"rewards/rejected": -0.062339670956134796,
"step": 710
},
{
"epoch": 0.1884323475529966,
"grad_norm": 2.0199317932128906,
"learning_rate": 4.882395735324863e-07,
"logits/chosen": -2.840233325958252,
"logits/rejected": -2.7969911098480225,
"logps/chosen": -281.1783447265625,
"logps/rejected": -274.934326171875,
"loss": 0.6572,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.0017295643920078874,
"rewards/margins": 0.08150311559438705,
"rewards/rejected": -0.07977355271577835,
"step": 720
},
{
"epoch": 0.19104946349123267,
"grad_norm": 2.187190294265747,
"learning_rate": 4.875372128409829e-07,
"logits/chosen": -2.815016269683838,
"logits/rejected": -2.7854647636413574,
"logps/chosen": -285.82489013671875,
"logps/rejected": -259.6023254394531,
"loss": 0.6616,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.02074645273387432,
"rewards/margins": 0.07150407880544662,
"rewards/rejected": -0.0922505259513855,
"step": 730
},
{
"epoch": 0.19366657942946872,
"grad_norm": 2.0459957122802734,
"learning_rate": 4.868150178605653e-07,
"logits/chosen": -2.812069892883301,
"logits/rejected": -2.7864902019500732,
"logps/chosen": -246.3455352783203,
"logps/rejected": -221.7488250732422,
"loss": 0.6527,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.03750302642583847,
"rewards/margins": 0.08891085535287857,
"rewards/rejected": -0.12641388177871704,
"step": 740
},
{
"epoch": 0.1962836953677048,
"grad_norm": 2.3921523094177246,
"learning_rate": 4.860730488943068e-07,
"logits/chosen": -2.7749264240264893,
"logits/rejected": -2.7638156414031982,
"logps/chosen": -253.1526641845703,
"logps/rejected": -256.56072998046875,
"loss": 0.657,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.013170385733246803,
"rewards/margins": 0.08013583719730377,
"rewards/rejected": -0.09330622851848602,
"step": 750
},
{
"epoch": 0.19890081130594087,
"grad_norm": 2.7103869915008545,
"learning_rate": 4.853113678964021e-07,
"logits/chosen": -2.7963593006134033,
"logits/rejected": -2.786759376525879,
"logps/chosen": -295.2373962402344,
"logps/rejected": -288.03070068359375,
"loss": 0.6532,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.0016003316268324852,
"rewards/margins": 0.09059783071279526,
"rewards/rejected": -0.09219817072153091,
"step": 760
},
{
"epoch": 0.20151792724417691,
"grad_norm": 2.149914026260376,
"learning_rate": 4.845300384669957e-07,
"logits/chosen": -2.81345534324646,
"logits/rejected": -2.783003807067871,
"logps/chosen": -270.67730712890625,
"logps/rejected": -254.6434326171875,
"loss": 0.6605,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.006530989892780781,
"rewards/margins": 0.07502902299165726,
"rewards/rejected": -0.08156001567840576,
"step": 770
},
{
"epoch": 0.204135043182413,
"grad_norm": 2.4296960830688477,
"learning_rate": 4.8372912584687e-07,
"logits/chosen": -2.8353335857391357,
"logits/rejected": -2.801575183868408,
"logps/chosen": -300.9684143066406,
"logps/rejected": -283.5567626953125,
"loss": 0.6587,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.0004579909145832062,
"rewards/margins": 0.0798453614115715,
"rewards/rejected": -0.079387366771698,
"step": 780
},
{
"epoch": 0.20675215912064904,
"grad_norm": 3.0373857021331787,
"learning_rate": 4.829086969119983e-07,
"logits/chosen": -2.8006482124328613,
"logits/rejected": -2.8082146644592285,
"logps/chosen": -276.4783020019531,
"logps/rejected": -276.69720458984375,
"loss": 0.6671,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.024218443781137466,
"rewards/margins": 0.06143224984407425,
"rewards/rejected": -0.08565069735050201,
"step": 790
},
{
"epoch": 0.2093692750588851,
"grad_norm": 2.1895201206207275,
"learning_rate": 4.820688201679605e-07,
"logits/chosen": -2.8546204566955566,
"logits/rejected": -2.809619426727295,
"logps/chosen": -277.23187255859375,
"logps/rejected": -223.0809783935547,
"loss": 0.6427,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.0034676387440413237,
"rewards/margins": 0.11413818597793579,
"rewards/rejected": -0.1106705442070961,
"step": 800
},
{
"epoch": 0.2093692750588851,
"eval_logits/chosen": -2.828324317932129,
"eval_logits/rejected": -2.8020219802856445,
"eval_logps/chosen": -284.8824768066406,
"eval_logps/rejected": -272.4747314453125,
"eval_loss": 0.6546491980552673,
"eval_rewards/accuracies": 0.6815000176429749,
"eval_rewards/chosen": -0.02144855633378029,
"eval_rewards/margins": 0.08891918510198593,
"eval_rewards/rejected": -0.11036773025989532,
"eval_runtime": 691.3571,
"eval_samples_per_second": 2.893,
"eval_steps_per_second": 0.362,
"step": 800
},
{
"epoch": 0.21198639099712116,
"grad_norm": 2.411094903945923,
"learning_rate": 4.812095657442231e-07,
"logits/chosen": -2.8379623889923096,
"logits/rejected": -2.8474135398864746,
"logps/chosen": -292.9294128417969,
"logps/rejected": -291.79937744140625,
"loss": 0.6657,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.03384638577699661,
"rewards/margins": 0.06732925027608871,
"rewards/rejected": -0.10117564350366592,
"step": 810
},
{
"epoch": 0.21460350693535724,
"grad_norm": 2.2789130210876465,
"learning_rate": 4.803310053882831e-07,
"logits/chosen": -2.820188522338867,
"logits/rejected": -2.8341267108917236,
"logps/chosen": -253.18002319335938,
"logps/rejected": -271.46209716796875,
"loss": 0.6585,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.04002877324819565,
"rewards/margins": 0.08040440827608109,
"rewards/rejected": -0.12043318897485733,
"step": 820
},
{
"epoch": 0.2172206228735933,
"grad_norm": 2.6294658184051514,
"learning_rate": 4.794332124596775e-07,
"logits/chosen": -2.8491604328155518,
"logits/rejected": -2.8390445709228516,
"logps/chosen": -288.0977478027344,
"logps/rejected": -289.91839599609375,
"loss": 0.6617,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.025598719716072083,
"rewards/margins": 0.0781911239027977,
"rewards/rejected": -0.10378985106945038,
"step": 830
},
{
"epoch": 0.21983773881182936,
"grad_norm": 2.718003273010254,
"learning_rate": 4.785162619238574e-07,
"logits/chosen": -2.7903778553009033,
"logits/rejected": -2.750192880630493,
"logps/chosen": -271.6007995605469,
"logps/rejected": -255.642822265625,
"loss": 0.6434,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.013516816310584545,
"rewards/margins": 0.11254201829433441,
"rewards/rejected": -0.12605881690979004,
"step": 840
},
{
"epoch": 0.22245485475006543,
"grad_norm": 2.693995714187622,
"learning_rate": 4.775802303459287e-07,
"logits/chosen": -2.7961440086364746,
"logits/rejected": -2.782381534576416,
"logps/chosen": -266.48406982421875,
"logps/rejected": -271.54876708984375,
"loss": 0.6543,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.025890201330184937,
"rewards/margins": 0.09162938594818115,
"rewards/rejected": -0.11751959472894669,
"step": 850
},
{
"epoch": 0.22507197068830148,
"grad_norm": 3.3223588466644287,
"learning_rate": 4.766251958842589e-07,
"logits/chosen": -2.770634174346924,
"logits/rejected": -2.7624752521514893,
"logps/chosen": -295.11322021484375,
"logps/rejected": -291.52655029296875,
"loss": 0.6493,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.03162473067641258,
"rewards/margins": 0.10102814435958862,
"rewards/rejected": -0.1326528638601303,
"step": 860
},
{
"epoch": 0.22768908662653756,
"grad_norm": 2.2951784133911133,
"learning_rate": 4.756512382839506e-07,
"logits/chosen": -2.792806625366211,
"logits/rejected": -2.7687854766845703,
"logps/chosen": -276.4913024902344,
"logps/rejected": -288.6650390625,
"loss": 0.6455,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.06362788379192352,
"rewards/margins": 0.11521414667367935,
"rewards/rejected": -0.17884202301502228,
"step": 870
},
{
"epoch": 0.23030620256477363,
"grad_norm": 2.3468611240386963,
"learning_rate": 4.746584388701831e-07,
"logits/chosen": -2.804765224456787,
"logits/rejected": -2.8049676418304443,
"logps/chosen": -284.9786071777344,
"logps/rejected": -280.96392822265625,
"loss": 0.6438,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.05107206106185913,
"rewards/margins": 0.11629124730825424,
"rewards/rejected": -0.16736331582069397,
"step": 880
},
{
"epoch": 0.23292331850300968,
"grad_norm": 3.075714588165283,
"learning_rate": 4.736468805414218e-07,
"logits/chosen": -2.77662992477417,
"logits/rejected": -2.7775301933288574,
"logps/chosen": -271.46368408203125,
"logps/rejected": -293.26531982421875,
"loss": 0.6421,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.033290714025497437,
"rewards/margins": 0.12201287597417831,
"rewards/rejected": -0.15530358254909515,
"step": 890
},
{
"epoch": 0.23554043444124576,
"grad_norm": 2.879183769226074,
"learning_rate": 4.7261664776249595e-07,
"logits/chosen": -2.7510781288146973,
"logits/rejected": -2.7387068271636963,
"logps/chosen": -250.3533477783203,
"logps/rejected": -251.46630859375,
"loss": 0.6428,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.04132508859038353,
"rewards/margins": 0.12072241306304932,
"rewards/rejected": -0.16204750537872314,
"step": 900
},
{
"epoch": 0.23554043444124576,
"eval_logits/chosen": -2.819901704788208,
"eval_logits/rejected": -2.794234275817871,
"eval_logps/chosen": -285.2049865722656,
"eval_logps/rejected": -275.2684631347656,
"eval_loss": 0.6458239555358887,
"eval_rewards/accuracies": 0.6769999861717224,
"eval_rewards/chosen": -0.024673735722899437,
"eval_rewards/margins": 0.1136314645409584,
"eval_rewards/rejected": -0.138305202126503,
"eval_runtime": 690.9829,
"eval_samples_per_second": 2.894,
"eval_steps_per_second": 0.362,
"step": 900
},
{
"epoch": 0.2381575503794818,
"grad_norm": 2.7687416076660156,
"learning_rate": 4.7156782655754624e-07,
"logits/chosen": -2.8114147186279297,
"logits/rejected": -2.772068977355957,
"logps/chosen": -300.78826904296875,
"logps/rejected": -255.8038330078125,
"loss": 0.6426,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.002123198937624693,
"rewards/margins": 0.1206832155585289,
"rewards/rejected": -0.12280640751123428,
"step": 910
},
{
"epoch": 0.24077466631771788,
"grad_norm": 2.5618391036987305,
"learning_rate": 4.705005045028414e-07,
"logits/chosen": -2.765242338180542,
"logits/rejected": -2.737863063812256,
"logps/chosen": -287.15667724609375,
"logps/rejected": -278.50726318359375,
"loss": 0.6459,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.05967919901013374,
"rewards/margins": 0.11529602855443954,
"rewards/rejected": -0.1749752312898636,
"step": 920
},
{
"epoch": 0.24339178225595393,
"grad_norm": 2.9336323738098145,
"learning_rate": 4.694147707194659e-07,
"logits/chosen": -2.832733631134033,
"logits/rejected": -2.8244283199310303,
"logps/chosen": -294.346923828125,
"logps/rejected": -287.9342346191406,
"loss": 0.6366,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.06329428404569626,
"rewards/margins": 0.1405760794878006,
"rewards/rejected": -0.20387034118175507,
"step": 930
},
{
"epoch": 0.24600889819419,
"grad_norm": 3.908505439758301,
"learning_rate": 4.683107158658781e-07,
"logits/chosen": -2.7808585166931152,
"logits/rejected": -2.763042688369751,
"logps/chosen": -314.3782653808594,
"logps/rejected": -299.661865234375,
"loss": 0.6227,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.05701801925897598,
"rewards/margins": 0.16755308210849762,
"rewards/rejected": -0.2245711088180542,
"step": 940
},
{
"epoch": 0.24862601413242608,
"grad_norm": 3.2749459743499756,
"learning_rate": 4.6718843213034066e-07,
"logits/chosen": -2.7944037914276123,
"logits/rejected": -2.77887225151062,
"logps/chosen": -272.23724365234375,
"logps/rejected": -273.14776611328125,
"loss": 0.633,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.09230604767799377,
"rewards/margins": 0.14217710494995117,
"rewards/rejected": -0.23448316752910614,
"step": 950
},
{
"epoch": 0.2512431300706621,
"grad_norm": 3.0224010944366455,
"learning_rate": 4.660480132232224e-07,
"logits/chosen": -2.805572986602783,
"logits/rejected": -2.80751371383667,
"logps/chosen": -293.3813171386719,
"logps/rejected": -280.83465576171875,
"loss": 0.6507,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.0641000047326088,
"rewards/margins": 0.10990612208843231,
"rewards/rejected": -0.1740061342716217,
"step": 960
},
{
"epoch": 0.25386024600889817,
"grad_norm": 3.5039138793945312,
"learning_rate": 4.64889554369174e-07,
"logits/chosen": -2.805609941482544,
"logits/rejected": -2.771754741668701,
"logps/chosen": -298.55157470703125,
"logps/rejected": -267.65087890625,
"loss": 0.6166,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.0059810527600348,
"rewards/margins": 0.18814215064048767,
"rewards/rejected": -0.1821610927581787,
"step": 970
},
{
"epoch": 0.2564773619471343,
"grad_norm": 2.8160240650177,
"learning_rate": 4.637131522991764e-07,
"logits/chosen": -2.7994441986083984,
"logits/rejected": -2.7969179153442383,
"logps/chosen": -309.35089111328125,
"logps/rejected": -296.6192321777344,
"loss": 0.6321,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.029499268159270287,
"rewards/margins": 0.14880326390266418,
"rewards/rejected": -0.17830254137516022,
"step": 980
},
{
"epoch": 0.2590944778853703,
"grad_norm": 3.782945156097412,
"learning_rate": 4.6251890524246375e-07,
"logits/chosen": -2.8050458431243896,
"logits/rejected": -2.786475658416748,
"logps/chosen": -262.4518737792969,
"logps/rejected": -256.80792236328125,
"loss": 0.6166,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.0701083093881607,
"rewards/margins": 0.18339978158473969,
"rewards/rejected": -0.253508061170578,
"step": 990
},
{
"epoch": 0.26171159382360637,
"grad_norm": 3.791015148162842,
"learning_rate": 4.613069129183218e-07,
"logits/chosen": -2.8377981185913086,
"logits/rejected": -2.799161911010742,
"logps/chosen": -328.35491943359375,
"logps/rejected": -301.65679931640625,
"loss": 0.6381,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.0686495453119278,
"rewards/margins": 0.13748301565647125,
"rewards/rejected": -0.20613256096839905,
"step": 1000
},
{
"epoch": 0.26171159382360637,
"eval_logits/chosen": -2.813830852508545,
"eval_logits/rejected": -2.7887284755706787,
"eval_logps/chosen": -289.12060546875,
"eval_logps/rejected": -282.1760559082031,
"eval_loss": 0.635771632194519,
"eval_rewards/accuracies": 0.6784999966621399,
"eval_rewards/chosen": -0.06382979452610016,
"eval_rewards/margins": 0.14355140924453735,
"eval_rewards/rejected": -0.2073812186717987,
"eval_runtime": 691.4427,
"eval_samples_per_second": 2.893,
"eval_steps_per_second": 0.362,
"step": 1000
},
{
"epoch": 0.2643287097618425,
"grad_norm": 4.366467475891113,
"learning_rate": 4.6007727652776065e-07,
"logits/chosen": -2.7737021446228027,
"logits/rejected": -2.7608792781829834,
"logps/chosen": -254.6834259033203,
"logps/rejected": -263.98565673828125,
"loss": 0.6304,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.036558397114276886,
"rewards/margins": 0.1544768214225769,
"rewards/rejected": -0.1910352259874344,
"step": 1010
},
{
"epoch": 0.2669458257000785,
"grad_norm": 3.2850377559661865,
"learning_rate": 4.588300987450652e-07,
"logits/chosen": -2.82348895072937,
"logits/rejected": -2.7995572090148926,
"logps/chosen": -271.41241455078125,
"logps/rejected": -254.01864624023438,
"loss": 0.6293,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.008820459246635437,
"rewards/margins": 0.1594310700893402,
"rewards/rejected": -0.16825154423713684,
"step": 1020
},
{
"epoch": 0.26956294163831457,
"grad_norm": 3.3716328144073486,
"learning_rate": 4.5756548370922134e-07,
"logits/chosen": -2.781808853149414,
"logits/rejected": -2.7637503147125244,
"logps/chosen": -258.62860107421875,
"logps/rejected": -260.2466125488281,
"loss": 0.6508,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.024007773026823997,
"rewards/margins": 0.11937548965215683,
"rewards/rejected": -0.14338326454162598,
"step": 1030
},
{
"epoch": 0.2721800575765506,
"grad_norm": 3.529965400695801,
"learning_rate": 4.5628353701522047e-07,
"logits/chosen": -2.815080404281616,
"logits/rejected": -2.7873313426971436,
"logps/chosen": -321.65435791015625,
"logps/rejected": -310.28497314453125,
"loss": 0.6072,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.017561940476298332,
"rewards/margins": 0.2143036425113678,
"rewards/rejected": -0.2318655550479889,
"step": 1040
},
{
"epoch": 0.2747971735147867,
"grad_norm": 2.87839412689209,
"learning_rate": 4.549843657052429e-07,
"logits/chosen": -2.834746837615967,
"logits/rejected": -2.808051347732544,
"logps/chosen": -287.9942321777344,
"logps/rejected": -302.9963684082031,
"loss": 0.6048,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.03615923970937729,
"rewards/margins": 0.21066415309906006,
"rewards/rejected": -0.24682338535785675,
"step": 1050
},
{
"epoch": 0.27741428945302277,
"grad_norm": 3.860949993133545,
"learning_rate": 4.5366807825971907e-07,
"logits/chosen": -2.780369758605957,
"logits/rejected": -2.7750542163848877,
"logps/chosen": -262.59075927734375,
"logps/rejected": -269.21051025390625,
"loss": 0.6437,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.08822160959243774,
"rewards/margins": 0.14002035558223724,
"rewards/rejected": -0.228241965174675,
"step": 1060
},
{
"epoch": 0.2800314053912588,
"grad_norm": 6.0348801612854,
"learning_rate": 4.5233478458827176e-07,
"logits/chosen": -2.8092315196990967,
"logits/rejected": -2.785090446472168,
"logps/chosen": -316.466064453125,
"logps/rejected": -282.1798400878906,
"loss": 0.6104,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.08112485706806183,
"rewards/margins": 0.2059168517589569,
"rewards/rejected": -0.2870417535305023,
"step": 1070
},
{
"epoch": 0.2826485213294949,
"grad_norm": 4.09010124206543,
"learning_rate": 4.509845960205389e-07,
"logits/chosen": -2.749141216278076,
"logits/rejected": -2.753202438354492,
"logps/chosen": -304.83111572265625,
"logps/rejected": -288.3349304199219,
"loss": 0.626,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.07901586592197418,
"rewards/margins": 0.17285946011543274,
"rewards/rejected": -0.2518753409385681,
"step": 1080
},
{
"epoch": 0.28526563726773096,
"grad_norm": 4.772919654846191,
"learning_rate": 4.4961762529687736e-07,
"logits/chosen": -2.8033485412597656,
"logits/rejected": -2.7844488620758057,
"logps/chosen": -288.91998291015625,
"logps/rejected": -284.6497802734375,
"loss": 0.6324,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.09251121431589127,
"rewards/margins": 0.15693159401416779,
"rewards/rejected": -0.24944277107715607,
"step": 1090
},
{
"epoch": 0.287882753205967,
"grad_norm": 4.188416957855225,
"learning_rate": 4.482339865589492e-07,
"logits/chosen": -2.8103842735290527,
"logits/rejected": -2.768054962158203,
"logps/chosen": -299.87091064453125,
"logps/rejected": -267.5564880371094,
"loss": 0.6488,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.16943010687828064,
"rewards/margins": 0.12456401437520981,
"rewards/rejected": -0.29399409890174866,
"step": 1100
},
{
"epoch": 0.287882753205967,
"eval_logits/chosen": -2.8070549964904785,
"eval_logits/rejected": -2.782604694366455,
"eval_logps/chosen": -296.5137634277344,
"eval_logps/rejected": -291.989013671875,
"eval_loss": 0.6283535361289978,
"eval_rewards/accuracies": 0.6790000200271606,
"eval_rewards/chosen": -0.13776110112667084,
"eval_rewards/margins": 0.16774973273277283,
"eval_rewards/rejected": -0.30551087856292725,
"eval_runtime": 691.0066,
"eval_samples_per_second": 2.894,
"eval_steps_per_second": 0.362,
"step": 1100
},
{
"epoch": 0.2904998691442031,
"grad_norm": 4.440745830535889,
"learning_rate": 4.4683379534019076e-07,
"logits/chosen": -2.803920269012451,
"logits/rejected": -2.8017265796661377,
"logps/chosen": -300.3214111328125,
"logps/rejected": -309.1615905761719,
"loss": 0.6336,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.1395951509475708,
"rewards/margins": 0.1519310027360916,
"rewards/rejected": -0.2915261387825012,
"step": 1110
},
{
"epoch": 0.29311698508243916,
"grad_norm": 3.8111138343811035,
"learning_rate": 4.4541716855616593e-07,
"logits/chosen": -2.7794926166534424,
"logits/rejected": -2.7597875595092773,
"logps/chosen": -264.9614562988281,
"logps/rejected": -282.9358825683594,
"loss": 0.6252,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.07038460671901703,
"rewards/margins": 0.17066633701324463,
"rewards/rejected": -0.24105095863342285,
"step": 1120
},
{
"epoch": 0.2957341010206752,
"grad_norm": 5.494072914123535,
"learning_rate": 4.4398422449480357e-07,
"logits/chosen": -2.774218797683716,
"logits/rejected": -2.725161075592041,
"logps/chosen": -294.66448974609375,
"logps/rejected": -311.0096740722656,
"loss": 0.6402,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.1438552290201187,
"rewards/margins": 0.14675047993659973,
"rewards/rejected": -0.29060572385787964,
"step": 1130
},
{
"epoch": 0.29835121695891126,
"grad_norm": 4.3281474113464355,
"learning_rate": 4.4253508280652036e-07,
"logits/chosen": -2.7951579093933105,
"logits/rejected": -2.7520532608032227,
"logps/chosen": -317.461181640625,
"logps/rejected": -285.7931213378906,
"loss": 0.6139,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.13621816039085388,
"rewards/margins": 0.19606857001781464,
"rewards/rejected": -0.3322867453098297,
"step": 1140
},
{
"epoch": 0.30096833289714736,
"grad_norm": 6.221525192260742,
"learning_rate": 4.410698644942302e-07,
"logits/chosen": -2.8402047157287598,
"logits/rejected": -2.816387176513672,
"logps/chosen": -297.50286865234375,
"logps/rejected": -292.28436279296875,
"loss": 0.6183,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.10158324241638184,
"rewards/margins": 0.19611066579818726,
"rewards/rejected": -0.2976939082145691,
"step": 1150
},
{
"epoch": 0.3035854488353834,
"grad_norm": 4.492012023925781,
"learning_rate": 4.3958869190324057e-07,
"logits/chosen": -2.76503586769104,
"logits/rejected": -2.7254602909088135,
"logps/chosen": -291.94873046875,
"logps/rejected": -282.52880859375,
"loss": 0.6221,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.12198346853256226,
"rewards/margins": 0.18694952130317688,
"rewards/rejected": -0.30893296003341675,
"step": 1160
},
{
"epoch": 0.30620256477361946,
"grad_norm": 3.562570810317993,
"learning_rate": 4.380916887110365e-07,
"logits/chosen": -2.829111099243164,
"logits/rejected": -2.800809383392334,
"logps/chosen": -290.05316162109375,
"logps/rejected": -266.3580017089844,
"loss": 0.6199,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.14898671209812164,
"rewards/margins": 0.19158688187599182,
"rewards/rejected": -0.34057360887527466,
"step": 1170
},
{
"epoch": 0.30881968071185556,
"grad_norm": 5.379666805267334,
"learning_rate": 4.3657897991695394e-07,
"logits/chosen": -2.7369437217712402,
"logits/rejected": -2.7774927616119385,
"logps/chosen": -281.9171142578125,
"logps/rejected": -300.78912353515625,
"loss": 0.6192,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.11646691709756851,
"rewards/margins": 0.19827672839164734,
"rewards/rejected": -0.31474363803863525,
"step": 1180
},
{
"epoch": 0.3114367966500916,
"grad_norm": 4.079792499542236,
"learning_rate": 4.350506918317416e-07,
"logits/chosen": -2.8184256553649902,
"logits/rejected": -2.788510799407959,
"logps/chosen": -274.4839172363281,
"logps/rejected": -287.8948669433594,
"loss": 0.6194,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.12529827654361725,
"rewards/margins": 0.19565680623054504,
"rewards/rejected": -0.3209550976753235,
"step": 1190
},
{
"epoch": 0.31405391258832765,
"grad_norm": 4.406829833984375,
"learning_rate": 4.335069520670149e-07,
"logits/chosen": -2.7956674098968506,
"logits/rejected": -2.7690110206604004,
"logps/chosen": -252.70156860351562,
"logps/rejected": -279.14111328125,
"loss": 0.6427,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.09379851073026657,
"rewards/margins": 0.14501607418060303,
"rewards/rejected": -0.2388145923614502,
"step": 1200
},
{
"epoch": 0.31405391258832765,
"eval_logits/chosen": -2.8165299892425537,
"eval_logits/rejected": -2.793107032775879,
"eval_logps/chosen": -293.77850341796875,
"eval_logps/rejected": -291.3028259277344,
"eval_loss": 0.622346818447113,
"eval_rewards/accuracies": 0.6834999918937683,
"eval_rewards/chosen": -0.11040891706943512,
"eval_rewards/margins": 0.18824002146720886,
"eval_rewards/rejected": -0.2986489236354828,
"eval_runtime": 690.8187,
"eval_samples_per_second": 2.895,
"eval_steps_per_second": 0.362,
"step": 1200
},
{
"epoch": 0.3166710285265637,
"grad_norm": 4.730831146240234,
"learning_rate": 4.319478895245999e-07,
"logits/chosen": -2.8096089363098145,
"logits/rejected": -2.781852960586548,
"logps/chosen": -277.19305419921875,
"logps/rejected": -268.88653564453125,
"loss": 0.6189,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.12051185220479965,
"rewards/margins": 0.19446460902690887,
"rewards/rejected": -0.3149764835834503,
"step": 1210
},
{
"epoch": 0.3192881444647998,
"grad_norm": 4.179198741912842,
"learning_rate": 4.3037363438577036e-07,
"logits/chosen": -2.8334312438964844,
"logits/rejected": -2.796905517578125,
"logps/chosen": -275.5434875488281,
"logps/rejected": -309.56561279296875,
"loss": 0.6074,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.03255675360560417,
"rewards/margins": 0.21560052037239075,
"rewards/rejected": -0.24815726280212402,
"step": 1220
},
{
"epoch": 0.32190526040303585,
"grad_norm": 3.7570934295654297,
"learning_rate": 4.2878431810037716e-07,
"logits/chosen": -2.8290486335754395,
"logits/rejected": -2.821361780166626,
"logps/chosen": -317.92926025390625,
"logps/rejected": -291.9640197753906,
"loss": 0.6102,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.06272344291210175,
"rewards/margins": 0.21540877223014832,
"rewards/rejected": -0.27813225984573364,
"step": 1230
},
{
"epoch": 0.3245223763412719,
"grad_norm": 5.973113536834717,
"learning_rate": 4.271800733758729e-07,
"logits/chosen": -2.801720380783081,
"logits/rejected": -2.804701566696167,
"logps/chosen": -308.4283142089844,
"logps/rejected": -294.974609375,
"loss": 0.6055,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.04135540500283241,
"rewards/margins": 0.23178556561470032,
"rewards/rejected": -0.27314096689224243,
"step": 1240
},
{
"epoch": 0.327139492279508,
"grad_norm": 5.047220706939697,
"learning_rate": 4.255610341662304e-07,
"logits/chosen": -2.8307595252990723,
"logits/rejected": -2.779573440551758,
"logps/chosen": -282.5008239746094,
"logps/rejected": -278.0930480957031,
"loss": 0.6297,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.07180126756429672,
"rewards/margins": 0.17990802228450775,
"rewards/rejected": -0.2517092823982239,
"step": 1250
},
{
"epoch": 0.32975660821774405,
"grad_norm": 4.12667179107666,
"learning_rate": 4.2392733566075757e-07,
"logits/chosen": -2.8080954551696777,
"logits/rejected": -2.7833712100982666,
"logps/chosen": -279.9812927246094,
"logps/rejected": -274.603271484375,
"loss": 0.6437,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.06445430964231491,
"rewards/margins": 0.1353849321603775,
"rewards/rejected": -0.19983923435211182,
"step": 1260
},
{
"epoch": 0.3323737241559801,
"grad_norm": 3.241464138031006,
"learning_rate": 4.2227911427280973e-07,
"logits/chosen": -2.7715563774108887,
"logits/rejected": -2.7483251094818115,
"logps/chosen": -269.14215087890625,
"logps/rejected": -254.9038543701172,
"loss": 0.6275,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.029628584161400795,
"rewards/margins": 0.1794588267803192,
"rewards/rejected": -0.20908741652965546,
"step": 1270
},
{
"epoch": 0.33499084009421615,
"grad_norm": 6.028203010559082,
"learning_rate": 4.206165076283982e-07,
"logits/chosen": -2.8015265464782715,
"logits/rejected": -2.7831873893737793,
"logps/chosen": -270.62139892578125,
"logps/rejected": -273.0738830566406,
"loss": 0.6107,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.09085245430469513,
"rewards/margins": 0.2116876095533371,
"rewards/rejected": -0.30254003405570984,
"step": 1280
},
{
"epoch": 0.33760795603245225,
"grad_norm": 5.242630958557129,
"learning_rate": 4.1893965455469946e-07,
"logits/chosen": -2.8173327445983887,
"logits/rejected": -2.7973732948303223,
"logps/chosen": -279.14031982421875,
"logps/rejected": -275.79638671875,
"loss": 0.6269,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.14117182791233063,
"rewards/margins": 0.18503603339195251,
"rewards/rejected": -0.32620781660079956,
"step": 1290
},
{
"epoch": 0.3402250719706883,
"grad_norm": 5.775106430053711,
"learning_rate": 4.172486950684626e-07,
"logits/chosen": -2.821103096008301,
"logits/rejected": -2.814502477645874,
"logps/chosen": -279.78289794921875,
"logps/rejected": -298.9765930175781,
"loss": 0.6131,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.11109775304794312,
"rewards/margins": 0.21843478083610535,
"rewards/rejected": -0.32953253388404846,
"step": 1300
},
{
"epoch": 0.3402250719706883,
"eval_logits/chosen": -2.818049430847168,
"eval_logits/rejected": -2.7951488494873047,
"eval_logps/chosen": -297.3945007324219,
"eval_logps/rejected": -296.5805969238281,
"eval_loss": 0.6172210574150085,
"eval_rewards/accuracies": 0.6865000128746033,
"eval_rewards/chosen": -0.14656904339790344,
"eval_rewards/margins": 0.2048574537038803,
"eval_rewards/rejected": -0.35142648220062256,
"eval_runtime": 691.9861,
"eval_samples_per_second": 2.89,
"eval_steps_per_second": 0.361,
"step": 1300
},
{
"epoch": 0.34284218790892435,
"grad_norm": 8.304680824279785,
"learning_rate": 4.155437703643181e-07,
"logits/chosen": -2.841334581375122,
"logits/rejected": -2.806217670440674,
"logps/chosen": -272.61444091796875,
"logps/rejected": -267.8605041503906,
"loss": 0.6005,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.11493051052093506,
"rewards/margins": 0.24178418517112732,
"rewards/rejected": -0.35671466588974,
"step": 1310
},
{
"epoch": 0.34545930384716045,
"grad_norm": 6.887094497680664,
"learning_rate": 4.138250228029881e-07,
"logits/chosen": -2.811464786529541,
"logits/rejected": -2.797884941101074,
"logps/chosen": -295.8591613769531,
"logps/rejected": -319.4233703613281,
"loss": 0.6383,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2382466346025467,
"rewards/margins": 0.16607843339443207,
"rewards/rejected": -0.40432506799697876,
"step": 1320
},
{
"epoch": 0.3480764197853965,
"grad_norm": 4.52334451675415,
"learning_rate": 4.1209259589939935e-07,
"logits/chosen": -2.8012988567352295,
"logits/rejected": -2.8001253604888916,
"logps/chosen": -262.8810119628906,
"logps/rejected": -272.76788330078125,
"loss": 0.6321,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.13144102692604065,
"rewards/margins": 0.17341327667236328,
"rewards/rejected": -0.30485430359840393,
"step": 1330
},
{
"epoch": 0.35069353572363254,
"grad_norm": 3.246675729751587,
"learning_rate": 4.103466343106998e-07,
"logits/chosen": -2.8291964530944824,
"logits/rejected": -2.824831247329712,
"logps/chosen": -302.6276550292969,
"logps/rejected": -286.753662109375,
"loss": 0.6334,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.1298406422138214,
"rewards/margins": 0.16963128745555878,
"rewards/rejected": -0.2994719445705414,
"step": 1340
},
{
"epoch": 0.35331065166186865,
"grad_norm": 4.933244705200195,
"learning_rate": 4.085872838241796e-07,
"logits/chosen": -2.767702102661133,
"logits/rejected": -2.730109691619873,
"logps/chosen": -311.7983703613281,
"logps/rejected": -294.95294189453125,
"loss": 0.6356,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.16958799958229065,
"rewards/margins": 0.17119386792182922,
"rewards/rejected": -0.3407818675041199,
"step": 1350
},
{
"epoch": 0.3559277676001047,
"grad_norm": 6.131802082061768,
"learning_rate": 4.06814691345098e-07,
"logits/chosen": -2.7470338344573975,
"logits/rejected": -2.722545862197876,
"logps/chosen": -288.4170837402344,
"logps/rejected": -289.61102294921875,
"loss": 0.602,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.1348382532596588,
"rewards/margins": 0.2365628182888031,
"rewards/rejected": -0.3714010715484619,
"step": 1360
},
{
"epoch": 0.35854488353834074,
"grad_norm": 4.9708638191223145,
"learning_rate": 4.0502900488441707e-07,
"logits/chosen": -2.7989072799682617,
"logits/rejected": -2.789274215698242,
"logps/chosen": -306.6829528808594,
"logps/rejected": -320.0224304199219,
"loss": 0.6285,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.20997491478919983,
"rewards/margins": 0.1841730773448944,
"rewards/rejected": -0.39414799213409424,
"step": 1370
},
{
"epoch": 0.3611619994765768,
"grad_norm": 6.784174919128418,
"learning_rate": 4.032303735464422e-07,
"logits/chosen": -2.880401134490967,
"logits/rejected": -2.835643768310547,
"logps/chosen": -310.90679931640625,
"logps/rejected": -308.8883361816406,
"loss": 0.6053,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.20711734890937805,
"rewards/margins": 0.24106808006763458,
"rewards/rejected": -0.44818538427352905,
"step": 1380
},
{
"epoch": 0.3637791154148129,
"grad_norm": 5.785353183746338,
"learning_rate": 4.014189475163726e-07,
"logits/chosen": -2.794342517852783,
"logits/rejected": -2.7849628925323486,
"logps/chosen": -297.41961669921875,
"logps/rejected": -308.3134765625,
"loss": 0.6053,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.24589386582374573,
"rewards/margins": 0.2298090010881424,
"rewards/rejected": -0.47570285201072693,
"step": 1390
},
{
"epoch": 0.36639623135304894,
"grad_norm": 6.076969146728516,
"learning_rate": 3.995948780477605e-07,
"logits/chosen": -2.8259429931640625,
"logits/rejected": -2.795186996459961,
"logps/chosen": -306.1077880859375,
"logps/rejected": -299.7892150878906,
"loss": 0.6326,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.21178540587425232,
"rewards/margins": 0.17982172966003418,
"rewards/rejected": -0.3916071355342865,
"step": 1400
},
{
"epoch": 0.36639623135304894,
"eval_logits/chosen": -2.814655065536499,
"eval_logits/rejected": -2.7920358180999756,
"eval_logps/chosen": -300.2596740722656,
"eval_logps/rejected": -300.3965759277344,
"eval_loss": 0.6155202388763428,
"eval_rewards/accuracies": 0.6859999895095825,
"eval_rewards/chosen": -0.175220787525177,
"eval_rewards/margins": 0.2143653929233551,
"eval_rewards/rejected": -0.3895862102508545,
"eval_runtime": 692.0291,
"eval_samples_per_second": 2.89,
"eval_steps_per_second": 0.361,
"step": 1400
},
{
"epoch": 0.369013347291285,
"grad_norm": 6.421947479248047,
"learning_rate": 3.977583174498816e-07,
"logits/chosen": -2.816697359085083,
"logits/rejected": -2.8030014038085938,
"logps/chosen": -300.00640869140625,
"logps/rejected": -303.1688232421875,
"loss": 0.5882,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.14107367396354675,
"rewards/margins": 0.27628999948501587,
"rewards/rejected": -0.41736364364624023,
"step": 1410
},
{
"epoch": 0.3716304632295211,
"grad_norm": 4.980222225189209,
"learning_rate": 3.9590941907501717e-07,
"logits/chosen": -2.8284125328063965,
"logits/rejected": -2.812608242034912,
"logps/chosen": -307.8800354003906,
"logps/rejected": -303.53021240234375,
"loss": 0.6005,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.056650467216968536,
"rewards/margins": 0.2519657611846924,
"rewards/rejected": -0.3086162507534027,
"step": 1420
},
{
"epoch": 0.37424757916775714,
"grad_norm": 5.049463272094727,
"learning_rate": 3.9404833730564974e-07,
"logits/chosen": -2.735870838165283,
"logits/rejected": -2.722884178161621,
"logps/chosen": -285.8304443359375,
"logps/rejected": -297.43341064453125,
"loss": 0.6055,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.12556666135787964,
"rewards/margins": 0.2363204061985016,
"rewards/rejected": -0.3618870973587036,
"step": 1430
},
{
"epoch": 0.3768646951059932,
"grad_norm": 6.007881164550781,
"learning_rate": 3.9217522754157117e-07,
"logits/chosen": -2.8069920539855957,
"logits/rejected": -2.80522084236145,
"logps/chosen": -284.0002136230469,
"logps/rejected": -286.4706115722656,
"loss": 0.5941,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.15235498547554016,
"rewards/margins": 0.26103848218917847,
"rewards/rejected": -0.41339343786239624,
"step": 1440
},
{
"epoch": 0.37948181104422923,
"grad_norm": 4.487087726593018,
"learning_rate": 3.9029024618690785e-07,
"logits/chosen": -2.8235816955566406,
"logits/rejected": -2.7990283966064453,
"logps/chosen": -266.3917541503906,
"logps/rejected": -270.59381103515625,
"loss": 0.6161,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.11356230825185776,
"rewards/margins": 0.21844033896923065,
"rewards/rejected": -0.3320026695728302,
"step": 1450
},
{
"epoch": 0.38209892698246534,
"grad_norm": 3.7364535331726074,
"learning_rate": 3.883935506370605e-07,
"logits/chosen": -2.7793936729431152,
"logits/rejected": -2.770378589630127,
"logps/chosen": -278.8677062988281,
"logps/rejected": -271.43145751953125,
"loss": 0.6076,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.08725923299789429,
"rewards/margins": 0.2380957156419754,
"rewards/rejected": -0.3253549635410309,
"step": 1460
},
{
"epoch": 0.3847160429207014,
"grad_norm": 4.045937538146973,
"learning_rate": 3.864852992655616e-07,
"logits/chosen": -2.7860310077667236,
"logits/rejected": -2.7741951942443848,
"logps/chosen": -279.3297119140625,
"logps/rejected": -292.84356689453125,
"loss": 0.5813,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.10104944556951523,
"rewards/margins": 0.2876027524471283,
"rewards/rejected": -0.38865217566490173,
"step": 1470
},
{
"epoch": 0.38733315885893743,
"grad_norm": 5.180766582489014,
"learning_rate": 3.845656514108515e-07,
"logits/chosen": -2.8035526275634766,
"logits/rejected": -2.784550189971924,
"logps/chosen": -299.1927490234375,
"logps/rejected": -258.96661376953125,
"loss": 0.6143,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.18017061054706573,
"rewards/margins": 0.21995961666107178,
"rewards/rejected": -0.40013018250465393,
"step": 1480
},
{
"epoch": 0.38995027479717354,
"grad_norm": 3.420503616333008,
"learning_rate": 3.8263476736297375e-07,
"logits/chosen": -2.8004748821258545,
"logits/rejected": -2.755922794342041,
"logps/chosen": -280.3719177246094,
"logps/rejected": -276.71051025390625,
"loss": 0.6096,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.11658191680908203,
"rewards/margins": 0.22706842422485352,
"rewards/rejected": -0.34365034103393555,
"step": 1490
},
{
"epoch": 0.3925673907354096,
"grad_norm": 6.24570369720459,
"learning_rate": 3.8069280835019055e-07,
"logits/chosen": -2.7886569499969482,
"logits/rejected": -2.757636070251465,
"logps/chosen": -291.5840759277344,
"logps/rejected": -290.7030334472656,
"loss": 0.6128,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.07107678055763245,
"rewards/margins": 0.2125014066696167,
"rewards/rejected": -0.28357818722724915,
"step": 1500
},
{
"epoch": 0.3925673907354096,
"eval_logits/chosen": -2.819805145263672,
"eval_logits/rejected": -2.798032283782959,
"eval_logps/chosen": -289.036865234375,
"eval_logps/rejected": -288.3089904785156,
"eval_loss": 0.6180471777915955,
"eval_rewards/accuracies": 0.6890000104904175,
"eval_rewards/chosen": -0.06299243867397308,
"eval_rewards/margins": 0.20571817457675934,
"eval_rewards/rejected": -0.2687106430530548,
"eval_runtime": 691.9992,
"eval_samples_per_second": 2.89,
"eval_steps_per_second": 0.361,
"step": 1500
},
{
"epoch": 0.39518450667364563,
"grad_norm": 7.418298721313477,
"learning_rate": 3.7873993652552073e-07,
"logits/chosen": -2.7985031604766846,
"logits/rejected": -2.7847418785095215,
"logps/chosen": -256.2576904296875,
"logps/rejected": -263.3230895996094,
"loss": 0.646,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07267605513334274,
"rewards/margins": 0.14168903231620789,
"rewards/rejected": -0.21436509490013123,
"step": 1510
},
{
"epoch": 0.39780162261188173,
"grad_norm": 3.0412213802337646,
"learning_rate": 3.767763149531995e-07,
"logits/chosen": -2.8065857887268066,
"logits/rejected": -2.792532205581665,
"logps/chosen": -282.3772888183594,
"logps/rejected": -286.32757568359375,
"loss": 0.6036,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.029223937541246414,
"rewards/margins": 0.23573264479637146,
"rewards/rejected": -0.26495662331581116,
"step": 1520
},
{
"epoch": 0.4004187385501178,
"grad_norm": 6.914887428283691,
"learning_rate": 3.7480210759506326e-07,
"logits/chosen": -2.771960973739624,
"logits/rejected": -2.769230365753174,
"logps/chosen": -301.027099609375,
"logps/rejected": -306.0934143066406,
"loss": 0.6321,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.05497425049543381,
"rewards/margins": 0.1824551671743393,
"rewards/rejected": -0.2374294102191925,
"step": 1530
},
{
"epoch": 0.40303585448835383,
"grad_norm": 5.229218006134033,
"learning_rate": 3.728174792968582e-07,
"logits/chosen": -2.7818996906280518,
"logits/rejected": -2.753554582595825,
"logps/chosen": -264.9828186035156,
"logps/rejected": -266.6888122558594,
"loss": 0.6304,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.10081575810909271,
"rewards/margins": 0.1800784170627594,
"rewards/rejected": -0.2808941900730133,
"step": 1540
},
{
"epoch": 0.4056529704265899,
"grad_norm": 3.8269035816192627,
"learning_rate": 3.70822595774476e-07,
"logits/chosen": -2.8083198070526123,
"logits/rejected": -2.7798688411712646,
"logps/chosen": -294.8878479003906,
"logps/rejected": -306.19659423828125,
"loss": 0.5877,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.06873732060194016,
"rewards/margins": 0.28800445795059204,
"rewards/rejected": -0.3567417860031128,
"step": 1550
},
{
"epoch": 0.408270086364826,
"grad_norm": 6.544018268585205,
"learning_rate": 3.688176236001168e-07,
"logits/chosen": -2.7987208366394043,
"logits/rejected": -2.7670371532440186,
"logps/chosen": -304.5577392578125,
"logps/rejected": -289.78729248046875,
"loss": 0.611,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.0676363930106163,
"rewards/margins": 0.23785026371479034,
"rewards/rejected": -0.30548661947250366,
"step": 1560
},
{
"epoch": 0.410887202303062,
"grad_norm": 9.901212692260742,
"learning_rate": 3.6680273018838016e-07,
"logits/chosen": -2.8177802562713623,
"logits/rejected": -2.806378126144409,
"logps/chosen": -281.0837707519531,
"logps/rejected": -286.8470153808594,
"loss": 0.6035,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.11407822370529175,
"rewards/margins": 0.25138336420059204,
"rewards/rejected": -0.3654615879058838,
"step": 1570
},
{
"epoch": 0.4135043182412981,
"grad_norm": 7.281955718994141,
"learning_rate": 3.6477808378228596e-07,
"logits/chosen": -2.787090539932251,
"logits/rejected": -2.7860255241394043,
"logps/chosen": -283.32928466796875,
"logps/rejected": -338.25714111328125,
"loss": 0.6043,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.12236142158508301,
"rewards/margins": 0.2562143802642822,
"rewards/rejected": -0.37857580184936523,
"step": 1580
},
{
"epoch": 0.4161214341795342,
"grad_norm": 8.57088565826416,
"learning_rate": 3.6274385343922674e-07,
"logits/chosen": -2.8543007373809814,
"logits/rejected": -2.8531434535980225,
"logps/chosen": -267.55767822265625,
"logps/rejected": -295.7901306152344,
"loss": 0.6187,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.15387986600399017,
"rewards/margins": 0.21341195702552795,
"rewards/rejected": -0.36729180812835693,
"step": 1590
},
{
"epoch": 0.4187385501177702,
"grad_norm": 5.7539849281311035,
"learning_rate": 3.6070020901685057e-07,
"logits/chosen": -2.7576816082000732,
"logits/rejected": -2.769594669342041,
"logps/chosen": -300.43572998046875,
"logps/rejected": -298.788818359375,
"loss": 0.6223,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.18216048181056976,
"rewards/margins": 0.21212442219257355,
"rewards/rejected": -0.3942849040031433,
"step": 1600
},
{
"epoch": 0.4187385501177702,
"eval_logits/chosen": -2.8147764205932617,
"eval_logits/rejected": -2.792606830596924,
"eval_logps/chosen": -299.62200927734375,
"eval_logps/rejected": -302.40740966796875,
"eval_loss": 0.6088424324989319,
"eval_rewards/accuracies": 0.6945000290870667,
"eval_rewards/chosen": -0.16884401440620422,
"eval_rewards/margins": 0.2408505380153656,
"eval_rewards/rejected": -0.4096945822238922,
"eval_runtime": 691.674,
"eval_samples_per_second": 2.892,
"eval_steps_per_second": 0.361,
"step": 1600
},
{
"epoch": 0.4213556660560063,
"grad_norm": 6.157792568206787,
"learning_rate": 3.5864732115887863e-07,
"logits/chosen": -2.81066632270813,
"logits/rejected": -2.802830219268799,
"logps/chosen": -273.0591735839844,
"logps/rejected": -307.04254150390625,
"loss": 0.5896,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.12094251811504364,
"rewards/margins": 0.2827422022819519,
"rewards/rejected": -0.40368470549583435,
"step": 1610
},
{
"epoch": 0.4239727819942423,
"grad_norm": 6.331284999847412,
"learning_rate": 3.565853612808562e-07,
"logits/chosen": -2.823272466659546,
"logits/rejected": -2.794790744781494,
"logps/chosen": -303.06683349609375,
"logps/rejected": -291.0,
"loss": 0.639,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.23127253353595734,
"rewards/margins": 0.17943724989891052,
"rewards/rejected": -0.41070979833602905,
"step": 1620
},
{
"epoch": 0.4265898979324784,
"grad_norm": 9.121101379394531,
"learning_rate": 3.5451450155583984e-07,
"logits/chosen": -2.733624219894409,
"logits/rejected": -2.7721478939056396,
"logps/chosen": -277.8062744140625,
"logps/rejected": -282.9922790527344,
"loss": 0.623,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.28953424096107483,
"rewards/margins": 0.21646256744861603,
"rewards/rejected": -0.5059967041015625,
"step": 1630
},
{
"epoch": 0.42920701387071447,
"grad_norm": 4.436567306518555,
"learning_rate": 3.5243491490002055e-07,
"logits/chosen": -2.817996025085449,
"logits/rejected": -2.8122916221618652,
"logps/chosen": -305.4420471191406,
"logps/rejected": -318.54742431640625,
"loss": 0.6265,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.32780542969703674,
"rewards/margins": 0.21562886238098145,
"rewards/rejected": -0.5434342622756958,
"step": 1640
},
{
"epoch": 0.4318241298089505,
"grad_norm": 7.695457935333252,
"learning_rate": 3.503467749582857e-07,
"logits/chosen": -2.790708303451538,
"logits/rejected": -2.7539708614349365,
"logps/chosen": -298.7849426269531,
"logps/rejected": -281.51995849609375,
"loss": 0.6324,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.2754608690738678,
"rewards/margins": 0.19722957909107208,
"rewards/rejected": -0.47269049286842346,
"step": 1650
},
{
"epoch": 0.4344412457471866,
"grad_norm": 8.035721778869629,
"learning_rate": 3.482502560897194e-07,
"logits/chosen": -2.7719411849975586,
"logits/rejected": -2.762267589569092,
"logps/chosen": -256.39263916015625,
"logps/rejected": -276.6297607421875,
"loss": 0.6336,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.19001971185207367,
"rewards/margins": 0.172675222158432,
"rewards/rejected": -0.3626949191093445,
"step": 1660
},
{
"epoch": 0.43705836168542267,
"grad_norm": 4.791623115539551,
"learning_rate": 3.4614553335304403e-07,
"logits/chosen": -2.8094491958618164,
"logits/rejected": -2.7578389644622803,
"logps/chosen": -303.371337890625,
"logps/rejected": -291.80615234375,
"loss": 0.5957,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.12800468504428864,
"rewards/margins": 0.26551762223243713,
"rewards/rejected": -0.39352232217788696,
"step": 1670
},
{
"epoch": 0.4396754776236587,
"grad_norm": 7.589243412017822,
"learning_rate": 3.440327824920022e-07,
"logits/chosen": -2.7957282066345215,
"logits/rejected": -2.775707483291626,
"logps/chosen": -309.8748474121094,
"logps/rejected": -299.0494384765625,
"loss": 0.5742,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.08200428634881973,
"rewards/margins": 0.3152574598789215,
"rewards/rejected": -0.39726167917251587,
"step": 1680
},
{
"epoch": 0.44229259356189476,
"grad_norm": 6.186291694641113,
"learning_rate": 3.4191217992068287e-07,
"logits/chosen": -2.8362536430358887,
"logits/rejected": -2.8137047290802,
"logps/chosen": -306.2242431640625,
"logps/rejected": -284.80548095703125,
"loss": 0.6043,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.115182064473629,
"rewards/margins": 0.25850868225097656,
"rewards/rejected": -0.37369078397750854,
"step": 1690
},
{
"epoch": 0.44490970950013087,
"grad_norm": 12.576449394226074,
"learning_rate": 3.3978390270879056e-07,
"logits/chosen": -2.7859883308410645,
"logits/rejected": -2.7761070728302,
"logps/chosen": -251.69168090820312,
"logps/rejected": -273.64825439453125,
"loss": 0.6338,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.23590262234210968,
"rewards/margins": 0.1843734234571457,
"rewards/rejected": -0.42027607560157776,
"step": 1700
},
{
"epoch": 0.44490970950013087,
"eval_logits/chosen": -2.818115234375,
"eval_logits/rejected": -2.7960946559906006,
"eval_logps/chosen": -304.2535095214844,
"eval_logps/rejected": -308.0869140625,
"eval_loss": 0.6060847043991089,
"eval_rewards/accuracies": 0.6924999952316284,
"eval_rewards/chosen": -0.21515871584415436,
"eval_rewards/margins": 0.2513309419155121,
"eval_rewards/rejected": -0.46648964285850525,
"eval_runtime": 691.2139,
"eval_samples_per_second": 2.893,
"eval_steps_per_second": 0.362,
"step": 1700
},
{
"epoch": 0.4475268254383669,
"grad_norm": 8.074392318725586,
"learning_rate": 3.376481285668599e-07,
"logits/chosen": -2.8055875301361084,
"logits/rejected": -2.8101181983947754,
"logps/chosen": -259.6014404296875,
"logps/rejected": -299.0648193359375,
"loss": 0.6022,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.21092364192008972,
"rewards/margins": 0.25584885478019714,
"rewards/rejected": -0.4667724668979645,
"step": 1710
},
{
"epoch": 0.45014394137660296,
"grad_norm": 9.234480857849121,
"learning_rate": 3.355050358314172e-07,
"logits/chosen": -2.838655948638916,
"logits/rejected": -2.825796604156494,
"logps/chosen": -299.0382995605469,
"logps/rejected": -306.70733642578125,
"loss": 0.5981,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.14200787246227264,
"rewards/margins": 0.2596356272697449,
"rewards/rejected": -0.40164345502853394,
"step": 1720
},
{
"epoch": 0.45276105731483907,
"grad_norm": 6.1853437423706055,
"learning_rate": 3.33354803450089e-07,
"logits/chosen": -2.745539426803589,
"logits/rejected": -2.7465980052948,
"logps/chosen": -298.8321533203125,
"logps/rejected": -300.1834411621094,
"loss": 0.6179,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.14898642897605896,
"rewards/margins": 0.23417282104492188,
"rewards/rejected": -0.38315925002098083,
"step": 1730
},
{
"epoch": 0.4553781732530751,
"grad_norm": 3.701824426651001,
"learning_rate": 3.311976109666605e-07,
"logits/chosen": -2.762765407562256,
"logits/rejected": -2.745163917541504,
"logps/chosen": -306.2688293457031,
"logps/rejected": -297.1578369140625,
"loss": 0.6142,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.11404751241207123,
"rewards/margins": 0.22985681891441345,
"rewards/rejected": -0.3439043462276459,
"step": 1740
},
{
"epoch": 0.45799528919131116,
"grad_norm": 5.698086738586426,
"learning_rate": 3.2903363850608317e-07,
"logits/chosen": -2.8657941818237305,
"logits/rejected": -2.8256325721740723,
"logps/chosen": -286.952392578125,
"logps/rejected": -288.02484130859375,
"loss": 0.609,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.22551126778125763,
"rewards/margins": 0.23173291981220245,
"rewards/rejected": -0.45724421739578247,
"step": 1750
},
{
"epoch": 0.46061240512954726,
"grad_norm": 7.6980085372924805,
"learning_rate": 3.2686306675943477e-07,
"logits/chosen": -2.792118549346924,
"logits/rejected": -2.8060059547424316,
"logps/chosen": -294.06951904296875,
"logps/rejected": -291.16302490234375,
"loss": 0.6134,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.20452764630317688,
"rewards/margins": 0.24044232070446014,
"rewards/rejected": -0.44496995210647583,
"step": 1760
},
{
"epoch": 0.4632295210677833,
"grad_norm": 4.300843238830566,
"learning_rate": 3.2468607696883145e-07,
"logits/chosen": -2.7653212547302246,
"logits/rejected": -2.756118059158325,
"logps/chosen": -298.01544189453125,
"logps/rejected": -333.34234619140625,
"loss": 0.5883,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.2969765067100525,
"rewards/margins": 0.29465410113334656,
"rewards/rejected": -0.5916305780410767,
"step": 1770
},
{
"epoch": 0.46584663700601936,
"grad_norm": 9.618111610412598,
"learning_rate": 3.2250285091229435e-07,
"logits/chosen": -2.825916290283203,
"logits/rejected": -2.8047428131103516,
"logps/chosen": -277.54571533203125,
"logps/rejected": -286.90704345703125,
"loss": 0.6269,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2800549864768982,
"rewards/margins": 0.20103518664836884,
"rewards/rejected": -0.4810902178287506,
"step": 1780
},
{
"epoch": 0.4684637529442554,
"grad_norm": 15.666852951049805,
"learning_rate": 3.2031357088857083e-07,
"logits/chosen": -2.8130288124084473,
"logits/rejected": -2.8077621459960938,
"logps/chosen": -317.0379333496094,
"logps/rejected": -347.8671569824219,
"loss": 0.6115,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.24338212609291077,
"rewards/margins": 0.24569590389728546,
"rewards/rejected": -0.4890781044960022,
"step": 1790
},
{
"epoch": 0.4710808688824915,
"grad_norm": 6.9462571144104,
"learning_rate": 3.1811841970191267e-07,
"logits/chosen": -2.736687183380127,
"logits/rejected": -2.714433193206787,
"logps/chosen": -264.3397521972656,
"logps/rejected": -324.6456604003906,
"loss": 0.585,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.18001236021518707,
"rewards/margins": 0.31897181272506714,
"rewards/rejected": -0.4989841878414154,
"step": 1800
},
{
"epoch": 0.4710808688824915,
"eval_logits/chosen": -2.8173904418945312,
"eval_logits/rejected": -2.7949471473693848,
"eval_logps/chosen": -296.00537109375,
"eval_logps/rejected": -299.93682861328125,
"eval_loss": 0.6049584746360779,
"eval_rewards/accuracies": 0.6915000081062317,
"eval_rewards/chosen": -0.1326776146888733,
"eval_rewards/margins": 0.25231143832206726,
"eval_rewards/rejected": -0.38498908281326294,
"eval_runtime": 691.5153,
"eval_samples_per_second": 2.892,
"eval_steps_per_second": 0.362,
"step": 1800
},
{
"epoch": 0.47369798482072756,
"grad_norm": 4.673962116241455,
"learning_rate": 3.1591758064681257e-07,
"logits/chosen": -2.7477469444274902,
"logits/rejected": -2.7178540229797363,
"logps/chosen": -282.83074951171875,
"logps/rejected": -272.26715087890625,
"loss": 0.5961,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.11454129219055176,
"rewards/margins": 0.27904239296913147,
"rewards/rejected": -0.3935837149620056,
"step": 1810
},
{
"epoch": 0.4763151007589636,
"grad_norm": 7.684245586395264,
"learning_rate": 3.13711237492698e-07,
"logits/chosen": -2.7976129055023193,
"logits/rejected": -2.7869057655334473,
"logps/chosen": -313.35540771484375,
"logps/rejected": -318.04559326171875,
"loss": 0.6319,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.1507539302110672,
"rewards/margins": 0.1945343315601349,
"rewards/rejected": -0.3452882170677185,
"step": 1820
},
{
"epoch": 0.4789322166971997,
"grad_norm": 4.426579475402832,
"learning_rate": 3.1149957446858767e-07,
"logits/chosen": -2.791010618209839,
"logits/rejected": -2.807931423187256,
"logps/chosen": -277.4505310058594,
"logps/rejected": -279.3646240234375,
"loss": 0.6403,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.12662403285503387,
"rewards/margins": 0.16396556794643402,
"rewards/rejected": -0.2905896306037903,
"step": 1830
},
{
"epoch": 0.48154933263543576,
"grad_norm": 5.900054931640625,
"learning_rate": 3.0928277624770736e-07,
"logits/chosen": -2.843986988067627,
"logits/rejected": -2.823529005050659,
"logps/chosen": -312.50799560546875,
"logps/rejected": -315.56402587890625,
"loss": 0.5825,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.0948447436094284,
"rewards/margins": 0.32227185368537903,
"rewards/rejected": -0.41711658239364624,
"step": 1840
},
{
"epoch": 0.4841664485736718,
"grad_norm": 4.000248908996582,
"learning_rate": 3.0706102793207073e-07,
"logits/chosen": -2.8290603160858154,
"logits/rejected": -2.8024706840515137,
"logps/chosen": -316.80023193359375,
"logps/rejected": -323.507080078125,
"loss": 0.5882,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.1260642558336258,
"rewards/margins": 0.2963547706604004,
"rewards/rejected": -0.422419011592865,
"step": 1850
},
{
"epoch": 0.48678356451190785,
"grad_norm": 7.178162574768066,
"learning_rate": 3.048345150370226e-07,
"logits/chosen": -2.8230552673339844,
"logits/rejected": -2.817823886871338,
"logps/chosen": -320.08123779296875,
"logps/rejected": -328.2519836425781,
"loss": 0.6011,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.1783401370048523,
"rewards/margins": 0.27760833501815796,
"rewards/rejected": -0.45594844222068787,
"step": 1860
},
{
"epoch": 0.48940068045014395,
"grad_norm": 5.042900562286377,
"learning_rate": 3.0260342347574913e-07,
"logits/chosen": -2.809600353240967,
"logits/rejected": -2.78784441947937,
"logps/chosen": -304.2792053222656,
"logps/rejected": -314.709716796875,
"loss": 0.5808,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.1620454490184784,
"rewards/margins": 0.3016073703765869,
"rewards/rejected": -0.4636527895927429,
"step": 1870
},
{
"epoch": 0.49201779638838,
"grad_norm": 6.708124160766602,
"learning_rate": 3.0036793954375357e-07,
"logits/chosen": -2.840010643005371,
"logits/rejected": -2.820410966873169,
"logps/chosen": -301.98583984375,
"logps/rejected": -291.33465576171875,
"loss": 0.5776,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.15946264564990997,
"rewards/margins": 0.32609638571739197,
"rewards/rejected": -0.48555904626846313,
"step": 1880
},
{
"epoch": 0.49463491232661605,
"grad_norm": 4.842483043670654,
"learning_rate": 2.9812824990330085e-07,
"logits/chosen": -2.8116726875305176,
"logits/rejected": -2.8013501167297363,
"logps/chosen": -312.96807861328125,
"logps/rejected": -315.23675537109375,
"loss": 0.5975,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.20859690010547638,
"rewards/margins": 0.28837090730667114,
"rewards/rejected": -0.4969678521156311,
"step": 1890
},
{
"epoch": 0.49725202826485215,
"grad_norm": 11.47492790222168,
"learning_rate": 2.958845415678316e-07,
"logits/chosen": -2.8100364208221436,
"logits/rejected": -2.7813189029693604,
"logps/chosen": -317.1954650878906,
"logps/rejected": -327.9840087890625,
"loss": 0.577,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.21498079597949982,
"rewards/margins": 0.32284659147262573,
"rewards/rejected": -0.5378273725509644,
"step": 1900
},
{
"epoch": 0.49725202826485215,
"eval_logits/chosen": -2.8176026344299316,
"eval_logits/rejected": -2.7953593730926514,
"eval_logps/chosen": -304.433349609375,
"eval_logps/rejected": -310.2669677734375,
"eval_loss": 0.6012681722640991,
"eval_rewards/accuracies": 0.6965000033378601,
"eval_rewards/chosen": -0.2169574648141861,
"eval_rewards/margins": 0.27133309841156006,
"eval_rewards/rejected": -0.4882905185222626,
"eval_runtime": 691.3293,
"eval_samples_per_second": 2.893,
"eval_steps_per_second": 0.362,
"step": 1900
},
{
"epoch": 0.4998691442030882,
"grad_norm": 8.036276817321777,
"learning_rate": 2.936370018863459e-07,
"logits/chosen": -2.833437442779541,
"logits/rejected": -2.8240761756896973,
"logps/chosen": -301.29473876953125,
"logps/rejected": -287.30487060546875,
"loss": 0.6058,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2125242054462433,
"rewards/margins": 0.2442711889743805,
"rewards/rejected": -0.4567953944206238,
"step": 1910
},
{
"epoch": 0.5024862601413242,
"grad_norm": 6.088084697723389,
"learning_rate": 2.913858185277605e-07,
"logits/chosen": -2.793074131011963,
"logits/rejected": -2.7879836559295654,
"logps/chosen": -291.63409423828125,
"logps/rejected": -303.8699035644531,
"loss": 0.5963,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.14563243091106415,
"rewards/margins": 0.27376314997673035,
"rewards/rejected": -0.4193955361843109,
"step": 1920
},
{
"epoch": 0.5051033760795604,
"grad_norm": 6.633253574371338,
"learning_rate": 2.89131179465238e-07,
"logits/chosen": -2.763582706451416,
"logits/rejected": -2.7273335456848145,
"logps/chosen": -300.27764892578125,
"logps/rejected": -291.0055236816406,
"loss": 0.5841,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.12304127216339111,
"rewards/margins": 0.3036150336265564,
"rewards/rejected": -0.4266563355922699,
"step": 1930
},
{
"epoch": 0.5077204920177963,
"grad_norm": 4.170144557952881,
"learning_rate": 2.8687327296049125e-07,
"logits/chosen": -2.803448438644409,
"logits/rejected": -2.7855215072631836,
"logps/chosen": -287.71673583984375,
"logps/rejected": -312.64544677734375,
"loss": 0.6077,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.14048686623573303,
"rewards/margins": 0.2633481025695801,
"rewards/rejected": -0.4038349688053131,
"step": 1940
},
{
"epoch": 0.5103376079560324,
"grad_norm": 4.711779594421387,
"learning_rate": 2.846122875480637e-07,
"logits/chosen": -2.823185682296753,
"logits/rejected": -2.7931466102600098,
"logps/chosen": -301.4597473144531,
"logps/rejected": -299.9159851074219,
"loss": 0.6066,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.10608525574207306,
"rewards/margins": 0.25272199511528015,
"rewards/rejected": -0.3588072657585144,
"step": 1950
},
{
"epoch": 0.5129547238942685,
"grad_norm": 5.881545543670654,
"learning_rate": 2.8234841201958647e-07,
"logits/chosen": -2.8165388107299805,
"logits/rejected": -2.784043550491333,
"logps/chosen": -311.29217529296875,
"logps/rejected": -301.19964599609375,
"loss": 0.5839,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.1093025654554367,
"rewards/margins": 0.299915611743927,
"rewards/rejected": -0.4092181622982025,
"step": 1960
},
{
"epoch": 0.5155718398325045,
"grad_norm": 10.640946388244629,
"learning_rate": 2.800818354080148e-07,
"logits/chosen": -2.7974326610565186,
"logits/rejected": -2.7710323333740234,
"logps/chosen": -303.19610595703125,
"logps/rejected": -281.1106872558594,
"loss": 0.6138,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.13868093490600586,
"rewards/margins": 0.2444918155670166,
"rewards/rejected": -0.38317275047302246,
"step": 1970
},
{
"epoch": 0.5181889557707406,
"grad_norm": 5.855273246765137,
"learning_rate": 2.778127469718435e-07,
"logits/chosen": -2.751603364944458,
"logits/rejected": -2.7628543376922607,
"logps/chosen": -261.6673278808594,
"logps/rejected": -309.0796813964844,
"loss": 0.5864,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.1476416289806366,
"rewards/margins": 0.2927255630493164,
"rewards/rejected": -0.4403671622276306,
"step": 1980
},
{
"epoch": 0.5208060717089767,
"grad_norm": 5.992628574371338,
"learning_rate": 2.755413361793039e-07,
"logits/chosen": -2.7673847675323486,
"logits/rejected": -2.7404510974884033,
"logps/chosen": -280.890869140625,
"logps/rejected": -294.01092529296875,
"loss": 0.6048,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.15447109937667847,
"rewards/margins": 0.2593531310558319,
"rewards/rejected": -0.4138242304325104,
"step": 1990
},
{
"epoch": 0.5234231876472127,
"grad_norm": 6.741150379180908,
"learning_rate": 2.7326779269254356e-07,
"logits/chosen": -2.826737880706787,
"logits/rejected": -2.811283588409424,
"logps/chosen": -320.9913024902344,
"logps/rejected": -290.5726318359375,
"loss": 0.5945,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.14564435184001923,
"rewards/margins": 0.29357942938804626,
"rewards/rejected": -0.4392237663269043,
"step": 2000
},
{
"epoch": 0.5234231876472127,
"eval_logits/chosen": -2.812201976776123,
"eval_logits/rejected": -2.7902560234069824,
"eval_logps/chosen": -303.8027648925781,
"eval_logps/rejected": -310.42926025390625,
"eval_loss": 0.5991718173027039,
"eval_rewards/accuracies": 0.6995000243186951,
"eval_rewards/chosen": -0.21065115928649902,
"eval_rewards/margins": 0.27926215529441833,
"eval_rewards/rejected": -0.48991334438323975,
"eval_runtime": 691.9553,
"eval_samples_per_second": 2.89,
"eval_steps_per_second": 0.361,
"step": 2000
},
{
"epoch": 0.5260403035854488,
"grad_norm": 5.159753322601318,
"learning_rate": 2.709923063517895e-07,
"logits/chosen": -2.770754337310791,
"logits/rejected": -2.7877042293548584,
"logps/chosen": -297.4669494628906,
"logps/rejected": -326.15008544921875,
"loss": 0.5803,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.18324916064739227,
"rewards/margins": 0.3264145255088806,
"rewards/rejected": -0.5096637010574341,
"step": 2010
},
{
"epoch": 0.528657419523685,
"grad_norm": 9.780900001525879,
"learning_rate": 2.68715067159496e-07,
"logits/chosen": -2.804417133331299,
"logits/rejected": -2.7843241691589355,
"logps/chosen": -287.03619384765625,
"logps/rejected": -296.3020324707031,
"loss": 0.5831,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.18021352589130402,
"rewards/margins": 0.30431440472602844,
"rewards/rejected": -0.4845278859138489,
"step": 2020
},
{
"epoch": 0.5312745354619209,
"grad_norm": 7.88455867767334,
"learning_rate": 2.664362652644806e-07,
"logits/chosen": -2.820744514465332,
"logits/rejected": -2.8191521167755127,
"logps/chosen": -334.691650390625,
"logps/rejected": -322.51885986328125,
"loss": 0.5813,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.22317573428153992,
"rewards/margins": 0.33498162031173706,
"rewards/rejected": -0.5581573247909546,
"step": 2030
},
{
"epoch": 0.533891651400157,
"grad_norm": 6.620345115661621,
"learning_rate": 2.6415609094604555e-07,
"logits/chosen": -2.802522659301758,
"logits/rejected": -2.8061249256134033,
"logps/chosen": -310.2366638183594,
"logps/rejected": -317.20941162109375,
"loss": 0.6023,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.22533388435840607,
"rewards/margins": 0.28193774819374084,
"rewards/rejected": -0.5072715878486633,
"step": 2040
},
{
"epoch": 0.5365087673383931,
"grad_norm": 8.580389022827148,
"learning_rate": 2.618747345980904e-07,
"logits/chosen": -2.8094029426574707,
"logits/rejected": -2.768106460571289,
"logps/chosen": -293.4418029785156,
"logps/rejected": -266.50897216796875,
"loss": 0.6014,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.28857478499412537,
"rewards/margins": 0.2732298970222473,
"rewards/rejected": -0.5618046522140503,
"step": 2050
},
{
"epoch": 0.5391258832766291,
"grad_norm": 11.197132110595703,
"learning_rate": 2.595923867132136e-07,
"logits/chosen": -2.8401012420654297,
"logits/rejected": -2.835894823074341,
"logps/chosen": -327.6039733886719,
"logps/rejected": -335.93634033203125,
"loss": 0.5892,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.29747676849365234,
"rewards/margins": 0.3320815861225128,
"rewards/rejected": -0.6295583844184875,
"step": 2060
},
{
"epoch": 0.5417429992148652,
"grad_norm": 7.386964797973633,
"learning_rate": 2.5730923786680667e-07,
"logits/chosen": -2.820725917816162,
"logits/rejected": -2.821699619293213,
"logps/chosen": -294.2755432128906,
"logps/rejected": -329.28900146484375,
"loss": 0.6084,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.28539037704467773,
"rewards/margins": 0.27198493480682373,
"rewards/rejected": -0.5573753714561462,
"step": 2070
},
{
"epoch": 0.5443601151531012,
"grad_norm": 10.91450023651123,
"learning_rate": 2.5502547870114135e-07,
"logits/chosen": -2.798468589782715,
"logits/rejected": -2.764756441116333,
"logps/chosen": -296.8208923339844,
"logps/rejected": -290.93609619140625,
"loss": 0.6123,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.25504210591316223,
"rewards/margins": 0.26738548278808594,
"rewards/rejected": -0.5224276185035706,
"step": 2080
},
{
"epoch": 0.5469772310913373,
"grad_norm": 9.419450759887695,
"learning_rate": 2.527412999094506e-07,
"logits/chosen": -2.7591891288757324,
"logits/rejected": -2.7384586334228516,
"logps/chosen": -340.7040100097656,
"logps/rejected": -353.3229064941406,
"loss": 0.5947,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.2267749011516571,
"rewards/margins": 0.2946481704711914,
"rewards/rejected": -0.5214229822158813,
"step": 2090
},
{
"epoch": 0.5495943470295734,
"grad_norm": 9.121070861816406,
"learning_rate": 2.5045689222000636e-07,
"logits/chosen": -2.748777151107788,
"logits/rejected": -2.737816333770752,
"logps/chosen": -279.33941650390625,
"logps/rejected": -290.88262939453125,
"loss": 0.5913,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.20830078423023224,
"rewards/margins": 0.28861740231513977,
"rewards/rejected": -0.4969182014465332,
"step": 2100
},
{
"epoch": 0.5495943470295734,
"eval_logits/chosen": -2.8085484504699707,
"eval_logits/rejected": -2.786346673965454,
"eval_logps/chosen": -306.4640808105469,
"eval_logps/rejected": -313.952880859375,
"eval_loss": 0.5981019139289856,
"eval_rewards/accuracies": 0.7024999856948853,
"eval_rewards/chosen": -0.23726463317871094,
"eval_rewards/margins": 0.2878848612308502,
"eval_rewards/rejected": -0.5251494646072388,
"eval_runtime": 690.4278,
"eval_samples_per_second": 2.897,
"eval_steps_per_second": 0.362,
"step": 2100
},
{
"epoch": 0.5522114629678094,
"grad_norm": 7.360952854156494,
"learning_rate": 2.481724463801933e-07,
"logits/chosen": -2.7974154949188232,
"logits/rejected": -2.7778165340423584,
"logps/chosen": -320.70465087890625,
"logps/rejected": -308.23455810546875,
"loss": 0.5916,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.25429460406303406,
"rewards/margins": 0.29730120301246643,
"rewards/rejected": -0.5515958070755005,
"step": 2110
},
{
"epoch": 0.5548285789060455,
"grad_norm": 9.077162742614746,
"learning_rate": 2.4588815314058154e-07,
"logits/chosen": -2.7863690853118896,
"logits/rejected": -2.787247896194458,
"logps/chosen": -283.7870788574219,
"logps/rejected": -277.558837890625,
"loss": 0.5976,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.24108314514160156,
"rewards/margins": 0.27977603673934937,
"rewards/rejected": -0.5208591818809509,
"step": 2120
},
{
"epoch": 0.5574456948442816,
"grad_norm": 6.194889545440674,
"learning_rate": 2.4360420323899917e-07,
"logits/chosen": -2.7870755195617676,
"logits/rejected": -2.779362916946411,
"logps/chosen": -321.5159606933594,
"logps/rejected": -313.3367614746094,
"loss": 0.6106,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.25045931339263916,
"rewards/margins": 0.27981314063072205,
"rewards/rejected": -0.5302724242210388,
"step": 2130
},
{
"epoch": 0.5600628107825176,
"grad_norm": 9.01162338256836,
"learning_rate": 2.4132078738460583e-07,
"logits/chosen": -2.821700096130371,
"logits/rejected": -2.7977004051208496,
"logps/chosen": -299.77734375,
"logps/rejected": -288.15472412109375,
"loss": 0.5911,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2075999230146408,
"rewards/margins": 0.2872273027896881,
"rewards/rejected": -0.49482718110084534,
"step": 2140
},
{
"epoch": 0.5626799267207537,
"grad_norm": 8.978148460388184,
"learning_rate": 2.390380962419682e-07,
"logits/chosen": -2.7910008430480957,
"logits/rejected": -2.7853500843048096,
"logps/chosen": -271.1761474609375,
"logps/rejected": -258.0618896484375,
"loss": 0.6279,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2159349024295807,
"rewards/margins": 0.2157304286956787,
"rewards/rejected": -0.4316653609275818,
"step": 2150
},
{
"epoch": 0.5652970426589898,
"grad_norm": 10.330108642578125,
"learning_rate": 2.3675632041513977e-07,
"logits/chosen": -2.8272249698638916,
"logits/rejected": -2.781740427017212,
"logps/chosen": -321.1408996582031,
"logps/rejected": -290.31451416015625,
"loss": 0.566,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1839137077331543,
"rewards/margins": 0.36078041791915894,
"rewards/rejected": -0.5446941256523132,
"step": 2160
},
{
"epoch": 0.5679141585972258,
"grad_norm": 4.827859401702881,
"learning_rate": 2.344756504317453e-07,
"logits/chosen": -2.7731990814208984,
"logits/rejected": -2.739841938018799,
"logps/chosen": -311.63385009765625,
"logps/rejected": -300.05657958984375,
"loss": 0.6069,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.37105852365493774,
"rewards/margins": 0.2651851773262024,
"rewards/rejected": -0.6362437009811401,
"step": 2170
},
{
"epoch": 0.5705312745354619,
"grad_norm": 7.324320316314697,
"learning_rate": 2.3219627672707237e-07,
"logits/chosen": -2.7636940479278564,
"logits/rejected": -2.7629504203796387,
"logps/chosen": -312.3614196777344,
"logps/rejected": -291.49920654296875,
"loss": 0.6201,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.40163812041282654,
"rewards/margins": 0.2354915589094162,
"rewards/rejected": -0.6371296644210815,
"step": 2180
},
{
"epoch": 0.573148390473698,
"grad_norm": 9.793487548828125,
"learning_rate": 2.2991838962816918e-07,
"logits/chosen": -2.760166645050049,
"logits/rejected": -2.7421138286590576,
"logps/chosen": -309.69378662109375,
"logps/rejected": -330.1057434082031,
"loss": 0.6189,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.391974538564682,
"rewards/margins": 0.23559853434562683,
"rewards/rejected": -0.6275731325149536,
"step": 2190
},
{
"epoch": 0.575765506411934,
"grad_norm": 4.884433746337891,
"learning_rate": 2.2764217933795297e-07,
"logits/chosen": -2.7735462188720703,
"logits/rejected": -2.7576115131378174,
"logps/chosen": -306.01983642578125,
"logps/rejected": -319.36273193359375,
"loss": 0.5816,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.28672754764556885,
"rewards/margins": 0.3387922942638397,
"rewards/rejected": -0.625519871711731,
"step": 2200
},
{
"epoch": 0.575765506411934,
"eval_logits/chosen": -2.806988000869751,
"eval_logits/rejected": -2.7848920822143555,
"eval_logps/chosen": -309.6146240234375,
"eval_logps/rejected": -317.14105224609375,
"eval_loss": 0.5989395976066589,
"eval_rewards/accuracies": 0.6970000267028809,
"eval_rewards/chosen": -0.26876989006996155,
"eval_rewards/margins": 0.28826138377189636,
"eval_rewards/rejected": -0.5570313334465027,
"eval_runtime": 692.0182,
"eval_samples_per_second": 2.89,
"eval_steps_per_second": 0.361,
"step": 2200
},
{
"epoch": 0.5783826223501701,
"grad_norm": 5.080691337585449,
"learning_rate": 2.253678359193278e-07,
"logits/chosen": -2.8626627922058105,
"logits/rejected": -2.8227312564849854,
"logps/chosen": -323.10284423828125,
"logps/rejected": -324.9154968261719,
"loss": 0.6192,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.28973332047462463,
"rewards/margins": 0.24134087562561035,
"rewards/rejected": -0.5310741662979126,
"step": 2210
},
{
"epoch": 0.5809997382884062,
"grad_norm": 8.136847496032715,
"learning_rate": 2.230955492793149e-07,
"logits/chosen": -2.7363781929016113,
"logits/rejected": -2.747398853302002,
"logps/chosen": -315.01092529296875,
"logps/rejected": -321.312744140625,
"loss": 0.6301,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.2270394265651703,
"rewards/margins": 0.22412936389446259,
"rewards/rejected": -0.4511687755584717,
"step": 2220
},
{
"epoch": 0.5836168542266422,
"grad_norm": 3.2636797428131104,
"learning_rate": 2.2082550915319468e-07,
"logits/chosen": -2.746173858642578,
"logits/rejected": -2.7479488849639893,
"logps/chosen": -311.60443115234375,
"logps/rejected": -304.00933837890625,
"loss": 0.5897,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.16526171565055847,
"rewards/margins": 0.31148332357406616,
"rewards/rejected": -0.47674503922462463,
"step": 2230
},
{
"epoch": 0.5862339701648783,
"grad_norm": 7.513117790222168,
"learning_rate": 2.1855790508866433e-07,
"logits/chosen": -2.7626214027404785,
"logits/rejected": -2.766356945037842,
"logps/chosen": -345.93560791015625,
"logps/rejected": -345.16632080078125,
"loss": 0.6017,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.19639845192432404,
"rewards/margins": 0.2772556245326996,
"rewards/rejected": -0.473654180765152,
"step": 2240
},
{
"epoch": 0.5888510861031143,
"grad_norm": 4.226502418518066,
"learning_rate": 2.162929264300107e-07,
"logits/chosen": -2.7443809509277344,
"logits/rejected": -2.740731716156006,
"logps/chosen": -298.61883544921875,
"logps/rejected": -312.0686950683594,
"loss": 0.5729,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.14046551287174225,
"rewards/margins": 0.34373658895492554,
"rewards/rejected": -0.4842020869255066,
"step": 2250
},
{
"epoch": 0.5914682020413504,
"grad_norm": 5.33687162399292,
"learning_rate": 2.1403076230230005e-07,
"logits/chosen": -2.767137289047241,
"logits/rejected": -2.7396111488342285,
"logps/chosen": -312.28643798828125,
"logps/rejected": -306.20172119140625,
"loss": 0.616,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.19273105263710022,
"rewards/margins": 0.26331207156181335,
"rewards/rejected": -0.45604315400123596,
"step": 2260
},
{
"epoch": 0.5940853179795865,
"grad_norm": 9.639008522033691,
"learning_rate": 2.1177160159558596e-07,
"logits/chosen": -2.7518250942230225,
"logits/rejected": -2.7383649349212646,
"logps/chosen": -321.7221374511719,
"logps/rejected": -297.3667297363281,
"loss": 0.6038,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.21679162979125977,
"rewards/margins": 0.29109686613082886,
"rewards/rejected": -0.5078884959220886,
"step": 2270
},
{
"epoch": 0.5967024339178225,
"grad_norm": 6.384767055511475,
"learning_rate": 2.0951563294913734e-07,
"logits/chosen": -2.760425090789795,
"logits/rejected": -2.7438526153564453,
"logps/chosen": -299.39373779296875,
"logps/rejected": -302.9912109375,
"loss": 0.5717,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.20336699485778809,
"rewards/margins": 0.3353096842765808,
"rewards/rejected": -0.5386766791343689,
"step": 2280
},
{
"epoch": 0.5993195498560586,
"grad_norm": 6.036366939544678,
"learning_rate": 2.072630447356869e-07,
"logits/chosen": -2.7959117889404297,
"logits/rejected": -2.7956790924072266,
"logps/chosen": -300.03179931640625,
"logps/rejected": -291.49481201171875,
"loss": 0.6001,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.23898771405220032,
"rewards/margins": 0.26846712827682495,
"rewards/rejected": -0.5074548125267029,
"step": 2290
},
{
"epoch": 0.6019366657942947,
"grad_norm": 7.8020195960998535,
"learning_rate": 2.0501402504570232e-07,
"logits/chosen": -2.829082727432251,
"logits/rejected": -2.772502899169922,
"logps/chosen": -318.4316711425781,
"logps/rejected": -315.959716796875,
"loss": 0.5824,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.22740764915943146,
"rewards/margins": 0.3216533958911896,
"rewards/rejected": -0.5490610003471375,
"step": 2300
},
{
"epoch": 0.6019366657942947,
"eval_logits/chosen": -2.80366849899292,
"eval_logits/rejected": -2.7820827960968018,
"eval_logps/chosen": -305.00982666015625,
"eval_logps/rejected": -313.32330322265625,
"eval_loss": 0.5960872769355774,
"eval_rewards/accuracies": 0.6955000162124634,
"eval_rewards/chosen": -0.2227218896150589,
"eval_rewards/margins": 0.2961318790912628,
"eval_rewards/rejected": -0.5188537836074829,
"eval_runtime": 691.9375,
"eval_samples_per_second": 2.89,
"eval_steps_per_second": 0.361,
"step": 2300
},
{
"epoch": 0.6045537817325307,
"grad_norm": 12.083107948303223,
"learning_rate": 2.027687616716804e-07,
"logits/chosen": -2.72344970703125,
"logits/rejected": -2.7168376445770264,
"logps/chosen": -268.31243896484375,
"logps/rejected": -255.6737518310547,
"loss": 0.6189,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2189827412366867,
"rewards/margins": 0.24416430294513702,
"rewards/rejected": -0.46314701437950134,
"step": 2310
},
{
"epoch": 0.6071708976707668,
"grad_norm": 8.845372200012207,
"learning_rate": 2.005274420924668e-07,
"logits/chosen": -2.790346145629883,
"logits/rejected": -2.778743267059326,
"logps/chosen": -295.9941711425781,
"logps/rejected": -287.6865234375,
"loss": 0.6086,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.25174736976623535,
"rewards/margins": 0.2748829126358032,
"rewards/rejected": -0.5266302824020386,
"step": 2320
},
{
"epoch": 0.6097880136090029,
"grad_norm": 7.964311599731445,
"learning_rate": 1.9829025345760121e-07,
"logits/chosen": -2.7749578952789307,
"logits/rejected": -2.7802319526672363,
"logps/chosen": -315.29290771484375,
"logps/rejected": -332.8951721191406,
"loss": 0.6062,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.17806461453437805,
"rewards/margins": 0.2752231955528259,
"rewards/rejected": -0.4532877802848816,
"step": 2330
},
{
"epoch": 0.6124051295472389,
"grad_norm": 8.214485168457031,
"learning_rate": 1.960573825716911e-07,
"logits/chosen": -2.743821620941162,
"logits/rejected": -2.7305188179016113,
"logps/chosen": -275.1949768066406,
"logps/rejected": -297.45172119140625,
"loss": 0.6016,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.23889228701591492,
"rewards/margins": 0.29088443517684937,
"rewards/rejected": -0.5297766923904419,
"step": 2340
},
{
"epoch": 0.615022245485475,
"grad_norm": 7.783448696136475,
"learning_rate": 1.9382901587881273e-07,
"logits/chosen": -2.8195502758026123,
"logits/rejected": -2.8172898292541504,
"logps/chosen": -291.1629333496094,
"logps/rejected": -292.11553955078125,
"loss": 0.5555,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.15334704518318176,
"rewards/margins": 0.37875789403915405,
"rewards/rejected": -0.5321049094200134,
"step": 2350
},
{
"epoch": 0.6176393614237111,
"grad_norm": 7.713850498199463,
"learning_rate": 1.9160533944694364e-07,
"logits/chosen": -2.802713394165039,
"logits/rejected": -2.763248920440674,
"logps/chosen": -297.48541259765625,
"logps/rejected": -321.0580139160156,
"loss": 0.5661,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.1875167191028595,
"rewards/margins": 0.3671106696128845,
"rewards/rejected": -0.5546274185180664,
"step": 2360
},
{
"epoch": 0.6202564773619471,
"grad_norm": 7.275653839111328,
"learning_rate": 1.8938653895242602e-07,
"logits/chosen": -2.805842161178589,
"logits/rejected": -2.7778079509735107,
"logps/chosen": -301.32257080078125,
"logps/rejected": -307.5292663574219,
"loss": 0.569,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.22137200832366943,
"rewards/margins": 0.3620893061161041,
"rewards/rejected": -0.583461344242096,
"step": 2370
},
{
"epoch": 0.6228735933001832,
"grad_norm": 7.8891282081604,
"learning_rate": 1.8717279966446264e-07,
"logits/chosen": -2.702014684677124,
"logits/rejected": -2.6890392303466797,
"logps/chosen": -299.67095947265625,
"logps/rejected": -315.53125,
"loss": 0.6047,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.3092700242996216,
"rewards/margins": 0.2915950417518616,
"rewards/rejected": -0.6008650660514832,
"step": 2380
},
{
"epoch": 0.6254907092384192,
"grad_norm": 9.103086471557617,
"learning_rate": 1.8496430642964694e-07,
"logits/chosen": -2.7693662643432617,
"logits/rejected": -2.749218702316284,
"logps/chosen": -320.30596923828125,
"logps/rejected": -322.6269226074219,
"loss": 0.6135,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.2899993658065796,
"rewards/margins": 0.2783369719982147,
"rewards/rejected": -0.5683363676071167,
"step": 2390
},
{
"epoch": 0.6281078251766553,
"grad_norm": 8.552151679992676,
"learning_rate": 1.8276124365652855e-07,
"logits/chosen": -2.796008586883545,
"logits/rejected": -2.750042200088501,
"logps/chosen": -308.24066162109375,
"logps/rejected": -318.9580993652344,
"loss": 0.602,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.283893346786499,
"rewards/margins": 0.2797131836414337,
"rewards/rejected": -0.5636065602302551,
"step": 2400
},
{
"epoch": 0.6281078251766553,
"eval_logits/chosen": -2.796116352081299,
"eval_logits/rejected": -2.774383783340454,
"eval_logps/chosen": -309.5652160644531,
"eval_logps/rejected": -318.12506103515625,
"eval_loss": 0.5968618392944336,
"eval_rewards/accuracies": 0.6990000009536743,
"eval_rewards/chosen": -0.2682757079601288,
"eval_rewards/margins": 0.2985955774784088,
"eval_rewards/rejected": -0.5668712258338928,
"eval_runtime": 690.9152,
"eval_samples_per_second": 2.895,
"eval_steps_per_second": 0.362,
"step": 2400
},
{
"epoch": 0.6307249411148914,
"grad_norm": 10.884597778320312,
"learning_rate": 1.805637953002149e-07,
"logits/chosen": -2.806243658065796,
"logits/rejected": -2.804234266281128,
"logps/chosen": -287.49090576171875,
"logps/rejected": -287.6014404296875,
"loss": 0.6169,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.27734607458114624,
"rewards/margins": 0.24837279319763184,
"rewards/rejected": -0.5257189273834229,
"step": 2410
},
{
"epoch": 0.6333420570531274,
"grad_norm": 9.013958930969238,
"learning_rate": 1.7837214484701153e-07,
"logits/chosen": -2.7953040599823,
"logits/rejected": -2.7851452827453613,
"logps/chosen": -289.382568359375,
"logps/rejected": -297.02679443359375,
"loss": 0.5733,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.2011108100414276,
"rewards/margins": 0.34568914771080017,
"rewards/rejected": -0.5468000173568726,
"step": 2420
},
{
"epoch": 0.6359591729913635,
"grad_norm": 14.238588333129883,
"learning_rate": 1.761864752991004e-07,
"logits/chosen": -2.778735399246216,
"logits/rejected": -2.759908437728882,
"logps/chosen": -295.66241455078125,
"logps/rejected": -312.7738952636719,
"loss": 0.5791,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.21105961501598358,
"rewards/margins": 0.3268287181854248,
"rewards/rejected": -0.5378884077072144,
"step": 2430
},
{
"epoch": 0.6385762889295996,
"grad_norm": 5.6600518226623535,
"learning_rate": 1.7400696915925995e-07,
"logits/chosen": -2.7974464893341064,
"logits/rejected": -2.7732651233673096,
"logps/chosen": -312.24798583984375,
"logps/rejected": -279.251708984375,
"loss": 0.5943,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.23233290016651154,
"rewards/margins": 0.3078458309173584,
"rewards/rejected": -0.5401787161827087,
"step": 2440
},
{
"epoch": 0.6411934048678356,
"grad_norm": 11.058223724365234,
"learning_rate": 1.718338084156254e-07,
"logits/chosen": -2.7382242679595947,
"logits/rejected": -2.727843761444092,
"logps/chosen": -323.4954528808594,
"logps/rejected": -317.99456787109375,
"loss": 0.57,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.1701376736164093,
"rewards/margins": 0.3507465720176697,
"rewards/rejected": -0.5208842754364014,
"step": 2450
},
{
"epoch": 0.6438105208060717,
"grad_norm": 14.676642417907715,
"learning_rate": 1.696671745264937e-07,
"logits/chosen": -2.799201488494873,
"logits/rejected": -2.8146328926086426,
"logps/chosen": -313.3539733886719,
"logps/rejected": -290.71197509765625,
"loss": 0.5616,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.154522106051445,
"rewards/margins": 0.36096832156181335,
"rewards/rejected": -0.5154904127120972,
"step": 2460
},
{
"epoch": 0.6464276367443078,
"grad_norm": 7.134603500366211,
"learning_rate": 1.67507248405171e-07,
"logits/chosen": -2.786536693572998,
"logits/rejected": -2.7716171741485596,
"logps/chosen": -290.3885192871094,
"logps/rejected": -317.96453857421875,
"loss": 0.6052,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.17861530184745789,
"rewards/margins": 0.2776513695716858,
"rewards/rejected": -0.4562666416168213,
"step": 2470
},
{
"epoch": 0.6490447526825438,
"grad_norm": 9.284005165100098,
"learning_rate": 1.6535421040486683e-07,
"logits/chosen": -2.695885181427002,
"logits/rejected": -2.683889150619507,
"logps/chosen": -292.3827209472656,
"logps/rejected": -295.35003662109375,
"loss": 0.5708,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.20486466586589813,
"rewards/margins": 0.3616489768028259,
"rewards/rejected": -0.5665136575698853,
"step": 2480
},
{
"epoch": 0.6516618686207799,
"grad_norm": 11.596046447753906,
"learning_rate": 1.6320824030363456e-07,
"logits/chosen": -2.7673633098602295,
"logits/rejected": -2.7697348594665527,
"logps/chosen": -269.5127868652344,
"logps/rejected": -284.500732421875,
"loss": 0.5804,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.19040192663669586,
"rewards/margins": 0.32062506675720215,
"rewards/rejected": -0.5110269784927368,
"step": 2490
},
{
"epoch": 0.654278984559016,
"grad_norm": 8.306464195251465,
"learning_rate": 1.6106951728936024e-07,
"logits/chosen": -2.8287737369537354,
"logits/rejected": -2.785698413848877,
"logps/chosen": -290.69586181640625,
"logps/rejected": -315.9652404785156,
"loss": 0.5792,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.17289450764656067,
"rewards/margins": 0.32756882905960083,
"rewards/rejected": -0.5004633069038391,
"step": 2500
},
{
"epoch": 0.654278984559016,
"eval_logits/chosen": -2.7979679107666016,
"eval_logits/rejected": -2.776271104812622,
"eval_logps/chosen": -303.76153564453125,
"eval_logps/rejected": -311.8429260253906,
"eval_loss": 0.5962891578674316,
"eval_rewards/accuracies": 0.6974999904632568,
"eval_rewards/chosen": -0.2102394998073578,
"eval_rewards/margins": 0.2938106954097748,
"eval_rewards/rejected": -0.5040501952171326,
"eval_runtime": 692.3854,
"eval_samples_per_second": 2.889,
"eval_steps_per_second": 0.361,
"step": 2500
},
{
"epoch": 0.656896100497252,
"grad_norm": 6.3364176750183105,
"learning_rate": 1.5893821994479994e-07,
"logits/chosen": -2.8073089122772217,
"logits/rejected": -2.7984962463378906,
"logps/chosen": -307.6702880859375,
"logps/rejected": -299.78192138671875,
"loss": 0.583,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.15238206088542938,
"rewards/margins": 0.3230430781841278,
"rewards/rejected": -0.4754251539707184,
"step": 2510
},
{
"epoch": 0.6595132164354881,
"grad_norm": 7.475069999694824,
"learning_rate": 1.5681452623266867e-07,
"logits/chosen": -2.788701057434082,
"logits/rejected": -2.7505264282226562,
"logps/chosen": -323.1575012207031,
"logps/rejected": -304.9902038574219,
"loss": 0.5469,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.1838502436876297,
"rewards/margins": 0.4115122854709625,
"rewards/rejected": -0.5953624844551086,
"step": 2520
},
{
"epoch": 0.6621303323737242,
"grad_norm": 9.084112167358398,
"learning_rate": 1.546986134807801e-07,
"logits/chosen": -2.8091278076171875,
"logits/rejected": -2.780764102935791,
"logps/chosen": -293.3882751464844,
"logps/rejected": -309.5545349121094,
"loss": 0.5931,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.28720271587371826,
"rewards/margins": 0.30004793405532837,
"rewards/rejected": -0.5872506499290466,
"step": 2530
},
{
"epoch": 0.6647474483119602,
"grad_norm": 7.817606449127197,
"learning_rate": 1.5259065836724034e-07,
"logits/chosen": -2.7307331562042236,
"logits/rejected": -2.7140753269195557,
"logps/chosen": -290.29443359375,
"logps/rejected": -307.90399169921875,
"loss": 0.5968,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2654728889465332,
"rewards/margins": 0.2819042205810547,
"rewards/rejected": -0.5473771095275879,
"step": 2540
},
{
"epoch": 0.6673645642501963,
"grad_norm": 8.136064529418945,
"learning_rate": 1.5049083690569454e-07,
"logits/chosen": -2.7462635040283203,
"logits/rejected": -2.731522798538208,
"logps/chosen": -279.6645812988281,
"logps/rejected": -303.47857666015625,
"loss": 0.6011,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.2667672336101532,
"rewards/margins": 0.28752660751342773,
"rewards/rejected": -0.5542938113212585,
"step": 2550
},
{
"epoch": 0.6699816801884323,
"grad_norm": 5.6162896156311035,
"learning_rate": 1.4839932443063056e-07,
"logits/chosen": -2.7818315029144287,
"logits/rejected": -2.754776954650879,
"logps/chosen": -331.192626953125,
"logps/rejected": -306.44342041015625,
"loss": 0.5807,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.23324036598205566,
"rewards/margins": 0.33265605568885803,
"rewards/rejected": -0.5658964514732361,
"step": 2560
},
{
"epoch": 0.6725987961266684,
"grad_norm": 15.203133583068848,
"learning_rate": 1.46316295582738e-07,
"logits/chosen": -2.755795955657959,
"logits/rejected": -2.745166301727295,
"logps/chosen": -288.94012451171875,
"logps/rejected": -295.92974853515625,
"loss": 0.63,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.30726075172424316,
"rewards/margins": 0.21980533003807068,
"rewards/rejected": -0.5270661115646362,
"step": 2570
},
{
"epoch": 0.6752159120649045,
"grad_norm": 23.822792053222656,
"learning_rate": 1.4424192429432655e-07,
"logits/chosen": -2.783210515975952,
"logits/rejected": -2.766979694366455,
"logps/chosen": -291.4307556152344,
"logps/rejected": -328.7579040527344,
"loss": 0.5738,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.18577826023101807,
"rewards/margins": 0.34509676694869995,
"rewards/rejected": -0.5308750867843628,
"step": 2580
},
{
"epoch": 0.6778330280031405,
"grad_norm": 9.544054985046387,
"learning_rate": 1.4217638377480158e-07,
"logits/chosen": -2.7744319438934326,
"logits/rejected": -2.7644972801208496,
"logps/chosen": -299.30975341796875,
"logps/rejected": -312.57220458984375,
"loss": 0.598,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.23222167789936066,
"rewards/margins": 0.28205937147140503,
"rewards/rejected": -0.5142810344696045,
"step": 2590
},
{
"epoch": 0.6804501439413766,
"grad_norm": 7.35859489440918,
"learning_rate": 1.401198464962021e-07,
"logits/chosen": -2.7667133808135986,
"logits/rejected": -2.7541134357452393,
"logps/chosen": -305.63446044921875,
"logps/rejected": -288.49676513671875,
"loss": 0.6028,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2002829611301422,
"rewards/margins": 0.26447853446006775,
"rewards/rejected": -0.4647614359855652,
"step": 2600
},
{
"epoch": 0.6804501439413766,
"eval_logits/chosen": -2.793254852294922,
"eval_logits/rejected": -2.771672010421753,
"eval_logps/chosen": -301.69635009765625,
"eval_logps/rejected": -309.3417053222656,
"eval_loss": 0.5973595976829529,
"eval_rewards/accuracies": 0.6919999718666077,
"eval_rewards/chosen": -0.18958736956119537,
"eval_rewards/margins": 0.289450466632843,
"eval_rewards/rejected": -0.4790377914905548,
"eval_runtime": 692.1987,
"eval_samples_per_second": 2.889,
"eval_steps_per_second": 0.361,
"step": 2600
},
{
"epoch": 0.6830672598796127,
"grad_norm": 6.412085056304932,
"learning_rate": 1.3807248417879894e-07,
"logits/chosen": -2.799522638320923,
"logits/rejected": -2.801234483718872,
"logps/chosen": -304.61505126953125,
"logps/rejected": -318.75360107421875,
"loss": 0.5742,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.1567406803369522,
"rewards/margins": 0.35466814041137695,
"rewards/rejected": -0.511408805847168,
"step": 2610
},
{
"epoch": 0.6856843758178487,
"grad_norm": 6.595985412597656,
"learning_rate": 1.3603446777675665e-07,
"logits/chosen": -2.7163891792297363,
"logits/rejected": -2.6980533599853516,
"logps/chosen": -301.43170166015625,
"logps/rejected": -309.5948486328125,
"loss": 0.5767,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.18890248239040375,
"rewards/margins": 0.33902615308761597,
"rewards/rejected": -0.5279285907745361,
"step": 2620
},
{
"epoch": 0.6883014917560848,
"grad_norm": 5.626343250274658,
"learning_rate": 1.3400596746385814e-07,
"logits/chosen": -2.785409450531006,
"logits/rejected": -2.7549426555633545,
"logps/chosen": -305.23779296875,
"logps/rejected": -306.29864501953125,
"loss": 0.5866,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.17120136320590973,
"rewards/margins": 0.3220587372779846,
"rewards/rejected": -0.49326008558273315,
"step": 2630
},
{
"epoch": 0.6909186076943209,
"grad_norm": 7.084354400634766,
"learning_rate": 1.3198715261929586e-07,
"logits/chosen": -2.8111932277679443,
"logits/rejected": -2.7792601585388184,
"logps/chosen": -269.24957275390625,
"logps/rejected": -297.8160400390625,
"loss": 0.5557,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.19386166334152222,
"rewards/margins": 0.37062662839889526,
"rewards/rejected": -0.5644882917404175,
"step": 2640
},
{
"epoch": 0.6935357236325569,
"grad_norm": 6.301397800445557,
"learning_rate": 1.299781918135282e-07,
"logits/chosen": -2.780548095703125,
"logits/rejected": -2.7463881969451904,
"logps/chosen": -331.93035888671875,
"logps/rejected": -346.24005126953125,
"loss": 0.5488,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.12747205793857574,
"rewards/margins": 0.4090425372123718,
"rewards/rejected": -0.5365146398544312,
"step": 2650
},
{
"epoch": 0.696152839570793,
"grad_norm": 4.976480007171631,
"learning_rate": 1.279792527942045e-07,
"logits/chosen": -2.7965517044067383,
"logits/rejected": -2.7541985511779785,
"logps/chosen": -308.75946044921875,
"logps/rejected": -333.583251953125,
"loss": 0.573,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.2170572280883789,
"rewards/margins": 0.3559117913246155,
"rewards/rejected": -0.5729690194129944,
"step": 2660
},
{
"epoch": 0.6987699555090291,
"grad_norm": 7.420611381530762,
"learning_rate": 1.259905024721576e-07,
"logits/chosen": -2.7755208015441895,
"logits/rejected": -2.7653794288635254,
"logps/chosen": -297.36810302734375,
"logps/rejected": -308.62139892578125,
"loss": 0.574,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.21521492302417755,
"rewards/margins": 0.3440507650375366,
"rewards/rejected": -0.5592657327651978,
"step": 2670
},
{
"epoch": 0.7013870714472651,
"grad_norm": 9.432327270507812,
"learning_rate": 1.2401210690746703e-07,
"logits/chosen": -2.7644107341766357,
"logits/rejected": -2.7474875450134277,
"logps/chosen": -305.26129150390625,
"logps/rejected": -300.5979309082031,
"loss": 0.5966,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.19491124153137207,
"rewards/margins": 0.2979043126106262,
"rewards/rejected": -0.4928155541419983,
"step": 2680
},
{
"epoch": 0.7040041873855012,
"grad_norm": 13.687203407287598,
"learning_rate": 1.2204423129559305e-07,
"logits/chosen": -2.803926467895508,
"logits/rejected": -2.8096935749053955,
"logps/chosen": -304.5517272949219,
"logps/rejected": -332.74627685546875,
"loss": 0.5878,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.21299275755882263,
"rewards/margins": 0.32770127058029175,
"rewards/rejected": -0.540693998336792,
"step": 2690
},
{
"epoch": 0.7066213033237373,
"grad_norm": 9.307769775390625,
"learning_rate": 1.2008703995358299e-07,
"logits/chosen": -2.7696948051452637,
"logits/rejected": -2.7626984119415283,
"logps/chosen": -305.66973876953125,
"logps/rejected": -309.4637756347656,
"loss": 0.5854,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.23966650664806366,
"rewards/margins": 0.3371264636516571,
"rewards/rejected": -0.5767929553985596,
"step": 2700
},
{
"epoch": 0.7066213033237373,
"eval_logits/chosen": -2.7892041206359863,
"eval_logits/rejected": -2.7675600051879883,
"eval_logps/chosen": -307.9026794433594,
"eval_logps/rejected": -317.58642578125,
"eval_loss": 0.5930463671684265,
"eval_rewards/accuracies": 0.7020000219345093,
"eval_rewards/chosen": -0.2516505718231201,
"eval_rewards/margins": 0.309834361076355,
"eval_rewards/rejected": -0.5614849925041199,
"eval_runtime": 692.1934,
"eval_samples_per_second": 2.889,
"eval_steps_per_second": 0.361,
"step": 2700
},
{
"epoch": 0.7092384192619733,
"grad_norm": 7.60300874710083,
"learning_rate": 1.1814069630635068e-07,
"logits/chosen": -2.7490410804748535,
"logits/rejected": -2.7561395168304443,
"logps/chosen": -311.02667236328125,
"logps/rejected": -334.8045349121094,
"loss": 0.5936,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.2228337824344635,
"rewards/margins": 0.31492942571640015,
"rewards/rejected": -0.5377631783485413,
"step": 2710
},
{
"epoch": 0.7118555352002094,
"grad_norm": 5.55739164352417,
"learning_rate": 1.1620536287303051e-07,
"logits/chosen": -2.7841482162475586,
"logits/rejected": -2.7707200050354004,
"logps/chosen": -330.66802978515625,
"logps/rejected": -324.71453857421875,
"loss": 0.6076,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.21253342926502228,
"rewards/margins": 0.2718030512332916,
"rewards/rejected": -0.4843364655971527,
"step": 2720
},
{
"epoch": 0.7144726511384454,
"grad_norm": 4.946017742156982,
"learning_rate": 1.1428120125340716e-07,
"logits/chosen": -2.771012783050537,
"logits/rejected": -2.751859188079834,
"logps/chosen": -299.06195068359375,
"logps/rejected": -291.7746276855469,
"loss": 0.5414,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.18322893977165222,
"rewards/margins": 0.4256429076194763,
"rewards/rejected": -0.6088718175888062,
"step": 2730
},
{
"epoch": 0.7170897670766815,
"grad_norm": 8.510547637939453,
"learning_rate": 1.123683721144223e-07,
"logits/chosen": -2.773465871810913,
"logits/rejected": -2.750523328781128,
"logps/chosen": -322.75030517578125,
"logps/rejected": -322.23541259765625,
"loss": 0.5924,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2108650654554367,
"rewards/margins": 0.3147924840450287,
"rewards/rejected": -0.5256575345993042,
"step": 2740
},
{
"epoch": 0.7197068830149176,
"grad_norm": 6.666440010070801,
"learning_rate": 1.1046703517675845e-07,
"logits/chosen": -2.792327642440796,
"logits/rejected": -2.780276298522949,
"logps/chosen": -292.0575256347656,
"logps/rejected": -331.8373718261719,
"loss": 0.5803,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.20287561416625977,
"rewards/margins": 0.3353033661842346,
"rewards/rejected": -0.5381789803504944,
"step": 2750
},
{
"epoch": 0.7223239989531536,
"grad_norm": 3.823488712310791,
"learning_rate": 1.085773492015028e-07,
"logits/chosen": -2.7709414958953857,
"logits/rejected": -2.7493114471435547,
"logps/chosen": -284.67193603515625,
"logps/rejected": -288.34991455078125,
"loss": 0.5487,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2005012482404709,
"rewards/margins": 0.4104704260826111,
"rewards/rejected": -0.6109716892242432,
"step": 2760
},
{
"epoch": 0.7249411148913897,
"grad_norm": 10.498513221740723,
"learning_rate": 1.0669947197689033e-07,
"logits/chosen": -2.7609269618988037,
"logits/rejected": -2.723078489303589,
"logps/chosen": -316.71929931640625,
"logps/rejected": -321.02239990234375,
"loss": 0.5936,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.2569184899330139,
"rewards/margins": 0.3084716498851776,
"rewards/rejected": -0.5653902292251587,
"step": 2770
},
{
"epoch": 0.7275582308296258,
"grad_norm": 9.501131057739258,
"learning_rate": 1.048335603051291e-07,
"logits/chosen": -2.7370448112487793,
"logits/rejected": -2.730591058731079,
"logps/chosen": -329.8760986328125,
"logps/rejected": -340.55413818359375,
"loss": 0.5523,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.2385425865650177,
"rewards/margins": 0.41302841901779175,
"rewards/rejected": -0.6515710353851318,
"step": 2780
},
{
"epoch": 0.7301753467678618,
"grad_norm": 9.440362930297852,
"learning_rate": 1.0297976998930663e-07,
"logits/chosen": -2.787727117538452,
"logits/rejected": -2.7839837074279785,
"logps/chosen": -315.8175048828125,
"logps/rejected": -321.4845275878906,
"loss": 0.5551,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.23409982025623322,
"rewards/margins": 0.4074832797050476,
"rewards/rejected": -0.6415830850601196,
"step": 2790
},
{
"epoch": 0.7327924627060979,
"grad_norm": 9.004974365234375,
"learning_rate": 1.0113825582038077e-07,
"logits/chosen": -2.7806646823883057,
"logits/rejected": -2.770219326019287,
"logps/chosen": -309.5851135253906,
"logps/rejected": -321.6380310058594,
"loss": 0.5994,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.2802024185657501,
"rewards/margins": 0.2918320596218109,
"rewards/rejected": -0.572034478187561,
"step": 2800
},
{
"epoch": 0.7327924627060979,
"eval_logits/chosen": -2.785149335861206,
"eval_logits/rejected": -2.7636430263519287,
"eval_logps/chosen": -308.8106689453125,
"eval_logps/rejected": -319.18377685546875,
"eval_loss": 0.5920370221138,
"eval_rewards/accuracies": 0.7045000195503235,
"eval_rewards/chosen": -0.2607303559780121,
"eval_rewards/margins": 0.31672805547714233,
"eval_rewards/rejected": -0.577458381652832,
"eval_runtime": 691.5482,
"eval_samples_per_second": 2.892,
"eval_steps_per_second": 0.362,
"step": 2800
},
{
"epoch": 0.735409578644334,
"grad_norm": 5.153034687042236,
"learning_rate": 9.930917156425475e-08,
"logits/chosen": -2.7953689098358154,
"logits/rejected": -2.7769198417663574,
"logps/chosen": -307.6942443847656,
"logps/rejected": -336.81036376953125,
"loss": 0.5828,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.2690412104129791,
"rewards/margins": 0.3371729254722595,
"rewards/rejected": -0.6062140464782715,
"step": 2810
},
{
"epoch": 0.73802669458257,
"grad_norm": 10.421857833862305,
"learning_rate": 9.749266994893754e-08,
"logits/chosen": -2.7286500930786133,
"logits/rejected": -2.696841239929199,
"logps/chosen": -283.78277587890625,
"logps/rejected": -293.64666748046875,
"loss": 0.6332,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.2909180223941803,
"rewards/margins": 0.21305350959300995,
"rewards/rejected": -0.5039715766906738,
"step": 2820
},
{
"epoch": 0.7406438105208061,
"grad_norm": 14.213560104370117,
"learning_rate": 9.568890265179128e-08,
"logits/chosen": -2.7485554218292236,
"logits/rejected": -2.7543232440948486,
"logps/chosen": -308.8101806640625,
"logps/rejected": -305.62347412109375,
"loss": 0.609,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.2877466678619385,
"rewards/margins": 0.28105878829956055,
"rewards/rejected": -0.568805456161499,
"step": 2830
},
{
"epoch": 0.7432609264590422,
"grad_norm": 5.577268600463867,
"learning_rate": 9.389802028686616e-08,
"logits/chosen": -2.7711002826690674,
"logits/rejected": -2.7511260509490967,
"logps/chosen": -308.267822265625,
"logps/rejected": -295.8204650878906,
"loss": 0.6301,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.292976438999176,
"rewards/margins": 0.21805603802204132,
"rewards/rejected": -0.5110324621200562,
"step": 2840
},
{
"epoch": 0.7458780423972782,
"grad_norm": 5.392404556274414,
"learning_rate": 9.212017239232426e-08,
"logits/chosen": -2.7617223262786865,
"logits/rejected": -2.7573046684265137,
"logps/chosen": -312.38421630859375,
"logps/rejected": -330.9461975097656,
"loss": 0.5444,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.22561874985694885,
"rewards/margins": 0.4286450445652008,
"rewards/rejected": -0.6542637348175049,
"step": 2850
},
{
"epoch": 0.7484951583355143,
"grad_norm": 6.394357681274414,
"learning_rate": 9.035550741795328e-08,
"logits/chosen": -2.7431981563568115,
"logits/rejected": -2.7521939277648926,
"logps/chosen": -295.7667541503906,
"logps/rejected": -334.49688720703125,
"loss": 0.5794,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.21194259822368622,
"rewards/margins": 0.35274478793144226,
"rewards/rejected": -0.5646874308586121,
"step": 2860
},
{
"epoch": 0.7511122742737504,
"grad_norm": 9.479743003845215,
"learning_rate": 8.860417271277065e-08,
"logits/chosen": -2.819362163543701,
"logits/rejected": -2.8213016986846924,
"logps/chosen": -308.4556884765625,
"logps/rejected": -324.0565490722656,
"loss": 0.6036,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.23003943264484406,
"rewards/margins": 0.26295268535614014,
"rewards/rejected": -0.492992103099823,
"step": 2870
},
{
"epoch": 0.7537293902119864,
"grad_norm": 9.29710865020752,
"learning_rate": 8.686631451272029e-08,
"logits/chosen": -2.7966079711914062,
"logits/rejected": -2.7735276222229004,
"logps/chosen": -297.5863952636719,
"logps/rejected": -300.37908935546875,
"loss": 0.6135,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2746056914329529,
"rewards/margins": 0.2642548680305481,
"rewards/rejected": -0.5388606190681458,
"step": 2880
},
{
"epoch": 0.7563465061502225,
"grad_norm": 9.630151748657227,
"learning_rate": 8.514207792846168e-08,
"logits/chosen": -2.7753801345825195,
"logits/rejected": -2.775832414627075,
"logps/chosen": -292.93609619140625,
"logps/rejected": -292.79754638671875,
"loss": 0.5907,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2579854130744934,
"rewards/margins": 0.3091490864753723,
"rewards/rejected": -0.5671344995498657,
"step": 2890
},
{
"epoch": 0.7589636220884585,
"grad_norm": 7.0608439445495605,
"learning_rate": 8.343160693325355e-08,
"logits/chosen": -2.7492966651916504,
"logits/rejected": -2.7410671710968018,
"logps/chosen": -293.8484802246094,
"logps/rejected": -324.77001953125,
"loss": 0.5837,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.22494366765022278,
"rewards/margins": 0.3548448979854584,
"rewards/rejected": -0.5797885656356812,
"step": 2900
},
{
"epoch": 0.7589636220884585,
"eval_logits/chosen": -2.783421277999878,
"eval_logits/rejected": -2.7619221210479736,
"eval_logps/chosen": -308.137939453125,
"eval_logps/rejected": -318.6510925292969,
"eval_loss": 0.5913165211677551,
"eval_rewards/accuracies": 0.7055000066757202,
"eval_rewards/chosen": -0.2540031671524048,
"eval_rewards/margins": 0.3181284964084625,
"eval_rewards/rejected": -0.5721316933631897,
"eval_runtime": 692.0731,
"eval_samples_per_second": 2.89,
"eval_steps_per_second": 0.361,
"step": 2900
},
{
"epoch": 0.7615807380266946,
"grad_norm": 7.802112579345703,
"learning_rate": 8.173504435093173e-08,
"logits/chosen": -2.7537245750427246,
"logits/rejected": -2.726355791091919,
"logps/chosen": -290.5617980957031,
"logps/rejected": -287.50799560546875,
"loss": 0.5806,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.2531769275665283,
"rewards/margins": 0.35345658659935,
"rewards/rejected": -0.6066334843635559,
"step": 2910
},
{
"epoch": 0.7641978539649307,
"grad_norm": 9.018595695495605,
"learning_rate": 8.005253184398359e-08,
"logits/chosen": -2.7553019523620605,
"logits/rejected": -2.745943546295166,
"logps/chosen": -320.03265380859375,
"logps/rejected": -340.8626403808594,
"loss": 0.6027,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.24576649069786072,
"rewards/margins": 0.28911441564559937,
"rewards/rejected": -0.5348808765411377,
"step": 2920
},
{
"epoch": 0.7668149699031667,
"grad_norm": 6.111194133758545,
"learning_rate": 7.838420990171926e-08,
"logits/chosen": -2.789515972137451,
"logits/rejected": -2.7570273876190186,
"logps/chosen": -310.61224365234375,
"logps/rejected": -312.87152099609375,
"loss": 0.5865,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.22255787253379822,
"rewards/margins": 0.31383711099624634,
"rewards/rejected": -0.5363950133323669,
"step": 2930
},
{
"epoch": 0.7694320858414028,
"grad_norm": 5.815800666809082,
"learning_rate": 7.673021782854083e-08,
"logits/chosen": -2.69783091545105,
"logits/rejected": -2.6870310306549072,
"logps/chosen": -311.68963623046875,
"logps/rejected": -288.39215087890625,
"loss": 0.5979,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.2525468170642853,
"rewards/margins": 0.31668832898139954,
"rewards/rejected": -0.5692351460456848,
"step": 2940
},
{
"epoch": 0.7720492017796389,
"grad_norm": 10.589014053344727,
"learning_rate": 7.509069373231039e-08,
"logits/chosen": -2.742522716522217,
"logits/rejected": -2.7218940258026123,
"logps/chosen": -293.1689453125,
"logps/rejected": -302.7828369140625,
"loss": 0.6006,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.24362894892692566,
"rewards/margins": 0.29250627756118774,
"rewards/rejected": -0.536135196685791,
"step": 2950
},
{
"epoch": 0.7746663177178749,
"grad_norm": 8.408040046691895,
"learning_rate": 7.346577451281821e-08,
"logits/chosen": -2.7488350868225098,
"logits/rejected": -2.7583699226379395,
"logps/chosen": -308.5254821777344,
"logps/rejected": -321.6301574707031,
"loss": 0.578,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.26960092782974243,
"rewards/margins": 0.3470562696456909,
"rewards/rejected": -0.6166571378707886,
"step": 2960
},
{
"epoch": 0.777283433656111,
"grad_norm": 7.626022815704346,
"learning_rate": 7.185559585035136e-08,
"logits/chosen": -2.7650535106658936,
"logits/rejected": -2.736623764038086,
"logps/chosen": -327.43792724609375,
"logps/rejected": -349.74005126953125,
"loss": 0.5695,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.28370755910873413,
"rewards/margins": 0.38453495502471924,
"rewards/rejected": -0.6682425737380981,
"step": 2970
},
{
"epoch": 0.7799005495943471,
"grad_norm": 8.664432525634766,
"learning_rate": 7.026029219436502e-08,
"logits/chosen": -2.7403178215026855,
"logits/rejected": -2.726973533630371,
"logps/chosen": -296.88629150390625,
"logps/rejected": -320.1584167480469,
"loss": 0.5807,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2508087158203125,
"rewards/margins": 0.3491096496582031,
"rewards/rejected": -0.5999183058738708,
"step": 2980
},
{
"epoch": 0.7825176655325831,
"grad_norm": 7.381548881530762,
"learning_rate": 6.867999675225522e-08,
"logits/chosen": -2.7898964881896973,
"logits/rejected": -2.765493392944336,
"logps/chosen": -269.5013427734375,
"logps/rejected": -287.95318603515625,
"loss": 0.577,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.22013553977012634,
"rewards/margins": 0.3466190695762634,
"rewards/rejected": -0.5667546391487122,
"step": 2990
},
{
"epoch": 0.7851347814708192,
"grad_norm": 8.886544227600098,
"learning_rate": 6.711484147823662e-08,
"logits/chosen": -2.7362468242645264,
"logits/rejected": -2.7374088764190674,
"logps/chosen": -273.03204345703125,
"logps/rejected": -309.46832275390625,
"loss": 0.5858,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.22477373480796814,
"rewards/margins": 0.3105041980743408,
"rewards/rejected": -0.5352779626846313,
"step": 3000
},
{
"epoch": 0.7851347814708192,
"eval_logits/chosen": -2.781898021697998,
"eval_logits/rejected": -2.7604949474334717,
"eval_logps/chosen": -308.9897766113281,
"eval_logps/rejected": -319.7853088378906,
"eval_loss": 0.5910181999206543,
"eval_rewards/accuracies": 0.7055000066757202,
"eval_rewards/chosen": -0.2625214755535126,
"eval_rewards/margins": 0.32095208764076233,
"eval_rewards/rejected": -0.5834735035896301,
"eval_runtime": 691.7146,
"eval_samples_per_second": 2.891,
"eval_steps_per_second": 0.361,
"step": 3000
},
{
"epoch": 0.7877518974090553,
"grad_norm": 12.21480655670166,
"learning_rate": 6.556495706232412e-08,
"logits/chosen": -2.7469980716705322,
"logits/rejected": -2.7527496814727783,
"logps/chosen": -316.41766357421875,
"logps/rejected": -328.52532958984375,
"loss": 0.5886,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.291492760181427,
"rewards/margins": 0.32380086183547974,
"rewards/rejected": -0.6152936816215515,
"step": 3010
},
{
"epoch": 0.7903690133472913,
"grad_norm": 8.182783126831055,
"learning_rate": 6.403047291942057e-08,
"logits/chosen": -2.722087860107422,
"logits/rejected": -2.6903903484344482,
"logps/chosen": -275.5090637207031,
"logps/rejected": -277.62420654296875,
"loss": 0.5972,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3089084327220917,
"rewards/margins": 0.29682403802871704,
"rewards/rejected": -0.6057325005531311,
"step": 3020
},
{
"epoch": 0.7929861292855274,
"grad_norm": 8.147031784057617,
"learning_rate": 6.251151717851021e-08,
"logits/chosen": -2.743332624435425,
"logits/rejected": -2.7332491874694824,
"logps/chosen": -280.6979064941406,
"logps/rejected": -292.1900329589844,
"loss": 0.6154,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.29725611209869385,
"rewards/margins": 0.27910858392715454,
"rewards/rejected": -0.5763646960258484,
"step": 3030
},
{
"epoch": 0.7956032452237635,
"grad_norm": 10.667434692382812,
"learning_rate": 6.100821667196041e-08,
"logits/chosen": -2.8258140087127686,
"logits/rejected": -2.772840976715088,
"logps/chosen": -316.3697204589844,
"logps/rejected": -283.46575927734375,
"loss": 0.5777,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.2557021975517273,
"rewards/margins": 0.3566380739212036,
"rewards/rejected": -0.6123403310775757,
"step": 3040
},
{
"epoch": 0.7982203611619995,
"grad_norm": 11.156988143920898,
"learning_rate": 5.952069692493061e-08,
"logits/chosen": -2.7050204277038574,
"logits/rejected": -2.7095789909362793,
"logps/chosen": -266.9496154785156,
"logps/rejected": -308.8603515625,
"loss": 0.5668,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.21333126723766327,
"rewards/margins": 0.3779350519180298,
"rewards/rejected": -0.5912663340568542,
"step": 3050
},
{
"epoch": 0.8008374771002356,
"grad_norm": 17.065628051757812,
"learning_rate": 5.8049082144891794e-08,
"logits/chosen": -2.702791452407837,
"logits/rejected": -2.6872074604034424,
"logps/chosen": -304.93463134765625,
"logps/rejected": -380.0108642578125,
"loss": 0.5933,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.24746175110340118,
"rewards/margins": 0.32271090149879456,
"rewards/rejected": -0.5701726675033569,
"step": 3060
},
{
"epoch": 0.8034545930384716,
"grad_norm": 5.375977516174316,
"learning_rate": 5.659349521125459e-08,
"logits/chosen": -2.828627109527588,
"logits/rejected": -2.8292970657348633,
"logps/chosen": -323.8910827636719,
"logps/rejected": -331.82403564453125,
"loss": 0.5963,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.25231170654296875,
"rewards/margins": 0.3079237937927246,
"rewards/rejected": -0.5602355003356934,
"step": 3070
},
{
"epoch": 0.8060717089767077,
"grad_norm": 10.280311584472656,
"learning_rate": 5.5154057665109e-08,
"logits/chosen": -2.772552490234375,
"logits/rejected": -2.7637112140655518,
"logps/chosen": -304.2619934082031,
"logps/rejected": -313.9085998535156,
"loss": 0.5688,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.28161460161209106,
"rewards/margins": 0.3681698441505432,
"rewards/rejected": -0.6497844457626343,
"step": 3080
},
{
"epoch": 0.8086888249149438,
"grad_norm": 5.905206203460693,
"learning_rate": 5.3730889699075853e-08,
"logits/chosen": -2.790621280670166,
"logits/rejected": -2.764768123626709,
"logps/chosen": -320.5517272949219,
"logps/rejected": -295.2154541015625,
"loss": 0.5839,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.23616118729114532,
"rewards/margins": 0.32327955961227417,
"rewards/rejected": -0.5594406723976135,
"step": 3090
},
{
"epoch": 0.8113059408531798,
"grad_norm": 5.722733974456787,
"learning_rate": 5.2324110147270893e-08,
"logits/chosen": -2.766014814376831,
"logits/rejected": -2.758927583694458,
"logps/chosen": -317.6996154785156,
"logps/rejected": -342.97039794921875,
"loss": 0.5685,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.17896804213523865,
"rewards/margins": 0.3621399402618408,
"rewards/rejected": -0.5411080121994019,
"step": 3100
},
{
"epoch": 0.8113059408531798,
"eval_logits/chosen": -2.7776589393615723,
"eval_logits/rejected": -2.7558252811431885,
"eval_logps/chosen": -306.57073974609375,
"eval_logps/rejected": -317.1507263183594,
"eval_loss": 0.5914422869682312,
"eval_rewards/accuracies": 0.7039999961853027,
"eval_rewards/chosen": -0.23833158612251282,
"eval_rewards/margins": 0.3187963366508484,
"eval_rewards/rejected": -0.5571279525756836,
"eval_runtime": 692.3976,
"eval_samples_per_second": 2.889,
"eval_steps_per_second": 0.361,
"step": 3100
},
{
"epoch": 0.8139230567914159,
"grad_norm": 5.692158222198486,
"learning_rate": 5.0933836475381795e-08,
"logits/chosen": -2.773538827896118,
"logits/rejected": -2.743774175643921,
"logps/chosen": -323.03564453125,
"logps/rejected": -339.22576904296875,
"loss": 0.5839,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.20304706692695618,
"rewards/margins": 0.33373111486434937,
"rewards/rejected": -0.5367781519889832,
"step": 3110
},
{
"epoch": 0.816540172729652,
"grad_norm": 6.522732734680176,
"learning_rate": 4.956018477086005e-08,
"logits/chosen": -2.7541415691375732,
"logits/rejected": -2.7304270267486572,
"logps/chosen": -312.82550048828125,
"logps/rejected": -319.4942626953125,
"loss": 0.5787,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.2153932750225067,
"rewards/margins": 0.3583284020423889,
"rewards/rejected": -0.5737215876579285,
"step": 3120
},
{
"epoch": 0.819157288667888,
"grad_norm": 12.873359680175781,
"learning_rate": 4.820326973322763e-08,
"logits/chosen": -2.7611987590789795,
"logits/rejected": -2.7416489124298096,
"logps/chosen": -294.5945129394531,
"logps/rejected": -322.9219055175781,
"loss": 0.5902,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.26755794882774353,
"rewards/margins": 0.30830827355384827,
"rewards/rejected": -0.5758662223815918,
"step": 3130
},
{
"epoch": 0.821774404606124,
"grad_norm": 6.0704731941223145,
"learning_rate": 4.686320466449981e-08,
"logits/chosen": -2.765129566192627,
"logits/rejected": -2.712188482284546,
"logps/chosen": -279.4689025878906,
"logps/rejected": -308.8946533203125,
"loss": 0.5878,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.21677632629871368,
"rewards/margins": 0.3269808888435364,
"rewards/rejected": -0.5437572598457336,
"step": 3140
},
{
"epoch": 0.8243915205443602,
"grad_norm": 9.32778549194336,
"learning_rate": 4.554010145972417e-08,
"logits/chosen": -2.8120663166046143,
"logits/rejected": -2.7678775787353516,
"logps/chosen": -308.05975341796875,
"logps/rejected": -326.4994812011719,
"loss": 0.6037,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.28121477365493774,
"rewards/margins": 0.3111681342124939,
"rewards/rejected": -0.5923829078674316,
"step": 3150
},
{
"epoch": 0.8270086364825961,
"grad_norm": 6.878976345062256,
"learning_rate": 4.423407059763745e-08,
"logits/chosen": -2.769566535949707,
"logits/rejected": -2.754739999771118,
"logps/chosen": -313.4940490722656,
"logps/rejected": -338.7357482910156,
"loss": 0.5795,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.22392907738685608,
"rewards/margins": 0.3541107773780823,
"rewards/rejected": -0.578039824962616,
"step": 3160
},
{
"epoch": 0.8296257524208323,
"grad_norm": 8.941882133483887,
"learning_rate": 4.294522113144078e-08,
"logits/chosen": -2.7120773792266846,
"logits/rejected": -2.676596164703369,
"logps/chosen": -310.96600341796875,
"logps/rejected": -309.7723083496094,
"loss": 0.5784,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.23985318839550018,
"rewards/margins": 0.3455398380756378,
"rewards/rejected": -0.5853930115699768,
"step": 3170
},
{
"epoch": 0.8322428683590684,
"grad_norm": 11.861396789550781,
"learning_rate": 4.1673660679693804e-08,
"logits/chosen": -2.759885311126709,
"logits/rejected": -2.7518694400787354,
"logps/chosen": -264.2064514160156,
"logps/rejected": -315.90380859375,
"loss": 0.6069,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.26855209469795227,
"rewards/margins": 0.2709905505180359,
"rewards/rejected": -0.539542555809021,
"step": 3180
},
{
"epoch": 0.8348599842973043,
"grad_norm": 3.688720941543579,
"learning_rate": 4.041949541732825e-08,
"logits/chosen": -2.7698843479156494,
"logits/rejected": -2.773341655731201,
"logps/chosen": -306.61480712890625,
"logps/rejected": -325.04541015625,
"loss": 0.5851,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.2628583610057831,
"rewards/margins": 0.3378385603427887,
"rewards/rejected": -0.6006969213485718,
"step": 3190
},
{
"epoch": 0.8374771002355405,
"grad_norm": 3.2142703533172607,
"learning_rate": 3.9182830066782605e-08,
"logits/chosen": -2.7356200218200684,
"logits/rejected": -2.740725040435791,
"logps/chosen": -303.8326721191406,
"logps/rejected": -351.736083984375,
"loss": 0.5753,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.28992363810539246,
"rewards/margins": 0.3618486821651459,
"rewards/rejected": -0.6517723798751831,
"step": 3200
},
{
"epoch": 0.8374771002355405,
"eval_logits/chosen": -2.778296709060669,
"eval_logits/rejected": -2.7567243576049805,
"eval_logps/chosen": -308.9666442871094,
"eval_logps/rejected": -320.12237548828125,
"eval_loss": 0.5903262495994568,
"eval_rewards/accuracies": 0.7020000219345093,
"eval_rewards/chosen": -0.26229044795036316,
"eval_rewards/margins": 0.3245540261268616,
"eval_rewards/rejected": -0.5868445038795471,
"eval_runtime": 691.7572,
"eval_samples_per_second": 2.891,
"eval_steps_per_second": 0.361,
"step": 3200
},
{
"epoch": 0.8400942161737766,
"grad_norm": 5.404438018798828,
"learning_rate": 3.79637678892577e-08,
"logits/chosen": -2.737617015838623,
"logits/rejected": -2.7435827255249023,
"logps/chosen": -313.7263488769531,
"logps/rejected": -326.2721862792969,
"loss": 0.5958,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.21620874106884003,
"rewards/margins": 0.29549044370651245,
"rewards/rejected": -0.5116991996765137,
"step": 3210
},
{
"epoch": 0.8427113321120125,
"grad_norm": 8.482666015625,
"learning_rate": 3.6762410676094645e-08,
"logits/chosen": -2.7493488788604736,
"logits/rejected": -2.751436233520508,
"logps/chosen": -342.2435302734375,
"logps/rejected": -334.9501953125,
"loss": 0.5649,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.22917640209197998,
"rewards/margins": 0.40211135149002075,
"rewards/rejected": -0.631287693977356,
"step": 3220
},
{
"epoch": 0.8453284480502486,
"grad_norm": 21.451396942138672,
"learning_rate": 3.557885874027497e-08,
"logits/chosen": -2.7467381954193115,
"logits/rejected": -2.7420356273651123,
"logps/chosen": -307.3967590332031,
"logps/rejected": -319.23785400390625,
"loss": 0.626,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.2908255457878113,
"rewards/margins": 0.24548819661140442,
"rewards/rejected": -0.5363136529922485,
"step": 3230
},
{
"epoch": 0.8479455639884846,
"grad_norm": 9.142580032348633,
"learning_rate": 3.441321090804469e-08,
"logits/chosen": -2.805671215057373,
"logits/rejected": -2.7749440670013428,
"logps/chosen": -311.969482421875,
"logps/rejected": -301.92559814453125,
"loss": 0.5872,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.28862375020980835,
"rewards/margins": 0.3031871020793915,
"rewards/rejected": -0.5918108820915222,
"step": 3240
},
{
"epoch": 0.8505626799267207,
"grad_norm": 6.999141216278076,
"learning_rate": 3.326556451066234e-08,
"logits/chosen": -2.8003592491149902,
"logits/rejected": -2.7750496864318848,
"logps/chosen": -333.262451171875,
"logps/rejected": -342.88970947265625,
"loss": 0.5676,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.21939554810523987,
"rewards/margins": 0.38298407196998596,
"rewards/rejected": -0.602379560470581,
"step": 3250
},
{
"epoch": 0.8531797958649568,
"grad_norm": 8.473172187805176,
"learning_rate": 3.2136015376271946e-08,
"logits/chosen": -2.7543041706085205,
"logits/rejected": -2.7237446308135986,
"logps/chosen": -310.47503662109375,
"logps/rejected": -316.1898498535156,
"loss": 0.6202,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.3483801782131195,
"rewards/margins": 0.25630325078964233,
"rewards/rejected": -0.6046834588050842,
"step": 3260
},
{
"epoch": 0.8557969118031928,
"grad_norm": 6.828322887420654,
"learning_rate": 3.102465782190106e-08,
"logits/chosen": -2.765094041824341,
"logits/rejected": -2.7622992992401123,
"logps/chosen": -292.77264404296875,
"logps/rejected": -306.03790283203125,
"loss": 0.6049,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.26343613862991333,
"rewards/margins": 0.2959148585796356,
"rewards/rejected": -0.5593509674072266,
"step": 3270
},
{
"epoch": 0.8584140277414289,
"grad_norm": 7.230039119720459,
"learning_rate": 2.993158464558565e-08,
"logits/chosen": -2.752277135848999,
"logits/rejected": -2.7456305027008057,
"logps/chosen": -313.83514404296875,
"logps/rejected": -343.77923583984375,
"loss": 0.6083,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2221953421831131,
"rewards/margins": 0.2806113660335541,
"rewards/rejected": -0.5028067231178284,
"step": 3280
},
{
"epoch": 0.861031143679665,
"grad_norm": 3.2468912601470947,
"learning_rate": 2.8856887118621358e-08,
"logits/chosen": -2.7951433658599854,
"logits/rejected": -2.8030707836151123,
"logps/chosen": -308.23077392578125,
"logps/rejected": -336.6316223144531,
"loss": 0.6066,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3232649266719818,
"rewards/margins": 0.30740997195243835,
"rewards/rejected": -0.6306749582290649,
"step": 3290
},
{
"epoch": 0.863648259617901,
"grad_norm": 6.59912109375,
"learning_rate": 2.7800654977942482e-08,
"logits/chosen": -2.7431418895721436,
"logits/rejected": -2.7131383419036865,
"logps/chosen": -301.9719543457031,
"logps/rejected": -354.3257751464844,
"loss": 0.5769,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2676336467266083,
"rewards/margins": 0.3562368154525757,
"rewards/rejected": -0.6238704919815063,
"step": 3300
},
{
"epoch": 0.863648259617901,
"eval_logits/chosen": -2.7770590782165527,
"eval_logits/rejected": -2.755500555038452,
"eval_logps/chosen": -309.4716491699219,
"eval_logps/rejected": -320.77569580078125,
"eval_loss": 0.5899637341499329,
"eval_rewards/accuracies": 0.703000009059906,
"eval_rewards/chosen": -0.2673403322696686,
"eval_rewards/margins": 0.3260369896888733,
"eval_rewards/rejected": -0.5933773517608643,
"eval_runtime": 692.4414,
"eval_samples_per_second": 2.888,
"eval_steps_per_second": 0.361,
"step": 3300
},
{
"epoch": 0.8662653755561371,
"grad_norm": 7.842947959899902,
"learning_rate": 2.676297641862879e-08,
"logits/chosen": -2.76792049407959,
"logits/rejected": -2.7621943950653076,
"logps/chosen": -265.380859375,
"logps/rejected": -254.47140502929688,
"loss": 0.5895,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.22911493480205536,
"rewards/margins": 0.32636719942092896,
"rewards/rejected": -0.5554821491241455,
"step": 3310
},
{
"epoch": 0.8688824914943732,
"grad_norm": 13.967310905456543,
"learning_rate": 2.5743938086541352e-08,
"logits/chosen": -2.7548770904541016,
"logits/rejected": -2.729977607727051,
"logps/chosen": -309.2705383300781,
"logps/rejected": -313.9998779296875,
"loss": 0.603,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.29073604941368103,
"rewards/margins": 0.31791952252388,
"rewards/rejected": -0.6086556315422058,
"step": 3320
},
{
"epoch": 0.8714996074326092,
"grad_norm": 11.057051658630371,
"learning_rate": 2.474362507108757e-08,
"logits/chosen": -2.814598560333252,
"logits/rejected": -2.7810606956481934,
"logps/chosen": -317.7953186035156,
"logps/rejected": -332.5885314941406,
"loss": 0.5725,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.25249534845352173,
"rewards/margins": 0.38154152035713196,
"rewards/rejected": -0.6340368390083313,
"step": 3330
},
{
"epoch": 0.8741167233708453,
"grad_norm": 10.906637191772461,
"learning_rate": 2.3762120898116495e-08,
"logits/chosen": -2.774956226348877,
"logits/rejected": -2.764927625656128,
"logps/chosen": -322.2221984863281,
"logps/rejected": -341.53216552734375,
"loss": 0.6079,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3415859639644623,
"rewards/margins": 0.2912564277648926,
"rewards/rejected": -0.6328424215316772,
"step": 3340
},
{
"epoch": 0.8767338393090814,
"grad_norm": 6.918145656585693,
"learning_rate": 2.2799507522944044e-08,
"logits/chosen": -2.689883232116699,
"logits/rejected": -2.6739673614501953,
"logps/chosen": -313.18524169921875,
"logps/rejected": -340.9402770996094,
"loss": 0.5669,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.26544609665870667,
"rewards/margins": 0.35022976994514465,
"rewards/rejected": -0.6156758069992065,
"step": 3350
},
{
"epoch": 0.8793509552473174,
"grad_norm": 10.59185791015625,
"learning_rate": 2.1855865323510054e-08,
"logits/chosen": -2.7279655933380127,
"logits/rejected": -2.6860973834991455,
"logps/chosen": -320.9715576171875,
"logps/rejected": -355.20880126953125,
"loss": 0.5657,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.2595919072628021,
"rewards/margins": 0.40920543670654297,
"rewards/rejected": -0.6687973141670227,
"step": 3360
},
{
"epoch": 0.8819680711855535,
"grad_norm": 6.1795830726623535,
"learning_rate": 2.0931273093666573e-08,
"logits/chosen": -2.728386878967285,
"logits/rejected": -2.7089622020721436,
"logps/chosen": -283.88409423828125,
"logps/rejected": -303.3033142089844,
"loss": 0.5462,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.2573816776275635,
"rewards/margins": 0.4072348475456238,
"rewards/rejected": -0.6646164655685425,
"step": 3370
},
{
"epoch": 0.8845851871237895,
"grad_norm": 6.445786476135254,
"learning_rate": 2.002580803659873e-08,
"logits/chosen": -2.747699022293091,
"logits/rejected": -2.7049365043640137,
"logps/chosen": -303.89813232421875,
"logps/rejected": -318.79693603515625,
"loss": 0.617,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.3385586738586426,
"rewards/margins": 0.2693432867527008,
"rewards/rejected": -0.607901930809021,
"step": 3380
},
{
"epoch": 0.8872023030620256,
"grad_norm": 9.493855476379395,
"learning_rate": 1.9139545758378256e-08,
"logits/chosen": -2.770669460296631,
"logits/rejected": -2.722433090209961,
"logps/chosen": -311.3063659667969,
"logps/rejected": -296.7181701660156,
"loss": 0.5721,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.2407282292842865,
"rewards/margins": 0.3538174629211426,
"rewards/rejected": -0.5945457220077515,
"step": 3390
},
{
"epoch": 0.8898194190002617,
"grad_norm": 8.795994758605957,
"learning_rate": 1.8272560261650277e-08,
"logits/chosen": -2.782130479812622,
"logits/rejected": -2.757819652557373,
"logps/chosen": -354.10919189453125,
"logps/rejected": -333.00250244140625,
"loss": 0.5608,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.22043180465698242,
"rewards/margins": 0.3896182179450989,
"rewards/rejected": -0.6100499629974365,
"step": 3400
},
{
"epoch": 0.8898194190002617,
"eval_logits/chosen": -2.773853302001953,
"eval_logits/rejected": -2.7519986629486084,
"eval_logps/chosen": -309.8929748535156,
"eval_logps/rejected": -321.31964111328125,
"eval_loss": 0.5895980000495911,
"eval_rewards/accuracies": 0.7020000219345093,
"eval_rewards/chosen": -0.27155351638793945,
"eval_rewards/margins": 0.3272639214992523,
"eval_rewards/rejected": -0.5988174676895142,
"eval_runtime": 692.3174,
"eval_samples_per_second": 2.889,
"eval_steps_per_second": 0.361,
"step": 3400
},
{
"epoch": 0.8924365349384977,
"grad_norm": 7.369442462921143,
"learning_rate": 1.742492393945427e-08,
"logits/chosen": -2.7513797283172607,
"logits/rejected": -2.710066318511963,
"logps/chosen": -323.8204650878906,
"logps/rejected": -317.6787109375,
"loss": 0.568,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.2688294053077698,
"rewards/margins": 0.36988669633865356,
"rewards/rejected": -0.6387161016464233,
"step": 3410
},
{
"epoch": 0.8950536508767338,
"grad_norm": 7.45905876159668,
"learning_rate": 1.6596707569179302e-08,
"logits/chosen": -2.791177749633789,
"logits/rejected": -2.7743191719055176,
"logps/chosen": -325.4018249511719,
"logps/rejected": -326.23291015625,
"loss": 0.5784,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.28556251525878906,
"rewards/margins": 0.3457964062690735,
"rewards/rejected": -0.6313589215278625,
"step": 3420
},
{
"epoch": 0.8976707668149699,
"grad_norm": 6.628225326538086,
"learning_rate": 1.5787980306653848e-08,
"logits/chosen": -2.75858736038208,
"logits/rejected": -2.7154600620269775,
"logps/chosen": -316.15985107421875,
"logps/rejected": -336.3743896484375,
"loss": 0.5708,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.24860498309135437,
"rewards/margins": 0.3617299795150757,
"rewards/rejected": -0.6103349924087524,
"step": 3430
},
{
"epoch": 0.9002878827532059,
"grad_norm": 10.542095184326172,
"learning_rate": 1.499880968037165e-08,
"logits/chosen": -2.752002477645874,
"logits/rejected": -2.733220100402832,
"logps/chosen": -292.7621765136719,
"logps/rejected": -285.80218505859375,
"loss": 0.5813,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.22942595183849335,
"rewards/margins": 0.32194358110427856,
"rewards/rejected": -0.5513694882392883,
"step": 3440
},
{
"epoch": 0.902904998691442,
"grad_norm": 5.9859395027160645,
"learning_rate": 1.4229261585852803e-08,
"logits/chosen": -2.77447772026062,
"logits/rejected": -2.7663679122924805,
"logps/chosen": -305.6563415527344,
"logps/rejected": -314.01043701171875,
"loss": 0.5806,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.22854971885681152,
"rewards/margins": 0.3463636040687561,
"rewards/rejected": -0.5749133825302124,
"step": 3450
},
{
"epoch": 0.9055221146296781,
"grad_norm": 9.172728538513184,
"learning_rate": 1.3479400280141883e-08,
"logits/chosen": -2.74762225151062,
"logits/rejected": -2.7340774536132812,
"logps/chosen": -290.8319396972656,
"logps/rejected": -326.6239929199219,
"loss": 0.5852,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.2622153162956238,
"rewards/margins": 0.35056665539741516,
"rewards/rejected": -0.6127818822860718,
"step": 3460
},
{
"epoch": 0.9081392305679141,
"grad_norm": 8.79883098602295,
"learning_rate": 1.2749288376442042e-08,
"logits/chosen": -2.7586569786071777,
"logits/rejected": -2.730827569961548,
"logps/chosen": -337.0930480957031,
"logps/rejected": -317.09912109375,
"loss": 0.5455,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.19493858516216278,
"rewards/margins": 0.4252621531486511,
"rewards/rejected": -0.6202007532119751,
"step": 3470
},
{
"epoch": 0.9107563465061502,
"grad_norm": 11.71596622467041,
"learning_rate": 1.2038986838887127e-08,
"logits/chosen": -2.792734384536743,
"logits/rejected": -2.77490234375,
"logps/chosen": -288.8994445800781,
"logps/rejected": -313.22430419921875,
"loss": 0.6242,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.30519285798072815,
"rewards/margins": 0.2674819231033325,
"rewards/rejected": -0.5726747512817383,
"step": 3480
},
{
"epoch": 0.9133734624443863,
"grad_norm": 6.5518951416015625,
"learning_rate": 1.1348554977451131e-08,
"logits/chosen": -2.805830478668213,
"logits/rejected": -2.7894272804260254,
"logps/chosen": -327.4478759765625,
"logps/rejected": -324.9560546875,
"loss": 0.582,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2583698630332947,
"rewards/margins": 0.3414859175682068,
"rewards/rejected": -0.5998557806015015,
"step": 3490
},
{
"epoch": 0.9159905783826223,
"grad_norm": 5.205156326293945,
"learning_rate": 1.06780504429958e-08,
"logits/chosen": -2.7797505855560303,
"logits/rejected": -2.7590694427490234,
"logps/chosen": -325.8748779296875,
"logps/rejected": -310.8509521484375,
"loss": 0.6008,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.27413299679756165,
"rewards/margins": 0.3151172399520874,
"rewards/rejected": -0.5892502665519714,
"step": 3500
},
{
"epoch": 0.9159905783826223,
"eval_logits/chosen": -2.775543689727783,
"eval_logits/rejected": -2.7539024353027344,
"eval_logps/chosen": -309.8999938964844,
"eval_logps/rejected": -321.37445068359375,
"eval_loss": 0.5894958972930908,
"eval_rewards/accuracies": 0.703499972820282,
"eval_rewards/chosen": -0.2716234028339386,
"eval_rewards/margins": 0.3277418315410614,
"eval_rewards/rejected": -0.599365234375,
"eval_runtime": 692.3998,
"eval_samples_per_second": 2.889,
"eval_steps_per_second": 0.361,
"step": 3500
},
{
"epoch": 0.9186076943208584,
"grad_norm": 6.786498069763184,
"learning_rate": 1.0027529222456754e-08,
"logits/chosen": -2.7301533222198486,
"logits/rejected": -2.702810764312744,
"logps/chosen": -296.23834228515625,
"logps/rejected": -315.268310546875,
"loss": 0.5539,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.2563706934452057,
"rewards/margins": 0.3883191645145416,
"rewards/rejected": -0.6446898579597473,
"step": 3510
},
{
"epoch": 0.9212248102590945,
"grad_norm": 9.892511367797852,
"learning_rate": 9.397045634168766e-09,
"logits/chosen": -2.8002243041992188,
"logits/rejected": -2.7856602668762207,
"logps/chosen": -308.3498229980469,
"logps/rejected": -351.95831298828125,
"loss": 0.57,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.21786804497241974,
"rewards/margins": 0.41681188344955444,
"rewards/rejected": -0.634679913520813,
"step": 3520
},
{
"epoch": 0.9238419261973305,
"grad_norm": 12.571949005126953,
"learning_rate": 8.78665232332998e-09,
"logits/chosen": -2.724975347518921,
"logits/rejected": -2.708922863006592,
"logps/chosen": -277.4271545410156,
"logps/rejected": -300.2417297363281,
"loss": 0.6055,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3028232455253601,
"rewards/margins": 0.2615777850151062,
"rewards/rejected": -0.5644010305404663,
"step": 3530
},
{
"epoch": 0.9264590421355666,
"grad_norm": 7.908664703369141,
"learning_rate": 8.196400257606206e-09,
"logits/chosen": -2.772461414337158,
"logits/rejected": -2.7343640327453613,
"logps/chosen": -328.0716247558594,
"logps/rejected": -358.15655517578125,
"loss": 0.577,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.2738083302974701,
"rewards/margins": 0.3519710600376129,
"rewards/rejected": -0.625779390335083,
"step": 3540
},
{
"epoch": 0.9290761580738026,
"grad_norm": 5.722252368927002,
"learning_rate": 7.626338722875075e-09,
"logits/chosen": -2.7591617107391357,
"logits/rejected": -2.780594825744629,
"logps/chosen": -298.6004943847656,
"logps/rejected": -326.13287353515625,
"loss": 0.5986,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2562271058559418,
"rewards/margins": 0.3074356019496918,
"rewards/rejected": -0.5636627078056335,
"step": 3550
},
{
"epoch": 0.9316932740120387,
"grad_norm": 8.03117847442627,
"learning_rate": 7.0765153191106875e-09,
"logits/chosen": -2.781140089035034,
"logits/rejected": -2.7692975997924805,
"logps/chosen": -295.3600158691406,
"logps/rejected": -291.2763366699219,
"loss": 0.5659,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2436678409576416,
"rewards/margins": 0.39614516496658325,
"rewards/rejected": -0.6398130655288696,
"step": 3560
},
{
"epoch": 0.9343103899502748,
"grad_norm": 7.668455600738525,
"learning_rate": 6.54697595640899e-09,
"logits/chosen": -2.7558670043945312,
"logits/rejected": -2.7410783767700195,
"logps/chosen": -333.0140075683594,
"logps/rejected": -347.9772033691406,
"loss": 0.5718,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.23105120658874512,
"rewards/margins": 0.38840624690055847,
"rewards/rejected": -0.6194573640823364,
"step": 3570
},
{
"epoch": 0.9369275058885108,
"grad_norm": 7.808078765869141,
"learning_rate": 6.037764851154425e-09,
"logits/chosen": -2.7314181327819824,
"logits/rejected": -2.7231030464172363,
"logps/chosen": -305.7143249511719,
"logps/rejected": -345.88983154296875,
"loss": 0.5699,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.22997505962848663,
"rewards/margins": 0.37085598707199097,
"rewards/rejected": -0.6008309721946716,
"step": 3580
},
{
"epoch": 0.9395446218267469,
"grad_norm": 9.760852813720703,
"learning_rate": 5.548924522327747e-09,
"logits/chosen": -2.7540392875671387,
"logits/rejected": -2.7462592124938965,
"logps/chosen": -308.9768981933594,
"logps/rejected": -327.16802978515625,
"loss": 0.5826,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.299643337726593,
"rewards/margins": 0.3448326587677002,
"rewards/rejected": -0.6444759368896484,
"step": 3590
},
{
"epoch": 0.942161737764983,
"grad_norm": 11.123191833496094,
"learning_rate": 5.080495787955691e-09,
"logits/chosen": -2.734261989593506,
"logits/rejected": -2.717097043991089,
"logps/chosen": -269.73223876953125,
"logps/rejected": -300.8177490234375,
"loss": 0.585,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.254092276096344,
"rewards/margins": 0.306030809879303,
"rewards/rejected": -0.560123085975647,
"step": 3600
},
{
"epoch": 0.942161737764983,
"eval_logits/chosen": -2.776420831680298,
"eval_logits/rejected": -2.7549078464508057,
"eval_logps/chosen": -309.95306396484375,
"eval_logps/rejected": -321.4418029785156,
"eval_loss": 0.5895029306411743,
"eval_rewards/accuracies": 0.7020000219345093,
"eval_rewards/chosen": -0.27215421199798584,
"eval_rewards/margins": 0.3278846740722656,
"eval_rewards/rejected": -0.6000389456748962,
"eval_runtime": 692.4927,
"eval_samples_per_second": 2.888,
"eval_steps_per_second": 0.361,
"step": 3600
},
{
"epoch": 0.944778853703219,
"grad_norm": 7.403170585632324,
"learning_rate": 4.632517761702814e-09,
"logits/chosen": -2.7008776664733887,
"logits/rejected": -2.6773476600646973,
"logps/chosen": -289.5223083496094,
"logps/rejected": -309.5367431640625,
"loss": 0.5795,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.308106005191803,
"rewards/margins": 0.33852237462997437,
"rewards/rejected": -0.6466284394264221,
"step": 3610
},
{
"epoch": 0.9473959696414551,
"grad_norm": 9.613285064697266,
"learning_rate": 4.205027849605358e-09,
"logits/chosen": -2.738858699798584,
"logits/rejected": -2.726569414138794,
"logps/chosen": -294.84014892578125,
"logps/rejected": -290.58770751953125,
"loss": 0.5959,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.292969286441803,
"rewards/margins": 0.3049730956554413,
"rewards/rejected": -0.5979424715042114,
"step": 3620
},
{
"epoch": 0.9500130855796912,
"grad_norm": 4.820310115814209,
"learning_rate": 3.798061746947995e-09,
"logits/chosen": -2.785492420196533,
"logits/rejected": -2.767252206802368,
"logps/chosen": -311.9582214355469,
"logps/rejected": -305.7359924316406,
"loss": 0.5893,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.31197255849838257,
"rewards/margins": 0.3228316307067871,
"rewards/rejected": -0.6348041296005249,
"step": 3630
},
{
"epoch": 0.9526302015179272,
"grad_norm": 5.795242786407471,
"learning_rate": 3.411653435283157e-09,
"logits/chosen": -2.7570109367370605,
"logits/rejected": -2.7326931953430176,
"logps/chosen": -313.0288391113281,
"logps/rejected": -286.85894775390625,
"loss": 0.5868,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.23662319779396057,
"rewards/margins": 0.32369619607925415,
"rewards/rejected": -0.5603194236755371,
"step": 3640
},
{
"epoch": 0.9552473174561633,
"grad_norm": 8.141414642333984,
"learning_rate": 3.0458351795936698e-09,
"logits/chosen": -2.800523281097412,
"logits/rejected": -2.7803540229797363,
"logps/chosen": -287.27178955078125,
"logps/rejected": -296.94482421875,
"loss": 0.5557,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.20535226166248322,
"rewards/margins": 0.4138811230659485,
"rewards/rejected": -0.6192333102226257,
"step": 3650
},
{
"epoch": 0.9578644333943994,
"grad_norm": 10.963499069213867,
"learning_rate": 2.700637525598598e-09,
"logits/chosen": -2.7325665950775146,
"logits/rejected": -2.742112636566162,
"logps/chosen": -318.7773742675781,
"logps/rejected": -340.5607604980469,
"loss": 0.6213,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.2901912331581116,
"rewards/margins": 0.23596885800361633,
"rewards/rejected": -0.5261600613594055,
"step": 3660
},
{
"epoch": 0.9604815493326354,
"grad_norm": 5.604915618896484,
"learning_rate": 2.3760892972027324e-09,
"logits/chosen": -2.8125240802764893,
"logits/rejected": -2.794743061065674,
"logps/chosen": -320.9376525878906,
"logps/rejected": -314.6265869140625,
"loss": 0.6086,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3326115012168884,
"rewards/margins": 0.2905888855457306,
"rewards/rejected": -0.6232004165649414,
"step": 3670
},
{
"epoch": 0.9630986652708715,
"grad_norm": 8.076900482177734,
"learning_rate": 2.0722175940897645e-09,
"logits/chosen": -2.730006694793701,
"logits/rejected": -2.7527151107788086,
"logps/chosen": -304.4130554199219,
"logps/rejected": -333.45281982421875,
"loss": 0.5561,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.2793710231781006,
"rewards/margins": 0.40125495195388794,
"rewards/rejected": -0.6806259751319885,
"step": 3680
},
{
"epoch": 0.9657157812091076,
"grad_norm": 5.261369705200195,
"learning_rate": 1.7890477894593748e-09,
"logits/chosen": -2.7596428394317627,
"logits/rejected": -2.73931622505188,
"logps/chosen": -363.08984375,
"logps/rejected": -348.8448486328125,
"loss": 0.5621,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.23692288994789124,
"rewards/margins": 0.40229707956314087,
"rewards/rejected": -0.6392199993133545,
"step": 3690
},
{
"epoch": 0.9683328971473436,
"grad_norm": 7.541417598724365,
"learning_rate": 1.5266035279088708e-09,
"logits/chosen": -2.6856465339660645,
"logits/rejected": -2.6826679706573486,
"logps/chosen": -347.5863952636719,
"logps/rejected": -356.30120849609375,
"loss": 0.567,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.2766670286655426,
"rewards/margins": 0.367573082447052,
"rewards/rejected": -0.6442400813102722,
"step": 3700
},
{
"epoch": 0.9683328971473436,
"eval_logits/chosen": -2.7754881381988525,
"eval_logits/rejected": -2.753868341445923,
"eval_logps/chosen": -310.11712646484375,
"eval_logps/rejected": -321.65545654296875,
"eval_loss": 0.5893409252166748,
"eval_rewards/accuracies": 0.7014999985694885,
"eval_rewards/chosen": -0.2737952172756195,
"eval_rewards/margins": 0.32838013768196106,
"eval_rewards/rejected": -0.6021752953529358,
"eval_runtime": 692.7848,
"eval_samples_per_second": 2.887,
"eval_steps_per_second": 0.361,
"step": 3700
},
{
"epoch": 0.9709500130855797,
"grad_norm": 11.719736099243164,
"learning_rate": 1.2849067234584621e-09,
"logits/chosen": -2.714137315750122,
"logits/rejected": -2.7111401557922363,
"logps/chosen": -280.48919677734375,
"logps/rejected": -300.55706787109375,
"loss": 0.607,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2787315249443054,
"rewards/margins": 0.30049681663513184,
"rewards/rejected": -0.5792283415794373,
"step": 3710
},
{
"epoch": 0.9735671290238157,
"grad_norm": 12.492560386657715,
"learning_rate": 1.0639775577218625e-09,
"logits/chosen": -2.719714403152466,
"logits/rejected": -2.667534589767456,
"logps/chosen": -295.1371765136719,
"logps/rejected": -294.61932373046875,
"loss": 0.5762,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.27527686953544617,
"rewards/margins": 0.36209625005722046,
"rewards/rejected": -0.637373149394989,
"step": 3720
},
{
"epoch": 0.9761842449620518,
"grad_norm": 7.440390110015869,
"learning_rate": 8.638344782207485e-10,
"logits/chosen": -2.725163459777832,
"logits/rejected": -2.7303969860076904,
"logps/chosen": -296.50689697265625,
"logps/rejected": -305.67706298828125,
"loss": 0.5767,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.22756004333496094,
"rewards/margins": 0.3577590882778168,
"rewards/rejected": -0.5853191018104553,
"step": 3730
},
{
"epoch": 0.9788013609002879,
"grad_norm": 10.965612411499023,
"learning_rate": 6.844941968447149e-10,
"logits/chosen": -2.7626724243164062,
"logits/rejected": -2.7460460662841797,
"logps/chosen": -316.35015869140625,
"logps/rejected": -349.7431945800781,
"loss": 0.5453,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2542489767074585,
"rewards/margins": 0.45952582359313965,
"rewards/rejected": -0.7137748003005981,
"step": 3740
},
{
"epoch": 0.9814184768385239,
"grad_norm": 5.883279323577881,
"learning_rate": 5.25971688455612e-10,
"logits/chosen": -2.7904438972473145,
"logits/rejected": -2.775864362716675,
"logps/chosen": -316.23297119140625,
"logps/rejected": -347.6502685546875,
"loss": 0.5698,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.2520793080329895,
"rewards/margins": 0.36716121435165405,
"rewards/rejected": -0.6192405223846436,
"step": 3750
},
{
"epoch": 0.98403559277676,
"grad_norm": 4.377948760986328,
"learning_rate": 3.882801896372967e-10,
"logits/chosen": -2.785407543182373,
"logits/rejected": -2.785416841506958,
"logps/chosen": -311.1086120605469,
"logps/rejected": -308.876220703125,
"loss": 0.6124,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.2892715036869049,
"rewards/margins": 0.29838478565216064,
"rewards/rejected": -0.5876562595367432,
"step": 3760
},
{
"epoch": 0.9866527087149961,
"grad_norm": 8.081770896911621,
"learning_rate": 2.714311975902661e-10,
"logits/chosen": -2.7383980751037598,
"logits/rejected": -2.710829257965088,
"logps/chosen": -330.71771240234375,
"logps/rejected": -337.7955627441406,
"loss": 0.5649,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.25471562147140503,
"rewards/margins": 0.3609776496887207,
"rewards/rejected": -0.6156932711601257,
"step": 3770
},
{
"epoch": 0.9892698246532321,
"grad_norm": 7.887190818786621,
"learning_rate": 1.754344691717591e-10,
"logits/chosen": -2.761021852493286,
"logits/rejected": -2.7340810298919678,
"logps/chosen": -295.04718017578125,
"logps/rejected": -336.95147705078125,
"loss": 0.6306,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.27056482434272766,
"rewards/margins": 0.2132827490568161,
"rewards/rejected": -0.4838475286960602,
"step": 3780
},
{
"epoch": 0.9918869405914682,
"grad_norm": 7.817293643951416,
"learning_rate": 1.0029802008096333e-10,
"logits/chosen": -2.7683863639831543,
"logits/rejected": -2.7289211750030518,
"logps/chosen": -316.55340576171875,
"logps/rejected": -334.72845458984375,
"loss": 0.5639,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.25747808814048767,
"rewards/margins": 0.4023471474647522,
"rewards/rejected": -0.6598252654075623,
"step": 3790
},
{
"epoch": 0.9945040565297043,
"grad_norm": 6.426971435546875,
"learning_rate": 4.602812418974533e-11,
"logits/chosen": -2.791513442993164,
"logits/rejected": -2.7664811611175537,
"logps/chosen": -328.2163391113281,
"logps/rejected": -337.54974365234375,
"loss": 0.5834,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.2448674440383911,
"rewards/margins": 0.3448673486709595,
"rewards/rejected": -0.5897347927093506,
"step": 3800
},
{
"epoch": 0.9945040565297043,
"eval_logits/chosen": -2.7742366790771484,
"eval_logits/rejected": -2.7524607181549072,
"eval_logps/chosen": -310.13330078125,
"eval_logps/rejected": -321.6666259765625,
"eval_loss": 0.5893096923828125,
"eval_rewards/accuracies": 0.7024999856948853,
"eval_rewards/chosen": -0.2739570438861847,
"eval_rewards/margins": 0.32832974195480347,
"eval_rewards/rejected": -0.6022867560386658,
"eval_runtime": 692.7928,
"eval_samples_per_second": 2.887,
"eval_steps_per_second": 0.361,
"step": 3800
},
{
"epoch": 0.9971211724679403,
"grad_norm": 7.6028289794921875,
"learning_rate": 1.2629313018819309e-11,
"logits/chosen": -2.7530319690704346,
"logits/rejected": -2.7311320304870605,
"logps/chosen": -300.90142822265625,
"logps/rejected": -311.88006591796875,
"loss": 0.5936,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.2676599621772766,
"rewards/margins": 0.3127606511116028,
"rewards/rejected": -0.5804205536842346,
"step": 3810
},
{
"epoch": 0.9997382884061764,
"grad_norm": 10.209754943847656,
"learning_rate": 1.0437535929996855e-13,
"logits/chosen": -2.765655279159546,
"logits/rejected": -2.7465381622314453,
"logps/chosen": -334.4398498535156,
"logps/rejected": -327.4457702636719,
"loss": 0.5626,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.2764059007167816,
"rewards/margins": 0.397102028131485,
"rewards/rejected": -0.6735079288482666,
"step": 3820
},
{
"epoch": 1.0,
"step": 3821,
"total_flos": 0.0,
"train_loss": 0.6164219083351729,
"train_runtime": 73481.1174,
"train_samples_per_second": 0.832,
"train_steps_per_second": 0.052
}
],
"logging_steps": 10,
"max_steps": 3821,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}