statking's picture
Model save
0083b65 verified
raw
history blame
No virus
114 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997382884061764,
"eval_steps": 100,
"global_step": 1910,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005234231876472127,
"grad_norm": 0.5695298687302295,
"learning_rate": 2.617801047120419e-08,
"logits/chosen": -0.4997953176498413,
"logits/rejected": -0.5751151442527771,
"logps/chosen": -395.12640380859375,
"logps/rejected": -316.8270568847656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.005234231876472127,
"grad_norm": 0.5536662200797415,
"learning_rate": 2.617801047120419e-07,
"logits/chosen": -0.5801360011100769,
"logits/rejected": -0.6067044138908386,
"logps/chosen": -304.32293701171875,
"logps/rejected": -244.29046630859375,
"loss": 0.6929,
"rewards/accuracies": 0.4861111044883728,
"rewards/chosen": -0.0005742510547861457,
"rewards/margins": 0.00042081804713234305,
"rewards/rejected": -0.0009950690437108278,
"step": 10
},
{
"epoch": 0.010468463752944255,
"grad_norm": 0.4593035639265128,
"learning_rate": 5.235602094240838e-07,
"logits/chosen": -0.6146650314331055,
"logits/rejected": -0.615092396736145,
"logps/chosen": -267.64739990234375,
"logps/rejected": -261.7188415527344,
"loss": 0.6931,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.0001226421882165596,
"rewards/margins": -0.00022562053345609456,
"rewards/rejected": 0.00034826272167265415,
"step": 20
},
{
"epoch": 0.015702695629416383,
"grad_norm": 0.5145955390417808,
"learning_rate": 7.853403141361258e-07,
"logits/chosen": -0.6289754509925842,
"logits/rejected": -0.6177533268928528,
"logps/chosen": -280.3735046386719,
"logps/rejected": -242.95175170898438,
"loss": 0.693,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0002500644768588245,
"rewards/margins": 0.0001505579421063885,
"rewards/rejected": 9.950650564860553e-05,
"step": 30
},
{
"epoch": 0.02093692750588851,
"grad_norm": 0.5230612257382828,
"learning_rate": 1.0471204188481676e-06,
"logits/chosen": -0.5945444107055664,
"logits/rejected": -0.6121580600738525,
"logps/chosen": -267.5122375488281,
"logps/rejected": -268.740234375,
"loss": 0.6926,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.0005460727261379361,
"rewards/margins": 0.0009088722290471196,
"rewards/rejected": -0.0003627995611168444,
"step": 40
},
{
"epoch": 0.02617115938236064,
"grad_norm": 0.5459284518349332,
"learning_rate": 1.3089005235602096e-06,
"logits/chosen": -0.6024752259254456,
"logits/rejected": -0.6329907774925232,
"logps/chosen": -285.15838623046875,
"logps/rejected": -254.0716552734375,
"loss": 0.693,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0005030709435231984,
"rewards/margins": 0.000754082459025085,
"rewards/rejected": -0.0002510116610210389,
"step": 50
},
{
"epoch": 0.031405391258832765,
"grad_norm": 0.47503175617580706,
"learning_rate": 1.5706806282722515e-06,
"logits/chosen": -0.6266981363296509,
"logits/rejected": -0.6208000183105469,
"logps/chosen": -318.3113098144531,
"logps/rejected": -271.3620300292969,
"loss": 0.6917,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0019611348398029804,
"rewards/margins": 0.002480647061020136,
"rewards/rejected": -0.0005195126286707819,
"step": 60
},
{
"epoch": 0.036639623135304895,
"grad_norm": 0.5387833374092045,
"learning_rate": 1.8324607329842933e-06,
"logits/chosen": -0.5752447843551636,
"logits/rejected": -0.6073333024978638,
"logps/chosen": -274.0575866699219,
"logps/rejected": -241.2310028076172,
"loss": 0.6912,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0042357188649475574,
"rewards/margins": 0.0048388526774942875,
"rewards/rejected": -0.0006031342782080173,
"step": 70
},
{
"epoch": 0.04187385501177702,
"grad_norm": 0.573484197995026,
"learning_rate": 2.094240837696335e-06,
"logits/chosen": -0.5686159133911133,
"logits/rejected": -0.5909486413002014,
"logps/chosen": -298.2327880859375,
"logps/rejected": -275.91986083984375,
"loss": 0.6902,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.007856507785618305,
"rewards/margins": 0.0063124834559857845,
"rewards/rejected": 0.0015440242132171988,
"step": 80
},
{
"epoch": 0.04710808688824915,
"grad_norm": 0.5157342119342403,
"learning_rate": 2.356020942408377e-06,
"logits/chosen": -0.6013309955596924,
"logits/rejected": -0.6156548857688904,
"logps/chosen": -256.23602294921875,
"logps/rejected": -235.50259399414062,
"loss": 0.6896,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.010481725446879864,
"rewards/margins": 0.005771929398179054,
"rewards/rejected": 0.0047097960487008095,
"step": 90
},
{
"epoch": 0.05234231876472128,
"grad_norm": 0.5281481398431945,
"learning_rate": 2.617801047120419e-06,
"logits/chosen": -0.5954197645187378,
"logits/rejected": -0.6362258195877075,
"logps/chosen": -246.4618682861328,
"logps/rejected": -205.40481567382812,
"loss": 0.6865,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.01956891641020775,
"rewards/margins": 0.013755053281784058,
"rewards/rejected": 0.005813863128423691,
"step": 100
},
{
"epoch": 0.05234231876472128,
"eval_logits/chosen": -0.5952818393707275,
"eval_logits/rejected": -0.6047875285148621,
"eval_logps/chosen": -274.37066650390625,
"eval_logps/rejected": -252.90138244628906,
"eval_loss": 0.685746967792511,
"eval_rewards/accuracies": 0.6809999942779541,
"eval_rewards/chosen": 0.02022642455995083,
"eval_rewards/margins": 0.016626423224806786,
"eval_rewards/rejected": 0.003600001335144043,
"eval_runtime": 492.1641,
"eval_samples_per_second": 4.064,
"eval_steps_per_second": 0.254,
"step": 100
},
{
"epoch": 0.05757655064119341,
"grad_norm": 0.5272794123234011,
"learning_rate": 2.8795811518324613e-06,
"logits/chosen": -0.5827732682228088,
"logits/rejected": -0.6242547035217285,
"logps/chosen": -261.44049072265625,
"logps/rejected": -211.53970336914062,
"loss": 0.6829,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.024311328306794167,
"rewards/margins": 0.0224609337747097,
"rewards/rejected": 0.0018503913888707757,
"step": 110
},
{
"epoch": 0.06281078251766553,
"grad_norm": 0.5281893762396359,
"learning_rate": 3.141361256544503e-06,
"logits/chosen": -0.5533467531204224,
"logits/rejected": -0.5774653553962708,
"logps/chosen": -323.4781799316406,
"logps/rejected": -284.3753967285156,
"loss": 0.6828,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.028400782495737076,
"rewards/margins": 0.018748918548226357,
"rewards/rejected": 0.009651863016188145,
"step": 120
},
{
"epoch": 0.06804501439413765,
"grad_norm": 0.5624037848656742,
"learning_rate": 3.403141361256545e-06,
"logits/chosen": -0.519309401512146,
"logits/rejected": -0.538366436958313,
"logps/chosen": -298.1244201660156,
"logps/rejected": -273.02166748046875,
"loss": 0.6764,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.018718790262937546,
"rewards/margins": 0.03766729682683945,
"rewards/rejected": -0.0189485065639019,
"step": 130
},
{
"epoch": 0.07327924627060979,
"grad_norm": 0.6241582910982458,
"learning_rate": 3.6649214659685865e-06,
"logits/chosen": -0.6155737638473511,
"logits/rejected": -0.615781307220459,
"logps/chosen": -254.7561798095703,
"logps/rejected": -252.4250030517578,
"loss": 0.6661,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.0013563375687226653,
"rewards/margins": 0.060362327843904495,
"rewards/rejected": -0.0617186613380909,
"step": 140
},
{
"epoch": 0.07851347814708191,
"grad_norm": 0.7819020006868519,
"learning_rate": 3.926701570680629e-06,
"logits/chosen": -0.5579255819320679,
"logits/rejected": -0.5662384033203125,
"logps/chosen": -294.1602478027344,
"logps/rejected": -283.9729309082031,
"loss": 0.6542,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.06525762379169464,
"rewards/margins": 0.0802302211523056,
"rewards/rejected": -0.14548785984516144,
"step": 150
},
{
"epoch": 0.08374771002355404,
"grad_norm": 1.0082244738839943,
"learning_rate": 4.18848167539267e-06,
"logits/chosen": -0.6144393086433411,
"logits/rejected": -0.6438087224960327,
"logps/chosen": -293.11529541015625,
"logps/rejected": -274.2389221191406,
"loss": 0.6383,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.17805984616279602,
"rewards/margins": 0.11895015090703964,
"rewards/rejected": -0.29701000452041626,
"step": 160
},
{
"epoch": 0.08898194190002617,
"grad_norm": 1.0185195747317044,
"learning_rate": 4.450261780104713e-06,
"logits/chosen": -0.6413242220878601,
"logits/rejected": -0.6541165709495544,
"logps/chosen": -264.94549560546875,
"logps/rejected": -280.5323486328125,
"loss": 0.6307,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.06771933287382126,
"rewards/margins": 0.1421503722667694,
"rewards/rejected": -0.20986969769001007,
"step": 170
},
{
"epoch": 0.0942161737764983,
"grad_norm": 1.4632164660193285,
"learning_rate": 4.712041884816754e-06,
"logits/chosen": -0.8007810711860657,
"logits/rejected": -0.8442818522453308,
"logps/chosen": -335.73040771484375,
"logps/rejected": -315.95654296875,
"loss": 0.6239,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.2423553764820099,
"rewards/margins": 0.19543686509132385,
"rewards/rejected": -0.43779221177101135,
"step": 180
},
{
"epoch": 0.09945040565297043,
"grad_norm": 1.4793919762168768,
"learning_rate": 4.9738219895287965e-06,
"logits/chosen": -0.8508358001708984,
"logits/rejected": -0.9171808362007141,
"logps/chosen": -319.34222412109375,
"logps/rejected": -312.38458251953125,
"loss": 0.5741,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.3020084500312805,
"rewards/margins": 0.32925525307655334,
"rewards/rejected": -0.6312636733055115,
"step": 190
},
{
"epoch": 0.10468463752944256,
"grad_norm": 1.3909979729045314,
"learning_rate": 4.999661831436499e-06,
"logits/chosen": -0.8342474699020386,
"logits/rejected": -0.8582927584648132,
"logps/chosen": -305.7598571777344,
"logps/rejected": -329.6683044433594,
"loss": 0.5773,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.3809957504272461,
"rewards/margins": 0.3516261875629425,
"rewards/rejected": -0.7326219081878662,
"step": 200
},
{
"epoch": 0.10468463752944256,
"eval_logits/chosen": -0.9088600277900696,
"eval_logits/rejected": -0.9407602548599243,
"eval_logps/chosen": -330.3779296875,
"eval_logps/rejected": -347.161376953125,
"eval_loss": 0.580152690410614,
"eval_rewards/accuracies": 0.7080000042915344,
"eval_rewards/chosen": -0.5398465991020203,
"eval_rewards/margins": 0.3991530239582062,
"eval_rewards/rejected": -0.9389996528625488,
"eval_runtime": 493.0243,
"eval_samples_per_second": 4.057,
"eval_steps_per_second": 0.254,
"step": 200
},
{
"epoch": 0.10991886940591468,
"grad_norm": 2.2981570438136902,
"learning_rate": 4.9984929711403395e-06,
"logits/chosen": -0.8167268633842468,
"logits/rejected": -0.8397541046142578,
"logps/chosen": -311.28460693359375,
"logps/rejected": -334.7868347167969,
"loss": 0.5759,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6255691051483154,
"rewards/margins": 0.4341684877872467,
"rewards/rejected": -1.0597375631332397,
"step": 210
},
{
"epoch": 0.11515310128238682,
"grad_norm": 2.1371618628386155,
"learning_rate": 4.996489634487865e-06,
"logits/chosen": -0.8237009048461914,
"logits/rejected": -0.8077519536018372,
"logps/chosen": -394.8115539550781,
"logps/rejected": -410.77276611328125,
"loss": 0.5898,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0387346744537354,
"rewards/margins": 0.47027960419654846,
"rewards/rejected": -1.509014368057251,
"step": 220
},
{
"epoch": 0.12038733315885894,
"grad_norm": 2.1931382967274287,
"learning_rate": 4.9936524905772466e-06,
"logits/chosen": -0.7595096826553345,
"logits/rejected": -0.7923563718795776,
"logps/chosen": -336.1356201171875,
"logps/rejected": -397.56024169921875,
"loss": 0.5126,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.9009682536125183,
"rewards/margins": 0.6920944452285767,
"rewards/rejected": -1.5930627584457397,
"step": 230
},
{
"epoch": 0.12562156503533106,
"grad_norm": 3.5772907228062776,
"learning_rate": 4.9899824869915e-06,
"logits/chosen": -0.8187972903251648,
"logits/rejected": -0.8766587376594543,
"logps/chosen": -422.3431701660156,
"logps/rejected": -457.0511779785156,
"loss": 0.5458,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.592961311340332,
"rewards/margins": 0.48186540603637695,
"rewards/rejected": -2.074826717376709,
"step": 240
},
{
"epoch": 0.13085579691180318,
"grad_norm": 1.9360981239656954,
"learning_rate": 4.985480849482012e-06,
"logits/chosen": -0.8391903042793274,
"logits/rejected": -0.8728139996528625,
"logps/chosen": -411.3055725097656,
"logps/rejected": -408.16082763671875,
"loss": 0.5184,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0500333309173584,
"rewards/margins": 0.5413428544998169,
"rewards/rejected": -1.5913760662078857,
"step": 250
},
{
"epoch": 0.1360900287882753,
"grad_norm": 3.8063942330720124,
"learning_rate": 4.980149081559142e-06,
"logits/chosen": -0.8690627813339233,
"logits/rejected": -0.9096955060958862,
"logps/chosen": -373.99871826171875,
"logps/rejected": -421.1673889160156,
"loss": 0.5362,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9393401145935059,
"rewards/margins": 0.7630335092544556,
"rewards/rejected": -1.702373743057251,
"step": 260
},
{
"epoch": 0.14132426066474746,
"grad_norm": 2.3218168753541364,
"learning_rate": 4.9739889639900655e-06,
"logits/chosen": -0.8408964276313782,
"logits/rejected": -0.8704169392585754,
"logps/chosen": -416.67352294921875,
"logps/rejected": -505.36279296875,
"loss": 0.4785,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.3642221689224243,
"rewards/margins": 0.9632150530815125,
"rewards/rejected": -2.327437162399292,
"step": 270
},
{
"epoch": 0.14655849254121958,
"grad_norm": 3.6911223439591407,
"learning_rate": 4.967002554204009e-06,
"logits/chosen": -0.9045387506484985,
"logits/rejected": -0.9169967770576477,
"logps/chosen": -359.976806640625,
"logps/rejected": -423.2659606933594,
"loss": 0.5433,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8073030710220337,
"rewards/margins": 0.6288052201271057,
"rewards/rejected": -1.4361083507537842,
"step": 280
},
{
"epoch": 0.1517927244176917,
"grad_norm": 3.3423618640552095,
"learning_rate": 4.959192185605089e-06,
"logits/chosen": -0.9444735646247864,
"logits/rejected": -0.9996434450149536,
"logps/chosen": -437.74822998046875,
"logps/rejected": -472.47003173828125,
"loss": 0.5385,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0655570030212402,
"rewards/margins": 0.7227457761764526,
"rewards/rejected": -1.7883027791976929,
"step": 290
},
{
"epoch": 0.15702695629416383,
"grad_norm": 1.7496473866481979,
"learning_rate": 4.950560466792969e-06,
"logits/chosen": -0.9565297961235046,
"logits/rejected": -1.0268198251724243,
"logps/chosen": -383.75982666015625,
"logps/rejected": -431.46868896484375,
"loss": 0.546,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.1211820840835571,
"rewards/margins": 0.7908428907394409,
"rewards/rejected": -1.9120250940322876,
"step": 300
},
{
"epoch": 0.15702695629416383,
"eval_logits/chosen": -1.0509591102600098,
"eval_logits/rejected": -1.093695044517517,
"eval_logps/chosen": -375.9070739746094,
"eval_logps/rejected": -426.7812194824219,
"eval_loss": 0.5337316393852234,
"eval_rewards/accuracies": 0.7369999885559082,
"eval_rewards/chosen": -0.9951376914978027,
"eval_rewards/margins": 0.7400606274604797,
"eval_rewards/rejected": -1.7351982593536377,
"eval_runtime": 490.0828,
"eval_samples_per_second": 4.081,
"eval_steps_per_second": 0.255,
"step": 300
},
{
"epoch": 0.16226118817063595,
"grad_norm": 4.167924029619072,
"learning_rate": 4.9411102806916185e-06,
"logits/chosen": -0.9661039113998413,
"logits/rejected": -1.0193841457366943,
"logps/chosen": -386.1661071777344,
"logps/rejected": -459.80059814453125,
"loss": 0.4885,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.1400443315505981,
"rewards/margins": 0.9499530792236328,
"rewards/rejected": -2.0899975299835205,
"step": 310
},
{
"epoch": 0.16749542004710807,
"grad_norm": 3.7134888984526406,
"learning_rate": 4.930844783586424e-06,
"logits/chosen": -0.9965893030166626,
"logits/rejected": -1.0922787189483643,
"logps/chosen": -487.04620361328125,
"logps/rejected": -534.34326171875,
"loss": 0.5121,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.133643627166748,
"rewards/margins": 0.8644089698791504,
"rewards/rejected": -2.9980525970458984,
"step": 320
},
{
"epoch": 0.17272965192358022,
"grad_norm": 2.6372716992994625,
"learning_rate": 4.919767404070033e-06,
"logits/chosen": -1.044533371925354,
"logits/rejected": -1.132899522781372,
"logps/chosen": -446.17132568359375,
"logps/rejected": -512.2128295898438,
"loss": 0.5259,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.830833077430725,
"rewards/margins": 0.8785037994384766,
"rewards/rejected": -2.709336519241333,
"step": 330
},
{
"epoch": 0.17796388380005235,
"grad_norm": 2.585264829780915,
"learning_rate": 4.907881841897216e-06,
"logits/chosen": -1.0099565982818604,
"logits/rejected": -1.0530710220336914,
"logps/chosen": -442.65887451171875,
"logps/rejected": -488.6505432128906,
"loss": 0.4837,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5058012008666992,
"rewards/margins": 0.7143822908401489,
"rewards/rejected": -2.2201833724975586,
"step": 340
},
{
"epoch": 0.18319811567652447,
"grad_norm": 3.3879317879760484,
"learning_rate": 4.89519206674919e-06,
"logits/chosen": -0.9927116632461548,
"logits/rejected": -1.0346943140029907,
"logps/chosen": -456.9622497558594,
"logps/rejected": -577.3074951171875,
"loss": 0.4784,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.0847067832946777,
"rewards/margins": 1.002403974533081,
"rewards/rejected": -3.0871105194091797,
"step": 350
},
{
"epoch": 0.1884323475529966,
"grad_norm": 3.57516881024075,
"learning_rate": 4.881702316907769e-06,
"logits/chosen": -1.0031338930130005,
"logits/rejected": -1.0955650806427002,
"logps/chosen": -503.44683837890625,
"logps/rejected": -570.7683715820312,
"loss": 0.4926,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.9374068975448608,
"rewards/margins": 1.0033470392227173,
"rewards/rejected": -2.940753936767578,
"step": 360
},
{
"epoch": 0.19366657942946872,
"grad_norm": 5.038599527660393,
"learning_rate": 4.86741709783982e-06,
"logits/chosen": -1.026948094367981,
"logits/rejected": -1.0862653255462646,
"logps/chosen": -428.8387145996094,
"logps/rejected": -521.2025146484375,
"loss": 0.4521,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.6744959354400635,
"rewards/margins": 1.2723809480667114,
"rewards/rejected": -2.9468765258789062,
"step": 370
},
{
"epoch": 0.19890081130594087,
"grad_norm": 2.580969896149703,
"learning_rate": 4.852341180692471e-06,
"logits/chosen": -1.0237780809402466,
"logits/rejected": -1.0312225818634033,
"logps/chosen": -460.7245178222656,
"logps/rejected": -579.8516845703125,
"loss": 0.4707,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.8517287969589233,
"rewards/margins": 1.0035735368728638,
"rewards/rejected": -2.855302333831787,
"step": 380
},
{
"epoch": 0.204135043182413,
"grad_norm": 4.049123524870898,
"learning_rate": 4.836479600699579e-06,
"logits/chosen": -1.021194577217102,
"logits/rejected": -1.065612554550171,
"logps/chosen": -474.7022399902344,
"logps/rejected": -515.19384765625,
"loss": 0.5062,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6703779697418213,
"rewards/margins": 0.9831420183181763,
"rewards/rejected": -2.653519868850708,
"step": 390
},
{
"epoch": 0.2093692750588851,
"grad_norm": 2.669349458066594,
"learning_rate": 4.819837655500014e-06,
"logits/chosen": -1.055345892906189,
"logits/rejected": -1.1153924465179443,
"logps/chosen": -409.0503845214844,
"logps/rejected": -475.8119201660156,
"loss": 0.501,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5296335220336914,
"rewards/margins": 0.9046236872673035,
"rewards/rejected": -2.4342570304870605,
"step": 400
},
{
"epoch": 0.2093692750588851,
"eval_logits/chosen": -1.0595479011535645,
"eval_logits/rejected": -1.101104974746704,
"eval_logps/chosen": -458.5479431152344,
"eval_logps/rejected": -529.427734375,
"eval_loss": 0.5120114088058472,
"eval_rewards/accuracies": 0.753000020980835,
"eval_rewards/chosen": -1.8215464353561401,
"eval_rewards/margins": 0.9401166439056396,
"eval_rewards/rejected": -2.7616631984710693,
"eval_runtime": 490.6583,
"eval_samples_per_second": 4.076,
"eval_steps_per_second": 0.255,
"step": 400
},
{
"epoch": 0.21460350693535724,
"grad_norm": 2.693659265285776,
"learning_rate": 4.802420903368286e-06,
"logits/chosen": -1.017884612083435,
"logits/rejected": -1.0450990200042725,
"logps/chosen": -452.4844665527344,
"logps/rejected": -545.2508544921875,
"loss": 0.5075,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.978095293045044,
"rewards/margins": 0.8133966326713562,
"rewards/rejected": -2.791491746902466,
"step": 410
},
{
"epoch": 0.21983773881182936,
"grad_norm": 3.0138584618040816,
"learning_rate": 4.784235161358124e-06,
"logits/chosen": -1.0173218250274658,
"logits/rejected": -1.0448085069656372,
"logps/chosen": -483.49609375,
"logps/rejected": -571.6468505859375,
"loss": 0.5192,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.0148584842681885,
"rewards/margins": 0.8309059143066406,
"rewards/rejected": -2.84576416015625,
"step": 420
},
{
"epoch": 0.22507197068830148,
"grad_norm": 2.7069812702685327,
"learning_rate": 4.765286503359632e-06,
"logits/chosen": -0.953274130821228,
"logits/rejected": -1.0237443447113037,
"logps/chosen": -445.3104553222656,
"logps/rejected": -500.704345703125,
"loss": 0.5173,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.7284530401229858,
"rewards/margins": 0.7751290798187256,
"rewards/rejected": -2.503582239151001,
"step": 430
},
{
"epoch": 0.23030620256477363,
"grad_norm": 2.6842575235901256,
"learning_rate": 4.745581258070654e-06,
"logits/chosen": -0.9820082783699036,
"logits/rejected": -1.0258147716522217,
"logps/chosen": -424.865966796875,
"logps/rejected": -499.13031005859375,
"loss": 0.5075,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.5546586513519287,
"rewards/margins": 0.8893228769302368,
"rewards/rejected": -2.443981409072876,
"step": 440
},
{
"epoch": 0.23554043444124576,
"grad_norm": 2.8248158426262036,
"learning_rate": 4.725126006883047e-06,
"logits/chosen": -0.9185377955436707,
"logits/rejected": -0.94728022813797,
"logps/chosen": -403.70159912109375,
"logps/rejected": -514.3756713867188,
"loss": 0.5284,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.593689203262329,
"rewards/margins": 0.8767625689506531,
"rewards/rejected": -2.470451593399048,
"step": 450
},
{
"epoch": 0.24077466631771788,
"grad_norm": 3.479296042367863,
"learning_rate": 4.70392758168454e-06,
"logits/chosen": -0.8726640939712524,
"logits/rejected": -0.9334267377853394,
"logps/chosen": -439.721435546875,
"logps/rejected": -487.37384033203125,
"loss": 0.5126,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5047967433929443,
"rewards/margins": 0.8623396158218384,
"rewards/rejected": -2.3671364784240723,
"step": 460
},
{
"epoch": 0.24600889819419,
"grad_norm": 4.319750283791727,
"learning_rate": 4.68199306257695e-06,
"logits/chosen": -0.9039748311042786,
"logits/rejected": -0.9309199452400208,
"logps/chosen": -492.0690002441406,
"logps/rejected": -554.73388671875,
"loss": 0.4953,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.8066781759262085,
"rewards/margins": 0.9244368672370911,
"rewards/rejected": -2.7311148643493652,
"step": 470
},
{
"epoch": 0.2512431300706621,
"grad_norm": 3.14164730805949,
"learning_rate": 4.659329775511478e-06,
"logits/chosen": -0.9303410649299622,
"logits/rejected": -0.977057933807373,
"logps/chosen": -399.0423889160156,
"logps/rejected": -490.00543212890625,
"loss": 0.4683,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4180445671081543,
"rewards/margins": 1.0890886783599854,
"rewards/rejected": -2.5071330070495605,
"step": 480
},
{
"epoch": 0.2564773619471343,
"grad_norm": 2.3614868571221357,
"learning_rate": 4.635945289841902e-06,
"logits/chosen": -0.8816567659378052,
"logits/rejected": -0.9269768595695496,
"logps/chosen": -415.19219970703125,
"logps/rejected": -476.89556884765625,
"loss": 0.4735,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3752299547195435,
"rewards/margins": 0.9180902242660522,
"rewards/rejected": -2.2933201789855957,
"step": 490
},
{
"epoch": 0.26171159382360637,
"grad_norm": 2.8583338952559925,
"learning_rate": 4.611847415796476e-06,
"logits/chosen": -0.8435959815979004,
"logits/rejected": -0.9220407605171204,
"logps/chosen": -441.06201171875,
"logps/rejected": -533.26904296875,
"loss": 0.4525,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -1.673166275024414,
"rewards/margins": 1.2388008832931519,
"rewards/rejected": -2.9119672775268555,
"step": 500
},
{
"epoch": 0.26171159382360637,
"eval_logits/chosen": -0.9134161472320557,
"eval_logits/rejected": -0.9429917931556702,
"eval_logps/chosen": -474.96240234375,
"eval_logps/rejected": -561.74462890625,
"eval_loss": 0.5090299248695374,
"eval_rewards/accuracies": 0.7509999871253967,
"eval_rewards/chosen": -1.985690951347351,
"eval_rewards/margins": 1.0991418361663818,
"eval_rewards/rejected": -3.0848329067230225,
"eval_runtime": 490.4743,
"eval_samples_per_second": 4.078,
"eval_steps_per_second": 0.255,
"step": 500
},
{
"epoch": 0.2669458257000785,
"grad_norm": 3.035220836230476,
"learning_rate": 4.587044201869378e-06,
"logits/chosen": -0.8761506080627441,
"logits/rejected": -0.902166485786438,
"logps/chosen": -425.97332763671875,
"logps/rejected": -501.02789306640625,
"loss": 0.497,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.6953567266464233,
"rewards/margins": 0.9890697598457336,
"rewards/rejected": -2.6844265460968018,
"step": 510
},
{
"epoch": 0.2721800575765506,
"grad_norm": 3.130477955866568,
"learning_rate": 4.561543932132574e-06,
"logits/chosen": -0.7917976379394531,
"logits/rejected": -0.8613120317459106,
"logps/chosen": -403.6101379394531,
"logps/rejected": -495.6336975097656,
"loss": 0.4638,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1837232112884521,
"rewards/margins": 1.1299959421157837,
"rewards/rejected": -2.3137192726135254,
"step": 520
},
{
"epoch": 0.27741428945302277,
"grad_norm": 5.62051791025711,
"learning_rate": 4.535355123469009e-06,
"logits/chosen": -0.8685650825500488,
"logits/rejected": -0.9064447283744812,
"logps/chosen": -453.49505615234375,
"logps/rejected": -571.8440551757812,
"loss": 0.4962,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.7710943222045898,
"rewards/margins": 1.2530162334442139,
"rewards/rejected": -3.0241103172302246,
"step": 530
},
{
"epoch": 0.2826485213294949,
"grad_norm": 4.031206237193703,
"learning_rate": 4.508486522728037e-06,
"logits/chosen": -0.8691636919975281,
"logits/rejected": -0.8954900503158569,
"logps/chosen": -481.83001708984375,
"logps/rejected": -557.0582275390625,
"loss": 0.464,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.1070663928985596,
"rewards/margins": 1.1235601902008057,
"rewards/rejected": -3.2306265830993652,
"step": 540
},
{
"epoch": 0.287882753205967,
"grad_norm": 2.5570522477618023,
"learning_rate": 4.480947103804044e-06,
"logits/chosen": -0.8274758458137512,
"logits/rejected": -0.8582462072372437,
"logps/chosen": -516.5406494140625,
"logps/rejected": -561.04736328125,
"loss": 0.5267,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.3619277477264404,
"rewards/margins": 0.9269983172416687,
"rewards/rejected": -3.288925886154175,
"step": 550
},
{
"epoch": 0.29311698508243916,
"grad_norm": 2.788640181418083,
"learning_rate": 4.452746064639239e-06,
"logits/chosen": -0.9235193133354187,
"logits/rejected": -0.9567066431045532,
"logps/chosen": -500.55810546875,
"logps/rejected": -591.5838012695312,
"loss": 0.4856,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.2232844829559326,
"rewards/margins": 0.9942595362663269,
"rewards/rejected": -3.217543840408325,
"step": 560
},
{
"epoch": 0.29835121695891126,
"grad_norm": 2.993526317828841,
"learning_rate": 4.423892824151617e-06,
"logits/chosen": -0.9176028966903687,
"logits/rejected": -0.9547454118728638,
"logps/chosen": -483.21490478515625,
"logps/rejected": -539.8753051757812,
"loss": 0.4752,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.7947994470596313,
"rewards/margins": 0.9781554937362671,
"rewards/rejected": -2.7729547023773193,
"step": 570
},
{
"epoch": 0.3035854488353834,
"grad_norm": 3.5431736202634885,
"learning_rate": 4.3943970190891164e-06,
"logits/chosen": -0.9206205606460571,
"logits/rejected": -0.9628446698188782,
"logps/chosen": -463.5152282714844,
"logps/rejected": -528.37158203125,
"loss": 0.4885,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.6288913488388062,
"rewards/margins": 1.1642273664474487,
"rewards/rejected": -2.793118953704834,
"step": 580
},
{
"epoch": 0.30881968071185556,
"grad_norm": 3.365610293148701,
"learning_rate": 4.364268500811025e-06,
"logits/chosen": -0.8485568165779114,
"logits/rejected": -0.9199141263961792,
"logps/chosen": -483.6082458496094,
"logps/rejected": -560.9791259765625,
"loss": 0.5057,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.1659016609191895,
"rewards/margins": 1.034793734550476,
"rewards/rejected": -3.200695514678955,
"step": 590
},
{
"epoch": 0.31405391258832765,
"grad_norm": 4.5918534300247975,
"learning_rate": 4.333517331997704e-06,
"logits/chosen": -0.9025293588638306,
"logits/rejected": -0.9563030004501343,
"logps/chosen": -486.9297790527344,
"logps/rejected": -560.46630859375,
"loss": 0.508,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.339505434036255,
"rewards/margins": 0.8578627705574036,
"rewards/rejected": -3.1973681449890137,
"step": 600
},
{
"epoch": 0.31405391258832765,
"eval_logits/chosen": -0.96256422996521,
"eval_logits/rejected": -0.9954620599746704,
"eval_logps/chosen": -497.45501708984375,
"eval_logps/rejected": -568.3763427734375,
"eval_loss": 0.5005487203598022,
"eval_rewards/accuracies": 0.7599999904632568,
"eval_rewards/chosen": -2.210617780685425,
"eval_rewards/margins": 0.940531849861145,
"eval_rewards/rejected": -3.151149272918701,
"eval_runtime": 491.9737,
"eval_samples_per_second": 4.065,
"eval_steps_per_second": 0.254,
"step": 600
},
{
"epoch": 0.3192881444647998,
"grad_norm": 2.901694328327758,
"learning_rate": 4.302153783289737e-06,
"logits/chosen": -0.9074804186820984,
"logits/rejected": -0.9480104446411133,
"logps/chosen": -492.01910400390625,
"logps/rejected": -600.9209594726562,
"loss": 0.469,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.1020569801330566,
"rewards/margins": 1.1244044303894043,
"rewards/rejected": -3.226461410522461,
"step": 610
},
{
"epoch": 0.3245223763412719,
"grad_norm": 3.94946155728758,
"learning_rate": 4.270188329857613e-06,
"logits/chosen": -0.8973082304000854,
"logits/rejected": -0.9559001922607422,
"logps/chosen": -465.6111755371094,
"logps/rejected": -525.2369384765625,
"loss": 0.4846,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6850999593734741,
"rewards/margins": 1.1260297298431396,
"rewards/rejected": -2.811129570007324,
"step": 620
},
{
"epoch": 0.32975660821774405,
"grad_norm": 3.033290372148656,
"learning_rate": 4.237631647903115e-06,
"logits/chosen": -0.9191315770149231,
"logits/rejected": -0.9655174016952515,
"logps/chosen": -424.0669860839844,
"logps/rejected": -516.409912109375,
"loss": 0.5221,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6103105545043945,
"rewards/margins": 1.0200879573822021,
"rewards/rejected": -2.630398988723755,
"step": 630
},
{
"epoch": 0.33499084009421615,
"grad_norm": 6.947248145776445,
"learning_rate": 4.204494611093548e-06,
"logits/chosen": -0.8765581846237183,
"logits/rejected": -0.9510295987129211,
"logps/chosen": -403.50701904296875,
"logps/rejected": -494.7275390625,
"loss": 0.5006,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.570704698562622,
"rewards/margins": 1.1784632205963135,
"rewards/rejected": -2.7491679191589355,
"step": 640
},
{
"epoch": 0.3402250719706883,
"grad_norm": 3.486686904718388,
"learning_rate": 4.170788286930024e-06,
"logits/chosen": -0.9069339632987976,
"logits/rejected": -0.9199384450912476,
"logps/chosen": -444.85009765625,
"logps/rejected": -555.9014282226562,
"loss": 0.5045,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.9438743591308594,
"rewards/margins": 1.0122339725494385,
"rewards/rejected": -2.956108570098877,
"step": 650
},
{
"epoch": 0.34545930384716045,
"grad_norm": 4.061094803349214,
"learning_rate": 4.136523933051005e-06,
"logits/chosen": -0.8677660822868347,
"logits/rejected": -0.9190397262573242,
"logps/chosen": -406.159912109375,
"logps/rejected": -498.9130859375,
"loss": 0.497,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4709824323654175,
"rewards/margins": 0.9458149671554565,
"rewards/rejected": -2.416797399520874,
"step": 660
},
{
"epoch": 0.35069353572363254,
"grad_norm": 4.528134734617534,
"learning_rate": 4.101712993472348e-06,
"logits/chosen": -0.8407672643661499,
"logits/rejected": -0.8770118951797485,
"logps/chosen": -421.9403381347656,
"logps/rejected": -511.84820556640625,
"loss": 0.5147,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.5217100381851196,
"rewards/margins": 1.0356905460357666,
"rewards/rejected": -2.5574002265930176,
"step": 670
},
{
"epoch": 0.3559277676001047,
"grad_norm": 3.331708093137532,
"learning_rate": 4.066367094765091e-06,
"logits/chosen": -0.8362857103347778,
"logits/rejected": -0.8756265640258789,
"logps/chosen": -427.3866271972656,
"logps/rejected": -472.51116943359375,
"loss": 0.5107,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.6107534170150757,
"rewards/margins": 0.761100709438324,
"rewards/rejected": -2.371854066848755,
"step": 680
},
{
"epoch": 0.3611619994765768,
"grad_norm": 3.054351768159171,
"learning_rate": 4.030498042172277e-06,
"logits/chosen": -0.887158989906311,
"logits/rejected": -0.9305670857429504,
"logps/chosen": -436.98663330078125,
"logps/rejected": -536.4581298828125,
"loss": 0.4933,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.5907622575759888,
"rewards/margins": 1.0380842685699463,
"rewards/rejected": -2.6288466453552246,
"step": 690
},
{
"epoch": 0.36639623135304894,
"grad_norm": 3.9217623567649165,
"learning_rate": 3.994117815666095e-06,
"logits/chosen": -0.8911476135253906,
"logits/rejected": -0.9339841604232788,
"logps/chosen": -435.4269104003906,
"logps/rejected": -515.2606201171875,
"loss": 0.4852,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.6368818283081055,
"rewards/margins": 0.932881236076355,
"rewards/rejected": -2.56976318359375,
"step": 700
},
{
"epoch": 0.36639623135304894,
"eval_logits/chosen": -0.9476205110549927,
"eval_logits/rejected": -0.9794169664382935,
"eval_logps/chosen": -416.10260009765625,
"eval_logps/rejected": -494.5317077636719,
"eval_loss": 0.5027905106544495,
"eval_rewards/accuracies": 0.7770000100135803,
"eval_rewards/chosen": -1.3970927000045776,
"eval_rewards/margins": 1.0156108140945435,
"eval_rewards/rejected": -2.412703275680542,
"eval_runtime": 489.1035,
"eval_samples_per_second": 4.089,
"eval_steps_per_second": 0.256,
"step": 700
},
{
"epoch": 0.3716304632295211,
"grad_norm": 3.6606573607441772,
"learning_rate": 3.957238565946672e-06,
"logits/chosen": -0.8809655904769897,
"logits/rejected": -0.9375411868095398,
"logps/chosen": -473.4790954589844,
"logps/rejected": -525.9794311523438,
"loss": 0.4515,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.573014736175537,
"rewards/margins": 1.1010257005691528,
"rewards/rejected": -2.6740405559539795,
"step": 710
},
{
"epoch": 0.3768646951059932,
"grad_norm": 4.5368251934335655,
"learning_rate": 3.919872610383831e-06,
"logits/chosen": -0.8901004791259766,
"logits/rejected": -0.9005835652351379,
"logps/chosen": -456.73626708984375,
"logps/rejected": -578.81201171875,
"loss": 0.4683,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.028618335723877,
"rewards/margins": 1.2738052606582642,
"rewards/rejected": -3.3024234771728516,
"step": 720
},
{
"epoch": 0.38209892698246534,
"grad_norm": 4.7885163624566704,
"learning_rate": 3.882032428903195e-06,
"logits/chosen": -0.8851186633110046,
"logits/rejected": -0.8868842124938965,
"logps/chosen": -499.48040771484375,
"logps/rejected": -608.1116333007812,
"loss": 0.5104,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.6055073738098145,
"rewards/margins": 1.1688528060913086,
"rewards/rejected": -3.774359941482544,
"step": 730
},
{
"epoch": 0.38733315885893743,
"grad_norm": 2.6857761500293766,
"learning_rate": 3.84373065981799e-06,
"logits/chosen": -0.8212822675704956,
"logits/rejected": -0.851559042930603,
"logps/chosen": -512.4810791015625,
"logps/rejected": -562.7243041992188,
"loss": 0.451,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.420043468475342,
"rewards/margins": 1.00551438331604,
"rewards/rejected": -3.425558090209961,
"step": 740
},
{
"epoch": 0.3925673907354096,
"grad_norm": 5.276994078087987,
"learning_rate": 3.8049800956079552e-06,
"logits/chosen": -0.8459002375602722,
"logits/rejected": -0.8481931686401367,
"logps/chosen": -452.31103515625,
"logps/rejected": -549.0643310546875,
"loss": 0.4316,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8753325939178467,
"rewards/margins": 1.1157124042510986,
"rewards/rejected": -2.9910449981689453,
"step": 750
},
{
"epoch": 0.39780162261188173,
"grad_norm": 4.2662925093714525,
"learning_rate": 3.765793678646753e-06,
"logits/chosen": -0.8927903175354004,
"logits/rejected": -0.9021077156066895,
"logps/chosen": -400.74273681640625,
"logps/rejected": -483.39813232421875,
"loss": 0.538,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.5194071531295776,
"rewards/margins": 1.0460177659988403,
"rewards/rejected": -2.565425157546997,
"step": 760
},
{
"epoch": 0.40303585448835383,
"grad_norm": 2.642122221399109,
"learning_rate": 3.726184496879323e-06,
"logits/chosen": -0.8244895935058594,
"logits/rejected": -0.8434764742851257,
"logps/chosen": -467.0747985839844,
"logps/rejected": -540.9425048828125,
"loss": 0.5261,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.963444471359253,
"rewards/margins": 0.8455499410629272,
"rewards/rejected": -2.8089945316314697,
"step": 770
},
{
"epoch": 0.408270086364826,
"grad_norm": 3.728031543819746,
"learning_rate": 3.686165779450619e-06,
"logits/chosen": -0.8528604507446289,
"logits/rejected": -0.8891122937202454,
"logps/chosen": -460.427734375,
"logps/rejected": -544.4713134765625,
"loss": 0.4439,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.879953145980835,
"rewards/margins": 1.079024314880371,
"rewards/rejected": -2.958977699279785,
"step": 780
},
{
"epoch": 0.4135043182412981,
"grad_norm": 2.85654604962964,
"learning_rate": 3.645750892287178e-06,
"logits/chosen": -0.8759455680847168,
"logits/rejected": -0.8966878652572632,
"logps/chosen": -451.11865234375,
"logps/rejected": -574.4451293945312,
"loss": 0.5208,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9754890203475952,
"rewards/margins": 1.0046672821044922,
"rewards/rejected": -2.980156421661377,
"step": 790
},
{
"epoch": 0.4187385501177702,
"grad_norm": 2.7417872161100374,
"learning_rate": 3.604953333633009e-06,
"logits/chosen": -0.8261939883232117,
"logits/rejected": -0.884585976600647,
"logps/chosen": -468.4187927246094,
"logps/rejected": -541.056640625,
"loss": 0.5474,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.829167366027832,
"rewards/margins": 0.9327658414840698,
"rewards/rejected": -2.7619330883026123,
"step": 800
},
{
"epoch": 0.4187385501177702,
"eval_logits/chosen": -0.8851308822631836,
"eval_logits/rejected": -0.9115467667579651,
"eval_logps/chosen": -455.87139892578125,
"eval_logps/rejected": -529.6283569335938,
"eval_loss": 0.4966377317905426,
"eval_rewards/accuracies": 0.7670000195503235,
"eval_rewards/chosen": -1.794780969619751,
"eval_rewards/margins": 0.9688891768455505,
"eval_rewards/rejected": -2.7636702060699463,
"eval_runtime": 491.3021,
"eval_samples_per_second": 4.071,
"eval_steps_per_second": 0.254,
"step": 800
},
{
"epoch": 0.4239727819942423,
"grad_norm": 4.3302971642908465,
"learning_rate": 3.56378672954129e-06,
"logits/chosen": -0.8280191421508789,
"logits/rejected": -0.8614367246627808,
"logps/chosen": -439.85662841796875,
"logps/rejected": -539.2572021484375,
"loss": 0.5092,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.863586187362671,
"rewards/margins": 0.8855009078979492,
"rewards/rejected": -2.74908709526062,
"step": 810
},
{
"epoch": 0.42920701387071447,
"grad_norm": 3.1116409091724786,
"learning_rate": 3.5222648293233806e-06,
"logits/chosen": -0.8204169273376465,
"logits/rejected": -0.8331009149551392,
"logps/chosen": -489.24383544921875,
"logps/rejected": -579.9151611328125,
"loss": 0.5038,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.3326661586761475,
"rewards/margins": 1.1430214643478394,
"rewards/rejected": -3.4756877422332764,
"step": 820
},
{
"epoch": 0.4344412457471866,
"grad_norm": 3.2429890344778074,
"learning_rate": 3.4804015009566573e-06,
"logits/chosen": -0.7415329217910767,
"logits/rejected": -0.7724164724349976,
"logps/chosen": -455.18572998046875,
"logps/rejected": -487.82733154296875,
"loss": 0.5401,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.0002198219299316,
"rewards/margins": 0.7170491218566895,
"rewards/rejected": -2.717268705368042,
"step": 830
},
{
"epoch": 0.4396754776236587,
"grad_norm": 3.5668607410560975,
"learning_rate": 3.4382107264527244e-06,
"logits/chosen": -0.7589952945709229,
"logits/rejected": -0.7982163429260254,
"logps/chosen": -483.5228576660156,
"logps/rejected": -528.97021484375,
"loss": 0.4736,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.7952258586883545,
"rewards/margins": 1.0175402164459229,
"rewards/rejected": -2.8127660751342773,
"step": 840
},
{
"epoch": 0.44490970950013087,
"grad_norm": 4.0501340588509365,
"learning_rate": 3.3957065971875387e-06,
"logits/chosen": -0.795210063457489,
"logits/rejected": -0.8109579086303711,
"logps/chosen": -458.65814208984375,
"logps/rejected": -532.5260620117188,
"loss": 0.4962,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.041184902191162,
"rewards/margins": 1.0093342065811157,
"rewards/rejected": -3.0505189895629883,
"step": 850
},
{
"epoch": 0.45014394137660296,
"grad_norm": 3.973745164078984,
"learning_rate": 3.352903309194999e-06,
"logits/chosen": -0.8046371340751648,
"logits/rejected": -0.8593829274177551,
"logps/chosen": -461.7774353027344,
"logps/rejected": -595.0950927734375,
"loss": 0.4721,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.2889552116394043,
"rewards/margins": 1.1978638172149658,
"rewards/rejected": -3.4868195056915283,
"step": 860
},
{
"epoch": 0.4553781732530751,
"grad_norm": 2.672124703356638,
"learning_rate": 3.309815158425591e-06,
"logits/chosen": -0.7477758526802063,
"logits/rejected": -0.7731175422668457,
"logps/chosen": -472.6768493652344,
"logps/rejected": -555.8510131835938,
"loss": 0.5073,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.2288527488708496,
"rewards/margins": 0.9096274375915527,
"rewards/rejected": -3.1384804248809814,
"step": 870
},
{
"epoch": 0.46061240512954726,
"grad_norm": 2.903999411374589,
"learning_rate": 3.266456535971654e-06,
"logits/chosen": -0.864072322845459,
"logits/rejected": -0.8836487531661987,
"logps/chosen": -464.50115966796875,
"logps/rejected": -530.8693237304688,
"loss": 0.4929,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.0616631507873535,
"rewards/margins": 0.824569821357727,
"rewards/rejected": -2.88623309135437,
"step": 880
},
{
"epoch": 0.46584663700601936,
"grad_norm": 2.4287136163193974,
"learning_rate": 3.2228419232608692e-06,
"logits/chosen": -0.8057514429092407,
"logits/rejected": -0.8173397183418274,
"logps/chosen": -457.15032958984375,
"logps/rejected": -542.5882568359375,
"loss": 0.4903,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.8502060174942017,
"rewards/margins": 0.9911335110664368,
"rewards/rejected": -2.841339588165283,
"step": 890
},
{
"epoch": 0.4710808688824915,
"grad_norm": 2.8863413320590774,
"learning_rate": 3.1789858872195888e-06,
"logits/chosen": -0.771654486656189,
"logits/rejected": -0.8108331561088562,
"logps/chosen": -418.285888671875,
"logps/rejected": -514.6083984375,
"loss": 0.5246,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.6747478246688843,
"rewards/margins": 0.8920180201530457,
"rewards/rejected": -2.566765785217285,
"step": 900
},
{
"epoch": 0.4710808688824915,
"eval_logits/chosen": -0.7979649305343628,
"eval_logits/rejected": -0.8138096928596497,
"eval_logps/chosen": -429.2431335449219,
"eval_logps/rejected": -507.4219055175781,
"eval_loss": 0.49433061480522156,
"eval_rewards/accuracies": 0.765999972820282,
"eval_rewards/chosen": -1.5284981727600098,
"eval_rewards/margins": 1.013107419013977,
"eval_rewards/rejected": -2.5416059494018555,
"eval_runtime": 490.1393,
"eval_samples_per_second": 4.08,
"eval_steps_per_second": 0.255,
"step": 900
},
{
"epoch": 0.4763151007589636,
"grad_norm": 2.884609247157233,
"learning_rate": 3.1349030754075945e-06,
"logits/chosen": -0.7577202320098877,
"logits/rejected": -0.764434278011322,
"logps/chosen": -435.4732360839844,
"logps/rejected": -516.2320556640625,
"loss": 0.5219,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7493627071380615,
"rewards/margins": 0.8683775067329407,
"rewards/rejected": -2.6177401542663574,
"step": 910
},
{
"epoch": 0.48154933263543576,
"grad_norm": 2.1669192855886257,
"learning_rate": 3.0906082111259313e-06,
"logits/chosen": -0.8027156591415405,
"logits/rejected": -0.8391119837760925,
"logps/chosen": -479.461181640625,
"logps/rejected": -578.0491943359375,
"loss": 0.4802,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.1167943477630615,
"rewards/margins": 1.2451462745666504,
"rewards/rejected": -3.361940383911133,
"step": 920
},
{
"epoch": 0.48678356451190785,
"grad_norm": 3.304438899218671,
"learning_rate": 3.046116088499449e-06,
"logits/chosen": -0.8075677752494812,
"logits/rejected": -0.818462073802948,
"logps/chosen": -551.7511596679688,
"logps/rejected": -617.6654052734375,
"loss": 0.4831,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.702815532684326,
"rewards/margins": 0.8609806895256042,
"rewards/rejected": -3.563796281814575,
"step": 930
},
{
"epoch": 0.49201779638838,
"grad_norm": 5.162139925744972,
"learning_rate": 3.0014415675356813e-06,
"logits/chosen": -0.8480289578437805,
"logits/rejected": -0.8807282447814941,
"logps/chosen": -580.1326904296875,
"logps/rejected": -658.1190795898438,
"loss": 0.451,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.9515488147735596,
"rewards/margins": 1.0807795524597168,
"rewards/rejected": -4.032328128814697,
"step": 940
},
{
"epoch": 0.49725202826485215,
"grad_norm": 4.270308518724467,
"learning_rate": 2.9565995691617242e-06,
"logits/chosen": -0.8305244445800781,
"logits/rejected": -0.8603514432907104,
"logps/chosen": -577.685302734375,
"logps/rejected": -670.2604370117188,
"loss": 0.4688,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.654804229736328,
"rewards/margins": 1.3251168727874756,
"rewards/rejected": -3.9799208641052246,
"step": 950
},
{
"epoch": 0.5024862601413242,
"grad_norm": 3.9112848274397374,
"learning_rate": 2.9116050702407706e-06,
"logits/chosen": -0.8263424634933472,
"logits/rejected": -0.8838019371032715,
"logps/chosen": -500.05841064453125,
"logps/rejected": -590.2412109375,
"loss": 0.5068,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.197012424468994,
"rewards/margins": 1.2063963413238525,
"rewards/rejected": -3.4034087657928467,
"step": 960
},
{
"epoch": 0.5077204920177963,
"grad_norm": 2.6527116057322813,
"learning_rate": 2.8664730985699537e-06,
"logits/chosen": -0.8739676475524902,
"logits/rejected": -0.8983599543571472,
"logps/chosen": -471.83978271484375,
"logps/rejected": -584.647216796875,
"loss": 0.4578,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.0771896839141846,
"rewards/margins": 1.0904505252838135,
"rewards/rejected": -3.167639970779419,
"step": 970
},
{
"epoch": 0.5129547238942685,
"grad_norm": 3.0654272233325806,
"learning_rate": 2.8212187278611907e-06,
"logits/chosen": -0.8305708765983582,
"logits/rejected": -0.8489472270011902,
"logps/chosen": -484.65679931640625,
"logps/rejected": -540.0225219726562,
"loss": 0.4925,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.8167269229888916,
"rewards/margins": 0.965971827507019,
"rewards/rejected": -2.7826988697052,
"step": 980
},
{
"epoch": 0.5181889557707406,
"grad_norm": 2.8308862425981896,
"learning_rate": 2.7758570727066843e-06,
"logits/chosen": -0.8484777212142944,
"logits/rejected": -0.861672580242157,
"logps/chosen": -470.0426330566406,
"logps/rejected": -552.4635620117188,
"loss": 0.5011,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.991310715675354,
"rewards/margins": 1.036561369895935,
"rewards/rejected": -3.027872085571289,
"step": 990
},
{
"epoch": 0.5234231876472127,
"grad_norm": 3.1475704256823818,
"learning_rate": 2.730403283530767e-06,
"logits/chosen": -0.7995238900184631,
"logits/rejected": -0.8208199739456177,
"logps/chosen": -514.13525390625,
"logps/rejected": -616.7322387695312,
"loss": 0.4635,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.589371681213379,
"rewards/margins": 1.151365876197815,
"rewards/rejected": -3.7407374382019043,
"step": 1000
},
{
"epoch": 0.5234231876472127,
"eval_logits/chosen": -0.8520967364311218,
"eval_logits/rejected": -0.871288001537323,
"eval_logps/chosen": -558.1610107421875,
"eval_logps/rejected": -656.6334228515625,
"eval_loss": 0.4907907247543335,
"eval_rewards/accuracies": 0.7630000114440918,
"eval_rewards/chosen": -2.817676544189453,
"eval_rewards/margins": 1.2160439491271973,
"eval_rewards/rejected": -4.03372049331665,
"eval_runtime": 490.4932,
"eval_samples_per_second": 4.078,
"eval_steps_per_second": 0.255,
"step": 1000
},
{
"epoch": 0.528657419523685,
"grad_norm": 2.8934222921049226,
"learning_rate": 2.6848725415297888e-06,
"logits/chosen": -0.8423219919204712,
"logits/rejected": -0.8691143989562988,
"logps/chosen": -526.5357666015625,
"logps/rejected": -626.977294921875,
"loss": 0.4864,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.859957456588745,
"rewards/margins": 1.0318825244903564,
"rewards/rejected": -3.8918404579162598,
"step": 1010
},
{
"epoch": 0.533891651400157,
"grad_norm": 4.072441147880593,
"learning_rate": 2.639280053601719e-06,
"logits/chosen": -0.7951158285140991,
"logits/rejected": -0.8444026708602905,
"logps/chosen": -551.2407836914062,
"logps/rejected": -624.159912109375,
"loss": 0.4753,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.523639678955078,
"rewards/margins": 1.181814193725586,
"rewards/rejected": -3.7054543495178223,
"step": 1020
},
{
"epoch": 0.5391258832766291,
"grad_norm": 2.288139837580109,
"learning_rate": 2.59364104726716e-06,
"logits/chosen": -0.7849830389022827,
"logits/rejected": -0.7941724061965942,
"logps/chosen": -558.4784545898438,
"logps/rejected": -631.0448608398438,
"loss": 0.4813,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.9077861309051514,
"rewards/margins": 1.099591851234436,
"rewards/rejected": -4.007378101348877,
"step": 1030
},
{
"epoch": 0.5443601151531012,
"grad_norm": 2.255027947939241,
"learning_rate": 2.547970765583491e-06,
"logits/chosen": -0.7766658663749695,
"logits/rejected": -0.7682461142539978,
"logps/chosen": -551.2001953125,
"logps/rejected": -626.3408203125,
"loss": 0.508,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.647867202758789,
"rewards/margins": 0.965995192527771,
"rewards/rejected": -3.6138622760772705,
"step": 1040
},
{
"epoch": 0.5495943470295734,
"grad_norm": 3.365620498364263,
"learning_rate": 2.502284462053799e-06,
"logits/chosen": -0.7241968512535095,
"logits/rejected": -0.7473636269569397,
"logps/chosen": -527.13427734375,
"logps/rejected": -617.5802612304688,
"loss": 0.4658,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.2644190788269043,
"rewards/margins": 1.2528817653656006,
"rewards/rejected": -3.517301082611084,
"step": 1050
},
{
"epoch": 0.5548285789060455,
"grad_norm": 4.206518455943664,
"learning_rate": 2.456597395532338e-06,
"logits/chosen": -0.7641677856445312,
"logits/rejected": -0.8080291748046875,
"logps/chosen": -470.62872314453125,
"logps/rejected": -533.24560546875,
"loss": 0.4611,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.032501220703125,
"rewards/margins": 1.0605053901672363,
"rewards/rejected": -3.0930066108703613,
"step": 1060
},
{
"epoch": 0.5600628107825176,
"grad_norm": 2.230846159297882,
"learning_rate": 2.4109248251281953e-06,
"logits/chosen": -0.7514528632164001,
"logits/rejected": -0.775009036064148,
"logps/chosen": -500.794921875,
"logps/rejected": -553.78466796875,
"loss": 0.4948,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.0267245769500732,
"rewards/margins": 1.0498759746551514,
"rewards/rejected": -3.0766005516052246,
"step": 1070
},
{
"epoch": 0.5652970426589898,
"grad_norm": 2.9850161302908567,
"learning_rate": 2.365282005108875e-06,
"logits/chosen": -0.7392014265060425,
"logits/rejected": -0.7644953727722168,
"logps/chosen": -484.65216064453125,
"logps/rejected": -520.8948974609375,
"loss": 0.5131,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.1898112297058105,
"rewards/margins": 0.9680379629135132,
"rewards/rejected": -3.1578493118286133,
"step": 1080
},
{
"epoch": 0.5705312745354619,
"grad_norm": 4.871043269302411,
"learning_rate": 2.319684179805491e-06,
"logits/chosen": -0.7259557247161865,
"logits/rejected": -0.7469737529754639,
"logps/chosen": -513.62353515625,
"logps/rejected": -555.1145629882812,
"loss": 0.5266,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.5438778400421143,
"rewards/margins": 0.932669997215271,
"rewards/rejected": -3.476547956466675,
"step": 1090
},
{
"epoch": 0.575765506411934,
"grad_norm": 2.9604109906028677,
"learning_rate": 2.2741465785212905e-06,
"logits/chosen": -0.7221305966377258,
"logits/rejected": -0.7538542747497559,
"logps/chosen": -503.03143310546875,
"logps/rejected": -606.3543090820312,
"loss": 0.4856,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.407829761505127,
"rewards/margins": 0.9688541293144226,
"rewards/rejected": -3.3766837120056152,
"step": 1100
},
{
"epoch": 0.575765506411934,
"eval_logits/chosen": -0.7912909984588623,
"eval_logits/rejected": -0.8044111728668213,
"eval_logps/chosen": -512.9990234375,
"eval_logps/rejected": -602.4694213867188,
"eval_loss": 0.4817214012145996,
"eval_rewards/accuracies": 0.7720000147819519,
"eval_rewards/chosen": -2.3660569190979004,
"eval_rewards/margins": 1.1260240077972412,
"eval_rewards/rejected": -3.4920809268951416,
"eval_runtime": 490.5656,
"eval_samples_per_second": 4.077,
"eval_steps_per_second": 0.255,
"step": 1100
},
{
"epoch": 0.5809997382884062,
"grad_norm": 4.040674423079137,
"learning_rate": 2.2286844104451848e-06,
"logits/chosen": -0.7596691250801086,
"logits/rejected": -0.7672920227050781,
"logps/chosen": -506.1031799316406,
"logps/rejected": -615.0109252929688,
"loss": 0.51,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.4000015258789062,
"rewards/margins": 1.183712363243103,
"rewards/rejected": -3.583714008331299,
"step": 1110
},
{
"epoch": 0.5862339701648783,
"grad_norm": 3.766321110228989,
"learning_rate": 2.183312859572008e-06,
"logits/chosen": -0.7806628346443176,
"logits/rejected": -0.7962976098060608,
"logps/chosen": -550.5853271484375,
"logps/rejected": -597.5955200195312,
"loss": 0.5075,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.502775192260742,
"rewards/margins": 0.9967296719551086,
"rewards/rejected": -3.499504804611206,
"step": 1120
},
{
"epoch": 0.5914682020413504,
"grad_norm": 3.8765385487802715,
"learning_rate": 2.1380470796311843e-06,
"logits/chosen": -0.7329021692276001,
"logits/rejected": -0.716983437538147,
"logps/chosen": -548.9044799804688,
"logps/rejected": -640.8181762695312,
"loss": 0.4782,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.5259509086608887,
"rewards/margins": 1.2471174001693726,
"rewards/rejected": -3.77306866645813,
"step": 1130
},
{
"epoch": 0.5967024339178225,
"grad_norm": 3.776816209238374,
"learning_rate": 2.092902189025507e-06,
"logits/chosen": -0.7345478534698486,
"logits/rejected": -0.765870213508606,
"logps/chosen": -567.6248168945312,
"logps/rejected": -644.7882690429688,
"loss": 0.4409,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.6834120750427246,
"rewards/margins": 1.2605786323547363,
"rewards/rejected": -3.943991184234619,
"step": 1140
},
{
"epoch": 0.6019366657942947,
"grad_norm": 2.4633069424861476,
"learning_rate": 2.0478932657817105e-06,
"logits/chosen": -0.7708589434623718,
"logits/rejected": -0.7781729698181152,
"logps/chosen": -546.5011596679688,
"logps/rejected": -616.3358154296875,
"loss": 0.4785,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.6856911182403564,
"rewards/margins": 1.0351371765136719,
"rewards/rejected": -3.7208282947540283,
"step": 1150
},
{
"epoch": 0.6071708976707668,
"grad_norm": 2.9067779299636363,
"learning_rate": 2.0030353425145376e-06,
"logits/chosen": -0.7892950773239136,
"logits/rejected": -0.7964081764221191,
"logps/chosen": -454.22735595703125,
"logps/rejected": -548.8640747070312,
"loss": 0.551,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.2731716632843018,
"rewards/margins": 1.0861868858337402,
"rewards/rejected": -3.359358549118042,
"step": 1160
},
{
"epoch": 0.6124051295472389,
"grad_norm": 3.0126218778741816,
"learning_rate": 1.958343401405964e-06,
"logits/chosen": -0.7989486455917358,
"logits/rejected": -0.8354769945144653,
"logps/chosen": -471.26971435546875,
"logps/rejected": -576.9542236328125,
"loss": 0.515,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.9779512882232666,
"rewards/margins": 1.1961392164230347,
"rewards/rejected": -3.1740903854370117,
"step": 1170
},
{
"epoch": 0.6176393614237111,
"grad_norm": 2.8356127345530298,
"learning_rate": 1.9138323692012734e-06,
"logits/chosen": -0.7760749459266663,
"logits/rejected": -0.8243614435195923,
"logps/chosen": -477.05255126953125,
"logps/rejected": -567.8319091796875,
"loss": 0.4482,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9603474140167236,
"rewards/margins": 1.083135962486267,
"rewards/rejected": -3.0434834957122803,
"step": 1180
},
{
"epoch": 0.6228735933001832,
"grad_norm": 2.653741359724002,
"learning_rate": 1.8695171122236443e-06,
"logits/chosen": -0.7773482799530029,
"logits/rejected": -0.8091195821762085,
"logps/chosen": -534.4111328125,
"logps/rejected": -603.28564453125,
"loss": 0.4399,
"rewards/accuracies": 0.84375,
"rewards/chosen": -2.1728804111480713,
"rewards/margins": 1.205948829650879,
"rewards/rejected": -3.37882924079895,
"step": 1190
},
{
"epoch": 0.6281078251766553,
"grad_norm": 3.666686057880921,
"learning_rate": 1.8254124314089225e-06,
"logits/chosen": -0.7455651760101318,
"logits/rejected": -0.7562496066093445,
"logps/chosen": -496.6683044433594,
"logps/rejected": -561.0657958984375,
"loss": 0.5013,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.1546523571014404,
"rewards/margins": 0.8877624273300171,
"rewards/rejected": -3.042415142059326,
"step": 1200
},
{
"epoch": 0.6281078251766553,
"eval_logits/chosen": -0.7745435237884521,
"eval_logits/rejected": -0.7890085577964783,
"eval_logps/chosen": -488.01080322265625,
"eval_logps/rejected": -582.3287353515625,
"eval_loss": 0.4859620928764343,
"eval_rewards/accuracies": 0.7720000147819519,
"eval_rewards/chosen": -2.1161751747131348,
"eval_rewards/margins": 1.1744980812072754,
"eval_rewards/rejected": -3.29067325592041,
"eval_runtime": 492.7641,
"eval_samples_per_second": 4.059,
"eval_steps_per_second": 0.254,
"step": 1200
},
{
"epoch": 0.6333420570531274,
"grad_norm": 4.445062813407665,
"learning_rate": 1.781533057362221e-06,
"logits/chosen": -0.7981818914413452,
"logits/rejected": -0.8196622729301453,
"logps/chosen": -482.60601806640625,
"logps/rejected": -558.7507934570312,
"loss": 0.4991,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.063744068145752,
"rewards/margins": 1.000634789466858,
"rewards/rejected": -3.0643787384033203,
"step": 1210
},
{
"epoch": 0.6385762889295996,
"grad_norm": 4.507009476169383,
"learning_rate": 1.7378936454380277e-06,
"logits/chosen": -0.7655137181282043,
"logits/rejected": -0.7727285623550415,
"logps/chosen": -491.4285583496094,
"logps/rejected": -586.7752685546875,
"loss": 0.4859,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.225255012512207,
"rewards/margins": 1.2578017711639404,
"rewards/rejected": -3.4830570220947266,
"step": 1220
},
{
"epoch": 0.6438105208060717,
"grad_norm": 2.769661114858915,
"learning_rate": 1.6945087708454273e-06,
"logits/chosen": -0.7237203121185303,
"logits/rejected": -0.752585768699646,
"logps/chosen": -524.2711791992188,
"logps/rejected": -591.5977783203125,
"loss": 0.4308,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.2781286239624023,
"rewards/margins": 1.1484158039093018,
"rewards/rejected": -3.426544666290283,
"step": 1230
},
{
"epoch": 0.6490447526825438,
"grad_norm": 2.8409702645316486,
"learning_rate": 1.651392923780105e-06,
"logits/chosen": -0.7248971462249756,
"logits/rejected": -0.7399358153343201,
"logps/chosen": -502.482666015625,
"logps/rejected": -614.2011108398438,
"loss": 0.4665,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.312042713165283,
"rewards/margins": 1.1948063373565674,
"rewards/rejected": -3.5068485736846924,
"step": 1240
},
{
"epoch": 0.654278984559016,
"grad_norm": 2.5610614385353614,
"learning_rate": 1.608560504584737e-06,
"logits/chosen": -0.7505759000778198,
"logits/rejected": -0.748982846736908,
"logps/chosen": -497.2088317871094,
"logps/rejected": -608.0076904296875,
"loss": 0.4391,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.297612428665161,
"rewards/margins": 1.178468942642212,
"rewards/rejected": -3.476081132888794,
"step": 1250
},
{
"epoch": 0.6595132164354881,
"grad_norm": 2.9288478965975204,
"learning_rate": 1.5660258189393945e-06,
"logits/chosen": -0.7709019780158997,
"logits/rejected": -0.7804166674613953,
"logps/chosen": -484.7782287597656,
"logps/rejected": -561.656494140625,
"loss": 0.4529,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.0165915489196777,
"rewards/margins": 1.224448323249817,
"rewards/rejected": -3.241039752960205,
"step": 1260
},
{
"epoch": 0.6647474483119602,
"grad_norm": 3.3843415697325123,
"learning_rate": 1.5238030730835578e-06,
"logits/chosen": -0.7160421013832092,
"logits/rejected": -0.7430087327957153,
"logps/chosen": -469.3851623535156,
"logps/rejected": -589.1715698242188,
"loss": 0.4304,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.223741292953491,
"rewards/margins": 1.258303165435791,
"rewards/rejected": -3.4820446968078613,
"step": 1270
},
{
"epoch": 0.6699816801884323,
"grad_norm": 4.31798859648472,
"learning_rate": 1.4819063690713565e-06,
"logits/chosen": -0.7581356167793274,
"logits/rejected": -0.7594733238220215,
"logps/chosen": -471.11822509765625,
"logps/rejected": -587.6780395507812,
"loss": 0.4618,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.2457804679870605,
"rewards/margins": 1.2690086364746094,
"rewards/rejected": -3.514788866043091,
"step": 1280
},
{
"epoch": 0.6752159120649045,
"grad_norm": 2.9043056997624785,
"learning_rate": 1.4403497000615885e-06,
"logits/chosen": -0.729811429977417,
"logits/rejected": -0.7537652254104614,
"logps/chosen": -517.2977294921875,
"logps/rejected": -622.2907104492188,
"loss": 0.5079,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.318469524383545,
"rewards/margins": 1.2079073190689087,
"rewards/rejected": -3.5263772010803223,
"step": 1290
},
{
"epoch": 0.6804501439413766,
"grad_norm": 3.476254076191995,
"learning_rate": 1.3991469456441273e-06,
"logits/chosen": -0.7037397623062134,
"logits/rejected": -0.7578923106193542,
"logps/chosen": -535.3514404296875,
"logps/rejected": -628.6389770507812,
"loss": 0.4497,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.396596670150757,
"rewards/margins": 1.2228056192398071,
"rewards/rejected": -3.6194019317626953,
"step": 1300
},
{
"epoch": 0.6804501439413766,
"eval_logits/chosen": -0.7940347790718079,
"eval_logits/rejected": -0.8095559477806091,
"eval_logps/chosen": -524.7894897460938,
"eval_logps/rejected": -626.9693603515625,
"eval_loss": 0.485016793012619,
"eval_rewards/accuracies": 0.7730000019073486,
"eval_rewards/chosen": -2.483961820602417,
"eval_rewards/margins": 1.2531172037124634,
"eval_rewards/rejected": -3.73707914352417,
"eval_runtime": 490.5183,
"eval_samples_per_second": 4.077,
"eval_steps_per_second": 0.255,
"step": 1300
},
{
"epoch": 0.6856843758178487,
"grad_norm": 4.224665538714578,
"learning_rate": 1.3583118672042441e-06,
"logits/chosen": -0.7764405608177185,
"logits/rejected": -0.7887049913406372,
"logps/chosen": -516.7277221679688,
"logps/rejected": -632.888427734375,
"loss": 0.472,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.473681926727295,
"rewards/margins": 1.352980136871338,
"rewards/rejected": -3.8266615867614746,
"step": 1310
},
{
"epoch": 0.6909186076943209,
"grad_norm": 4.563976167233384,
"learning_rate": 1.3178581033264218e-06,
"logits/chosen": -0.7718938589096069,
"logits/rejected": -0.7976632714271545,
"logps/chosen": -519.279541015625,
"logps/rejected": -601.4541015625,
"loss": 0.4487,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.553548812866211,
"rewards/margins": 1.0802887678146362,
"rewards/rejected": -3.633837938308716,
"step": 1320
},
{
"epoch": 0.696152839570793,
"grad_norm": 4.184522446421226,
"learning_rate": 1.2777991652391757e-06,
"logits/chosen": -0.7361895442008972,
"logits/rejected": -0.7566362619400024,
"logps/chosen": -527.201416015625,
"logps/rejected": -645.9843139648438,
"loss": 0.4656,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.564059257507324,
"rewards/margins": 1.190525770187378,
"rewards/rejected": -3.7545852661132812,
"step": 1330
},
{
"epoch": 0.7013870714472651,
"grad_norm": 4.166233355017187,
"learning_rate": 1.2381484323024178e-06,
"logits/chosen": -0.713969349861145,
"logits/rejected": -0.7467511892318726,
"logps/chosen": -518.2627563476562,
"logps/rejected": -595.9432373046875,
"loss": 0.4851,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.4644479751586914,
"rewards/margins": 1.2395137548446655,
"rewards/rejected": -3.7039618492126465,
"step": 1340
},
{
"epoch": 0.7066213033237373,
"grad_norm": 3.865579244226054,
"learning_rate": 1.1989191475388518e-06,
"logits/chosen": -0.75743567943573,
"logits/rejected": -0.7837706804275513,
"logps/chosen": -509.4811096191406,
"logps/rejected": -594.09033203125,
"loss": 0.4915,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.323971748352051,
"rewards/margins": 1.0227216482162476,
"rewards/rejected": -3.346693515777588,
"step": 1350
},
{
"epoch": 0.7118555352002094,
"grad_norm": 2.652467671657092,
"learning_rate": 1.160124413210918e-06,
"logits/chosen": -0.7264483571052551,
"logits/rejected": -0.7697917222976685,
"logps/chosen": -533.6471557617188,
"logps/rejected": -609.0975341796875,
"loss": 0.4969,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.2726407051086426,
"rewards/margins": 1.1175401210784912,
"rewards/rejected": -3.3901805877685547,
"step": 1360
},
{
"epoch": 0.7170897670766815,
"grad_norm": 2.5577134603401506,
"learning_rate": 1.1217771864447396e-06,
"logits/chosen": -0.8089788556098938,
"logits/rejected": -0.8140621185302734,
"logps/chosen": -544.7674560546875,
"logps/rejected": -623.3822631835938,
"loss": 0.4132,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.3746323585510254,
"rewards/margins": 1.3340243101119995,
"rewards/rejected": -3.7086567878723145,
"step": 1370
},
{
"epoch": 0.7223239989531536,
"grad_norm": 3.219449421326901,
"learning_rate": 1.08389027490255e-06,
"logits/chosen": -0.7905744314193726,
"logits/rejected": -0.7953212857246399,
"logps/chosen": -490.263916015625,
"logps/rejected": -593.0517578125,
"loss": 0.4631,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.2938039302825928,
"rewards/margins": 1.0697680711746216,
"rewards/rejected": -3.363571882247925,
"step": 1380
},
{
"epoch": 0.7275582308296258,
"grad_norm": 2.9190971855424936,
"learning_rate": 1.046476332505036e-06,
"logits/chosen": -0.736000657081604,
"logits/rejected": -0.7762190103530884,
"logps/chosen": -537.1323852539062,
"logps/rejected": -622.3277587890625,
"loss": 0.4459,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.433082103729248,
"rewards/margins": 1.1599032878875732,
"rewards/rejected": -3.592985153198242,
"step": 1390
},
{
"epoch": 0.7327924627060979,
"grad_norm": 3.087299031916978,
"learning_rate": 1.0095478552050348e-06,
"logits/chosen": -0.7585629224777222,
"logits/rejected": -0.7968350648880005,
"logps/chosen": -505.4625549316406,
"logps/rejected": -628.9778442382812,
"loss": 0.4734,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.4186644554138184,
"rewards/margins": 1.282457947731018,
"rewards/rejected": -3.701122283935547,
"step": 1400
},
{
"epoch": 0.7327924627060979,
"eval_logits/chosen": -0.7989752888679504,
"eval_logits/rejected": -0.8147993087768555,
"eval_logps/chosen": -491.04962158203125,
"eval_logps/rejected": -590.251953125,
"eval_loss": 0.4832901060581207,
"eval_rewards/accuracies": 0.7739999890327454,
"eval_rewards/chosen": -2.1465635299682617,
"eval_rewards/margins": 1.223341703414917,
"eval_rewards/rejected": -3.369905471801758,
"eval_runtime": 492.7006,
"eval_samples_per_second": 4.059,
"eval_steps_per_second": 0.254,
"step": 1400
},
{
"epoch": 0.73802669458257,
"grad_norm": 3.5157604346203226,
"learning_rate": 9.731171768139808e-07,
"logits/chosen": -0.7569630146026611,
"logits/rejected": -0.7974756956100464,
"logps/chosen": -481.45477294921875,
"logps/rejected": -610.00146484375,
"loss": 0.5139,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.1769986152648926,
"rewards/margins": 1.2288345098495483,
"rewards/rejected": -3.4058330059051514,
"step": 1410
},
{
"epoch": 0.7432609264590422,
"grad_norm": 3.358269083637402,
"learning_rate": 9.371964648825221e-07,
"logits/chosen": -0.7507213950157166,
"logits/rejected": -0.784144401550293,
"logps/chosen": -489.03515625,
"logps/rejected": -545.8272705078125,
"loss": 0.5023,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.055467128753662,
"rewards/margins": 0.9931677579879761,
"rewards/rejected": -3.0486350059509277,
"step": 1420
},
{
"epoch": 0.7484951583355143,
"grad_norm": 3.450390480039677,
"learning_rate": 9.017977166366445e-07,
"logits/chosen": -0.7912155985832214,
"logits/rejected": -0.8058408498764038,
"logps/chosen": -509.6456604003906,
"logps/rejected": -619.4719848632812,
"loss": 0.4582,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.1829304695129395,
"rewards/margins": 1.2820956707000732,
"rewards/rejected": -3.4650261402130127,
"step": 1430
},
{
"epoch": 0.7537293902119864,
"grad_norm": 3.9019374901211066,
"learning_rate": 8.669327549707096e-07,
"logits/chosen": -0.8170151710510254,
"logits/rejected": -0.8503853678703308,
"logps/chosen": -525.7236328125,
"logps/rejected": -609.9266967773438,
"loss": 0.4976,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.2434840202331543,
"rewards/margins": 0.9734998941421509,
"rewards/rejected": -3.2169833183288574,
"step": 1440
},
{
"epoch": 0.7589636220884585,
"grad_norm": 3.1069704561124567,
"learning_rate": 8.326132244986932e-07,
"logits/chosen": -0.8235033750534058,
"logits/rejected": -0.8337501287460327,
"logps/chosen": -507.8706970214844,
"logps/rejected": -617.8888549804688,
"loss": 0.4459,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.2898011207580566,
"rewards/margins": 1.271906852722168,
"rewards/rejected": -3.5617077350616455,
"step": 1450
},
{
"epoch": 0.7641978539649307,
"grad_norm": 2.5011020359087137,
"learning_rate": 7.988505876649863e-07,
"logits/chosen": -0.7653478384017944,
"logits/rejected": -0.8202483057975769,
"logps/chosen": -471.88299560546875,
"logps/rejected": -579.7264404296875,
"loss": 0.475,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.1682538986206055,
"rewards/margins": 1.1780481338500977,
"rewards/rejected": -3.346302032470703,
"step": 1460
},
{
"epoch": 0.7694320858414028,
"grad_norm": 2.063720356926022,
"learning_rate": 7.656561209160248e-07,
"logits/chosen": -0.7891589999198914,
"logits/rejected": -0.8133258819580078,
"logps/chosen": -516.5853271484375,
"logps/rejected": -602.1696166992188,
"loss": 0.467,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.344850540161133,
"rewards/margins": 1.223329782485962,
"rewards/rejected": -3.5681800842285156,
"step": 1470
},
{
"epoch": 0.7746663177178749,
"grad_norm": 3.4450781896535694,
"learning_rate": 7.330409109340563e-07,
"logits/chosen": -0.8023967742919922,
"logits/rejected": -0.8343321084976196,
"logps/chosen": -532.4755859375,
"logps/rejected": -600.9357299804688,
"loss": 0.4975,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.426600217819214,
"rewards/margins": 1.1944777965545654,
"rewards/rejected": -3.6210780143737793,
"step": 1480
},
{
"epoch": 0.7799005495943471,
"grad_norm": 4.640657971253548,
"learning_rate": 7.010158509342682e-07,
"logits/chosen": -0.7905557155609131,
"logits/rejected": -0.8068881034851074,
"logps/chosen": -546.9576416015625,
"logps/rejected": -667.5574340820312,
"loss": 0.483,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.799778699874878,
"rewards/margins": 1.137909173965454,
"rewards/rejected": -3.937687635421753,
"step": 1490
},
{
"epoch": 0.7851347814708192,
"grad_norm": 8.807489938353271,
"learning_rate": 6.695916370265529e-07,
"logits/chosen": -0.7907344698905945,
"logits/rejected": -0.8382173776626587,
"logps/chosen": -507.33575439453125,
"logps/rejected": -633.0894775390625,
"loss": 0.4482,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.5701141357421875,
"rewards/margins": 1.3424403667449951,
"rewards/rejected": -3.9125542640686035,
"step": 1500
},
{
"epoch": 0.7851347814708192,
"eval_logits/chosen": -0.8246029615402222,
"eval_logits/rejected": -0.8422741293907166,
"eval_logps/chosen": -527.0020751953125,
"eval_logps/rejected": -624.8656005859375,
"eval_loss": 0.4811870753765106,
"eval_rewards/accuracies": 0.7760000228881836,
"eval_rewards/chosen": -2.506087303161621,
"eval_rewards/margins": 1.209954857826233,
"eval_rewards/rejected": -3.7160420417785645,
"eval_runtime": 491.111,
"eval_samples_per_second": 4.072,
"eval_steps_per_second": 0.255,
"step": 1500
},
{
"epoch": 0.7903690133472913,
"grad_norm": 3.367824088455391,
"learning_rate": 6.387787646430854e-07,
"logits/chosen": -0.7824467420578003,
"logits/rejected": -0.8199766874313354,
"logps/chosen": -542.2174682617188,
"logps/rejected": -609.8660278320312,
"loss": 0.4931,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.7443554401397705,
"rewards/margins": 1.1063810586929321,
"rewards/rejected": -3.850736141204834,
"step": 1510
},
{
"epoch": 0.7956032452237635,
"grad_norm": 2.87877789191255,
"learning_rate": 6.085875250329401e-07,
"logits/chosen": -0.8328324556350708,
"logits/rejected": -0.8468655347824097,
"logps/chosen": -507.84844970703125,
"logps/rejected": -600.4588623046875,
"loss": 0.4568,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.569871187210083,
"rewards/margins": 1.0555684566497803,
"rewards/rejected": -3.625439405441284,
"step": 1520
},
{
"epoch": 0.8008374771002356,
"grad_norm": 3.688216572569841,
"learning_rate": 5.79028001824894e-07,
"logits/chosen": -0.7519486546516418,
"logits/rejected": -0.7730967402458191,
"logps/chosen": -484.6551208496094,
"logps/rejected": -676.157958984375,
"loss": 0.429,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.3991482257843018,
"rewards/margins": 1.4251246452331543,
"rewards/rejected": -3.824272871017456,
"step": 1530
},
{
"epoch": 0.8060717089767077,
"grad_norm": 2.22556494947782,
"learning_rate": 5.501100676595761e-07,
"logits/chosen": -0.8356014490127563,
"logits/rejected": -0.8369625210762024,
"logps/chosen": -538.4762573242188,
"logps/rejected": -637.4407958984375,
"loss": 0.462,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.500579833984375,
"rewards/margins": 1.1396005153656006,
"rewards/rejected": -3.6401805877685547,
"step": 1540
},
{
"epoch": 0.8113059408531798,
"grad_norm": 3.6363110356872355,
"learning_rate": 5.218433808920884e-07,
"logits/chosen": -0.8045191764831543,
"logits/rejected": -0.8361748456954956,
"logps/chosen": -514.1395263671875,
"logps/rejected": -600.3705444335938,
"loss": 0.4748,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.4007575511932373,
"rewards/margins": 1.0883753299713135,
"rewards/rejected": -3.4891326427459717,
"step": 1550
},
{
"epoch": 0.816540172729652,
"grad_norm": 4.618670652390617,
"learning_rate": 4.942373823661928e-07,
"logits/chosen": -0.8143288493156433,
"logits/rejected": -0.8433948755264282,
"logps/chosen": -569.4879760742188,
"logps/rejected": -613.3157348632812,
"loss": 0.5362,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.3494620323181152,
"rewards/margins": 1.1645643711090088,
"rewards/rejected": -3.514026641845703,
"step": 1560
},
{
"epoch": 0.821774404606124,
"grad_norm": 2.7457651434809893,
"learning_rate": 4.6730129226114363e-07,
"logits/chosen": -0.8218878507614136,
"logits/rejected": -0.8484194874763489,
"logps/chosen": -491.6686096191406,
"logps/rejected": -611.8516845703125,
"loss": 0.4799,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.467026710510254,
"rewards/margins": 1.123768925666809,
"rewards/rejected": -3.5907950401306152,
"step": 1570
},
{
"epoch": 0.8270086364825961,
"grad_norm": 2.68839096035273,
"learning_rate": 4.4104410701222703e-07,
"logits/chosen": -0.8048428297042847,
"logits/rejected": -0.8185631036758423,
"logps/chosen": -523.8101196289062,
"logps/rejected": -619.0768432617188,
"loss": 0.5228,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.5985608100891113,
"rewards/margins": 0.9621597528457642,
"rewards/rejected": -3.560720443725586,
"step": 1580
},
{
"epoch": 0.8322428683590684,
"grad_norm": 3.1572366237408316,
"learning_rate": 4.154745963060197e-07,
"logits/chosen": -0.7831433415412903,
"logits/rejected": -0.7975921034812927,
"logps/chosen": -479.276123046875,
"logps/rejected": -577.49169921875,
"loss": 0.4298,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.3617072105407715,
"rewards/margins": 1.14790940284729,
"rewards/rejected": -3.5096163749694824,
"step": 1590
},
{
"epoch": 0.8374771002355405,
"grad_norm": 3.2177419289192843,
"learning_rate": 3.9060130015138863e-07,
"logits/chosen": -0.7938388586044312,
"logits/rejected": -0.8209096193313599,
"logps/chosen": -501.1689453125,
"logps/rejected": -594.0531005859375,
"loss": 0.4982,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.3808839321136475,
"rewards/margins": 1.0191795825958252,
"rewards/rejected": -3.4000632762908936,
"step": 1600
},
{
"epoch": 0.8374771002355405,
"eval_logits/chosen": -0.8202841877937317,
"eval_logits/rejected": -0.8376915454864502,
"eval_logps/chosen": -499.3264465332031,
"eval_logps/rejected": -592.1224365234375,
"eval_loss": 0.4787100553512573,
"eval_rewards/accuracies": 0.7770000100135803,
"eval_rewards/chosen": -2.2293312549591064,
"eval_rewards/margins": 1.159279465675354,
"eval_rewards/rejected": -3.388610601425171,
"eval_runtime": 490.4246,
"eval_samples_per_second": 4.078,
"eval_steps_per_second": 0.255,
"step": 1600
},
{
"epoch": 0.8427113321120125,
"grad_norm": 7.214430205771756,
"learning_rate": 3.664325260271953e-07,
"logits/chosen": -0.796721339225769,
"logits/rejected": -0.8238224983215332,
"logps/chosen": -540.4601440429688,
"logps/rejected": -581.5528564453125,
"loss": 0.4829,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.191901683807373,
"rewards/margins": 0.9697766304016113,
"rewards/rejected": -3.1616785526275635,
"step": 1610
},
{
"epoch": 0.8479455639884846,
"grad_norm": 2.9830959712764993,
"learning_rate": 3.429763461076677e-07,
"logits/chosen": -0.795876145362854,
"logits/rejected": -0.8391523361206055,
"logps/chosen": -538.1182861328125,
"logps/rejected": -600.2818603515625,
"loss": 0.4793,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.279409408569336,
"rewards/margins": 1.0599138736724854,
"rewards/rejected": -3.3393235206604004,
"step": 1620
},
{
"epoch": 0.8531797958649568,
"grad_norm": 2.9273391665482746,
"learning_rate": 3.202405945663556e-07,
"logits/chosen": -0.8152064085006714,
"logits/rejected": -0.8364097476005554,
"logps/chosen": -526.7260131835938,
"logps/rejected": -597.470947265625,
"loss": 0.4621,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.458247661590576,
"rewards/margins": 1.0275027751922607,
"rewards/rejected": -3.485750675201416,
"step": 1630
},
{
"epoch": 0.8584140277414289,
"grad_norm": 3.433887400095838,
"learning_rate": 2.982328649595856e-07,
"logits/chosen": -0.80964195728302,
"logits/rejected": -0.8188964128494263,
"logps/chosen": -488.6082458496094,
"logps/rejected": -585.5677490234375,
"loss": 0.4951,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.1840968132019043,
"rewards/margins": 1.1141045093536377,
"rewards/rejected": -3.2982017993927,
"step": 1640
},
{
"epoch": 0.863648259617901,
"grad_norm": 2.622521132377607,
"learning_rate": 2.7696050769026954e-07,
"logits/chosen": -0.8142071962356567,
"logits/rejected": -0.8193332552909851,
"logps/chosen": -515.5963745117188,
"logps/rejected": -645.8536987304688,
"loss": 0.4876,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.4191887378692627,
"rewards/margins": 1.2284886837005615,
"rewards/rejected": -3.6476776599884033,
"step": 1650
},
{
"epoch": 0.8688824914943732,
"grad_norm": 3.0785040031189537,
"learning_rate": 2.564306275529341e-07,
"logits/chosen": -0.8157971501350403,
"logits/rejected": -0.8292611241340637,
"logps/chosen": -481.8374938964844,
"logps/rejected": -571.3180541992188,
"loss": 0.489,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.333627700805664,
"rewards/margins": 1.1653764247894287,
"rewards/rejected": -3.4990038871765137,
"step": 1660
},
{
"epoch": 0.8741167233708453,
"grad_norm": 2.984496753224684,
"learning_rate": 2.3665008136077332e-07,
"logits/chosen": -0.8203511238098145,
"logits/rejected": -0.8614446520805359,
"logps/chosen": -540.2431030273438,
"logps/rejected": -649.6897583007812,
"loss": 0.4706,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.4780304431915283,
"rewards/margins": 1.2412341833114624,
"rewards/rejected": -3.719264507293701,
"step": 1670
},
{
"epoch": 0.8793509552473174,
"grad_norm": 2.6893878378493556,
"learning_rate": 2.1762547565553293e-07,
"logits/chosen": -0.7929474711418152,
"logits/rejected": -0.8287181854248047,
"logps/chosen": -522.1485595703125,
"logps/rejected": -629.0785522460938,
"loss": 0.4218,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.3336870670318604,
"rewards/margins": 1.2596609592437744,
"rewards/rejected": -3.5933480262756348,
"step": 1680
},
{
"epoch": 0.8845851871237895,
"grad_norm": 3.6443267203731198,
"learning_rate": 1.993631645009747e-07,
"logits/chosen": -0.7802733778953552,
"logits/rejected": -0.8199615478515625,
"logps/chosen": -495.28399658203125,
"logps/rejected": -611.3101196289062,
"loss": 0.4402,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.3746984004974365,
"rewards/margins": 1.337454915046692,
"rewards/rejected": -3.712153196334839,
"step": 1690
},
{
"epoch": 0.8898194190002617,
"grad_norm": 3.031125869912656,
"learning_rate": 1.818692473606748e-07,
"logits/chosen": -0.7836586833000183,
"logits/rejected": -0.8228501081466675,
"logps/chosen": -522.3655395507812,
"logps/rejected": -597.0633544921875,
"loss": 0.4594,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.3934988975524902,
"rewards/margins": 1.1604548692703247,
"rewards/rejected": -3.5539536476135254,
"step": 1700
},
{
"epoch": 0.8898194190002617,
"eval_logits/chosen": -0.8378602266311646,
"eval_logits/rejected": -0.8565782904624939,
"eval_logps/chosen": -513.1796264648438,
"eval_logps/rejected": -610.4910888671875,
"eval_loss": 0.47899821400642395,
"eval_rewards/accuracies": 0.7730000019073486,
"eval_rewards/chosen": -2.3678627014160156,
"eval_rewards/margins": 1.2044339179992676,
"eval_rewards/rejected": -3.5722968578338623,
"eval_runtime": 490.4611,
"eval_samples_per_second": 4.078,
"eval_steps_per_second": 0.255,
"step": 1700
},
{
"epoch": 0.8950536508767338,
"grad_norm": 3.891535373197336,
"learning_rate": 1.6514956706084885e-07,
"logits/chosen": -0.8305776715278625,
"logits/rejected": -0.8483774065971375,
"logps/chosen": -526.2332763671875,
"logps/rejected": -616.9027709960938,
"loss": 0.4435,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.3717777729034424,
"rewards/margins": 1.235668659210205,
"rewards/rejected": -3.6074466705322266,
"step": 1710
},
{
"epoch": 0.9002878827532059,
"grad_norm": 3.9743964527469595,
"learning_rate": 1.4920970783889737e-07,
"logits/chosen": -0.8023131489753723,
"logits/rejected": -0.8172504305839539,
"logps/chosen": -517.201416015625,
"logps/rejected": -612.9605712890625,
"loss": 0.4561,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.430854082107544,
"rewards/margins": 1.1006124019622803,
"rewards/rejected": -3.531466245651245,
"step": 1720
},
{
"epoch": 0.9055221146296781,
"grad_norm": 3.3367892778828034,
"learning_rate": 1.340549934783164e-07,
"logits/chosen": -0.793104887008667,
"logits/rejected": -0.8096216320991516,
"logps/chosen": -490.0877380371094,
"logps/rejected": -592.3488159179688,
"loss": 0.4734,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.2823646068573,
"rewards/margins": 1.2109824419021606,
"rewards/rejected": -3.493346691131592,
"step": 1730
},
{
"epoch": 0.9107563465061502,
"grad_norm": 3.7031601561414935,
"learning_rate": 1.196904855305961e-07,
"logits/chosen": -0.7839618921279907,
"logits/rejected": -0.7986418008804321,
"logps/chosen": -523.5616455078125,
"logps/rejected": -599.9172973632812,
"loss": 0.493,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.390275001525879,
"rewards/margins": 1.185589075088501,
"rewards/rejected": -3.57586407661438,
"step": 1740
},
{
"epoch": 0.9159905783826223,
"grad_norm": 3.0931062812733674,
"learning_rate": 1.0612098162470302e-07,
"logits/chosen": -0.8060128092765808,
"logits/rejected": -0.8450511693954468,
"logps/chosen": -531.2606201171875,
"logps/rejected": -596.30224609375,
"loss": 0.5061,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.386965274810791,
"rewards/margins": 1.1063708066940308,
"rewards/rejected": -3.493335723876953,
"step": 1750
},
{
"epoch": 0.9212248102590945,
"grad_norm": 3.0522935551827777,
"learning_rate": 9.335101386471285e-08,
"logits/chosen": -0.8363837003707886,
"logits/rejected": -0.8385022282600403,
"logps/chosen": -485.2688903808594,
"logps/rejected": -600.6049194335938,
"loss": 0.4747,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.374671459197998,
"rewards/margins": 1.2024142742156982,
"rewards/rejected": -3.5770859718322754,
"step": 1760
},
{
"epoch": 0.9264590421355666,
"grad_norm": 3.816198597418785,
"learning_rate": 8.138484731612273e-08,
"logits/chosen": -0.7746875286102295,
"logits/rejected": -0.7804522514343262,
"logps/chosen": -511.98760986328125,
"logps/rejected": -612.0762329101562,
"loss": 0.4838,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.5132033824920654,
"rewards/margins": 1.177661657333374,
"rewards/rejected": -3.6908645629882812,
"step": 1770
},
{
"epoch": 0.9316932740120387,
"grad_norm": 2.5117854405272495,
"learning_rate": 7.022647858135501e-08,
"logits/chosen": -0.8066379427909851,
"logits/rejected": -0.8451001048088074,
"logps/chosen": -493.4380798339844,
"logps/rejected": -612.2415771484375,
"loss": 0.4947,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.3315136432647705,
"rewards/margins": 1.325115442276001,
"rewards/rejected": -3.6566288471221924,
"step": 1780
},
{
"epoch": 0.9369275058885108,
"grad_norm": 2.293063931614578,
"learning_rate": 5.987963446492384e-08,
"logits/chosen": -0.7881379723548889,
"logits/rejected": -0.8155984878540039,
"logps/chosen": -554.5796508789062,
"logps/rejected": -694.9855346679688,
"loss": 0.4453,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.371554136276245,
"rewards/margins": 1.3966000080108643,
"rewards/rejected": -3.7681541442871094,
"step": 1790
},
{
"epoch": 0.942161737764983,
"grad_norm": 3.9098036406700367,
"learning_rate": 5.034777072871394e-08,
"logits/chosen": -0.7948960065841675,
"logits/rejected": -0.817171573638916,
"logps/chosen": -482.91650390625,
"logps/rejected": -598.6817626953125,
"loss": 0.4551,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.3254570960998535,
"rewards/margins": 1.1948542594909668,
"rewards/rejected": -3.5203113555908203,
"step": 1800
},
{
"epoch": 0.942161737764983,
"eval_logits/chosen": -0.8396913409233093,
"eval_logits/rejected": -0.8587289452552795,
"eval_logps/chosen": -509.1396789550781,
"eval_logps/rejected": -605.8721923828125,
"eval_loss": 0.4786478281021118,
"eval_rewards/accuracies": 0.7730000019073486,
"eval_rewards/chosen": -2.3274641036987305,
"eval_rewards/margins": 1.1986435651779175,
"eval_rewards/rejected": -3.5261073112487793,
"eval_runtime": 490.7513,
"eval_samples_per_second": 4.075,
"eval_steps_per_second": 0.255,
"step": 1800
},
{
"epoch": 0.9473959696414551,
"grad_norm": 3.3434570748221177,
"learning_rate": 4.163407093778243e-08,
"logits/chosen": -0.7641734480857849,
"logits/rejected": -0.7936812043190002,
"logps/chosen": -508.19061279296875,
"logps/rejected": -573.2919311523438,
"loss": 0.4779,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.2934184074401855,
"rewards/margins": 1.2088220119476318,
"rewards/rejected": -3.5022406578063965,
"step": 1810
},
{
"epoch": 0.9526302015179272,
"grad_norm": 3.306806286518662,
"learning_rate": 3.37414453970758e-08,
"logits/chosen": -0.7990435361862183,
"logits/rejected": -0.8609498739242554,
"logps/chosen": -522.0191650390625,
"logps/rejected": -591.8367309570312,
"loss": 0.4377,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.2719483375549316,
"rewards/margins": 1.3299857378005981,
"rewards/rejected": -3.601933717727661,
"step": 1820
},
{
"epoch": 0.9578644333943994,
"grad_norm": 3.622993124197589,
"learning_rate": 2.6672530179410183e-08,
"logits/chosen": -0.8073042631149292,
"logits/rejected": -0.823306679725647,
"logps/chosen": -486.38525390625,
"logps/rejected": -573.3196411132812,
"loss": 0.493,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.312579393386841,
"rewards/margins": 1.030884027481079,
"rewards/rejected": -3.34346342086792,
"step": 1830
},
{
"epoch": 0.9630986652708715,
"grad_norm": 3.2673896792966106,
"learning_rate": 2.04296862450451e-08,
"logits/chosen": -0.8151887059211731,
"logits/rejected": -0.8248012661933899,
"logps/chosen": -551.4527587890625,
"logps/rejected": -624.1947631835938,
"loss": 0.4627,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.5660691261291504,
"rewards/margins": 1.0703225135803223,
"rewards/rejected": -3.6363918781280518,
"step": 1840
},
{
"epoch": 0.9683328971473436,
"grad_norm": 2.95865023129791,
"learning_rate": 1.501499865314171e-08,
"logits/chosen": -0.7353194952011108,
"logits/rejected": -0.7695177793502808,
"logps/chosen": -559.2681884765625,
"logps/rejected": -642.7736206054688,
"loss": 0.4444,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.3837671279907227,
"rewards/margins": 1.1292965412139893,
"rewards/rejected": -3.513063907623291,
"step": 1850
},
{
"epoch": 0.9735671290238157,
"grad_norm": 2.492804384265188,
"learning_rate": 1.0430275865371265e-08,
"logits/chosen": -0.799170732498169,
"logits/rejected": -0.8115439414978027,
"logps/chosen": -491.9395446777344,
"logps/rejected": -578.8630981445312,
"loss": 0.4702,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.4343132972717285,
"rewards/margins": 1.1897588968276978,
"rewards/rejected": -3.6240718364715576,
"step": 1860
},
{
"epoch": 0.9788013609002879,
"grad_norm": 4.425829277630147,
"learning_rate": 6.677049141901315e-09,
"logits/chosen": -0.7830789685249329,
"logits/rejected": -0.8174635171890259,
"logps/chosen": -483.9396057128906,
"logps/rejected": -624.7434692382812,
"loss": 0.4794,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.34547758102417,
"rewards/margins": 1.2174358367919922,
"rewards/rejected": -3.562913417816162,
"step": 1870
},
{
"epoch": 0.98403559277676,
"grad_norm": 2.8797731827784325,
"learning_rate": 3.756572029968708e-09,
"logits/chosen": -0.8215857744216919,
"logits/rejected": -0.8443831205368042,
"logps/chosen": -508.30419921875,
"logps/rejected": -605.548828125,
"loss": 0.4655,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.2893662452697754,
"rewards/margins": 1.2450931072235107,
"rewards/rejected": -3.534459352493286,
"step": 1880
},
{
"epoch": 0.9892698246532321,
"grad_norm": 4.192074572610901,
"learning_rate": 1.6698199452053199e-09,
"logits/chosen": -0.7996589541435242,
"logits/rejected": -0.8368595838546753,
"logps/chosen": -520.9606323242188,
"logps/rejected": -608.918701171875,
"loss": 0.482,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.332554340362549,
"rewards/margins": 1.064354658126831,
"rewards/rejected": -3.39690899848938,
"step": 1890
},
{
"epoch": 0.9945040565297043,
"grad_norm": 2.981651951733524,
"learning_rate": 4.1748984585560094e-10,
"logits/chosen": -0.80968177318573,
"logits/rejected": -0.794297993183136,
"logps/chosen": -501.98553466796875,
"logps/rejected": -608.4963989257812,
"loss": 0.4605,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.3502135276794434,
"rewards/margins": 1.3177697658538818,
"rewards/rejected": -3.667983293533325,
"step": 1900
},
{
"epoch": 0.9945040565297043,
"eval_logits/chosen": -0.8360199928283691,
"eval_logits/rejected": -0.8543941378593445,
"eval_logps/chosen": -507.2547607421875,
"eval_logps/rejected": -604.1884765625,
"eval_loss": 0.47848692536354065,
"eval_rewards/accuracies": 0.7739999890327454,
"eval_rewards/chosen": -2.3086142539978027,
"eval_rewards/margins": 1.2006564140319824,
"eval_rewards/rejected": -3.509270668029785,
"eval_runtime": 492.0941,
"eval_samples_per_second": 4.064,
"eval_steps_per_second": 0.254,
"step": 1900
},
{
"epoch": 0.9997382884061764,
"grad_norm": 3.6416690762006327,
"learning_rate": 0.0,
"logits/chosen": -0.8213506937026978,
"logits/rejected": -0.8055696487426758,
"logps/chosen": -526.1131591796875,
"logps/rejected": -621.0297241210938,
"loss": 0.4759,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.379620313644409,
"rewards/margins": 1.0635095834732056,
"rewards/rejected": -3.4431300163269043,
"step": 1910
},
{
"epoch": 0.9997382884061764,
"step": 1910,
"total_flos": 0.0,
"train_loss": 0.504705511212973,
"train_runtime": 53071.3067,
"train_samples_per_second": 1.152,
"train_steps_per_second": 0.036
}
],
"logging_steps": 10,
"max_steps": 1910,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}