zephyr-7b-kto-qlora / trainer_state.json
nnheui's picture
Model save
d8e1435 verified
raw
history blame
97.9 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997382884061764,
"eval_steps": 100,
"global_step": 1910,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.58203125,
"learning_rate": 2.617801047120419e-08,
"logits/chosen": -3.1532161235809326,
"logits/rejected": -3.1690337657928467,
"logps/chosen": -305.45306396484375,
"logps/rejected": -294.4603576660156,
"loss": 0.5,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0007838421151973307,
"rewards/margins": -0.00040248289587907493,
"rewards/rejected": -0.000381359423045069,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 0.5390625,
"learning_rate": 2.617801047120419e-07,
"logits/chosen": -3.177987813949585,
"logits/rejected": -3.2059593200683594,
"logps/chosen": -299.1102294921875,
"logps/rejected": -249.10623168945312,
"loss": 0.5001,
"rewards/accuracies": 0.4444444477558136,
"rewards/chosen": -0.0008526805322617292,
"rewards/margins": -0.00045007685548625886,
"rewards/rejected": -0.0004026036476716399,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 0.59765625,
"learning_rate": 5.235602094240838e-07,
"logits/chosen": -3.1716275215148926,
"logits/rejected": -3.166067123413086,
"logps/chosen": -238.83120727539062,
"logps/rejected": -244.2283935546875,
"loss": 0.5,
"rewards/accuracies": 0.503125011920929,
"rewards/chosen": 0.0001240858546225354,
"rewards/margins": 5.3543342801276594e-05,
"rewards/rejected": 7.054249726934358e-05,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 0.765625,
"learning_rate": 7.853403141361258e-07,
"logits/chosen": -3.194286823272705,
"logits/rejected": -3.2046267986297607,
"logps/chosen": -268.1184387207031,
"logps/rejected": -239.86087036132812,
"loss": 0.4997,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.002198445377871394,
"rewards/margins": 0.0013555358164012432,
"rewards/rejected": 0.0008429096196778119,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 0.72265625,
"learning_rate": 1.0471204188481676e-06,
"logits/chosen": -3.1798417568206787,
"logits/rejected": -3.185044765472412,
"logps/chosen": -273.47900390625,
"logps/rejected": -255.7032928466797,
"loss": 0.4993,
"rewards/accuracies": 0.6343749761581421,
"rewards/chosen": 0.005988434888422489,
"rewards/margins": 0.0028830617666244507,
"rewards/rejected": 0.003105373587459326,
"step": 40
},
{
"epoch": 0.03,
"grad_norm": 0.7578125,
"learning_rate": 1.3089005235602096e-06,
"logits/chosen": -3.162355899810791,
"logits/rejected": -3.1799404621124268,
"logps/chosen": -256.9862060546875,
"logps/rejected": -239.87069702148438,
"loss": 0.4985,
"rewards/accuracies": 0.671875,
"rewards/chosen": 0.012596851214766502,
"rewards/margins": 0.006152496673166752,
"rewards/rejected": 0.0064443545415997505,
"step": 50
},
{
"epoch": 0.03,
"grad_norm": 0.73046875,
"learning_rate": 1.5706806282722515e-06,
"logits/chosen": -3.1871049404144287,
"logits/rejected": -3.200637102127075,
"logps/chosen": -294.3240661621094,
"logps/rejected": -262.1870422363281,
"loss": 0.4969,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.022759366780519485,
"rewards/margins": 0.012819233350455761,
"rewards/rejected": 0.009940135292708874,
"step": 60
},
{
"epoch": 0.04,
"grad_norm": 0.75390625,
"learning_rate": 1.8324607329842933e-06,
"logits/chosen": -3.1636972427368164,
"logits/rejected": -3.161069869995117,
"logps/chosen": -266.68853759765625,
"logps/rejected": -243.20263671875,
"loss": 0.496,
"rewards/accuracies": 0.6968749761581421,
"rewards/chosen": 0.031995244324207306,
"rewards/margins": 0.016239028424024582,
"rewards/rejected": 0.015756219625473022,
"step": 70
},
{
"epoch": 0.04,
"grad_norm": 0.8359375,
"learning_rate": 2.094240837696335e-06,
"logits/chosen": -3.1705234050750732,
"logits/rejected": -3.1865649223327637,
"logps/chosen": -271.360595703125,
"logps/rejected": -252.78170776367188,
"loss": 0.4962,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.038576819002628326,
"rewards/margins": 0.015578309074044228,
"rewards/rejected": 0.022998513653874397,
"step": 80
},
{
"epoch": 0.05,
"grad_norm": 0.72265625,
"learning_rate": 2.356020942408377e-06,
"logits/chosen": -3.166350841522217,
"logits/rejected": -3.1725335121154785,
"logps/chosen": -240.39999389648438,
"logps/rejected": -236.23782348632812,
"loss": 0.4956,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.035779114812612534,
"rewards/margins": 0.018319377675652504,
"rewards/rejected": 0.01745973899960518,
"step": 90
},
{
"epoch": 0.05,
"grad_norm": 0.7109375,
"learning_rate": 2.617801047120419e-06,
"logits/chosen": -3.169689655303955,
"logits/rejected": -3.2062766551971436,
"logps/chosen": -260.24462890625,
"logps/rejected": -230.7229766845703,
"loss": 0.4929,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.04880410060286522,
"rewards/margins": 0.029587719589471817,
"rewards/rejected": 0.0192163847386837,
"step": 100
},
{
"epoch": 0.06,
"grad_norm": 0.65625,
"learning_rate": 2.8795811518324613e-06,
"logits/chosen": -3.1644022464752197,
"logits/rejected": -3.178515672683716,
"logps/chosen": -257.0642395019531,
"logps/rejected": -233.0090789794922,
"loss": 0.4904,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": 0.05435089394450188,
"rewards/margins": 0.03999961167573929,
"rewards/rejected": 0.014351281337440014,
"step": 110
},
{
"epoch": 0.06,
"grad_norm": 0.7265625,
"learning_rate": 3.141361256544503e-06,
"logits/chosen": -3.1647660732269287,
"logits/rejected": -3.181644916534424,
"logps/chosen": -300.6939392089844,
"logps/rejected": -279.0010070800781,
"loss": 0.4918,
"rewards/accuracies": 0.621874988079071,
"rewards/chosen": 0.0594431571662426,
"rewards/margins": 0.035575076937675476,
"rewards/rejected": 0.023868080228567123,
"step": 120
},
{
"epoch": 0.07,
"grad_norm": 0.6171875,
"learning_rate": 3.403141361256545e-06,
"logits/chosen": -3.1200623512268066,
"logits/rejected": -3.1410274505615234,
"logps/chosen": -265.76513671875,
"logps/rejected": -246.6106414794922,
"loss": 0.4896,
"rewards/accuracies": 0.6968749761581421,
"rewards/chosen": 0.05789119750261307,
"rewards/margins": 0.045422304421663284,
"rewards/rejected": 0.012468896806240082,
"step": 130
},
{
"epoch": 0.07,
"grad_norm": 0.671875,
"learning_rate": 3.6649214659685865e-06,
"logits/chosen": -3.1760947704315186,
"logits/rejected": -3.1799349784851074,
"logps/chosen": -258.2982482910156,
"logps/rejected": -239.6028289794922,
"loss": 0.4864,
"rewards/accuracies": 0.684374988079071,
"rewards/chosen": 0.06110318750143051,
"rewards/margins": 0.06362718343734741,
"rewards/rejected": -0.00252399779856205,
"step": 140
},
{
"epoch": 0.08,
"grad_norm": 0.625,
"learning_rate": 3.926701570680629e-06,
"logits/chosen": -3.1222329139709473,
"logits/rejected": -3.133145332336426,
"logps/chosen": -265.1998291015625,
"logps/rejected": -251.78952026367188,
"loss": 0.4875,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.05429766699671745,
"rewards/margins": 0.05967814847826958,
"rewards/rejected": -0.005380480550229549,
"step": 150
},
{
"epoch": 0.08,
"grad_norm": 0.703125,
"learning_rate": 4.18848167539267e-06,
"logits/chosen": -3.176487445831299,
"logits/rejected": -3.190802812576294,
"logps/chosen": -272.86651611328125,
"logps/rejected": -244.38687133789062,
"loss": 0.4852,
"rewards/accuracies": 0.684374988079071,
"rewards/chosen": 0.05848199874162674,
"rewards/margins": 0.07154129445552826,
"rewards/rejected": -0.013059285469353199,
"step": 160
},
{
"epoch": 0.09,
"grad_norm": 0.71484375,
"learning_rate": 4.450261780104713e-06,
"logits/chosen": -3.153256893157959,
"logits/rejected": -3.1715409755706787,
"logps/chosen": -274.42901611328125,
"logps/rejected": -259.0591125488281,
"loss": 0.4835,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": 0.06613625586032867,
"rewards/margins": 0.0808890238404274,
"rewards/rejected": -0.014752751216292381,
"step": 170
},
{
"epoch": 0.09,
"grad_norm": 0.51953125,
"learning_rate": 4.712041884816754e-06,
"logits/chosen": -3.116530179977417,
"logits/rejected": -3.125654697418213,
"logps/chosen": -264.8692932128906,
"logps/rejected": -256.16748046875,
"loss": 0.486,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": 0.05057697370648384,
"rewards/margins": 0.0704292505979538,
"rewards/rejected": -0.019852278754115105,
"step": 180
},
{
"epoch": 0.1,
"grad_norm": 0.6171875,
"learning_rate": 4.9738219895287965e-06,
"logits/chosen": -3.123274564743042,
"logits/rejected": -3.1312222480773926,
"logps/chosen": -288.4281311035156,
"logps/rejected": -258.3103332519531,
"loss": 0.4814,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.05620621517300606,
"rewards/margins": 0.09435133635997772,
"rewards/rejected": -0.038145121186971664,
"step": 190
},
{
"epoch": 0.1,
"grad_norm": 0.703125,
"learning_rate": 4.999661831436499e-06,
"logits/chosen": -3.126032590866089,
"logits/rejected": -3.1409640312194824,
"logps/chosen": -271.5264587402344,
"logps/rejected": -251.412109375,
"loss": 0.4787,
"rewards/accuracies": 0.7093750238418579,
"rewards/chosen": 0.0683365911245346,
"rewards/margins": 0.10697062313556671,
"rewards/rejected": -0.038634032011032104,
"step": 200
},
{
"epoch": 0.11,
"grad_norm": 0.6953125,
"learning_rate": 4.9984929711403395e-06,
"logits/chosen": -3.0909173488616943,
"logits/rejected": -3.0889172554016113,
"logps/chosen": -235.9461212158203,
"logps/rejected": -240.8140106201172,
"loss": 0.4855,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.03789529204368591,
"rewards/margins": 0.07996337115764618,
"rewards/rejected": -0.04206807166337967,
"step": 210
},
{
"epoch": 0.12,
"grad_norm": 0.609375,
"learning_rate": 4.996489634487865e-06,
"logits/chosen": -3.149304151535034,
"logits/rejected": -3.1437649726867676,
"logps/chosen": -276.66497802734375,
"logps/rejected": -261.4449157714844,
"loss": 0.4814,
"rewards/accuracies": 0.6656249761581421,
"rewards/chosen": 0.030237609520554543,
"rewards/margins": 0.1089547872543335,
"rewards/rejected": -0.0787171721458435,
"step": 220
},
{
"epoch": 0.12,
"grad_norm": 0.69921875,
"learning_rate": 4.9936524905772466e-06,
"logits/chosen": -3.092778444290161,
"logits/rejected": -3.1226553916931152,
"logps/chosen": -265.37457275390625,
"logps/rejected": -260.417724609375,
"loss": 0.4759,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.06305978447198868,
"rewards/margins": 0.13326093554496765,
"rewards/rejected": -0.07020114362239838,
"step": 230
},
{
"epoch": 0.13,
"grad_norm": 0.76953125,
"learning_rate": 4.9899824869915e-06,
"logits/chosen": -3.1257612705230713,
"logits/rejected": -3.142120838165283,
"logps/chosen": -251.7377166748047,
"logps/rejected": -258.2868957519531,
"loss": 0.4783,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.034979041665792465,
"rewards/margins": 0.12733003497123718,
"rewards/rejected": -0.09235100448131561,
"step": 240
},
{
"epoch": 0.13,
"grad_norm": 0.75,
"learning_rate": 4.985480849482012e-06,
"logits/chosen": -3.1416521072387695,
"logits/rejected": -3.1663966178894043,
"logps/chosen": -280.97442626953125,
"logps/rejected": -259.81915283203125,
"loss": 0.4749,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.03899794816970825,
"rewards/margins": 0.14351439476013184,
"rewards/rejected": -0.10451646894216537,
"step": 250
},
{
"epoch": 0.14,
"grad_norm": 0.78125,
"learning_rate": 4.980149081559142e-06,
"logits/chosen": -3.166862964630127,
"logits/rejected": -3.1826682090759277,
"logps/chosen": -277.7224426269531,
"logps/rejected": -264.73248291015625,
"loss": 0.4769,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.02976931631565094,
"rewards/margins": 0.13806195557117462,
"rewards/rejected": -0.10829265415668488,
"step": 260
},
{
"epoch": 0.14,
"grad_norm": 0.6171875,
"learning_rate": 4.9739889639900655e-06,
"logits/chosen": -3.1009061336517334,
"logits/rejected": -3.1220781803131104,
"logps/chosen": -271.1175842285156,
"logps/rejected": -270.35601806640625,
"loss": 0.4696,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.041884977370500565,
"rewards/margins": 0.18558195233345032,
"rewards/rejected": -0.14369697868824005,
"step": 270
},
{
"epoch": 0.15,
"grad_norm": 0.77734375,
"learning_rate": 4.967002554204009e-06,
"logits/chosen": -3.1314620971679688,
"logits/rejected": -3.147207021713257,
"logps/chosen": -272.50836181640625,
"logps/rejected": -278.05487060546875,
"loss": 0.4785,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0026828604750335217,
"rewards/margins": 0.14521858096122742,
"rewards/rejected": -0.1425357311964035,
"step": 280
},
{
"epoch": 0.15,
"grad_norm": 0.62109375,
"learning_rate": 4.959192185605089e-06,
"logits/chosen": -3.1541976928710938,
"logits/rejected": -3.1754889488220215,
"logps/chosen": -304.40740966796875,
"logps/rejected": -284.8604736328125,
"loss": 0.4718,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.0017113524954766035,
"rewards/margins": 0.18049772083759308,
"rewards/rejected": -0.17878638207912445,
"step": 290
},
{
"epoch": 0.16,
"grad_norm": 0.859375,
"learning_rate": 4.950560466792969e-06,
"logits/chosen": -3.1485819816589355,
"logits/rejected": -3.1677544116973877,
"logps/chosen": -263.881103515625,
"logps/rejected": -263.1119689941406,
"loss": 0.4741,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.045438483357429504,
"rewards/margins": 0.1866464614868164,
"rewards/rejected": -0.23208491504192352,
"step": 300
},
{
"epoch": 0.16,
"grad_norm": 0.80078125,
"learning_rate": 4.9411102806916185e-06,
"logits/chosen": -3.122454881668091,
"logits/rejected": -3.144866943359375,
"logps/chosen": -278.1111145019531,
"logps/rejected": -262.2369689941406,
"loss": 0.4695,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": -0.052451539784669876,
"rewards/margins": 0.22689659893512726,
"rewards/rejected": -0.2793481647968292,
"step": 310
},
{
"epoch": 0.17,
"grad_norm": 0.71484375,
"learning_rate": 4.930844783586424e-06,
"logits/chosen": -3.116986036300659,
"logits/rejected": -3.1314234733581543,
"logps/chosen": -280.05523681640625,
"logps/rejected": -292.8658447265625,
"loss": 0.4642,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.18633142113685608,
"rewards/margins": 0.2993203103542328,
"rewards/rejected": -0.4856516718864441,
"step": 320
},
{
"epoch": 0.17,
"grad_norm": 0.8828125,
"learning_rate": 4.919767404070033e-06,
"logits/chosen": -3.0919606685638428,
"logits/rejected": -3.1069421768188477,
"logps/chosen": -290.76690673828125,
"logps/rejected": -299.6058044433594,
"loss": 0.4644,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.24150581657886505,
"rewards/margins": 0.32079803943634033,
"rewards/rejected": -0.562303900718689,
"step": 330
},
{
"epoch": 0.18,
"grad_norm": 0.8984375,
"learning_rate": 4.907881841897216e-06,
"logits/chosen": -3.113529920578003,
"logits/rejected": -3.13189697265625,
"logps/chosen": -316.0239562988281,
"logps/rejected": -312.1597900390625,
"loss": 0.4576,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.3316032886505127,
"rewards/margins": 0.40208154916763306,
"rewards/rejected": -0.733684778213501,
"step": 340
},
{
"epoch": 0.18,
"grad_norm": 1.03125,
"learning_rate": 4.89519206674919e-06,
"logits/chosen": -3.0299558639526367,
"logits/rejected": -3.0673482418060303,
"logps/chosen": -298.59539794921875,
"logps/rejected": -356.8745422363281,
"loss": 0.4522,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.496961772441864,
"rewards/margins": 0.5310899615287781,
"rewards/rejected": -1.028051733970642,
"step": 350
},
{
"epoch": 0.19,
"grad_norm": 0.984375,
"learning_rate": 4.881702316907769e-06,
"logits/chosen": -3.04780912399292,
"logits/rejected": -3.0520381927490234,
"logps/chosen": -342.5206604003906,
"logps/rejected": -376.9095153808594,
"loss": 0.4588,
"rewards/accuracies": 0.7093750238418579,
"rewards/chosen": -0.7044845819473267,
"rewards/margins": 0.4986873269081116,
"rewards/rejected": -1.203171968460083,
"step": 360
},
{
"epoch": 0.19,
"grad_norm": 1.515625,
"learning_rate": 4.86741709783982e-06,
"logits/chosen": -3.0080177783966064,
"logits/rejected": -3.0334441661834717,
"logps/chosen": -322.6118469238281,
"logps/rejected": -343.72320556640625,
"loss": 0.4541,
"rewards/accuracies": 0.6656249761581421,
"rewards/chosen": -0.7056992053985596,
"rewards/margins": 0.5343005061149597,
"rewards/rejected": -1.239999771118164,
"step": 370
},
{
"epoch": 0.2,
"grad_norm": 0.88671875,
"learning_rate": 4.852341180692471e-06,
"logits/chosen": -2.9617342948913574,
"logits/rejected": -2.9721832275390625,
"logps/chosen": -318.089111328125,
"logps/rejected": -369.661865234375,
"loss": 0.4526,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.5964111089706421,
"rewards/margins": 0.5622240900993347,
"rewards/rejected": -1.1586352586746216,
"step": 380
},
{
"epoch": 0.2,
"grad_norm": 0.9375,
"learning_rate": 4.836479600699579e-06,
"logits/chosen": -3.004973888397217,
"logits/rejected": -3.015404224395752,
"logps/chosen": -322.59222412109375,
"logps/rejected": -350.57952880859375,
"loss": 0.4555,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.4821473956108093,
"rewards/margins": 0.5106935501098633,
"rewards/rejected": -0.9928409457206726,
"step": 390
},
{
"epoch": 0.21,
"grad_norm": 1.359375,
"learning_rate": 4.819837655500014e-06,
"logits/chosen": -2.982062816619873,
"logits/rejected": -3.0214340686798096,
"logps/chosen": -328.14764404296875,
"logps/rejected": -358.10198974609375,
"loss": 0.4518,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.6790810823440552,
"rewards/margins": 0.5925682783126831,
"rewards/rejected": -1.2716493606567383,
"step": 400
},
{
"epoch": 0.21,
"grad_norm": 1.1953125,
"learning_rate": 4.802420903368286e-06,
"logits/chosen": -2.996058225631714,
"logits/rejected": -3.0428948402404785,
"logps/chosen": -311.92181396484375,
"logps/rejected": -364.3048095703125,
"loss": 0.4564,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5532656908035278,
"rewards/margins": 0.49352067708969116,
"rewards/rejected": -1.0467865467071533,
"step": 410
},
{
"epoch": 0.22,
"grad_norm": 1.140625,
"learning_rate": 4.784235161358124e-06,
"logits/chosen": -2.960181951522827,
"logits/rejected": -2.9744322299957275,
"logps/chosen": -318.44744873046875,
"logps/rejected": -352.4103088378906,
"loss": 0.4597,
"rewards/accuracies": 0.659375011920929,
"rewards/chosen": -0.5166889429092407,
"rewards/margins": 0.5014506578445435,
"rewards/rejected": -1.0181396007537842,
"step": 420
},
{
"epoch": 0.23,
"grad_norm": 1.09375,
"learning_rate": 4.765286503359632e-06,
"logits/chosen": -2.887781858444214,
"logits/rejected": -2.90468430519104,
"logps/chosen": -332.93792724609375,
"logps/rejected": -379.0559997558594,
"loss": 0.4516,
"rewards/accuracies": 0.690625011920929,
"rewards/chosen": -0.6485487222671509,
"rewards/margins": 0.5632439255714417,
"rewards/rejected": -1.2117927074432373,
"step": 430
},
{
"epoch": 0.23,
"grad_norm": 0.9921875,
"learning_rate": 4.745581258070654e-06,
"logits/chosen": -2.8706612586975098,
"logits/rejected": -2.8921449184417725,
"logps/chosen": -356.7577209472656,
"logps/rejected": -414.74224853515625,
"loss": 0.4523,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9310439229011536,
"rewards/margins": 0.6457870602607727,
"rewards/rejected": -1.5768309831619263,
"step": 440
},
{
"epoch": 0.24,
"grad_norm": 1.4765625,
"learning_rate": 4.725126006883047e-06,
"logits/chosen": -2.8595480918884277,
"logits/rejected": -2.884725332260132,
"logps/chosen": -306.8425598144531,
"logps/rejected": -388.7693786621094,
"loss": 0.4424,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.637574315071106,
"rewards/margins": 0.7818979620933533,
"rewards/rejected": -1.4194722175598145,
"step": 450
},
{
"epoch": 0.24,
"grad_norm": 1.0546875,
"learning_rate": 4.70392758168454e-06,
"logits/chosen": -2.8378663063049316,
"logits/rejected": -2.8632078170776367,
"logps/chosen": -362.0655822753906,
"logps/rejected": -393.5525817871094,
"loss": 0.456,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.843133807182312,
"rewards/margins": 0.6695644855499268,
"rewards/rejected": -1.5126984119415283,
"step": 460
},
{
"epoch": 0.25,
"grad_norm": 1.59375,
"learning_rate": 4.68199306257695e-06,
"logits/chosen": -2.90700101852417,
"logits/rejected": -2.930222511291504,
"logps/chosen": -368.11419677734375,
"logps/rejected": -419.2084045410156,
"loss": 0.4412,
"rewards/accuracies": 0.7093750238418579,
"rewards/chosen": -0.8187941312789917,
"rewards/margins": 0.7371198534965515,
"rewards/rejected": -1.5559141635894775,
"step": 470
},
{
"epoch": 0.25,
"grad_norm": 0.8984375,
"learning_rate": 4.659329775511478e-06,
"logits/chosen": -2.8993983268737793,
"logits/rejected": -2.930690050125122,
"logps/chosen": -331.52081298828125,
"logps/rejected": -389.1390686035156,
"loss": 0.4494,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6849466562271118,
"rewards/margins": 0.7577627301216125,
"rewards/rejected": -1.4427093267440796,
"step": 480
},
{
"epoch": 0.26,
"grad_norm": 1.140625,
"learning_rate": 4.635945289841902e-06,
"logits/chosen": -2.8397905826568604,
"logits/rejected": -2.8895552158355713,
"logps/chosen": -351.8777770996094,
"logps/rejected": -399.30126953125,
"loss": 0.4437,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6446736454963684,
"rewards/margins": 0.8137520551681519,
"rewards/rejected": -1.458425760269165,
"step": 490
},
{
"epoch": 0.26,
"grad_norm": 2.265625,
"learning_rate": 4.611847415796476e-06,
"logits/chosen": -2.8411877155303955,
"logits/rejected": -2.8607966899871826,
"logps/chosen": -360.9768371582031,
"logps/rejected": -416.825927734375,
"loss": 0.4382,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.8380700945854187,
"rewards/margins": 0.860516369342804,
"rewards/rejected": -1.6985862255096436,
"step": 500
},
{
"epoch": 0.27,
"grad_norm": 1.75,
"learning_rate": 4.587044201869378e-06,
"logits/chosen": -2.848315715789795,
"logits/rejected": -2.8671722412109375,
"logps/chosen": -326.2365417480469,
"logps/rejected": -389.13720703125,
"loss": 0.4425,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7817949056625366,
"rewards/margins": 0.8195638656616211,
"rewards/rejected": -1.6013587713241577,
"step": 510
},
{
"epoch": 0.27,
"grad_norm": 1.4453125,
"learning_rate": 4.561543932132574e-06,
"logits/chosen": -2.7974531650543213,
"logits/rejected": -2.828226327896118,
"logps/chosen": -348.9537658691406,
"logps/rejected": -422.35186767578125,
"loss": 0.4319,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.7357637286186218,
"rewards/margins": 0.9210460782051086,
"rewards/rejected": -1.6568095684051514,
"step": 520
},
{
"epoch": 0.28,
"grad_norm": 1.6015625,
"learning_rate": 4.535355123469009e-06,
"logits/chosen": -2.8311352729797363,
"logits/rejected": -2.845012664794922,
"logps/chosen": -350.1925964355469,
"logps/rejected": -428.8135681152344,
"loss": 0.4413,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.9110676050186157,
"rewards/margins": 0.8541079759597778,
"rewards/rejected": -1.765175461769104,
"step": 530
},
{
"epoch": 0.28,
"grad_norm": 2.546875,
"learning_rate": 4.508486522728037e-06,
"logits/chosen": -2.7914469242095947,
"logits/rejected": -2.821166515350342,
"logps/chosen": -362.1348876953125,
"logps/rejected": -431.3775329589844,
"loss": 0.425,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": -0.72906094789505,
"rewards/margins": 1.1009550094604492,
"rewards/rejected": -1.8300158977508545,
"step": 540
},
{
"epoch": 0.29,
"grad_norm": 2.953125,
"learning_rate": 4.480947103804044e-06,
"logits/chosen": -2.7665412425994873,
"logits/rejected": -2.7718183994293213,
"logps/chosen": -364.27362060546875,
"logps/rejected": -409.62042236328125,
"loss": 0.4476,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": -0.9586559534072876,
"rewards/margins": 0.7505531311035156,
"rewards/rejected": -1.7092090845108032,
"step": 550
},
{
"epoch": 0.29,
"grad_norm": 2.21875,
"learning_rate": 4.452746064639239e-06,
"logits/chosen": -2.7609431743621826,
"logits/rejected": -2.7761027812957764,
"logps/chosen": -350.8570556640625,
"logps/rejected": -438.7533264160156,
"loss": 0.4401,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": -0.9020726084709167,
"rewards/margins": 0.9013819694519043,
"rewards/rejected": -1.8034546375274658,
"step": 560
},
{
"epoch": 0.3,
"grad_norm": 1.6171875,
"learning_rate": 4.423892824151617e-06,
"logits/chosen": -2.724292755126953,
"logits/rejected": -2.7128217220306396,
"logps/chosen": -369.49822998046875,
"logps/rejected": -421.8377380371094,
"loss": 0.4407,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.9183570742607117,
"rewards/margins": 0.7936559319496155,
"rewards/rejected": -1.7120128870010376,
"step": 570
},
{
"epoch": 0.3,
"grad_norm": 2.296875,
"learning_rate": 4.3943970190891164e-06,
"logits/chosen": -2.767193555831909,
"logits/rejected": -2.7671852111816406,
"logps/chosen": -338.62347412109375,
"logps/rejected": -402.12713623046875,
"loss": 0.4297,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.6558475494384766,
"rewards/margins": 0.883314311504364,
"rewards/rejected": -1.5391619205474854,
"step": 580
},
{
"epoch": 0.31,
"grad_norm": 1.9765625,
"learning_rate": 4.364268500811025e-06,
"logits/chosen": -2.71061635017395,
"logits/rejected": -2.7537200450897217,
"logps/chosen": -356.565185546875,
"logps/rejected": -418.8211975097656,
"loss": 0.4445,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9843032956123352,
"rewards/margins": 0.7892019152641296,
"rewards/rejected": -1.773505449295044,
"step": 590
},
{
"epoch": 0.31,
"grad_norm": 1.7421875,
"learning_rate": 4.333517331997704e-06,
"logits/chosen": -2.7916760444641113,
"logits/rejected": -2.771669864654541,
"logps/chosen": -304.84649658203125,
"logps/rejected": -379.7848815917969,
"loss": 0.4453,
"rewards/accuracies": 0.684374988079071,
"rewards/chosen": -0.6202788949012756,
"rewards/margins": 0.7137486338615417,
"rewards/rejected": -1.3340275287628174,
"step": 600
},
{
"epoch": 0.32,
"grad_norm": 1.34375,
"learning_rate": 4.302153783289737e-06,
"logits/chosen": -2.771042823791504,
"logits/rejected": -2.7721784114837646,
"logps/chosen": -304.8459167480469,
"logps/rejected": -382.05029296875,
"loss": 0.4336,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.48120832443237305,
"rewards/margins": 0.8254661560058594,
"rewards/rejected": -1.3066743612289429,
"step": 610
},
{
"epoch": 0.32,
"grad_norm": 1.171875,
"learning_rate": 4.270188329857613e-06,
"logits/chosen": -2.6971840858459473,
"logits/rejected": -2.7405362129211426,
"logps/chosen": -365.59869384765625,
"logps/rejected": -422.66082763671875,
"loss": 0.4337,
"rewards/accuracies": 0.7406250238418579,
"rewards/chosen": -0.7066925168037415,
"rewards/margins": 0.9992557764053345,
"rewards/rejected": -1.7059482336044312,
"step": 620
},
{
"epoch": 0.33,
"grad_norm": 2.546875,
"learning_rate": 4.237631647903115e-06,
"logits/chosen": -2.729823589324951,
"logits/rejected": -2.7406442165374756,
"logps/chosen": -343.63861083984375,
"logps/rejected": -398.4572448730469,
"loss": 0.4474,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.834626317024231,
"rewards/margins": 0.7315307855606079,
"rewards/rejected": -1.5661571025848389,
"step": 630
},
{
"epoch": 0.33,
"grad_norm": 3.15625,
"learning_rate": 4.204494611093548e-06,
"logits/chosen": -2.6876988410949707,
"logits/rejected": -2.710921049118042,
"logps/chosen": -344.4961242675781,
"logps/rejected": -413.8218688964844,
"loss": 0.4355,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.9162249565124512,
"rewards/margins": 0.9599907994270325,
"rewards/rejected": -1.8762153387069702,
"step": 640
},
{
"epoch": 0.34,
"grad_norm": 3.3125,
"learning_rate": 4.170788286930024e-06,
"logits/chosen": -2.720813751220703,
"logits/rejected": -2.7221953868865967,
"logps/chosen": -396.78424072265625,
"logps/rejected": -472.07293701171875,
"loss": 0.4488,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.4064452648162842,
"rewards/margins": 0.8613995313644409,
"rewards/rejected": -2.2678446769714355,
"step": 650
},
{
"epoch": 0.35,
"grad_norm": 1.578125,
"learning_rate": 4.136523933051005e-06,
"logits/chosen": -2.759519338607788,
"logits/rejected": -2.7709243297576904,
"logps/chosen": -343.8990173339844,
"logps/rejected": -425.5016174316406,
"loss": 0.4433,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.881717324256897,
"rewards/margins": 0.9298511743545532,
"rewards/rejected": -1.8115684986114502,
"step": 660
},
{
"epoch": 0.35,
"grad_norm": 2.390625,
"learning_rate": 4.101712993472348e-06,
"logits/chosen": -2.7461307048797607,
"logits/rejected": -2.7893545627593994,
"logps/chosen": -306.47528076171875,
"logps/rejected": -357.6148986816406,
"loss": 0.4434,
"rewards/accuracies": 0.684374988079071,
"rewards/chosen": -0.500246524810791,
"rewards/margins": 0.6672911643981934,
"rewards/rejected": -1.1675376892089844,
"step": 670
},
{
"epoch": 0.36,
"grad_norm": 1.53125,
"learning_rate": 4.066367094765091e-06,
"logits/chosen": -2.6456458568573,
"logits/rejected": -2.6387665271759033,
"logps/chosen": -346.2305603027344,
"logps/rejected": -411.1793518066406,
"loss": 0.4375,
"rewards/accuracies": 0.659375011920929,
"rewards/chosen": -0.803577721118927,
"rewards/margins": 0.8857296109199524,
"rewards/rejected": -1.689307451248169,
"step": 680
},
{
"epoch": 0.36,
"grad_norm": 1.875,
"learning_rate": 4.030498042172277e-06,
"logits/chosen": -2.729139804840088,
"logits/rejected": -2.7350516319274902,
"logps/chosen": -343.1637268066406,
"logps/rejected": -424.0751037597656,
"loss": 0.4338,
"rewards/accuracies": 0.684374988079071,
"rewards/chosen": -0.6723712086677551,
"rewards/margins": 0.953707218170166,
"rewards/rejected": -1.6260782480239868,
"step": 690
},
{
"epoch": 0.37,
"grad_norm": 1.5,
"learning_rate": 3.994117815666095e-06,
"logits/chosen": -2.737205743789673,
"logits/rejected": -2.756513833999634,
"logps/chosen": -358.9730529785156,
"logps/rejected": -435.59930419921875,
"loss": 0.4352,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9257775545120239,
"rewards/margins": 0.9314867258071899,
"rewards/rejected": -1.8572641611099243,
"step": 700
},
{
"epoch": 0.37,
"grad_norm": 2.59375,
"learning_rate": 3.957238565946672e-06,
"logits/chosen": -2.7129783630371094,
"logits/rejected": -2.740182399749756,
"logps/chosen": -382.54144287109375,
"logps/rejected": -479.25079345703125,
"loss": 0.425,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0284990072250366,
"rewards/margins": 1.2022464275360107,
"rewards/rejected": -2.230745315551758,
"step": 710
},
{
"epoch": 0.38,
"grad_norm": 3.109375,
"learning_rate": 3.919872610383831e-06,
"logits/chosen": -2.700688600540161,
"logits/rejected": -2.715548038482666,
"logps/chosen": -346.7179870605469,
"logps/rejected": -442.24468994140625,
"loss": 0.429,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.9038440585136414,
"rewards/margins": 1.1126606464385986,
"rewards/rejected": -2.0165047645568848,
"step": 720
},
{
"epoch": 0.38,
"grad_norm": 5.4375,
"learning_rate": 3.882032428903195e-06,
"logits/chosen": -2.732431650161743,
"logits/rejected": -2.741513252258301,
"logps/chosen": -341.4494323730469,
"logps/rejected": -418.03485107421875,
"loss": 0.4393,
"rewards/accuracies": 0.6656249761581421,
"rewards/chosen": -0.9002019166946411,
"rewards/margins": 0.9896795153617859,
"rewards/rejected": -1.8898814916610718,
"step": 730
},
{
"epoch": 0.39,
"grad_norm": 2.828125,
"learning_rate": 3.84373065981799e-06,
"logits/chosen": -2.6866860389709473,
"logits/rejected": -2.7075419425964355,
"logps/chosen": -335.77630615234375,
"logps/rejected": -392.2721252441406,
"loss": 0.4315,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.7698886394500732,
"rewards/margins": 0.8988865613937378,
"rewards/rejected": -1.668775200843811,
"step": 740
},
{
"epoch": 0.39,
"grad_norm": 3.03125,
"learning_rate": 3.8049800956079552e-06,
"logits/chosen": -2.728248357772827,
"logits/rejected": -2.719029426574707,
"logps/chosen": -333.34747314453125,
"logps/rejected": -403.9107971191406,
"loss": 0.4329,
"rewards/accuracies": 0.6968749761581421,
"rewards/chosen": -0.688866376876831,
"rewards/margins": 0.9195898771286011,
"rewards/rejected": -1.6084562540054321,
"step": 750
},
{
"epoch": 0.4,
"grad_norm": 2.640625,
"learning_rate": 3.765793678646753e-06,
"logits/chosen": -2.7588818073272705,
"logits/rejected": -2.764564037322998,
"logps/chosen": -327.6832580566406,
"logps/rejected": -397.34967041015625,
"loss": 0.4445,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.7366557121276855,
"rewards/margins": 0.8244975805282593,
"rewards/rejected": -1.5611531734466553,
"step": 760
},
{
"epoch": 0.4,
"grad_norm": 3.40625,
"learning_rate": 3.726184496879323e-06,
"logits/chosen": -2.7015931606292725,
"logits/rejected": -2.7093541622161865,
"logps/chosen": -328.6782531738281,
"logps/rejected": -392.6251525878906,
"loss": 0.4434,
"rewards/accuracies": 0.7093750238418579,
"rewards/chosen": -0.6729280948638916,
"rewards/margins": 0.7624879479408264,
"rewards/rejected": -1.4354161024093628,
"step": 770
},
{
"epoch": 0.41,
"grad_norm": 2.046875,
"learning_rate": 3.686165779450619e-06,
"logits/chosen": -2.7212226390838623,
"logits/rejected": -2.7281336784362793,
"logps/chosen": -332.78985595703125,
"logps/rejected": -407.54339599609375,
"loss": 0.4226,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5121452808380127,
"rewards/margins": 1.0209487676620483,
"rewards/rejected": -1.5330939292907715,
"step": 780
},
{
"epoch": 0.41,
"grad_norm": 2.0625,
"learning_rate": 3.645750892287178e-06,
"logits/chosen": -2.6992287635803223,
"logits/rejected": -2.7106306552886963,
"logps/chosen": -349.5966491699219,
"logps/rejected": -479.349609375,
"loss": 0.4291,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9312782287597656,
"rewards/margins": 1.2202502489089966,
"rewards/rejected": -2.1515283584594727,
"step": 790
},
{
"epoch": 0.42,
"grad_norm": 4.4375,
"learning_rate": 3.604953333633009e-06,
"logits/chosen": -2.6964669227600098,
"logits/rejected": -2.718437671661377,
"logps/chosen": -368.614990234375,
"logps/rejected": -455.652099609375,
"loss": 0.444,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1283608675003052,
"rewards/margins": 0.9377814531326294,
"rewards/rejected": -2.0661423206329346,
"step": 800
},
{
"epoch": 0.42,
"grad_norm": 1.796875,
"learning_rate": 3.56378672954129e-06,
"logits/chosen": -2.734504222869873,
"logits/rejected": -2.741596221923828,
"logps/chosen": -345.35894775390625,
"logps/rejected": -441.3074645996094,
"loss": 0.4339,
"rewards/accuracies": 0.6968749761581421,
"rewards/chosen": -0.8633454442024231,
"rewards/margins": 1.0725480318069458,
"rewards/rejected": -1.9358936548233032,
"step": 810
},
{
"epoch": 0.43,
"grad_norm": 4.34375,
"learning_rate": 3.5222648293233806e-06,
"logits/chosen": -2.726121425628662,
"logits/rejected": -2.757293701171875,
"logps/chosen": -328.99981689453125,
"logps/rejected": -409.4175720214844,
"loss": 0.4433,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": -0.8074792623519897,
"rewards/margins": 0.893712043762207,
"rewards/rejected": -1.7011913061141968,
"step": 820
},
{
"epoch": 0.43,
"grad_norm": 2.015625,
"learning_rate": 3.4804015009566573e-06,
"logits/chosen": -2.7143707275390625,
"logits/rejected": -2.7266762256622314,
"logps/chosen": -343.6038513183594,
"logps/rejected": -408.5287170410156,
"loss": 0.4446,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0202709436416626,
"rewards/margins": 0.8006850481033325,
"rewards/rejected": -1.8209559917449951,
"step": 830
},
{
"epoch": 0.44,
"grad_norm": 8.25,
"learning_rate": 3.4382107264527244e-06,
"logits/chosen": -2.731356143951416,
"logits/rejected": -2.749807357788086,
"logps/chosen": -387.00372314453125,
"logps/rejected": -461.8328552246094,
"loss": 0.4274,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.057356834411621,
"rewards/margins": 1.1233993768692017,
"rewards/rejected": -2.180756092071533,
"step": 840
},
{
"epoch": 0.44,
"grad_norm": 1.328125,
"learning_rate": 3.3957065971875387e-06,
"logits/chosen": -2.736109972000122,
"logits/rejected": -2.761101245880127,
"logps/chosen": -378.2186279296875,
"logps/rejected": -433.41986083984375,
"loss": 0.4517,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.291465401649475,
"rewards/margins": 0.7508509159088135,
"rewards/rejected": -2.042316198348999,
"step": 850
},
{
"epoch": 0.45,
"grad_norm": 3.15625,
"learning_rate": 3.352903309194999e-06,
"logits/chosen": -2.7496652603149414,
"logits/rejected": -2.766449451446533,
"logps/chosen": -347.00531005859375,
"logps/rejected": -450.0355529785156,
"loss": 0.4337,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9623804092407227,
"rewards/margins": 1.0263848304748535,
"rewards/rejected": -1.9887651205062866,
"step": 860
},
{
"epoch": 0.46,
"grad_norm": 0.9765625,
"learning_rate": 3.309815158425591e-06,
"logits/chosen": -2.6927943229675293,
"logits/rejected": -2.7125327587127686,
"logps/chosen": -312.2998046875,
"logps/rejected": -379.62017822265625,
"loss": 0.43,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.39604613184928894,
"rewards/margins": 0.9399534463882446,
"rewards/rejected": -1.335999608039856,
"step": 870
},
{
"epoch": 0.46,
"grad_norm": 4.53125,
"learning_rate": 3.266456535971654e-06,
"logits/chosen": -2.790156602859497,
"logits/rejected": -2.7961854934692383,
"logps/chosen": -305.0512390136719,
"logps/rejected": -366.155029296875,
"loss": 0.4325,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.48138341307640076,
"rewards/margins": 0.8310259580612183,
"rewards/rejected": -1.3124094009399414,
"step": 880
},
{
"epoch": 0.47,
"grad_norm": 1.0625,
"learning_rate": 3.2228419232608692e-06,
"logits/chosen": -2.701418399810791,
"logits/rejected": -2.688974380493164,
"logps/chosen": -316.68133544921875,
"logps/rejected": -413.67156982421875,
"loss": 0.4314,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6776655316352844,
"rewards/margins": 0.9983049631118774,
"rewards/rejected": -1.6759703159332275,
"step": 890
},
{
"epoch": 0.47,
"grad_norm": 2.21875,
"learning_rate": 3.1789858872195888e-06,
"logits/chosen": -2.6642849445343018,
"logits/rejected": -2.647975444793701,
"logps/chosen": -365.28936767578125,
"logps/rejected": -515.3648681640625,
"loss": 0.438,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0754855871200562,
"rewards/margins": 1.3716323375701904,
"rewards/rejected": -2.447117567062378,
"step": 900
},
{
"epoch": 0.48,
"grad_norm": 3.625,
"learning_rate": 3.1349030754075945e-06,
"logits/chosen": -2.668508291244507,
"logits/rejected": -2.6765646934509277,
"logps/chosen": -357.05712890625,
"logps/rejected": -427.643798828125,
"loss": 0.4391,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8638874888420105,
"rewards/margins": 0.9187320470809937,
"rewards/rejected": -1.7826197147369385,
"step": 910
},
{
"epoch": 0.48,
"grad_norm": 4.15625,
"learning_rate": 3.0906082111259313e-06,
"logits/chosen": -2.721989154815674,
"logits/rejected": -2.7402305603027344,
"logps/chosen": -341.566650390625,
"logps/rejected": -425.6533203125,
"loss": 0.4271,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.6936538815498352,
"rewards/margins": 1.0446574687957764,
"rewards/rejected": -1.7383114099502563,
"step": 920
},
{
"epoch": 0.49,
"grad_norm": 2.25,
"learning_rate": 3.046116088499449e-06,
"logits/chosen": -2.732478380203247,
"logits/rejected": -2.7298645973205566,
"logps/chosen": -366.4847717285156,
"logps/rejected": -460.4178161621094,
"loss": 0.4317,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.7452308535575867,
"rewards/margins": 1.1475111246109009,
"rewards/rejected": -1.8927419185638428,
"step": 930
},
{
"epoch": 0.49,
"grad_norm": 1.46875,
"learning_rate": 3.0014415675356813e-06,
"logits/chosen": -2.7274584770202637,
"logits/rejected": -2.7278664112091064,
"logps/chosen": -354.26751708984375,
"logps/rejected": -469.7132873535156,
"loss": 0.4211,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.7849363088607788,
"rewards/margins": 1.4399199485778809,
"rewards/rejected": -2.224856376647949,
"step": 940
},
{
"epoch": 0.5,
"grad_norm": 2.765625,
"learning_rate": 2.9565995691617242e-06,
"logits/chosen": -2.7352840900421143,
"logits/rejected": -2.737842321395874,
"logps/chosen": -358.52020263671875,
"logps/rejected": -454.4913024902344,
"loss": 0.4305,
"rewards/accuracies": 0.7281249761581421,
"rewards/chosen": -0.7539668083190918,
"rewards/margins": 1.1879950761795044,
"rewards/rejected": -1.941961646080017,
"step": 950
},
{
"epoch": 0.5,
"grad_norm": 0.9765625,
"learning_rate": 2.9116050702407706e-06,
"logits/chosen": -2.7479987144470215,
"logits/rejected": -2.7785484790802,
"logps/chosen": -313.64837646484375,
"logps/rejected": -373.56689453125,
"loss": 0.4343,
"rewards/accuracies": 0.7093750238418579,
"rewards/chosen": -0.4597587585449219,
"rewards/margins": 0.8521866798400879,
"rewards/rejected": -1.3119454383850098,
"step": 960
},
{
"epoch": 0.51,
"grad_norm": 2.171875,
"learning_rate": 2.8664730985699537e-06,
"logits/chosen": -2.7080225944519043,
"logits/rejected": -2.717515707015991,
"logps/chosen": -313.6962890625,
"logps/rejected": -389.29205322265625,
"loss": 0.4272,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.46188563108444214,
"rewards/margins": 0.994129478931427,
"rewards/rejected": -1.4560149908065796,
"step": 970
},
{
"epoch": 0.51,
"grad_norm": 2.5,
"learning_rate": 2.8212187278611907e-06,
"logits/chosen": -2.7235171794891357,
"logits/rejected": -2.736121416091919,
"logps/chosen": -342.36273193359375,
"logps/rejected": -417.322509765625,
"loss": 0.4259,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6042460203170776,
"rewards/margins": 1.0531026124954224,
"rewards/rejected": -1.6573486328125,
"step": 980
},
{
"epoch": 0.52,
"grad_norm": 1.7265625,
"learning_rate": 2.7758570727066843e-06,
"logits/chosen": -2.690983533859253,
"logits/rejected": -2.6975481510162354,
"logps/chosen": -342.63079833984375,
"logps/rejected": -407.81231689453125,
"loss": 0.4475,
"rewards/accuracies": 0.653124988079071,
"rewards/chosen": -0.8637169599533081,
"rewards/margins": 0.7765380144119263,
"rewards/rejected": -1.6402549743652344,
"step": 990
},
{
"epoch": 0.52,
"grad_norm": 1.7109375,
"learning_rate": 2.730403283530767e-06,
"logits/chosen": -2.6636836528778076,
"logits/rejected": -2.661403179168701,
"logps/chosen": -344.44744873046875,
"logps/rejected": -413.1937561035156,
"loss": 0.4248,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": -0.7091799974441528,
"rewards/margins": 1.0423099994659424,
"rewards/rejected": -1.7514899969100952,
"step": 1000
},
{
"epoch": 0.53,
"grad_norm": 4.21875,
"learning_rate": 2.6848725415297888e-06,
"logits/chosen": -2.6942477226257324,
"logits/rejected": -2.7120838165283203,
"logps/chosen": -336.60760498046875,
"logps/rejected": -421.9783630371094,
"loss": 0.4339,
"rewards/accuracies": 0.7093750238418579,
"rewards/chosen": -0.7389696836471558,
"rewards/margins": 0.9657734036445618,
"rewards/rejected": -1.7047427892684937,
"step": 1010
},
{
"epoch": 0.53,
"grad_norm": 1.140625,
"learning_rate": 2.639280053601719e-06,
"logits/chosen": -2.700098991394043,
"logits/rejected": -2.7320022583007812,
"logps/chosen": -346.1884460449219,
"logps/rejected": -410.5491638183594,
"loss": 0.4324,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.5768822431564331,
"rewards/margins": 0.9592208862304688,
"rewards/rejected": -1.5361031293869019,
"step": 1020
},
{
"epoch": 0.54,
"grad_norm": 3.046875,
"learning_rate": 2.59364104726716e-06,
"logits/chosen": -2.7298641204833984,
"logits/rejected": -2.73411226272583,
"logps/chosen": -346.91912841796875,
"logps/rejected": -417.67034912109375,
"loss": 0.43,
"rewards/accuracies": 0.7281249761581421,
"rewards/chosen": -0.7697745561599731,
"rewards/margins": 1.0892785787582397,
"rewards/rejected": -1.8590532541275024,
"step": 1030
},
{
"epoch": 0.54,
"grad_norm": 2.609375,
"learning_rate": 2.547970765583491e-06,
"logits/chosen": -2.716035842895508,
"logits/rejected": -2.702650547027588,
"logps/chosen": -330.34124755859375,
"logps/rejected": -416.9476623535156,
"loss": 0.4345,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.7259209156036377,
"rewards/margins": 0.974997878074646,
"rewards/rejected": -1.7009187936782837,
"step": 1040
},
{
"epoch": 0.55,
"grad_norm": 1.390625,
"learning_rate": 2.502284462053799e-06,
"logits/chosen": -2.656066656112671,
"logits/rejected": -2.6666572093963623,
"logps/chosen": -331.7149658203125,
"logps/rejected": -423.1683654785156,
"loss": 0.4287,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.5932595729827881,
"rewards/margins": 1.035936713218689,
"rewards/rejected": -1.6291964054107666,
"step": 1050
},
{
"epoch": 0.55,
"grad_norm": 1.9296875,
"learning_rate": 2.456597395532338e-06,
"logits/chosen": -2.7209274768829346,
"logits/rejected": -2.735020399093628,
"logps/chosen": -328.7770080566406,
"logps/rejected": -391.19354248046875,
"loss": 0.4294,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6377438306808472,
"rewards/margins": 0.9876799583435059,
"rewards/rejected": -1.625423789024353,
"step": 1060
},
{
"epoch": 0.56,
"grad_norm": 3.515625,
"learning_rate": 2.4109248251281953e-06,
"logits/chosen": -2.7338156700134277,
"logits/rejected": -2.7391562461853027,
"logps/chosen": -343.69390869140625,
"logps/rejected": -414.94805908203125,
"loss": 0.432,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6750501990318298,
"rewards/margins": 1.0820457935333252,
"rewards/rejected": -1.7570960521697998,
"step": 1070
},
{
"epoch": 0.57,
"grad_norm": 0.99609375,
"learning_rate": 2.365282005108875e-06,
"logits/chosen": -2.7210652828216553,
"logits/rejected": -2.7339107990264893,
"logps/chosen": -335.1302490234375,
"logps/rejected": -390.1886901855469,
"loss": 0.4303,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6980403661727905,
"rewards/margins": 1.0391963720321655,
"rewards/rejected": -1.737236738204956,
"step": 1080
},
{
"epoch": 0.57,
"grad_norm": 2.921875,
"learning_rate": 2.319684179805491e-06,
"logits/chosen": -2.6966023445129395,
"logits/rejected": -2.7194454669952393,
"logps/chosen": -343.26934814453125,
"logps/rejected": -400.3363037109375,
"loss": 0.4381,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8400837779045105,
"rewards/margins": 0.9520319104194641,
"rewards/rejected": -1.7921158075332642,
"step": 1090
},
{
"epoch": 0.58,
"grad_norm": 2.0,
"learning_rate": 2.2741465785212905e-06,
"logits/chosen": -2.7013275623321533,
"logits/rejected": -2.7162578105926514,
"logps/chosen": -336.082763671875,
"logps/rejected": -424.86895751953125,
"loss": 0.4383,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.7391124963760376,
"rewards/margins": 0.9916101694107056,
"rewards/rejected": -1.7307227849960327,
"step": 1100
},
{
"epoch": 0.58,
"grad_norm": 1.359375,
"learning_rate": 2.2286844104451848e-06,
"logits/chosen": -2.7336266040802,
"logits/rejected": -2.7385802268981934,
"logps/chosen": -368.2712707519531,
"logps/rejected": -442.3502502441406,
"loss": 0.4393,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": -0.8786946535110474,
"rewards/margins": 0.9058005213737488,
"rewards/rejected": -1.7844951152801514,
"step": 1110
},
{
"epoch": 0.59,
"grad_norm": 1.1328125,
"learning_rate": 2.183312859572008e-06,
"logits/chosen": -2.6661019325256348,
"logits/rejected": -2.687495708465576,
"logps/chosen": -360.59503173828125,
"logps/rejected": -445.728759765625,
"loss": 0.4259,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6851642727851868,
"rewards/margins": 1.106838345527649,
"rewards/rejected": -1.7920026779174805,
"step": 1120
},
{
"epoch": 0.59,
"grad_norm": 3.015625,
"learning_rate": 2.1380470796311843e-06,
"logits/chosen": -2.6705803871154785,
"logits/rejected": -2.668128252029419,
"logps/chosen": -339.4599609375,
"logps/rejected": -421.2923278808594,
"loss": 0.4272,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.642924964427948,
"rewards/margins": 1.065707802772522,
"rewards/rejected": -1.7086328268051147,
"step": 1130
},
{
"epoch": 0.6,
"grad_norm": 3.96875,
"learning_rate": 2.092902189025507e-06,
"logits/chosen": -2.6446332931518555,
"logits/rejected": -2.655050754547119,
"logps/chosen": -355.1614074707031,
"logps/rejected": -427.96636962890625,
"loss": 0.4265,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7822728157043457,
"rewards/margins": 1.1439166069030762,
"rewards/rejected": -1.9261894226074219,
"step": 1140
},
{
"epoch": 0.6,
"grad_norm": 0.98828125,
"learning_rate": 2.0478932657817105e-06,
"logits/chosen": -2.737699031829834,
"logits/rejected": -2.732815742492676,
"logps/chosen": -356.5775451660156,
"logps/rejected": -431.93609619140625,
"loss": 0.4282,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8207361102104187,
"rewards/margins": 1.0940120220184326,
"rewards/rejected": -1.914747953414917,
"step": 1150
},
{
"epoch": 0.61,
"grad_norm": 2.546875,
"learning_rate": 2.0030353425145376e-06,
"logits/chosen": -2.676217555999756,
"logits/rejected": -2.6970784664154053,
"logps/chosen": -318.20355224609375,
"logps/rejected": -386.79388427734375,
"loss": 0.4289,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.741962730884552,
"rewards/margins": 1.0068390369415283,
"rewards/rejected": -1.748801589012146,
"step": 1160
},
{
"epoch": 0.61,
"grad_norm": 0.7890625,
"learning_rate": 1.958343401405964e-06,
"logits/chosen": -2.6743831634521484,
"logits/rejected": -2.6914896965026855,
"logps/chosen": -323.74053955078125,
"logps/rejected": -417.9308166503906,
"loss": 0.4318,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.6001055240631104,
"rewards/margins": 1.030330777168274,
"rewards/rejected": -1.6304363012313843,
"step": 1170
},
{
"epoch": 0.62,
"grad_norm": 5.53125,
"learning_rate": 1.9138323692012734e-06,
"logits/chosen": -2.719978094100952,
"logits/rejected": -2.7362709045410156,
"logps/chosen": -316.5953063964844,
"logps/rejected": -410.26593017578125,
"loss": 0.4226,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5119751691818237,
"rewards/margins": 1.1661722660064697,
"rewards/rejected": -1.678147554397583,
"step": 1180
},
{
"epoch": 0.62,
"grad_norm": 1.7890625,
"learning_rate": 1.8695171122236443e-06,
"logits/chosen": -2.6844494342803955,
"logits/rejected": -2.6935813426971436,
"logps/chosen": -319.3164367675781,
"logps/rejected": -416.75616455078125,
"loss": 0.4208,
"rewards/accuracies": 0.7718750238418579,
"rewards/chosen": -0.5857919454574585,
"rewards/margins": 1.1580404043197632,
"rewards/rejected": -1.7438323497772217,
"step": 1190
},
{
"epoch": 0.63,
"grad_norm": 3.0625,
"learning_rate": 1.8254124314089225e-06,
"logits/chosen": -2.755115509033203,
"logits/rejected": -2.7330613136291504,
"logps/chosen": -322.7129821777344,
"logps/rejected": -407.7967529296875,
"loss": 0.4345,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.4961959719657898,
"rewards/margins": 1.078574776649475,
"rewards/rejected": -1.5747709274291992,
"step": 1200
},
{
"epoch": 0.63,
"grad_norm": 1.9375,
"learning_rate": 1.781533057362221e-06,
"logits/chosen": -2.746798038482666,
"logits/rejected": -2.7739720344543457,
"logps/chosen": -299.58160400390625,
"logps/rejected": -374.47930908203125,
"loss": 0.4296,
"rewards/accuracies": 0.7281249761581421,
"rewards/chosen": -0.4686378538608551,
"rewards/margins": 0.9811908602714539,
"rewards/rejected": -1.4498287439346313,
"step": 1210
},
{
"epoch": 0.64,
"grad_norm": 3.296875,
"learning_rate": 1.7378936454380277e-06,
"logits/chosen": -2.7310540676116943,
"logits/rejected": -2.7376887798309326,
"logps/chosen": -327.89410400390625,
"logps/rejected": -404.711669921875,
"loss": 0.4279,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5759900808334351,
"rewards/margins": 1.1363575458526611,
"rewards/rejected": -1.7123476266860962,
"step": 1220
},
{
"epoch": 0.64,
"grad_norm": 1.5390625,
"learning_rate": 1.6945087708454273e-06,
"logits/chosen": -2.6734468936920166,
"logits/rejected": -2.714264392852783,
"logps/chosen": -343.795654296875,
"logps/rejected": -427.343505859375,
"loss": 0.4219,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.5730609893798828,
"rewards/margins": 1.2771522998809814,
"rewards/rejected": -1.8502132892608643,
"step": 1230
},
{
"epoch": 0.65,
"grad_norm": 1.8125,
"learning_rate": 1.651392923780105e-06,
"logits/chosen": -2.6670920848846436,
"logits/rejected": -2.6546778678894043,
"logps/chosen": -322.3703918457031,
"logps/rejected": -434.3629455566406,
"loss": 0.4254,
"rewards/accuracies": 0.7406250238418579,
"rewards/chosen": -0.6311038136482239,
"rewards/margins": 1.2766520977020264,
"rewards/rejected": -1.9077558517456055,
"step": 1240
},
{
"epoch": 0.65,
"grad_norm": 2.59375,
"learning_rate": 1.608560504584737e-06,
"logits/chosen": -2.737743377685547,
"logits/rejected": -2.720360517501831,
"logps/chosen": -334.63275146484375,
"logps/rejected": -431.16827392578125,
"loss": 0.4287,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8308721780776978,
"rewards/margins": 1.0826470851898193,
"rewards/rejected": -1.913519263267517,
"step": 1250
},
{
"epoch": 0.66,
"grad_norm": 5.21875,
"learning_rate": 1.5660258189393945e-06,
"logits/chosen": -2.7399215698242188,
"logits/rejected": -2.750929117202759,
"logps/chosen": -338.90924072265625,
"logps/rejected": -417.3045959472656,
"loss": 0.4214,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.5204821825027466,
"rewards/margins": 1.254575252532959,
"rewards/rejected": -1.7750571966171265,
"step": 1260
},
{
"epoch": 0.66,
"grad_norm": 5.96875,
"learning_rate": 1.5238030730835578e-06,
"logits/chosen": -2.716184616088867,
"logits/rejected": -2.690531015396118,
"logps/chosen": -311.8260192871094,
"logps/rejected": -440.0191345214844,
"loss": 0.4252,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.6099028587341309,
"rewards/margins": 1.3696248531341553,
"rewards/rejected": -1.979527473449707,
"step": 1270
},
{
"epoch": 0.67,
"grad_norm": 8.5625,
"learning_rate": 1.4819063690713565e-06,
"logits/chosen": -2.723191022872925,
"logits/rejected": -2.699627637863159,
"logps/chosen": -343.42608642578125,
"logps/rejected": -442.84033203125,
"loss": 0.424,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.7352994680404663,
"rewards/margins": 1.2973926067352295,
"rewards/rejected": -2.0326919555664062,
"step": 1280
},
{
"epoch": 0.68,
"grad_norm": 1.3359375,
"learning_rate": 1.4403497000615885e-06,
"logits/chosen": -2.709167242050171,
"logits/rejected": -2.711165189743042,
"logps/chosen": -346.2407531738281,
"logps/rejected": -453.6136779785156,
"loss": 0.4365,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9276860952377319,
"rewards/margins": 1.1097049713134766,
"rewards/rejected": -2.037391185760498,
"step": 1290
},
{
"epoch": 0.68,
"grad_norm": 3.109375,
"learning_rate": 1.3991469456441273e-06,
"logits/chosen": -2.6968870162963867,
"logits/rejected": -2.7066521644592285,
"logps/chosen": -349.227294921875,
"logps/rejected": -449.37274169921875,
"loss": 0.4264,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": -0.8388339281082153,
"rewards/margins": 1.244845986366272,
"rewards/rejected": -2.083679676055908,
"step": 1300
},
{
"epoch": 0.69,
"grad_norm": 1.90625,
"learning_rate": 1.3583118672042441e-06,
"logits/chosen": -2.682274580001831,
"logits/rejected": -2.689542055130005,
"logps/chosen": -358.6896057128906,
"logps/rejected": -461.7701721191406,
"loss": 0.4298,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8582962155342102,
"rewards/margins": 1.2384599447250366,
"rewards/rejected": -2.0967559814453125,
"step": 1310
},
{
"epoch": 0.69,
"grad_norm": 2.734375,
"learning_rate": 1.3178581033264218e-06,
"logits/chosen": -2.7299036979675293,
"logits/rejected": -2.725554943084717,
"logps/chosen": -337.6006774902344,
"logps/rejected": -427.9097595214844,
"loss": 0.4254,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8014458417892456,
"rewards/margins": 1.1019701957702637,
"rewards/rejected": -1.9034160375595093,
"step": 1320
},
{
"epoch": 0.7,
"grad_norm": 5.71875,
"learning_rate": 1.2777991652391757e-06,
"logits/chosen": -2.7372395992279053,
"logits/rejected": -2.713787078857422,
"logps/chosen": -349.9215087890625,
"logps/rejected": -458.30218505859375,
"loss": 0.4146,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.605326235294342,
"rewards/margins": 1.2629462480545044,
"rewards/rejected": -1.8682724237442017,
"step": 1330
},
{
"epoch": 0.7,
"grad_norm": 2.796875,
"learning_rate": 1.2381484323024178e-06,
"logits/chosen": -2.710744619369507,
"logits/rejected": -2.7276930809020996,
"logps/chosen": -340.2386779785156,
"logps/rejected": -410.47406005859375,
"loss": 0.4286,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.6984527111053467,
"rewards/margins": 0.98872309923172,
"rewards/rejected": -1.687175989151001,
"step": 1340
},
{
"epoch": 0.71,
"grad_norm": 4.375,
"learning_rate": 1.1989191475388518e-06,
"logits/chosen": -2.7345008850097656,
"logits/rejected": -2.737495183944702,
"logps/chosen": -341.25408935546875,
"logps/rejected": -463.845703125,
"loss": 0.423,
"rewards/accuracies": 0.7281249761581421,
"rewards/chosen": -0.720673680305481,
"rewards/margins": 1.3686655759811401,
"rewards/rejected": -2.089339256286621,
"step": 1350
},
{
"epoch": 0.71,
"grad_norm": 1.484375,
"learning_rate": 1.160124413210918e-06,
"logits/chosen": -2.710549831390381,
"logits/rejected": -2.7327733039855957,
"logps/chosen": -357.73516845703125,
"logps/rejected": -460.0575256347656,
"loss": 0.4351,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": -0.7213457822799683,
"rewards/margins": 1.1925950050354004,
"rewards/rejected": -1.9139407873153687,
"step": 1360
},
{
"epoch": 0.72,
"grad_norm": 5.8125,
"learning_rate": 1.1217771864447396e-06,
"logits/chosen": -2.745375871658325,
"logits/rejected": -2.734910488128662,
"logps/chosen": -334.57159423828125,
"logps/rejected": -431.4033203125,
"loss": 0.4065,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.6131495833396912,
"rewards/margins": 1.3364530801773071,
"rewards/rejected": -1.949602484703064,
"step": 1370
},
{
"epoch": 0.72,
"grad_norm": 3.5,
"learning_rate": 1.08389027490255e-06,
"logits/chosen": -2.7352352142333984,
"logits/rejected": -2.731990098953247,
"logps/chosen": -318.87939453125,
"logps/rejected": -447.78167724609375,
"loss": 0.4207,
"rewards/accuracies": 0.746874988079071,
"rewards/chosen": -0.6276998519897461,
"rewards/margins": 1.426599383354187,
"rewards/rejected": -2.0542993545532227,
"step": 1380
},
{
"epoch": 0.73,
"grad_norm": 3.765625,
"learning_rate": 1.046476332505036e-06,
"logits/chosen": -2.7105917930603027,
"logits/rejected": -2.7000508308410645,
"logps/chosen": -351.5965881347656,
"logps/rejected": -464.24041748046875,
"loss": 0.4229,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": -0.666134238243103,
"rewards/margins": 1.3925716876983643,
"rewards/rejected": -2.0587058067321777,
"step": 1390
},
{
"epoch": 0.73,
"grad_norm": 1.78125,
"learning_rate": 1.0095478552050348e-06,
"logits/chosen": -2.7293667793273926,
"logits/rejected": -2.748169422149658,
"logps/chosen": -347.258056640625,
"logps/rejected": -454.30126953125,
"loss": 0.428,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7874716520309448,
"rewards/margins": 1.2901605367660522,
"rewards/rejected": -2.077632188796997,
"step": 1400
},
{
"epoch": 0.74,
"grad_norm": 3.6875,
"learning_rate": 9.731171768139808e-07,
"logits/chosen": -2.747467279434204,
"logits/rejected": -2.7329678535461426,
"logps/chosen": -328.9295959472656,
"logps/rejected": -426.88812255859375,
"loss": 0.4343,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.7400668859481812,
"rewards/margins": 1.056199073791504,
"rewards/rejected": -1.796265959739685,
"step": 1410
},
{
"epoch": 0.74,
"grad_norm": 2.40625,
"learning_rate": 9.371964648825221e-07,
"logits/chosen": -2.7056431770324707,
"logits/rejected": -2.711988925933838,
"logps/chosen": -358.60186767578125,
"logps/rejected": -416.0870056152344,
"loss": 0.4388,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9142545461654663,
"rewards/margins": 0.8788741827011108,
"rewards/rejected": -1.7931289672851562,
"step": 1420
},
{
"epoch": 0.75,
"grad_norm": 3.09375,
"learning_rate": 9.017977166366445e-07,
"logits/chosen": -2.712580919265747,
"logits/rejected": -2.7144558429718018,
"logps/chosen": -340.9812316894531,
"logps/rejected": -484.244873046875,
"loss": 0.4154,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.7170324921607971,
"rewards/margins": 1.5092626810073853,
"rewards/rejected": -2.2262954711914062,
"step": 1430
},
{
"epoch": 0.75,
"grad_norm": 4.5625,
"learning_rate": 8.669327549707096e-07,
"logits/chosen": -2.7783093452453613,
"logits/rejected": -2.782752513885498,
"logps/chosen": -336.6181335449219,
"logps/rejected": -427.4580078125,
"loss": 0.4337,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.7095115184783936,
"rewards/margins": 1.0761725902557373,
"rewards/rejected": -1.7856842279434204,
"step": 1440
},
{
"epoch": 0.76,
"grad_norm": 6.625,
"learning_rate": 8.326132244986932e-07,
"logits/chosen": -2.7186310291290283,
"logits/rejected": -2.7277238368988037,
"logps/chosen": -330.8462219238281,
"logps/rejected": -445.0597229003906,
"loss": 0.4328,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.7316503524780273,
"rewards/margins": 1.2994225025177002,
"rewards/rejected": -2.0310728549957275,
"step": 1450
},
{
"epoch": 0.76,
"grad_norm": 6.40625,
"learning_rate": 7.988505876649863e-07,
"logits/chosen": -2.673827648162842,
"logits/rejected": -2.6909382343292236,
"logps/chosen": -328.7157287597656,
"logps/rejected": -426.40997314453125,
"loss": 0.4208,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.6509988307952881,
"rewards/margins": 1.171143651008606,
"rewards/rejected": -1.8221423625946045,
"step": 1460
},
{
"epoch": 0.77,
"grad_norm": 5.03125,
"learning_rate": 7.656561209160248e-07,
"logits/chosen": -2.688586711883545,
"logits/rejected": -2.6996278762817383,
"logps/chosen": -335.6667785644531,
"logps/rejected": -412.533447265625,
"loss": 0.4229,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6491155624389648,
"rewards/margins": 1.1151740550994873,
"rewards/rejected": -1.7642898559570312,
"step": 1470
},
{
"epoch": 0.77,
"grad_norm": 5.71875,
"learning_rate": 7.330409109340563e-07,
"logits/chosen": -2.7187418937683105,
"logits/rejected": -2.713865041732788,
"logps/chosen": -330.7655334472656,
"logps/rejected": -445.03485107421875,
"loss": 0.421,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.6798223257064819,
"rewards/margins": 1.387519121170044,
"rewards/rejected": -2.0673413276672363,
"step": 1480
},
{
"epoch": 0.78,
"grad_norm": 4.21875,
"learning_rate": 7.010158509342682e-07,
"logits/chosen": -2.705104351043701,
"logits/rejected": -2.7204995155334473,
"logps/chosen": -348.8349304199219,
"logps/rejected": -465.5157775878906,
"loss": 0.4239,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7643205523490906,
"rewards/margins": 1.3054136037826538,
"rewards/rejected": -2.0697340965270996,
"step": 1490
},
{
"epoch": 0.79,
"grad_norm": 3.25,
"learning_rate": 6.695916370265529e-07,
"logits/chosen": -2.7456018924713135,
"logits/rejected": -2.75111722946167,
"logps/chosen": -298.90582275390625,
"logps/rejected": -406.1940002441406,
"loss": 0.4243,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.6092133522033691,
"rewards/margins": 1.119093894958496,
"rewards/rejected": -1.7283073663711548,
"step": 1500
},
{
"epoch": 0.79,
"grad_norm": 3.75,
"learning_rate": 6.387787646430854e-07,
"logits/chosen": -2.7123939990997314,
"logits/rejected": -2.7206320762634277,
"logps/chosen": -311.45892333984375,
"logps/rejected": -402.0213317871094,
"loss": 0.4307,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5885223746299744,
"rewards/margins": 1.1126749515533447,
"rewards/rejected": -1.7011972665786743,
"step": 1510
},
{
"epoch": 0.8,
"grad_norm": 2.28125,
"learning_rate": 6.085875250329401e-07,
"logits/chosen": -2.7589831352233887,
"logits/rejected": -2.747678756713867,
"logps/chosen": -326.8139953613281,
"logps/rejected": -408.4124450683594,
"loss": 0.4281,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.6763142347335815,
"rewards/margins": 1.2141085863113403,
"rewards/rejected": -1.8904228210449219,
"step": 1520
},
{
"epoch": 0.8,
"grad_norm": 4.4375,
"learning_rate": 5.79028001824894e-07,
"logits/chosen": -2.676466941833496,
"logits/rejected": -2.6653060913085938,
"logps/chosen": -302.88861083984375,
"logps/rejected": -453.6151428222656,
"loss": 0.4205,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.5426191687583923,
"rewards/margins": 1.233906626701355,
"rewards/rejected": -1.776525855064392,
"step": 1530
},
{
"epoch": 0.81,
"grad_norm": 2.265625,
"learning_rate": 5.501100676595761e-07,
"logits/chosen": -2.7586090564727783,
"logits/rejected": -2.7702584266662598,
"logps/chosen": -345.1263732910156,
"logps/rejected": -439.22808837890625,
"loss": 0.4249,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6815992593765259,
"rewards/margins": 1.174318790435791,
"rewards/rejected": -1.8559181690216064,
"step": 1540
},
{
"epoch": 0.81,
"grad_norm": 3.3125,
"learning_rate": 5.218433808920884e-07,
"logits/chosen": -2.741339921951294,
"logits/rejected": -2.7451634407043457,
"logps/chosen": -329.6106262207031,
"logps/rejected": -418.102294921875,
"loss": 0.418,
"rewards/accuracies": 0.778124988079071,
"rewards/chosen": -0.48116618394851685,
"rewards/margins": 1.2408123016357422,
"rewards/rejected": -1.7219784259796143,
"step": 1550
},
{
"epoch": 0.82,
"grad_norm": 2.78125,
"learning_rate": 4.942373823661928e-07,
"logits/chosen": -2.7156219482421875,
"logits/rejected": -2.7195382118225098,
"logps/chosen": -338.1180114746094,
"logps/rejected": -429.4137268066406,
"loss": 0.4242,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.5551623702049255,
"rewards/margins": 1.1304242610931396,
"rewards/rejected": -1.6855865716934204,
"step": 1560
},
{
"epoch": 0.82,
"grad_norm": 0.64453125,
"learning_rate": 4.6730129226114363e-07,
"logits/chosen": -2.74899959564209,
"logits/rejected": -2.7066941261291504,
"logps/chosen": -316.0484619140625,
"logps/rejected": -443.41156005859375,
"loss": 0.4244,
"rewards/accuracies": 0.7093750238418579,
"rewards/chosen": -0.6658861637115479,
"rewards/margins": 1.3104979991912842,
"rewards/rejected": -1.976383924484253,
"step": 1570
},
{
"epoch": 0.83,
"grad_norm": 3.03125,
"learning_rate": 4.4104410701222703e-07,
"logits/chosen": -2.764725923538208,
"logits/rejected": -2.7511062622070312,
"logps/chosen": -345.6468200683594,
"logps/rejected": -458.3433532714844,
"loss": 0.4297,
"rewards/accuracies": 0.684374988079071,
"rewards/chosen": -0.7146095037460327,
"rewards/margins": 1.2348105907440186,
"rewards/rejected": -1.9494202136993408,
"step": 1580
},
{
"epoch": 0.83,
"grad_norm": 2.609375,
"learning_rate": 4.154745963060197e-07,
"logits/chosen": -2.690899610519409,
"logits/rejected": -2.68369460105896,
"logps/chosen": -327.4097900390625,
"logps/rejected": -445.50457763671875,
"loss": 0.4267,
"rewards/accuracies": 0.7093750238418579,
"rewards/chosen": -0.7736159563064575,
"rewards/margins": 1.229644775390625,
"rewards/rejected": -2.003260850906372,
"step": 1590
},
{
"epoch": 0.84,
"grad_norm": 2.640625,
"learning_rate": 3.9060130015138863e-07,
"logits/chosen": -2.7254629135131836,
"logits/rejected": -2.7183403968811035,
"logps/chosen": -338.1430358886719,
"logps/rejected": -464.69049072265625,
"loss": 0.4245,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.7131890058517456,
"rewards/margins": 1.2739613056182861,
"rewards/rejected": -1.987149953842163,
"step": 1600
},
{
"epoch": 0.84,
"grad_norm": 1.015625,
"learning_rate": 3.664325260271953e-07,
"logits/chosen": -2.7088608741760254,
"logits/rejected": -2.730421304702759,
"logps/chosen": -357.98126220703125,
"logps/rejected": -414.66668701171875,
"loss": 0.4309,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.7439313530921936,
"rewards/margins": 0.9089797139167786,
"rewards/rejected": -1.652910828590393,
"step": 1610
},
{
"epoch": 0.85,
"grad_norm": 4.28125,
"learning_rate": 3.429763461076677e-07,
"logits/chosen": -2.7392077445983887,
"logits/rejected": -2.7532036304473877,
"logps/chosen": -340.41436767578125,
"logps/rejected": -411.6546936035156,
"loss": 0.4351,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": -0.7221731543540955,
"rewards/margins": 0.9596914052963257,
"rewards/rejected": -1.6818645000457764,
"step": 1620
},
{
"epoch": 0.85,
"grad_norm": 6.1875,
"learning_rate": 3.202405945663556e-07,
"logits/chosen": -2.739431858062744,
"logits/rejected": -2.7153592109680176,
"logps/chosen": -351.1690368652344,
"logps/rejected": -460.6399841308594,
"loss": 0.4206,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.7035288214683533,
"rewards/margins": 1.3128876686096191,
"rewards/rejected": -2.016416549682617,
"step": 1630
},
{
"epoch": 0.86,
"grad_norm": 4.34375,
"learning_rate": 2.982328649595856e-07,
"logits/chosen": -2.7347230911254883,
"logits/rejected": -2.7274715900421143,
"logps/chosen": -338.5975646972656,
"logps/rejected": -435.02972412109375,
"loss": 0.4329,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.722823441028595,
"rewards/margins": 1.025390625,
"rewards/rejected": -1.7482140064239502,
"step": 1640
},
{
"epoch": 0.86,
"grad_norm": 2.75,
"learning_rate": 2.7696050769026954e-07,
"logits/chosen": -2.7211785316467285,
"logits/rejected": -2.684309720993042,
"logps/chosen": -344.1480407714844,
"logps/rejected": -481.3314514160156,
"loss": 0.4299,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.7990375757217407,
"rewards/margins": 1.399259328842163,
"rewards/rejected": -2.1982970237731934,
"step": 1650
},
{
"epoch": 0.87,
"grad_norm": 2.015625,
"learning_rate": 2.564306275529341e-07,
"logits/chosen": -2.7370145320892334,
"logits/rejected": -2.7407755851745605,
"logps/chosen": -321.891357421875,
"logps/rejected": -401.0546569824219,
"loss": 0.4248,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.7101179957389832,
"rewards/margins": 1.1242733001708984,
"rewards/rejected": -1.8343912363052368,
"step": 1660
},
{
"epoch": 0.87,
"grad_norm": 3.671875,
"learning_rate": 2.3665008136077332e-07,
"logits/chosen": -2.7449889183044434,
"logits/rejected": -2.7408945560455322,
"logps/chosen": -360.5602722167969,
"logps/rejected": -467.7608337402344,
"loss": 0.4302,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.823918342590332,
"rewards/margins": 1.2585489749908447,
"rewards/rejected": -2.0824673175811768,
"step": 1670
},
{
"epoch": 0.88,
"grad_norm": 4.25,
"learning_rate": 2.1762547565553293e-07,
"logits/chosen": -2.6576642990112305,
"logits/rejected": -2.6262636184692383,
"logps/chosen": -352.53857421875,
"logps/rejected": -485.3980407714844,
"loss": 0.419,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.7759647965431213,
"rewards/margins": 1.4105063676834106,
"rewards/rejected": -2.1864712238311768,
"step": 1680
},
{
"epoch": 0.88,
"grad_norm": 1.2578125,
"learning_rate": 1.993631645009747e-07,
"logits/chosen": -2.70914888381958,
"logits/rejected": -2.6992902755737305,
"logps/chosen": -348.275390625,
"logps/rejected": -432.3793029785156,
"loss": 0.4294,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.9621282815933228,
"rewards/margins": 1.0199404954910278,
"rewards/rejected": -1.982068657875061,
"step": 1690
},
{
"epoch": 0.89,
"grad_norm": 4.125,
"learning_rate": 1.818692473606748e-07,
"logits/chosen": -2.749687671661377,
"logits/rejected": -2.743114948272705,
"logps/chosen": -356.88311767578125,
"logps/rejected": -437.4881896972656,
"loss": 0.4101,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.5851279497146606,
"rewards/margins": 1.345589280128479,
"rewards/rejected": -1.93071711063385,
"step": 1700
},
{
"epoch": 0.9,
"grad_norm": 3.015625,
"learning_rate": 1.6514956706084885e-07,
"logits/chosen": -2.731550931930542,
"logits/rejected": -2.7247581481933594,
"logps/chosen": -367.1907653808594,
"logps/rejected": -460.9814453125,
"loss": 0.4151,
"rewards/accuracies": 0.746874988079071,
"rewards/chosen": -0.8159070014953613,
"rewards/margins": 1.3104612827301025,
"rewards/rejected": -2.1263680458068848,
"step": 1710
},
{
"epoch": 0.9,
"grad_norm": 1.921875,
"learning_rate": 1.4920970783889737e-07,
"logits/chosen": -2.728755235671997,
"logits/rejected": -2.716214895248413,
"logps/chosen": -326.6175842285156,
"logps/rejected": -425.10528564453125,
"loss": 0.4179,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5755528807640076,
"rewards/margins": 1.2382352352142334,
"rewards/rejected": -1.8137880563735962,
"step": 1720
},
{
"epoch": 0.91,
"grad_norm": 1.921875,
"learning_rate": 1.340549934783164e-07,
"logits/chosen": -2.7430858612060547,
"logits/rejected": -2.7155654430389404,
"logps/chosen": -319.4033508300781,
"logps/rejected": -460.38494873046875,
"loss": 0.4219,
"rewards/accuracies": 0.7093750238418579,
"rewards/chosen": -0.5762430429458618,
"rewards/margins": 1.5360796451568604,
"rewards/rejected": -2.1123225688934326,
"step": 1730
},
{
"epoch": 0.91,
"grad_norm": 0.8046875,
"learning_rate": 1.196904855305961e-07,
"logits/chosen": -2.7446229457855225,
"logits/rejected": -2.741819143295288,
"logps/chosen": -347.061279296875,
"logps/rejected": -454.8154296875,
"loss": 0.4227,
"rewards/accuracies": 0.690625011920929,
"rewards/chosen": -0.7210251688957214,
"rewards/margins": 1.3714605569839478,
"rewards/rejected": -2.0924859046936035,
"step": 1740
},
{
"epoch": 0.92,
"grad_norm": 6.78125,
"learning_rate": 1.0612098162470302e-07,
"logits/chosen": -2.754495143890381,
"logits/rejected": -2.770519733428955,
"logps/chosen": -356.2037658691406,
"logps/rejected": -428.56915283203125,
"loss": 0.4204,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6976330876350403,
"rewards/margins": 1.1086335182189941,
"rewards/rejected": -1.8062665462493896,
"step": 1750
},
{
"epoch": 0.92,
"grad_norm": 1.703125,
"learning_rate": 9.335101386471285e-08,
"logits/chosen": -2.7438502311706543,
"logits/rejected": -2.732393741607666,
"logps/chosen": -325.47747802734375,
"logps/rejected": -466.6300354003906,
"loss": 0.4096,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.5821502208709717,
"rewards/margins": 1.489473819732666,
"rewards/rejected": -2.0716240406036377,
"step": 1760
},
{
"epoch": 0.93,
"grad_norm": 1.125,
"learning_rate": 8.138484731612273e-08,
"logits/chosen": -2.6938061714172363,
"logits/rejected": -2.6772360801696777,
"logps/chosen": -339.33856201171875,
"logps/rejected": -459.6874084472656,
"loss": 0.4259,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": -0.7721551060676575,
"rewards/margins": 1.2288380861282349,
"rewards/rejected": -2.000993251800537,
"step": 1770
},
{
"epoch": 0.93,
"grad_norm": 5.15625,
"learning_rate": 7.022647858135501e-08,
"logits/chosen": -2.744938850402832,
"logits/rejected": -2.7732322216033936,
"logps/chosen": -332.72509765625,
"logps/rejected": -421.79949951171875,
"loss": 0.4295,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.7302526235580444,
"rewards/margins": 1.0942634344100952,
"rewards/rejected": -1.82451593875885,
"step": 1780
},
{
"epoch": 0.94,
"grad_norm": 1.7734375,
"learning_rate": 5.987963446492384e-08,
"logits/chosen": -2.6715445518493652,
"logits/rejected": -2.6765220165252686,
"logps/chosen": -348.5739440917969,
"logps/rejected": -482.542236328125,
"loss": 0.4117,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.6485568284988403,
"rewards/margins": 1.4271323680877686,
"rewards/rejected": -2.0756890773773193,
"step": 1790
},
{
"epoch": 0.94,
"grad_norm": 4.46875,
"learning_rate": 5.034777072871394e-08,
"logits/chosen": -2.7057948112487793,
"logits/rejected": -2.7085628509521484,
"logps/chosen": -318.53497314453125,
"logps/rejected": -424.2813415527344,
"loss": 0.4289,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7039138674736023,
"rewards/margins": 1.1032226085662842,
"rewards/rejected": -1.8071365356445312,
"step": 1800
},
{
"epoch": 0.95,
"grad_norm": 2.5625,
"learning_rate": 4.163407093778243e-08,
"logits/chosen": -2.6782615184783936,
"logits/rejected": -2.6741080284118652,
"logps/chosen": -333.125732421875,
"logps/rejected": -420.4185485839844,
"loss": 0.4345,
"rewards/accuracies": 0.6968749761581421,
"rewards/chosen": -0.8739233016967773,
"rewards/margins": 1.0754514932632446,
"rewards/rejected": -1.949374794960022,
"step": 1810
},
{
"epoch": 0.95,
"grad_norm": 2.3125,
"learning_rate": 3.37414453970758e-08,
"logits/chosen": -2.742910146713257,
"logits/rejected": -2.7536118030548096,
"logps/chosen": -337.20489501953125,
"logps/rejected": -413.38238525390625,
"loss": 0.4246,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6338103413581848,
"rewards/margins": 1.2298805713653564,
"rewards/rejected": -1.863690972328186,
"step": 1820
},
{
"epoch": 0.96,
"grad_norm": 7.125,
"learning_rate": 2.6672530179410183e-08,
"logits/chosen": -2.7278337478637695,
"logits/rejected": -2.7276864051818848,
"logps/chosen": -324.9429626464844,
"logps/rejected": -436.3682556152344,
"loss": 0.4287,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.6213011145591736,
"rewards/margins": 1.2622824907302856,
"rewards/rejected": -1.883583426475525,
"step": 1830
},
{
"epoch": 0.96,
"grad_norm": 1.5859375,
"learning_rate": 2.04296862450451e-08,
"logits/chosen": -2.7341113090515137,
"logits/rejected": -2.7228341102600098,
"logps/chosen": -359.82659912109375,
"logps/rejected": -492.15771484375,
"loss": 0.4246,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8840007781982422,
"rewards/margins": 1.547300100326538,
"rewards/rejected": -2.431300640106201,
"step": 1840
},
{
"epoch": 0.97,
"grad_norm": 1.3046875,
"learning_rate": 1.501499865314171e-08,
"logits/chosen": -2.655301809310913,
"logits/rejected": -2.6794886589050293,
"logps/chosen": -379.09075927734375,
"logps/rejected": -465.2655334472656,
"loss": 0.4153,
"rewards/accuracies": 0.7593749761581421,
"rewards/chosen": -0.6638426184654236,
"rewards/margins": 1.2455824613571167,
"rewards/rejected": -1.909425139427185,
"step": 1850
},
{
"epoch": 0.97,
"grad_norm": 1.8046875,
"learning_rate": 1.0430275865371265e-08,
"logits/chosen": -2.700707197189331,
"logits/rejected": -2.6735973358154297,
"logps/chosen": -324.67864990234375,
"logps/rejected": -426.3045959472656,
"loss": 0.4269,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7671259641647339,
"rewards/margins": 1.2393693923950195,
"rewards/rejected": -2.006495475769043,
"step": 1860
},
{
"epoch": 0.98,
"grad_norm": 2.96875,
"learning_rate": 6.677049141901315e-09,
"logits/chosen": -2.714787006378174,
"logits/rejected": -2.729104518890381,
"logps/chosen": -332.42156982421875,
"logps/rejected": -465.78497314453125,
"loss": 0.4213,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6763681769371033,
"rewards/margins": 1.4941723346710205,
"rewards/rejected": -2.1705403327941895,
"step": 1870
},
{
"epoch": 0.98,
"grad_norm": 1.4140625,
"learning_rate": 3.756572029968708e-09,
"logits/chosen": -2.7444651126861572,
"logits/rejected": -2.756309986114502,
"logps/chosen": -348.5865783691406,
"logps/rejected": -453.53350830078125,
"loss": 0.4206,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7736285328865051,
"rewards/margins": 1.2819445133209229,
"rewards/rejected": -2.055572986602783,
"step": 1880
},
{
"epoch": 0.99,
"grad_norm": 3.234375,
"learning_rate": 1.6698199452053199e-09,
"logits/chosen": -2.7279868125915527,
"logits/rejected": -2.7107343673706055,
"logps/chosen": -339.8231506347656,
"logps/rejected": -435.91436767578125,
"loss": 0.4382,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6851938962936401,
"rewards/margins": 0.9626883268356323,
"rewards/rejected": -1.6478822231292725,
"step": 1890
},
{
"epoch": 0.99,
"grad_norm": 1.25,
"learning_rate": 4.1748984585560094e-10,
"logits/chosen": -2.7459967136383057,
"logits/rejected": -2.7115063667297363,
"logps/chosen": -353.6007385253906,
"logps/rejected": -478.2798767089844,
"loss": 0.4255,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.7070942521095276,
"rewards/margins": 1.4682670831680298,
"rewards/rejected": -2.175361156463623,
"step": 1900
},
{
"epoch": 1.0,
"grad_norm": 0.84765625,
"learning_rate": 0.0,
"logits/chosen": -2.7471261024475098,
"logits/rejected": -2.7326393127441406,
"logps/chosen": -346.9029846191406,
"logps/rejected": -450.1078186035156,
"loss": 0.4197,
"rewards/accuracies": 0.7281249761581421,
"rewards/chosen": -0.7211757302284241,
"rewards/margins": 1.336022973060608,
"rewards/rejected": -2.0571985244750977,
"step": 1910
},
{
"epoch": 1.0,
"step": 1910,
"total_flos": 0.0,
"train_loss": 0.4411179514611579,
"train_runtime": 83930.2059,
"train_samples_per_second": 0.728,
"train_steps_per_second": 0.023
}
],
"logging_steps": 10,
"max_steps": 1910,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}