chchen's picture
End of training
311e985 verified
{
"best_metric": 0.8629826903343201,
"best_model_checkpoint": "saves/Mistral-7B-Instruct-v0.2/lora/orpo-salt/checkpoint-1500",
"epoch": 2.9969690846635686,
"eval_steps": 500,
"global_step": 1854,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01616488179430188,
"grad_norm": 16.64879608154297,
"learning_rate": 4.999648198770648e-06,
"logits/chosen": -2.4989278316497803,
"logits/rejected": -2.5208303928375244,
"logps/chosen": -1.9139716625213623,
"logps/rejected": -3.1082823276519775,
"loss": 1.9977,
"odds_ratio_loss": 0.8370735049247742,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.19139717519283295,
"rewards/margins": 0.1194310411810875,
"rewards/rejected": -0.31082823872566223,
"sft_loss": 1.9139716625213623,
"step": 10
},
{
"epoch": 0.03232976358860376,
"grad_norm": 13.894062042236328,
"learning_rate": 4.998578646361359e-06,
"logits/chosen": -2.5156219005584717,
"logits/rejected": -2.51640248298645,
"logps/chosen": -1.635488748550415,
"logps/rejected": -2.132800817489624,
"loss": 1.7095,
"odds_ratio_loss": 0.7404953241348267,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.16354887187480927,
"rewards/margins": 0.04973122477531433,
"rewards/rejected": -0.2132801115512848,
"sft_loss": 1.635488748550415,
"step": 20
},
{
"epoch": 0.04849464538290564,
"grad_norm": 23.089773178100586,
"learning_rate": 4.996791614004449e-06,
"logits/chosen": -2.518998861312866,
"logits/rejected": -2.544835090637207,
"logps/chosen": -1.6531565189361572,
"logps/rejected": -2.541318893432617,
"loss": 1.7385,
"odds_ratio_loss": 0.8539272546768188,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.165315642952919,
"rewards/margins": 0.08881621062755585,
"rewards/rejected": -0.25413185358047485,
"sft_loss": 1.6531565189361572,
"step": 30
},
{
"epoch": 0.06465952717720752,
"grad_norm": 13.833389282226562,
"learning_rate": 4.994287614855618e-06,
"logits/chosen": -2.518852472305298,
"logits/rejected": -2.551032066345215,
"logps/chosen": -1.7646430730819702,
"logps/rejected": -2.508850574493408,
"loss": 1.8742,
"odds_ratio_loss": 1.0958486795425415,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.17646430432796478,
"rewards/margins": 0.07442077249288559,
"rewards/rejected": -0.2508850693702698,
"sft_loss": 1.7646430730819702,
"step": 40
},
{
"epoch": 0.0808244089715094,
"grad_norm": 28.34682846069336,
"learning_rate": 4.991067367951343e-06,
"logits/chosen": -2.5992355346679688,
"logits/rejected": -2.5891082286834717,
"logps/chosen": -1.345651388168335,
"logps/rejected": -2.2306911945343018,
"loss": 1.4115,
"odds_ratio_loss": 0.6583842039108276,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.13456514477729797,
"rewards/margins": 0.08850395679473877,
"rewards/rejected": -0.22306910157203674,
"sft_loss": 1.345651388168335,
"step": 50
},
{
"epoch": 0.09698929076581128,
"grad_norm": 3.4724316596984863,
"learning_rate": 4.987131798002389e-06,
"logits/chosen": -2.539771556854248,
"logits/rejected": -2.5456976890563965,
"logps/chosen": -1.3674490451812744,
"logps/rejected": -2.1061840057373047,
"loss": 1.4542,
"odds_ratio_loss": 0.8671566247940063,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.1367449164390564,
"rewards/margins": 0.07387349754571915,
"rewards/rejected": -0.21061840653419495,
"sft_loss": 1.3674490451812744,
"step": 60
},
{
"epoch": 0.11315417256011315,
"grad_norm": 46.33675003051758,
"learning_rate": 4.982482035128285e-06,
"logits/chosen": -2.5208637714385986,
"logits/rejected": -2.528776168823242,
"logps/chosen": -1.4248360395431519,
"logps/rejected": -2.067411184310913,
"loss": 1.5025,
"odds_ratio_loss": 0.7764666676521301,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.14248362183570862,
"rewards/margins": 0.06425751000642776,
"rewards/rejected": -0.2067411243915558,
"sft_loss": 1.4248360395431519,
"step": 70
},
{
"epoch": 0.12931905435441504,
"grad_norm": 25.993545532226562,
"learning_rate": 4.9771194145328e-06,
"logits/chosen": -2.5788090229034424,
"logits/rejected": -2.572688341140747,
"logps/chosen": -1.0824676752090454,
"logps/rejected": -1.7445621490478516,
"loss": 1.1449,
"odds_ratio_loss": 0.6242043972015381,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.10824675858020782,
"rewards/margins": 0.06620947271585464,
"rewards/rejected": -0.17445623874664307,
"sft_loss": 1.0824676752090454,
"step": 80
},
{
"epoch": 0.1454839361487169,
"grad_norm": 19.184228897094727,
"learning_rate": 4.971045476120532e-06,
"logits/chosen": -2.5863890647888184,
"logits/rejected": -2.591404914855957,
"logps/chosen": -1.080370306968689,
"logps/rejected": -1.753382682800293,
"loss": 1.1463,
"odds_ratio_loss": 0.6591774821281433,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.10803703963756561,
"rewards/margins": 0.06730123609304428,
"rewards/rejected": -0.1753382831811905,
"sft_loss": 1.080370306968689,
"step": 90
},
{
"epoch": 0.1616488179430188,
"grad_norm": 5.7092084884643555,
"learning_rate": 4.964261964054713e-06,
"logits/chosen": -2.5851123332977295,
"logits/rejected": -2.5928287506103516,
"logps/chosen": -1.20145583152771,
"logps/rejected": -1.920117735862732,
"loss": 1.2771,
"odds_ratio_loss": 0.7563266754150391,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.12014558166265488,
"rewards/margins": 0.07186620682477951,
"rewards/rejected": -0.1920117884874344,
"sft_loss": 1.20145583152771,
"step": 100
},
{
"epoch": 0.17781369973732067,
"grad_norm": 4.211212635040283,
"learning_rate": 4.956770826256372e-06,
"logits/chosen": -2.6192798614501953,
"logits/rejected": -2.6177656650543213,
"logps/chosen": -1.1085783243179321,
"logps/rejected": -1.4738147258758545,
"loss": 1.1766,
"odds_ratio_loss": 0.6805119514465332,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.11085782200098038,
"rewards/margins": 0.03652365505695343,
"rewards/rejected": -0.1473814696073532,
"sft_loss": 1.1085783243179321,
"step": 110
},
{
"epoch": 0.19397858153162256,
"grad_norm": 3.4872381687164307,
"learning_rate": 4.94857421384497e-06,
"logits/chosen": -2.602118968963623,
"logits/rejected": -2.6089630126953125,
"logps/chosen": -1.0341213941574097,
"logps/rejected": -1.5845638513565063,
"loss": 1.1041,
"odds_ratio_loss": 0.6995517611503601,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.10341213643550873,
"rewards/margins": 0.05504424497485161,
"rewards/rejected": -0.15845640003681183,
"sft_loss": 1.0341213941574097,
"step": 120
},
{
"epoch": 0.21014346332592443,
"grad_norm": 5.468324661254883,
"learning_rate": 4.939674480520701e-06,
"logits/chosen": -2.6128063201904297,
"logits/rejected": -2.6255507469177246,
"logps/chosen": -0.9619969129562378,
"logps/rejected": -1.390077829360962,
"loss": 1.0297,
"odds_ratio_loss": 0.6766607165336609,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.09619969129562378,
"rewards/margins": 0.04280809685587883,
"rewards/rejected": -0.1390077769756317,
"sft_loss": 0.9619969129562378,
"step": 130
},
{
"epoch": 0.2263083451202263,
"grad_norm": 5.18142032623291,
"learning_rate": 4.930074181888613e-06,
"logits/chosen": -2.6814427375793457,
"logits/rejected": -2.7020936012268066,
"logps/chosen": -0.9705274701118469,
"logps/rejected": -1.315450668334961,
"loss": 1.0341,
"odds_ratio_loss": 0.636103630065918,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.09705274552106857,
"rewards/margins": 0.03449232131242752,
"rewards/rejected": -0.1315450817346573,
"sft_loss": 0.9705274701118469,
"step": 140
},
{
"epoch": 0.2424732269145282,
"grad_norm": 1.4752620458602905,
"learning_rate": 4.91977607472475e-06,
"logits/chosen": -2.704951524734497,
"logits/rejected": -2.7246315479278564,
"logps/chosen": -1.0248619318008423,
"logps/rejected": -1.4426223039627075,
"loss": 1.0895,
"odds_ratio_loss": 0.6460444331169128,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.10248619318008423,
"rewards/margins": 0.04177603870630264,
"rewards/rejected": -0.14426222443580627,
"sft_loss": 1.0248619318008423,
"step": 150
},
{
"epoch": 0.2586381087088301,
"grad_norm": 2.9540135860443115,
"learning_rate": 4.908783116184534e-06,
"logits/chosen": -2.671297550201416,
"logits/rejected": -2.676952838897705,
"logps/chosen": -0.9303582906723022,
"logps/rejected": -1.28878653049469,
"loss": 0.991,
"odds_ratio_loss": 0.6061214208602905,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.09303583949804306,
"rewards/margins": 0.03584280610084534,
"rewards/rejected": -0.1288786381483078,
"sft_loss": 0.9303582906723022,
"step": 160
},
{
"epoch": 0.27480299050313195,
"grad_norm": 2.913118839263916,
"learning_rate": 4.897098462953598e-06,
"logits/chosen": -2.7513809204101562,
"logits/rejected": -2.7600345611572266,
"logps/chosen": -0.8939758539199829,
"logps/rejected": -1.4527159929275513,
"loss": 0.9601,
"odds_ratio_loss": 0.661632239818573,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.08939759433269501,
"rewards/margins": 0.05587399750947952,
"rewards/rejected": -0.14527159929275513,
"sft_loss": 0.8939758539199829,
"step": 170
},
{
"epoch": 0.2909678722974338,
"grad_norm": 1.985352635383606,
"learning_rate": 4.884725470341331e-06,
"logits/chosen": -2.7102103233337402,
"logits/rejected": -2.739673137664795,
"logps/chosen": -0.8302527666091919,
"logps/rejected": -1.2092260122299194,
"loss": 0.8851,
"odds_ratio_loss": 0.5487207174301147,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.08302526921033859,
"rewards/margins": 0.03789733722805977,
"rewards/rejected": -0.12092261016368866,
"sft_loss": 0.8302527666091919,
"step": 180
},
{
"epoch": 0.3071327540917357,
"grad_norm": 8.031681060791016,
"learning_rate": 4.871667691317377e-06,
"logits/chosen": -2.764559745788574,
"logits/rejected": -2.767064332962036,
"logps/chosen": -1.0171376466751099,
"logps/rejected": -1.1592780351638794,
"loss": 1.0939,
"odds_ratio_loss": 0.7678386569023132,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.10171377658843994,
"rewards/margins": 0.01421402208507061,
"rewards/rejected": -0.1159278005361557,
"sft_loss": 1.0171376466751099,
"step": 190
},
{
"epoch": 0.3232976358860376,
"grad_norm": 4.448939323425293,
"learning_rate": 4.857928875491392e-06,
"logits/chosen": -2.750746965408325,
"logits/rejected": -2.7596051692962646,
"logps/chosen": -0.8164304494857788,
"logps/rejected": -1.0888216495513916,
"loss": 0.8794,
"odds_ratio_loss": 0.6294754147529602,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.08164305239915848,
"rewards/margins": 0.027239132672548294,
"rewards/rejected": -0.10888218879699707,
"sft_loss": 0.8164304494857788,
"step": 200
},
{
"epoch": 0.33946251768033947,
"grad_norm": 2.216554641723633,
"learning_rate": 4.843512968036314e-06,
"logits/chosen": -2.7625343799591064,
"logits/rejected": -2.7599010467529297,
"logps/chosen": -0.833400547504425,
"logps/rejected": -1.0677030086517334,
"loss": 0.8944,
"odds_ratio_loss": 0.6096410751342773,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.08334006369113922,
"rewards/margins": 0.0234302319586277,
"rewards/rejected": -0.10677029192447662,
"sft_loss": 0.833400547504425,
"step": 210
},
{
"epoch": 0.35562739947464134,
"grad_norm": 1.4112659692764282,
"learning_rate": 4.828424108555486e-06,
"logits/chosen": -2.807507276535034,
"logits/rejected": -2.803765296936035,
"logps/chosen": -1.0460469722747803,
"logps/rejected": -1.4173492193222046,
"loss": 1.1091,
"odds_ratio_loss": 0.6301766037940979,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.10460470616817474,
"rewards/margins": 0.037130214273929596,
"rewards/rejected": -0.14173491299152374,
"sft_loss": 1.0460469722747803,
"step": 220
},
{
"epoch": 0.3717922812689432,
"grad_norm": 0.9852223992347717,
"learning_rate": 4.812666629893957e-06,
"logits/chosen": -2.795703649520874,
"logits/rejected": -2.8211073875427246,
"logps/chosen": -0.891126275062561,
"logps/rejected": -1.0855722427368164,
"loss": 0.9626,
"odds_ratio_loss": 0.7152143716812134,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08911262452602386,
"rewards/margins": 0.019444596022367477,
"rewards/rejected": -0.10855722427368164,
"sft_loss": 0.891126275062561,
"step": 230
},
{
"epoch": 0.3879571630632451,
"grad_norm": 2.482409954071045,
"learning_rate": 4.796245056894273e-06,
"logits/chosen": -2.757913112640381,
"logits/rejected": -2.794553518295288,
"logps/chosen": -0.9089745283126831,
"logps/rejected": -1.3391778469085693,
"loss": 0.9804,
"odds_ratio_loss": 0.7146768569946289,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.09089745581150055,
"rewards/margins": 0.0430203452706337,
"rewards/rejected": -0.13391780853271484,
"sft_loss": 0.9089745283126831,
"step": 240
},
{
"epoch": 0.404122044857547,
"grad_norm": 1.3123791217803955,
"learning_rate": 4.779164105097148e-06,
"logits/chosen": -2.796814441680908,
"logits/rejected": -2.8013055324554443,
"logps/chosen": -0.8589127659797668,
"logps/rejected": -1.3229057788848877,
"loss": 0.9186,
"odds_ratio_loss": 0.5965861082077026,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08589127659797668,
"rewards/margins": 0.046399302780628204,
"rewards/rejected": -0.1322905719280243,
"sft_loss": 0.8589127659797668,
"step": 250
},
{
"epoch": 0.42028692665184886,
"grad_norm": 2.171173095703125,
"learning_rate": 4.761428679387373e-06,
"logits/chosen": -2.790588617324829,
"logits/rejected": -2.7970798015594482,
"logps/chosen": -0.8536098599433899,
"logps/rejected": -1.0807464122772217,
"loss": 0.9168,
"odds_ratio_loss": 0.6316367387771606,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.08536098152399063,
"rewards/margins": 0.022713668644428253,
"rewards/rejected": -0.10807464271783829,
"sft_loss": 0.8536098599433899,
"step": 260
},
{
"epoch": 0.4364518084461507,
"grad_norm": 3.753523111343384,
"learning_rate": 4.7430438725853515e-06,
"logits/chosen": -2.7550888061523438,
"logits/rejected": -2.766615629196167,
"logps/chosen": -0.913661003112793,
"logps/rejected": -1.41799795627594,
"loss": 0.9739,
"odds_ratio_loss": 0.6024969816207886,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.09136610478162766,
"rewards/margins": 0.0504336841404438,
"rewards/rejected": -0.14179977774620056,
"sft_loss": 0.913661003112793,
"step": 270
},
{
"epoch": 0.4526166902404526,
"grad_norm": 1.5982986688613892,
"learning_rate": 4.724014963984669e-06,
"logits/chosen": -2.798797130584717,
"logits/rejected": -2.8145482540130615,
"logps/chosen": -0.8752357363700867,
"logps/rejected": -1.1694762706756592,
"loss": 0.9358,
"odds_ratio_loss": 0.6060217618942261,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.08752357959747314,
"rewards/margins": 0.029424061998724937,
"rewards/rejected": -0.11694763600826263,
"sft_loss": 0.8752357363700867,
"step": 280
},
{
"epoch": 0.4687815720347545,
"grad_norm": 3.8735010623931885,
"learning_rate": 4.704347417836116e-06,
"logits/chosen": -2.7753589153289795,
"logits/rejected": -2.829224109649658,
"logps/chosen": -0.7804813385009766,
"logps/rejected": -1.1957075595855713,
"loss": 0.8432,
"odds_ratio_loss": 0.6271591186523438,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.07804813235998154,
"rewards/margins": 0.04152262955904007,
"rewards/rejected": -0.1195707693696022,
"sft_loss": 0.7804813385009766,
"step": 290
},
{
"epoch": 0.4849464538290564,
"grad_norm": 2.0640830993652344,
"learning_rate": 4.684046881778603e-06,
"logits/chosen": -2.8023476600646973,
"logits/rejected": -2.8235526084899902,
"logps/chosen": -0.8398802876472473,
"logps/rejected": -0.9978183507919312,
"loss": 0.9045,
"odds_ratio_loss": 0.6464654803276062,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08398802578449249,
"rewards/margins": 0.015793804079294205,
"rewards/rejected": -0.099781833589077,
"sft_loss": 0.8398802876472473,
"step": 300
},
{
"epoch": 0.5011113356233583,
"grad_norm": 1.626105785369873,
"learning_rate": 4.663119185217409e-06,
"logits/chosen": -2.796461343765259,
"logits/rejected": -2.8225197792053223,
"logps/chosen": -0.8273599743843079,
"logps/rejected": -1.096482515335083,
"loss": 0.8875,
"odds_ratio_loss": 0.6016198396682739,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08273600041866302,
"rewards/margins": 0.026912260800600052,
"rewards/rejected": -0.10964826494455338,
"sft_loss": 0.8273599743843079,
"step": 310
},
{
"epoch": 0.5172762174176602,
"grad_norm": 1.5098748207092285,
"learning_rate": 4.641570337650232e-06,
"logits/chosen": -2.847539186477661,
"logits/rejected": -2.85341215133667,
"logps/chosen": -0.7699432969093323,
"logps/rejected": -1.0820213556289673,
"loss": 0.8268,
"odds_ratio_loss": 0.5688191652297974,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.07699433714151382,
"rewards/margins": 0.03120780549943447,
"rewards/rejected": -0.10820214450359344,
"sft_loss": 0.7699432969093323,
"step": 320
},
{
"epoch": 0.533441099211962,
"grad_norm": 1.3477349281311035,
"learning_rate": 4.61940652694154e-06,
"logits/chosen": -2.7625374794006348,
"logits/rejected": -2.8054728507995605,
"logps/chosen": -0.8576439023017883,
"logps/rejected": -1.2374662160873413,
"loss": 0.9224,
"odds_ratio_loss": 0.6476989984512329,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08576439321041107,
"rewards/margins": 0.03798223286867142,
"rewards/rejected": -0.12374663352966309,
"sft_loss": 0.8576439023017883,
"step": 330
},
{
"epoch": 0.5496059810062639,
"grad_norm": 2.094233274459839,
"learning_rate": 4.596634117545689e-06,
"logits/chosen": -2.8440895080566406,
"logits/rejected": -2.8477485179901123,
"logps/chosen": -0.8450831174850464,
"logps/rejected": -1.1874289512634277,
"loss": 0.9084,
"odds_ratio_loss": 0.6333492994308472,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.08450832217931747,
"rewards/margins": 0.03423457592725754,
"rewards/rejected": -0.11874288320541382,
"sft_loss": 0.8450831174850464,
"step": 340
},
{
"epoch": 0.5657708628005658,
"grad_norm": 1.2610398530960083,
"learning_rate": 4.573259648679335e-06,
"logits/chosen": -2.8393020629882812,
"logits/rejected": -2.8172850608825684,
"logps/chosen": -0.8293860554695129,
"logps/rejected": -1.1484854221343994,
"loss": 0.8924,
"odds_ratio_loss": 0.6304416060447693,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.08293859660625458,
"rewards/margins": 0.03190993517637253,
"rewards/rejected": -0.1148485392332077,
"sft_loss": 0.8293860554695129,
"step": 350
},
{
"epoch": 0.5819357445948676,
"grad_norm": 7.934630870819092,
"learning_rate": 4.549289832443663e-06,
"logits/chosen": -2.8159756660461426,
"logits/rejected": -2.8409628868103027,
"logps/chosen": -0.885659396648407,
"logps/rejected": -1.2282092571258545,
"loss": 0.9498,
"odds_ratio_loss": 0.641811192035675,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.08856594562530518,
"rewards/margins": 0.03425499051809311,
"rewards/rejected": -0.12282093614339828,
"sft_loss": 0.885659396648407,
"step": 360
},
{
"epoch": 0.5981006263891695,
"grad_norm": 1.7960658073425293,
"learning_rate": 4.524731551896978e-06,
"logits/chosen": -2.8090755939483643,
"logits/rejected": -2.825777292251587,
"logps/chosen": -0.7784116864204407,
"logps/rejected": -0.9700002670288086,
"loss": 0.8424,
"odds_ratio_loss": 0.6396910548210144,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07784116268157959,
"rewards/margins": 0.01915885880589485,
"rewards/rejected": -0.09700002521276474,
"sft_loss": 0.7784116864204407,
"step": 370
},
{
"epoch": 0.6142655081834714,
"grad_norm": 3.268920421600342,
"learning_rate": 4.4995918590781925e-06,
"logits/chosen": -2.853820562362671,
"logits/rejected": -2.8512935638427734,
"logps/chosen": -0.8428764343261719,
"logps/rejected": -1.0172072649002075,
"loss": 0.9104,
"odds_ratio_loss": 0.6751636266708374,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.08428764343261719,
"rewards/margins": 0.01743307337164879,
"rewards/rejected": -0.10172072798013687,
"sft_loss": 0.8428764343261719,
"step": 380
},
{
"epoch": 0.6304303899777733,
"grad_norm": 1.0598444938659668,
"learning_rate": 4.473877972981797e-06,
"logits/chosen": -2.7993013858795166,
"logits/rejected": -2.789777994155884,
"logps/chosen": -0.8297500610351562,
"logps/rejected": -1.0850985050201416,
"loss": 0.8895,
"odds_ratio_loss": 0.5971348881721497,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.08297501504421234,
"rewards/margins": 0.025534838438034058,
"rewards/rejected": -0.1085098534822464,
"sft_loss": 0.8297500610351562,
"step": 390
},
{
"epoch": 0.6465952717720752,
"grad_norm": 1.9357444047927856,
"learning_rate": 4.447597277484894e-06,
"logits/chosen": -2.7699055671691895,
"logits/rejected": -2.798750400543213,
"logps/chosen": -0.7733790874481201,
"logps/rejected": -0.9783531427383423,
"loss": 0.8347,
"odds_ratio_loss": 0.6135808825492859,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07733791321516037,
"rewards/margins": 0.020497407764196396,
"rewards/rejected": -0.09783531725406647,
"sft_loss": 0.7733790874481201,
"step": 400
},
{
"epoch": 0.6627601535663771,
"grad_norm": 1.4025357961654663,
"learning_rate": 4.42075731922687e-06,
"logits/chosen": -2.8587729930877686,
"logits/rejected": -2.87328839302063,
"logps/chosen": -0.9505017995834351,
"logps/rejected": -1.1930662393569946,
"loss": 1.0132,
"odds_ratio_loss": 0.6273903250694275,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.09505018591880798,
"rewards/margins": 0.024256447330117226,
"rewards/rejected": -0.11930663883686066,
"sft_loss": 0.9505017995834351,
"step": 410
},
{
"epoch": 0.6789250353606789,
"grad_norm": 5.174298286437988,
"learning_rate": 4.3933658054423465e-06,
"logits/chosen": -2.8345279693603516,
"logits/rejected": -2.83827543258667,
"logps/chosen": -0.80866539478302,
"logps/rejected": -1.174803614616394,
"loss": 0.8664,
"odds_ratio_loss": 0.5777753591537476,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.08086653053760529,
"rewards/margins": 0.036613818258047104,
"rewards/rejected": -0.11748035252094269,
"sft_loss": 0.80866539478302,
"step": 420
},
{
"epoch": 0.6950899171549808,
"grad_norm": 2.207981586456299,
"learning_rate": 4.365430601748003e-06,
"logits/chosen": -2.8343446254730225,
"logits/rejected": -2.857731342315674,
"logps/chosen": -0.9037211537361145,
"logps/rejected": -1.0559289455413818,
"loss": 0.9705,
"odds_ratio_loss": 0.6677287817001343,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.09037211537361145,
"rewards/margins": 0.01522077340632677,
"rewards/rejected": -0.10559289157390594,
"sft_loss": 0.9037211537361145,
"step": 430
},
{
"epoch": 0.7112547989492827,
"grad_norm": 15.49936580657959,
"learning_rate": 4.336959729883925e-06,
"logits/chosen": -2.8130838871002197,
"logits/rejected": -2.8357608318328857,
"logps/chosen": -0.8217814564704895,
"logps/rejected": -0.9188777804374695,
"loss": 0.8923,
"odds_ratio_loss": 0.7047211527824402,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08217814564704895,
"rewards/margins": 0.00970962829887867,
"rewards/rejected": -0.09188777953386307,
"sft_loss": 0.8217814564704895,
"step": 440
},
{
"epoch": 0.7274196807435845,
"grad_norm": 1.7557275295257568,
"learning_rate": 4.307961365410118e-06,
"logits/chosen": -2.790027379989624,
"logits/rejected": -2.809622049331665,
"logps/chosen": -0.840091347694397,
"logps/rejected": -1.0152480602264404,
"loss": 0.9039,
"odds_ratio_loss": 0.6380866169929504,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08400914072990417,
"rewards/margins": 0.017515674233436584,
"rewards/rejected": -0.10152481496334076,
"sft_loss": 0.840091347694397,
"step": 450
},
{
"epoch": 0.7435845625378864,
"grad_norm": 2.8990914821624756,
"learning_rate": 4.278443835358854e-06,
"logits/chosen": -2.812924861907959,
"logits/rejected": -2.811110734939575,
"logps/chosen": -0.8139681816101074,
"logps/rejected": -1.0690581798553467,
"loss": 0.8748,
"odds_ratio_loss": 0.6082891225814819,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.08139681816101074,
"rewards/margins": 0.025509005412459373,
"rewards/rejected": -0.10690581798553467,
"sft_loss": 0.8139681816101074,
"step": 460
},
{
"epoch": 0.7597494443321883,
"grad_norm": 2.2395644187927246,
"learning_rate": 4.248415615843523e-06,
"logits/chosen": -2.8422694206237793,
"logits/rejected": -2.850648880004883,
"logps/chosen": -0.8527294993400574,
"logps/rejected": -1.0392307043075562,
"loss": 0.9183,
"odds_ratio_loss": 0.6552284359931946,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08527294546365738,
"rewards/margins": 0.01865011267364025,
"rewards/rejected": -0.10392306745052338,
"sft_loss": 0.8527294993400574,
"step": 470
},
{
"epoch": 0.7759143261264903,
"grad_norm": 2.0075857639312744,
"learning_rate": 4.217885329624666e-06,
"logits/chosen": -2.8313276767730713,
"logits/rejected": -2.8245348930358887,
"logps/chosen": -0.790324330329895,
"logps/rejected": -1.0767412185668945,
"loss": 0.8498,
"odds_ratio_loss": 0.5943514108657837,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07903242856264114,
"rewards/margins": 0.02864169515669346,
"rewards/rejected": -0.10767412185668945,
"sft_loss": 0.790324330329895,
"step": 480
},
{
"epoch": 0.7920792079207921,
"grad_norm": 1.7681854963302612,
"learning_rate": 4.186861743633911e-06,
"logits/chosen": -2.8171868324279785,
"logits/rejected": -2.8480162620544434,
"logps/chosen": -0.7983497381210327,
"logps/rejected": -1.1061131954193115,
"loss": 0.8646,
"odds_ratio_loss": 0.6626344919204712,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07983498275279999,
"rewards/margins": 0.03077634610235691,
"rewards/rejected": -0.11061131954193115,
"sft_loss": 0.7983497381210327,
"step": 490
},
{
"epoch": 0.808244089715094,
"grad_norm": 1.5298829078674316,
"learning_rate": 4.155353766456497e-06,
"logits/chosen": -2.874368190765381,
"logits/rejected": -2.8658576011657715,
"logps/chosen": -0.8663871884346008,
"logps/rejected": -1.0296813249588013,
"loss": 0.93,
"odds_ratio_loss": 0.636117160320282,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08663871884346008,
"rewards/margins": 0.016329411417245865,
"rewards/rejected": -0.10296813398599625,
"sft_loss": 0.8663871884346008,
"step": 500
},
{
"epoch": 0.808244089715094,
"eval_logits/chosen": -2.836843729019165,
"eval_logits/rejected": -2.8441994190216064,
"eval_logps/chosen": -0.8278239965438843,
"eval_logps/rejected": -1.0567275285720825,
"eval_loss": 0.8927881121635437,
"eval_odds_ratio_loss": 0.6496399641036987,
"eval_rewards/accuracies": 0.5772727131843567,
"eval_rewards/chosen": -0.08278240263462067,
"eval_rewards/margins": 0.02289034053683281,
"eval_rewards/rejected": -0.10567274689674377,
"eval_runtime": 194.5311,
"eval_samples_per_second": 5.655,
"eval_sft_loss": 0.8278239965438843,
"eval_steps_per_second": 2.827,
"step": 500
},
{
"epoch": 0.8244089715093958,
"grad_norm": 1.6909329891204834,
"learning_rate": 4.123370445773134e-06,
"logits/chosen": -2.8691649436950684,
"logits/rejected": -2.8811800479888916,
"logps/chosen": -0.8283156156539917,
"logps/rejected": -0.9291037321090698,
"loss": 0.8973,
"odds_ratio_loss": 0.689969539642334,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08283156156539917,
"rewards/margins": 0.01007880363613367,
"rewards/rejected": -0.09291036427021027,
"sft_loss": 0.8283156156539917,
"step": 510
},
{
"epoch": 0.8405738533036977,
"grad_norm": 4.33729362487793,
"learning_rate": 4.090920965761906e-06,
"logits/chosen": -2.808586597442627,
"logits/rejected": -2.8186278343200684,
"logps/chosen": -0.8606308698654175,
"logps/rejected": -1.0332623720169067,
"loss": 0.9284,
"odds_ratio_loss": 0.6780760884284973,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.08606309443712234,
"rewards/margins": 0.0172631423920393,
"rewards/rejected": -0.1033262237906456,
"sft_loss": 0.8606308698654175,
"step": 520
},
{
"epoch": 0.8567387350979996,
"grad_norm": 6.002406120300293,
"learning_rate": 4.058014644460991e-06,
"logits/chosen": -2.833061456680298,
"logits/rejected": -2.8458170890808105,
"logps/chosen": -0.8242424726486206,
"logps/rejected": -0.9793018102645874,
"loss": 0.8862,
"odds_ratio_loss": 0.6198969483375549,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.08242423832416534,
"rewards/margins": 0.015505945309996605,
"rewards/rejected": -0.0979301929473877,
"sft_loss": 0.8242424726486206,
"step": 530
},
{
"epoch": 0.8729036168923014,
"grad_norm": 1.998780608177185,
"learning_rate": 4.024660931092939e-06,
"logits/chosen": -2.81856369972229,
"logits/rejected": -2.8293251991271973,
"logps/chosen": -0.8208298683166504,
"logps/rejected": -1.0441166162490845,
"loss": 0.8828,
"odds_ratio_loss": 0.6198452115058899,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.08208298683166504,
"rewards/margins": 0.022328665480017662,
"rewards/rejected": -0.10441166162490845,
"sft_loss": 0.8208298683166504,
"step": 540
},
{
"epoch": 0.8890684986866033,
"grad_norm": 2.4577414989471436,
"learning_rate": 3.990869403351272e-06,
"logits/chosen": -2.8507511615753174,
"logits/rejected": -2.8566970825195312,
"logps/chosen": -0.8117038011550903,
"logps/rejected": -1.0751911401748657,
"loss": 0.8674,
"odds_ratio_loss": 0.5573362112045288,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.08117038011550903,
"rewards/margins": 0.026348743587732315,
"rewards/rejected": -0.10751912742853165,
"sft_loss": 0.8117038011550903,
"step": 550
},
{
"epoch": 0.9052333804809052,
"grad_norm": 3.4686763286590576,
"learning_rate": 3.956649764650206e-06,
"logits/chosen": -2.881647825241089,
"logits/rejected": -2.8819093704223633,
"logps/chosen": -0.840446949005127,
"logps/rejected": -1.052137017250061,
"loss": 0.907,
"odds_ratio_loss": 0.6658841967582703,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0840446949005127,
"rewards/margins": 0.02116900309920311,
"rewards/rejected": -0.1052137017250061,
"sft_loss": 0.840446949005127,
"step": 560
},
{
"epoch": 0.9213982622752072,
"grad_norm": 2.2446658611297607,
"learning_rate": 3.92201184133826e-06,
"logits/chosen": -2.864419460296631,
"logits/rejected": -2.8783581256866455,
"logps/chosen": -0.7979758381843567,
"logps/rejected": -1.0608371496200562,
"loss": 0.858,
"odds_ratio_loss": 0.5999220609664917,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07979758828878403,
"rewards/margins": 0.026286140084266663,
"rewards/rejected": -0.1060837134718895,
"sft_loss": 0.7979758381843567,
"step": 570
},
{
"epoch": 0.937563144069509,
"grad_norm": 1.9976744651794434,
"learning_rate": 3.886965579876572e-06,
"logits/chosen": -2.900329351425171,
"logits/rejected": -2.90751051902771,
"logps/chosen": -0.8153482675552368,
"logps/rejected": -0.9346411824226379,
"loss": 0.8816,
"odds_ratio_loss": 0.6620460748672485,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.08153482526540756,
"rewards/margins": 0.011929300613701344,
"rewards/rejected": -0.09346412122249603,
"sft_loss": 0.8153482675552368,
"step": 580
},
{
"epoch": 0.9537280258638109,
"grad_norm": 1.6091820001602173,
"learning_rate": 3.851521043982716e-06,
"logits/chosen": -2.8917582035064697,
"logits/rejected": -2.902100086212158,
"logps/chosen": -0.8334836959838867,
"logps/rejected": -1.004950761795044,
"loss": 0.9,
"odds_ratio_loss": 0.6651790738105774,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.0833483636379242,
"rewards/margins": 0.017146697267889977,
"rewards/rejected": -0.10049506276845932,
"sft_loss": 0.8334836959838867,
"step": 590
},
{
"epoch": 0.9698929076581128,
"grad_norm": 5.672989845275879,
"learning_rate": 3.81568841174086e-06,
"logits/chosen": -2.861603021621704,
"logits/rejected": -2.876756191253662,
"logps/chosen": -0.7806357145309448,
"logps/rejected": -1.1542575359344482,
"loss": 0.8442,
"odds_ratio_loss": 0.6360144019126892,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07806357741355896,
"rewards/margins": 0.03736215457320213,
"rewards/rejected": -0.1154257282614708,
"sft_loss": 0.7806357145309448,
"step": 600
},
{
"epoch": 0.9860577894524146,
"grad_norm": 1.4658279418945312,
"learning_rate": 3.7794779726790664e-06,
"logits/chosen": -2.845876455307007,
"logits/rejected": -2.8574581146240234,
"logps/chosen": -0.7789396047592163,
"logps/rejected": -1.1114189624786377,
"loss": 0.8409,
"odds_ratio_loss": 0.6194978952407837,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07789396494626999,
"rewards/margins": 0.0332479402422905,
"rewards/rejected": -0.11114190518856049,
"sft_loss": 0.7789396047592163,
"step": 610
},
{
"epoch": 1.0022226712467166,
"grad_norm": 2.5747179985046387,
"learning_rate": 3.7429001248146096e-06,
"logits/chosen": -2.8244144916534424,
"logits/rejected": -2.832597494125366,
"logps/chosen": -0.7860082387924194,
"logps/rejected": -1.0231492519378662,
"loss": 0.8435,
"odds_ratio_loss": 0.5752763748168945,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07860083132982254,
"rewards/margins": 0.023714100942015648,
"rewards/rejected": -0.10231492668390274,
"sft_loss": 0.7860082387924194,
"step": 620
},
{
"epoch": 1.0183875530410185,
"grad_norm": 1.24222993850708,
"learning_rate": 3.7059653716681227e-06,
"logits/chosen": -2.8329997062683105,
"logits/rejected": -2.8287994861602783,
"logps/chosen": -0.8590106964111328,
"logps/rejected": -1.0588136911392212,
"loss": 0.9265,
"odds_ratio_loss": 0.6749905347824097,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08590107411146164,
"rewards/margins": 0.01998029835522175,
"rewards/rejected": -0.10588137060403824,
"sft_loss": 0.8590106964111328,
"step": 630
},
{
"epoch": 1.0345524348353203,
"grad_norm": 1.6466968059539795,
"learning_rate": 3.668684319247463e-06,
"logits/chosen": -2.8495888710021973,
"logits/rejected": -2.872880220413208,
"logps/chosen": -0.7487844824790955,
"logps/rejected": -1.0430450439453125,
"loss": 0.8035,
"odds_ratio_loss": 0.5467280149459839,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.07487844675779343,
"rewards/margins": 0.02942606247961521,
"rewards/rejected": -0.10430450737476349,
"sft_loss": 0.7487844824790955,
"step": 640
},
{
"epoch": 1.0507173166296222,
"grad_norm": 1.1547085046768188,
"learning_rate": 3.6310676730021373e-06,
"logits/chosen": -2.8986639976501465,
"logits/rejected": -2.900839328765869,
"logps/chosen": -0.7881689071655273,
"logps/rejected": -0.9517928957939148,
"loss": 0.8509,
"odds_ratio_loss": 0.6268683075904846,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.07881689816713333,
"rewards/margins": 0.016362406313419342,
"rewards/rejected": -0.09517930448055267,
"sft_loss": 0.7881689071655273,
"step": 650
},
{
"epoch": 1.066882198423924,
"grad_norm": 3.282292604446411,
"learning_rate": 3.593126234749178e-06,
"logits/chosen": -2.8645131587982178,
"logits/rejected": -2.898613929748535,
"logps/chosen": -0.9009162187576294,
"logps/rejected": -1.1612458229064941,
"loss": 0.9648,
"odds_ratio_loss": 0.6383681297302246,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.09009162336587906,
"rewards/margins": 0.026032963767647743,
"rewards/rejected": -0.11612458527088165,
"sft_loss": 0.9009162187576294,
"step": 660
},
{
"epoch": 1.083047080218226,
"grad_norm": 1.7910722494125366,
"learning_rate": 3.554870899571343e-06,
"logits/chosen": -2.8563625812530518,
"logits/rejected": -2.8744523525238037,
"logps/chosen": -0.8285778760910034,
"logps/rejected": -1.0025149583816528,
"loss": 0.8927,
"odds_ratio_loss": 0.6415389776229858,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08285778015851974,
"rewards/margins": 0.01739371195435524,
"rewards/rejected": -0.10025149583816528,
"sft_loss": 0.8285778760910034,
"step": 670
},
{
"epoch": 1.0992119620125278,
"grad_norm": 3.5774316787719727,
"learning_rate": 3.5163126526885373e-06,
"logits/chosen": -2.8437960147857666,
"logits/rejected": -2.870513916015625,
"logps/chosen": -0.7732303142547607,
"logps/rejected": -1.0101302862167358,
"loss": 0.8343,
"odds_ratio_loss": 0.6102721095085144,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07732303440570831,
"rewards/margins": 0.023690002039074898,
"rewards/rejected": -0.10101302713155746,
"sft_loss": 0.7732303142547607,
"step": 680
},
{
"epoch": 1.1153768438068297,
"grad_norm": 2.30400013923645,
"learning_rate": 3.4774625663033484e-06,
"logits/chosen": -2.849010467529297,
"logits/rejected": -2.8660061359405518,
"logps/chosen": -0.7853142619132996,
"logps/rejected": -0.9644325971603394,
"loss": 0.8466,
"odds_ratio_loss": 0.6127563714981079,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.0785314291715622,
"rewards/margins": 0.017911842092871666,
"rewards/rejected": -0.09644327312707901,
"sft_loss": 0.7853142619132996,
"step": 690
},
{
"epoch": 1.1315417256011315,
"grad_norm": 1.4719704389572144,
"learning_rate": 3.4383317964216067e-06,
"logits/chosen": -2.8511626720428467,
"logits/rejected": -2.881286382675171,
"logps/chosen": -0.7790023684501648,
"logps/rejected": -0.9076374173164368,
"loss": 0.8484,
"odds_ratio_loss": 0.6935282945632935,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.0779002457857132,
"rewards/margins": 0.012863497249782085,
"rewards/rejected": -0.09076374769210815,
"sft_loss": 0.7790023684501648,
"step": 700
},
{
"epoch": 1.1477066073954334,
"grad_norm": 2.1927425861358643,
"learning_rate": 3.398931579648877e-06,
"logits/chosen": -2.8756051063537598,
"logits/rejected": -2.880699872970581,
"logps/chosen": -0.8047206997871399,
"logps/rejected": -1.1634694337844849,
"loss": 0.8667,
"odds_ratio_loss": 0.6202768087387085,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.08047207444906235,
"rewards/margins": 0.0358748659491539,
"rewards/rejected": -0.11634693294763565,
"sft_loss": 0.8047206997871399,
"step": 710
},
{
"epoch": 1.1638714891897353,
"grad_norm": 1.4328726530075073,
"learning_rate": 3.359273229963813e-06,
"logits/chosen": -2.8490045070648193,
"logits/rejected": -2.8502037525177,
"logps/chosen": -0.7575694918632507,
"logps/rejected": -0.9301745295524597,
"loss": 0.821,
"odds_ratio_loss": 0.6343931555747986,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07575695216655731,
"rewards/margins": 0.017260495573282242,
"rewards/rejected": -0.09301744401454926,
"sft_loss": 0.7575694918632507,
"step": 720
},
{
"epoch": 1.1800363709840371,
"grad_norm": 1.3170576095581055,
"learning_rate": 3.319368135469285e-06,
"logits/chosen": -2.8658504486083984,
"logits/rejected": -2.8875842094421387,
"logps/chosen": -0.8195670247077942,
"logps/rejected": -1.1535929441452026,
"loss": 0.8841,
"odds_ratio_loss": 0.6450805068016052,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08195669949054718,
"rewards/margins": 0.03340259566903114,
"rewards/rejected": -0.11535929143428802,
"sft_loss": 0.8195670247077942,
"step": 730
},
{
"epoch": 1.196201252778339,
"grad_norm": 4.55858850479126,
"learning_rate": 3.279227755122228e-06,
"logits/chosen": -2.858372211456299,
"logits/rejected": -2.860966205596924,
"logps/chosen": -0.7807797193527222,
"logps/rejected": -1.1492526531219482,
"loss": 0.839,
"odds_ratio_loss": 0.5826634764671326,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.07807797938585281,
"rewards/margins": 0.03684728592634201,
"rewards/rejected": -0.11492526531219482,
"sft_loss": 0.7807797193527222,
"step": 740
},
{
"epoch": 1.2123661345726409,
"grad_norm": 2.330960988998413,
"learning_rate": 3.2388636154431417e-06,
"logits/chosen": -2.868211507797241,
"logits/rejected": -2.898150682449341,
"logps/chosen": -0.8243536949157715,
"logps/rejected": -1.195150375366211,
"loss": 0.883,
"odds_ratio_loss": 0.5866126418113708,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.08243536949157715,
"rewards/margins": 0.03707967326045036,
"rewards/rejected": -0.11951503902673721,
"sft_loss": 0.8243536949157715,
"step": 750
},
{
"epoch": 1.2285310163669427,
"grad_norm": 2.7208411693573,
"learning_rate": 3.198287307206192e-06,
"logits/chosen": -2.844311237335205,
"logits/rejected": -2.8444716930389404,
"logps/chosen": -0.7780786752700806,
"logps/rejected": -0.9966138005256653,
"loss": 0.8378,
"odds_ratio_loss": 0.5971704721450806,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.07780785858631134,
"rewards/margins": 0.021853512153029442,
"rewards/rejected": -0.09966136515140533,
"sft_loss": 0.7780786752700806,
"step": 760
},
{
"epoch": 1.2446958981612446,
"grad_norm": 1.3042361736297607,
"learning_rate": 3.157510482110856e-06,
"logits/chosen": -2.9084322452545166,
"logits/rejected": -2.905463933944702,
"logps/chosen": -0.7917675971984863,
"logps/rejected": -1.0798178911209106,
"loss": 0.8557,
"odds_ratio_loss": 0.6388932466506958,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07917676120996475,
"rewards/margins": 0.02880503609776497,
"rewards/rejected": -0.10798178613185883,
"sft_loss": 0.7917675971984863,
"step": 770
},
{
"epoch": 1.2608607799555465,
"grad_norm": 1.315172553062439,
"learning_rate": 3.116544849436077e-06,
"logits/chosen": -2.828716993331909,
"logits/rejected": -2.8282887935638428,
"logps/chosen": -0.8439006805419922,
"logps/rejected": -1.2042268514633179,
"loss": 0.9037,
"odds_ratio_loss": 0.5979124307632446,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0843900591135025,
"rewards/margins": 0.03603263571858406,
"rewards/rejected": -0.12042269855737686,
"sft_loss": 0.8439006805419922,
"step": 780
},
{
"epoch": 1.2770256617498483,
"grad_norm": 1.611197829246521,
"learning_rate": 3.0754021726778848e-06,
"logits/chosen": -2.84073543548584,
"logits/rejected": -2.832176685333252,
"logps/chosen": -0.7603198885917664,
"logps/rejected": -1.202492117881775,
"loss": 0.8152,
"odds_ratio_loss": 0.5489572882652283,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07603198289871216,
"rewards/margins": 0.044217221438884735,
"rewards/rejected": -0.1202491968870163,
"sft_loss": 0.7603198885917664,
"step": 790
},
{
"epoch": 1.2931905435441502,
"grad_norm": 1.179275631904602,
"learning_rate": 3.0340942661714463e-06,
"logits/chosen": -2.877725839614868,
"logits/rejected": -2.891244411468506,
"logps/chosen": -0.8281265497207642,
"logps/rejected": -1.0409139394760132,
"loss": 0.8904,
"odds_ratio_loss": 0.6231717467308044,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08281265199184418,
"rewards/margins": 0.021278750151395798,
"rewards/rejected": -0.10409140586853027,
"sft_loss": 0.8281265497207642,
"step": 800
},
{
"epoch": 1.3093554253384523,
"grad_norm": 2.1846208572387695,
"learning_rate": 2.992632991698512e-06,
"logits/chosen": -2.8389461040496826,
"logits/rejected": -2.85896635055542,
"logps/chosen": -0.8289766311645508,
"logps/rejected": -1.0603488683700562,
"loss": 0.8918,
"odds_ratio_loss": 0.6286410093307495,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.08289766311645508,
"rewards/margins": 0.02313724346458912,
"rewards/rejected": -0.10603491216897964,
"sft_loss": 0.8289766311645508,
"step": 810
},
{
"epoch": 1.3255203071327541,
"grad_norm": 1.583357334136963,
"learning_rate": 2.9510302550812537e-06,
"logits/chosen": -2.845541000366211,
"logits/rejected": -2.8782455921173096,
"logps/chosen": -0.7186457514762878,
"logps/rejected": -1.0902959108352661,
"loss": 0.776,
"odds_ratio_loss": 0.573469340801239,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07186457514762878,
"rewards/margins": 0.03716501593589783,
"rewards/rejected": -0.10902959108352661,
"sft_loss": 0.7186457514762878,
"step": 820
},
{
"epoch": 1.341685188927056,
"grad_norm": 2.841128349304199,
"learning_rate": 2.9092980027634325e-06,
"logits/chosen": -2.8583426475524902,
"logits/rejected": -2.874774217605591,
"logps/chosen": -0.7276403903961182,
"logps/rejected": -1.0125164985656738,
"loss": 0.788,
"odds_ratio_loss": 0.6034457683563232,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07276404649019241,
"rewards/margins": 0.028487607836723328,
"rewards/rejected": -0.10125164687633514,
"sft_loss": 0.7276403903961182,
"step": 830
},
{
"epoch": 1.3578500707213579,
"grad_norm": 1.7055377960205078,
"learning_rate": 2.867448218379927e-06,
"logits/chosen": -2.8610100746154785,
"logits/rejected": -2.8836147785186768,
"logps/chosen": -0.8485835790634155,
"logps/rejected": -1.0031511783599854,
"loss": 0.9172,
"odds_ratio_loss": 0.6861482858657837,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.08485837280750275,
"rewards/margins": 0.015456756576895714,
"rewards/rejected": -0.10031511634588242,
"sft_loss": 0.8485835790634155,
"step": 840
},
{
"epoch": 1.3740149525156597,
"grad_norm": 9.629118919372559,
"learning_rate": 2.825492919315559e-06,
"logits/chosen": -2.8479480743408203,
"logits/rejected": -2.8763227462768555,
"logps/chosen": -0.8768585324287415,
"logps/rejected": -0.999729335308075,
"loss": 0.9437,
"odds_ratio_loss": 0.668052613735199,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.08768586814403534,
"rewards/margins": 0.012287073768675327,
"rewards/rejected": -0.0999729260802269,
"sft_loss": 0.8768585324287415,
"step": 850
},
{
"epoch": 1.3901798343099616,
"grad_norm": 2.416870594024658,
"learning_rate": 2.7834441532542482e-06,
"logits/chosen": -2.8881735801696777,
"logits/rejected": -2.9063100814819336,
"logps/chosen": -0.7879316210746765,
"logps/rejected": -1.023233413696289,
"loss": 0.8456,
"odds_ratio_loss": 0.5766496658325195,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07879316806793213,
"rewards/margins": 0.023530183359980583,
"rewards/rejected": -0.10232335329055786,
"sft_loss": 0.7879316210746765,
"step": 860
},
{
"epoch": 1.4063447161042635,
"grad_norm": 1.5628215074539185,
"learning_rate": 2.74131399471945e-06,
"logits/chosen": -2.855931520462036,
"logits/rejected": -2.8686752319335938,
"logps/chosen": -0.7991023063659668,
"logps/rejected": -0.9908691644668579,
"loss": 0.8644,
"odds_ratio_loss": 0.6528818607330322,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07991023361682892,
"rewards/margins": 0.01917668618261814,
"rewards/rejected": -0.09908691793680191,
"sft_loss": 0.7991023063659668,
"step": 870
},
{
"epoch": 1.4225095978985653,
"grad_norm": 2.0555615425109863,
"learning_rate": 2.6991145416068947e-06,
"logits/chosen": -2.846782922744751,
"logits/rejected": -2.8673818111419678,
"logps/chosen": -0.8078680038452148,
"logps/rejected": -0.9619809985160828,
"loss": 0.8714,
"odds_ratio_loss": 0.6356260180473328,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0807868018746376,
"rewards/margins": 0.01541130244731903,
"rewards/rejected": -0.09619811177253723,
"sft_loss": 0.8078680038452148,
"step": 880
},
{
"epoch": 1.4386744796928672,
"grad_norm": 0.9378024339675903,
"learning_rate": 2.6568579117106143e-06,
"logits/chosen": -2.8469960689544678,
"logits/rejected": -2.850614070892334,
"logps/chosen": -0.7744920253753662,
"logps/rejected": -1.0393074750900269,
"loss": 0.8347,
"odds_ratio_loss": 0.6018751859664917,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.07744920998811722,
"rewards/margins": 0.026481550186872482,
"rewards/rejected": -0.1039307564496994,
"sft_loss": 0.7744920253753662,
"step": 890
},
{
"epoch": 1.454839361487169,
"grad_norm": 0.9352036118507385,
"learning_rate": 2.6145562392432544e-06,
"logits/chosen": -2.875109910964966,
"logits/rejected": -2.887655735015869,
"logps/chosen": -0.8057360649108887,
"logps/rejected": -0.9923427700996399,
"loss": 0.8708,
"odds_ratio_loss": 0.6502856016159058,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.08057360351085663,
"rewards/margins": 0.01866067573428154,
"rewards/rejected": -0.09923428297042847,
"sft_loss": 0.8057360649108887,
"step": 900
},
{
"epoch": 1.471004243281471,
"grad_norm": 2.6385111808776855,
"learning_rate": 2.5722216713516682e-06,
"logits/chosen": -2.8550915718078613,
"logits/rejected": -2.8972582817077637,
"logps/chosen": -0.7460139989852905,
"logps/rejected": -0.9863673448562622,
"loss": 0.8057,
"odds_ratio_loss": 0.5972028374671936,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07460139691829681,
"rewards/margins": 0.024035323411226273,
"rewards/rejected": -0.09863673150539398,
"sft_loss": 0.7460139989852905,
"step": 910
},
{
"epoch": 1.4871691250757728,
"grad_norm": 1.7198817729949951,
"learning_rate": 2.5298663646288064e-06,
"logits/chosen": -2.8807036876678467,
"logits/rejected": -2.888306140899658,
"logps/chosen": -0.7764211893081665,
"logps/rejected": -1.0312559604644775,
"loss": 0.8377,
"odds_ratio_loss": 0.6123490333557129,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07764211297035217,
"rewards/margins": 0.025483474135398865,
"rewards/rejected": -0.10312558710575104,
"sft_loss": 0.7764211893081665,
"step": 920
},
{
"epoch": 1.503334006870075,
"grad_norm": 2.7615318298339844,
"learning_rate": 2.487502481622879e-06,
"logits/chosen": -2.8637490272521973,
"logits/rejected": -2.874497652053833,
"logps/chosen": -0.8163179159164429,
"logps/rejected": -0.9841713905334473,
"loss": 0.8791,
"odds_ratio_loss": 0.6274018287658691,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.08163177967071533,
"rewards/margins": 0.01678534969687462,
"rewards/rejected": -0.09841714054346085,
"sft_loss": 0.8163179159164429,
"step": 930
},
{
"epoch": 1.5194988886643768,
"grad_norm": 1.7173594236373901,
"learning_rate": 2.4451421873448253e-06,
"logits/chosen": -2.8568150997161865,
"logits/rejected": -2.879917621612549,
"logps/chosen": -0.8009888529777527,
"logps/rejected": -0.9833795428276062,
"loss": 0.8678,
"odds_ratio_loss": 0.667960524559021,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.08009888231754303,
"rewards/margins": 0.018239066004753113,
"rewards/rejected": -0.09833794832229614,
"sft_loss": 0.8009888529777527,
"step": 940
},
{
"epoch": 1.5356637704586786,
"grad_norm": 3.4001808166503906,
"learning_rate": 2.40279764577506e-06,
"logits/chosen": -2.885816812515259,
"logits/rejected": -2.9209980964660645,
"logps/chosen": -0.8259257078170776,
"logps/rejected": -0.9810823202133179,
"loss": 0.8903,
"odds_ratio_loss": 0.6437360048294067,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.08259257674217224,
"rewards/margins": 0.015515660867094994,
"rewards/rejected": -0.09810823202133179,
"sft_loss": 0.8259257078170776,
"step": 950
},
{
"epoch": 1.5518286522529805,
"grad_norm": 3.7369155883789062,
"learning_rate": 2.3604810163705242e-06,
"logits/chosen": -2.878312587738037,
"logits/rejected": -2.9087862968444824,
"logps/chosen": -0.7468287944793701,
"logps/rejected": -0.999441921710968,
"loss": 0.8033,
"odds_ratio_loss": 0.5650970339775085,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.07468288391828537,
"rewards/margins": 0.02526130899786949,
"rewards/rejected": -0.09994419664144516,
"sft_loss": 0.7468287944793701,
"step": 960
},
{
"epoch": 1.5679935340472824,
"grad_norm": 1.2655407190322876,
"learning_rate": 2.3182044505730364e-06,
"logits/chosen": -2.872468948364258,
"logits/rejected": -2.873964309692383,
"logps/chosen": -0.7006109952926636,
"logps/rejected": -0.9527314901351929,
"loss": 0.7581,
"odds_ratio_loss": 0.5752806067466736,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.0700611025094986,
"rewards/margins": 0.025212040171027184,
"rewards/rejected": -0.09527313709259033,
"sft_loss": 0.7006109952926636,
"step": 970
},
{
"epoch": 1.5841584158415842,
"grad_norm": 2.9336001873016357,
"learning_rate": 2.275980088319941e-06,
"logits/chosen": -2.8779749870300293,
"logits/rejected": -2.8763155937194824,
"logps/chosen": -0.7721344232559204,
"logps/rejected": -0.9309911727905273,
"loss": 0.8406,
"odds_ratio_loss": 0.6845985651016235,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07721343636512756,
"rewards/margins": 0.01588568463921547,
"rewards/rejected": -0.09309910982847214,
"sft_loss": 0.7721344232559204,
"step": 980
},
{
"epoch": 1.600323297635886,
"grad_norm": 2.434041738510132,
"learning_rate": 2.2338200545580577e-06,
"logits/chosen": -2.849057674407959,
"logits/rejected": -2.873142957687378,
"logps/chosen": -0.7509113550186157,
"logps/rejected": -1.0347163677215576,
"loss": 0.8135,
"odds_ratio_loss": 0.6254162788391113,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07509114593267441,
"rewards/margins": 0.02838050201535225,
"rewards/rejected": -0.10347163677215576,
"sft_loss": 0.7509113550186157,
"step": 990
},
{
"epoch": 1.616488179430188,
"grad_norm": 1.686830997467041,
"learning_rate": 2.191736455761947e-06,
"logits/chosen": -2.8971669673919678,
"logits/rejected": -2.9139630794525146,
"logps/chosen": -0.7013322114944458,
"logps/rejected": -0.8860443234443665,
"loss": 0.7571,
"odds_ratio_loss": 0.5572749972343445,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07013322412967682,
"rewards/margins": 0.01847122237086296,
"rewards/rejected": -0.08860443532466888,
"sft_loss": 0.7013322114944458,
"step": 1000
},
{
"epoch": 1.616488179430188,
"eval_logits/chosen": -2.8644185066223145,
"eval_logits/rejected": -2.8728911876678467,
"eval_logps/chosen": -0.8028324842453003,
"eval_logps/rejected": -1.0336546897888184,
"eval_loss": 0.8679323792457581,
"eval_odds_ratio_loss": 0.6509982943534851,
"eval_rewards/accuracies": 0.5699999928474426,
"eval_rewards/chosen": -0.08028324693441391,
"eval_rewards/margins": 0.02308221347630024,
"eval_rewards/rejected": -0.1033654510974884,
"eval_runtime": 194.7336,
"eval_samples_per_second": 5.649,
"eval_sft_loss": 0.8028324842453003,
"eval_steps_per_second": 2.824,
"step": 1000
},
{
"epoch": 1.6326530612244898,
"grad_norm": 1.7911335229873657,
"learning_rate": 2.1497413764574673e-06,
"logits/chosen": -2.8975167274475098,
"logits/rejected": -2.8892812728881836,
"logps/chosen": -0.7816007137298584,
"logps/rejected": -1.069588541984558,
"loss": 0.8393,
"odds_ratio_loss": 0.5774248242378235,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07816006988286972,
"rewards/margins": 0.02879878506064415,
"rewards/rejected": -0.10695885121822357,
"sft_loss": 0.7816007137298584,
"step": 1010
},
{
"epoch": 1.6488179430187917,
"grad_norm": 1.912550687789917,
"learning_rate": 2.1078468757516395e-06,
"logits/chosen": -2.8402116298675537,
"logits/rejected": -2.8773112297058105,
"logps/chosen": -0.7441704273223877,
"logps/rejected": -0.9479702115058899,
"loss": 0.8035,
"odds_ratio_loss": 0.5933586955070496,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07441703975200653,
"rewards/margins": 0.02037998102605343,
"rewards/rejected": -0.0947970300912857,
"sft_loss": 0.7441704273223877,
"step": 1020
},
{
"epoch": 1.6649828248130936,
"grad_norm": 2.0232253074645996,
"learning_rate": 2.0660649838698145e-06,
"logits/chosen": -2.8627827167510986,
"logits/rejected": -2.882736921310425,
"logps/chosen": -0.7718713283538818,
"logps/rejected": -1.1140234470367432,
"loss": 0.832,
"odds_ratio_loss": 0.6009626984596252,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07718713581562042,
"rewards/margins": 0.03421521559357643,
"rewards/rejected": -0.11140235513448715,
"sft_loss": 0.7718713283538818,
"step": 1030
},
{
"epoch": 1.6811477066073954,
"grad_norm": 1.9653966426849365,
"learning_rate": 2.0244076987011284e-06,
"logits/chosen": -2.905303716659546,
"logits/rejected": -2.9009556770324707,
"logps/chosen": -0.827530562877655,
"logps/rejected": -1.0324897766113281,
"loss": 0.8888,
"odds_ratio_loss": 0.6124246716499329,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.08275305479764938,
"rewards/margins": 0.020495926961302757,
"rewards/rejected": -0.1032489761710167,
"sft_loss": 0.827530562877655,
"step": 1040
},
{
"epoch": 1.6973125884016973,
"grad_norm": 1.4363154172897339,
"learning_rate": 1.982886982353251e-06,
"logits/chosen": -2.888767957687378,
"logits/rejected": -2.8874547481536865,
"logps/chosen": -0.7899632453918457,
"logps/rejected": -1.1214802265167236,
"loss": 0.8526,
"odds_ratio_loss": 0.6266939640045166,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.07899631559848785,
"rewards/margins": 0.03315168619155884,
"rewards/rejected": -0.11214800179004669,
"sft_loss": 0.7899632453918457,
"step": 1050
},
{
"epoch": 1.7134774701959992,
"grad_norm": 1.8043084144592285,
"learning_rate": 1.941514757717392e-06,
"logits/chosen": -2.866079330444336,
"logits/rejected": -2.879364490509033,
"logps/chosen": -0.8468548655509949,
"logps/rejected": -1.1184252500534058,
"loss": 0.9022,
"odds_ratio_loss": 0.552977442741394,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.08468548208475113,
"rewards/margins": 0.02715705707669258,
"rewards/rejected": -0.11184253543615341,
"sft_loss": 0.8468548655509949,
"step": 1060
},
{
"epoch": 1.729642351990301,
"grad_norm": 3.669512987136841,
"learning_rate": 1.9003029050445953e-06,
"logits/chosen": -2.8407020568847656,
"logits/rejected": -2.8639755249023438,
"logps/chosen": -0.8030735850334167,
"logps/rejected": -0.9715849757194519,
"loss": 0.8692,
"odds_ratio_loss": 0.660782516002655,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.08030736446380615,
"rewards/margins": 0.016851136460900307,
"rewards/rejected": -0.0971585065126419,
"sft_loss": 0.8030735850334167,
"step": 1070
},
{
"epoch": 1.745807233784603,
"grad_norm": 1.9885250329971313,
"learning_rate": 1.8592632585342523e-06,
"logits/chosen": -2.849134922027588,
"logits/rejected": -2.8679654598236084,
"logps/chosen": -0.7700011730194092,
"logps/rejected": -1.0313342809677124,
"loss": 0.8306,
"odds_ratio_loss": 0.6062373518943787,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07700012624263763,
"rewards/margins": 0.02613331377506256,
"rewards/rejected": -0.1031334400177002,
"sft_loss": 0.7700011730194092,
"step": 1080
},
{
"epoch": 1.7619721155789048,
"grad_norm": 4.0624895095825195,
"learning_rate": 1.8184076029358527e-06,
"logits/chosen": -2.840611457824707,
"logits/rejected": -2.8494577407836914,
"logps/chosen": -0.7611902952194214,
"logps/rejected": -0.9082427024841309,
"loss": 0.8272,
"odds_ratio_loss": 0.6598888635635376,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07611902803182602,
"rewards/margins": 0.014705238863825798,
"rewards/rejected": -0.09082427620887756,
"sft_loss": 0.7611902952194214,
"step": 1090
},
{
"epoch": 1.7781369973732066,
"grad_norm": 1.7686785459518433,
"learning_rate": 1.7777476701649318e-06,
"logits/chosen": -2.8446550369262695,
"logits/rejected": -2.85874342918396,
"logps/chosen": -0.7774368524551392,
"logps/rejected": -1.0228512287139893,
"loss": 0.8388,
"odds_ratio_loss": 0.6141053438186646,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07774369418621063,
"rewards/margins": 0.024541418999433517,
"rewards/rejected": -0.10228510946035385,
"sft_loss": 0.7774368524551392,
"step": 1100
},
{
"epoch": 1.7943018791675085,
"grad_norm": 2.743757724761963,
"learning_rate": 1.7372951359341925e-06,
"logits/chosen": -2.8636326789855957,
"logits/rejected": -2.8647377490997314,
"logps/chosen": -0.750954806804657,
"logps/rejected": -0.9340154528617859,
"loss": 0.814,
"odds_ratio_loss": 0.6307731866836548,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.07509546726942062,
"rewards/margins": 0.018306076526641846,
"rewards/rejected": -0.09340154379606247,
"sft_loss": 0.750954806804657,
"step": 1110
},
{
"epoch": 1.8104667609618104,
"grad_norm": 3.9680521488189697,
"learning_rate": 1.6970616164007547e-06,
"logits/chosen": -2.8542914390563965,
"logits/rejected": -2.8552489280700684,
"logps/chosen": -0.7380022406578064,
"logps/rejected": -0.9561580419540405,
"loss": 0.801,
"odds_ratio_loss": 0.6301542520523071,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0738002210855484,
"rewards/margins": 0.0218155849725008,
"rewards/rejected": -0.09561581164598465,
"sft_loss": 0.7380022406578064,
"step": 1120
},
{
"epoch": 1.8266316427561122,
"grad_norm": 2.8756582736968994,
"learning_rate": 1.6570586648305276e-06,
"logits/chosen": -2.8676905632019043,
"logits/rejected": -2.895289897918701,
"logps/chosen": -0.7943655252456665,
"logps/rejected": -1.0809084177017212,
"loss": 0.8591,
"odds_ratio_loss": 0.6475063562393188,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07943655550479889,
"rewards/margins": 0.028654297813773155,
"rewards/rejected": -0.1080908551812172,
"sft_loss": 0.7943655252456665,
"step": 1130
},
{
"epoch": 1.842796524550414,
"grad_norm": 1.8805325031280518,
"learning_rate": 1.6172977682806151e-06,
"logits/chosen": -2.8678653240203857,
"logits/rejected": -2.900193214416504,
"logps/chosen": -0.7862238883972168,
"logps/rejected": -1.0396199226379395,
"loss": 0.8453,
"odds_ratio_loss": 0.5909398198127747,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07862239331007004,
"rewards/margins": 0.02533959411084652,
"rewards/rejected": -0.1039619892835617,
"sft_loss": 0.7862238883972168,
"step": 1140
},
{
"epoch": 1.858961406344716,
"grad_norm": 1.586294174194336,
"learning_rate": 1.5777903443007586e-06,
"logits/chosen": -2.8388750553131104,
"logits/rejected": -2.838686466217041,
"logps/chosen": -0.7984446883201599,
"logps/rejected": -1.093590497970581,
"loss": 0.8601,
"odds_ratio_loss": 0.6163803935050964,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07984446734189987,
"rewards/margins": 0.029514577239751816,
"rewards/rejected": -0.10935904830694199,
"sft_loss": 0.7984446883201599,
"step": 1150
},
{
"epoch": 1.8751262881390178,
"grad_norm": 3.058032751083374,
"learning_rate": 1.5385477376547226e-06,
"logits/chosen": -2.853109121322632,
"logits/rejected": -2.863646984100342,
"logps/chosen": -0.7820562124252319,
"logps/rejected": -1.004570484161377,
"loss": 0.8417,
"odds_ratio_loss": 0.5969026684761047,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.07820562273263931,
"rewards/margins": 0.022251427173614502,
"rewards/rejected": -0.10045703500509262,
"sft_loss": 0.7820562124252319,
"step": 1160
},
{
"epoch": 1.89129116993332,
"grad_norm": 3.296496868133545,
"learning_rate": 1.4995812170625845e-06,
"logits/chosen": -2.8537023067474365,
"logits/rejected": -2.8620083332061768,
"logps/chosen": -0.7803040742874146,
"logps/rejected": -1.1614640951156616,
"loss": 0.8383,
"odds_ratio_loss": 0.5798701047897339,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07803040742874146,
"rewards/margins": 0.038116004317998886,
"rewards/rejected": -0.11614640802145004,
"sft_loss": 0.7803040742874146,
"step": 1170
},
{
"epoch": 1.9074560517276218,
"grad_norm": 2.4982151985168457,
"learning_rate": 1.4609019719648666e-06,
"logits/chosen": -2.8664259910583496,
"logits/rejected": -2.880103826522827,
"logps/chosen": -0.7934621572494507,
"logps/rejected": -1.0411931276321411,
"loss": 0.8522,
"odds_ratio_loss": 0.5876864194869995,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07934621721506119,
"rewards/margins": 0.024773094803094864,
"rewards/rejected": -0.10411931574344635,
"sft_loss": 0.7934621572494507,
"step": 1180
},
{
"epoch": 1.9236209335219236,
"grad_norm": 4.357522964477539,
"learning_rate": 1.42252110930943e-06,
"logits/chosen": -2.8305060863494873,
"logits/rejected": -2.850817918777466,
"logps/chosen": -0.7121320962905884,
"logps/rejected": -0.97893887758255,
"loss": 0.7723,
"odds_ratio_loss": 0.6020933389663696,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07121320813894272,
"rewards/margins": 0.02668066881597042,
"rewards/rejected": -0.09789387881755829,
"sft_loss": 0.7121320962905884,
"step": 1190
},
{
"epoch": 1.9397858153162255,
"grad_norm": 3.2690622806549072,
"learning_rate": 1.3844496503620493e-06,
"logits/chosen": -2.855846881866455,
"logits/rejected": -2.885960817337036,
"logps/chosen": -0.7993025779724121,
"logps/rejected": -1.008312702178955,
"loss": 0.8606,
"odds_ratio_loss": 0.6124933362007141,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07993026077747345,
"rewards/margins": 0.020901009440422058,
"rewards/rejected": -0.10083127021789551,
"sft_loss": 0.7993025779724121,
"step": 1200
},
{
"epoch": 1.9559506971105274,
"grad_norm": 3.07012677192688,
"learning_rate": 1.3466985275416081e-06,
"logits/chosen": -2.8368687629699707,
"logits/rejected": -2.8513948917388916,
"logps/chosen": -0.8561896085739136,
"logps/rejected": -1.0195033550262451,
"loss": 0.9234,
"odds_ratio_loss": 0.6718183159828186,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08561895787715912,
"rewards/margins": 0.016331372782588005,
"rewards/rejected": -0.10195034742355347,
"sft_loss": 0.8561896085739136,
"step": 1210
},
{
"epoch": 1.9721155789048292,
"grad_norm": 4.26687479019165,
"learning_rate": 1.309278581280791e-06,
"logits/chosen": -2.8606760501861572,
"logits/rejected": -2.868224620819092,
"logps/chosen": -0.7406347990036011,
"logps/rejected": -1.0179945230484009,
"loss": 0.7986,
"odds_ratio_loss": 0.5793353319168091,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.07406347990036011,
"rewards/margins": 0.02773597277700901,
"rewards/rejected": -0.10179946571588516,
"sft_loss": 0.7406347990036011,
"step": 1220
},
{
"epoch": 1.9882804606991311,
"grad_norm": 1.2442247867584229,
"learning_rate": 1.272200556913199e-06,
"logits/chosen": -2.8689868450164795,
"logits/rejected": -2.8818325996398926,
"logps/chosen": -0.812061607837677,
"logps/rejected": -1.029280424118042,
"loss": 0.8795,
"odds_ratio_loss": 0.6747404336929321,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08120616525411606,
"rewards/margins": 0.02172188088297844,
"rewards/rejected": -0.102928027510643,
"sft_loss": 0.812061607837677,
"step": 1230
},
{
"epoch": 2.004445342493433,
"grad_norm": 2.5222415924072266,
"learning_rate": 1.2354751015877698e-06,
"logits/chosen": -2.842041015625,
"logits/rejected": -2.861173629760742,
"logps/chosen": -0.7999058961868286,
"logps/rejected": -1.1007378101348877,
"loss": 0.86,
"odds_ratio_loss": 0.6008915305137634,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07999058067798615,
"rewards/margins": 0.030083194375038147,
"rewards/rejected": -0.11007378250360489,
"sft_loss": 0.7999058961868286,
"step": 1240
},
{
"epoch": 2.020610224287735,
"grad_norm": 3.1796367168426514,
"learning_rate": 1.1991127612113945e-06,
"logits/chosen": -2.860217571258545,
"logits/rejected": -2.8857686519622803,
"logps/chosen": -0.7788959741592407,
"logps/rejected": -1.0279576778411865,
"loss": 0.8366,
"odds_ratio_loss": 0.5771896839141846,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0778895914554596,
"rewards/margins": 0.02490617148578167,
"rewards/rejected": -0.10279576480388641,
"sft_loss": 0.7788959741592407,
"step": 1250
},
{
"epoch": 2.036775106082037,
"grad_norm": 2.174238681793213,
"learning_rate": 1.1631239774206035e-06,
"logits/chosen": -2.8261468410491943,
"logits/rejected": -2.8276760578155518,
"logps/chosen": -0.7623487114906311,
"logps/rejected": -1.0154896974563599,
"loss": 0.8249,
"odds_ratio_loss": 0.6253183484077454,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.0762348622083664,
"rewards/margins": 0.02531411312520504,
"rewards/rejected": -0.10154898464679718,
"sft_loss": 0.7623487114906311,
"step": 1260
},
{
"epoch": 2.052939987876339,
"grad_norm": 3.220973253250122,
"learning_rate": 1.1275190845831978e-06,
"logits/chosen": -2.8474819660186768,
"logits/rejected": -2.8597018718719482,
"logps/chosen": -0.730771541595459,
"logps/rejected": -1.0029503107070923,
"loss": 0.7858,
"odds_ratio_loss": 0.550129234790802,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07307715713977814,
"rewards/margins": 0.02721787989139557,
"rewards/rejected": -0.10029502958059311,
"sft_loss": 0.730771541595459,
"step": 1270
},
{
"epoch": 2.0691048696706407,
"grad_norm": 2.44575834274292,
"learning_rate": 1.0923083068306778e-06,
"logits/chosen": -2.8472275733947754,
"logits/rejected": -2.8387467861175537,
"logps/chosen": -0.7656749486923218,
"logps/rejected": -1.1094231605529785,
"loss": 0.8236,
"odds_ratio_loss": 0.5792102813720703,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07656749337911606,
"rewards/margins": 0.03437482565641403,
"rewards/rejected": -0.11094231903553009,
"sft_loss": 0.7656749486923218,
"step": 1280
},
{
"epoch": 2.0852697514649425,
"grad_norm": 1.4943968057632446,
"learning_rate": 1.0575017551223348e-06,
"logits/chosen": -2.829378128051758,
"logits/rejected": -2.8376450538635254,
"logps/chosen": -0.7342156171798706,
"logps/rejected": -0.9912710189819336,
"loss": 0.7958,
"odds_ratio_loss": 0.6156936883926392,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07342156767845154,
"rewards/margins": 0.02570553496479988,
"rewards/rejected": -0.09912709891796112,
"sft_loss": 0.7342156171798706,
"step": 1290
},
{
"epoch": 2.1014346332592444,
"grad_norm": 2.5311193466186523,
"learning_rate": 1.023109424341833e-06,
"logits/chosen": -2.8397974967956543,
"logits/rejected": -2.8779385089874268,
"logps/chosen": -0.7779219746589661,
"logps/rejected": -1.1433827877044678,
"loss": 0.8376,
"odds_ratio_loss": 0.5970156192779541,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.0777921974658966,
"rewards/margins": 0.03654608502984047,
"rewards/rejected": -0.11433827877044678,
"sft_loss": 0.7779219746589661,
"step": 1300
},
{
"epoch": 2.1175995150535463,
"grad_norm": 2.6538310050964355,
"learning_rate": 9.891411904271273e-07,
"logits/chosen": -2.856947422027588,
"logits/rejected": -2.86110782623291,
"logps/chosen": -0.7499477863311768,
"logps/rejected": -0.9801033139228821,
"loss": 0.8093,
"odds_ratio_loss": 0.593558669090271,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.0749947652220726,
"rewards/margins": 0.023015562444925308,
"rewards/rejected": -0.0980103388428688,
"sft_loss": 0.7499477863311768,
"step": 1310
},
{
"epoch": 2.133764396847848,
"grad_norm": 1.2850011587142944,
"learning_rate": 9.556068075345363e-07,
"logits/chosen": -2.8736729621887207,
"logits/rejected": -2.8673884868621826,
"logps/chosen": -0.7692313194274902,
"logps/rejected": -0.9742280840873718,
"loss": 0.8271,
"odds_ratio_loss": 0.5790851712226868,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07692314684391022,
"rewards/margins": 0.02049967274069786,
"rewards/rejected": -0.09742281585931778,
"sft_loss": 0.7692313194274902,
"step": 1320
},
{
"epoch": 2.14992927864215,
"grad_norm": 1.7034938335418701,
"learning_rate": 9.225159052377838e-07,
"logits/chosen": -2.834965944290161,
"logits/rejected": -2.8684887886047363,
"logps/chosen": -0.796667218208313,
"logps/rejected": -1.1322475671768188,
"loss": 0.8554,
"odds_ratio_loss": 0.587177574634552,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07966671884059906,
"rewards/margins": 0.03355802968144417,
"rewards/rejected": -0.11322475969791412,
"sft_loss": 0.796667218208313,
"step": 1330
},
{
"epoch": 2.166094160436452,
"grad_norm": 2.5143074989318848,
"learning_rate": 8.898779857628184e-07,
"logits/chosen": -2.8322224617004395,
"logits/rejected": -2.8632161617279053,
"logps/chosen": -0.6862845420837402,
"logps/rejected": -0.923437774181366,
"loss": 0.7449,
"odds_ratio_loss": 0.5857266783714294,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.0686284601688385,
"rewards/margins": 0.02371532842516899,
"rewards/rejected": -0.0923437848687172,
"sft_loss": 0.6862845420837402,
"step": 1340
},
{
"epoch": 2.1822590422307537,
"grad_norm": 1.7262011766433716,
"learning_rate": 8.577024212591975e-07,
"logits/chosen": -2.8671224117279053,
"logits/rejected": -2.867626428604126,
"logps/chosen": -0.7982193231582642,
"logps/rejected": -0.9524084329605103,
"loss": 0.862,
"odds_ratio_loss": 0.6382196545600891,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07982192933559418,
"rewards/margins": 0.01541891973465681,
"rewards/rejected": -0.09524084627628326,
"sft_loss": 0.7982193231582642,
"step": 1350
},
{
"epoch": 2.1984239240250556,
"grad_norm": 1.9137386083602905,
"learning_rate": 8.259984511088276e-07,
"logits/chosen": -2.8300180435180664,
"logits/rejected": -2.8534936904907227,
"logps/chosen": -0.7877185940742493,
"logps/rejected": -1.0415524244308472,
"loss": 0.8505,
"odds_ratio_loss": 0.6278126239776611,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07877186685800552,
"rewards/margins": 0.025383388623595238,
"rewards/rejected": -0.10415525734424591,
"sft_loss": 0.7877185940742493,
"step": 1360
},
{
"epoch": 2.2145888058193575,
"grad_norm": 2.398965835571289,
"learning_rate": 7.947751792728237e-07,
"logits/chosen": -2.8527517318725586,
"logits/rejected": -2.8384506702423096,
"logps/chosen": -0.7678119540214539,
"logps/rejected": -1.105531930923462,
"loss": 0.8275,
"odds_ratio_loss": 0.5968826413154602,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.07678119093179703,
"rewards/margins": 0.03377201408147812,
"rewards/rejected": -0.11055320501327515,
"sft_loss": 0.7678119540214539,
"step": 1370
},
{
"epoch": 2.2307536876136593,
"grad_norm": 2.101724147796631,
"learning_rate": 7.640415716772626e-07,
"logits/chosen": -2.8620262145996094,
"logits/rejected": -2.881200075149536,
"logps/chosen": -0.7912808656692505,
"logps/rejected": -1.0620834827423096,
"loss": 0.8546,
"odds_ratio_loss": 0.6336351633071899,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07912809401750565,
"rewards/margins": 0.027080247178673744,
"rewards/rejected": -0.10620833933353424,
"sft_loss": 0.7912808656692505,
"step": 1380
},
{
"epoch": 2.246918569407961,
"grad_norm": 1.2350420951843262,
"learning_rate": 7.338064536385722e-07,
"logits/chosen": -2.839816093444824,
"logits/rejected": -2.84806489944458,
"logps/chosen": -0.7491471171379089,
"logps/rejected": -1.098024606704712,
"loss": 0.8078,
"odds_ratio_loss": 0.5867569446563721,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.07491471618413925,
"rewards/margins": 0.03488774597644806,
"rewards/rejected": -0.10980246961116791,
"sft_loss": 0.7491471171379089,
"step": 1390
},
{
"epoch": 2.263083451202263,
"grad_norm": 3.2553515434265137,
"learning_rate": 7.040785073292883e-07,
"logits/chosen": -2.795974016189575,
"logits/rejected": -2.812316417694092,
"logps/chosen": -0.8446899652481079,
"logps/rejected": -1.1183385848999023,
"loss": 0.9119,
"odds_ratio_loss": 0.6722968220710754,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08446899801492691,
"rewards/margins": 0.02736486867070198,
"rewards/rejected": -0.111833855509758,
"sft_loss": 0.8446899652481079,
"step": 1400
},
{
"epoch": 2.279248332996565,
"grad_norm": 1.5375083684921265,
"learning_rate": 6.748662692849297e-07,
"logits/chosen": -2.8378682136535645,
"logits/rejected": -2.8527588844299316,
"logps/chosen": -0.7140767574310303,
"logps/rejected": -1.1210377216339111,
"loss": 0.7679,
"odds_ratio_loss": 0.5377554893493652,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.07140768319368362,
"rewards/margins": 0.04069609194993973,
"rewards/rejected": -0.11210376024246216,
"sft_loss": 0.7140767574310303,
"step": 1410
},
{
"epoch": 2.295413214790867,
"grad_norm": 3.371690273284912,
"learning_rate": 6.46178127952686e-07,
"logits/chosen": -2.8596229553222656,
"logits/rejected": -2.86143159866333,
"logps/chosen": -0.7527777552604675,
"logps/rejected": -1.0262553691864014,
"loss": 0.8073,
"odds_ratio_loss": 0.5452762842178345,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07527776062488556,
"rewards/margins": 0.02734777331352234,
"rewards/rejected": -0.10262554883956909,
"sft_loss": 0.7527777552604675,
"step": 1420
},
{
"epoch": 2.3115780965851687,
"grad_norm": 5.5002760887146,
"learning_rate": 6.180223212826289e-07,
"logits/chosen": -2.8466854095458984,
"logits/rejected": -2.84420108795166,
"logps/chosen": -0.760028600692749,
"logps/rejected": -1.0010223388671875,
"loss": 0.8196,
"odds_ratio_loss": 0.595847487449646,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07600285112857819,
"rewards/margins": 0.024099376052618027,
"rewards/rejected": -0.10010223090648651,
"sft_loss": 0.760028600692749,
"step": 1430
},
{
"epoch": 2.3277429783794705,
"grad_norm": 2.094597339630127,
"learning_rate": 5.904069343621443e-07,
"logits/chosen": -2.8559889793395996,
"logits/rejected": -2.843318462371826,
"logps/chosen": -0.7583047747612,
"logps/rejected": -1.0201733112335205,
"loss": 0.8157,
"odds_ratio_loss": 0.5739010572433472,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07583048194646835,
"rewards/margins": 0.02618684433400631,
"rewards/rejected": -0.10201732814311981,
"sft_loss": 0.7583047747612,
"step": 1440
},
{
"epoch": 2.3439078601737724,
"grad_norm": 3.256753444671631,
"learning_rate": 5.633398970942544e-07,
"logits/chosen": -2.8187243938446045,
"logits/rejected": -2.8463759422302246,
"logps/chosen": -0.763822078704834,
"logps/rejected": -0.9972942471504211,
"loss": 0.8274,
"odds_ratio_loss": 0.6356968283653259,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07638221234083176,
"rewards/margins": 0.023347217589616776,
"rewards/rejected": -0.09972943365573883,
"sft_loss": 0.763822078704834,
"step": 1450
},
{
"epoch": 2.3600727419680743,
"grad_norm": 2.1988418102264404,
"learning_rate": 5.368289819205069e-07,
"logits/chosen": -2.8621747493743896,
"logits/rejected": -2.8629798889160156,
"logps/chosen": -0.699676513671875,
"logps/rejected": -0.9881321787834167,
"loss": 0.7602,
"odds_ratio_loss": 0.6056861877441406,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.06996765732765198,
"rewards/margins": 0.028845559805631638,
"rewards/rejected": -0.09881322085857391,
"sft_loss": 0.699676513671875,
"step": 1460
},
{
"epoch": 2.376237623762376,
"grad_norm": 2.666426181793213,
"learning_rate": 5.108818015890785e-07,
"logits/chosen": -2.8656005859375,
"logits/rejected": -2.889970302581787,
"logps/chosen": -0.8437716364860535,
"logps/rejected": -1.0408810377120972,
"loss": 0.9052,
"odds_ratio_loss": 0.6140363216400146,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08437716960906982,
"rewards/margins": 0.01971094310283661,
"rewards/rejected": -0.10408811271190643,
"sft_loss": 0.8437716364860535,
"step": 1470
},
{
"epoch": 2.392402505556678,
"grad_norm": 2.2777225971221924,
"learning_rate": 4.855058069687291e-07,
"logits/chosen": -2.834155559539795,
"logits/rejected": -2.8524587154388428,
"logps/chosen": -0.7329773306846619,
"logps/rejected": -1.1425807476043701,
"loss": 0.7861,
"odds_ratio_loss": 0.5314901471138,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.07329773157835007,
"rewards/margins": 0.04096033796668053,
"rewards/rejected": -0.11425807327032089,
"sft_loss": 0.7329773306846619,
"step": 1480
},
{
"epoch": 2.40856738735098,
"grad_norm": 2.6650478839874268,
"learning_rate": 4.607082849092523e-07,
"logits/chosen": -2.862356662750244,
"logits/rejected": -2.864802598953247,
"logps/chosen": -0.829633891582489,
"logps/rejected": -1.0255271196365356,
"loss": 0.8935,
"odds_ratio_loss": 0.638370156288147,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.08296339213848114,
"rewards/margins": 0.019589336588978767,
"rewards/rejected": -0.10255272686481476,
"sft_loss": 0.829633891582489,
"step": 1490
},
{
"epoch": 2.4247322691452817,
"grad_norm": 3.085514783859253,
"learning_rate": 4.3649635614901405e-07,
"logits/chosen": -2.8451571464538574,
"logits/rejected": -2.8950095176696777,
"logps/chosen": -0.7389890551567078,
"logps/rejected": -0.8802745938301086,
"loss": 0.8035,
"odds_ratio_loss": 0.6446704864501953,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.07389890402555466,
"rewards/margins": 0.014128552749752998,
"rewards/rejected": -0.0880274623632431,
"sft_loss": 0.7389890551567078,
"step": 1500
},
{
"epoch": 2.4247322691452817,
"eval_logits/chosen": -2.8472585678100586,
"eval_logits/rejected": -2.8558220863342285,
"eval_logps/chosen": -0.7975095510482788,
"eval_logps/rejected": -1.0328320264816284,
"eval_loss": 0.8629826903343201,
"eval_odds_ratio_loss": 0.6547309160232544,
"eval_rewards/accuracies": 0.5618181824684143,
"eval_rewards/chosen": -0.07975095510482788,
"eval_rewards/margins": 0.02353225089609623,
"eval_rewards/rejected": -0.10328320413827896,
"eval_runtime": 194.6849,
"eval_samples_per_second": 5.65,
"eval_sft_loss": 0.7975095510482788,
"eval_steps_per_second": 2.825,
"step": 1500
},
{
"epoch": 2.4408971509395836,
"grad_norm": 1.7019646167755127,
"learning_rate": 4.128769732701973e-07,
"logits/chosen": -2.82879638671875,
"logits/rejected": -2.832578420639038,
"logps/chosen": -0.7700603604316711,
"logps/rejected": -0.9951756596565247,
"loss": 0.8304,
"odds_ratio_loss": 0.6030290722846985,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.077006034553051,
"rewards/margins": 0.022511538118124008,
"rewards/rejected": -0.0995175689458847,
"sft_loss": 0.7700603604316711,
"step": 1510
},
{
"epoch": 2.4570620327338855,
"grad_norm": 2.5611681938171387,
"learning_rate": 3.8985691870233046e-07,
"logits/chosen": -2.882220506668091,
"logits/rejected": -2.880516529083252,
"logps/chosen": -0.7692660689353943,
"logps/rejected": -1.0380921363830566,
"loss": 0.8284,
"odds_ratio_loss": 0.5917290449142456,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07692660391330719,
"rewards/margins": 0.026882609352469444,
"rewards/rejected": -0.10380921512842178,
"sft_loss": 0.7692660689353943,
"step": 1520
},
{
"epoch": 2.4732269145281873,
"grad_norm": 2.6633763313293457,
"learning_rate": 3.6744280277467904e-07,
"logits/chosen": -2.8530020713806152,
"logits/rejected": -2.8719234466552734,
"logps/chosen": -0.7769867181777954,
"logps/rejected": -1.0218976736068726,
"loss": 0.8392,
"odds_ratio_loss": 0.6218123435974121,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07769867032766342,
"rewards/margins": 0.024491112679243088,
"rewards/rejected": -0.10218977928161621,
"sft_loss": 0.7769867181777954,
"step": 1530
},
{
"epoch": 2.489391796322489,
"grad_norm": 2.7384212017059326,
"learning_rate": 3.456410618180503e-07,
"logits/chosen": -2.832824468612671,
"logits/rejected": -2.856114149093628,
"logps/chosen": -0.7060586810112,
"logps/rejected": -1.0986192226409912,
"loss": 0.7646,
"odds_ratio_loss": 0.5853801965713501,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.07060587406158447,
"rewards/margins": 0.03925605118274689,
"rewards/rejected": -0.10986192524433136,
"sft_loss": 0.7060586810112,
"step": 1540
},
{
"epoch": 2.5055566781167915,
"grad_norm": 1.9465371370315552,
"learning_rate": 3.244579563165753e-07,
"logits/chosen": -2.8586621284484863,
"logits/rejected": -2.869255542755127,
"logps/chosen": -0.7589577436447144,
"logps/rejected": -1.1315686702728271,
"loss": 0.8173,
"odds_ratio_loss": 0.5836090445518494,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.0758957713842392,
"rewards/margins": 0.03726109117269516,
"rewards/rejected": -0.11315685510635376,
"sft_loss": 0.7589577436447144,
"step": 1550
},
{
"epoch": 2.521721559911093,
"grad_norm": 1.2344708442687988,
"learning_rate": 3.038995691099697e-07,
"logits/chosen": -2.8416831493377686,
"logits/rejected": -2.85313081741333,
"logps/chosen": -0.7924615144729614,
"logps/rejected": -1.2077696323394775,
"loss": 0.8503,
"odds_ratio_loss": 0.5783108472824097,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.0792461559176445,
"rewards/margins": 0.041530806571245193,
"rewards/rejected": -0.1207769513130188,
"sft_loss": 0.7924615144729614,
"step": 1560
},
{
"epoch": 2.5378864417053952,
"grad_norm": 12.726688385009766,
"learning_rate": 2.839718036468192e-07,
"logits/chosen": -2.8868002891540527,
"logits/rejected": -2.9153692722320557,
"logps/chosen": -0.884573757648468,
"logps/rejected": -1.0609769821166992,
"loss": 0.9513,
"odds_ratio_loss": 0.6675292253494263,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.0884573832154274,
"rewards/margins": 0.01764032617211342,
"rewards/rejected": -0.10609769821166992,
"sft_loss": 0.884573757648468,
"step": 1570
},
{
"epoch": 2.5540513234996967,
"grad_norm": 2.5232503414154053,
"learning_rate": 2.646803822893723e-07,
"logits/chosen": -2.8850457668304443,
"logits/rejected": -2.894557476043701,
"logps/chosen": -0.8000026941299438,
"logps/rejected": -1.0157983303070068,
"loss": 0.8627,
"odds_ratio_loss": 0.6269931793212891,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08000027388334274,
"rewards/margins": 0.02157955802977085,
"rewards/rejected": -0.10157983005046844,
"sft_loss": 0.8000026941299438,
"step": 1580
},
{
"epoch": 2.570216205293999,
"grad_norm": 2.3380508422851562,
"learning_rate": 2.460308446703341e-07,
"logits/chosen": -2.8933000564575195,
"logits/rejected": -2.8834781646728516,
"logps/chosen": -0.791167676448822,
"logps/rejected": -0.9255102276802063,
"loss": 0.8556,
"odds_ratio_loss": 0.6445525884628296,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07911677658557892,
"rewards/margins": 0.013434251770377159,
"rewards/rejected": -0.09255101531744003,
"sft_loss": 0.791167676448822,
"step": 1590
},
{
"epoch": 2.5863810870883004,
"grad_norm": 3.6344377994537354,
"learning_rate": 2.2802854610213143e-07,
"logits/chosen": -2.8420848846435547,
"logits/rejected": -2.8515543937683105,
"logps/chosen": -0.6993797421455383,
"logps/rejected": -1.0781666040420532,
"loss": 0.7531,
"odds_ratio_loss": 0.5369757413864136,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.06993797421455383,
"rewards/margins": 0.03787868469953537,
"rewards/rejected": -0.1078166589140892,
"sft_loss": 0.6993797421455383,
"step": 1600
},
{
"epoch": 2.6025459688826027,
"grad_norm": 2.515239715576172,
"learning_rate": 2.106786560391072e-07,
"logits/chosen": -2.8365635871887207,
"logits/rejected": -2.8803467750549316,
"logps/chosen": -0.8032782673835754,
"logps/rejected": -1.0168392658233643,
"loss": 0.8638,
"odds_ratio_loss": 0.6049396395683289,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.0803278312087059,
"rewards/margins": 0.021356089040637016,
"rewards/rejected": -0.10168392956256866,
"sft_loss": 0.8032782673835754,
"step": 1610
},
{
"epoch": 2.6187108506769046,
"grad_norm": 1.520639181137085,
"learning_rate": 1.9398615659308255e-07,
"logits/chosen": -2.861687183380127,
"logits/rejected": -2.89752459526062,
"logps/chosen": -0.7549802660942078,
"logps/rejected": -0.9435558319091797,
"loss": 0.8181,
"odds_ratio_loss": 0.6309365034103394,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07549802213907242,
"rewards/margins": 0.018857568502426147,
"rewards/rejected": -0.09435557574033737,
"sft_loss": 0.7549802660942078,
"step": 1620
},
{
"epoch": 2.6348757324712064,
"grad_norm": 2.2465171813964844,
"learning_rate": 1.7795584110272184e-07,
"logits/chosen": -2.8905723094940186,
"logits/rejected": -2.877936840057373,
"logps/chosen": -0.7934287786483765,
"logps/rejected": -1.0050441026687622,
"loss": 0.8594,
"odds_ratio_loss": 0.6593586802482605,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07934287935495377,
"rewards/margins": 0.021161522716283798,
"rewards/rejected": -0.10050439834594727,
"sft_loss": 0.7934287786483765,
"step": 1630
},
{
"epoch": 2.6510406142655083,
"grad_norm": 4.033486366271973,
"learning_rate": 1.6259231275709636e-07,
"logits/chosen": -2.8982126712799072,
"logits/rejected": -2.8980660438537598,
"logps/chosen": -0.7681853175163269,
"logps/rejected": -0.9490568041801453,
"loss": 0.8356,
"odds_ratio_loss": 0.6740620732307434,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0768185406923294,
"rewards/margins": 0.018087133765220642,
"rewards/rejected": -0.09490568190813065,
"sft_loss": 0.7681853175163269,
"step": 1640
},
{
"epoch": 2.66720549605981,
"grad_norm": 1.5368350744247437,
"learning_rate": 1.478999832738548e-07,
"logits/chosen": -2.8781023025512695,
"logits/rejected": -2.8767361640930176,
"logps/chosen": -0.7599083185195923,
"logps/rejected": -1.0983332395553589,
"loss": 0.82,
"odds_ratio_loss": 0.601204514503479,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07599084079265594,
"rewards/margins": 0.033842481672763824,
"rewards/rejected": -0.10983331501483917,
"sft_loss": 0.7599083185195923,
"step": 1650
},
{
"epoch": 2.683370377854112,
"grad_norm": 1.8103063106536865,
"learning_rate": 1.338830716323769e-07,
"logits/chosen": -2.8456664085388184,
"logits/rejected": -2.8552403450012207,
"logps/chosen": -0.8041807413101196,
"logps/rejected": -0.9866863489151001,
"loss": 0.8687,
"odds_ratio_loss": 0.6454349756240845,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.08041806519031525,
"rewards/margins": 0.018250569701194763,
"rewards/rejected": -0.0986686423420906,
"sft_loss": 0.8041807413101196,
"step": 1660
},
{
"epoch": 2.699535259648414,
"grad_norm": 3.796130657196045,
"learning_rate": 1.205456028622723e-07,
"logits/chosen": -2.8858485221862793,
"logits/rejected": -2.883568286895752,
"logps/chosen": -0.7273125648498535,
"logps/rejected": -1.0116485357284546,
"loss": 0.7835,
"odds_ratio_loss": 0.5615276098251343,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07273125648498535,
"rewards/margins": 0.02843359112739563,
"rewards/rejected": -0.10116485506296158,
"sft_loss": 0.7273125648498535,
"step": 1670
},
{
"epoch": 2.7157001414427158,
"grad_norm": 1.619040608406067,
"learning_rate": 1.0789140688756805e-07,
"logits/chosen": -2.8932971954345703,
"logits/rejected": -2.8933002948760986,
"logps/chosen": -0.7631897926330566,
"logps/rejected": -1.0072143077850342,
"loss": 0.8217,
"odds_ratio_loss": 0.5846946239471436,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07631897926330566,
"rewards/margins": 0.024402452632784843,
"rewards/rejected": -0.10072143375873566,
"sft_loss": 0.7631897926330566,
"step": 1680
},
{
"epoch": 2.7318650232370176,
"grad_norm": 4.591987133026123,
"learning_rate": 9.592411742693098e-08,
"logits/chosen": -2.8280813694000244,
"logits/rejected": -2.832314968109131,
"logps/chosen": -0.7757545709609985,
"logps/rejected": -0.9772068858146667,
"loss": 0.845,
"odds_ratio_loss": 0.6925373673439026,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.0775754451751709,
"rewards/margins": 0.020145252346992493,
"rewards/rejected": -0.09772069752216339,
"sft_loss": 0.7757545709609985,
"step": 1690
},
{
"epoch": 2.7480299050313195,
"grad_norm": 2.0528857707977295,
"learning_rate": 8.464717095022168e-08,
"logits/chosen": -2.8116049766540527,
"logits/rejected": -2.8237504959106445,
"logps/chosen": -0.7476006746292114,
"logps/rejected": -1.0309717655181885,
"loss": 0.805,
"odds_ratio_loss": 0.574048638343811,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.0747600644826889,
"rewards/margins": 0.02833711728453636,
"rewards/rejected": -0.10309717804193497,
"sft_loss": 0.7476006746292114,
"step": 1700
},
{
"epoch": 2.7641947868256214,
"grad_norm": 2.445467233657837,
"learning_rate": 7.406380569169841e-08,
"logits/chosen": -2.860349178314209,
"logits/rejected": -2.8944199085235596,
"logps/chosen": -0.7957582473754883,
"logps/rejected": -0.9725676774978638,
"loss": 0.8593,
"odds_ratio_loss": 0.6357892155647278,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07957582920789719,
"rewards/margins": 0.01768093928694725,
"rewards/rejected": -0.09725676476955414,
"sft_loss": 0.7957582473754883,
"step": 1710
},
{
"epoch": 2.7803596686199232,
"grad_norm": 11.543617248535156,
"learning_rate": 6.417706072013808e-08,
"logits/chosen": -2.8683581352233887,
"logits/rejected": -2.894205331802368,
"logps/chosen": -0.7598998546600342,
"logps/rejected": -0.9663190841674805,
"loss": 0.8231,
"odds_ratio_loss": 0.6316258907318115,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07598999887704849,
"rewards/margins": 0.02064192108809948,
"rewards/rejected": -0.09663191437721252,
"sft_loss": 0.7598998546600342,
"step": 1720
},
{
"epoch": 2.796524550414225,
"grad_norm": 3.360384941101074,
"learning_rate": 5.498977506615294e-08,
"logits/chosen": -2.8601443767547607,
"logits/rejected": -2.898664712905884,
"logps/chosen": -0.790396511554718,
"logps/rejected": -0.9606446027755737,
"loss": 0.8544,
"odds_ratio_loss": 0.6396982073783875,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07903965562582016,
"rewards/margins": 0.01702481135725975,
"rewards/rejected": -0.09606447070837021,
"sft_loss": 0.790396511554718,
"step": 1730
},
{
"epoch": 2.812689432208527,
"grad_norm": 2.132490873336792,
"learning_rate": 4.6504586906947756e-08,
"logits/chosen": -2.8836772441864014,
"logits/rejected": -2.9003067016601562,
"logps/chosen": -0.8166056871414185,
"logps/rejected": -0.9932202100753784,
"loss": 0.8767,
"odds_ratio_loss": 0.6010292768478394,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.08166056871414185,
"rewards/margins": 0.01766144670546055,
"rewards/rejected": -0.09932202100753784,
"sft_loss": 0.8166056871414185,
"step": 1740
},
{
"epoch": 2.828854314002829,
"grad_norm": 7.204352855682373,
"learning_rate": 3.8723932808754914e-08,
"logits/chosen": -2.887660503387451,
"logits/rejected": -2.9059557914733887,
"logps/chosen": -0.8569768667221069,
"logps/rejected": -0.9907077550888062,
"loss": 0.9219,
"odds_ratio_loss": 0.6491862535476685,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.08569768816232681,
"rewards/margins": 0.01337310392409563,
"rewards/rejected": -0.09907079488039017,
"sft_loss": 0.8569768667221069,
"step": 1750
},
{
"epoch": 2.8450191957971307,
"grad_norm": 3.7778828144073486,
"learning_rate": 3.1650047027158014e-08,
"logits/chosen": -2.8876945972442627,
"logits/rejected": -2.9152872562408447,
"logps/chosen": -0.7689987421035767,
"logps/rejected": -0.981308102607727,
"loss": 0.828,
"odds_ratio_loss": 0.5896368622779846,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07689988613128662,
"rewards/margins": 0.02123093418776989,
"rewards/rejected": -0.09813080728054047,
"sft_loss": 0.7689987421035767,
"step": 1760
},
{
"epoch": 2.8611840775914326,
"grad_norm": 1.726138949394226,
"learning_rate": 2.5284960865517848e-08,
"logits/chosen": -2.851304769515991,
"logits/rejected": -2.871598243713379,
"logps/chosen": -0.7240949273109436,
"logps/rejected": -1.0288841724395752,
"loss": 0.7798,
"odds_ratio_loss": 0.5571027994155884,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0724094957113266,
"rewards/margins": 0.030478913336992264,
"rewards/rejected": -0.10288842022418976,
"sft_loss": 0.7240949273109436,
"step": 1770
},
{
"epoch": 2.8773489593857344,
"grad_norm": 2.2119297981262207,
"learning_rate": 1.9630502091670388e-08,
"logits/chosen": -2.8459057807922363,
"logits/rejected": -2.866259813308716,
"logps/chosen": -0.7477800250053406,
"logps/rejected": -1.0080687999725342,
"loss": 0.8054,
"odds_ratio_loss": 0.5758811235427856,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.0747780054807663,
"rewards/margins": 0.026028871536254883,
"rewards/rejected": -0.10080687701702118,
"sft_loss": 0.7477800250053406,
"step": 1780
},
{
"epoch": 2.8935138411800363,
"grad_norm": 2.910409450531006,
"learning_rate": 1.4688294413074677e-08,
"logits/chosen": -2.850733757019043,
"logits/rejected": -2.8780460357666016,
"logps/chosen": -0.6847941279411316,
"logps/rejected": -1.00661301612854,
"loss": 0.7411,
"odds_ratio_loss": 0.5632899403572083,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.06847941130399704,
"rewards/margins": 0.03218189254403114,
"rewards/rejected": -0.10066130012273788,
"sft_loss": 0.6847941279411316,
"step": 1790
},
{
"epoch": 2.909678722974338,
"grad_norm": 2.044072389602661,
"learning_rate": 1.0459757010556626e-08,
"logits/chosen": -2.856724262237549,
"logits/rejected": -2.877833366394043,
"logps/chosen": -0.7718300223350525,
"logps/rejected": -0.9458082914352417,
"loss": 0.8346,
"odds_ratio_loss": 0.6273509860038757,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.07718300819396973,
"rewards/margins": 0.017397824674844742,
"rewards/rejected": -0.09458083659410477,
"sft_loss": 0.7718300223350525,
"step": 1800
},
{
"epoch": 2.92584360476864,
"grad_norm": 1.9232614040374756,
"learning_rate": 6.94610413078306e-09,
"logits/chosen": -2.8028831481933594,
"logits/rejected": -2.8568198680877686,
"logps/chosen": -0.8266820907592773,
"logps/rejected": -1.2092140913009644,
"loss": 0.8869,
"odds_ratio_loss": 0.6017346382141113,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.08266820758581161,
"rewards/margins": 0.03825319558382034,
"rewards/rejected": -0.12092139571905136,
"sft_loss": 0.8266820907592773,
"step": 1810
},
{
"epoch": 2.942008486562942,
"grad_norm": 1.0960156917572021,
"learning_rate": 4.14834473758563e-09,
"logits/chosen": -2.8286824226379395,
"logits/rejected": -2.838784694671631,
"logps/chosen": -0.7189845442771912,
"logps/rejected": -0.9857820272445679,
"loss": 0.7756,
"odds_ratio_loss": 0.5664829015731812,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.0718984454870224,
"rewards/margins": 0.02667975425720215,
"rewards/rejected": -0.09857820719480515,
"sft_loss": 0.7189845442771912,
"step": 1820
},
{
"epoch": 2.9581733683572438,
"grad_norm": 1.63419771194458,
"learning_rate": 2.067282222230349e-09,
"logits/chosen": -2.8597445487976074,
"logits/rejected": -2.8696541786193848,
"logps/chosen": -0.7367098331451416,
"logps/rejected": -1.0127137899398804,
"loss": 0.7943,
"odds_ratio_loss": 0.5762413740158081,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07367098331451416,
"rewards/margins": 0.027600402012467384,
"rewards/rejected": -0.1012713760137558,
"sft_loss": 0.7367098331451416,
"step": 1830
},
{
"epoch": 2.9743382501515456,
"grad_norm": 2.9457271099090576,
"learning_rate": 7.035141727212979e-10,
"logits/chosen": -2.8564071655273438,
"logits/rejected": -2.8889355659484863,
"logps/chosen": -0.7218343615531921,
"logps/rejected": -1.0010156631469727,
"loss": 0.7784,
"odds_ratio_loss": 0.5654899477958679,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07218344509601593,
"rewards/margins": 0.02791813388466835,
"rewards/rejected": -0.10010156780481339,
"sft_loss": 0.7218343615531921,
"step": 1840
},
{
"epoch": 2.9905031319458475,
"grad_norm": 4.486654758453369,
"learning_rate": 5.743220219761592e-11,
"logits/chosen": -2.8505501747131348,
"logits/rejected": -2.870176076889038,
"logps/chosen": -0.8715106248855591,
"logps/rejected": -1.054720401763916,
"loss": 0.9404,
"odds_ratio_loss": 0.6889584064483643,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.08715107291936874,
"rewards/margins": 0.0183209627866745,
"rewards/rejected": -0.10547204315662384,
"sft_loss": 0.8715106248855591,
"step": 1850
},
{
"epoch": 2.9969690846635686,
"step": 1854,
"total_flos": 2.1013894560546816e+18,
"train_loss": 0.9013287582572352,
"train_runtime": 18144.1457,
"train_samples_per_second": 1.637,
"train_steps_per_second": 0.102
}
],
"logging_steps": 10,
"max_steps": 1854,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 2.1013894560546816e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}