dpo-selective-buffer-safeipo / trainer_state.json
wxzhang's picture
Model save
8ffa4d8 verified
raw
history blame
No virus
111 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9995965030262273,
"eval_steps": 500,
"global_step": 1858,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 2.6881720430107528e-09,
"logits/chosen": -2.5808520317077637,
"logits/rejected": -2.0101242065429688,
"logps/chosen": -299.3489990234375,
"logps/rejected": -186.63014221191406,
"loss": 1.2656,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"rewards/safe_rewards": 0.0,
"rewards/unsafe_rewards": 0.0,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 2.6881720430107527e-08,
"logits/chosen": -2.38761043548584,
"logits/rejected": -2.2287850379943848,
"logps/chosen": -201.83148193359375,
"logps/rejected": -189.46726989746094,
"loss": 1.4296,
"rewards/accuracies": 0.4305555522441864,
"rewards/chosen": 2.8226104404893704e-05,
"rewards/margins": -9.960395254893228e-05,
"rewards/rejected": 0.00012783010606653988,
"rewards/safe_rewards": -0.0001673989463597536,
"rewards/unsafe_rewards": 0.0002238511951873079,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 5.3763440860215054e-08,
"logits/chosen": -2.3484911918640137,
"logits/rejected": -2.053339719772339,
"logps/chosen": -226.3044891357422,
"logps/rejected": -181.17330932617188,
"loss": 1.463,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -4.27155100624077e-05,
"rewards/margins": 3.7895108562224777e-06,
"rewards/rejected": -4.650496339309029e-05,
"rewards/safe_rewards": -0.0004773393739014864,
"rewards/unsafe_rewards": 0.0003919084556400776,
"step": 20
},
{
"epoch": 0.02,
"learning_rate": 8.064516129032257e-08,
"logits/chosen": -2.3405332565307617,
"logits/rejected": -2.145922899246216,
"logps/chosen": -215.05410766601562,
"logps/rejected": -189.3188018798828,
"loss": 1.431,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 3.503418338368647e-05,
"rewards/margins": 0.0005787784466519952,
"rewards/rejected": -0.0005437443032860756,
"rewards/safe_rewards": -0.00011984705633949488,
"rewards/unsafe_rewards": 0.00018991540127899498,
"step": 30
},
{
"epoch": 0.02,
"learning_rate": 1.0752688172043011e-07,
"logits/chosen": -2.2765462398529053,
"logits/rejected": -1.974180817604065,
"logps/chosen": -180.71937561035156,
"logps/rejected": -173.9296417236328,
"loss": 1.4304,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.00015593590796925128,
"rewards/margins": 0.0017944574356079102,
"rewards/rejected": -0.0016385214403271675,
"rewards/safe_rewards": 0.00035788281820714474,
"rewards/unsafe_rewards": -4.6010944060981274e-05,
"step": 40
},
{
"epoch": 0.03,
"learning_rate": 1.3440860215053762e-07,
"logits/chosen": -2.403860569000244,
"logits/rejected": -2.0332884788513184,
"logps/chosen": -209.592529296875,
"logps/rejected": -167.6835174560547,
"loss": 1.4344,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.0008560878923162818,
"rewards/margins": 0.004450940527021885,
"rewards/rejected": -0.0035948525182902813,
"rewards/safe_rewards": 0.00018845750309992582,
"rewards/unsafe_rewards": 0.0015237184707075357,
"step": 50
},
{
"epoch": 0.03,
"learning_rate": 1.6129032258064515e-07,
"logits/chosen": -2.330204486846924,
"logits/rejected": -2.1555492877960205,
"logps/chosen": -185.8196563720703,
"logps/rejected": -185.08883666992188,
"loss": 1.4264,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0013676225207746029,
"rewards/margins": 0.0021811036858707666,
"rewards/rejected": -0.003548726439476013,
"rewards/safe_rewards": -0.000750910839997232,
"rewards/unsafe_rewards": -0.001984334085136652,
"step": 60
},
{
"epoch": 0.04,
"learning_rate": 1.8817204301075268e-07,
"logits/chosen": -2.3298559188842773,
"logits/rejected": -2.0787758827209473,
"logps/chosen": -202.39566040039062,
"logps/rejected": -184.2627410888672,
"loss": 1.3858,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0020925761200487614,
"rewards/margins": 0.006319403648376465,
"rewards/rejected": -0.008411980234086514,
"rewards/safe_rewards": -0.0030772520694881678,
"rewards/unsafe_rewards": -0.0011078999377787113,
"step": 70
},
{
"epoch": 0.04,
"learning_rate": 2.1505376344086022e-07,
"logits/chosen": -2.3292670249938965,
"logits/rejected": -2.1124508380889893,
"logps/chosen": -220.2982940673828,
"logps/rejected": -195.1908721923828,
"loss": 1.3919,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.008099144324660301,
"rewards/margins": 0.012995732948184013,
"rewards/rejected": -0.021094877272844315,
"rewards/safe_rewards": -0.010834941640496254,
"rewards/unsafe_rewards": -0.005363349802792072,
"step": 80
},
{
"epoch": 0.05,
"learning_rate": 2.4193548387096775e-07,
"logits/chosen": -2.3152518272399902,
"logits/rejected": -2.1177124977111816,
"logps/chosen": -209.4881134033203,
"logps/rejected": -170.4688720703125,
"loss": 1.4229,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.03572874516248703,
"rewards/margins": 0.025636380538344383,
"rewards/rejected": -0.061365120112895966,
"rewards/safe_rewards": -0.03464391082525253,
"rewards/unsafe_rewards": -0.03681357204914093,
"step": 90
},
{
"epoch": 0.05,
"learning_rate": 2.6881720430107523e-07,
"logits/chosen": -2.312987804412842,
"logits/rejected": -2.1187119483947754,
"logps/chosen": -204.9591827392578,
"logps/rejected": -180.0883331298828,
"loss": 1.4099,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0888073593378067,
"rewards/margins": 0.04026350378990173,
"rewards/rejected": -0.12907087802886963,
"rewards/safe_rewards": -0.08975542336702347,
"rewards/unsafe_rewards": -0.08785931766033173,
"step": 100
},
{
"epoch": 0.06,
"learning_rate": 2.956989247311828e-07,
"logits/chosen": -2.2454307079315186,
"logits/rejected": -2.0064516067504883,
"logps/chosen": -224.6505889892578,
"logps/rejected": -190.16635131835938,
"loss": 1.4442,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.12704427540302277,
"rewards/margins": 0.054799921810626984,
"rewards/rejected": -0.18184418976306915,
"rewards/safe_rewards": -0.12200836837291718,
"rewards/unsafe_rewards": -0.13208015263080597,
"step": 110
},
{
"epoch": 0.06,
"learning_rate": 3.225806451612903e-07,
"logits/chosen": -2.26438570022583,
"logits/rejected": -1.9198744297027588,
"logps/chosen": -222.88461303710938,
"logps/rejected": -173.9338836669922,
"loss": 1.3613,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.1163511872291565,
"rewards/margins": 0.09466449916362762,
"rewards/rejected": -0.21101567149162292,
"rewards/safe_rewards": -0.11146645247936249,
"rewards/unsafe_rewards": -0.1212359219789505,
"step": 120
},
{
"epoch": 0.07,
"learning_rate": 3.4946236559139783e-07,
"logits/chosen": -2.19810152053833,
"logits/rejected": -1.9682261943817139,
"logps/chosen": -241.05618286132812,
"logps/rejected": -201.30264282226562,
"loss": 1.2805,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.26873043179512024,
"rewards/margins": 0.12516936659812927,
"rewards/rejected": -0.39389973878860474,
"rewards/safe_rewards": -0.21829214692115784,
"rewards/unsafe_rewards": -0.3191686272621155,
"step": 130
},
{
"epoch": 0.08,
"learning_rate": 3.7634408602150537e-07,
"logits/chosen": -2.192406177520752,
"logits/rejected": -1.845873236656189,
"logps/chosen": -241.24447631835938,
"logps/rejected": -219.4851531982422,
"loss": 1.2394,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.2985479533672333,
"rewards/margins": 0.09591639041900635,
"rewards/rejected": -0.394464373588562,
"rewards/safe_rewards": -0.2801482379436493,
"rewards/unsafe_rewards": -0.31694772839546204,
"step": 140
},
{
"epoch": 0.08,
"learning_rate": 4.0322580645161285e-07,
"logits/chosen": -2.0371975898742676,
"logits/rejected": -1.7497107982635498,
"logps/chosen": -249.6632537841797,
"logps/rejected": -225.79537963867188,
"loss": 1.1556,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4271857738494873,
"rewards/margins": 0.13365456461906433,
"rewards/rejected": -0.5608403086662292,
"rewards/safe_rewards": -0.42928582429885864,
"rewards/unsafe_rewards": -0.4250856935977936,
"step": 150
},
{
"epoch": 0.09,
"learning_rate": 4.3010752688172043e-07,
"logits/chosen": -1.9401572942733765,
"logits/rejected": -1.5656859874725342,
"logps/chosen": -246.42398071289062,
"logps/rejected": -226.51400756835938,
"loss": 1.2948,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2622266113758087,
"rewards/margins": 0.10594137012958527,
"rewards/rejected": -0.3681679964065552,
"rewards/safe_rewards": -0.24256543815135956,
"rewards/unsafe_rewards": -0.28188782930374146,
"step": 160
},
{
"epoch": 0.09,
"learning_rate": 4.569892473118279e-07,
"logits/chosen": -1.9621531963348389,
"logits/rejected": -1.6663320064544678,
"logps/chosen": -253.3407440185547,
"logps/rejected": -226.83035278320312,
"loss": 1.1636,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.38964638113975525,
"rewards/margins": 0.20659947395324707,
"rewards/rejected": -0.5962458848953247,
"rewards/safe_rewards": -0.3696528375148773,
"rewards/unsafe_rewards": -0.4096398949623108,
"step": 170
},
{
"epoch": 0.1,
"learning_rate": 4.838709677419355e-07,
"logits/chosen": -2.0335605144500732,
"logits/rejected": -1.65840744972229,
"logps/chosen": -232.89804077148438,
"logps/rejected": -238.404296875,
"loss": 1.153,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.40930503606796265,
"rewards/margins": 0.26214295625686646,
"rewards/rejected": -0.6714479327201843,
"rewards/safe_rewards": -0.42569422721862793,
"rewards/unsafe_rewards": -0.39291584491729736,
"step": 180
},
{
"epoch": 0.1,
"learning_rate": 4.999929391798331e-07,
"logits/chosen": -2.087995767593384,
"logits/rejected": -1.7015736103057861,
"logps/chosen": -233.88204956054688,
"logps/rejected": -231.9954071044922,
"loss": 1.1701,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.39558762311935425,
"rewards/margins": 0.25256142020225525,
"rewards/rejected": -0.6481491327285767,
"rewards/safe_rewards": -0.3778998851776123,
"rewards/unsafe_rewards": -0.4132753312587738,
"step": 190
},
{
"epoch": 0.11,
"learning_rate": 4.9991350953333e-07,
"logits/chosen": -2.0101492404937744,
"logits/rejected": -1.688194990158081,
"logps/chosen": -269.3438720703125,
"logps/rejected": -272.6773376464844,
"loss": 1.1309,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.49628791213035583,
"rewards/margins": 0.21785268187522888,
"rewards/rejected": -0.7141406536102295,
"rewards/safe_rewards": -0.47186246514320374,
"rewards/unsafe_rewards": -0.5207133293151855,
"step": 200
},
{
"epoch": 0.11,
"learning_rate": 4.997458523498236e-07,
"logits/chosen": -2.1067311763763428,
"logits/rejected": -1.8388845920562744,
"logps/chosen": -260.73553466796875,
"logps/rejected": -250.10794067382812,
"loss": 1.1343,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.685817539691925,
"rewards/margins": 0.2200475037097931,
"rewards/rejected": -0.9058650732040405,
"rewards/safe_rewards": -0.6602068543434143,
"rewards/unsafe_rewards": -0.7114282250404358,
"step": 210
},
{
"epoch": 0.12,
"learning_rate": 4.99490026817712e-07,
"logits/chosen": -2.0778934955596924,
"logits/rejected": -1.7963426113128662,
"logps/chosen": -245.4992218017578,
"logps/rejected": -252.6900177001953,
"loss": 1.149,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.5112438201904297,
"rewards/margins": 0.3192596137523651,
"rewards/rejected": -0.8305034637451172,
"rewards/safe_rewards": -0.469885915517807,
"rewards/unsafe_rewards": -0.5526017546653748,
"step": 220
},
{
"epoch": 0.12,
"learning_rate": 4.991461232516674e-07,
"logits/chosen": -2.041980266571045,
"logits/rejected": -1.7099599838256836,
"logps/chosen": -286.7567443847656,
"logps/rejected": -276.52020263671875,
"loss": 1.1816,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.566757082939148,
"rewards/margins": 0.22073951363563538,
"rewards/rejected": -0.7874965667724609,
"rewards/safe_rewards": -0.5972923040390015,
"rewards/unsafe_rewards": -0.5362219214439392,
"step": 230
},
{
"epoch": 0.13,
"learning_rate": 4.98714263060751e-07,
"logits/chosen": -2.079230546951294,
"logits/rejected": -1.6963192224502563,
"logps/chosen": -245.6575927734375,
"logps/rejected": -231.0216064453125,
"loss": 1.1425,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.5724560022354126,
"rewards/margins": 0.2712286710739136,
"rewards/rejected": -0.8436846733093262,
"rewards/safe_rewards": -0.5816723108291626,
"rewards/unsafe_rewards": -0.5632396936416626,
"step": 240
},
{
"epoch": 0.13,
"learning_rate": 4.98194598705552e-07,
"logits/chosen": -1.9290701150894165,
"logits/rejected": -1.7140071392059326,
"logps/chosen": -283.2284851074219,
"logps/rejected": -276.1746826171875,
"loss": 1.2033,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.9349914789199829,
"rewards/margins": 0.15353193879127502,
"rewards/rejected": -1.088523268699646,
"rewards/safe_rewards": -0.9215167164802551,
"rewards/unsafe_rewards": -0.9484661817550659,
"step": 250
},
{
"epoch": 0.14,
"learning_rate": 4.975873136443648e-07,
"logits/chosen": -2.1985442638397217,
"logits/rejected": -1.9197852611541748,
"logps/chosen": -303.9427795410156,
"logps/rejected": -298.4468688964844,
"loss": 1.0412,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7289992570877075,
"rewards/margins": 0.2902063727378845,
"rewards/rejected": -1.0192055702209473,
"rewards/safe_rewards": -0.8002890348434448,
"rewards/unsafe_rewards": -0.6577093005180359,
"step": 260
},
{
"epoch": 0.15,
"learning_rate": 4.968926222684212e-07,
"logits/chosen": -2.028428792953491,
"logits/rejected": -1.8198667764663696,
"logps/chosen": -262.1617126464844,
"logps/rejected": -279.1395568847656,
"loss": 1.0273,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6195804476737976,
"rewards/margins": 0.3619759678840637,
"rewards/rejected": -0.9815564155578613,
"rewards/safe_rewards": -0.6148445010185242,
"rewards/unsafe_rewards": -0.6243164539337158,
"step": 270
},
{
"epoch": 0.15,
"learning_rate": 4.961107698262044e-07,
"logits/chosen": -1.9460862874984741,
"logits/rejected": -1.6114823818206787,
"logps/chosen": -289.10284423828125,
"logps/rejected": -280.1482849121094,
"loss": 1.0933,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.70842045545578,
"rewards/margins": 0.2855362296104431,
"rewards/rejected": -0.9939567446708679,
"rewards/safe_rewards": -0.6650065183639526,
"rewards/unsafe_rewards": -0.7518342733383179,
"step": 280
},
{
"epoch": 0.16,
"learning_rate": 4.952420323368673e-07,
"logits/chosen": -2.0242223739624023,
"logits/rejected": -1.8324100971221924,
"logps/chosen": -237.0861358642578,
"logps/rejected": -266.013916015625,
"loss": 1.1438,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.4568088948726654,
"rewards/margins": 0.3911629617214203,
"rewards/rejected": -0.8479719161987305,
"rewards/safe_rewards": -0.49832311272621155,
"rewards/unsafe_rewards": -0.41529473662376404,
"step": 290
},
{
"epoch": 0.16,
"learning_rate": 4.942867164927899e-07,
"logits/chosen": -1.9857969284057617,
"logits/rejected": -1.7354557514190674,
"logps/chosen": -262.5624084472656,
"logps/rejected": -261.7809143066406,
"loss": 1.1983,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6027632355690002,
"rewards/margins": 0.323146253824234,
"rewards/rejected": -0.9259093999862671,
"rewards/safe_rewards": -0.6246525645256042,
"rewards/unsafe_rewards": -0.5808738470077515,
"step": 300
},
{
"epoch": 0.17,
"learning_rate": 4.932451595513062e-07,
"logits/chosen": -2.0492186546325684,
"logits/rejected": -1.6627346277236938,
"logps/chosen": -287.6437072753906,
"logps/rejected": -296.08685302734375,
"loss": 0.9844,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6867466568946838,
"rewards/margins": 0.3914056420326233,
"rewards/rejected": -1.0781524181365967,
"rewards/safe_rewards": -0.7488200664520264,
"rewards/unsafe_rewards": -0.6246733069419861,
"step": 310
},
{
"epoch": 0.17,
"learning_rate": 4.921177292156419e-07,
"logits/chosen": -2.0579938888549805,
"logits/rejected": -1.661026954650879,
"logps/chosen": -275.15484619140625,
"logps/rejected": -296.12921142578125,
"loss": 0.9846,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6913672089576721,
"rewards/margins": 0.45374804735183716,
"rewards/rejected": -1.1451152563095093,
"rewards/safe_rewards": -0.6368889808654785,
"rewards/unsafe_rewards": -0.7458454370498657,
"step": 320
},
{
"epoch": 0.18,
"learning_rate": 4.909048235051033e-07,
"logits/chosen": -1.8785194158554077,
"logits/rejected": -1.6157915592193604,
"logps/chosen": -287.54022216796875,
"logps/rejected": -308.53485107421875,
"loss": 1.05,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.881294846534729,
"rewards/margins": 0.377378910779953,
"rewards/rejected": -1.2586736679077148,
"rewards/safe_rewards": -0.8856824040412903,
"rewards/unsafe_rewards": -0.876907229423523,
"step": 330
},
{
"epoch": 0.18,
"learning_rate": 4.896068706145631e-07,
"logits/chosen": -2.043916702270508,
"logits/rejected": -1.7367770671844482,
"logps/chosen": -311.2376708984375,
"logps/rejected": -299.87225341796875,
"loss": 1.0624,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.005437970161438,
"rewards/margins": 0.386522501707077,
"rewards/rejected": -1.3919605016708374,
"rewards/safe_rewards": -1.0309627056121826,
"rewards/unsafe_rewards": -0.9799133539199829,
"step": 340
},
{
"epoch": 0.19,
"learning_rate": 4.882243287632946e-07,
"logits/chosen": -2.2845442295074463,
"logits/rejected": -2.015984058380127,
"logps/chosen": -249.07626342773438,
"logps/rejected": -270.349609375,
"loss": 1.0634,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.5606988668441772,
"rewards/margins": 0.3308585584163666,
"rewards/rejected": -0.8915573954582214,
"rewards/safe_rewards": -0.5756514668464661,
"rewards/unsafe_rewards": -0.5457462072372437,
"step": 350
},
{
"epoch": 0.19,
"learning_rate": 4.867576860332048e-07,
"logits/chosen": -2.2117257118225098,
"logits/rejected": -1.9762624502182007,
"logps/chosen": -250.96493530273438,
"logps/rejected": -287.42401123046875,
"loss": 1.0082,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.7890839576721191,
"rewards/margins": 0.39461660385131836,
"rewards/rejected": -1.1837005615234375,
"rewards/safe_rewards": -0.8431307673454285,
"rewards/unsafe_rewards": -0.7350370287895203,
"step": 360
},
{
"epoch": 0.2,
"learning_rate": 4.85207460196526e-07,
"logits/chosen": -2.083486557006836,
"logits/rejected": -1.7575275897979736,
"logps/chosen": -315.93890380859375,
"logps/rejected": -334.19989013671875,
"loss": 1.0535,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1155636310577393,
"rewards/margins": 0.3764941096305847,
"rewards/rejected": -1.4920578002929688,
"rewards/safe_rewards": -1.1053030490875244,
"rewards/unsafe_rewards": -1.125824213027954,
"step": 370
},
{
"epoch": 0.2,
"learning_rate": 4.835741985330259e-07,
"logits/chosen": -2.07675838470459,
"logits/rejected": -1.7431271076202393,
"logps/chosen": -267.29132080078125,
"logps/rejected": -277.96624755859375,
"loss": 1.0022,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6648051142692566,
"rewards/margins": 0.39567989110946655,
"rewards/rejected": -1.0604850053787231,
"rewards/safe_rewards": -0.6165894865989685,
"rewards/unsafe_rewards": -0.7130206823348999,
"step": 380
},
{
"epoch": 0.21,
"learning_rate": 4.818584776367992e-07,
"logits/chosen": -1.928900122642517,
"logits/rejected": -1.7115558385849,
"logps/chosen": -286.6376647949219,
"logps/rejected": -315.5821838378906,
"loss": 1.0452,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.8273309469223022,
"rewards/margins": 0.4084450602531433,
"rewards/rejected": -1.2357759475708008,
"rewards/safe_rewards": -0.8857539892196655,
"rewards/unsafe_rewards": -0.7689078450202942,
"step": 390
},
{
"epoch": 0.22,
"learning_rate": 4.800609032127122e-07,
"logits/chosen": -1.9402987957000732,
"logits/rejected": -1.653032898902893,
"logps/chosen": -315.54443359375,
"logps/rejected": -311.26129150390625,
"loss": 1.096,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1000012159347534,
"rewards/margins": 0.32444682717323303,
"rewards/rejected": -1.4244478940963745,
"rewards/safe_rewards": -1.031652569770813,
"rewards/unsafe_rewards": -1.1683496236801147,
"step": 400
},
{
"epoch": 0.22,
"learning_rate": 4.78182109862569e-07,
"logits/chosen": -1.957297682762146,
"logits/rejected": -1.815405249595642,
"logps/chosen": -275.63458251953125,
"logps/rejected": -291.7708435058594,
"loss": 1.1237,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.8423420786857605,
"rewards/margins": 0.27593573927879333,
"rewards/rejected": -1.1182777881622314,
"rewards/safe_rewards": -0.7917040586471558,
"rewards/unsafe_rewards": -0.8929800987243652,
"step": 410
},
{
"epoch": 0.23,
"learning_rate": 4.7622276086107677e-07,
"logits/chosen": -2.0669121742248535,
"logits/rejected": -1.759894609451294,
"logps/chosen": -285.4234924316406,
"logps/rejected": -295.45294189453125,
"loss": 1.0445,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7513025999069214,
"rewards/margins": 0.3429652750492096,
"rewards/rejected": -1.0942678451538086,
"rewards/safe_rewards": -0.8147345781326294,
"rewards/unsafe_rewards": -0.6878706216812134,
"step": 420
},
{
"epoch": 0.23,
"learning_rate": 4.741835479216879e-07,
"logits/chosen": -2.0404961109161377,
"logits/rejected": -1.6269737482070923,
"logps/chosen": -326.9522399902344,
"logps/rejected": -324.0838623046875,
"loss": 1.0151,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.8008167147636414,
"rewards/margins": 0.46369487047195435,
"rewards/rejected": -1.2645115852355957,
"rewards/safe_rewards": -0.8669862747192383,
"rewards/unsafe_rewards": -0.7346470952033997,
"step": 430
},
{
"epoch": 0.24,
"learning_rate": 4.720651909524036e-07,
"logits/chosen": -2.04176664352417,
"logits/rejected": -1.7474247217178345,
"logps/chosen": -265.5859069824219,
"logps/rejected": -273.75836181640625,
"loss": 1.0613,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.6951051950454712,
"rewards/margins": 0.3854682147502899,
"rewards/rejected": -1.0805734395980835,
"rewards/safe_rewards": -0.8021440505981445,
"rewards/unsafe_rewards": -0.5880664587020874,
"step": 440
},
{
"epoch": 0.24,
"learning_rate": 4.698684378016222e-07,
"logits/chosen": -1.9997985363006592,
"logits/rejected": -1.7316805124282837,
"logps/chosen": -258.0675354003906,
"logps/rejected": -275.7622375488281,
"loss": 1.057,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7074462175369263,
"rewards/margins": 0.33383387327194214,
"rewards/rejected": -1.0412800312042236,
"rewards/safe_rewards": -0.6980961561203003,
"rewards/unsafe_rewards": -0.7167961597442627,
"step": 450
},
{
"epoch": 0.25,
"learning_rate": 4.675940639941256e-07,
"logits/chosen": -2.0017848014831543,
"logits/rejected": -1.6473373174667358,
"logps/chosen": -284.409912109375,
"logps/rejected": -302.1802978515625,
"loss": 0.9964,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.7972325086593628,
"rewards/margins": 0.45086246728897095,
"rewards/rejected": -1.248094916343689,
"rewards/safe_rewards": -0.7986913919448853,
"rewards/unsafe_rewards": -0.7957736849784851,
"step": 460
},
{
"epoch": 0.25,
"learning_rate": 4.6524287245729286e-07,
"logits/chosen": -1.8008419275283813,
"logits/rejected": -1.545585036277771,
"logps/chosen": -285.72833251953125,
"logps/rejected": -295.92877197265625,
"loss": 1.0215,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.9112635850906372,
"rewards/margins": 0.3833921253681183,
"rewards/rejected": -1.2946555614471436,
"rewards/safe_rewards": -0.9739904403686523,
"rewards/unsafe_rewards": -0.8485366702079773,
"step": 470
},
{
"epoch": 0.26,
"learning_rate": 4.628156932376418e-07,
"logits/chosen": -1.935253381729126,
"logits/rejected": -1.5615403652191162,
"logps/chosen": -283.60662841796875,
"logps/rejected": -275.73553466796875,
"loss": 1.0577,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8973907232284546,
"rewards/margins": 0.34667691588401794,
"rewards/rejected": -1.244067668914795,
"rewards/safe_rewards": -0.9187390208244324,
"rewards/unsafe_rewards": -0.8760424852371216,
"step": 480
},
{
"epoch": 0.26,
"learning_rate": 4.603133832077953e-07,
"logits/chosen": -2.0226516723632812,
"logits/rejected": -1.757315993309021,
"logps/chosen": -319.8951416015625,
"logps/rejected": -344.83306884765625,
"loss": 1.0078,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8721585273742676,
"rewards/margins": 0.43064364790916443,
"rewards/rejected": -1.302802324295044,
"rewards/safe_rewards": -0.8747318983078003,
"rewards/unsafe_rewards": -0.8695852160453796,
"step": 490
},
{
"epoch": 0.27,
"learning_rate": 4.5773682576397776e-07,
"logits/chosen": -1.8940719366073608,
"logits/rejected": -1.6204118728637695,
"logps/chosen": -282.92388916015625,
"logps/rejected": -298.21685791015625,
"loss": 1.0274,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8302618265151978,
"rewards/margins": 0.41684699058532715,
"rewards/rejected": -1.2471086978912354,
"rewards/safe_rewards": -0.8429195284843445,
"rewards/unsafe_rewards": -0.8176040649414062,
"step": 500
},
{
"epoch": 0.27,
"eval_logits/chosen": -1.461648941040039,
"eval_logits/rejected": -0.9854875206947327,
"eval_logps/chosen": -232.3561248779297,
"eval_logps/rejected": -222.5215606689453,
"eval_loss": 0.37307849526405334,
"eval_rewards/accuracies": 0.7075266242027283,
"eval_rewards/chosen": -1.0191720724105835,
"eval_rewards/margins": 0.28133097290992737,
"eval_rewards/rejected": -1.300503134727478,
"eval_rewards/safe_rewards": -1.0088554620742798,
"eval_rewards/unsafe_rewards": -1.028071403503418,
"eval_runtime": 1058.7885,
"eval_samples_per_second": 31.209,
"eval_steps_per_second": 0.976,
"step": 500
},
{
"epoch": 0.27,
"learning_rate": 4.5508693051414774e-07,
"logits/chosen": -1.9713261127471924,
"logits/rejected": -1.7668718099594116,
"logps/chosen": -295.57354736328125,
"logps/rejected": -315.67010498046875,
"loss": 1.0145,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.992205023765564,
"rewards/margins": 0.4039764404296875,
"rewards/rejected": -1.3961814641952515,
"rewards/safe_rewards": -0.9630918502807617,
"rewards/unsafe_rewards": -1.0213181972503662,
"step": 510
},
{
"epoch": 0.28,
"learning_rate": 4.52364632956877e-07,
"logits/chosen": -2.020744562149048,
"logits/rejected": -1.7648818492889404,
"logps/chosen": -306.5279235839844,
"logps/rejected": -284.1745910644531,
"loss": 1.1779,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9038923978805542,
"rewards/margins": 0.2524818778038025,
"rewards/rejected": -1.1563743352890015,
"rewards/safe_rewards": -0.9023400545120239,
"rewards/unsafe_rewards": -0.9054449200630188,
"step": 520
},
{
"epoch": 0.29,
"learning_rate": 4.4957089415108895e-07,
"logits/chosen": -1.965404748916626,
"logits/rejected": -1.6829001903533936,
"logps/chosen": -266.58013916015625,
"logps/rejected": -311.5513916015625,
"loss": 0.9603,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.7168244123458862,
"rewards/margins": 0.5065333247184753,
"rewards/rejected": -1.2233576774597168,
"rewards/safe_rewards": -0.7431866526603699,
"rewards/unsafe_rewards": -0.6904621720314026,
"step": 530
},
{
"epoch": 0.29,
"learning_rate": 4.467067003767745e-07,
"logits/chosen": -1.9147508144378662,
"logits/rejected": -1.4926608800888062,
"logps/chosen": -276.83740234375,
"logps/rejected": -303.65716552734375,
"loss": 1.0585,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8392030000686646,
"rewards/margins": 0.5333585739135742,
"rewards/rejected": -1.3725616931915283,
"rewards/safe_rewards": -0.8698552250862122,
"rewards/unsafe_rewards": -0.8085508346557617,
"step": 540
},
{
"epoch": 0.3,
"learning_rate": 4.437730627868027e-07,
"logits/chosen": -1.8325055837631226,
"logits/rejected": -1.4082276821136475,
"logps/chosen": -256.5406799316406,
"logps/rejected": -277.9321594238281,
"loss": 0.9491,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.7962465286254883,
"rewards/margins": 0.5230618715286255,
"rewards/rejected": -1.3193082809448242,
"rewards/safe_rewards": -0.7890614867210388,
"rewards/unsafe_rewards": -0.803431510925293,
"step": 550
},
{
"epoch": 0.3,
"learning_rate": 4.4077101704995163e-07,
"logits/chosen": -1.9758007526397705,
"logits/rejected": -1.716571569442749,
"logps/chosen": -271.619140625,
"logps/rejected": -284.5029602050781,
"loss": 1.0222,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.7951744198799133,
"rewards/margins": 0.3159455358982086,
"rewards/rejected": -1.1111198663711548,
"rewards/safe_rewards": -0.751990020275116,
"rewards/unsafe_rewards": -0.8383587002754211,
"step": 560
},
{
"epoch": 0.31,
"learning_rate": 4.3770162298528356e-07,
"logits/chosen": -1.9304263591766357,
"logits/rejected": -1.6568174362182617,
"logps/chosen": -294.840087890625,
"logps/rejected": -296.0787658691406,
"loss": 1.0612,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9997127652168274,
"rewards/margins": 0.4206429421901703,
"rewards/rejected": -1.4203556776046753,
"rewards/safe_rewards": -0.9456769824028015,
"rewards/unsafe_rewards": -1.053748369216919,
"step": 570
},
{
"epoch": 0.31,
"learning_rate": 4.3456596418799476e-07,
"logits/chosen": -1.8863794803619385,
"logits/rejected": -1.6375776529312134,
"logps/chosen": -308.4244689941406,
"logps/rejected": -312.00836181640625,
"loss": 0.9379,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0467333793640137,
"rewards/margins": 0.35309094190597534,
"rewards/rejected": -1.3998241424560547,
"rewards/safe_rewards": -1.0730044841766357,
"rewards/unsafe_rewards": -1.0204620361328125,
"step": 580
},
{
"epoch": 0.32,
"learning_rate": 4.313651476468715e-07,
"logits/chosen": -1.9189523458480835,
"logits/rejected": -1.6596009731292725,
"logps/chosen": -302.38214111328125,
"logps/rejected": -310.23687744140625,
"loss": 0.9881,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0581415891647339,
"rewards/margins": 0.3177962601184845,
"rewards/rejected": -1.375937819480896,
"rewards/safe_rewards": -1.1356565952301025,
"rewards/unsafe_rewards": -0.9806265830993652,
"step": 590
},
{
"epoch": 0.32,
"learning_rate": 4.2810030335348693e-07,
"logits/chosen": -1.9322669506072998,
"logits/rejected": -1.6003156900405884,
"logps/chosen": -305.31378173828125,
"logps/rejected": -295.27435302734375,
"loss": 1.0005,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.9873701930046082,
"rewards/margins": 0.315813809633255,
"rewards/rejected": -1.3031837940216064,
"rewards/safe_rewards": -0.9451528787612915,
"rewards/unsafe_rewards": -1.0295875072479248,
"step": 600
},
{
"epoch": 0.33,
"learning_rate": 4.2477258390327806e-07,
"logits/chosen": -1.9405990839004517,
"logits/rejected": -1.6028436422348022,
"logps/chosen": -266.96820068359375,
"logps/rejected": -300.30810546875,
"loss": 0.952,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8195999264717102,
"rewards/margins": 0.4460810720920563,
"rewards/rejected": -1.2656810283660889,
"rewards/safe_rewards": -0.8683537244796753,
"rewards/unsafe_rewards": -0.7708461880683899,
"step": 610
},
{
"epoch": 0.33,
"learning_rate": 4.2138316408864197e-07,
"logits/chosen": -1.9435718059539795,
"logits/rejected": -1.5219794511795044,
"logps/chosen": -273.6173400878906,
"logps/rejected": -296.8672790527344,
"loss": 0.8771,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.7538294196128845,
"rewards/margins": 0.5905435085296631,
"rewards/rejected": -1.3443728685379028,
"rewards/safe_rewards": -0.7166475653648376,
"rewards/unsafe_rewards": -0.7910112142562866,
"step": 620
},
{
"epoch": 0.34,
"learning_rate": 4.179332404841962e-07,
"logits/chosen": -1.6942704916000366,
"logits/rejected": -1.2099497318267822,
"logps/chosen": -327.61376953125,
"logps/rejected": -344.44866943359375,
"loss": 0.9468,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1748586893081665,
"rewards/margins": 0.5536222457885742,
"rewards/rejected": -1.7284809350967407,
"rewards/safe_rewards": -1.184999942779541,
"rewards/unsafe_rewards": -1.1647173166275024,
"step": 630
},
{
"epoch": 0.34,
"learning_rate": 4.1442403102434954e-07,
"logits/chosen": -1.7647279500961304,
"logits/rejected": -1.3675248622894287,
"logps/chosen": -310.9916076660156,
"logps/rejected": -321.579345703125,
"loss": 1.0042,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9101033210754395,
"rewards/margins": 0.5184718370437622,
"rewards/rejected": -1.428575038909912,
"rewards/safe_rewards": -0.9146644473075867,
"rewards/unsafe_rewards": -0.9055421948432922,
"step": 640
},
{
"epoch": 0.35,
"learning_rate": 4.108567745733318e-07,
"logits/chosen": -1.7713772058486938,
"logits/rejected": -1.3516300916671753,
"logps/chosen": -255.63150024414062,
"logps/rejected": -284.5046081542969,
"loss": 1.0245,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.821179986000061,
"rewards/margins": 0.4137188792228699,
"rewards/rejected": -1.2348989248275757,
"rewards/safe_rewards": -0.8415560722351074,
"rewards/unsafe_rewards": -0.8008037805557251,
"step": 650
},
{
"epoch": 0.36,
"learning_rate": 4.0723273048783426e-07,
"logits/chosen": -1.8442039489746094,
"logits/rejected": -1.5036883354187012,
"logps/chosen": -304.8941650390625,
"logps/rejected": -295.1714172363281,
"loss": 1.0401,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.8060005903244019,
"rewards/margins": 0.4549127221107483,
"rewards/rejected": -1.2609132528305054,
"rewards/safe_rewards": -0.7164521813392639,
"rewards/unsafe_rewards": -0.8955489993095398,
"step": 660
},
{
"epoch": 0.36,
"learning_rate": 4.0355317817241697e-07,
"logits/chosen": -1.8206180334091187,
"logits/rejected": -1.4083284139633179,
"logps/chosen": -316.703369140625,
"logps/rejected": -288.38323974609375,
"loss": 1.0164,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8776229619979858,
"rewards/margins": 0.3921293020248413,
"rewards/rejected": -1.2697522640228271,
"rewards/safe_rewards": -0.8068926930427551,
"rewards/unsafe_rewards": -0.9483532905578613,
"step": 670
},
{
"epoch": 0.37,
"learning_rate": 3.998194166278367e-07,
"logits/chosen": -1.7814273834228516,
"logits/rejected": -1.490839958190918,
"logps/chosen": -295.8428955078125,
"logps/rejected": -305.77972412109375,
"loss": 1.0484,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.0455328226089478,
"rewards/margins": 0.34231680631637573,
"rewards/rejected": -1.3878495693206787,
"rewards/safe_rewards": -1.0646705627441406,
"rewards/unsafe_rewards": -1.0263949632644653,
"step": 680
},
{
"epoch": 0.37,
"learning_rate": 3.9603276399245855e-07,
"logits/chosen": -1.8057140111923218,
"logits/rejected": -1.3926641941070557,
"logps/chosen": -323.49139404296875,
"logps/rejected": -323.6500549316406,
"loss": 1.0317,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1086748838424683,
"rewards/margins": 0.44791918992996216,
"rewards/rejected": -1.556593894958496,
"rewards/safe_rewards": -1.087368130683899,
"rewards/unsafe_rewards": -1.1299816370010376,
"step": 690
},
{
"epoch": 0.38,
"learning_rate": 3.9219455707691e-07,
"logits/chosen": -1.895880937576294,
"logits/rejected": -1.5151503086090088,
"logps/chosen": -295.07525634765625,
"logps/rejected": -304.41241455078125,
"loss": 0.9783,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9127877950668335,
"rewards/margins": 0.4130992889404297,
"rewards/rejected": -1.3258870840072632,
"rewards/safe_rewards": -0.9074820280075073,
"rewards/unsafe_rewards": -0.9180935025215149,
"step": 700
},
{
"epoch": 0.38,
"learning_rate": 3.883061508921439e-07,
"logits/chosen": -1.9308559894561768,
"logits/rejected": -1.6609346866607666,
"logps/chosen": -277.08172607421875,
"logps/rejected": -316.75994873046875,
"loss": 1.0027,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.7832189798355103,
"rewards/margins": 0.37469321489334106,
"rewards/rejected": -1.1579121351242065,
"rewards/safe_rewards": -0.7912700176239014,
"rewards/unsafe_rewards": -0.7751679420471191,
"step": 710
},
{
"epoch": 0.39,
"learning_rate": 3.8436891817107555e-07,
"logits/chosen": -1.761370301246643,
"logits/rejected": -1.5160869359970093,
"logps/chosen": -285.6172790527344,
"logps/rejected": -315.7353210449219,
"loss": 1.0302,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.9513989686965942,
"rewards/margins": 0.45606541633605957,
"rewards/rejected": -1.4074645042419434,
"rewards/safe_rewards": -1.0125133991241455,
"rewards/unsafe_rewards": -0.890284538269043,
"step": 720
},
{
"epoch": 0.39,
"learning_rate": 3.8038424888396414e-07,
"logits/chosen": -1.8194071054458618,
"logits/rejected": -1.4451847076416016,
"logps/chosen": -295.29241943359375,
"logps/rejected": -320.0098571777344,
"loss": 0.9634,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8949946165084839,
"rewards/margins": 0.4506065249443054,
"rewards/rejected": -1.3456013202667236,
"rewards/safe_rewards": -0.855640709400177,
"rewards/unsafe_rewards": -0.9343485832214355,
"step": 730
},
{
"epoch": 0.4,
"learning_rate": 3.763535497477079e-07,
"logits/chosen": -1.8460315465927124,
"logits/rejected": -1.4700871706008911,
"logps/chosen": -310.37518310546875,
"logps/rejected": -316.61236572265625,
"loss": 0.9976,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0227649211883545,
"rewards/margins": 0.43325185775756836,
"rewards/rejected": -1.4560167789459229,
"rewards/safe_rewards": -1.0539515018463135,
"rewards/unsafe_rewards": -0.9915785789489746,
"step": 740
},
{
"epoch": 0.4,
"learning_rate": 3.7227824372922795e-07,
"logits/chosen": -1.8418188095092773,
"logits/rejected": -1.479627013206482,
"logps/chosen": -286.8067321777344,
"logps/rejected": -302.6395568847656,
"loss": 0.9795,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.9858185052871704,
"rewards/margins": 0.41545191407203674,
"rewards/rejected": -1.4012705087661743,
"rewards/safe_rewards": -0.9399738311767578,
"rewards/unsafe_rewards": -1.031663179397583,
"step": 750
},
{
"epoch": 0.41,
"learning_rate": 3.681597695431148e-07,
"logits/chosen": -1.749204397201538,
"logits/rejected": -1.3665492534637451,
"logps/chosen": -289.353271484375,
"logps/rejected": -327.75677490234375,
"loss": 0.9449,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9073783755302429,
"rewards/margins": 0.5846059918403625,
"rewards/rejected": -1.4919843673706055,
"rewards/safe_rewards": -0.9077240228652954,
"rewards/unsafe_rewards": -0.9070326089859009,
"step": 760
},
{
"epoch": 0.41,
"learning_rate": 3.639995811437159e-07,
"logits/chosen": -1.667654037475586,
"logits/rejected": -1.314335584640503,
"logps/chosen": -289.2320556640625,
"logps/rejected": -326.54229736328125,
"loss": 0.9656,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9039511680603027,
"rewards/margins": 0.5181721448898315,
"rewards/rejected": -1.4221234321594238,
"rewards/safe_rewards": -0.9193918108940125,
"rewards/unsafe_rewards": -0.8885105848312378,
"step": 770
},
{
"epoch": 0.42,
"learning_rate": 3.597991472118426e-07,
"logits/chosen": -1.7382431030273438,
"logits/rejected": -1.3142716884613037,
"logps/chosen": -300.62677001953125,
"logps/rejected": -313.65814208984375,
"loss": 1.055,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8587741851806641,
"rewards/margins": 0.427274614572525,
"rewards/rejected": -1.2860486507415771,
"rewards/safe_rewards": -0.8905943036079407,
"rewards/unsafe_rewards": -0.8269540071487427,
"step": 780
},
{
"epoch": 0.43,
"learning_rate": 3.5555995063627836e-07,
"logits/chosen": -1.8035333156585693,
"logits/rejected": -1.4783815145492554,
"logps/chosen": -321.8671875,
"logps/rejected": -317.4945068359375,
"loss": 0.9705,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.9154338836669922,
"rewards/margins": 0.400259792804718,
"rewards/rejected": -1.3156936168670654,
"rewards/safe_rewards": -0.946729838848114,
"rewards/unsafe_rewards": -0.8841378092765808,
"step": 790
},
{
"epoch": 0.43,
"learning_rate": 3.512834879902715e-07,
"logits/chosen": -1.7292006015777588,
"logits/rejected": -1.3531233072280884,
"logps/chosen": -297.385009765625,
"logps/rejected": -316.39666748046875,
"loss": 0.9624,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9740325808525085,
"rewards/margins": 0.43682414293289185,
"rewards/rejected": -1.4108567237854004,
"rewards/safe_rewards": -0.979016900062561,
"rewards/unsafe_rewards": -0.9690481424331665,
"step": 800
},
{
"epoch": 0.44,
"learning_rate": 3.4697126900319616e-07,
"logits/chosen": -1.6081682443618774,
"logits/rejected": -1.1794389486312866,
"logps/chosen": -295.1870422363281,
"logps/rejected": -306.71160888671875,
"loss": 1.0047,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9457162022590637,
"rewards/margins": 0.4736977517604828,
"rewards/rejected": -1.4194139242172241,
"rewards/safe_rewards": -0.8538358807563782,
"rewards/unsafe_rewards": -1.0375964641571045,
"step": 810
},
{
"epoch": 0.44,
"learning_rate": 3.426248160275693e-07,
"logits/chosen": -1.7311818599700928,
"logits/rejected": -1.3600565195083618,
"logps/chosen": -278.1633605957031,
"logps/rejected": -299.32635498046875,
"loss": 1.0259,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.8063637018203735,
"rewards/margins": 0.43319129943847656,
"rewards/rejected": -1.2395551204681396,
"rewards/safe_rewards": -0.859821617603302,
"rewards/unsafe_rewards": -0.7529059052467346,
"step": 820
},
{
"epoch": 0.45,
"learning_rate": 3.3824566350161094e-07,
"logits/chosen": -1.785316824913025,
"logits/rejected": -1.323632836341858,
"logps/chosen": -275.93060302734375,
"logps/rejected": -278.7740783691406,
"loss": 0.9551,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.675979495048523,
"rewards/margins": 0.428046852350235,
"rewards/rejected": -1.104026436805725,
"rewards/safe_rewards": -0.6841101050376892,
"rewards/unsafe_rewards": -0.6678491234779358,
"step": 830
},
{
"epoch": 0.45,
"learning_rate": 3.338353574075381e-07,
"logits/chosen": -1.5819236040115356,
"logits/rejected": -1.3700740337371826,
"logps/chosen": -266.2092590332031,
"logps/rejected": -284.17657470703125,
"loss": 1.1627,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9084580540657043,
"rewards/margins": 0.3083532750606537,
"rewards/rejected": -1.216811180114746,
"rewards/safe_rewards": -0.9423872828483582,
"rewards/unsafe_rewards": -0.8745288848876953,
"step": 840
},
{
"epoch": 0.46,
"learning_rate": 3.2939545472578314e-07,
"logits/chosen": -1.7262417078018188,
"logits/rejected": -1.2280857563018799,
"logps/chosen": -324.2544860839844,
"logps/rejected": -322.0166320800781,
"loss": 1.002,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9809592366218567,
"rewards/margins": 0.39795732498168945,
"rewards/rejected": -1.3789165019989014,
"rewards/safe_rewards": -0.9736999273300171,
"rewards/unsafe_rewards": -0.9882184863090515,
"step": 850
},
{
"epoch": 0.46,
"learning_rate": 3.2492752288532916e-07,
"logits/chosen": -1.7719913721084595,
"logits/rejected": -1.3637843132019043,
"logps/chosen": -289.90692138671875,
"logps/rejected": -295.659423828125,
"loss": 1.0013,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8620317578315735,
"rewards/margins": 0.4116358757019043,
"rewards/rejected": -1.273667573928833,
"rewards/safe_rewards": -0.8024783134460449,
"rewards/unsafe_rewards": -0.9215850830078125,
"step": 860
},
{
"epoch": 0.47,
"learning_rate": 3.204331392103574e-07,
"logits/chosen": -1.8806778192520142,
"logits/rejected": -1.389723539352417,
"logps/chosen": -291.46832275390625,
"logps/rejected": -283.912841796875,
"loss": 0.9879,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.7795971035957336,
"rewards/margins": 0.4201742112636566,
"rewards/rejected": -1.1997714042663574,
"rewards/safe_rewards": -0.8063246607780457,
"rewards/unsafe_rewards": -0.7528696060180664,
"step": 870
},
{
"epoch": 0.47,
"learning_rate": 3.159138903634006e-07,
"logits/chosen": -1.6265941858291626,
"logits/rejected": -1.263318657875061,
"logps/chosen": -308.3788757324219,
"logps/rejected": -303.78753662109375,
"loss": 0.9939,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.9974681735038757,
"rewards/margins": 0.31481030583381653,
"rewards/rejected": -1.312278389930725,
"rewards/safe_rewards": -0.9676594734191895,
"rewards/unsafe_rewards": -1.0272767543792725,
"step": 880
},
{
"epoch": 0.48,
"learning_rate": 3.1137137178519977e-07,
"logits/chosen": -1.5345653295516968,
"logits/rejected": -1.2038419246673584,
"logps/chosen": -277.8311767578125,
"logps/rejected": -317.1457214355469,
"loss": 0.9773,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.9578744769096375,
"rewards/margins": 0.4849475026130676,
"rewards/rejected": -1.4428222179412842,
"rewards/safe_rewards": -0.9253692626953125,
"rewards/unsafe_rewards": -0.9903799891471863,
"step": 890
},
{
"epoch": 0.48,
"learning_rate": 3.068071871314626e-07,
"logits/chosen": -1.477648138999939,
"logits/rejected": -1.1965210437774658,
"logps/chosen": -279.1951599121094,
"logps/rejected": -292.29833984375,
"loss": 0.9773,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.0047625303268433,
"rewards/margins": 0.3235151171684265,
"rewards/rejected": -1.328277826309204,
"rewards/safe_rewards": -1.013329267501831,
"rewards/unsafe_rewards": -0.9961959719657898,
"step": 900
},
{
"epoch": 0.49,
"learning_rate": 3.022229477067205e-07,
"logits/chosen": -1.6116926670074463,
"logits/rejected": -1.2270816564559937,
"logps/chosen": -320.8763732910156,
"logps/rejected": -318.58355712890625,
"loss": 0.8831,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.0378270149230957,
"rewards/margins": 0.4454011917114258,
"rewards/rejected": -1.4832282066345215,
"rewards/safe_rewards": -1.0344152450561523,
"rewards/unsafe_rewards": -1.0412386655807495,
"step": 910
},
{
"epoch": 0.49,
"learning_rate": 2.976202718954869e-07,
"logits/chosen": -1.5823721885681152,
"logits/rejected": -1.1224069595336914,
"logps/chosen": -333.028564453125,
"logps/rejected": -350.3519592285156,
"loss": 1.0709,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2154980897903442,
"rewards/margins": 0.46428003907203674,
"rewards/rejected": -1.6797780990600586,
"rewards/safe_rewards": -1.1762654781341553,
"rewards/unsafe_rewards": -1.2547308206558228,
"step": 920
},
{
"epoch": 0.5,
"learning_rate": 2.930007845909146e-07,
"logits/chosen": -1.5423873662948608,
"logits/rejected": -1.2088254690170288,
"logps/chosen": -333.67645263671875,
"logps/rejected": -360.3753662109375,
"loss": 0.9999,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3605718612670898,
"rewards/margins": 0.422391802072525,
"rewards/rejected": -1.782963752746582,
"rewards/safe_rewards": -1.422663688659668,
"rewards/unsafe_rewards": -1.2984803915023804,
"step": 930
},
{
"epoch": 0.51,
"learning_rate": 2.8836611662115634e-07,
"logits/chosen": -1.6511509418487549,
"logits/rejected": -1.275423288345337,
"logps/chosen": -321.2275085449219,
"logps/rejected": -312.20733642578125,
"loss": 1.0348,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.010995626449585,
"rewards/margins": 0.42944231629371643,
"rewards/rejected": -1.440437912940979,
"rewards/safe_rewards": -0.9675294160842896,
"rewards/unsafe_rewards": -1.0544618368148804,
"step": 940
},
{
"epoch": 0.51,
"learning_rate": 2.8371790417362986e-07,
"logits/chosen": -1.5915000438690186,
"logits/rejected": -1.2896572351455688,
"logps/chosen": -281.606201171875,
"logps/rejected": -308.20916748046875,
"loss": 1.0666,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9426482319831848,
"rewards/margins": 0.37754327058792114,
"rewards/rejected": -1.3201916217803955,
"rewards/safe_rewards": -0.8953598141670227,
"rewards/unsafe_rewards": -0.9899368286132812,
"step": 950
},
{
"epoch": 0.52,
"learning_rate": 2.7905778821739056e-07,
"logits/chosen": -1.478126049041748,
"logits/rejected": -1.1587542295455933,
"logps/chosen": -301.42181396484375,
"logps/rejected": -304.0444030761719,
"loss": 0.9759,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.0664873123168945,
"rewards/margins": 0.36265167593955994,
"rewards/rejected": -1.4291390180587769,
"rewards/safe_rewards": -1.0449130535125732,
"rewards/unsafe_rewards": -1.0880613327026367,
"step": 960
},
{
"epoch": 0.52,
"learning_rate": 2.74387413923817e-07,
"logits/chosen": -1.4300400018692017,
"logits/rejected": -1.1722289323806763,
"logps/chosen": -327.14739990234375,
"logps/rejected": -327.6680908203125,
"loss": 0.977,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.026637315750122,
"rewards/margins": 0.39583832025527954,
"rewards/rejected": -1.4224755764007568,
"rewards/safe_rewards": -1.0289757251739502,
"rewards/unsafe_rewards": -1.024298906326294,
"step": 970
},
{
"epoch": 0.53,
"learning_rate": 2.69708430085812e-07,
"logits/chosen": -1.6212282180786133,
"logits/rejected": -1.1375267505645752,
"logps/chosen": -331.76654052734375,
"logps/rejected": -341.3858947753906,
"loss": 1.0282,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0838706493377686,
"rewards/margins": 0.5636218190193176,
"rewards/rejected": -1.6474926471710205,
"rewards/safe_rewards": -1.0060853958129883,
"rewards/unsafe_rewards": -1.161656141281128,
"step": 980
},
{
"epoch": 0.53,
"learning_rate": 2.6502248853572504e-07,
"logits/chosen": -1.5994454622268677,
"logits/rejected": -1.2842458486557007,
"logps/chosen": -291.59063720703125,
"logps/rejected": -317.38970947265625,
"loss": 1.0111,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.0371322631835938,
"rewards/margins": 0.5142688155174255,
"rewards/rejected": -1.551400899887085,
"rewards/safe_rewards": -1.0258736610412598,
"rewards/unsafe_rewards": -1.0483907461166382,
"step": 990
},
{
"epoch": 0.54,
"learning_rate": 2.6033124356220325e-07,
"logits/chosen": -1.5264742374420166,
"logits/rejected": -1.1301841735839844,
"logps/chosen": -305.7037048339844,
"logps/rejected": -310.7588806152344,
"loss": 0.9569,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.029294490814209,
"rewards/margins": 0.4845431447029114,
"rewards/rejected": -1.5138375759124756,
"rewards/safe_rewards": -0.948198139667511,
"rewards/unsafe_rewards": -1.1103906631469727,
"step": 1000
},
{
"epoch": 0.54,
"eval_logits/chosen": -1.0711549520492554,
"eval_logits/rejected": -0.5821475982666016,
"eval_logps/chosen": -221.7958984375,
"eval_logps/rejected": -212.7307586669922,
"eval_loss": 0.34972870349884033,
"eval_rewards/accuracies": 0.7209583520889282,
"eval_rewards/chosen": -0.913569986820221,
"eval_rewards/margins": 0.28902512788772583,
"eval_rewards/rejected": -1.2025949954986572,
"eval_rewards/safe_rewards": -0.9005662798881531,
"eval_rewards/unsafe_rewards": -0.9165622591972351,
"eval_runtime": 1062.9878,
"eval_samples_per_second": 31.086,
"eval_steps_per_second": 0.972,
"step": 1000
},
{
"epoch": 0.54,
"learning_rate": 2.55636351326173e-07,
"logits/chosen": -1.4447015523910522,
"logits/rejected": -1.0418920516967773,
"logps/chosen": -320.98358154296875,
"logps/rejected": -330.81341552734375,
"loss": 0.8958,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.149300217628479,
"rewards/margins": 0.562938928604126,
"rewards/rejected": -1.7122390270233154,
"rewards/safe_rewards": -1.1756014823913574,
"rewards/unsafe_rewards": -1.122998595237732,
"step": 1010
},
{
"epoch": 0.55,
"learning_rate": 2.509394692761622e-07,
"logits/chosen": -1.4893229007720947,
"logits/rejected": -1.0318996906280518,
"logps/chosen": -320.03070068359375,
"logps/rejected": -331.4481201171875,
"loss": 0.9718,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1435344219207764,
"rewards/margins": 0.5211852788925171,
"rewards/rejected": -1.6647193431854248,
"rewards/safe_rewards": -1.1441400051116943,
"rewards/unsafe_rewards": -1.1429284811019897,
"step": 1020
},
{
"epoch": 0.55,
"learning_rate": 2.462422555631674e-07,
"logits/chosen": -1.3502875566482544,
"logits/rejected": -0.8036888241767883,
"logps/chosen": -344.90618896484375,
"logps/rejected": -345.6969909667969,
"loss": 0.953,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.4247171878814697,
"rewards/margins": 0.532504141330719,
"rewards/rejected": -1.957221269607544,
"rewards/safe_rewards": -1.399427890777588,
"rewards/unsafe_rewards": -1.4500062465667725,
"step": 1030
},
{
"epoch": 0.56,
"learning_rate": 2.415463684552728e-07,
"logits/chosen": -1.279679536819458,
"logits/rejected": -0.875691294670105,
"logps/chosen": -329.5700378417969,
"logps/rejected": -347.35162353515625,
"loss": 1.0239,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.3259227275848389,
"rewards/margins": 0.47411665320396423,
"rewards/rejected": -1.800039529800415,
"rewards/safe_rewards": -1.3383002281188965,
"rewards/unsafe_rewards": -1.3135454654693604,
"step": 1040
},
{
"epoch": 0.56,
"learning_rate": 2.3685346575222807e-07,
"logits/chosen": -1.4535144567489624,
"logits/rejected": -0.9614871740341187,
"logps/chosen": -325.8673400878906,
"logps/rejected": -333.7716064453125,
"loss": 0.9704,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.082326889038086,
"rewards/margins": 0.5055097341537476,
"rewards/rejected": -1.5878366231918335,
"rewards/safe_rewards": -1.087576150894165,
"rewards/unsafe_rewards": -1.0770776271820068,
"step": 1050
},
{
"epoch": 0.57,
"learning_rate": 2.321652042001919e-07,
"logits/chosen": -1.425490140914917,
"logits/rejected": -0.9585882425308228,
"logps/chosen": -332.23651123046875,
"logps/rejected": -367.16656494140625,
"loss": 0.9341,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.210457682609558,
"rewards/margins": 0.5372633934020996,
"rewards/rejected": -1.7477210760116577,
"rewards/safe_rewards": -1.155447244644165,
"rewards/unsafe_rewards": -1.265467882156372,
"step": 1060
},
{
"epoch": 0.58,
"learning_rate": 2.2748323890684662e-07,
"logits/chosen": -1.5478570461273193,
"logits/rejected": -0.9782267808914185,
"logps/chosen": -313.54412841796875,
"logps/rejected": -330.4336853027344,
"loss": 0.9344,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.0107817649841309,
"rewards/margins": 0.6438864469528198,
"rewards/rejected": -1.6546680927276611,
"rewards/safe_rewards": -1.0349432229995728,
"rewards/unsafe_rewards": -0.986620306968689,
"step": 1070
},
{
"epoch": 0.58,
"learning_rate": 2.2280922275709213e-07,
"logits/chosen": -1.4732444286346436,
"logits/rejected": -1.1423754692077637,
"logps/chosen": -317.08258056640625,
"logps/rejected": -328.4356689453125,
"loss": 0.982,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.0598576068878174,
"rewards/margins": 0.41645368933677673,
"rewards/rejected": -1.4763113260269165,
"rewards/safe_rewards": -1.0165767669677734,
"rewards/unsafe_rewards": -1.1031386852264404,
"step": 1080
},
{
"epoch": 0.59,
"learning_rate": 2.1814480582952375e-07,
"logits/chosen": -1.489725112915039,
"logits/rejected": -1.1142539978027344,
"logps/chosen": -309.04681396484375,
"logps/rejected": -333.715576171875,
"loss": 0.9727,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.0693882703781128,
"rewards/margins": 0.4686933159828186,
"rewards/rejected": -1.5380815267562866,
"rewards/safe_rewards": -0.9820979237556458,
"rewards/unsafe_rewards": -1.156678557395935,
"step": 1090
},
{
"epoch": 0.59,
"learning_rate": 2.1349163481390187e-07,
"logits/chosen": -1.421409010887146,
"logits/rejected": -1.031862497329712,
"logps/chosen": -316.99139404296875,
"logps/rejected": -340.88909912109375,
"loss": 0.9693,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1647942066192627,
"rewards/margins": 0.4819185137748718,
"rewards/rejected": -1.6467128992080688,
"rewards/safe_rewards": -1.0639079809188843,
"rewards/unsafe_rewards": -1.2656804323196411,
"step": 1100
},
{
"epoch": 0.6,
"learning_rate": 2.0885135242981647e-07,
"logits/chosen": -1.4362539052963257,
"logits/rejected": -1.005382776260376,
"logps/chosen": -351.58978271484375,
"logps/rejected": -323.1505126953125,
"loss": 0.9265,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1940038204193115,
"rewards/margins": 0.4320971369743347,
"rewards/rejected": -1.6261011362075806,
"rewards/safe_rewards": -1.2753981351852417,
"rewards/unsafe_rewards": -1.112609624862671,
"step": 1110
},
{
"epoch": 0.6,
"learning_rate": 2.0422559684675494e-07,
"logits/chosen": -1.4388701915740967,
"logits/rejected": -0.9763511419296265,
"logps/chosen": -328.2054748535156,
"logps/rejected": -331.94427490234375,
"loss": 0.9247,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.1099811792373657,
"rewards/margins": 0.4838770031929016,
"rewards/rejected": -1.5938583612442017,
"rewards/safe_rewards": -1.143110752105713,
"rewards/unsafe_rewards": -1.076851725578308,
"step": 1120
},
{
"epoch": 0.61,
"learning_rate": 1.9961600110577457e-07,
"logits/chosen": -1.3994672298431396,
"logits/rejected": -0.9330530166625977,
"logps/chosen": -320.69989013671875,
"logps/rejected": -341.1868591308594,
"loss": 1.007,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.1178644895553589,
"rewards/margins": 0.4091947674751282,
"rewards/rejected": -1.5270591974258423,
"rewards/safe_rewards": -1.0592812299728394,
"rewards/unsafe_rewards": -1.1764475107192993,
"step": 1130
},
{
"epoch": 0.61,
"learning_rate": 1.950241925429867e-07,
"logits/chosen": -1.5059046745300293,
"logits/rejected": -0.9375900030136108,
"logps/chosen": -301.2913513183594,
"logps/rejected": -323.166259765625,
"loss": 0.9384,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.0121370553970337,
"rewards/margins": 0.6072208285331726,
"rewards/rejected": -1.6193578243255615,
"rewards/safe_rewards": -1.0190235376358032,
"rewards/unsafe_rewards": -1.0052505731582642,
"step": 1140
},
{
"epoch": 0.62,
"learning_rate": 1.9045179221505495e-07,
"logits/chosen": -1.4665344953536987,
"logits/rejected": -1.1525509357452393,
"logps/chosen": -336.7375183105469,
"logps/rejected": -342.58258056640625,
"loss": 0.9456,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.0744739770889282,
"rewards/margins": 0.4735342860221863,
"rewards/rejected": -1.5480082035064697,
"rewards/safe_rewards": -1.0436086654663086,
"rewards/unsafe_rewards": -1.1053390502929688,
"step": 1150
},
{
"epoch": 0.62,
"learning_rate": 1.8590041432690893e-07,
"logits/chosen": -1.3896484375,
"logits/rejected": -1.1142855882644653,
"logps/chosen": -297.66619873046875,
"logps/rejected": -315.8662109375,
"loss": 0.977,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.100203037261963,
"rewards/margins": 0.3822095990180969,
"rewards/rejected": -1.4824126958847046,
"rewards/safe_rewards": -1.1140623092651367,
"rewards/unsafe_rewards": -1.0863438844680786,
"step": 1160
},
{
"epoch": 0.63,
"learning_rate": 1.813716656618788e-07,
"logits/chosen": -1.331933856010437,
"logits/rejected": -1.0040611028671265,
"logps/chosen": -300.43267822265625,
"logps/rejected": -321.0027770996094,
"loss": 0.969,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.1213122606277466,
"rewards/margins": 0.4405437111854553,
"rewards/rejected": -1.5618560314178467,
"rewards/safe_rewards": -1.0390920639038086,
"rewards/unsafe_rewards": -1.2035324573516846,
"step": 1170
},
{
"epoch": 0.63,
"learning_rate": 1.7686714501444788e-07,
"logits/chosen": -1.4456322193145752,
"logits/rejected": -0.8397674560546875,
"logps/chosen": -326.2867736816406,
"logps/rejected": -333.5230407714844,
"loss": 0.9325,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1393133401870728,
"rewards/margins": 0.5787724256515503,
"rewards/rejected": -1.7180858850479126,
"rewards/safe_rewards": -1.2189667224884033,
"rewards/unsafe_rewards": -1.0596599578857422,
"step": 1180
},
{
"epoch": 0.64,
"learning_rate": 1.7238844262582768e-07,
"logits/chosen": -1.363268494606018,
"logits/rejected": -1.1216974258422852,
"logps/chosen": -321.5309143066406,
"logps/rejected": -357.5130920410156,
"loss": 0.9391,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.1455228328704834,
"rewards/margins": 0.5131853222846985,
"rewards/rejected": -1.6587082147598267,
"rewards/safe_rewards": -1.0672725439071655,
"rewards/unsafe_rewards": -1.2237731218338013,
"step": 1190
},
{
"epoch": 0.65,
"learning_rate": 1.679371396225504e-07,
"logits/chosen": -1.4268032312393188,
"logits/rejected": -0.9499413371086121,
"logps/chosen": -310.05328369140625,
"logps/rejected": -351.35272216796875,
"loss": 0.9212,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.089523434638977,
"rewards/margins": 0.6114920973777771,
"rewards/rejected": -1.7010157108306885,
"rewards/safe_rewards": -1.027521014213562,
"rewards/unsafe_rewards": -1.151525855064392,
"step": 1200
},
{
"epoch": 0.65,
"learning_rate": 1.6351480745828096e-07,
"logits/chosen": -1.4248908758163452,
"logits/rejected": -1.0554434061050415,
"logps/chosen": -324.76751708984375,
"logps/rejected": -340.4091491699219,
"loss": 0.8506,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.2382574081420898,
"rewards/margins": 0.46283411979675293,
"rewards/rejected": -1.7010915279388428,
"rewards/safe_rewards": -1.2276763916015625,
"rewards/unsafe_rewards": -1.2488384246826172,
"step": 1210
},
{
"epoch": 0.66,
"learning_rate": 1.5912300735904248e-07,
"logits/chosen": -1.5351839065551758,
"logits/rejected": -1.176792860031128,
"logps/chosen": -334.9280090332031,
"logps/rejected": -334.9038391113281,
"loss": 0.9582,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1213009357452393,
"rewards/margins": 0.4566107392311096,
"rewards/rejected": -1.577911615371704,
"rewards/safe_rewards": -1.2159931659698486,
"rewards/unsafe_rewards": -1.0266087055206299,
"step": 1220
},
{
"epoch": 0.66,
"learning_rate": 1.5476328977205395e-07,
"logits/chosen": -1.5172470808029175,
"logits/rejected": -1.0838744640350342,
"logps/chosen": -318.88702392578125,
"logps/rejected": -331.92626953125,
"loss": 0.929,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.1251935958862305,
"rewards/margins": 0.5502282381057739,
"rewards/rejected": -1.675421953201294,
"rewards/safe_rewards": -1.2570269107818604,
"rewards/unsafe_rewards": -0.9933602213859558,
"step": 1230
},
{
"epoch": 0.67,
"learning_rate": 1.5043719381837112e-07,
"logits/chosen": -1.4699004888534546,
"logits/rejected": -1.1323144435882568,
"logps/chosen": -332.270751953125,
"logps/rejected": -346.4047546386719,
"loss": 0.9436,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1165038347244263,
"rewards/margins": 0.47613364458084106,
"rewards/rejected": -1.5926374197006226,
"rewards/safe_rewards": -1.0986835956573486,
"rewards/unsafe_rewards": -1.1343239545822144,
"step": 1240
},
{
"epoch": 0.67,
"learning_rate": 1.461462467495284e-07,
"logits/chosen": -1.445765495300293,
"logits/rejected": -1.0371205806732178,
"logps/chosen": -287.15216064453125,
"logps/rejected": -333.86676025390625,
"loss": 0.8531,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.092584490776062,
"rewards/margins": 0.5968576669692993,
"rewards/rejected": -1.6894422769546509,
"rewards/safe_rewards": -1.1179049015045166,
"rewards/unsafe_rewards": -1.067264199256897,
"step": 1250
},
{
"epoch": 0.68,
"learning_rate": 1.4189196340836865e-07,
"logits/chosen": -1.6602089405059814,
"logits/rejected": -1.2219452857971191,
"logps/chosen": -297.80426025390625,
"logps/rejected": -314.51458740234375,
"loss": 0.9095,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9903711080551147,
"rewards/margins": 0.4589962363243103,
"rewards/rejected": -1.4493674039840698,
"rewards/safe_rewards": -1.0244104862213135,
"rewards/unsafe_rewards": -0.9563320279121399,
"step": 1260
},
{
"epoch": 0.68,
"learning_rate": 1.3767584569425561e-07,
"logits/chosen": -1.6483261585235596,
"logits/rejected": -1.1660915613174438,
"logps/chosen": -309.3149719238281,
"logps/rejected": -320.50726318359375,
"loss": 0.9112,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0096858739852905,
"rewards/margins": 0.4956924319267273,
"rewards/rejected": -1.5053783655166626,
"rewards/safe_rewards": -1.045543909072876,
"rewards/unsafe_rewards": -0.973828136920929,
"step": 1270
},
{
"epoch": 0.69,
"learning_rate": 1.334993820328541e-07,
"logits/chosen": -1.501082420349121,
"logits/rejected": -1.1109731197357178,
"logps/chosen": -289.6914978027344,
"logps/rejected": -326.8876953125,
"loss": 0.9211,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1317590475082397,
"rewards/margins": 0.6352017521858215,
"rewards/rejected": -1.766960859298706,
"rewards/safe_rewards": -1.0952441692352295,
"rewards/unsafe_rewards": -1.1682740449905396,
"step": 1280
},
{
"epoch": 0.69,
"learning_rate": 1.2936404685066852e-07,
"logits/chosen": -1.487445592880249,
"logits/rejected": -1.1660425662994385,
"logps/chosen": -333.1927185058594,
"logps/rejected": -355.2337341308594,
"loss": 0.9879,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1514599323272705,
"rewards/margins": 0.4698604643344879,
"rewards/rejected": -1.621320366859436,
"rewards/safe_rewards": -1.2052063941955566,
"rewards/unsafe_rewards": -1.0977137088775635,
"step": 1290
},
{
"epoch": 0.7,
"learning_rate": 1.252713000545221e-07,
"logits/chosen": -1.6853973865509033,
"logits/rejected": -1.2987568378448486,
"logps/chosen": -322.5224914550781,
"logps/rejected": -331.0067138671875,
"loss": 0.8714,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0373764038085938,
"rewards/margins": 0.5552427768707275,
"rewards/rejected": -1.5926191806793213,
"rewards/safe_rewards": -1.0258121490478516,
"rewards/unsafe_rewards": -1.0489407777786255,
"step": 1300
},
{
"epoch": 0.7,
"learning_rate": 1.2122258651616304e-07,
"logits/chosen": -1.6230665445327759,
"logits/rejected": -1.1880443096160889,
"logps/chosen": -313.47772216796875,
"logps/rejected": -304.3802185058594,
"loss": 0.9486,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0324283838272095,
"rewards/margins": 0.39181455969810486,
"rewards/rejected": -1.4242427349090576,
"rewards/safe_rewards": -0.9644128680229187,
"rewards/unsafe_rewards": -1.1004436016082764,
"step": 1310
},
{
"epoch": 0.71,
"learning_rate": 1.1721933556217792e-07,
"logits/chosen": -1.5601403713226318,
"logits/rejected": -1.2365220785140991,
"logps/chosen": -307.1771240234375,
"logps/rejected": -328.2700500488281,
"loss": 0.9875,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.066193699836731,
"rewards/margins": 0.4565289616584778,
"rewards/rejected": -1.5227227210998535,
"rewards/safe_rewards": -1.0890748500823975,
"rewards/unsafe_rewards": -1.0433127880096436,
"step": 1320
},
{
"epoch": 0.72,
"learning_rate": 1.1326296046939333e-07,
"logits/chosen": -1.5020397901535034,
"logits/rejected": -1.146071195602417,
"logps/chosen": -292.02313232421875,
"logps/rejected": -311.25604248046875,
"loss": 0.926,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.971126914024353,
"rewards/margins": 0.559014081954956,
"rewards/rejected": -1.5301411151885986,
"rewards/safe_rewards": -0.9235084652900696,
"rewards/unsafe_rewards": -1.0187455415725708,
"step": 1330
},
{
"epoch": 0.72,
"learning_rate": 1.0935485796594351e-07,
"logits/chosen": -1.5235176086425781,
"logits/rejected": -1.0568602085113525,
"logps/chosen": -333.9339904785156,
"logps/rejected": -333.97576904296875,
"loss": 1.032,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0942230224609375,
"rewards/margins": 0.5436626672744751,
"rewards/rejected": -1.6378856897354126,
"rewards/safe_rewards": -1.079145073890686,
"rewards/unsafe_rewards": -1.109300971031189,
"step": 1340
},
{
"epoch": 0.73,
"learning_rate": 1.0549640773818028e-07,
"logits/chosen": -1.411492943763733,
"logits/rejected": -1.1914324760437012,
"logps/chosen": -315.5474853515625,
"logps/rejected": -322.1836242675781,
"loss": 0.9732,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1521949768066406,
"rewards/margins": 0.41504979133605957,
"rewards/rejected": -1.5672447681427002,
"rewards/safe_rewards": -1.2113215923309326,
"rewards/unsafe_rewards": -1.0930684804916382,
"step": 1350
},
{
"epoch": 0.73,
"learning_rate": 1.0168897194359921e-07,
"logits/chosen": -1.5103179216384888,
"logits/rejected": -1.1291395425796509,
"logps/chosen": -344.5137634277344,
"logps/rejected": -345.2103576660156,
"loss": 0.9403,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.1670544147491455,
"rewards/margins": 0.4278257489204407,
"rewards/rejected": -1.5948803424835205,
"rewards/safe_rewards": -1.1103397607803345,
"rewards/unsafe_rewards": -1.2237694263458252,
"step": 1360
},
{
"epoch": 0.74,
"learning_rate": 9.793389472995392e-08,
"logits/chosen": -1.437276840209961,
"logits/rejected": -0.9090574383735657,
"logps/chosen": -319.93829345703125,
"logps/rejected": -318.78802490234375,
"loss": 0.8305,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.0438860654830933,
"rewards/margins": 0.630165696144104,
"rewards/rejected": -1.6740516424179077,
"rewards/safe_rewards": -0.9974796175956726,
"rewards/unsafe_rewards": -1.0902923345565796,
"step": 1370
},
{
"epoch": 0.74,
"learning_rate": 9.423250176072874e-08,
"logits/chosen": -1.4429516792297363,
"logits/rejected": -1.0162016153335571,
"logps/chosen": -314.0827331542969,
"logps/rejected": -311.3514404296875,
"loss": 1.0966,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2516918182373047,
"rewards/margins": 0.3869698643684387,
"rewards/rejected": -1.6386617422103882,
"rewards/safe_rewards": -1.2116239070892334,
"rewards/unsafe_rewards": -1.291759967803955,
"step": 1380
},
{
"epoch": 0.75,
"learning_rate": 9.058609974713654e-08,
"logits/chosen": -1.5069031715393066,
"logits/rejected": -1.0585293769836426,
"logps/chosen": -312.8193054199219,
"logps/rejected": -344.32769775390625,
"loss": 0.8858,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.0806872844696045,
"rewards/margins": 0.6248958110809326,
"rewards/rejected": -1.7055833339691162,
"rewards/safe_rewards": -1.065792441368103,
"rewards/unsafe_rewards": -1.0955822467803955,
"step": 1390
},
{
"epoch": 0.75,
"learning_rate": 8.699597598680753e-08,
"logits/chosen": -1.4285691976547241,
"logits/rejected": -1.0217626094818115,
"logps/chosen": -298.48516845703125,
"logps/rejected": -319.5094909667969,
"loss": 0.8598,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0639463663101196,
"rewards/margins": 0.5066736340522766,
"rewards/rejected": -1.570619821548462,
"rewards/safe_rewards": -1.049989938735962,
"rewards/unsafe_rewards": -1.0779026746749878,
"step": 1400
},
{
"epoch": 0.76,
"learning_rate": 8.346339790933166e-08,
"logits/chosen": -1.4796028137207031,
"logits/rejected": -1.0254989862442017,
"logps/chosen": -303.11322021484375,
"logps/rejected": -319.5587463378906,
"loss": 0.9735,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.1025015115737915,
"rewards/margins": 0.5202707052230835,
"rewards/rejected": -1.622771978378296,
"rewards/safe_rewards": -1.048610806465149,
"rewards/unsafe_rewards": -1.1563920974731445,
"step": 1410
},
{
"epoch": 0.76,
"learning_rate": 7.998961262881506e-08,
"logits/chosen": -1.4263174533843994,
"logits/rejected": -0.9260984659194946,
"logps/chosen": -325.7618408203125,
"logps/rejected": -318.91070556640625,
"loss": 0.9355,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.0862057209014893,
"rewards/margins": 0.51823890209198,
"rewards/rejected": -1.6044447422027588,
"rewards/safe_rewards": -1.1601320505142212,
"rewards/unsafe_rewards": -1.0122793912887573,
"step": 1420
},
{
"epoch": 0.77,
"learning_rate": 7.657584650360846e-08,
"logits/chosen": -1.2710342407226562,
"logits/rejected": -0.989566445350647,
"logps/chosen": -300.971923828125,
"logps/rejected": -315.53387451171875,
"loss": 0.9896,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.1461577415466309,
"rewards/margins": 0.5026684999465942,
"rewards/rejected": -1.648826241493225,
"rewards/safe_rewards": -1.1745736598968506,
"rewards/unsafe_rewards": -1.117741584777832,
"step": 1430
},
{
"epoch": 0.77,
"learning_rate": 7.322330470336313e-08,
"logits/chosen": -1.38018798828125,
"logits/rejected": -0.90345698595047,
"logps/chosen": -322.70196533203125,
"logps/rejected": -353.0235595703125,
"loss": 0.9274,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.1377968788146973,
"rewards/margins": 0.5669258236885071,
"rewards/rejected": -1.7047227621078491,
"rewards/safe_rewards": -1.0534660816192627,
"rewards/unsafe_rewards": -1.2221280336380005,
"step": 1440
},
{
"epoch": 0.78,
"learning_rate": 6.993317078356709e-08,
"logits/chosen": -1.4155539274215698,
"logits/rejected": -1.210967779159546,
"logps/chosen": -331.1578674316406,
"logps/rejected": -324.1207580566406,
"loss": 0.9583,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.2102378606796265,
"rewards/margins": 0.2986965477466583,
"rewards/rejected": -1.5089343786239624,
"rewards/safe_rewards": -1.2407358884811401,
"rewards/unsafe_rewards": -1.1797398328781128,
"step": 1450
},
{
"epoch": 0.79,
"learning_rate": 6.67066062677118e-08,
"logits/chosen": -1.5069057941436768,
"logits/rejected": -1.0829797983169556,
"logps/chosen": -308.49664306640625,
"logps/rejected": -310.1492004394531,
"loss": 1.0037,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.0759823322296143,
"rewards/margins": 0.45751920342445374,
"rewards/rejected": -1.5335016250610352,
"rewards/safe_rewards": -1.0966331958770752,
"rewards/unsafe_rewards": -1.0553314685821533,
"step": 1460
},
{
"epoch": 0.79,
"learning_rate": 6.354475023723685e-08,
"logits/chosen": -1.4374122619628906,
"logits/rejected": -1.0555397272109985,
"logps/chosen": -348.89227294921875,
"logps/rejected": -351.38665771484375,
"loss": 0.949,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1484794616699219,
"rewards/margins": 0.5996941924095154,
"rewards/rejected": -1.748173475265503,
"rewards/safe_rewards": -1.1115808486938477,
"rewards/unsafe_rewards": -1.1853783130645752,
"step": 1470
},
{
"epoch": 0.8,
"learning_rate": 6.044871892939746e-08,
"logits/chosen": -1.5760042667388916,
"logits/rejected": -1.1585383415222168,
"logps/chosen": -320.93597412109375,
"logps/rejected": -338.1377868652344,
"loss": 0.9439,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.0907033681869507,
"rewards/margins": 0.4960748255252838,
"rewards/rejected": -1.5867780447006226,
"rewards/safe_rewards": -1.1278786659240723,
"rewards/unsafe_rewards": -1.0535279512405396,
"step": 1480
},
{
"epoch": 0.8,
"learning_rate": 5.741960534319676e-08,
"logits/chosen": -1.5081236362457275,
"logits/rejected": -1.237660527229309,
"logps/chosen": -281.8060607910156,
"logps/rejected": -306.55889892578125,
"loss": 0.9089,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.0866477489471436,
"rewards/margins": 0.4203459322452545,
"rewards/rejected": -1.5069936513900757,
"rewards/safe_rewards": -1.198955774307251,
"rewards/unsafe_rewards": -0.9743399620056152,
"step": 1490
},
{
"epoch": 0.81,
"learning_rate": 5.44584788535217e-08,
"logits/chosen": -1.5474069118499756,
"logits/rejected": -1.1630122661590576,
"logps/chosen": -322.1282653808594,
"logps/rejected": -332.53656005859375,
"loss": 0.8619,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0482981204986572,
"rewards/margins": 0.5498597621917725,
"rewards/rejected": -1.5981578826904297,
"rewards/safe_rewards": -1.0035431385040283,
"rewards/unsafe_rewards": -1.0930533409118652,
"step": 1500
},
{
"epoch": 0.81,
"eval_logits/chosen": -1.146567940711975,
"eval_logits/rejected": -0.7005062699317932,
"eval_logps/chosen": -221.80184936523438,
"eval_logps/rejected": -210.18832397460938,
"eval_loss": 0.34287312626838684,
"eval_rewards/accuracies": 0.7268877029418945,
"eval_rewards/chosen": -0.913629412651062,
"eval_rewards/margins": 0.2635413706302643,
"eval_rewards/rejected": -1.1771708726882935,
"eval_rewards/safe_rewards": -0.9047471284866333,
"eval_rewards/unsafe_rewards": -0.9192255139350891,
"eval_runtime": 1122.8444,
"eval_samples_per_second": 29.429,
"eval_steps_per_second": 0.92,
"step": 1500
},
{
"epoch": 0.81,
"learning_rate": 5.156638483361933e-08,
"logits/chosen": -1.612953782081604,
"logits/rejected": -1.2291548252105713,
"logps/chosen": -316.44293212890625,
"logps/rejected": -337.2187194824219,
"loss": 0.9167,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.9855987429618835,
"rewards/margins": 0.5536761283874512,
"rewards/rejected": -1.5392746925354004,
"rewards/safe_rewards": -1.0115060806274414,
"rewards/unsafe_rewards": -0.9596911668777466,
"step": 1510
},
{
"epoch": 0.82,
"learning_rate": 4.8744344286046236e-08,
"logits/chosen": -1.4979829788208008,
"logits/rejected": -1.1606972217559814,
"logps/chosen": -323.786865234375,
"logps/rejected": -326.4385681152344,
"loss": 0.9593,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0706676244735718,
"rewards/margins": 0.37281331419944763,
"rewards/rejected": -1.4434809684753418,
"rewards/safe_rewards": -1.150662899017334,
"rewards/unsafe_rewards": -0.99067223072052,
"step": 1520
},
{
"epoch": 0.82,
"learning_rate": 4.599335348222169e-08,
"logits/chosen": -1.5021213293075562,
"logits/rejected": -1.2472387552261353,
"logps/chosen": -329.1572265625,
"logps/rejected": -364.0357971191406,
"loss": 0.9118,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0721365213394165,
"rewards/margins": 0.5561688542366028,
"rewards/rejected": -1.628305435180664,
"rewards/safe_rewards": -1.036094307899475,
"rewards/unsafe_rewards": -1.108178734779358,
"step": 1530
},
{
"epoch": 0.83,
"learning_rate": 4.331438361071163e-08,
"logits/chosen": -1.5342875719070435,
"logits/rejected": -1.3547062873840332,
"logps/chosen": -334.36212158203125,
"logps/rejected": -347.2046813964844,
"loss": 0.9704,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.9460422396659851,
"rewards/margins": 0.44854864478111267,
"rewards/rejected": -1.394590973854065,
"rewards/safe_rewards": -0.965340256690979,
"rewards/unsafe_rewards": -0.9267444610595703,
"step": 1540
},
{
"epoch": 0.83,
"learning_rate": 4.0708380434367864e-08,
"logits/chosen": -1.5614886283874512,
"logits/rejected": -1.1849619150161743,
"logps/chosen": -300.528564453125,
"logps/rejected": -326.10638427734375,
"loss": 0.8839,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0258691310882568,
"rewards/margins": 0.4937085211277008,
"rewards/rejected": -1.5195776224136353,
"rewards/safe_rewards": -1.0579198598861694,
"rewards/unsafe_rewards": -0.993818461894989,
"step": 1550
},
{
"epoch": 0.84,
"learning_rate": 3.817626395644305e-08,
"logits/chosen": -1.5818434953689575,
"logits/rejected": -1.210106611251831,
"logps/chosen": -297.70025634765625,
"logps/rejected": -307.71710205078125,
"loss": 1.0046,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.9973915219306946,
"rewards/margins": 0.3978816568851471,
"rewards/rejected": -1.3952730894088745,
"rewards/safe_rewards": -0.9759462475776672,
"rewards/unsafe_rewards": -1.0188367366790771,
"step": 1560
},
{
"epoch": 0.84,
"learning_rate": 3.571892809580013e-08,
"logits/chosen": -1.5090782642364502,
"logits/rejected": -1.1929179430007935,
"logps/chosen": -307.1462097167969,
"logps/rejected": -320.7843017578125,
"loss": 0.9616,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.1602782011032104,
"rewards/margins": 0.36262303590774536,
"rewards/rejected": -1.5229012966156006,
"rewards/safe_rewards": -1.113433837890625,
"rewards/unsafe_rewards": -1.207122564315796,
"step": 1570
},
{
"epoch": 0.85,
"learning_rate": 3.333724037132976e-08,
"logits/chosen": -1.5555639266967773,
"logits/rejected": -1.2410228252410889,
"logps/chosen": -306.10662841796875,
"logps/rejected": -334.8534851074219,
"loss": 0.96,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.9998570680618286,
"rewards/margins": 0.5056936740875244,
"rewards/rejected": -1.5055506229400635,
"rewards/safe_rewards": -0.9567073583602905,
"rewards/unsafe_rewards": -1.0430065393447876,
"step": 1580
},
{
"epoch": 0.86,
"learning_rate": 3.1032041595688506e-08,
"logits/chosen": -1.4647419452667236,
"logits/rejected": -1.0212126970291138,
"logps/chosen": -309.408935546875,
"logps/rejected": -335.38299560546875,
"loss": 0.9033,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1254334449768066,
"rewards/margins": 0.5138076543807983,
"rewards/rejected": -1.6392412185668945,
"rewards/safe_rewards": -1.1220200061798096,
"rewards/unsafe_rewards": -1.1288467645645142,
"step": 1590
},
{
"epoch": 0.86,
"learning_rate": 2.880414557846453e-08,
"logits/chosen": -1.4411920309066772,
"logits/rejected": -1.2204091548919678,
"logps/chosen": -293.62689208984375,
"logps/rejected": -315.4649353027344,
"loss": 0.8851,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0861139297485352,
"rewards/margins": 0.4907267689704895,
"rewards/rejected": -1.5768407583236694,
"rewards/safe_rewards": -1.0654656887054443,
"rewards/unsafe_rewards": -1.106762170791626,
"step": 1600
},
{
"epoch": 0.87,
"learning_rate": 2.6654338838876662e-08,
"logits/chosen": -1.585949420928955,
"logits/rejected": -1.0787384510040283,
"logps/chosen": -322.25885009765625,
"logps/rejected": -315.6914978027344,
"loss": 0.8857,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.0791314840316772,
"rewards/margins": 0.5700170397758484,
"rewards/rejected": -1.6491485834121704,
"rewards/safe_rewards": -1.1312898397445679,
"rewards/unsafe_rewards": -1.0269731283187866,
"step": 1610
},
{
"epoch": 0.87,
"learning_rate": 2.4583380328107805e-08,
"logits/chosen": -1.5202205181121826,
"logits/rejected": -1.1094920635223389,
"logps/chosen": -331.93377685546875,
"logps/rejected": -334.93719482421875,
"loss": 0.9394,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0624914169311523,
"rewards/margins": 0.5675247311592102,
"rewards/rejected": -1.6300160884857178,
"rewards/safe_rewards": -1.0210199356079102,
"rewards/unsafe_rewards": -1.1039628982543945,
"step": 1620
},
{
"epoch": 0.88,
"learning_rate": 2.259200116137039e-08,
"logits/chosen": -1.4817931652069092,
"logits/rejected": -1.1653249263763428,
"logps/chosen": -334.85223388671875,
"logps/rejected": -357.4493713378906,
"loss": 0.9782,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.1782487630844116,
"rewards/margins": 0.4701511263847351,
"rewards/rejected": -1.6483999490737915,
"rewards/safe_rewards": -1.1807775497436523,
"rewards/unsafe_rewards": -1.175719976425171,
"step": 1630
},
{
"epoch": 0.88,
"learning_rate": 2.068090435979958e-08,
"logits/chosen": -1.4055829048156738,
"logits/rejected": -1.156343698501587,
"logps/chosen": -306.995361328125,
"logps/rejected": -317.0059509277344,
"loss": 0.9645,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.0118372440338135,
"rewards/margins": 0.44742053747177124,
"rewards/rejected": -1.45925772190094,
"rewards/safe_rewards": -1.0128872394561768,
"rewards/unsafe_rewards": -1.010787010192871,
"step": 1640
},
{
"epoch": 0.89,
"learning_rate": 1.8850764602263423e-08,
"logits/chosen": -1.4388693571090698,
"logits/rejected": -1.0342535972595215,
"logps/chosen": -311.6680603027344,
"logps/rejected": -348.0157165527344,
"loss": 0.9259,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1476449966430664,
"rewards/margins": 0.49749964475631714,
"rewards/rejected": -1.6451447010040283,
"rewards/safe_rewards": -1.1766583919525146,
"rewards/unsafe_rewards": -1.1186316013336182,
"step": 1650
},
{
"epoch": 0.89,
"learning_rate": 1.710222798718028e-08,
"logits/chosen": -1.498641014099121,
"logits/rejected": -1.1954280138015747,
"logps/chosen": -323.10626220703125,
"logps/rejected": -355.72705078125,
"loss": 0.8871,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.1219761371612549,
"rewards/margins": 0.4705938696861267,
"rewards/rejected": -1.5925698280334473,
"rewards/safe_rewards": -1.170627474784851,
"rewards/unsafe_rewards": -1.07332444190979,
"step": 1660
},
{
"epoch": 0.9,
"learning_rate": 1.5435911804424356e-08,
"logits/chosen": -1.545506477355957,
"logits/rejected": -1.2171175479888916,
"logps/chosen": -331.9698181152344,
"logps/rejected": -340.3002014160156,
"loss": 0.9938,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.0450648069381714,
"rewards/margins": 0.47697582840919495,
"rewards/rejected": -1.522040605545044,
"rewards/safe_rewards": -1.095984697341919,
"rewards/unsafe_rewards": -0.9941450357437134,
"step": 1670
},
{
"epoch": 0.9,
"learning_rate": 1.3852404317403199e-08,
"logits/chosen": -1.41542649269104,
"logits/rejected": -1.1583986282348633,
"logps/chosen": -297.29962158203125,
"logps/rejected": -336.06988525390625,
"loss": 0.9583,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1032707691192627,
"rewards/margins": 0.3784652352333069,
"rewards/rejected": -1.4817359447479248,
"rewards/safe_rewards": -1.15559983253479,
"rewards/unsafe_rewards": -1.0509414672851562,
"step": 1680
},
{
"epoch": 0.91,
"learning_rate": 1.235226455538113e-08,
"logits/chosen": -1.4842908382415771,
"logits/rejected": -1.2061156034469604,
"logps/chosen": -318.2637939453125,
"logps/rejected": -340.8109436035156,
"loss": 1.0008,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1303695440292358,
"rewards/margins": 0.47945070266723633,
"rewards/rejected": -1.6098201274871826,
"rewards/safe_rewards": -1.1323390007019043,
"rewards/unsafe_rewards": -1.128400206565857,
"step": 1690
},
{
"epoch": 0.91,
"learning_rate": 1.0936022116124321e-08,
"logits/chosen": -1.4996792078018188,
"logits/rejected": -1.116720199584961,
"logps/chosen": -307.4630432128906,
"logps/rejected": -331.1900634765625,
"loss": 0.866,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.0742332935333252,
"rewards/margins": 0.5489377975463867,
"rewards/rejected": -1.6231712102890015,
"rewards/safe_rewards": -1.107750654220581,
"rewards/unsafe_rewards": -1.0407161712646484,
"step": 1700
},
{
"epoch": 0.92,
"learning_rate": 9.60417697893534e-09,
"logits/chosen": -1.482126235961914,
"logits/rejected": -1.1564598083496094,
"logps/chosen": -312.1965026855469,
"logps/rejected": -340.93817138671875,
"loss": 0.9667,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.156433343887329,
"rewards/margins": 0.43530288338661194,
"rewards/rejected": -1.5917361974716187,
"rewards/safe_rewards": -1.0536446571350098,
"rewards/unsafe_rewards": -1.2592222690582275,
"step": 1710
},
{
"epoch": 0.93,
"learning_rate": 8.357199328144576e-09,
"logits/chosen": -1.4593350887298584,
"logits/rejected": -1.200596570968628,
"logps/chosen": -356.2051086425781,
"logps/rejected": -369.406494140625,
"loss": 0.8611,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.1415997743606567,
"rewards/margins": 0.49805140495300293,
"rewards/rejected": -1.6396510601043701,
"rewards/safe_rewards": -1.2040703296661377,
"rewards/unsafe_rewards": -1.0791290998458862,
"step": 1720
},
{
"epoch": 0.93,
"learning_rate": 7.1955293871198144e-09,
"logits/chosen": -1.350987195968628,
"logits/rejected": -1.1763075590133667,
"logps/chosen": -296.27191162109375,
"logps/rejected": -321.37518310546875,
"loss": 0.9819,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.2195558547973633,
"rewards/margins": 0.39501625299453735,
"rewards/rejected": -1.6145721673965454,
"rewards/safe_rewards": -1.2167112827301025,
"rewards/unsafe_rewards": -1.2224003076553345,
"step": 1730
},
{
"epoch": 0.94,
"learning_rate": 6.119577262853254e-09,
"logits/chosen": -1.441007375717163,
"logits/rejected": -1.0252482891082764,
"logps/chosen": -299.6694641113281,
"logps/rejected": -312.8736267089844,
"loss": 0.9801,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.121921420097351,
"rewards/margins": 0.5479141473770142,
"rewards/rejected": -1.6698356866836548,
"rewards/safe_rewards": -1.0443205833435059,
"rewards/unsafe_rewards": -1.1995223760604858,
"step": 1740
},
{
"epoch": 0.94,
"learning_rate": 5.129722801180542e-09,
"logits/chosen": -1.4107353687286377,
"logits/rejected": -1.0875647068023682,
"logps/chosen": -322.1299743652344,
"logps/rejected": -341.2066650390625,
"loss": 0.8399,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.189591407775879,
"rewards/margins": 0.5180362462997437,
"rewards/rejected": -1.707627534866333,
"rewards/safe_rewards": -1.3024308681488037,
"rewards/unsafe_rewards": -1.076751708984375,
"step": 1750
},
{
"epoch": 0.95,
"learning_rate": 4.226315452682816e-09,
"logits/chosen": -1.4723705053329468,
"logits/rejected": -1.1786158084869385,
"logps/chosen": -305.52362060546875,
"logps/rejected": -326.5906677246094,
"loss": 0.9485,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.046170711517334,
"rewards/margins": 0.4863569736480713,
"rewards/rejected": -1.5325279235839844,
"rewards/safe_rewards": -1.0766370296478271,
"rewards/unsafe_rewards": -1.0157043933868408,
"step": 1760
},
{
"epoch": 0.95,
"learning_rate": 3.4096741493194193e-09,
"logits/chosen": -1.527930498123169,
"logits/rejected": -1.2494192123413086,
"logps/chosen": -315.3348693847656,
"logps/rejected": -332.2015686035156,
"loss": 1.0269,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1415612697601318,
"rewards/margins": 0.39452338218688965,
"rewards/rejected": -1.5360848903656006,
"rewards/safe_rewards": -1.0917105674743652,
"rewards/unsafe_rewards": -1.1914122104644775,
"step": 1770
},
{
"epoch": 0.96,
"learning_rate": 2.6800871918346846e-09,
"logits/chosen": -1.5815150737762451,
"logits/rejected": -1.1552437543869019,
"logps/chosen": -320.330078125,
"logps/rejected": -340.8857727050781,
"loss": 0.9364,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0657531023025513,
"rewards/margins": 0.5418880581855774,
"rewards/rejected": -1.6076412200927734,
"rewards/safe_rewards": -1.1473314762115479,
"rewards/unsafe_rewards": -0.9841750264167786,
"step": 1780
},
{
"epoch": 0.96,
"learning_rate": 2.0378121479783796e-09,
"logits/chosen": -1.4162019491195679,
"logits/rejected": -1.0255894660949707,
"logps/chosen": -313.9689025878906,
"logps/rejected": -334.1581115722656,
"loss": 0.9987,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.1681249141693115,
"rewards/margins": 0.49787864089012146,
"rewards/rejected": -1.6660035848617554,
"rewards/safe_rewards": -1.1383296251296997,
"rewards/unsafe_rewards": -1.1979202032089233,
"step": 1790
},
{
"epoch": 0.97,
"learning_rate": 1.4830757615760247e-09,
"logits/chosen": -1.437466025352478,
"logits/rejected": -1.0774166584014893,
"logps/chosen": -325.25677490234375,
"logps/rejected": -333.42791748046875,
"loss": 0.9478,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0991178750991821,
"rewards/margins": 0.44227179884910583,
"rewards/rejected": -1.5413895845413208,
"rewards/safe_rewards": -1.1737608909606934,
"rewards/unsafe_rewards": -1.024474859237671,
"step": 1800
},
{
"epoch": 0.97,
"learning_rate": 1.0160738724809548e-09,
"logits/chosen": -1.496584177017212,
"logits/rejected": -1.0213630199432373,
"logps/chosen": -305.14263916015625,
"logps/rejected": -338.046875,
"loss": 0.881,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1096470355987549,
"rewards/margins": 0.5430228114128113,
"rewards/rejected": -1.6526696681976318,
"rewards/safe_rewards": -1.1323860883712769,
"rewards/unsafe_rewards": -1.0869077444076538,
"step": 1810
},
{
"epoch": 0.98,
"learning_rate": 6.369713474366212e-10,
"logits/chosen": -1.460850477218628,
"logits/rejected": -1.1146998405456543,
"logps/chosen": -342.700439453125,
"logps/rejected": -370.11865234375,
"loss": 0.8416,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.1760565042495728,
"rewards/margins": 0.5496792793273926,
"rewards/rejected": -1.7257359027862549,
"rewards/safe_rewards": -1.209657907485962,
"rewards/unsafe_rewards": -1.1424554586410522,
"step": 1820
},
{
"epoch": 0.98,
"learning_rate": 3.459020218731512e-10,
"logits/chosen": -1.4401605129241943,
"logits/rejected": -1.135258674621582,
"logps/chosen": -297.53607177734375,
"logps/rejected": -319.9522399902344,
"loss": 0.872,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.0829004049301147,
"rewards/margins": 0.5533518195152283,
"rewards/rejected": -1.6362521648406982,
"rewards/safe_rewards": -1.041105031967163,
"rewards/unsafe_rewards": -1.124695897102356,
"step": 1830
},
{
"epoch": 0.99,
"learning_rate": 1.429686526593088e-10,
"logits/chosen": -1.4089921712875366,
"logits/rejected": -1.140413522720337,
"logps/chosen": -320.8370666503906,
"logps/rejected": -343.36822509765625,
"loss": 1.0174,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.1735649108886719,
"rewards/margins": 0.45700669288635254,
"rewards/rejected": -1.6305716037750244,
"rewards/safe_rewards": -1.203802227973938,
"rewards/unsafe_rewards": -1.1433275938034058,
"step": 1840
},
{
"epoch": 1.0,
"learning_rate": 2.824288182584622e-11,
"logits/chosen": -1.5680155754089355,
"logits/rejected": -1.1377493143081665,
"logps/chosen": -327.45550537109375,
"logps/rejected": -339.25115966796875,
"loss": 0.8677,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1065986156463623,
"rewards/margins": 0.5338612198829651,
"rewards/rejected": -1.6404597759246826,
"rewards/safe_rewards": -1.171757459640503,
"rewards/unsafe_rewards": -1.0414397716522217,
"step": 1850
},
{
"epoch": 1.0,
"step": 1858,
"total_flos": 0.0,
"train_loss": 1.018996798697021,
"train_runtime": 22449.6551,
"train_samples_per_second": 2.649,
"train_steps_per_second": 0.083
}
],
"logging_steps": 10,
"max_steps": 1858,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}