dpo-selective-buffer-safeipo / trainer_state.json
wxzhang's picture
Model save
a2514bf verified
raw
history blame
No virus
114 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996020692399522,
"eval_steps": 500,
"global_step": 1884,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 2.645502645502645e-09,
"logits/chosen": -1.8052858114242554,
"logits/rejected": -1.8250553607940674,
"logps/chosen": -201.6904296875,
"logps/rejected": -206.93157958984375,
"loss": 7734.375,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"rewards/safe_rewards": 0.0,
"rewards/unsafe_rewards": 0.0,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 2.6455026455026453e-08,
"logits/chosen": -2.025691032409668,
"logits/rejected": -1.8649556636810303,
"logps/chosen": -270.43963623046875,
"logps/rejected": -169.98423767089844,
"loss": 7727.0087,
"rewards/accuracies": 0.4027777910232544,
"rewards/chosen": 4.114356852369383e-05,
"rewards/margins": -0.0002653732954058796,
"rewards/rejected": 0.00030651676934212446,
"rewards/safe_rewards": -1.17086410682532e-05,
"rewards/unsafe_rewards": -0.0006500756135210395,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 5.2910052910052905e-08,
"logits/chosen": -1.961146593093872,
"logits/rejected": -1.873740553855896,
"logps/chosen": -189.17404174804688,
"logps/rejected": -176.31651306152344,
"loss": 7718.007,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -6.340327672660351e-06,
"rewards/margins": -0.00010152898175874725,
"rewards/rejected": 9.518869046587497e-05,
"rewards/safe_rewards": 0.00045737033360637724,
"rewards/unsafe_rewards": -8.718876051716506e-05,
"step": 20
},
{
"epoch": 0.02,
"learning_rate": 7.936507936507936e-08,
"logits/chosen": -1.9912703037261963,
"logits/rejected": -1.883933424949646,
"logps/chosen": -198.4538116455078,
"logps/rejected": -183.28781127929688,
"loss": 7515.9359,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0001133469631895423,
"rewards/margins": 0.0007399408495984972,
"rewards/rejected": -0.0006265938864089549,
"rewards/safe_rewards": 0.00022509883274324238,
"rewards/unsafe_rewards": 0.0002071214112220332,
"step": 30
},
{
"epoch": 0.02,
"learning_rate": 1.0582010582010581e-07,
"logits/chosen": -1.927167534828186,
"logits/rejected": -1.8453724384307861,
"logps/chosen": -198.85276794433594,
"logps/rejected": -174.22967529296875,
"loss": 7334.5094,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.00027468582266010344,
"rewards/margins": 0.0014765586238354445,
"rewards/rejected": -0.0012018729466944933,
"rewards/safe_rewards": 0.0002533269871491939,
"rewards/unsafe_rewards": 0.00015336349315475672,
"step": 40
},
{
"epoch": 0.03,
"learning_rate": 1.3227513227513225e-07,
"logits/chosen": -2.037893533706665,
"logits/rejected": -1.8426322937011719,
"logps/chosen": -214.9281463623047,
"logps/rejected": -162.3707733154297,
"loss": 7399.5859,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.0017435807967558503,
"rewards/margins": 0.001902301562950015,
"rewards/rejected": -0.00015872062067501247,
"rewards/safe_rewards": 0.002309921896085143,
"rewards/unsafe_rewards": 0.00044932105811312795,
"step": 50
},
{
"epoch": 0.03,
"learning_rate": 1.5873015873015872e-07,
"logits/chosen": -2.011747360229492,
"logits/rejected": -1.8823707103729248,
"logps/chosen": -182.73411560058594,
"logps/rejected": -155.423095703125,
"loss": 7214.4602,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.0006955948774702847,
"rewards/margins": 0.005063413176685572,
"rewards/rejected": -0.0057590072974562645,
"rewards/safe_rewards": -0.0021988481748849154,
"rewards/unsafe_rewards": 0.0001153635821538046,
"step": 60
},
{
"epoch": 0.04,
"learning_rate": 1.8518518518518516e-07,
"logits/chosen": -1.975612998008728,
"logits/rejected": -1.8158948421478271,
"logps/chosen": -186.48574829101562,
"logps/rejected": -168.57896423339844,
"loss": 7816.8766,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.007440758403390646,
"rewards/margins": 0.010602862574160099,
"rewards/rejected": -0.018043622374534607,
"rewards/safe_rewards": -0.010516250506043434,
"rewards/unsafe_rewards": -0.015666166320443153,
"step": 70
},
{
"epoch": 0.04,
"learning_rate": 2.1164021164021162e-07,
"logits/chosen": -1.9063125848770142,
"logits/rejected": -1.7897474765777588,
"logps/chosen": -210.2836151123047,
"logps/rejected": -180.822998046875,
"loss": 7304.9531,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.024481967091560364,
"rewards/margins": 0.016244709491729736,
"rewards/rejected": -0.0407266803085804,
"rewards/safe_rewards": -0.02365388534963131,
"rewards/unsafe_rewards": -0.0289783775806427,
"step": 80
},
{
"epoch": 0.05,
"learning_rate": 2.3809523809523806e-07,
"logits/chosen": -1.994605302810669,
"logits/rejected": -1.866681694984436,
"logps/chosen": -203.6532440185547,
"logps/rejected": -174.1517791748047,
"loss": 7251.9984,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.06749475002288818,
"rewards/margins": 0.020768558606505394,
"rewards/rejected": -0.08826331794261932,
"rewards/safe_rewards": -0.06556878238916397,
"rewards/unsafe_rewards": -0.052192188799381256,
"step": 90
},
{
"epoch": 0.05,
"learning_rate": 2.645502645502645e-07,
"logits/chosen": -1.9495357275009155,
"logits/rejected": -1.8006837368011475,
"logps/chosen": -205.99411010742188,
"logps/rejected": -192.54415893554688,
"loss": 6776.1008,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.11886356770992279,
"rewards/margins": 0.020749244838953018,
"rewards/rejected": -0.1396128088235855,
"rewards/safe_rewards": -0.11704058945178986,
"rewards/unsafe_rewards": -0.1348837912082672,
"step": 100
},
{
"epoch": 0.06,
"learning_rate": 2.9100529100529097e-07,
"logits/chosen": -1.9887052774429321,
"logits/rejected": -1.8671073913574219,
"logps/chosen": -226.98001098632812,
"logps/rejected": -217.73733520507812,
"loss": 6636.9766,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.11880362033843994,
"rewards/margins": 0.03935481607913971,
"rewards/rejected": -0.15815845131874084,
"rewards/safe_rewards": -0.14540424942970276,
"rewards/unsafe_rewards": -0.11240017414093018,
"step": 110
},
{
"epoch": 0.06,
"learning_rate": 3.1746031746031743e-07,
"logits/chosen": -1.8841511011123657,
"logits/rejected": -1.6952005624771118,
"logps/chosen": -235.6121368408203,
"logps/rejected": -192.76162719726562,
"loss": 6804.4828,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.1285235583782196,
"rewards/margins": 0.07450314611196518,
"rewards/rejected": -0.20302672684192657,
"rewards/safe_rewards": -0.12894900143146515,
"rewards/unsafe_rewards": -0.12272067368030548,
"step": 120
},
{
"epoch": 0.07,
"learning_rate": 3.439153439153439e-07,
"logits/chosen": -1.8711330890655518,
"logits/rejected": -1.6887938976287842,
"logps/chosen": -225.3953094482422,
"logps/rejected": -200.31997680664062,
"loss": 7036.6016,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.11849894374608994,
"rewards/margins": 0.05801115185022354,
"rewards/rejected": -0.17651011049747467,
"rewards/safe_rewards": -0.10611984878778458,
"rewards/unsafe_rewards": -0.14429841935634613,
"step": 130
},
{
"epoch": 0.07,
"learning_rate": 3.703703703703703e-07,
"logits/chosen": -1.826206922531128,
"logits/rejected": -1.6439968347549438,
"logps/chosen": -220.1838836669922,
"logps/rejected": -185.7141876220703,
"loss": 6936.9914,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.11376659572124481,
"rewards/margins": 0.0765247792005539,
"rewards/rejected": -0.1902913898229599,
"rewards/safe_rewards": -0.11482509225606918,
"rewards/unsafe_rewards": -0.09925278276205063,
"step": 140
},
{
"epoch": 0.08,
"learning_rate": 3.968253968253968e-07,
"logits/chosen": -1.7187334299087524,
"logits/rejected": -1.5741361379623413,
"logps/chosen": -211.09603881835938,
"logps/rejected": -203.66156005859375,
"loss": 6555.6867,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.19104455411434174,
"rewards/margins": 0.06891994178295135,
"rewards/rejected": -0.2599644958972931,
"rewards/safe_rewards": -0.20118245482444763,
"rewards/unsafe_rewards": -0.16981182992458344,
"step": 150
},
{
"epoch": 0.08,
"learning_rate": 4.2328042328042324e-07,
"logits/chosen": -1.7090606689453125,
"logits/rejected": -1.4574247598648071,
"logps/chosen": -231.1162567138672,
"logps/rejected": -197.13832092285156,
"loss": 6483.332,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.2999975085258484,
"rewards/margins": 0.08841492235660553,
"rewards/rejected": -0.3884124159812927,
"rewards/safe_rewards": -0.2963607907295227,
"rewards/unsafe_rewards": -0.2815978527069092,
"step": 160
},
{
"epoch": 0.09,
"learning_rate": 4.497354497354497e-07,
"logits/chosen": -1.7472738027572632,
"logits/rejected": -1.5065333843231201,
"logps/chosen": -255.1507110595703,
"logps/rejected": -221.82241821289062,
"loss": 6801.5375,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.23129959404468536,
"rewards/margins": 0.12043756246566772,
"rewards/rejected": -0.35173720121383667,
"rewards/safe_rewards": -0.22959312796592712,
"rewards/unsafe_rewards": -0.1985938847064972,
"step": 170
},
{
"epoch": 0.1,
"learning_rate": 4.761904761904761e-07,
"logits/chosen": -1.680676817893982,
"logits/rejected": -1.4166452884674072,
"logps/chosen": -216.8690948486328,
"logps/rejected": -191.8008270263672,
"loss": 6535.7055,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.26913732290267944,
"rewards/margins": 0.11233188211917877,
"rewards/rejected": -0.381469190120697,
"rewards/safe_rewards": -0.26176974177360535,
"rewards/unsafe_rewards": -0.23940448462963104,
"step": 180
},
{
"epoch": 0.1,
"learning_rate": 4.999995705919032e-07,
"logits/chosen": -1.5433807373046875,
"logits/rejected": -1.2667306661605835,
"logps/chosen": -224.0026397705078,
"logps/rejected": -205.34414672851562,
"loss": 6409.0121,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.19693121314048767,
"rewards/margins": 0.09455744177103043,
"rewards/rejected": -0.2914886772632599,
"rewards/safe_rewards": -0.17649488151073456,
"rewards/unsafe_rewards": -0.18380855023860931,
"step": 190
},
{
"epoch": 0.11,
"learning_rate": 4.999480434051858e-07,
"logits/chosen": -1.5521910190582275,
"logits/rejected": -1.3097938299179077,
"logps/chosen": -225.257568359375,
"logps/rejected": -205.92129516601562,
"loss": 6576.5188,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.1997550129890442,
"rewards/margins": 0.0904761329293251,
"rewards/rejected": -0.2902311384677887,
"rewards/safe_rewards": -0.20136451721191406,
"rewards/unsafe_rewards": -0.21680407226085663,
"step": 200
},
{
"epoch": 0.11,
"learning_rate": 4.998106548810311e-07,
"logits/chosen": -1.3539698123931885,
"logits/rejected": -1.2038872241973877,
"logps/chosen": -212.8267364501953,
"logps/rejected": -220.0903778076172,
"loss": 6444.5828,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.2437468022108078,
"rewards/margins": 0.14799915254116058,
"rewards/rejected": -0.3917458951473236,
"rewards/safe_rewards": -0.2773512601852417,
"rewards/unsafe_rewards": -0.2216939926147461,
"step": 210
},
{
"epoch": 0.12,
"learning_rate": 4.995874522146975e-07,
"logits/chosen": -1.503328561782837,
"logits/rejected": -1.3146250247955322,
"logps/chosen": -236.4509735107422,
"logps/rejected": -211.6634063720703,
"loss": 6233.5547,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.29747992753982544,
"rewards/margins": 0.13039958477020264,
"rewards/rejected": -0.4278795123100281,
"rewards/safe_rewards": -0.2768808901309967,
"rewards/unsafe_rewards": -0.3182833790779114,
"step": 220
},
{
"epoch": 0.12,
"learning_rate": 4.992785120800375e-07,
"logits/chosen": -1.576887845993042,
"logits/rejected": -1.2664101123809814,
"logps/chosen": -237.9243621826172,
"logps/rejected": -213.4459991455078,
"loss": 6108.0914,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.23068375885486603,
"rewards/margins": 0.14957153797149658,
"rewards/rejected": -0.3802553117275238,
"rewards/safe_rewards": -0.22292426228523254,
"rewards/unsafe_rewards": -0.18162095546722412,
"step": 230
},
{
"epoch": 0.13,
"learning_rate": 4.988839406031596e-07,
"logits/chosen": -1.515092134475708,
"logits/rejected": -1.2886550426483154,
"logps/chosen": -223.7300567626953,
"logps/rejected": -192.06324768066406,
"loss": 6310.6699,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.24790284037590027,
"rewards/margins": 0.1096932515501976,
"rewards/rejected": -0.3575960695743561,
"rewards/safe_rewards": -0.2673969864845276,
"rewards/unsafe_rewards": -0.24145250022411346,
"step": 240
},
{
"epoch": 0.13,
"learning_rate": 4.98403873325972e-07,
"logits/chosen": -1.5146888494491577,
"logits/rejected": -1.3244738578796387,
"logps/chosen": -213.21694946289062,
"logps/rejected": -209.35061645507812,
"loss": 6209.5707,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.2071472406387329,
"rewards/margins": 0.16860046982765198,
"rewards/rejected": -0.3757476806640625,
"rewards/safe_rewards": -0.1998087763786316,
"rewards/unsafe_rewards": -0.20211009681224823,
"step": 250
},
{
"epoch": 0.14,
"learning_rate": 4.978384751596212e-07,
"logits/chosen": -1.3180285692214966,
"logits/rejected": -1.1171799898147583,
"logps/chosen": -232.109375,
"logps/rejected": -236.84072875976562,
"loss": 6328.7531,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.32092350721359253,
"rewards/margins": 0.17156612873077393,
"rewards/rejected": -0.49248963594436646,
"rewards/safe_rewards": -0.4227983355522156,
"rewards/unsafe_rewards": -0.3325851559638977,
"step": 260
},
{
"epoch": 0.14,
"learning_rate": 4.971879403278432e-07,
"logits/chosen": -1.1372450590133667,
"logits/rejected": -0.9446180462837219,
"logps/chosen": -234.88888549804688,
"logps/rejected": -224.05886840820312,
"loss": 6312.1719,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.29563266038894653,
"rewards/margins": 0.12811212241649628,
"rewards/rejected": -0.4237447679042816,
"rewards/safe_rewards": -0.33217892050743103,
"rewards/unsafe_rewards": -0.27307888865470886,
"step": 270
},
{
"epoch": 0.15,
"learning_rate": 4.964524923002436e-07,
"logits/chosen": -1.415801763534546,
"logits/rejected": -1.1731336116790771,
"logps/chosen": -241.7359619140625,
"logps/rejected": -224.5096893310547,
"loss": 5974.0195,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3053835928440094,
"rewards/margins": 0.16657045483589172,
"rewards/rejected": -0.4719540476799011,
"rewards/safe_rewards": -0.3295218348503113,
"rewards/unsafe_rewards": -0.30390697717666626,
"step": 280
},
{
"epoch": 0.15,
"learning_rate": 4.956323837155325e-07,
"logits/chosen": -1.2966214418411255,
"logits/rejected": -1.1260521411895752,
"logps/chosen": -227.2568359375,
"logps/rejected": -214.1421661376953,
"loss": 6133.0227,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.261239230632782,
"rewards/margins": 0.15825437009334564,
"rewards/rejected": -0.4194936156272888,
"rewards/safe_rewards": -0.2375851422548294,
"rewards/unsafe_rewards": -0.2705303132534027,
"step": 290
},
{
"epoch": 0.16,
"learning_rate": 4.947278962947386e-07,
"logits/chosen": -1.255904197692871,
"logits/rejected": -1.0300556421279907,
"logps/chosen": -231.86593627929688,
"logps/rejected": -213.03768920898438,
"loss": 5684.9316,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.30576351284980774,
"rewards/margins": 0.1560250222682953,
"rewards/rejected": -0.4617885649204254,
"rewards/safe_rewards": -0.3117372691631317,
"rewards/unsafe_rewards": -0.30344492197036743,
"step": 300
},
{
"epoch": 0.16,
"learning_rate": 4.937393407444337e-07,
"logits/chosen": -1.1847805976867676,
"logits/rejected": -0.8935750722885132,
"logps/chosen": -235.5170135498047,
"logps/rejected": -226.17910766601562,
"loss": 5606.7586,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.4436865746974945,
"rewards/margins": 0.12356774508953094,
"rewards/rejected": -0.5672543048858643,
"rewards/safe_rewards": -0.4222384989261627,
"rewards/unsafe_rewards": -0.49501723051071167,
"step": 310
},
{
"epoch": 0.17,
"learning_rate": 4.926670566499992e-07,
"logits/chosen": -0.6831132173538208,
"logits/rejected": -0.43409886956214905,
"logps/chosen": -230.1105499267578,
"logps/rejected": -223.13021850585938,
"loss": 6029.3086,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.4783251881599426,
"rewards/margins": 0.13184307515621185,
"rewards/rejected": -0.6101682782173157,
"rewards/safe_rewards": -0.46370235085487366,
"rewards/unsafe_rewards": -0.4838125705718994,
"step": 320
},
{
"epoch": 0.18,
"learning_rate": 4.915114123589732e-07,
"logits/chosen": -0.5296390652656555,
"logits/rejected": -0.23315271735191345,
"logps/chosen": -264.1290588378906,
"logps/rejected": -222.7255401611328,
"loss": 6587.2148,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.49660125374794006,
"rewards/margins": 0.1269882619380951,
"rewards/rejected": -0.6235895156860352,
"rewards/safe_rewards": -0.5574027299880981,
"rewards/unsafe_rewards": -0.5570284128189087,
"step": 330
},
{
"epoch": 0.18,
"learning_rate": 4.90272804854517e-07,
"logits/chosen": -0.20833459496498108,
"logits/rejected": 0.08662636578083038,
"logps/chosen": -271.68389892578125,
"logps/rejected": -259.1782531738281,
"loss": 6224.5324,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.5533224940299988,
"rewards/margins": 0.15772438049316406,
"rewards/rejected": -0.7110469341278076,
"rewards/safe_rewards": -0.5448375940322876,
"rewards/unsafe_rewards": -0.5393844842910767,
"step": 340
},
{
"epoch": 0.19,
"learning_rate": 4.889516596190448e-07,
"logits/chosen": -0.7373126149177551,
"logits/rejected": -0.34005147218704224,
"logps/chosen": -293.0935363769531,
"logps/rejected": -241.9617156982422,
"loss": 6110.7906,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5001389980316162,
"rewards/margins": 0.1725221574306488,
"rewards/rejected": -0.6726611852645874,
"rewards/safe_rewards": -0.4835886061191559,
"rewards/unsafe_rewards": -0.5382236838340759,
"step": 350
},
{
"epoch": 0.19,
"learning_rate": 4.875484304880629e-07,
"logits/chosen": -0.8152839541435242,
"logits/rejected": -0.4126107096672058,
"logps/chosen": -302.5885314941406,
"logps/rejected": -256.1798095703125,
"loss": 6488.7234,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.48745980858802795,
"rewards/margins": 0.10641022026538849,
"rewards/rejected": -0.5938700437545776,
"rewards/safe_rewards": -0.449713796377182,
"rewards/unsafe_rewards": -0.48859700560569763,
"step": 360
},
{
"epoch": 0.2,
"learning_rate": 4.860635994942702e-07,
"logits/chosen": -0.47416171431541443,
"logits/rejected": 0.00913926400244236,
"logps/chosen": -258.38189697265625,
"logps/rejected": -230.67880249023438,
"loss": 5790.3816,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.5084312558174133,
"rewards/margins": 0.1444414108991623,
"rewards/rejected": -0.6528726816177368,
"rewards/safe_rewards": -0.5270028114318848,
"rewards/unsafe_rewards": -0.48991069197654724,
"step": 370
},
{
"epoch": 0.2,
"learning_rate": 4.844976767019714e-07,
"logits/chosen": -0.19216355681419373,
"logits/rejected": 0.15172423422336578,
"logps/chosen": -222.911865234375,
"logps/rejected": -202.00888061523438,
"loss": 5908.2133,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5394010543823242,
"rewards/margins": 0.11715151369571686,
"rewards/rejected": -0.6565525531768799,
"rewards/safe_rewards": -0.5183984041213989,
"rewards/unsafe_rewards": -0.5164821743965149,
"step": 380
},
{
"epoch": 0.21,
"learning_rate": 4.828512000318616e-07,
"logits/chosen": -0.213291734457016,
"logits/rejected": 0.39291974902153015,
"logps/chosen": -303.5594177246094,
"logps/rejected": -259.14178466796875,
"loss": 6109.6039,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5700324177742004,
"rewards/margins": 0.1927037090063095,
"rewards/rejected": -0.7627362012863159,
"rewards/safe_rewards": -0.5912032723426819,
"rewards/unsafe_rewards": -0.5395609140396118,
"step": 390
},
{
"epoch": 0.21,
"learning_rate": 4.811247350762418e-07,
"logits/chosen": -0.36068278551101685,
"logits/rejected": 0.05598723143339157,
"logps/chosen": -240.6222381591797,
"logps/rejected": -234.20803833007812,
"loss": 5907.1703,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.554689347743988,
"rewards/margins": 0.17352624237537384,
"rewards/rejected": -0.7282156348228455,
"rewards/safe_rewards": -0.5173069834709167,
"rewards/unsafe_rewards": -0.5826700329780579,
"step": 400
},
{
"epoch": 0.22,
"learning_rate": 4.79318874904728e-07,
"logits/chosen": -0.5469863414764404,
"logits/rejected": -0.3919845223426819,
"logps/chosen": -267.99761962890625,
"logps/rejected": -260.9379577636719,
"loss": 6323.5375,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5513988137245178,
"rewards/margins": 0.16061297059059143,
"rewards/rejected": -0.7120116949081421,
"rewards/safe_rewards": -0.5992297530174255,
"rewards/unsafe_rewards": -0.5494996309280396,
"step": 410
},
{
"epoch": 0.22,
"learning_rate": 4.774342398605221e-07,
"logits/chosen": -1.3936598300933838,
"logits/rejected": -1.0238125324249268,
"logps/chosen": -262.09033203125,
"logps/rejected": -221.07174682617188,
"loss": 5492.8094,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5323154926300049,
"rewards/margins": 0.15208503603935242,
"rewards/rejected": -0.6844004988670349,
"rewards/safe_rewards": -0.5349102020263672,
"rewards/unsafe_rewards": -0.505738377571106,
"step": 420
},
{
"epoch": 0.23,
"learning_rate": 4.754714773473134e-07,
"logits/chosen": -1.2268015146255493,
"logits/rejected": -1.0391647815704346,
"logps/chosen": -248.2527313232422,
"logps/rejected": -258.4667663574219,
"loss": 6146.5922,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.5346105694770813,
"rewards/margins": 0.18027544021606445,
"rewards/rejected": -0.7148860692977905,
"rewards/safe_rewards": -0.4759598672389984,
"rewards/unsafe_rewards": -0.534007728099823,
"step": 430
},
{
"epoch": 0.23,
"learning_rate": 4.734312616068851e-07,
"logits/chosen": -1.2311909198760986,
"logits/rejected": -0.9865934252738953,
"logps/chosen": -214.25851440429688,
"logps/rejected": -198.68943786621094,
"loss": 5944.2828,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.3851444125175476,
"rewards/margins": 0.0964752659201622,
"rewards/rejected": -0.481619656085968,
"rewards/safe_rewards": -0.40014153718948364,
"rewards/unsafe_rewards": -0.4206266403198242,
"step": 440
},
{
"epoch": 0.24,
"learning_rate": 4.713142934875005e-07,
"logits/chosen": -0.7530995607376099,
"logits/rejected": -0.348047137260437,
"logps/chosen": -273.5533447265625,
"logps/rejected": -247.33377075195312,
"loss": 6019.3629,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.4809795916080475,
"rewards/margins": 0.16457389295101166,
"rewards/rejected": -0.645553469657898,
"rewards/safe_rewards": -0.4939555525779724,
"rewards/unsafe_rewards": -0.51116544008255,
"step": 450
},
{
"epoch": 0.24,
"learning_rate": 4.6912130020314996e-07,
"logits/chosen": 0.18566010892391205,
"logits/rejected": 0.4161214232444763,
"logps/chosen": -233.847900390625,
"logps/rejected": -238.5542755126953,
"loss": 5555.243,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.6200246810913086,
"rewards/margins": 0.13345691561698914,
"rewards/rejected": -0.7534815073013306,
"rewards/safe_rewards": -0.6095362901687622,
"rewards/unsafe_rewards": -0.6309984922409058,
"step": 460
},
{
"epoch": 0.25,
"learning_rate": 4.668530350837408e-07,
"logits/chosen": 0.024336492642760277,
"logits/rejected": 0.4952603876590729,
"logps/chosen": -259.33697509765625,
"logps/rejected": -254.6613006591797,
"loss": 5726.7293,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.5721555948257446,
"rewards/margins": 0.12051858007907867,
"rewards/rejected": -0.6926741600036621,
"rewards/safe_rewards": -0.5316283702850342,
"rewards/unsafe_rewards": -0.5645433664321899,
"step": 470
},
{
"epoch": 0.25,
"learning_rate": 4.64510277316316e-07,
"logits/chosen": -0.0006995767471380532,
"logits/rejected": 0.4036879539489746,
"logps/chosen": -269.50482177734375,
"logps/rejected": -248.73434448242188,
"loss": 6012.2914,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.5171098113059998,
"rewards/margins": 0.20941033959388733,
"rewards/rejected": -0.7265201807022095,
"rewards/safe_rewards": -0.5066377520561218,
"rewards/unsafe_rewards": -0.4963339865207672,
"step": 480
},
{
"epoch": 0.26,
"learning_rate": 4.6209383167739015e-07,
"logits/chosen": -0.8723047971725464,
"logits/rejected": -0.47492194175720215,
"logps/chosen": -239.2227020263672,
"logps/rejected": -223.37191772460938,
"loss": 6090.4563,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.39161261916160583,
"rewards/margins": 0.16117171943187714,
"rewards/rejected": -0.5527843832969666,
"rewards/safe_rewards": -0.4009205400943756,
"rewards/unsafe_rewards": -0.4027668535709381,
"step": 490
},
{
"epoch": 0.27,
"learning_rate": 4.5960452825649526e-07,
"logits/chosen": -0.8613616228103638,
"logits/rejected": -0.5483921766281128,
"logps/chosen": -252.01095581054688,
"logps/rejected": -236.2162628173828,
"loss": 5410.1973,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.4818722605705261,
"rewards/margins": 0.12459783256053925,
"rewards/rejected": -0.606469988822937,
"rewards/safe_rewards": -0.4409845769405365,
"rewards/unsafe_rewards": -0.48863571882247925,
"step": 500
},
{
"epoch": 0.27,
"eval_logits/chosen": -0.00993373803794384,
"eval_logits/rejected": 0.6948209404945374,
"eval_logps/chosen": -205.43228149414062,
"eval_logps/rejected": -177.0600128173828,
"eval_loss": 4657.333984375,
"eval_rewards/accuracies": 0.6367472410202026,
"eval_rewards/chosen": -0.6508274078369141,
"eval_rewards/margins": 0.09844248741865158,
"eval_rewards/rejected": -0.749269962310791,
"eval_rewards/safe_rewards": -0.6381882429122925,
"eval_rewards/unsafe_rewards": -0.6354333162307739,
"eval_runtime": 2355.0926,
"eval_samples_per_second": 14.88,
"eval_steps_per_second": 0.465,
"step": 500
},
{
"epoch": 0.27,
"learning_rate": 4.570432221710314e-07,
"logits/chosen": -0.2417004406452179,
"logits/rejected": 0.17007017135620117,
"logps/chosen": -273.1074523925781,
"logps/rejected": -236.8904266357422,
"loss": 6244.0367,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.5197592973709106,
"rewards/margins": 0.19909226894378662,
"rewards/rejected": -0.7188515067100525,
"rewards/safe_rewards": -0.6001642942428589,
"rewards/unsafe_rewards": -0.5492387413978577,
"step": 510
},
{
"epoch": 0.28,
"learning_rate": 4.5441079327251927e-07,
"logits/chosen": -0.3826223909854889,
"logits/rejected": 0.10965192317962646,
"logps/chosen": -261.4352722167969,
"logps/rejected": -251.9311065673828,
"loss": 5649.8195,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.49133262038230896,
"rewards/margins": 0.11736941337585449,
"rewards/rejected": -0.6087020635604858,
"rewards/safe_rewards": -0.4915240406990051,
"rewards/unsafe_rewards": -0.4991859793663025,
"step": 520
},
{
"epoch": 0.28,
"learning_rate": 4.5170814584435644e-07,
"logits/chosen": -0.1299566924571991,
"logits/rejected": 0.30430150032043457,
"logps/chosen": -281.5189514160156,
"logps/rejected": -248.9510040283203,
"loss": 6070.9859,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.5048553347587585,
"rewards/margins": 0.17633280158042908,
"rewards/rejected": -0.6811882257461548,
"rewards/safe_rewards": -0.45997923612594604,
"rewards/unsafe_rewards": -0.5042248964309692,
"step": 530
},
{
"epoch": 0.29,
"learning_rate": 4.4893620829118124e-07,
"logits/chosen": 0.41155165433883667,
"logits/rejected": 0.7351133227348328,
"logps/chosen": -218.6739959716797,
"logps/rejected": -222.22238159179688,
"loss": 5773.9555,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.5764225721359253,
"rewards/margins": 0.17755261063575745,
"rewards/rejected": -0.7539752125740051,
"rewards/safe_rewards": -0.5707100033760071,
"rewards/unsafe_rewards": -0.5930426716804504,
"step": 540
},
{
"epoch": 0.29,
"learning_rate": 4.460959328199497e-07,
"logits/chosen": 0.4961000382900238,
"logits/rejected": 0.9081694483757019,
"logps/chosen": -256.54791259765625,
"logps/rejected": -277.130126953125,
"loss": 6108.098,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.6318496465682983,
"rewards/margins": 0.2199208289384842,
"rewards/rejected": -0.8517705202102661,
"rewards/safe_rewards": -0.6448063850402832,
"rewards/unsafe_rewards": -0.5973528623580933,
"step": 550
},
{
"epoch": 0.3,
"learning_rate": 4.4318829511283707e-07,
"logits/chosen": 0.23597554862499237,
"logits/rejected": 0.5608280301094055,
"logps/chosen": -262.15960693359375,
"logps/rejected": -276.5953369140625,
"loss": 6017.0984,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7231947183609009,
"rewards/margins": 0.16650545597076416,
"rewards/rejected": -0.8897002339363098,
"rewards/safe_rewards": -0.7144005298614502,
"rewards/unsafe_rewards": -0.6883742213249207,
"step": 560
},
{
"epoch": 0.3,
"learning_rate": 4.40214293992074e-07,
"logits/chosen": 0.30961090326309204,
"logits/rejected": 0.6938155889511108,
"logps/chosen": -267.58404541015625,
"logps/rejected": -252.78311157226562,
"loss": 6321.9309,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.5275936722755432,
"rewards/margins": 0.20575468242168427,
"rewards/rejected": -0.7333483099937439,
"rewards/safe_rewards": -0.5182517766952515,
"rewards/unsafe_rewards": -0.5568464994430542,
"step": 570
},
{
"epoch": 0.31,
"learning_rate": 4.3717495107683516e-07,
"logits/chosen": 0.2671489417552948,
"logits/rejected": 0.9092152714729309,
"logps/chosen": -250.55960083007812,
"logps/rejected": -235.89840698242188,
"loss": 5574.8402,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.5318346619606018,
"rewards/margins": 0.18946382403373718,
"rewards/rejected": -0.7212985157966614,
"rewards/safe_rewards": -0.5447245836257935,
"rewards/unsafe_rewards": -0.5725606083869934,
"step": 580
},
{
"epoch": 0.31,
"learning_rate": 4.340713104322953e-07,
"logits/chosen": 0.01171237975358963,
"logits/rejected": 0.4629115164279938,
"logps/chosen": -265.1495056152344,
"logps/rejected": -259.7709045410156,
"loss": 5202.8691,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.5935125946998596,
"rewards/margins": 0.18529286980628967,
"rewards/rejected": -0.7788054347038269,
"rewards/safe_rewards": -0.6250792741775513,
"rewards/unsafe_rewards": -0.6238072514533997,
"step": 590
},
{
"epoch": 0.32,
"learning_rate": 4.3090443821097566e-07,
"logits/chosen": 0.7814422845840454,
"logits/rejected": 1.1566433906555176,
"logps/chosen": -278.1474609375,
"logps/rejected": -280.3294677734375,
"loss": 5335.1562,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6250512599945068,
"rewards/margins": 0.19450877606868744,
"rewards/rejected": -0.8195600509643555,
"rewards/safe_rewards": -0.5736940503120422,
"rewards/unsafe_rewards": -0.6311155557632446,
"step": 600
},
{
"epoch": 0.32,
"learning_rate": 4.276754222865029e-07,
"logits/chosen": 0.546709418296814,
"logits/rejected": 1.5038117170333862,
"logps/chosen": -284.0765075683594,
"logps/rejected": -235.79367065429688,
"loss": 5880.4258,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6524443626403809,
"rewards/margins": 0.17251375317573547,
"rewards/rejected": -0.8249581456184387,
"rewards/safe_rewards": -0.6402295231819153,
"rewards/unsafe_rewards": -0.6277676224708557,
"step": 610
},
{
"epoch": 0.33,
"learning_rate": 4.2438537187990565e-07,
"logits/chosen": 0.7865768671035767,
"logits/rejected": 1.5061836242675781,
"logps/chosen": -283.3603820800781,
"logps/rejected": -251.56442260742188,
"loss": 5760.8687,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.658532977104187,
"rewards/margins": 0.21655750274658203,
"rewards/rejected": -0.875090479850769,
"rewards/safe_rewards": -0.6327935457229614,
"rewards/unsafe_rewards": -0.6471335291862488,
"step": 620
},
{
"epoch": 0.33,
"learning_rate": 4.210354171785795e-07,
"logits/chosen": 0.2993673086166382,
"logits/rejected": 0.7917363047599792,
"logps/chosen": -272.6424865722656,
"logps/rejected": -247.65853881835938,
"loss": 5872.0883,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5130705833435059,
"rewards/margins": 0.1547364443540573,
"rewards/rejected": -0.6678069829940796,
"rewards/safe_rewards": -0.5059661269187927,
"rewards/unsafe_rewards": -0.5222837328910828,
"step": 630
},
{
"epoch": 0.34,
"learning_rate": 4.1762670894804775e-07,
"logits/chosen": 0.09364859014749527,
"logits/rejected": 0.5361107587814331,
"logps/chosen": -249.59634399414062,
"logps/rejected": -237.3841094970703,
"loss": 5896.1926,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.49201780557632446,
"rewards/margins": 0.16005203127861023,
"rewards/rejected": -0.6520698070526123,
"rewards/safe_rewards": -0.549709677696228,
"rewards/unsafe_rewards": -0.5637668967247009,
"step": 640
},
{
"epoch": 0.34,
"learning_rate": 4.1416041813665493e-07,
"logits/chosen": -0.5552986860275269,
"logits/rejected": -0.25023895502090454,
"logps/chosen": -253.50790405273438,
"logps/rejected": -253.32583618164062,
"loss": 5920.0328,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.47500887513160706,
"rewards/margins": 0.12813320755958557,
"rewards/rejected": -0.6031420826911926,
"rewards/safe_rewards": -0.43845662474632263,
"rewards/unsafe_rewards": -0.45656904578208923,
"step": 650
},
{
"epoch": 0.35,
"learning_rate": 4.1063773547332584e-07,
"logits/chosen": -0.46418723464012146,
"logits/rejected": -0.049189966171979904,
"logps/chosen": -267.15765380859375,
"logps/rejected": -243.20010375976562,
"loss": 6128.7578,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.6104855537414551,
"rewards/margins": 0.10687772184610367,
"rewards/rejected": -0.7173632383346558,
"rewards/safe_rewards": -0.5476406216621399,
"rewards/unsafe_rewards": -0.603262722492218,
"step": 660
},
{
"epoch": 0.36,
"learning_rate": 4.0705987105853077e-07,
"logits/chosen": -0.2697436213493347,
"logits/rejected": 0.344801664352417,
"logps/chosen": -252.3665313720703,
"logps/rejected": -232.3540496826172,
"loss": 5986.7625,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5879735350608826,
"rewards/margins": 0.14302758872509003,
"rewards/rejected": -0.731001079082489,
"rewards/safe_rewards": -0.543707013130188,
"rewards/unsafe_rewards": -0.5482696294784546,
"step": 670
},
{
"epoch": 0.36,
"learning_rate": 4.034280539485952e-07,
"logits/chosen": -0.36558887362480164,
"logits/rejected": 0.18461750447750092,
"logps/chosen": -295.22119140625,
"logps/rejected": -274.0675354003906,
"loss": 5383.9453,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.5177947878837585,
"rewards/margins": 0.21047362685203552,
"rewards/rejected": -0.7282685041427612,
"rewards/safe_rewards": -0.5312758684158325,
"rewards/unsafe_rewards": -0.5633383393287659,
"step": 680
},
{
"epoch": 0.37,
"learning_rate": 3.997435317334988e-07,
"logits/chosen": 0.3039137125015259,
"logits/rejected": 0.7977389097213745,
"logps/chosen": -279.23187255859375,
"logps/rejected": -261.033935546875,
"loss": 5720.7707,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.5356379747390747,
"rewards/margins": 0.2088995724916458,
"rewards/rejected": -0.7445374131202698,
"rewards/safe_rewards": -0.5458201169967651,
"rewards/unsafe_rewards": -0.47182130813598633,
"step": 690
},
{
"epoch": 0.37,
"learning_rate": 3.960075701083074e-07,
"logits/chosen": 0.06580640375614166,
"logits/rejected": 0.28118953108787537,
"logps/chosen": -237.80581665039062,
"logps/rejected": -245.47216796875,
"loss": 5702.616,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.5484215021133423,
"rewards/margins": 0.16065733134746552,
"rewards/rejected": -0.709078848361969,
"rewards/safe_rewards": -0.5256644487380981,
"rewards/unsafe_rewards": -0.5779343247413635,
"step": 700
},
{
"epoch": 0.38,
"learning_rate": 3.92221452438385e-07,
"logits/chosen": -0.6886399388313293,
"logits/rejected": -0.33862438797950745,
"logps/chosen": -255.33505249023438,
"logps/rejected": -234.041259765625,
"loss": 5505.9277,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.5285482406616211,
"rewards/margins": 0.18568384647369385,
"rewards/rejected": -0.7142320871353149,
"rewards/safe_rewards": -0.5484398007392883,
"rewards/unsafe_rewards": -0.5874748826026917,
"step": 710
},
{
"epoch": 0.38,
"learning_rate": 3.8838647931853684e-07,
"logits/chosen": -0.7950954437255859,
"logits/rejected": -0.4466307759284973,
"logps/chosen": -253.4489288330078,
"logps/rejected": -254.49813842773438,
"loss": 6030.682,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5057817697525024,
"rewards/margins": 0.20095935463905334,
"rewards/rejected": -0.7067410945892334,
"rewards/safe_rewards": -0.5353250503540039,
"rewards/unsafe_rewards": -0.4995631277561188,
"step": 720
},
{
"epoch": 0.39,
"learning_rate": 3.845039681262332e-07,
"logits/chosen": -0.5698283910751343,
"logits/rejected": -0.1652621030807495,
"logps/chosen": -265.46368408203125,
"logps/rejected": -250.52951049804688,
"loss": 5514.4148,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.45593494176864624,
"rewards/margins": 0.1759863793849945,
"rewards/rejected": -0.6319212913513184,
"rewards/safe_rewards": -0.4363466799259186,
"rewards/unsafe_rewards": -0.4330349862575531,
"step": 730
},
{
"epoch": 0.39,
"learning_rate": 3.805752525690681e-07,
"logits/chosen": 0.09326216578483582,
"logits/rejected": 0.7224725484848022,
"logps/chosen": -253.9232940673828,
"logps/rejected": -268.0160217285156,
"loss": 5160.3754,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6526281237602234,
"rewards/margins": 0.22083961963653564,
"rewards/rejected": -0.8734676241874695,
"rewards/safe_rewards": -0.6421413421630859,
"rewards/unsafe_rewards": -0.6364503502845764,
"step": 740
},
{
"epoch": 0.4,
"learning_rate": 3.7660168222660824e-07,
"logits/chosen": 0.43039554357528687,
"logits/rejected": 0.772833526134491,
"logps/chosen": -293.98541259765625,
"logps/rejected": -288.250732421875,
"loss": 5855.4879,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.7387111783027649,
"rewards/margins": 0.16440826654434204,
"rewards/rejected": -0.9031193852424622,
"rewards/safe_rewards": -0.7269446849822998,
"rewards/unsafe_rewards": -0.6723185777664185,
"step": 750
},
{
"epoch": 0.4,
"learning_rate": 3.725846220867901e-07,
"logits/chosen": -0.09916634857654572,
"logits/rejected": 0.4922304153442383,
"logps/chosen": -265.7640686035156,
"logps/rejected": -243.7411346435547,
"loss": 6137.0988,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.6147286295890808,
"rewards/margins": 0.14420659840106964,
"rewards/rejected": -0.7589352130889893,
"rewards/safe_rewards": -0.6549733877182007,
"rewards/unsafe_rewards": -0.6351133584976196,
"step": 760
},
{
"epoch": 0.41,
"learning_rate": 3.6852545207702393e-07,
"logits/chosen": -0.18887875974178314,
"logits/rejected": 0.4651460647583008,
"logps/chosen": -300.3460998535156,
"logps/rejected": -247.0656280517578,
"loss": 5956.6977,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.5610722899436951,
"rewards/margins": 0.18032148480415344,
"rewards/rejected": -0.7413938641548157,
"rewards/safe_rewards": -0.5364476442337036,
"rewards/unsafe_rewards": -0.5671006441116333,
"step": 770
},
{
"epoch": 0.41,
"learning_rate": 3.6442556659016475e-07,
"logits/chosen": 0.3691898286342621,
"logits/rejected": 1.0192655324935913,
"logps/chosen": -278.3470458984375,
"logps/rejected": -240.86141967773438,
"loss": 5414.8289,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.5899799466133118,
"rewards/margins": 0.20228877663612366,
"rewards/rejected": -0.7922687530517578,
"rewards/safe_rewards": -0.5520480871200562,
"rewards/unsafe_rewards": -0.5946981906890869,
"step": 780
},
{
"epoch": 0.42,
"learning_rate": 3.602863740055161e-07,
"logits/chosen": 1.002415418624878,
"logits/rejected": 1.6322085857391357,
"logps/chosen": -268.44488525390625,
"logps/rejected": -261.2592468261719,
"loss": 5358.4598,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6824139356613159,
"rewards/margins": 0.22263555228710175,
"rewards/rejected": -0.9050495028495789,
"rewards/safe_rewards": -0.6642250418663025,
"rewards/unsafe_rewards": -0.6494946479797363,
"step": 790
},
{
"epoch": 0.42,
"learning_rate": 3.5610929620502747e-07,
"logits/chosen": 0.9502559900283813,
"logits/rejected": 1.4719197750091553,
"logps/chosen": -271.93231201171875,
"logps/rejected": -281.78125,
"loss": 5792.9727,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.7460067272186279,
"rewards/margins": 0.18493010103702545,
"rewards/rejected": -0.9309368133544922,
"rewards/safe_rewards": -0.7411947846412659,
"rewards/unsafe_rewards": -0.8093317151069641,
"step": 800
},
{
"epoch": 0.43,
"learning_rate": 3.5189576808485404e-07,
"logits/chosen": 0.7791315913200378,
"logits/rejected": 1.4415690898895264,
"logps/chosen": -300.54150390625,
"logps/rejected": -273.402587890625,
"loss": 5584.2125,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7409987449645996,
"rewards/margins": 0.20648033916950226,
"rewards/rejected": -0.9474791288375854,
"rewards/safe_rewards": -0.726071834564209,
"rewards/unsafe_rewards": -0.8359003067016602,
"step": 810
},
{
"epoch": 0.44,
"learning_rate": 3.476472370624464e-07,
"logits/chosen": 0.40392106771469116,
"logits/rejected": 0.7413457632064819,
"logps/chosen": -254.9908905029297,
"logps/rejected": -251.4073028564453,
"loss": 6101.9039,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.6420382261276245,
"rewards/margins": 0.13990595936775208,
"rewards/rejected": -0.7819441556930542,
"rewards/safe_rewards": -0.5959726572036743,
"rewards/unsafe_rewards": -0.6521440744400024,
"step": 820
},
{
"epoch": 0.44,
"learning_rate": 3.43365162579338e-07,
"logits/chosen": 0.11586692184209824,
"logits/rejected": 0.49579864740371704,
"logps/chosen": -226.8084716796875,
"logps/rejected": -232.3746337890625,
"loss": 5837.0383,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.551177442073822,
"rewards/margins": 0.19108565151691437,
"rewards/rejected": -0.7422630190849304,
"rewards/safe_rewards": -0.5533746480941772,
"rewards/unsafe_rewards": -0.5072416663169861,
"step": 830
},
{
"epoch": 0.45,
"learning_rate": 3.390510155998023e-07,
"logits/chosen": 0.24915654957294464,
"logits/rejected": 0.6536698341369629,
"logps/chosen": -277.9824523925781,
"logps/rejected": -249.2000732421875,
"loss": 5721.2586,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.648623526096344,
"rewards/margins": 0.12514245510101318,
"rewards/rejected": -0.7737659811973572,
"rewards/safe_rewards": -0.7092838287353516,
"rewards/unsafe_rewards": -0.6900613903999329,
"step": 840
},
{
"epoch": 0.45,
"learning_rate": 3.347062781055526e-07,
"logits/chosen": 0.5860965847969055,
"logits/rejected": 0.9803635478019714,
"logps/chosen": -245.1415252685547,
"logps/rejected": -272.01080322265625,
"loss": 5834.2676,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.6521397829055786,
"rewards/margins": 0.21285566687583923,
"rewards/rejected": -0.8649954795837402,
"rewards/safe_rewards": -0.6472452878952026,
"rewards/unsafe_rewards": -0.6902757883071899,
"step": 850
},
{
"epoch": 0.46,
"learning_rate": 3.303324425866559e-07,
"logits/chosen": 0.6316410303115845,
"logits/rejected": 0.902866005897522,
"logps/chosen": -291.68597412109375,
"logps/rejected": -266.18585205078125,
"loss": 5964.1836,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.6263974905014038,
"rewards/margins": 0.17340168356895447,
"rewards/rejected": -0.7997991442680359,
"rewards/safe_rewards": -0.6621179580688477,
"rewards/unsafe_rewards": -0.6091993451118469,
"step": 860
},
{
"epoch": 0.46,
"learning_rate": 3.2593101152883795e-07,
"logits/chosen": 0.6831669211387634,
"logits/rejected": 0.9902046918869019,
"logps/chosen": -256.2884521484375,
"logps/rejected": -279.5752868652344,
"loss": 5961.9836,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6823039054870605,
"rewards/margins": 0.17010322213172913,
"rewards/rejected": -0.8524071574211121,
"rewards/safe_rewards": -0.6452068090438843,
"rewards/unsafe_rewards": -0.7062270641326904,
"step": 870
},
{
"epoch": 0.47,
"learning_rate": 3.21503496897354e-07,
"logits/chosen": 0.48068660497665405,
"logits/rejected": 0.952492892742157,
"logps/chosen": -289.909423828125,
"logps/rejected": -262.1679992675781,
"loss": 6021.2465,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.7239787578582764,
"rewards/margins": 0.12146921455860138,
"rewards/rejected": -0.8454478979110718,
"rewards/safe_rewards": -0.7816897630691528,
"rewards/unsafe_rewards": -0.7392334938049316,
"step": 880
},
{
"epoch": 0.47,
"learning_rate": 3.170514196176037e-07,
"logits/chosen": 0.28930729627609253,
"logits/rejected": 0.6634337902069092,
"logps/chosen": -267.9020080566406,
"logps/rejected": -267.813720703125,
"loss": 5325.9504,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.6826976537704468,
"rewards/margins": 0.18379981815814972,
"rewards/rejected": -0.8664973974227905,
"rewards/safe_rewards": -0.6970924139022827,
"rewards/unsafe_rewards": -0.6835001111030579,
"step": 890
},
{
"epoch": 0.48,
"learning_rate": 3.125763090526674e-07,
"logits/chosen": 0.21367737650871277,
"logits/rejected": 0.6621453166007996,
"logps/chosen": -278.2737731933594,
"logps/rejected": -269.89404296875,
"loss": 5261.0746,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.6765376329421997,
"rewards/margins": 0.20078134536743164,
"rewards/rejected": -0.8773189783096313,
"rewards/safe_rewards": -0.6867783665657043,
"rewards/unsafe_rewards": -0.6920818090438843,
"step": 900
},
{
"epoch": 0.48,
"learning_rate": 3.080797024779447e-07,
"logits/chosen": 0.19137686491012573,
"logits/rejected": 0.7889005541801453,
"logps/chosen": -253.41421508789062,
"logps/rejected": -236.6729278564453,
"loss": 5719.0418,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.6732780933380127,
"rewards/margins": 0.19284026324748993,
"rewards/rejected": -0.866118311882019,
"rewards/safe_rewards": -0.7765754461288452,
"rewards/unsafe_rewards": -0.682191014289856,
"step": 910
},
{
"epoch": 0.49,
"learning_rate": 3.035631445530743e-07,
"logits/chosen": 0.4879905581474304,
"logits/rejected": 0.9158290028572083,
"logps/chosen": -290.2519226074219,
"logps/rejected": -284.17071533203125,
"loss": 5561.2797,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.7149994969367981,
"rewards/margins": 0.19377604126930237,
"rewards/rejected": -0.9087755084037781,
"rewards/safe_rewards": -0.6696754693984985,
"rewards/unsafe_rewards": -0.6708149313926697,
"step": 920
},
{
"epoch": 0.49,
"learning_rate": 2.9902818679131775e-07,
"logits/chosen": 0.3951093852519989,
"logits/rejected": 0.8302197456359863,
"logps/chosen": -271.294189453125,
"logps/rejected": -253.5810546875,
"loss": 5419.4855,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.7780183553695679,
"rewards/margins": 0.17024961113929749,
"rewards/rejected": -0.9482680559158325,
"rewards/safe_rewards": -0.7877544164657593,
"rewards/unsafe_rewards": -0.7789348363876343,
"step": 930
},
{
"epoch": 0.5,
"learning_rate": 2.944763870265886e-07,
"logits/chosen": -0.13839875161647797,
"logits/rejected": 0.3581174314022064,
"logps/chosen": -272.4313659667969,
"logps/rejected": -267.915771484375,
"loss": 5453.8977,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6422435641288757,
"rewards/margins": 0.19745132327079773,
"rewards/rejected": -0.8396948575973511,
"rewards/safe_rewards": -0.6758723258972168,
"rewards/unsafe_rewards": -0.578320324420929,
"step": 940
},
{
"epoch": 0.5,
"learning_rate": 2.899093088783105e-07,
"logits/chosen": -0.06241287663578987,
"logits/rejected": 0.4015175700187683,
"logps/chosen": -294.8834533691406,
"logps/rejected": -279.0429382324219,
"loss": 5278.1754,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.6345726847648621,
"rewards/margins": 0.14065605401992798,
"rewards/rejected": -0.7752287983894348,
"rewards/safe_rewards": -0.6587311029434204,
"rewards/unsafe_rewards": -0.6476761102676392,
"step": 950
},
{
"epoch": 0.51,
"learning_rate": 2.8532852121428733e-07,
"logits/chosen": -0.04936225712299347,
"logits/rejected": 0.38959282636642456,
"logps/chosen": -248.14639282226562,
"logps/rejected": -235.8994598388672,
"loss": 5653.668,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.5577735304832458,
"rewards/margins": 0.21775202453136444,
"rewards/rejected": -0.7755255699157715,
"rewards/safe_rewards": -0.55736243724823,
"rewards/unsafe_rewards": -0.5908164978027344,
"step": 960
},
{
"epoch": 0.51,
"learning_rate": 2.807355976117716e-07,
"logits/chosen": 0.11599000543355942,
"logits/rejected": 0.49212461709976196,
"logps/chosen": -284.78472900390625,
"logps/rejected": -265.7978515625,
"loss": 5924.3578,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.5290887355804443,
"rewards/margins": 0.22062186896800995,
"rewards/rejected": -0.7497105598449707,
"rewards/safe_rewards": -0.4509585499763489,
"rewards/unsafe_rewards": -0.5535848736763,
"step": 970
},
{
"epoch": 0.52,
"learning_rate": 2.761321158169134e-07,
"logits/chosen": -0.0665382593870163,
"logits/rejected": 0.4467547535896301,
"logps/chosen": -262.4479064941406,
"logps/rejected": -265.8846740722656,
"loss": 5391.7484,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.604932427406311,
"rewards/margins": 0.16624750196933746,
"rewards/rejected": -0.7711800336837769,
"rewards/safe_rewards": -0.570032000541687,
"rewards/unsafe_rewards": -0.6088122129440308,
"step": 980
},
{
"epoch": 0.53,
"learning_rate": 2.715196572027789e-07,
"logits/chosen": 0.15862391889095306,
"logits/rejected": 0.511070966720581,
"logps/chosen": -252.94137573242188,
"logps/rejected": -255.08187866210938,
"loss": 5628.2164,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.6333836913108826,
"rewards/margins": 0.20889365673065186,
"rewards/rejected": -0.8422773480415344,
"rewards/safe_rewards": -0.6369217038154602,
"rewards/unsafe_rewards": -0.6703649163246155,
"step": 990
},
{
"epoch": 0.53,
"learning_rate": 2.6689980622612204e-07,
"logits/chosen": 0.08565627038478851,
"logits/rejected": 0.5222666263580322,
"logps/chosen": -255.2662811279297,
"logps/rejected": -253.49105834960938,
"loss": 5634.6316,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.6020347476005554,
"rewards/margins": 0.19342327117919922,
"rewards/rejected": -0.7954580187797546,
"rewards/safe_rewards": -0.6501786708831787,
"rewards/unsafe_rewards": -0.6461445093154907,
"step": 1000
},
{
"epoch": 0.53,
"eval_logits/chosen": 0.41202229261398315,
"eval_logits/rejected": 1.1542474031448364,
"eval_logps/chosen": -220.34913635253906,
"eval_logps/rejected": -189.61671447753906,
"eval_loss": 4507.89453125,
"eval_rewards/accuracies": 0.6151915788650513,
"eval_rewards/chosen": -0.799996018409729,
"eval_rewards/margins": 0.07484080642461777,
"eval_rewards/rejected": -0.874836802482605,
"eval_rewards/safe_rewards": -0.7885684370994568,
"eval_rewards/unsafe_rewards": -0.784635066986084,
"eval_runtime": 2353.482,
"eval_samples_per_second": 14.89,
"eval_steps_per_second": 0.466,
"step": 1000
},
{
"epoch": 0.54,
"learning_rate": 2.622741498830969e-07,
"logits/chosen": 0.2431926727294922,
"logits/rejected": 0.40795207023620605,
"logps/chosen": -279.1517333984375,
"logps/rejected": -271.7449645996094,
"loss": 5872.2367,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.6438090801239014,
"rewards/margins": 0.17429831624031067,
"rewards/rejected": -0.8181073069572449,
"rewards/safe_rewards": -0.6910767555236816,
"rewards/unsafe_rewards": -0.6460915803909302,
"step": 1010
},
{
"epoch": 0.54,
"learning_rate": 2.5764427716409815e-07,
"logits/chosen": -0.09687475860118866,
"logits/rejected": 0.4301505982875824,
"logps/chosen": -272.0554504394531,
"logps/rejected": -255.6719207763672,
"loss": 5816.6723,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.5806029438972473,
"rewards/margins": 0.19818606972694397,
"rewards/rejected": -0.7787889838218689,
"rewards/safe_rewards": -0.5169692635536194,
"rewards/unsafe_rewards": -0.5289751291275024,
"step": 1020
},
{
"epoch": 0.55,
"learning_rate": 2.5301177850791616e-07,
"logits/chosen": 0.01663217321038246,
"logits/rejected": 0.6527854204177856,
"logps/chosen": -290.3711853027344,
"logps/rejected": -268.1048278808594,
"loss": 5912.7102,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6295832395553589,
"rewards/margins": 0.20760869979858398,
"rewards/rejected": -0.8371919393539429,
"rewards/safe_rewards": -0.642471432685852,
"rewards/unsafe_rewards": -0.6146708726882935,
"step": 1030
},
{
"epoch": 0.55,
"learning_rate": 2.4837824525539477e-07,
"logits/chosen": 0.17375509440898895,
"logits/rejected": 0.7390264272689819,
"logps/chosen": -270.261474609375,
"logps/rejected": -261.2465515136719,
"loss": 5659.6238,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.6727645993232727,
"rewards/margins": 0.17281220853328705,
"rewards/rejected": -0.8455768823623657,
"rewards/safe_rewards": -0.6424635052680969,
"rewards/unsafe_rewards": -0.6337414979934692,
"step": 1040
},
{
"epoch": 0.56,
"learning_rate": 2.4374526910277886e-07,
"logits/chosen": 0.13272862136363983,
"logits/rejected": 0.57741779088974,
"logps/chosen": -270.9297790527344,
"logps/rejected": -267.14471435546875,
"loss": 5861.1039,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.6448026895523071,
"rewards/margins": 0.2006601095199585,
"rewards/rejected": -0.8454626798629761,
"rewards/safe_rewards": -0.6065593361854553,
"rewards/unsafe_rewards": -0.6479047536849976,
"step": 1050
},
{
"epoch": 0.56,
"learning_rate": 2.391144415549403e-07,
"logits/chosen": 0.2520432770252228,
"logits/rejected": 0.7386651039123535,
"logps/chosen": -256.0111389160156,
"logps/rejected": -244.1455535888672,
"loss": 5928.0605,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.6962358355522156,
"rewards/margins": 0.125870481133461,
"rewards/rejected": -0.8221063613891602,
"rewards/safe_rewards": -0.6803200244903564,
"rewards/unsafe_rewards": -0.6994472742080688,
"step": 1060
},
{
"epoch": 0.57,
"learning_rate": 2.3448735337866919e-07,
"logits/chosen": 0.26303520798683167,
"logits/rejected": 0.7426208257675171,
"logps/chosen": -247.3863983154297,
"logps/rejected": -244.02392578125,
"loss": 5880.1039,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.6635211706161499,
"rewards/margins": 0.15260052680969238,
"rewards/rejected": -0.8161218762397766,
"rewards/safe_rewards": -0.706309974193573,
"rewards/unsafe_rewards": -0.6638337969779968,
"step": 1070
},
{
"epoch": 0.57,
"learning_rate": 2.2986559405621886e-07,
"logits/chosen": 0.030937856063246727,
"logits/rejected": 0.47169026732444763,
"logps/chosen": -279.0972595214844,
"logps/rejected": -268.9930725097656,
"loss": 5616.6,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6163111925125122,
"rewards/margins": 0.16996563971042633,
"rewards/rejected": -0.7862768173217773,
"rewards/safe_rewards": -0.6654713749885559,
"rewards/unsafe_rewards": -0.6399198770523071,
"step": 1080
},
{
"epoch": 0.58,
"learning_rate": 2.2525075123929213e-07,
"logits/chosen": 0.43386760354042053,
"logits/rejected": 0.7538164258003235,
"logps/chosen": -267.44134521484375,
"logps/rejected": -258.99249267578125,
"loss": 5716.7879,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.6649960279464722,
"rewards/margins": 0.22522863745689392,
"rewards/rejected": -0.890224814414978,
"rewards/safe_rewards": -0.6375536322593689,
"rewards/unsafe_rewards": -0.6348733901977539,
"step": 1090
},
{
"epoch": 0.58,
"learning_rate": 2.206444102036565e-07,
"logits/chosen": 0.6684126257896423,
"logits/rejected": 0.9879862666130066,
"logps/chosen": -267.1449279785156,
"logps/rejected": -270.4283752441406,
"loss": 5974.3918,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.74274742603302,
"rewards/margins": 0.15645694732666016,
"rewards/rejected": -0.899204432964325,
"rewards/safe_rewards": -0.7267962694168091,
"rewards/unsafe_rewards": -0.6818505525588989,
"step": 1100
},
{
"epoch": 0.59,
"learning_rate": 2.160481533045751e-07,
"logits/chosen": 0.4061971604824066,
"logits/rejected": 0.9739459753036499,
"logps/chosen": -285.2103271484375,
"logps/rejected": -266.5544128417969,
"loss": 5749.7781,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.7457272410392761,
"rewards/margins": 0.2004440277814865,
"rewards/rejected": -0.9461711645126343,
"rewards/safe_rewards": -0.7860220670700073,
"rewards/unsafe_rewards": -0.7390663623809814,
"step": 1110
},
{
"epoch": 0.59,
"learning_rate": 2.1146355943324148e-07,
"logits/chosen": 0.48321422934532166,
"logits/rejected": 0.9058516621589661,
"logps/chosen": -271.53924560546875,
"logps/rejected": -259.0006103515625,
"loss": 5805.548,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.7600331902503967,
"rewards/margins": 0.13751891255378723,
"rewards/rejected": -0.8975521326065063,
"rewards/safe_rewards": -0.7516414523124695,
"rewards/unsafe_rewards": -0.7484757304191589,
"step": 1120
},
{
"epoch": 0.6,
"learning_rate": 2.0689220347440374e-07,
"logits/chosen": 0.1501261442899704,
"logits/rejected": 0.688166618347168,
"logps/chosen": -301.4822082519531,
"logps/rejected": -273.8033447265625,
"loss": 5622.9852,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.6868051290512085,
"rewards/margins": 0.17512689530849457,
"rewards/rejected": -0.8619319796562195,
"rewards/safe_rewards": -0.6461024284362793,
"rewards/unsafe_rewards": -0.6649470329284668,
"step": 1130
},
{
"epoch": 0.6,
"learning_rate": 2.0233565576536564e-07,
"logits/chosen": 0.05991173908114433,
"logits/rejected": 0.42331352829933167,
"logps/chosen": -294.298095703125,
"logps/rejected": -287.5555419921875,
"loss": 5822.3992,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.7161829471588135,
"rewards/margins": 0.13876894116401672,
"rewards/rejected": -0.8549518585205078,
"rewards/safe_rewards": -0.7057495713233948,
"rewards/unsafe_rewards": -0.6698770523071289,
"step": 1140
},
{
"epoch": 0.61,
"learning_rate": 1.97795481556549e-07,
"logits/chosen": -0.03588150069117546,
"logits/rejected": 0.400505006313324,
"logps/chosen": -277.2012023925781,
"logps/rejected": -247.14804077148438,
"loss": 5935.0914,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.6964778304100037,
"rewards/margins": 0.17653243243694305,
"rewards/rejected": -0.8730102777481079,
"rewards/safe_rewards": -0.6869702339172363,
"rewards/unsafe_rewards": -0.6601093411445618,
"step": 1150
},
{
"epoch": 0.62,
"learning_rate": 1.9327324047380422e-07,
"logits/chosen": -0.08701475709676743,
"logits/rejected": 0.4873865246772766,
"logps/chosen": -263.2158203125,
"logps/rejected": -258.84039306640625,
"loss": 5564.0863,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6252955198287964,
"rewards/margins": 0.22415871918201447,
"rewards/rejected": -0.8494542241096497,
"rewards/safe_rewards": -0.6420432329177856,
"rewards/unsafe_rewards": -0.6124902963638306,
"step": 1160
},
{
"epoch": 0.62,
"learning_rate": 1.887704859826528e-07,
"logits/chosen": 0.07522957026958466,
"logits/rejected": 0.3329767882823944,
"logps/chosen": -285.8026123046875,
"logps/rejected": -266.8732604980469,
"loss": 5750.982,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.6510334014892578,
"rewards/margins": 0.10930682718753815,
"rewards/rejected": -0.7603402137756348,
"rewards/safe_rewards": -0.6223952174186707,
"rewards/unsafe_rewards": -0.6682702302932739,
"step": 1170
},
{
"epoch": 0.63,
"learning_rate": 1.8428876485464572e-07,
"logits/chosen": -0.15613001585006714,
"logits/rejected": 0.41360145807266235,
"logps/chosen": -238.16897583007812,
"logps/rejected": -225.97802734375,
"loss": 5979.2156,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5804222822189331,
"rewards/margins": 0.1743427962064743,
"rewards/rejected": -0.7547650933265686,
"rewards/safe_rewards": -0.5962327718734741,
"rewards/unsafe_rewards": -0.6777797341346741,
"step": 1180
},
{
"epoch": 0.63,
"learning_rate": 1.798296166360216e-07,
"logits/chosen": -0.029682714492082596,
"logits/rejected": 0.5113533139228821,
"logps/chosen": -290.142822265625,
"logps/rejected": -269.4226989746094,
"loss": 6057.1922,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6241404414176941,
"rewards/margins": 0.1994599997997284,
"rewards/rejected": -0.8236004114151001,
"rewards/safe_rewards": -0.6254442930221558,
"rewards/unsafe_rewards": -0.6271675229072571,
"step": 1190
},
{
"epoch": 0.64,
"learning_rate": 1.7539457311884675e-07,
"logits/chosen": 0.1500866711139679,
"logits/rejected": 0.5680428743362427,
"logps/chosen": -262.3311462402344,
"logps/rejected": -251.67489624023438,
"loss": 5421.8398,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.6509288549423218,
"rewards/margins": 0.2198909968137741,
"rewards/rejected": -0.8708198666572571,
"rewards/safe_rewards": -0.6651867032051086,
"rewards/unsafe_rewards": -0.6189877390861511,
"step": 1200
},
{
"epoch": 0.64,
"learning_rate": 1.7098515781481883e-07,
"logits/chosen": 0.4903317987918854,
"logits/rejected": 0.883372962474823,
"logps/chosen": -272.56097412109375,
"logps/rejected": -241.92919921875,
"loss": 5678.3117,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.6993108987808228,
"rewards/margins": 0.11801446974277496,
"rewards/rejected": -0.8173252940177917,
"rewards/safe_rewards": -0.6638237237930298,
"rewards/unsafe_rewards": -0.6766722202301025,
"step": 1210
},
{
"epoch": 0.65,
"learning_rate": 1.6660288543191568e-07,
"logits/chosen": 0.20008230209350586,
"logits/rejected": 1.072401523590088,
"logps/chosen": -292.7231140136719,
"logps/rejected": -264.1849365234375,
"loss": 5411.0453,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6634177565574646,
"rewards/margins": 0.19502988457679749,
"rewards/rejected": -0.8584476709365845,
"rewards/safe_rewards": -0.7102524638175964,
"rewards/unsafe_rewards": -0.6833497285842896,
"step": 1220
},
{
"epoch": 0.65,
"learning_rate": 1.6224926135406693e-07,
"logits/chosen": 0.4110666811466217,
"logits/rejected": 0.9241645932197571,
"logps/chosen": -291.5517272949219,
"logps/rejected": -268.79437255859375,
"loss": 5535.6395,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6780111193656921,
"rewards/margins": 0.2115507870912552,
"rewards/rejected": -0.8895619511604309,
"rewards/safe_rewards": -0.6748231053352356,
"rewards/unsafe_rewards": -0.7003692984580994,
"step": 1230
},
{
"epoch": 0.66,
"learning_rate": 1.579257811240298e-07,
"logits/chosen": 0.17879924178123474,
"logits/rejected": 0.82609623670578,
"logps/chosen": -283.47686767578125,
"logps/rejected": -269.6540832519531,
"loss": 5427.3156,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.7036404609680176,
"rewards/margins": 0.14344856142997742,
"rewards/rejected": -0.8470889925956726,
"rewards/safe_rewards": -0.6846009492874146,
"rewards/unsafe_rewards": -0.6783186197280884,
"step": 1240
},
{
"epoch": 0.66,
"learning_rate": 1.5363392992964523e-07,
"logits/chosen": 0.4139084815979004,
"logits/rejected": 0.7215920686721802,
"logps/chosen": -257.33319091796875,
"logps/rejected": -258.1666564941406,
"loss": 5595.8969,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.7196224927902222,
"rewards/margins": 0.11075691878795624,
"rewards/rejected": -0.8303793668746948,
"rewards/safe_rewards": -0.7594167590141296,
"rewards/unsafe_rewards": -0.7032173275947571,
"step": 1250
},
{
"epoch": 0.67,
"learning_rate": 1.4937518209365108e-07,
"logits/chosen": 0.2804068922996521,
"logits/rejected": 0.7492934465408325,
"logps/chosen": -299.9917297363281,
"logps/rejected": -274.86566162109375,
"loss": 5485.5156,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.6413429975509644,
"rewards/margins": 0.18771231174468994,
"rewards/rejected": -0.8290553092956543,
"rewards/safe_rewards": -0.6320935487747192,
"rewards/unsafe_rewards": -0.6288415789604187,
"step": 1260
},
{
"epoch": 0.67,
"learning_rate": 1.4515100056722708e-07,
"logits/chosen": 0.49235549569129944,
"logits/rejected": 0.896806538105011,
"logps/chosen": -250.7898712158203,
"logps/rejected": -248.735107421875,
"loss": 5635.8461,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6494947671890259,
"rewards/margins": 0.2068520337343216,
"rewards/rejected": -0.8563467860221863,
"rewards/safe_rewards": -0.6947168707847595,
"rewards/unsafe_rewards": -0.6628744602203369,
"step": 1270
},
{
"epoch": 0.68,
"learning_rate": 1.4096283642744716e-07,
"logits/chosen": 0.564648449420929,
"logits/rejected": 1.1666864156723022,
"logps/chosen": -287.2496337890625,
"logps/rejected": -269.12689208984375,
"loss": 5744.0652,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6512799263000488,
"rewards/margins": 0.23767797648906708,
"rewards/rejected": -0.8889577984809875,
"rewards/safe_rewards": -0.6507743000984192,
"rewards/unsafe_rewards": -0.6260145306587219,
"step": 1280
},
{
"epoch": 0.68,
"learning_rate": 1.3681212837880977e-07,
"logits/chosen": 0.3310979902744293,
"logits/rejected": 0.946731686592102,
"logps/chosen": -283.14178466796875,
"logps/rejected": -268.6293029785156,
"loss": 5538.1773,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6541503667831421,
"rewards/margins": 0.20235121250152588,
"rewards/rejected": -0.856501579284668,
"rewards/safe_rewards": -0.7126244902610779,
"rewards/unsafe_rewards": -0.6116858124732971,
"step": 1290
},
{
"epoch": 0.69,
"learning_rate": 1.3270030225901908e-07,
"logits/chosen": 0.21446232497692108,
"logits/rejected": 0.9988247156143188,
"logps/chosen": -311.952392578125,
"logps/rejected": -264.99005126953125,
"loss": 5863.9875,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.6609299778938293,
"rewards/margins": 0.20790867507457733,
"rewards/rejected": -0.8688386678695679,
"rewards/safe_rewards": -0.6820018291473389,
"rewards/unsafe_rewards": -0.6768487691879272,
"step": 1300
},
{
"epoch": 0.7,
"learning_rate": 1.2862877054918572e-07,
"logits/chosen": 0.43877673149108887,
"logits/rejected": 0.7122836112976074,
"logps/chosen": -263.78924560546875,
"logps/rejected": -267.306884765625,
"loss": 5915.4555,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6279779672622681,
"rewards/margins": 0.19203224778175354,
"rewards/rejected": -0.8200103044509888,
"rewards/safe_rewards": -0.5540001392364502,
"rewards/unsafe_rewards": -0.6103017926216125,
"step": 1310
},
{
"epoch": 0.7,
"learning_rate": 1.2459893188861613e-07,
"logits/chosen": 0.11050845682621002,
"logits/rejected": 0.638201117515564,
"logps/chosen": -230.92892456054688,
"logps/rejected": -223.246826171875,
"loss": 5522.6379,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.5677499771118164,
"rewards/margins": 0.1929033249616623,
"rewards/rejected": -0.7606532573699951,
"rewards/safe_rewards": -0.6029695272445679,
"rewards/unsafe_rewards": -0.6227617859840393,
"step": 1320
},
{
"epoch": 0.71,
"learning_rate": 1.206121705943558e-07,
"logits/chosen": 0.2380530834197998,
"logits/rejected": 0.772462785243988,
"logps/chosen": -265.9678039550781,
"logps/rejected": -236.330078125,
"loss": 5444.8687,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.5695582628250122,
"rewards/margins": 0.17861400544643402,
"rewards/rejected": -0.7481723427772522,
"rewards/safe_rewards": -0.4967488646507263,
"rewards/unsafe_rewards": -0.5609390139579773,
"step": 1330
},
{
"epoch": 0.71,
"learning_rate": 1.1666985618565422e-07,
"logits/chosen": 0.7791303396224976,
"logits/rejected": 1.0070080757141113,
"logps/chosen": -239.6016082763672,
"logps/rejected": -250.1675567626953,
"loss": 5496.5402,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.643204391002655,
"rewards/margins": 0.212922140955925,
"rewards/rejected": -0.856126606464386,
"rewards/safe_rewards": -0.6307708024978638,
"rewards/unsafe_rewards": -0.6205247044563293,
"step": 1340
},
{
"epoch": 0.72,
"learning_rate": 1.1277334291351145e-07,
"logits/chosen": 0.6811083555221558,
"logits/rejected": 1.2308669090270996,
"logps/chosen": -240.9481964111328,
"logps/rejected": -251.2366485595703,
"loss": 5451.2172,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.6521676778793335,
"rewards/margins": 0.1860547959804535,
"rewards/rejected": -0.8382223844528198,
"rewards/safe_rewards": -0.7259255647659302,
"rewards/unsafe_rewards": -0.6219838857650757,
"step": 1350
},
{
"epoch": 0.72,
"learning_rate": 1.089239692954701e-07,
"logits/chosen": 0.36615195870399475,
"logits/rejected": 0.9472381472587585,
"logps/chosen": -269.5465087890625,
"logps/rejected": -256.1499328613281,
"loss": 5717.6105,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6657227873802185,
"rewards/margins": 0.15908706188201904,
"rewards/rejected": -0.8248098492622375,
"rewards/safe_rewards": -0.7341758012771606,
"rewards/unsafe_rewards": -0.6227680444717407,
"step": 1360
},
{
"epoch": 0.73,
"learning_rate": 1.051230576558127e-07,
"logits/chosen": 0.7043350338935852,
"logits/rejected": 1.012446641921997,
"logps/chosen": -265.9175720214844,
"logps/rejected": -296.2731628417969,
"loss": 5307.2445,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.7264591455459595,
"rewards/margins": 0.1706809252500534,
"rewards/rejected": -0.8971401453018188,
"rewards/safe_rewards": -0.7796869277954102,
"rewards/unsafe_rewards": -0.7442405819892883,
"step": 1370
},
{
"epoch": 0.73,
"learning_rate": 1.0137191367132078e-07,
"logits/chosen": 0.5799378156661987,
"logits/rejected": 1.0962615013122559,
"logps/chosen": -280.27587890625,
"logps/rejected": -261.3016052246094,
"loss": 5462.4613,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.659958004951477,
"rewards/margins": 0.24963033199310303,
"rewards/rejected": -0.9095882177352905,
"rewards/safe_rewards": -0.6955925226211548,
"rewards/unsafe_rewards": -0.6324699521064758,
"step": 1380
},
{
"epoch": 0.74,
"learning_rate": 9.76718259227532e-08,
"logits/chosen": 0.498538076877594,
"logits/rejected": 0.9989287257194519,
"logps/chosen": -272.96820068359375,
"logps/rejected": -256.63140869140625,
"loss": 5331.4734,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6205289363861084,
"rewards/margins": 0.21373698115348816,
"rewards/rejected": -0.8342660069465637,
"rewards/safe_rewards": -0.5949203372001648,
"rewards/unsafe_rewards": -0.6141771674156189,
"step": 1390
},
{
"epoch": 0.74,
"learning_rate": 9.402406545219676e-08,
"logits/chosen": 0.34590667486190796,
"logits/rejected": 0.8703553080558777,
"logps/chosen": -273.8531188964844,
"logps/rejected": -247.87466430664062,
"loss": 5546.1305,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6622526049613953,
"rewards/margins": 0.1561700403690338,
"rewards/rejected": -0.8184226751327515,
"rewards/safe_rewards": -0.6668413281440735,
"rewards/unsafe_rewards": -0.6589676141738892,
"step": 1400
},
{
"epoch": 0.75,
"learning_rate": 9.042988532644249e-08,
"logits/chosen": 0.2142190933227539,
"logits/rejected": 0.5996747016906738,
"logps/chosen": -308.82635498046875,
"logps/rejected": -276.37823486328125,
"loss": 5583.4395,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.5863175392150879,
"rewards/margins": 0.23458845913410187,
"rewards/rejected": -0.8209059834480286,
"rewards/safe_rewards": -0.5638710260391235,
"rewards/unsafe_rewards": -0.5323917269706726,
"step": 1410
},
{
"epoch": 0.75,
"learning_rate": 8.689052020653592e-08,
"logits/chosen": -0.06605692207813263,
"logits/rejected": 0.6343873739242554,
"logps/chosen": -285.37225341796875,
"logps/rejected": -252.3105010986328,
"loss": 5576.0598,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5753235816955566,
"rewards/margins": 0.2064014971256256,
"rewards/rejected": -0.7817251086235046,
"rewards/safe_rewards": -0.5231102705001831,
"rewards/unsafe_rewards": -0.5478030443191528,
"step": 1420
},
{
"epoch": 0.76,
"learning_rate": 8.340718592365037e-08,
"logits/chosen": 0.4551053047180176,
"logits/rejected": 0.6916473507881165,
"logps/chosen": -259.25543212890625,
"logps/rejected": -269.81097412109375,
"loss": 5258.8734,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.6683470010757446,
"rewards/margins": 0.16762246191501617,
"rewards/rejected": -0.8359693288803101,
"rewards/safe_rewards": -0.6167613863945007,
"rewards/unsafe_rewards": -0.6983481645584106,
"step": 1430
},
{
"epoch": 0.76,
"learning_rate": 7.998107906142839e-08,
"logits/chosen": 0.4198254942893982,
"logits/rejected": 0.9249162673950195,
"logps/chosen": -256.2335205078125,
"logps/rejected": -243.9502716064453,
"loss": 5150.4359,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6530503034591675,
"rewards/margins": 0.22125795483589172,
"rewards/rejected": -0.8743082880973816,
"rewards/safe_rewards": -0.6435777544975281,
"rewards/unsafe_rewards": -0.6962872743606567,
"step": 1440
},
{
"epoch": 0.77,
"learning_rate": 7.661337654493575e-08,
"logits/chosen": 0.11405469477176666,
"logits/rejected": 0.8541787266731262,
"logps/chosen": -285.04632568359375,
"logps/rejected": -264.7653503417969,
"loss": 5838.1379,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6224103569984436,
"rewards/margins": 0.20319974422454834,
"rewards/rejected": -0.8256100416183472,
"rewards/safe_rewards": -0.6171637773513794,
"rewards/unsafe_rewards": -0.5961381793022156,
"step": 1450
},
{
"epoch": 0.77,
"learning_rate": 7.330523523636751e-08,
"logits/chosen": 0.33853933215141296,
"logits/rejected": 0.5890348553657532,
"logps/chosen": -267.7184753417969,
"logps/rejected": -279.6230163574219,
"loss": 5326.7477,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6186683177947998,
"rewards/margins": 0.19817940890789032,
"rewards/rejected": -0.8168476819992065,
"rewards/safe_rewards": -0.6040722727775574,
"rewards/unsafe_rewards": -0.6181649565696716,
"step": 1460
},
{
"epoch": 0.78,
"learning_rate": 7.005779153764682e-08,
"logits/chosen": 0.4181288182735443,
"logits/rejected": 0.7393978238105774,
"logps/chosen": -249.9525909423828,
"logps/rejected": -242.4307861328125,
"loss": 5633.5648,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.6368721723556519,
"rewards/margins": 0.15112480521202087,
"rewards/rejected": -0.7879970073699951,
"rewards/safe_rewards": -0.6358110308647156,
"rewards/unsafe_rewards": -0.6208546161651611,
"step": 1470
},
{
"epoch": 0.79,
"learning_rate": 6.687216100005138e-08,
"logits/chosen": 0.6848994493484497,
"logits/rejected": 1.1733933687210083,
"logps/chosen": -284.51080322265625,
"logps/rejected": -288.7901916503906,
"loss": 5048.4258,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6762335300445557,
"rewards/margins": 0.1719200611114502,
"rewards/rejected": -0.8481537103652954,
"rewards/safe_rewards": -0.6376355290412903,
"rewards/unsafe_rewards": -0.7184177041053772,
"step": 1480
},
{
"epoch": 0.79,
"learning_rate": 6.374943794100349e-08,
"logits/chosen": 0.48638778924942017,
"logits/rejected": 1.259670615196228,
"logps/chosen": -267.34588623046875,
"logps/rejected": -245.59756469726562,
"loss": 5545.4941,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.6003537178039551,
"rewards/margins": 0.22699756920337677,
"rewards/rejected": -0.8273512721061707,
"rewards/safe_rewards": -0.6312727332115173,
"rewards/unsafe_rewards": -0.6281502842903137,
"step": 1490
},
{
"epoch": 0.8,
"learning_rate": 6.069069506815325e-08,
"logits/chosen": 0.7533052563667297,
"logits/rejected": 1.2028855085372925,
"logps/chosen": -251.12496948242188,
"logps/rejected": -253.78408813476562,
"loss": 5749.5141,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6362664103507996,
"rewards/margins": 0.2198611944913864,
"rewards/rejected": -0.8561276197433472,
"rewards/safe_rewards": -0.622052013874054,
"rewards/unsafe_rewards": -0.704675555229187,
"step": 1500
},
{
"epoch": 0.8,
"eval_logits/chosen": 1.0718276500701904,
"eval_logits/rejected": 1.9546749591827393,
"eval_logps/chosen": -228.9304656982422,
"eval_logps/rejected": -199.36412048339844,
"eval_loss": 4458.44287109375,
"eval_rewards/accuracies": 0.6194114685058594,
"eval_rewards/chosen": -0.8858092427253723,
"eval_rewards/margins": 0.0865015909075737,
"eval_rewards/rejected": -0.9723107814788818,
"eval_rewards/safe_rewards": -0.874053955078125,
"eval_rewards/unsafe_rewards": -0.8699882626533508,
"eval_runtime": 2349.2554,
"eval_samples_per_second": 14.917,
"eval_steps_per_second": 0.467,
"step": 1500
},
{
"epoch": 0.8,
"learning_rate": 5.7696983110885746e-08,
"logits/chosen": 1.0346394777297974,
"logits/rejected": 1.4075425863265991,
"logps/chosen": -264.0049133300781,
"logps/rejected": -256.81793212890625,
"loss": 5875.7254,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.7450360059738159,
"rewards/margins": 0.13777832686901093,
"rewards/rejected": -0.8828142881393433,
"rewards/safe_rewards": -0.6767371892929077,
"rewards/unsafe_rewards": -0.7506189942359924,
"step": 1510
},
{
"epoch": 0.81,
"learning_rate": 5.47693304593777e-08,
"logits/chosen": 0.577034056186676,
"logits/rejected": 1.2275969982147217,
"logps/chosen": -280.673583984375,
"logps/rejected": -243.10635375976562,
"loss": 5531.6125,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6422880291938782,
"rewards/margins": 0.22371160984039307,
"rewards/rejected": -0.8659995794296265,
"rewards/safe_rewards": -0.5432512164115906,
"rewards/unsafe_rewards": -0.6611617803573608,
"step": 1520
},
{
"epoch": 0.81,
"learning_rate": 5.190874281132851e-08,
"logits/chosen": 0.6209213733673096,
"logits/rejected": 0.9749325513839722,
"logps/chosen": -258.8196716308594,
"logps/rejected": -247.3189697265625,
"loss": 5541.2727,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.6575254201889038,
"rewards/margins": 0.12947872281074524,
"rewards/rejected": -0.7870042324066162,
"rewards/safe_rewards": -0.7655413746833801,
"rewards/unsafe_rewards": -0.7101870775222778,
"step": 1530
},
{
"epoch": 0.82,
"learning_rate": 4.9116202826486045e-08,
"logits/chosen": 0.7310935258865356,
"logits/rejected": 1.0775771141052246,
"logps/chosen": -272.3906555175781,
"logps/rejected": -257.2728271484375,
"loss": 5545.8492,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.6876263618469238,
"rewards/margins": 0.16089771687984467,
"rewards/rejected": -0.8485240936279297,
"rewards/safe_rewards": -0.6295339465141296,
"rewards/unsafe_rewards": -0.7383956909179688,
"step": 1540
},
{
"epoch": 0.82,
"learning_rate": 4.639266978908676e-08,
"logits/chosen": 0.6267167329788208,
"logits/rejected": 1.1266528367996216,
"logps/chosen": -297.58380126953125,
"logps/rejected": -271.4803161621094,
"loss": 5131.627,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.6685757637023926,
"rewards/margins": 0.18729698657989502,
"rewards/rejected": -0.8558727502822876,
"rewards/safe_rewards": -0.6740354299545288,
"rewards/unsafe_rewards": -0.6281224489212036,
"step": 1550
},
{
"epoch": 0.83,
"learning_rate": 4.373907927832513e-08,
"logits/chosen": 0.6049357056617737,
"logits/rejected": 0.9919975996017456,
"logps/chosen": -265.62481689453125,
"logps/rejected": -285.9028625488281,
"loss": 5640.1398,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.6182764172554016,
"rewards/margins": 0.22418944537639618,
"rewards/rejected": -0.842465877532959,
"rewards/safe_rewards": -0.6555901765823364,
"rewards/unsafe_rewards": -0.5656682848930359,
"step": 1560
},
{
"epoch": 0.83,
"learning_rate": 4.115634284696698e-08,
"logits/chosen": 0.49705711007118225,
"logits/rejected": 0.9479654431343079,
"logps/chosen": -261.2461853027344,
"logps/rejected": -270.83331298828125,
"loss": 5189.8301,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.6632257699966431,
"rewards/margins": 0.21208517253398895,
"rewards/rejected": -0.8753108978271484,
"rewards/safe_rewards": -0.6663291454315186,
"rewards/unsafe_rewards": -0.6038998365402222,
"step": 1570
},
{
"epoch": 0.84,
"learning_rate": 3.864534770821559e-08,
"logits/chosen": 0.6149829626083374,
"logits/rejected": 1.1939442157745361,
"logps/chosen": -262.00933837890625,
"logps/rejected": -240.24581909179688,
"loss": 5618.5883,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6275893449783325,
"rewards/margins": 0.20411472022533417,
"rewards/rejected": -0.8317041397094727,
"rewards/safe_rewards": -0.6472023725509644,
"rewards/unsafe_rewards": -0.5557063817977905,
"step": 1580
},
{
"epoch": 0.84,
"learning_rate": 3.620695643093924e-08,
"logits/chosen": 0.43840399384498596,
"logits/rejected": 1.105423092842102,
"logps/chosen": -269.2837829589844,
"logps/rejected": -238.085205078125,
"loss": 5468.3313,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.6394304037094116,
"rewards/margins": 0.22106070816516876,
"rewards/rejected": -0.860491156578064,
"rewards/safe_rewards": -0.6031507849693298,
"rewards/unsafe_rewards": -0.6791771650314331,
"step": 1590
},
{
"epoch": 0.85,
"learning_rate": 3.384200664336412e-08,
"logits/chosen": 0.5348480343818665,
"logits/rejected": 1.0058144330978394,
"logps/chosen": -268.3987731933594,
"logps/rejected": -247.79696655273438,
"loss": 5660.3645,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.5938838720321655,
"rewards/margins": 0.21732494235038757,
"rewards/rejected": -0.8112088441848755,
"rewards/safe_rewards": -0.5639302134513855,
"rewards/unsafe_rewards": -0.6350196599960327,
"step": 1600
},
{
"epoch": 0.85,
"learning_rate": 3.155131074533529e-08,
"logits/chosen": 0.30334433913230896,
"logits/rejected": 0.9854658246040344,
"logps/chosen": -283.627685546875,
"logps/rejected": -263.83251953125,
"loss": 6043.9172,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.6394412517547607,
"rewards/margins": 0.1600230187177658,
"rewards/rejected": -0.7994643449783325,
"rewards/safe_rewards": -0.6199285387992859,
"rewards/unsafe_rewards": -0.6412296295166016,
"step": 1610
},
{
"epoch": 0.86,
"learning_rate": 2.9335655629243645e-08,
"logits/chosen": 0.39362573623657227,
"logits/rejected": 0.9285033941268921,
"logps/chosen": -270.2079162597656,
"logps/rejected": -261.9796447753906,
"loss": 5957.5516,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.6112038493156433,
"rewards/margins": 0.18837173283100128,
"rewards/rejected": -0.7995756268501282,
"rewards/safe_rewards": -0.6032061576843262,
"rewards/unsafe_rewards": -0.6732661724090576,
"step": 1620
},
{
"epoch": 0.86,
"learning_rate": 2.7195802409715197e-08,
"logits/chosen": 0.2444291114807129,
"logits/rejected": 0.9499914050102234,
"logps/chosen": -298.4200134277344,
"logps/rejected": -249.72866821289062,
"loss": 5750.8313,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.6592567563056946,
"rewards/margins": 0.1407555341720581,
"rewards/rejected": -0.8000122904777527,
"rewards/safe_rewards": -0.7100226283073425,
"rewards/unsafe_rewards": -0.7015893459320068,
"step": 1630
},
{
"epoch": 0.87,
"learning_rate": 2.513248616215527e-08,
"logits/chosen": 0.3666357100009918,
"logits/rejected": 0.9415947198867798,
"logps/chosen": -277.87518310546875,
"logps/rejected": -276.29119873046875,
"loss": 5205.8715,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6106274724006653,
"rewards/margins": 0.24805088341236115,
"rewards/rejected": -0.8586783409118652,
"rewards/safe_rewards": -0.6150985956192017,
"rewards/unsafe_rewards": -0.594727635383606,
"step": 1640
},
{
"epoch": 0.88,
"learning_rate": 2.31464156702382e-08,
"logits/chosen": 0.24014464020729065,
"logits/rejected": 0.9577549695968628,
"logps/chosen": -292.7112121582031,
"logps/rejected": -265.7065734863281,
"loss": 5896.8078,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5955285429954529,
"rewards/margins": 0.2333928644657135,
"rewards/rejected": -0.8289214372634888,
"rewards/safe_rewards": -0.6319350600242615,
"rewards/unsafe_rewards": -0.5868616104125977,
"step": 1650
},
{
"epoch": 0.88,
"learning_rate": 2.1238273182427933e-08,
"logits/chosen": 0.6973511576652527,
"logits/rejected": 1.2915074825286865,
"logps/chosen": -265.3111572265625,
"logps/rejected": -251.41201782226562,
"loss": 5434.0336,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6617192029953003,
"rewards/margins": 0.19598451256752014,
"rewards/rejected": -0.857703685760498,
"rewards/safe_rewards": -0.6422809362411499,
"rewards/unsafe_rewards": -0.6228102445602417,
"step": 1660
},
{
"epoch": 0.89,
"learning_rate": 1.9408714177614306e-08,
"logits/chosen": 0.5173779726028442,
"logits/rejected": 1.02643883228302,
"logps/chosen": -268.9621887207031,
"logps/rejected": -251.25808715820312,
"loss": 5243.4758,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.6187028288841248,
"rewards/margins": 0.22567462921142578,
"rewards/rejected": -0.8443773984909058,
"rewards/safe_rewards": -0.6375213265419006,
"rewards/unsafe_rewards": -0.6421637535095215,
"step": 1670
},
{
"epoch": 0.89,
"learning_rate": 1.7658367139945228e-08,
"logits/chosen": 0.6539649963378906,
"logits/rejected": 1.0953106880187988,
"logps/chosen": -288.9885559082031,
"logps/rejected": -259.146728515625,
"loss": 5246.4344,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6686577200889587,
"rewards/margins": 0.19176754355430603,
"rewards/rejected": -0.8604252934455872,
"rewards/safe_rewards": -0.7045280933380127,
"rewards/unsafe_rewards": -0.7155130505561829,
"step": 1680
},
{
"epoch": 0.9,
"learning_rate": 1.5987833342931745e-08,
"logits/chosen": 0.4664410650730133,
"logits/rejected": 1.215132236480713,
"logps/chosen": -284.1900939941406,
"logps/rejected": -251.48379516601562,
"loss": 5564.9324,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.6805782318115234,
"rewards/margins": 0.21095602214336395,
"rewards/rejected": -0.8915343284606934,
"rewards/safe_rewards": -0.67192143201828,
"rewards/unsafe_rewards": -0.6578537821769714,
"step": 1690
},
{
"epoch": 0.9,
"learning_rate": 1.439768664290053e-08,
"logits/chosen": 0.48882967233657837,
"logits/rejected": 1.0205453634262085,
"logps/chosen": -288.0510559082031,
"logps/rejected": -263.57122802734375,
"loss": 5705.5039,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6453284025192261,
"rewards/margins": 0.18227383494377136,
"rewards/rejected": -0.827602207660675,
"rewards/safe_rewards": -0.6023403406143188,
"rewards/unsafe_rewards": -0.6489912867546082,
"step": 1700
},
{
"epoch": 0.91,
"learning_rate": 1.2888473281864597e-08,
"logits/chosen": 0.3580858111381531,
"logits/rejected": 0.9355760812759399,
"logps/chosen": -252.00344848632812,
"logps/rejected": -256.7703552246094,
"loss": 5420.7055,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.6472461819648743,
"rewards/margins": 0.19622859358787537,
"rewards/rejected": -0.8434747457504272,
"rewards/safe_rewards": -0.6663787364959717,
"rewards/unsafe_rewards": -0.6997274160385132,
"step": 1710
},
{
"epoch": 0.91,
"learning_rate": 1.1460711699880082e-08,
"logits/chosen": 0.32274478673934937,
"logits/rejected": 0.9183855056762695,
"logps/chosen": -281.06304931640625,
"logps/rejected": -268.91278076171875,
"loss": 5609.357,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.5867010951042175,
"rewards/margins": 0.23433193564414978,
"rewards/rejected": -0.8210331201553345,
"rewards/safe_rewards": -0.5630391240119934,
"rewards/unsafe_rewards": -0.6277604103088379,
"step": 1720
},
{
"epoch": 0.92,
"learning_rate": 1.0114892356953397e-08,
"logits/chosen": 0.381804883480072,
"logits/rejected": 0.9557956457138062,
"logps/chosen": -278.6263427734375,
"logps/rejected": -252.7932891845703,
"loss": 5676.834,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6421754360198975,
"rewards/margins": 0.1775234043598175,
"rewards/rejected": -0.8196988105773926,
"rewards/safe_rewards": -0.6115553379058838,
"rewards/unsafe_rewards": -0.6476501226425171,
"step": 1730
},
{
"epoch": 0.92,
"learning_rate": 8.851477564560061e-09,
"logits/chosen": 0.5100737810134888,
"logits/rejected": 0.932380199432373,
"logps/chosen": -263.25146484375,
"logps/rejected": -271.11676025390625,
"loss": 5593.4414,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.6300482749938965,
"rewards/margins": 0.25807589292526245,
"rewards/rejected": -0.8881241679191589,
"rewards/safe_rewards": -0.6826761960983276,
"rewards/unsafe_rewards": -0.6732330322265625,
"step": 1740
},
{
"epoch": 0.93,
"learning_rate": 7.670901326832763e-09,
"logits/chosen": 0.6556006669998169,
"logits/rejected": 1.0529851913452148,
"logps/chosen": -272.6200866699219,
"logps/rejected": -291.10101318359375,
"loss": 5333.684,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.7119321823120117,
"rewards/margins": 0.18222954869270325,
"rewards/rejected": -0.8941618204116821,
"rewards/safe_rewards": -0.7450841069221497,
"rewards/unsafe_rewards": -0.6783844232559204,
"step": 1750
},
{
"epoch": 0.93,
"learning_rate": 6.5735691914738936e-09,
"logits/chosen": 0.3428182005882263,
"logits/rejected": 0.6993114948272705,
"logps/chosen": -276.2501220703125,
"logps/rejected": -270.787841796875,
"loss": 6014.7414,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6672028303146362,
"rewards/margins": 0.16263318061828613,
"rewards/rejected": -0.8298360109329224,
"rewards/safe_rewards": -0.6557270288467407,
"rewards/unsafe_rewards": -0.7067701816558838,
"step": 1760
},
{
"epoch": 0.94,
"learning_rate": 5.559858110443016e-09,
"logits/chosen": 0.3265165388584137,
"logits/rejected": 0.9415761828422546,
"logps/chosen": -279.380615234375,
"logps/rejected": -258.53887939453125,
"loss": 5329.075,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6516368985176086,
"rewards/margins": 0.22732026875019073,
"rewards/rejected": -0.8789570927619934,
"rewards/safe_rewards": -0.6853364706039429,
"rewards/unsafe_rewards": -0.6284711360931396,
"step": 1770
},
{
"epoch": 0.94,
"learning_rate": 4.6301163104676685e-09,
"logits/chosen": 0.5433076620101929,
"logits/rejected": 0.899452805519104,
"logps/chosen": -262.05511474609375,
"logps/rejected": -280.93658447265625,
"loss": 5452.5277,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6632400751113892,
"rewards/margins": 0.19723954796791077,
"rewards/rejected": -0.8604797124862671,
"rewards/safe_rewards": -0.5747020244598389,
"rewards/unsafe_rewards": -0.6066412329673767,
"step": 1780
},
{
"epoch": 0.95,
"learning_rate": 3.784663173421438e-09,
"logits/chosen": 0.47608470916748047,
"logits/rejected": 0.8737590909004211,
"logps/chosen": -294.0523376464844,
"logps/rejected": -280.8829650878906,
"loss": 5532.6391,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.6354952454566956,
"rewards/margins": 0.18091240525245667,
"rewards/rejected": -0.8164075613021851,
"rewards/safe_rewards": -0.6999973654747009,
"rewards/unsafe_rewards": -0.6226142644882202,
"step": 1790
},
{
"epoch": 0.96,
"learning_rate": 3.023789126611137e-09,
"logits/chosen": 0.6358956694602966,
"logits/rejected": 1.2913506031036377,
"logps/chosen": -276.2715148925781,
"logps/rejected": -243.6599884033203,
"loss": 5192.1734,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.6617811918258667,
"rewards/margins": 0.21255967020988464,
"rewards/rejected": -0.874340832233429,
"rewards/safe_rewards": -0.665223240852356,
"rewards/unsafe_rewards": -0.67181396484375,
"step": 1800
},
{
"epoch": 0.96,
"learning_rate": 2.3477555430100604e-09,
"logits/chosen": 0.5863360166549683,
"logits/rejected": 1.0950720310211182,
"logps/chosen": -270.6855773925781,
"logps/rejected": -254.65771484375,
"loss": 5546.9984,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.5831121206283569,
"rewards/margins": 0.2669592499732971,
"rewards/rejected": -0.8500713109970093,
"rewards/safe_rewards": -0.586032509803772,
"rewards/unsafe_rewards": -0.577675461769104,
"step": 1810
},
{
"epoch": 0.97,
"learning_rate": 1.7567946514721322e-09,
"logits/chosen": 0.6444328427314758,
"logits/rejected": 1.0208208560943604,
"logps/chosen": -269.35577392578125,
"logps/rejected": -271.528564453125,
"loss": 5601.7539,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.6750708818435669,
"rewards/margins": 0.19110876321792603,
"rewards/rejected": -0.8661795854568481,
"rewards/safe_rewards": -0.6811034679412842,
"rewards/unsafe_rewards": -0.7294248342514038,
"step": 1820
},
{
"epoch": 0.97,
"learning_rate": 1.2511094569571668e-09,
"logits/chosen": 0.3397526741027832,
"logits/rejected": 1.0616391897201538,
"logps/chosen": -257.86822509765625,
"logps/rejected": -244.8105926513672,
"loss": 5620.3375,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.631868302822113,
"rewards/margins": 0.2000071257352829,
"rewards/rejected": -0.8318754434585571,
"rewards/safe_rewards": -0.5972138047218323,
"rewards/unsafe_rewards": -0.6459835171699524,
"step": 1830
},
{
"epoch": 0.98,
"learning_rate": 8.308736707954289e-10,
"logits/chosen": 0.518609881401062,
"logits/rejected": 1.1488319635391235,
"logps/chosen": -273.81390380859375,
"logps/rejected": -240.91372680664062,
"loss": 5548.0289,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6856581568717957,
"rewards/margins": 0.2014351636171341,
"rewards/rejected": -0.8870933651924133,
"rewards/safe_rewards": -0.6684737205505371,
"rewards/unsafe_rewards": -0.694146990776062,
"step": 1840
},
{
"epoch": 0.98,
"learning_rate": 4.962316510149222e-10,
"logits/chosen": 0.3395392894744873,
"logits/rejected": 1.0089718103408813,
"logps/chosen": -252.1464080810547,
"logps/rejected": -241.22982788085938,
"loss": 5356.7621,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6337156295776367,
"rewards/margins": 0.2152295857667923,
"rewards/rejected": -0.8489452600479126,
"rewards/safe_rewards": -0.6431758403778076,
"rewards/unsafe_rewards": -0.6494039297103882,
"step": 1850
},
{
"epoch": 0.99,
"learning_rate": 2.4729835275189016e-10,
"logits/chosen": 0.5798267722129822,
"logits/rejected": 0.9745955467224121,
"logps/chosen": -243.1245574951172,
"logps/rejected": -238.126220703125,
"loss": 5836.127,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.6284788846969604,
"rewards/margins": 0.2039627581834793,
"rewards/rejected": -0.8324416279792786,
"rewards/safe_rewards": -0.5914771556854248,
"rewards/unsafe_rewards": -0.6241937279701233,
"step": 1860
},
{
"epoch": 0.99,
"learning_rate": 8.415928876176482e-11,
"logits/chosen": 0.4843016564846039,
"logits/rejected": 0.8851835131645203,
"logps/chosen": -258.23773193359375,
"logps/rejected": -251.73001098632812,
"loss": 6036.282,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6951759457588196,
"rewards/margins": 0.1390235722064972,
"rewards/rejected": -0.8341996073722839,
"rewards/safe_rewards": -0.7087674140930176,
"rewards/unsafe_rewards": -0.712031900882721,
"step": 1870
},
{
"epoch": 1.0,
"learning_rate": 6.870500044303673e-12,
"logits/chosen": 0.5293042063713074,
"logits/rejected": 0.8430191874504089,
"logps/chosen": -253.91397094726562,
"logps/rejected": -270.7514953613281,
"loss": 5497.6977,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.5842832326889038,
"rewards/margins": 0.209587961435318,
"rewards/rejected": -0.7938712239265442,
"rewards/safe_rewards": -0.6020101308822632,
"rewards/unsafe_rewards": -0.6186091303825378,
"step": 1880
},
{
"epoch": 1.0,
"step": 1884,
"total_flos": 0.0,
"train_loss": 5859.617769083399,
"train_runtime": 32772.3871,
"train_samples_per_second": 3.68,
"train_steps_per_second": 0.057
}
],
"logging_steps": 10,
"max_steps": 1884,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}