martimfasantos's picture
Model save
850d0ec verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 100,
"global_step": 4164,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007204610951008645,
"grad_norm": 16.86738074547546,
"learning_rate": 1.199040767386091e-10,
"logits/chosen": -1.901450514793396,
"logits/rejected": -1.9076323509216309,
"logps/chosen": -0.8524526953697205,
"logps/rejected": -0.9626365900039673,
"loss": 1.6316,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.704905390739441,
"rewards/margins": 0.22036786377429962,
"rewards/rejected": -1.9252731800079346,
"step": 1
},
{
"epoch": 0.007204610951008645,
"grad_norm": 20.665932859043128,
"learning_rate": 1.199040767386091e-09,
"logits/chosen": -2.0206007957458496,
"logits/rejected": -2.0063118934631348,
"logps/chosen": -1.0047835111618042,
"logps/rejected": -1.1094833612442017,
"loss": 1.6543,
"rewards/accuracies": 0.5208333134651184,
"rewards/chosen": -2.0095670223236084,
"rewards/margins": 0.20940010249614716,
"rewards/rejected": -2.2189667224884033,
"step": 10
},
{
"epoch": 0.01440922190201729,
"grad_norm": 26.20867949744724,
"learning_rate": 2.398081534772182e-09,
"logits/chosen": -2.0263831615448,
"logits/rejected": -2.023040294647217,
"logps/chosen": -1.052225112915039,
"logps/rejected": -1.183236002922058,
"loss": 1.6174,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.104450225830078,
"rewards/margins": 0.262021541595459,
"rewards/rejected": -2.366472005844116,
"step": 20
},
{
"epoch": 0.021613832853025938,
"grad_norm": 20.4376768783133,
"learning_rate": 3.597122302158273e-09,
"logits/chosen": -1.9816261529922485,
"logits/rejected": -1.9744749069213867,
"logps/chosen": -1.054040789604187,
"logps/rejected": -1.1520485877990723,
"loss": 1.6706,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.108081579208374,
"rewards/margins": 0.19601555168628693,
"rewards/rejected": -2.3040971755981445,
"step": 30
},
{
"epoch": 0.02881844380403458,
"grad_norm": 22.57924661663736,
"learning_rate": 4.796163069544364e-09,
"logits/chosen": -2.027863025665283,
"logits/rejected": -2.0277962684631348,
"logps/chosen": -1.0358012914657593,
"logps/rejected": -1.1370604038238525,
"loss": 1.6751,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.0716025829315186,
"rewards/margins": 0.20251810550689697,
"rewards/rejected": -2.274120807647705,
"step": 40
},
{
"epoch": 0.03602305475504323,
"grad_norm": 17.157686014437257,
"learning_rate": 5.995203836930456e-09,
"logits/chosen": -1.9645040035247803,
"logits/rejected": -1.9652373790740967,
"logps/chosen": -0.9416173100471497,
"logps/rejected": -1.0079900026321411,
"loss": 1.7028,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.8832346200942993,
"rewards/margins": 0.13274545967578888,
"rewards/rejected": -2.0159800052642822,
"step": 50
},
{
"epoch": 0.043227665706051875,
"grad_norm": 24.358115422988377,
"learning_rate": 7.194244604316546e-09,
"logits/chosen": -2.038653612136841,
"logits/rejected": -2.0341880321502686,
"logps/chosen": -1.0892280340194702,
"logps/rejected": -1.14542555809021,
"loss": 1.7183,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.1784560680389404,
"rewards/margins": 0.11239476501941681,
"rewards/rejected": -2.29085111618042,
"step": 60
},
{
"epoch": 0.05043227665706052,
"grad_norm": 23.16153472591092,
"learning_rate": 8.393285371702639e-09,
"logits/chosen": -2.0306906700134277,
"logits/rejected": -2.018244743347168,
"logps/chosen": -1.110377311706543,
"logps/rejected": -1.204403281211853,
"loss": 1.6679,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.220754623413086,
"rewards/margins": 0.18805181980133057,
"rewards/rejected": -2.408806562423706,
"step": 70
},
{
"epoch": 0.05763688760806916,
"grad_norm": 28.356652908785378,
"learning_rate": 9.592326139088728e-09,
"logits/chosen": -2.046745777130127,
"logits/rejected": -2.043815851211548,
"logps/chosen": -1.1663789749145508,
"logps/rejected": -1.2379769086837769,
"loss": 1.7002,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.3327579498291016,
"rewards/margins": 0.14319580793380737,
"rewards/rejected": -2.4759538173675537,
"step": 80
},
{
"epoch": 0.06484149855907781,
"grad_norm": 18.121889263061643,
"learning_rate": 1.0791366906474819e-08,
"logits/chosen": -2.004173994064331,
"logits/rejected": -2.0055997371673584,
"logps/chosen": -1.0416834354400635,
"logps/rejected": -1.1493545770645142,
"loss": 1.6516,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.083366870880127,
"rewards/margins": 0.21534208953380585,
"rewards/rejected": -2.2987091541290283,
"step": 90
},
{
"epoch": 0.07204610951008646,
"grad_norm": 21.617985488027507,
"learning_rate": 1.1990407673860912e-08,
"logits/chosen": -2.0398590564727783,
"logits/rejected": -2.033644199371338,
"logps/chosen": -1.0076771974563599,
"logps/rejected": -1.1141164302825928,
"loss": 1.6554,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.0153543949127197,
"rewards/margins": 0.21287837624549866,
"rewards/rejected": -2.2282328605651855,
"step": 100
},
{
"epoch": 0.0792507204610951,
"grad_norm": 18.314365281520494,
"learning_rate": 1.3189448441247003e-08,
"logits/chosen": -1.9825471639633179,
"logits/rejected": -1.9712765216827393,
"logps/chosen": -1.0293405055999756,
"logps/rejected": -1.1285316944122314,
"loss": 1.6658,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.058681011199951,
"rewards/margins": 0.19838199019432068,
"rewards/rejected": -2.257063388824463,
"step": 110
},
{
"epoch": 0.08645533141210375,
"grad_norm": 20.758021434418,
"learning_rate": 1.4388489208633092e-08,
"logits/chosen": -1.973820447921753,
"logits/rejected": -1.9720237255096436,
"logps/chosen": -0.9644045829772949,
"logps/rejected": -1.065753698348999,
"loss": 1.6486,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.9288091659545898,
"rewards/margins": 0.20269834995269775,
"rewards/rejected": -2.131507396697998,
"step": 120
},
{
"epoch": 0.0936599423631124,
"grad_norm": 20.303280339058496,
"learning_rate": 1.5587529976019183e-08,
"logits/chosen": -2.065807580947876,
"logits/rejected": -2.065150260925293,
"logps/chosen": -1.0804673433303833,
"logps/rejected": -1.152398705482483,
"loss": 1.7011,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.1609346866607666,
"rewards/margins": 0.1438627988100052,
"rewards/rejected": -2.304797410964966,
"step": 130
},
{
"epoch": 0.10086455331412104,
"grad_norm": 24.00105137691047,
"learning_rate": 1.6786570743405277e-08,
"logits/chosen": -1.9791269302368164,
"logits/rejected": -1.9726932048797607,
"logps/chosen": -0.9777523279190063,
"logps/rejected": -1.122584342956543,
"loss": 1.5985,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.9555046558380127,
"rewards/margins": 0.28966379165649414,
"rewards/rejected": -2.245168685913086,
"step": 140
},
{
"epoch": 0.10806916426512968,
"grad_norm": 22.96932769588003,
"learning_rate": 1.7985611510791365e-08,
"logits/chosen": -1.9961645603179932,
"logits/rejected": -1.991847038269043,
"logps/chosen": -1.0192320346832275,
"logps/rejected": -1.1370147466659546,
"loss": 1.6399,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.038464069366455,
"rewards/margins": 0.2355656921863556,
"rewards/rejected": -2.274029493331909,
"step": 150
},
{
"epoch": 0.11527377521613832,
"grad_norm": 20.410382239314544,
"learning_rate": 1.9184652278177456e-08,
"logits/chosen": -2.0057005882263184,
"logits/rejected": -1.9997113943099976,
"logps/chosen": -0.9478748440742493,
"logps/rejected": -1.0972506999969482,
"loss": 1.5784,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.8957496881484985,
"rewards/margins": 0.29875144362449646,
"rewards/rejected": -2.1945013999938965,
"step": 160
},
{
"epoch": 0.12247838616714697,
"grad_norm": 25.132313050475904,
"learning_rate": 2.038369304556355e-08,
"logits/chosen": -2.0098798274993896,
"logits/rejected": -2.0025277137756348,
"logps/chosen": -1.036834478378296,
"logps/rejected": -1.160753846168518,
"loss": 1.6358,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.073668956756592,
"rewards/margins": 0.24783854186534882,
"rewards/rejected": -2.321507692337036,
"step": 170
},
{
"epoch": 0.12968299711815562,
"grad_norm": 26.431650447324927,
"learning_rate": 2.1582733812949638e-08,
"logits/chosen": -2.035036563873291,
"logits/rejected": -2.0282251834869385,
"logps/chosen": -1.0207078456878662,
"logps/rejected": -1.1081664562225342,
"loss": 1.6884,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.0414156913757324,
"rewards/margins": 0.17491717636585236,
"rewards/rejected": -2.2163329124450684,
"step": 180
},
{
"epoch": 0.13688760806916425,
"grad_norm": 25.864583918134127,
"learning_rate": 2.278177458033573e-08,
"logits/chosen": -2.0707240104675293,
"logits/rejected": -2.06858229637146,
"logps/chosen": -0.9697472453117371,
"logps/rejected": -1.0662331581115723,
"loss": 1.6531,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9394944906234741,
"rewards/margins": 0.19297190010547638,
"rewards/rejected": -2.1324663162231445,
"step": 190
},
{
"epoch": 0.1440922190201729,
"grad_norm": 26.217250772833463,
"learning_rate": 2.3980815347721823e-08,
"logits/chosen": -2.042173385620117,
"logits/rejected": -2.039220094680786,
"logps/chosen": -1.0262937545776367,
"logps/rejected": -1.1527516841888428,
"loss": 1.62,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0525875091552734,
"rewards/margins": 0.25291624665260315,
"rewards/rejected": -2.3055033683776855,
"step": 200
},
{
"epoch": 0.15129682997118155,
"grad_norm": 23.84528104787017,
"learning_rate": 2.517985611510791e-08,
"logits/chosen": -2.0345051288604736,
"logits/rejected": -2.0316567420959473,
"logps/chosen": -1.0741255283355713,
"logps/rejected": -1.1507861614227295,
"loss": 1.6946,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.1482510566711426,
"rewards/margins": 0.15332157909870148,
"rewards/rejected": -2.301572322845459,
"step": 210
},
{
"epoch": 0.1585014409221902,
"grad_norm": 18.025084777510834,
"learning_rate": 2.6378896882494006e-08,
"logits/chosen": -1.985107421875,
"logits/rejected": -1.981000542640686,
"logps/chosen": -1.0078728199005127,
"logps/rejected": -1.1769083738327026,
"loss": 1.572,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.0157456398010254,
"rewards/margins": 0.3380712866783142,
"rewards/rejected": -2.3538167476654053,
"step": 220
},
{
"epoch": 0.16570605187319884,
"grad_norm": 19.47784006510522,
"learning_rate": 2.7577937649880097e-08,
"logits/chosen": -2.020699977874756,
"logits/rejected": -2.02105712890625,
"logps/chosen": -1.0121351480484009,
"logps/rejected": -1.1260082721710205,
"loss": 1.6372,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.0242702960968018,
"rewards/margins": 0.22774633765220642,
"rewards/rejected": -2.252016544342041,
"step": 230
},
{
"epoch": 0.1729106628242075,
"grad_norm": 25.56764168439221,
"learning_rate": 2.8776978417266184e-08,
"logits/chosen": -2.0520577430725098,
"logits/rejected": -2.047010660171509,
"logps/chosen": -1.0614417791366577,
"logps/rejected": -1.1393840312957764,
"loss": 1.7026,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.1228835582733154,
"rewards/margins": 0.1558847576379776,
"rewards/rejected": -2.2787680625915527,
"step": 240
},
{
"epoch": 0.18011527377521613,
"grad_norm": 21.872217314499572,
"learning_rate": 2.997601918465228e-08,
"logits/chosen": -1.9690310955047607,
"logits/rejected": -1.9652650356292725,
"logps/chosen": -1.0821864604949951,
"logps/rejected": -1.1734070777893066,
"loss": 1.6765,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.1643729209899902,
"rewards/margins": 0.18244096636772156,
"rewards/rejected": -2.3468141555786133,
"step": 250
},
{
"epoch": 0.1873198847262248,
"grad_norm": 23.863727142615843,
"learning_rate": 3.1175059952038366e-08,
"logits/chosen": -1.9906234741210938,
"logits/rejected": -1.9988559484481812,
"logps/chosen": -1.1053721904754639,
"logps/rejected": -1.215968370437622,
"loss": 1.6495,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.2107443809509277,
"rewards/margins": 0.221192866563797,
"rewards/rejected": -2.431936740875244,
"step": 260
},
{
"epoch": 0.19452449567723343,
"grad_norm": 23.628636477588664,
"learning_rate": 3.237410071942446e-08,
"logits/chosen": -2.0671885013580322,
"logits/rejected": -2.05928635597229,
"logps/chosen": -1.0713196992874146,
"logps/rejected": -1.2007124423980713,
"loss": 1.613,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.142639398574829,
"rewards/margins": 0.25878530740737915,
"rewards/rejected": -2.4014248847961426,
"step": 270
},
{
"epoch": 0.2017291066282421,
"grad_norm": 28.940868349014373,
"learning_rate": 3.3573141486810555e-08,
"logits/chosen": -2.012786865234375,
"logits/rejected": -2.0110411643981934,
"logps/chosen": -0.9350563883781433,
"logps/rejected": -1.0494303703308105,
"loss": 1.6344,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.8701127767562866,
"rewards/margins": 0.22874779999256134,
"rewards/rejected": -2.098860740661621,
"step": 280
},
{
"epoch": 0.20893371757925072,
"grad_norm": 24.54297495004221,
"learning_rate": 3.477218225419664e-08,
"logits/chosen": -2.043778657913208,
"logits/rejected": -2.0457494258880615,
"logps/chosen": -1.013352870941162,
"logps/rejected": -1.108412504196167,
"loss": 1.673,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.026705741882324,
"rewards/margins": 0.19011931121349335,
"rewards/rejected": -2.216825008392334,
"step": 290
},
{
"epoch": 0.21613832853025935,
"grad_norm": 23.81150794802503,
"learning_rate": 3.597122302158273e-08,
"logits/chosen": -2.0253958702087402,
"logits/rejected": -2.0171494483947754,
"logps/chosen": -1.0900764465332031,
"logps/rejected": -1.1909259557724,
"loss": 1.6538,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1801528930664062,
"rewards/margins": 0.20169894397258759,
"rewards/rejected": -2.3818519115448,
"step": 300
},
{
"epoch": 0.22334293948126802,
"grad_norm": 21.273729184669797,
"learning_rate": 3.717026378896883e-08,
"logits/chosen": -1.9567492008209229,
"logits/rejected": -1.9567224979400635,
"logps/chosen": -1.0873463153839111,
"logps/rejected": -1.1727124452590942,
"loss": 1.6796,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1746926307678223,
"rewards/margins": 0.1707322895526886,
"rewards/rejected": -2.3454248905181885,
"step": 310
},
{
"epoch": 0.23054755043227665,
"grad_norm": 18.981062829330995,
"learning_rate": 3.836930455635491e-08,
"logits/chosen": -2.0308775901794434,
"logits/rejected": -2.0224335193634033,
"logps/chosen": -1.0090181827545166,
"logps/rejected": -1.1410481929779053,
"loss": 1.6236,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.018036365509033,
"rewards/margins": 0.26405996084213257,
"rewards/rejected": -2.2820963859558105,
"step": 320
},
{
"epoch": 0.2377521613832853,
"grad_norm": 18.120122122389997,
"learning_rate": 3.9568345323741003e-08,
"logits/chosen": -2.0095202922821045,
"logits/rejected": -2.011646270751953,
"logps/chosen": -1.0461695194244385,
"logps/rejected": -1.0691479444503784,
"loss": 1.7906,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -2.092339038848877,
"rewards/margins": 0.045956991612911224,
"rewards/rejected": -2.138295888900757,
"step": 330
},
{
"epoch": 0.24495677233429394,
"grad_norm": 21.704181292006204,
"learning_rate": 4.07673860911271e-08,
"logits/chosen": -2.0557103157043457,
"logits/rejected": -2.049830675125122,
"logps/chosen": -1.0874840021133423,
"logps/rejected": -1.1680512428283691,
"loss": 1.6811,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.1749680042266846,
"rewards/margins": 0.16113446652889252,
"rewards/rejected": -2.3361024856567383,
"step": 340
},
{
"epoch": 0.2521613832853026,
"grad_norm": 21.71777146771956,
"learning_rate": 4.1966426858513185e-08,
"logits/chosen": -1.9898334741592407,
"logits/rejected": -1.984086275100708,
"logps/chosen": -0.9887999296188354,
"logps/rejected": -1.1148191690444946,
"loss": 1.6204,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.977599859237671,
"rewards/margins": 0.25203877687454224,
"rewards/rejected": -2.2296383380889893,
"step": 350
},
{
"epoch": 0.25936599423631124,
"grad_norm": 24.253364705877487,
"learning_rate": 4.3165467625899276e-08,
"logits/chosen": -1.9987952709197998,
"logits/rejected": -1.994927167892456,
"logps/chosen": -1.0863444805145264,
"logps/rejected": -1.2027360200881958,
"loss": 1.6318,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.1726889610290527,
"rewards/margins": 0.23278315365314484,
"rewards/rejected": -2.4054720401763916,
"step": 360
},
{
"epoch": 0.2665706051873199,
"grad_norm": 21.124442462099225,
"learning_rate": 4.4364508393285374e-08,
"logits/chosen": -2.0051770210266113,
"logits/rejected": -2.0051817893981934,
"logps/chosen": -1.0520938634872437,
"logps/rejected": -1.1808488368988037,
"loss": 1.61,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.1041877269744873,
"rewards/margins": 0.2575102150440216,
"rewards/rejected": -2.3616976737976074,
"step": 370
},
{
"epoch": 0.2737752161383285,
"grad_norm": 18.81450442447697,
"learning_rate": 4.556354916067146e-08,
"logits/chosen": -2.033322811126709,
"logits/rejected": -2.037569522857666,
"logps/chosen": -1.0127990245819092,
"logps/rejected": -1.0860477685928345,
"loss": 1.7123,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.0255980491638184,
"rewards/margins": 0.14649739861488342,
"rewards/rejected": -2.172095537185669,
"step": 380
},
{
"epoch": 0.28097982708933716,
"grad_norm": 18.05994016141799,
"learning_rate": 4.676258992805755e-08,
"logits/chosen": -2.030771255493164,
"logits/rejected": -2.024479866027832,
"logps/chosen": -1.022430419921875,
"logps/rejected": -1.1492633819580078,
"loss": 1.6144,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.04486083984375,
"rewards/margins": 0.25366589426994324,
"rewards/rejected": -2.2985267639160156,
"step": 390
},
{
"epoch": 0.2881844380403458,
"grad_norm": 21.925695715915456,
"learning_rate": 4.796163069544365e-08,
"logits/chosen": -2.0310251712799072,
"logits/rejected": -2.031461238861084,
"logps/chosen": -0.9957631230354309,
"logps/rejected": -1.0482515096664429,
"loss": 1.7233,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -1.9915262460708618,
"rewards/margins": 0.10497663915157318,
"rewards/rejected": -2.0965030193328857,
"step": 400
},
{
"epoch": 0.2953890489913545,
"grad_norm": 21.245122356914116,
"learning_rate": 4.916067146282973e-08,
"logits/chosen": -2.03194260597229,
"logits/rejected": -2.029832601547241,
"logps/chosen": -1.0741499662399292,
"logps/rejected": -1.1456435918807983,
"loss": 1.7051,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.1482999324798584,
"rewards/margins": 0.14298732578754425,
"rewards/rejected": -2.2912871837615967,
"step": 410
},
{
"epoch": 0.3025936599423631,
"grad_norm": 19.413173771495842,
"learning_rate": 4.999992091672379e-08,
"logits/chosen": -2.0077686309814453,
"logits/rejected": -2.0120043754577637,
"logps/chosen": -1.0452171564102173,
"logps/rejected": -1.1237493753433228,
"loss": 1.687,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.0904343128204346,
"rewards/margins": 0.1570647954940796,
"rewards/rejected": -2.2474987506866455,
"step": 420
},
{
"epoch": 0.30979827089337175,
"grad_norm": 21.179286310681857,
"learning_rate": 4.999851500573209e-08,
"logits/chosen": -1.9912792444229126,
"logits/rejected": -1.992300271987915,
"logps/chosen": -1.0587327480316162,
"logps/rejected": -1.0991578102111816,
"loss": 1.7543,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -2.1174654960632324,
"rewards/margins": 0.08085022121667862,
"rewards/rejected": -2.1983156204223633,
"step": 430
},
{
"epoch": 0.3170028818443804,
"grad_norm": 18.636072992550215,
"learning_rate": 4.999535180235972e-08,
"logits/chosen": -1.9881870746612549,
"logits/rejected": -1.9882609844207764,
"logps/chosen": -1.0215884447097778,
"logps/rejected": -1.1435730457305908,
"loss": 1.6294,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0431768894195557,
"rewards/margins": 0.24396944046020508,
"rewards/rejected": -2.2871460914611816,
"step": 440
},
{
"epoch": 0.3242074927953891,
"grad_norm": 20.3863101378829,
"learning_rate": 4.9990431528966836e-08,
"logits/chosen": -2.011289119720459,
"logits/rejected": -2.0077245235443115,
"logps/chosen": -1.1449435949325562,
"logps/rejected": -1.1851942539215088,
"loss": 1.7534,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.2898871898651123,
"rewards/margins": 0.08050137013196945,
"rewards/rejected": -2.3703885078430176,
"step": 450
},
{
"epoch": 0.3314121037463977,
"grad_norm": 28.415794251101588,
"learning_rate": 4.9983754531428326e-08,
"logits/chosen": -2.0104928016662598,
"logits/rejected": -2.004953145980835,
"logps/chosen": -1.1698896884918213,
"logps/rejected": -1.287090539932251,
"loss": 1.6357,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.3397793769836426,
"rewards/margins": 0.23440217971801758,
"rewards/rejected": -2.574181079864502,
"step": 460
},
{
"epoch": 0.33861671469740634,
"grad_norm": 26.13115889759558,
"learning_rate": 4.997532127910954e-08,
"logits/chosen": -2.0472846031188965,
"logits/rejected": -2.0352375507354736,
"logps/chosen": -1.0996732711791992,
"logps/rejected": -1.2017238140106201,
"loss": 1.657,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1993465423583984,
"rewards/margins": 0.20410099625587463,
"rewards/rejected": -2.4034476280212402,
"step": 470
},
{
"epoch": 0.345821325648415,
"grad_norm": 24.8300011681321,
"learning_rate": 4.996513236483331e-08,
"logits/chosen": -2.0970568656921387,
"logits/rejected": -2.0868287086486816,
"logps/chosen": -0.9850471615791321,
"logps/rejected": -1.1070995330810547,
"loss": 1.6196,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.9700943231582642,
"rewards/margins": 0.24410471320152283,
"rewards/rejected": -2.2141990661621094,
"step": 480
},
{
"epoch": 0.3530259365994236,
"grad_norm": 21.36422719755768,
"learning_rate": 4.9953188504838225e-08,
"logits/chosen": -2.0197677612304688,
"logits/rejected": -2.0189919471740723,
"logps/chosen": -0.9872976541519165,
"logps/rejected": -1.1019771099090576,
"loss": 1.6297,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.974595308303833,
"rewards/margins": 0.22935882210731506,
"rewards/rejected": -2.2039542198181152,
"step": 490
},
{
"epoch": 0.36023054755043227,
"grad_norm": 20.790629252352424,
"learning_rate": 4.993949053872834e-08,
"logits/chosen": -2.020406484603882,
"logits/rejected": -2.0069644451141357,
"logps/chosen": -1.0123459100723267,
"logps/rejected": -1.1401115655899048,
"loss": 1.6134,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0246918201446533,
"rewards/margins": 0.2555309236049652,
"rewards/rejected": -2.2802231311798096,
"step": 500
},
{
"epoch": 0.36743515850144093,
"grad_norm": 21.81997744622936,
"learning_rate": 4.9924039429414086e-08,
"logits/chosen": -2.092318058013916,
"logits/rejected": -2.085855722427368,
"logps/chosen": -1.0440946817398071,
"logps/rejected": -1.157952070236206,
"loss": 1.6424,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.0881893634796143,
"rewards/margins": 0.22771528363227844,
"rewards/rejected": -2.315904140472412,
"step": 510
},
{
"epoch": 0.3746397694524496,
"grad_norm": 18.836919296616202,
"learning_rate": 4.990683626304467e-08,
"logits/chosen": -2.0161261558532715,
"logits/rejected": -2.0146827697753906,
"logps/chosen": -1.106687068939209,
"logps/rejected": -1.2028759717941284,
"loss": 1.6614,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.213374137878418,
"rewards/margins": 0.19237776100635529,
"rewards/rejected": -2.405751943588257,
"step": 520
},
{
"epoch": 0.3818443804034582,
"grad_norm": 19.949593538287314,
"learning_rate": 4.9887882248931646e-08,
"logits/chosen": -1.9806255102157593,
"logits/rejected": -1.9708988666534424,
"logps/chosen": -0.9840561151504517,
"logps/rejected": -1.0613772869110107,
"loss": 1.6935,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.9681122303009033,
"rewards/margins": 0.15464219450950623,
"rewards/rejected": -2.1227545738220215,
"step": 530
},
{
"epoch": 0.38904899135446686,
"grad_norm": 25.373571489691113,
"learning_rate": 4.986717871946393e-08,
"logits/chosen": -2.001861333847046,
"logits/rejected": -1.9949573278427124,
"logps/chosen": -1.0307199954986572,
"logps/rejected": -1.1325619220733643,
"loss": 1.6593,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0614399909973145,
"rewards/margins": 0.20368380844593048,
"rewards/rejected": -2.2651238441467285,
"step": 540
},
{
"epoch": 0.3962536023054755,
"grad_norm": 19.975615437676616,
"learning_rate": 4.984472713001416e-08,
"logits/chosen": -1.9626245498657227,
"logits/rejected": -1.9631439447402954,
"logps/chosen": -0.9997963905334473,
"logps/rejected": -1.0775134563446045,
"loss": 1.709,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -1.9995927810668945,
"rewards/margins": 0.15543393790721893,
"rewards/rejected": -2.155026912689209,
"step": 550
},
{
"epoch": 0.4034582132564842,
"grad_norm": 19.622919568600462,
"learning_rate": 4.982052905883637e-08,
"logits/chosen": -2.0312600135803223,
"logits/rejected": -2.031934976577759,
"logps/chosen": -1.080683708190918,
"logps/rejected": -1.1808573007583618,
"loss": 1.6634,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.161367416381836,
"rewards/margins": 0.20034709572792053,
"rewards/rejected": -2.3617146015167236,
"step": 560
},
{
"epoch": 0.4106628242074928,
"grad_norm": 18.11727668329915,
"learning_rate": 4.979458620695505e-08,
"logits/chosen": -2.02905011177063,
"logits/rejected": -2.0147969722747803,
"logps/chosen": -1.094179391860962,
"logps/rejected": -1.2081434726715088,
"loss": 1.6429,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.188358783721924,
"rewards/margins": 0.22792859375476837,
"rewards/rejected": -2.4162869453430176,
"step": 570
},
{
"epoch": 0.41786743515850144,
"grad_norm": 21.724257832681683,
"learning_rate": 4.976690039804555e-08,
"logits/chosen": -2.0302157402038574,
"logits/rejected": -2.0284671783447266,
"logps/chosen": -0.9871706962585449,
"logps/rejected": -1.0677026510238647,
"loss": 1.6891,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.9743413925170898,
"rewards/margins": 0.1610640585422516,
"rewards/rejected": -2.1354053020477295,
"step": 580
},
{
"epoch": 0.4250720461095101,
"grad_norm": 24.00283287273116,
"learning_rate": 4.973747357830592e-08,
"logits/chosen": -2.019152879714966,
"logits/rejected": -2.0193400382995605,
"logps/chosen": -1.0266475677490234,
"logps/rejected": -1.1648699045181274,
"loss": 1.5973,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.053295135498047,
"rewards/margins": 0.27644452452659607,
"rewards/rejected": -2.329739809036255,
"step": 590
},
{
"epoch": 0.4322766570605187,
"grad_norm": 22.405917861477768,
"learning_rate": 4.970630781632009e-08,
"logits/chosen": -2.075407028198242,
"logits/rejected": -2.0713820457458496,
"logps/chosen": -1.032801628112793,
"logps/rejected": -1.174425721168518,
"loss": 1.5984,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.065603256225586,
"rewards/margins": 0.2832481861114502,
"rewards/rejected": -2.348851442337036,
"step": 600
},
{
"epoch": 0.43948126801152737,
"grad_norm": 24.34047631979251,
"learning_rate": 4.967340530291242e-08,
"logits/chosen": -2.0268642902374268,
"logits/rejected": -2.016897678375244,
"logps/chosen": -1.0920665264129639,
"logps/rejected": -1.1507337093353271,
"loss": 1.7166,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.1841330528259277,
"rewards/margins": 0.11733441054821014,
"rewards/rejected": -2.3014674186706543,
"step": 610
},
{
"epoch": 0.44668587896253603,
"grad_norm": 28.014312030744783,
"learning_rate": 4.9638768350993755e-08,
"logits/chosen": -2.026273250579834,
"logits/rejected": -2.019042491912842,
"logps/chosen": -0.9955148696899414,
"logps/rejected": -1.082495927810669,
"loss": 1.6773,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -1.9910297393798828,
"rewards/margins": 0.17396244406700134,
"rewards/rejected": -2.164991855621338,
"step": 620
},
{
"epoch": 0.4538904899135447,
"grad_norm": 22.866148444031516,
"learning_rate": 4.9602399395398786e-08,
"logits/chosen": -2.0437943935394287,
"logits/rejected": -2.0437843799591064,
"logps/chosen": -1.0265557765960693,
"logps/rejected": -1.1544904708862305,
"loss": 1.6162,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.0531115531921387,
"rewards/margins": 0.25586965680122375,
"rewards/rejected": -2.308980941772461,
"step": 630
},
{
"epoch": 0.4610951008645533,
"grad_norm": 18.890515150130014,
"learning_rate": 4.9564300992714914e-08,
"logits/chosen": -1.957597017288208,
"logits/rejected": -1.9585049152374268,
"logps/chosen": -1.0095851421356201,
"logps/rejected": -1.11627995967865,
"loss": 1.6467,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0191702842712402,
"rewards/margins": 0.21338967978954315,
"rewards/rejected": -2.2325599193573,
"step": 640
},
{
"epoch": 0.46829971181556196,
"grad_norm": 24.37442657575504,
"learning_rate": 4.952447582110253e-08,
"logits/chosen": -2.0568816661834717,
"logits/rejected": -2.042471408843994,
"logps/chosen": -1.0371555089950562,
"logps/rejected": -1.1173784732818604,
"loss": 1.6904,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.0743110179901123,
"rewards/margins": 0.16044630110263824,
"rewards/rejected": -2.2347569465637207,
"step": 650
},
{
"epoch": 0.4755043227665706,
"grad_norm": 26.422335691665552,
"learning_rate": 4.948292668010676e-08,
"logits/chosen": -2.032116174697876,
"logits/rejected": -2.0331506729125977,
"logps/chosen": -1.0876410007476807,
"logps/rejected": -1.174712896347046,
"loss": 1.6846,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.1752820014953613,
"rewards/margins": 0.17414382100105286,
"rewards/rejected": -2.349425792694092,
"step": 660
},
{
"epoch": 0.4827089337175792,
"grad_norm": 24.027322225917015,
"learning_rate": 4.943965649046064e-08,
"logits/chosen": -2.005026340484619,
"logits/rejected": -1.9956611394882202,
"logps/chosen": -1.0621442794799805,
"logps/rejected": -1.166245460510254,
"loss": 1.6535,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.124288558959961,
"rewards/margins": 0.2082025706768036,
"rewards/rejected": -2.332490921020508,
"step": 670
},
{
"epoch": 0.4899135446685879,
"grad_norm": 21.807782406029403,
"learning_rate": 4.9394668293879835e-08,
"logits/chosen": -1.960857629776001,
"logits/rejected": -1.9516347646713257,
"logps/chosen": -1.0361614227294922,
"logps/rejected": -1.1057939529418945,
"loss": 1.7073,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0723228454589844,
"rewards/margins": 0.13926495611667633,
"rewards/rejected": -2.211587905883789,
"step": 680
},
{
"epoch": 0.49711815561959655,
"grad_norm": 29.962067466825946,
"learning_rate": 4.93479652528488e-08,
"logits/chosen": -2.0200858116149902,
"logits/rejected": -2.0147900581359863,
"logps/chosen": -1.1034678220748901,
"logps/rejected": -1.209357500076294,
"loss": 1.6608,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.2069356441497803,
"rewards/margins": 0.21177926659584045,
"rewards/rejected": -2.418715000152588,
"step": 690
},
{
"epoch": 0.5043227665706052,
"grad_norm": 23.065678595280374,
"learning_rate": 4.929955065039848e-08,
"logits/chosen": -2.019347667694092,
"logits/rejected": -2.0137135982513428,
"logps/chosen": -1.0183324813842773,
"logps/rejected": -1.151181936264038,
"loss": 1.6148,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0366649627685547,
"rewards/margins": 0.2656988203525543,
"rewards/rejected": -2.302363872528076,
"step": 700
},
{
"epoch": 0.5115273775216138,
"grad_norm": 21.89139209126973,
"learning_rate": 4.92494278898755e-08,
"logits/chosen": -1.9896736145019531,
"logits/rejected": -1.9866260290145874,
"logps/chosen": -0.8968732953071594,
"logps/rejected": -1.0222662687301636,
"loss": 1.6288,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.7937465906143188,
"rewards/margins": 0.2507862448692322,
"rewards/rejected": -2.044532537460327,
"step": 710
},
{
"epoch": 0.5187319884726225,
"grad_norm": 21.896749853278514,
"learning_rate": 4.9197600494702955e-08,
"logits/chosen": -2.0112369060516357,
"logits/rejected": -2.005025625228882,
"logps/chosen": -1.0415465831756592,
"logps/rejected": -1.1650935411453247,
"loss": 1.6193,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.0830931663513184,
"rewards/margins": 0.24709336459636688,
"rewards/rejected": -2.3301870822906494,
"step": 720
},
{
"epoch": 0.5259365994236311,
"grad_norm": 23.234379865312263,
"learning_rate": 4.9144072108132725e-08,
"logits/chosen": -2.014512300491333,
"logits/rejected": -2.003385305404663,
"logps/chosen": -1.0214173793792725,
"logps/rejected": -1.1051766872406006,
"loss": 1.69,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.042834758758545,
"rewards/margins": 0.16751877963542938,
"rewards/rejected": -2.210353374481201,
"step": 730
},
{
"epoch": 0.5331412103746398,
"grad_norm": 20.28090680157763,
"learning_rate": 4.908884649298937e-08,
"logits/chosen": -1.9976609945297241,
"logits/rejected": -2.004554271697998,
"logps/chosen": -1.0181934833526611,
"logps/rejected": -1.079618215560913,
"loss": 1.7294,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -2.0363869667053223,
"rewards/margins": 0.12284936755895615,
"rewards/rejected": -2.159236431121826,
"step": 740
},
{
"epoch": 0.5403458213256485,
"grad_norm": 26.163151024624415,
"learning_rate": 4.903192753140557e-08,
"logits/chosen": -2.0144410133361816,
"logits/rejected": -2.009019374847412,
"logps/chosen": -1.1000624895095825,
"logps/rejected": -1.1905133724212646,
"loss": 1.6789,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.200124979019165,
"rewards/margins": 0.18090197443962097,
"rewards/rejected": -2.3810267448425293,
"step": 750
},
{
"epoch": 0.547550432276657,
"grad_norm": 22.741665187331794,
"learning_rate": 4.897331922454931e-08,
"logits/chosen": -1.9788360595703125,
"logits/rejected": -1.9825944900512695,
"logps/chosen": -1.003287672996521,
"logps/rejected": -1.1135233640670776,
"loss": 1.6517,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.006575345993042,
"rewards/margins": 0.22047185897827148,
"rewards/rejected": -2.2270467281341553,
"step": 760
},
{
"epoch": 0.5547550432276657,
"grad_norm": 24.61264144766733,
"learning_rate": 4.891302569234256e-08,
"logits/chosen": -1.9734195470809937,
"logits/rejected": -1.9761135578155518,
"logps/chosen": -0.976268470287323,
"logps/rejected": -1.1288354396820068,
"loss": 1.5905,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.952536940574646,
"rewards/margins": 0.30513399839401245,
"rewards/rejected": -2.2576708793640137,
"step": 770
},
{
"epoch": 0.5619596541786743,
"grad_norm": 25.449879441421576,
"learning_rate": 4.8851051173171656e-08,
"logits/chosen": -1.992964506149292,
"logits/rejected": -1.9914066791534424,
"logps/chosen": -1.039645791053772,
"logps/rejected": -1.1218674182891846,
"loss": 1.682,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.079291582107544,
"rewards/margins": 0.1644435077905655,
"rewards/rejected": -2.243734836578369,
"step": 780
},
{
"epoch": 0.569164265129683,
"grad_norm": 19.97642043492772,
"learning_rate": 4.87874000235894e-08,
"logits/chosen": -2.013441801071167,
"logits/rejected": -2.0076568126678467,
"logps/chosen": -1.075208306312561,
"logps/rejected": -1.23297119140625,
"loss": 1.5847,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.150416612625122,
"rewards/margins": 0.31552577018737793,
"rewards/rejected": -2.4659423828125,
"step": 790
},
{
"epoch": 0.5763688760806917,
"grad_norm": 21.789770158220232,
"learning_rate": 4.872207671800876e-08,
"logits/chosen": -2.0366737842559814,
"logits/rejected": -2.03296160697937,
"logps/chosen": -1.0434991121292114,
"logps/rejected": -1.122003197669983,
"loss": 1.6989,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.086998224258423,
"rewards/margins": 0.15700823068618774,
"rewards/rejected": -2.244006395339966,
"step": 800
},
{
"epoch": 0.5835734870317003,
"grad_norm": 18.594763027326536,
"learning_rate": 4.865508584838841e-08,
"logits/chosen": -2.018526315689087,
"logits/rejected": -2.0210933685302734,
"logps/chosen": -1.0125329494476318,
"logps/rejected": -1.1024024486541748,
"loss": 1.6743,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.0250658988952637,
"rewards/margins": 0.17973873019218445,
"rewards/rejected": -2.2048048973083496,
"step": 810
},
{
"epoch": 0.590778097982709,
"grad_norm": 24.248436921729628,
"learning_rate": 4.858643212390985e-08,
"logits/chosen": -2.01918363571167,
"logits/rejected": -2.009265422821045,
"logps/chosen": -1.0285472869873047,
"logps/rejected": -1.115457534790039,
"loss": 1.6899,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0570945739746094,
"rewards/margins": 0.17382054030895233,
"rewards/rejected": -2.230915069580078,
"step": 820
},
{
"epoch": 0.5979827089337176,
"grad_norm": 21.134388800838927,
"learning_rate": 4.851612037064643e-08,
"logits/chosen": -2.0015549659729004,
"logits/rejected": -1.9994051456451416,
"logps/chosen": -0.9596086740493774,
"logps/rejected": -1.0798084735870361,
"loss": 1.6365,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.9192173480987549,
"rewards/margins": 0.2403998076915741,
"rewards/rejected": -2.1596169471740723,
"step": 830
},
{
"epoch": 0.6051873198847262,
"grad_norm": 18.585378419640243,
"learning_rate": 4.8444155531224065e-08,
"logits/chosen": -2.0270018577575684,
"logits/rejected": -2.0270333290100098,
"logps/chosen": -1.0868253707885742,
"logps/rejected": -1.1600069999694824,
"loss": 1.705,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.1736507415771484,
"rewards/margins": 0.14636364579200745,
"rewards/rejected": -2.320013999938965,
"step": 840
},
{
"epoch": 0.6123919308357348,
"grad_norm": 18.461630690476433,
"learning_rate": 4.8370542664473805e-08,
"logits/chosen": -2.034163236618042,
"logits/rejected": -2.028323173522949,
"logps/chosen": -1.049387812614441,
"logps/rejected": -1.1546533107757568,
"loss": 1.6593,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.098775625228882,
"rewards/margins": 0.21053071320056915,
"rewards/rejected": -2.3093066215515137,
"step": 850
},
{
"epoch": 0.6195965417867435,
"grad_norm": 20.644121007821905,
"learning_rate": 4.829528694507624e-08,
"logits/chosen": -2.0090110301971436,
"logits/rejected": -2.0050089359283447,
"logps/chosen": -1.161203384399414,
"logps/rejected": -1.2181370258331299,
"loss": 1.7269,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.322406768798828,
"rewards/margins": 0.11386724561452866,
"rewards/rejected": -2.4362740516662598,
"step": 860
},
{
"epoch": 0.6268011527377522,
"grad_norm": 22.99115482206478,
"learning_rate": 4.821839366319768e-08,
"logits/chosen": -2.0472781658172607,
"logits/rejected": -2.041145086288452,
"logps/chosen": -1.0039831399917603,
"logps/rejected": -1.1226513385772705,
"loss": 1.6312,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0079662799835205,
"rewards/margins": 0.23733630776405334,
"rewards/rejected": -2.245302677154541,
"step": 870
},
{
"epoch": 0.6340057636887608,
"grad_norm": 23.509214819589577,
"learning_rate": 4.813986822411833e-08,
"logits/chosen": -2.0386948585510254,
"logits/rejected": -2.036702871322632,
"logps/chosen": -1.0144814252853394,
"logps/rejected": -1.0797975063323975,
"loss": 1.7131,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0289628505706787,
"rewards/margins": 0.13063256442546844,
"rewards/rejected": -2.159595012664795,
"step": 880
},
{
"epoch": 0.6412103746397695,
"grad_norm": 21.898531418971658,
"learning_rate": 4.805971614785231e-08,
"logits/chosen": -2.0628480911254883,
"logits/rejected": -2.0612945556640625,
"logps/chosen": -1.0150998830795288,
"logps/rejected": -1.1112868785858154,
"loss": 1.66,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0301997661590576,
"rewards/margins": 0.19237405061721802,
"rewards/rejected": -2.222573757171631,
"step": 890
},
{
"epoch": 0.6484149855907781,
"grad_norm": 23.49291997665142,
"learning_rate": 4.797794306875963e-08,
"logits/chosen": -1.9782159328460693,
"logits/rejected": -1.9796749353408813,
"logps/chosen": -1.141379952430725,
"logps/rejected": -1.2147481441497803,
"loss": 1.7099,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.28275990486145,
"rewards/margins": 0.14673666656017303,
"rewards/rejected": -2.4294962882995605,
"step": 900
},
{
"epoch": 0.6556195965417867,
"grad_norm": 22.767778132803752,
"learning_rate": 4.7894554735150076e-08,
"logits/chosen": -1.980843186378479,
"logits/rejected": -1.9842725992202759,
"logps/chosen": -1.0420039892196655,
"logps/rejected": -1.1086304187774658,
"loss": 1.7087,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.084007978439331,
"rewards/margins": 0.13325299322605133,
"rewards/rejected": -2.2172608375549316,
"step": 910
},
{
"epoch": 0.6628242074927954,
"grad_norm": 25.17340393441993,
"learning_rate": 4.7809557008879185e-08,
"logits/chosen": -2.017960548400879,
"logits/rejected": -2.01263165473938,
"logps/chosen": -0.9720123410224915,
"logps/rejected": -1.0615627765655518,
"loss": 1.6769,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.944024682044983,
"rewards/margins": 0.17910084128379822,
"rewards/rejected": -2.1231255531311035,
"step": 920
},
{
"epoch": 0.670028818443804,
"grad_norm": 20.599563542622022,
"learning_rate": 4.772295586493613e-08,
"logits/chosen": -2.0555853843688965,
"logits/rejected": -2.0527231693267822,
"logps/chosen": -1.0331850051879883,
"logps/rejected": -1.151185393333435,
"loss": 1.6268,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.0663700103759766,
"rewards/margins": 0.23600046336650848,
"rewards/rejected": -2.30237078666687,
"step": 930
},
{
"epoch": 0.6772334293948127,
"grad_norm": 22.356981982643983,
"learning_rate": 4.763475739102374e-08,
"logits/chosen": -2.0080509185791016,
"logits/rejected": -2.014037609100342,
"logps/chosen": -1.1266523599624634,
"logps/rejected": -1.1936461925506592,
"loss": 1.7037,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.2533047199249268,
"rewards/margins": 0.13398754596710205,
"rewards/rejected": -2.3872923851013184,
"step": 940
},
{
"epoch": 0.6844380403458213,
"grad_norm": 18.146033211795125,
"learning_rate": 4.754496778713054e-08,
"logits/chosen": -1.9653065204620361,
"logits/rejected": -1.9694888591766357,
"logps/chosen": -1.011007308959961,
"logps/rejected": -1.1335813999176025,
"loss": 1.6339,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.022014617919922,
"rewards/margins": 0.24514833092689514,
"rewards/rejected": -2.267162799835205,
"step": 950
},
{
"epoch": 0.69164265129683,
"grad_norm": 23.944663141260857,
"learning_rate": 4.7453593365094926e-08,
"logits/chosen": -2.041576862335205,
"logits/rejected": -2.0405728816986084,
"logps/chosen": -1.0485351085662842,
"logps/rejected": -1.1592615842819214,
"loss": 1.6439,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0970702171325684,
"rewards/margins": 0.221452996134758,
"rewards/rejected": -2.3185231685638428,
"step": 960
},
{
"epoch": 0.6988472622478387,
"grad_norm": 24.526469520821017,
"learning_rate": 4.736064054816145e-08,
"logits/chosen": -2.042876720428467,
"logits/rejected": -2.03905987739563,
"logps/chosen": -0.967331051826477,
"logps/rejected": -1.0941224098205566,
"loss": 1.6128,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.934662103652954,
"rewards/margins": 0.2535827159881592,
"rewards/rejected": -2.1882448196411133,
"step": 970
},
{
"epoch": 0.7060518731988472,
"grad_norm": 20.19357580628468,
"learning_rate": 4.726611587052933e-08,
"logits/chosen": -1.970882773399353,
"logits/rejected": -1.970483422279358,
"logps/chosen": -1.1071960926055908,
"logps/rejected": -1.2356629371643066,
"loss": 1.6116,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.2143921852111816,
"rewards/margins": 0.25693362951278687,
"rewards/rejected": -2.4713258743286133,
"step": 980
},
{
"epoch": 0.7132564841498559,
"grad_norm": 25.984367596777485,
"learning_rate": 4.71700259768931e-08,
"logits/chosen": -2.0280659198760986,
"logits/rejected": -2.0252366065979004,
"logps/chosen": -1.1083245277404785,
"logps/rejected": -1.20613694190979,
"loss": 1.6704,
"rewards/accuracies": 0.5,
"rewards/chosen": -2.216649055480957,
"rewards/margins": 0.1956244558095932,
"rewards/rejected": -2.41227388381958,
"step": 990
},
{
"epoch": 0.7204610951008645,
"grad_norm": 22.322580503288012,
"learning_rate": 4.707237762197549e-08,
"logits/chosen": -2.0082552433013916,
"logits/rejected": -2.0052361488342285,
"logps/chosen": -1.0064256191253662,
"logps/rejected": -1.1272127628326416,
"loss": 1.6432,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0128512382507324,
"rewards/margins": 0.2415740191936493,
"rewards/rejected": -2.254425525665283,
"step": 1000
},
{
"epoch": 0.7276657060518732,
"grad_norm": 26.980563216893835,
"learning_rate": 4.697317767005265e-08,
"logits/chosen": -2.0239129066467285,
"logits/rejected": -2.0204639434814453,
"logps/chosen": -1.0004609823226929,
"logps/rejected": -1.0935966968536377,
"loss": 1.6934,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.0009219646453857,
"rewards/margins": 0.1862715780735016,
"rewards/rejected": -2.1871933937072754,
"step": 1010
},
{
"epoch": 0.7348703170028819,
"grad_norm": 20.718520630802683,
"learning_rate": 4.6872433094471577e-08,
"logits/chosen": -2.0184829235076904,
"logits/rejected": -2.0136635303497314,
"logps/chosen": -1.0312004089355469,
"logps/rejected": -1.1273009777069092,
"loss": 1.6531,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0624008178710938,
"rewards/margins": 0.19220107793807983,
"rewards/rejected": -2.2546019554138184,
"step": 1020
},
{
"epoch": 0.7420749279538905,
"grad_norm": 20.062163691159576,
"learning_rate": 4.677015097715994e-08,
"logits/chosen": -1.9711856842041016,
"logits/rejected": -1.970653772354126,
"logps/chosen": -1.020525574684143,
"logps/rejected": -1.1553295850753784,
"loss": 1.6278,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.041051149368286,
"rewards/margins": 0.2696080803871155,
"rewards/rejected": -2.310659170150757,
"step": 1030
},
{
"epoch": 0.7492795389048992,
"grad_norm": 19.97083818571589,
"learning_rate": 4.666633850812825e-08,
"logits/chosen": -2.0189290046691895,
"logits/rejected": -2.0129802227020264,
"logps/chosen": -1.0118273496627808,
"logps/rejected": -1.0940511226654053,
"loss": 1.6809,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.0236546993255615,
"rewards/margins": 0.16444769501686096,
"rewards/rejected": -2.1881022453308105,
"step": 1040
},
{
"epoch": 0.7564841498559077,
"grad_norm": 21.00137849840187,
"learning_rate": 4.656100298496439e-08,
"logits/chosen": -1.967718482017517,
"logits/rejected": -1.9640108346939087,
"logps/chosen": -0.9364064931869507,
"logps/rejected": -1.068771481513977,
"loss": 1.615,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.8728129863739014,
"rewards/margins": 0.2647300064563751,
"rewards/rejected": -2.137542963027954,
"step": 1050
},
{
"epoch": 0.7636887608069164,
"grad_norm": 21.497115116059465,
"learning_rate": 4.6454151812320715e-08,
"logits/chosen": -1.9955333471298218,
"logits/rejected": -1.989284873008728,
"logps/chosen": -1.0373084545135498,
"logps/rejected": -1.1469610929489136,
"loss": 1.6514,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.0746169090270996,
"rewards/margins": 0.21930520236492157,
"rewards/rejected": -2.293922185897827,
"step": 1060
},
{
"epoch": 0.770893371757925,
"grad_norm": 23.26552290797272,
"learning_rate": 4.6345792501393434e-08,
"logits/chosen": -2.0028505325317383,
"logits/rejected": -2.001188278198242,
"logps/chosen": -1.0729515552520752,
"logps/rejected": -1.2014734745025635,
"loss": 1.6349,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.1459031105041504,
"rewards/margins": 0.2570436894893646,
"rewards/rejected": -2.402946949005127,
"step": 1070
},
{
"epoch": 0.7780979827089337,
"grad_norm": 24.189590280718363,
"learning_rate": 4.6235932669394676e-08,
"logits/chosen": -2.0241146087646484,
"logits/rejected": -2.024709463119507,
"logps/chosen": -1.0855356454849243,
"logps/rejected": -1.1969963312149048,
"loss": 1.6497,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.1710712909698486,
"rewards/margins": 0.22292128205299377,
"rewards/rejected": -2.3939926624298096,
"step": 1080
},
{
"epoch": 0.7853025936599424,
"grad_norm": 27.850960995358697,
"learning_rate": 4.612458003901698e-08,
"logits/chosen": -2.033730983734131,
"logits/rejected": -2.0262105464935303,
"logps/chosen": -1.107290506362915,
"logps/rejected": -1.2112653255462646,
"loss": 1.6632,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.21458101272583,
"rewards/margins": 0.2079494446516037,
"rewards/rejected": -2.4225306510925293,
"step": 1090
},
{
"epoch": 0.792507204610951,
"grad_norm": 26.689600712739118,
"learning_rate": 4.6011742437890476e-08,
"logits/chosen": -2.02357816696167,
"logits/rejected": -2.018139123916626,
"logps/chosen": -1.0435128211975098,
"logps/rejected": -1.179508090019226,
"loss": 1.6066,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.0870256423950195,
"rewards/margins": 0.2719905972480774,
"rewards/rejected": -2.359016180038452,
"step": 1100
},
{
"epoch": 0.7997118155619597,
"grad_norm": 18.98026656983148,
"learning_rate": 4.589742779803259e-08,
"logits/chosen": -2.019516944885254,
"logits/rejected": -2.012485980987549,
"logps/chosen": -1.0080353021621704,
"logps/rejected": -1.1305128335952759,
"loss": 1.6271,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.016070604324341,
"rewards/margins": 0.24495509266853333,
"rewards/rejected": -2.2610256671905518,
"step": 1110
},
{
"epoch": 0.8069164265129684,
"grad_norm": 21.893859326791098,
"learning_rate": 4.5781644155290486e-08,
"logits/chosen": -1.980934739112854,
"logits/rejected": -1.973128318786621,
"logps/chosen": -1.0460084676742554,
"logps/rejected": -1.1076754331588745,
"loss": 1.7173,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.0920169353485107,
"rewards/margins": 0.12333414703607559,
"rewards/rejected": -2.215350866317749,
"step": 1120
},
{
"epoch": 0.8141210374639769,
"grad_norm": 20.441423721125243,
"learning_rate": 4.566439964877613e-08,
"logits/chosen": -2.0089352130889893,
"logits/rejected": -2.004986047744751,
"logps/chosen": -0.9979375004768372,
"logps/rejected": -1.084967017173767,
"loss": 1.6849,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.9958750009536743,
"rewards/margins": 0.17405889928340912,
"rewards/rejected": -2.169934034347534,
"step": 1130
},
{
"epoch": 0.8213256484149856,
"grad_norm": 19.078058822922983,
"learning_rate": 4.554570252029421e-08,
"logits/chosen": -2.0512731075286865,
"logits/rejected": -2.0500340461730957,
"logps/chosen": -1.0467660427093506,
"logps/rejected": -1.1654585599899292,
"loss": 1.6319,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.093532085418701,
"rewards/margins": 0.2373850792646408,
"rewards/rejected": -2.3309171199798584,
"step": 1140
},
{
"epoch": 0.8285302593659942,
"grad_norm": 21.272524194475157,
"learning_rate": 4.542556111376274e-08,
"logits/chosen": -2.0449581146240234,
"logits/rejected": -2.0386009216308594,
"logps/chosen": -1.0740997791290283,
"logps/rejected": -1.1670470237731934,
"loss": 1.6769,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1481995582580566,
"rewards/margins": 0.18589431047439575,
"rewards/rejected": -2.3340940475463867,
"step": 1150
},
{
"epoch": 0.8357348703170029,
"grad_norm": 26.365851681600105,
"learning_rate": 4.5303983874626506e-08,
"logits/chosen": -1.994737982749939,
"logits/rejected": -1.9932760000228882,
"logps/chosen": -1.0379770994186401,
"logps/rejected": -1.1171116828918457,
"loss": 1.7065,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.0759541988372803,
"rewards/margins": 0.15826921164989471,
"rewards/rejected": -2.2342233657836914,
"step": 1160
},
{
"epoch": 0.8429394812680115,
"grad_norm": 24.389025234614373,
"learning_rate": 4.518097934926339e-08,
"logits/chosen": -1.9977524280548096,
"logits/rejected": -1.989159345626831,
"logps/chosen": -1.0152474641799927,
"logps/rejected": -1.1263688802719116,
"loss": 1.6418,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.0304949283599854,
"rewards/margins": 0.22224298119544983,
"rewards/rejected": -2.2527377605438232,
"step": 1170
},
{
"epoch": 0.8501440922190202,
"grad_norm": 26.253793649660842,
"learning_rate": 4.505655618438363e-08,
"logits/chosen": -1.9624111652374268,
"logits/rejected": -1.9582713842391968,
"logps/chosen": -1.0600744485855103,
"logps/rejected": -1.1660957336425781,
"loss": 1.6631,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1201488971710205,
"rewards/margins": 0.2120426595211029,
"rewards/rejected": -2.3321914672851562,
"step": 1180
},
{
"epoch": 0.8573487031700289,
"grad_norm": 20.23600321587351,
"learning_rate": 4.4930723126421945e-08,
"logits/chosen": -2.05165433883667,
"logits/rejected": -2.044738531112671,
"logps/chosen": -1.0710397958755493,
"logps/rejected": -1.1480246782302856,
"loss": 1.6949,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.1420795917510986,
"rewards/margins": 0.1539701223373413,
"rewards/rejected": -2.2960493564605713,
"step": 1190
},
{
"epoch": 0.8645533141210374,
"grad_norm": 25.63037318376481,
"learning_rate": 4.48034890209227e-08,
"logits/chosen": -1.9842115640640259,
"logits/rejected": -1.9720268249511719,
"logps/chosen": -1.086510419845581,
"logps/rejected": -1.17435622215271,
"loss": 1.6721,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.173020839691162,
"rewards/margins": 0.17569169402122498,
"rewards/rejected": -2.34871244430542,
"step": 1200
},
{
"epoch": 0.8717579250720461,
"grad_norm": 22.794093788758836,
"learning_rate": 4.4674862811918155e-08,
"logits/chosen": -1.9662357568740845,
"logits/rejected": -1.9744333028793335,
"logps/chosen": -0.9373486638069153,
"logps/rejected": -1.0918363332748413,
"loss": 1.5843,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.8746973276138306,
"rewards/margins": 0.3089754581451416,
"rewards/rejected": -2.1836726665496826,
"step": 1210
},
{
"epoch": 0.8789625360230547,
"grad_norm": 20.323545230432092,
"learning_rate": 4.454485354129966e-08,
"logits/chosen": -1.9998928308486938,
"logits/rejected": -1.995469331741333,
"logps/chosen": -1.0085976123809814,
"logps/rejected": -1.1158851385116577,
"loss": 1.6557,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.017195224761963,
"rewards/margins": 0.21457497775554657,
"rewards/rejected": -2.2317702770233154,
"step": 1220
},
{
"epoch": 0.8861671469740634,
"grad_norm": 19.995091234995602,
"learning_rate": 4.4413470348182124e-08,
"logits/chosen": -1.9716050624847412,
"logits/rejected": -1.9596103429794312,
"logps/chosen": -0.9838314056396484,
"logps/rejected": -1.07748544216156,
"loss": 1.6698,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.9676628112792969,
"rewards/margins": 0.1873079240322113,
"rewards/rejected": -2.15497088432312,
"step": 1230
},
{
"epoch": 0.8933717579250721,
"grad_norm": 23.9057321705134,
"learning_rate": 4.42807224682615e-08,
"logits/chosen": -1.9828884601593018,
"logits/rejected": -1.9808467626571655,
"logps/chosen": -0.9355411529541016,
"logps/rejected": -1.0736339092254639,
"loss": 1.6071,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.8710823059082031,
"rewards/margins": 0.27618569135665894,
"rewards/rejected": -2.1472678184509277,
"step": 1240
},
{
"epoch": 0.9005763688760807,
"grad_norm": 21.284396831133705,
"learning_rate": 4.4146619233165604e-08,
"logits/chosen": -2.023496627807617,
"logits/rejected": -2.0257253646850586,
"logps/chosen": -1.0640500783920288,
"logps/rejected": -1.2194865942001343,
"loss": 1.5927,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.1281001567840576,
"rewards/margins": 0.3108729124069214,
"rewards/rejected": -2.4389731884002686,
"step": 1250
},
{
"epoch": 0.9077809798270894,
"grad_norm": 29.022562118194454,
"learning_rate": 4.4011170069798126e-08,
"logits/chosen": -2.017789363861084,
"logits/rejected": -2.0227370262145996,
"logps/chosen": -1.1170909404754639,
"logps/rejected": -1.2450226545333862,
"loss": 1.623,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.2341818809509277,
"rewards/margins": 0.255863755941391,
"rewards/rejected": -2.4900453090667725,
"step": 1260
},
{
"epoch": 0.9149855907780979,
"grad_norm": 21.040680635282882,
"learning_rate": 4.387438449967594e-08,
"logits/chosen": -1.9829397201538086,
"logits/rejected": -1.9764759540557861,
"logps/chosen": -0.964381992816925,
"logps/rejected": -1.0864336490631104,
"loss": 1.6237,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.92876398563385,
"rewards/margins": 0.24410350620746613,
"rewards/rejected": -2.1728672981262207,
"step": 1270
},
{
"epoch": 0.9221902017291066,
"grad_norm": 24.550013299727784,
"learning_rate": 4.373627213825983e-08,
"logits/chosen": -2.0657618045806885,
"logits/rejected": -2.061527967453003,
"logps/chosen": -1.0255496501922607,
"logps/rejected": -1.1638200283050537,
"loss": 1.6084,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.0510993003845215,
"rewards/margins": 0.2765410244464874,
"rewards/rejected": -2.3276400566101074,
"step": 1280
},
{
"epoch": 0.9293948126801153,
"grad_norm": 19.732937637348012,
"learning_rate": 4.359684269427848e-08,
"logits/chosen": -2.0326967239379883,
"logits/rejected": -2.0317554473876953,
"logps/chosen": -0.9937663078308105,
"logps/rejected": -1.099719762802124,
"loss": 1.648,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.987532615661621,
"rewards/margins": 0.2119067907333374,
"rewards/rejected": -2.199439525604248,
"step": 1290
},
{
"epoch": 0.9365994236311239,
"grad_norm": 25.631669490766754,
"learning_rate": 4.34561059690461e-08,
"logits/chosen": -2.077247381210327,
"logits/rejected": -2.079225540161133,
"logps/chosen": -1.0479168891906738,
"logps/rejected": -1.1121768951416016,
"loss": 1.7184,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.0958337783813477,
"rewards/margins": 0.12852007150650024,
"rewards/rejected": -2.224353790283203,
"step": 1300
},
{
"epoch": 0.9438040345821326,
"grad_norm": 24.61195364772657,
"learning_rate": 4.3314071855773314e-08,
"logits/chosen": -2.0383481979370117,
"logits/rejected": -2.0388643741607666,
"logps/chosen": -0.9827170372009277,
"logps/rejected": -1.0801887512207031,
"loss": 1.6607,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.9654340744018555,
"rewards/margins": 0.1949436366558075,
"rewards/rejected": -2.1603775024414062,
"step": 1310
},
{
"epoch": 0.9510086455331412,
"grad_norm": 24.093033036544337,
"learning_rate": 4.3170750338871806e-08,
"logits/chosen": -2.0104308128356934,
"logits/rejected": -2.0040740966796875,
"logps/chosen": -1.0747355222702026,
"logps/rejected": -1.220551609992981,
"loss": 1.5931,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.1494710445404053,
"rewards/margins": 0.2916319668292999,
"rewards/rejected": -2.441103219985962,
"step": 1320
},
{
"epoch": 0.9582132564841499,
"grad_norm": 17.676250888332724,
"learning_rate": 4.3026151493252414e-08,
"logits/chosen": -2.0373542308807373,
"logits/rejected": -2.032808542251587,
"logps/chosen": -1.0594923496246338,
"logps/rejected": -1.183484673500061,
"loss": 1.6306,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1189846992492676,
"rewards/margins": 0.24798473715782166,
"rewards/rejected": -2.366969347000122,
"step": 1330
},
{
"epoch": 0.9654178674351584,
"grad_norm": 29.319097572050133,
"learning_rate": 4.2880285483616895e-08,
"logits/chosen": -2.004913806915283,
"logits/rejected": -2.005673408508301,
"logps/chosen": -1.015624761581421,
"logps/rejected": -1.133320689201355,
"loss": 1.641,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.031249523162842,
"rewards/margins": 0.23539182543754578,
"rewards/rejected": -2.26664137840271,
"step": 1340
},
{
"epoch": 0.9726224783861671,
"grad_norm": 18.366477723606742,
"learning_rate": 4.273316256374342e-08,
"logits/chosen": -1.933974027633667,
"logits/rejected": -1.9319394826889038,
"logps/chosen": -1.0121793746948242,
"logps/rejected": -1.087815523147583,
"loss": 1.705,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0243587493896484,
"rewards/margins": 0.15127214789390564,
"rewards/rejected": -2.175631046295166,
"step": 1350
},
{
"epoch": 0.9798270893371758,
"grad_norm": 19.021483566176542,
"learning_rate": 4.258479307576576e-08,
"logits/chosen": -1.9841632843017578,
"logits/rejected": -1.9821224212646484,
"logps/chosen": -0.9617762565612793,
"logps/rejected": -1.0556433200836182,
"loss": 1.6759,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9235525131225586,
"rewards/margins": 0.18773424625396729,
"rewards/rejected": -2.1112866401672363,
"step": 1360
},
{
"epoch": 0.9870317002881844,
"grad_norm": 25.022118857417556,
"learning_rate": 4.243518744944626e-08,
"logits/chosen": -2.0071628093719482,
"logits/rejected": -2.002535343170166,
"logps/chosen": -0.9993633031845093,
"logps/rejected": -1.123016595840454,
"loss": 1.6203,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.9987266063690186,
"rewards/margins": 0.24730615317821503,
"rewards/rejected": -2.246033191680908,
"step": 1370
},
{
"epoch": 0.9942363112391931,
"grad_norm": 24.091606187020563,
"learning_rate": 4.22843562014427e-08,
"logits/chosen": -1.973891019821167,
"logits/rejected": -1.9703378677368164,
"logps/chosen": -1.0496742725372314,
"logps/rejected": -1.1253955364227295,
"loss": 1.6931,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.099348545074463,
"rewards/margins": 0.1514427214860916,
"rewards/rejected": -2.250791072845459,
"step": 1380
},
{
"epoch": 1.0014409221902016,
"grad_norm": 32.60072445084287,
"learning_rate": 4.2132309934569e-08,
"logits/chosen": -2.0483193397521973,
"logits/rejected": -2.048795223236084,
"logps/chosen": -1.014096975326538,
"logps/rejected": -1.129988670349121,
"loss": 1.6428,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.028193950653076,
"rewards/margins": 0.23178336024284363,
"rewards/rejected": -2.259977340698242,
"step": 1390
},
{
"epoch": 1.0086455331412103,
"grad_norm": 22.12849545877557,
"learning_rate": 4.197905933704989e-08,
"logits/chosen": -1.9475075006484985,
"logits/rejected": -1.94496750831604,
"logps/chosen": -1.0586014986038208,
"logps/rejected": -1.1966216564178467,
"loss": 1.6256,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.1172029972076416,
"rewards/margins": 0.2760399878025055,
"rewards/rejected": -2.3932433128356934,
"step": 1400
},
{
"epoch": 1.015850144092219,
"grad_norm": 26.196327009154857,
"learning_rate": 4.1824615181769577e-08,
"logits/chosen": -1.9907910823822021,
"logits/rejected": -1.9951509237289429,
"logps/chosen": -1.0109912157058716,
"logps/rejected": -1.139633297920227,
"loss": 1.6317,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.021982431411743,
"rewards/margins": 0.25728410482406616,
"rewards/rejected": -2.279266595840454,
"step": 1410
},
{
"epoch": 1.0230547550432276,
"grad_norm": 21.70997161453033,
"learning_rate": 4.1668988325514434e-08,
"logits/chosen": -2.015436887741089,
"logits/rejected": -2.010326385498047,
"logps/chosen": -1.1151282787322998,
"logps/rejected": -1.2344862222671509,
"loss": 1.6553,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.2302565574645996,
"rewards/margins": 0.23871548473834991,
"rewards/rejected": -2.4689724445343018,
"step": 1420
},
{
"epoch": 1.0302593659942363,
"grad_norm": 24.489368436482035,
"learning_rate": 4.1512189708209844e-08,
"logits/chosen": -2.0582668781280518,
"logits/rejected": -2.0570740699768066,
"logps/chosen": -0.9387423396110535,
"logps/rejected": -1.0281721353530884,
"loss": 1.6834,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -1.877484679222107,
"rewards/margins": 0.17885959148406982,
"rewards/rejected": -2.0563442707061768,
"step": 1430
},
{
"epoch": 1.037463976945245,
"grad_norm": 26.799568343362726,
"learning_rate": 4.1354230352151143e-08,
"logits/chosen": -2.0068116188049316,
"logits/rejected": -2.000192165374756,
"logps/chosen": -1.1369554996490479,
"logps/rejected": -1.2212724685668945,
"loss": 1.696,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.2739109992980957,
"rewards/margins": 0.16863420605659485,
"rewards/rejected": -2.442544937133789,
"step": 1440
},
{
"epoch": 1.0446685878962536,
"grad_norm": 19.49558850871138,
"learning_rate": 4.119512136122882e-08,
"logits/chosen": -2.069859266281128,
"logits/rejected": -2.0790157318115234,
"logps/chosen": -0.993932843208313,
"logps/rejected": -1.1478220224380493,
"loss": 1.5919,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.987865686416626,
"rewards/margins": 0.30777832865715027,
"rewards/rejected": -2.2956440448760986,
"step": 1450
},
{
"epoch": 1.0518731988472623,
"grad_norm": 19.10668753445954,
"learning_rate": 4.103487392014795e-08,
"logits/chosen": -1.9885203838348389,
"logits/rejected": -1.976251244544983,
"logps/chosen": -0.999312698841095,
"logps/rejected": -1.162035584449768,
"loss": 1.5665,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.99862539768219,
"rewards/margins": 0.32544589042663574,
"rewards/rejected": -2.324071168899536,
"step": 1460
},
{
"epoch": 1.059077809798271,
"grad_norm": 19.370074530775952,
"learning_rate": 4.087349929364192e-08,
"logits/chosen": -2.025360584259033,
"logits/rejected": -2.015761613845825,
"logps/chosen": -0.9582245945930481,
"logps/rejected": -1.0940570831298828,
"loss": 1.6113,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.9164491891860962,
"rewards/margins": 0.2716650068759918,
"rewards/rejected": -2.1881141662597656,
"step": 1470
},
{
"epoch": 1.0662824207492796,
"grad_norm": 20.71760664678835,
"learning_rate": 4.0711008825680645e-08,
"logits/chosen": -1.9764623641967773,
"logits/rejected": -1.975449562072754,
"logps/chosen": -1.0053189992904663,
"logps/rejected": -1.125705361366272,
"loss": 1.6409,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.0106379985809326,
"rewards/margins": 0.24077251553535461,
"rewards/rejected": -2.251410722732544,
"step": 1480
},
{
"epoch": 1.0734870317002883,
"grad_norm": 22.88411536785193,
"learning_rate": 4.054741393867306e-08,
"logits/chosen": -1.9940725564956665,
"logits/rejected": -1.991307020187378,
"logps/chosen": -1.110062837600708,
"logps/rejected": -1.1640889644622803,
"loss": 1.7326,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.220125675201416,
"rewards/margins": 0.10805213451385498,
"rewards/rejected": -2.3281779289245605,
"step": 1490
},
{
"epoch": 1.080691642651297,
"grad_norm": 21.542529455963674,
"learning_rate": 4.038272613266419e-08,
"logits/chosen": -1.995582938194275,
"logits/rejected": -1.9824492931365967,
"logps/chosen": -1.0086337327957153,
"logps/rejected": -1.1226236820220947,
"loss": 1.6355,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0172674655914307,
"rewards/margins": 0.22797977924346924,
"rewards/rejected": -2.2452473640441895,
"step": 1500
},
{
"epoch": 1.0878962536023056,
"grad_norm": 20.350925893402135,
"learning_rate": 4.0216956984526784e-08,
"logits/chosen": -2.0434131622314453,
"logits/rejected": -2.0453896522521973,
"logps/chosen": -1.0144180059432983,
"logps/rejected": -1.126713514328003,
"loss": 1.6479,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.0288360118865967,
"rewards/margins": 0.22459068894386292,
"rewards/rejected": -2.253427028656006,
"step": 1510
},
{
"epoch": 1.0951008645533142,
"grad_norm": 18.958922435681895,
"learning_rate": 4.0050118147147446e-08,
"logits/chosen": -1.9865297079086304,
"logits/rejected": -1.9869588613510132,
"logps/chosen": -1.0981040000915527,
"logps/rejected": -1.1105927228927612,
"loss": 1.7979,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -2.1962080001831055,
"rewards/margins": 0.024977359920740128,
"rewards/rejected": -2.2211854457855225,
"step": 1520
},
{
"epoch": 1.1023054755043227,
"grad_norm": 20.44736409962546,
"learning_rate": 3.988222134860755e-08,
"logits/chosen": -2.0269482135772705,
"logits/rejected": -2.018134117126465,
"logps/chosen": -0.9491475224494934,
"logps/rejected": -1.1186093091964722,
"loss": 1.5605,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.8982950448989868,
"rewards/margins": 0.33892351388931274,
"rewards/rejected": -2.2372186183929443,
"step": 1530
},
{
"epoch": 1.1095100864553313,
"grad_norm": 28.227918425245296,
"learning_rate": 3.9713278391358724e-08,
"logits/chosen": -2.0332820415496826,
"logits/rejected": -2.027156352996826,
"logps/chosen": -1.0234980583190918,
"logps/rejected": -1.149630069732666,
"loss": 1.6176,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.0469961166381836,
"rewards/margins": 0.2522641718387604,
"rewards/rejected": -2.299260139465332,
"step": 1540
},
{
"epoch": 1.11671469740634,
"grad_norm": 21.599435876957084,
"learning_rate": 3.954330115139328e-08,
"logits/chosen": -2.012049436569214,
"logits/rejected": -2.006786823272705,
"logps/chosen": -1.026759386062622,
"logps/rejected": -1.135288119316101,
"loss": 1.6551,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.053518772125244,
"rewards/margins": 0.2170577496290207,
"rewards/rejected": -2.270576238632202,
"step": 1550
},
{
"epoch": 1.1239193083573487,
"grad_norm": 30.010489670091495,
"learning_rate": 3.937230157740931e-08,
"logits/chosen": -2.06923246383667,
"logits/rejected": -2.063246488571167,
"logps/chosen": -1.0465221405029297,
"logps/rejected": -1.1864023208618164,
"loss": 1.6074,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0930442810058594,
"rewards/margins": 0.279760479927063,
"rewards/rejected": -2.372804641723633,
"step": 1560
},
{
"epoch": 1.1311239193083573,
"grad_norm": 19.2378025361386,
"learning_rate": 3.920029168997077e-08,
"logits/chosen": -2.047394037246704,
"logits/rejected": -2.045605182647705,
"logps/chosen": -1.0021626949310303,
"logps/rejected": -1.1329083442687988,
"loss": 1.6155,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0043253898620605,
"rewards/margins": 0.26149123907089233,
"rewards/rejected": -2.2658166885375977,
"step": 1570
},
{
"epoch": 1.138328530259366,
"grad_norm": 35.52052151454362,
"learning_rate": 3.9027283580662476e-08,
"logits/chosen": -2.015282392501831,
"logits/rejected": -2.00935697555542,
"logps/chosen": -1.046675205230713,
"logps/rejected": -1.1979637145996094,
"loss": 1.5981,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.093350410461426,
"rewards/margins": 0.3025774657726288,
"rewards/rejected": -2.3959274291992188,
"step": 1580
},
{
"epoch": 1.1455331412103746,
"grad_norm": 19.868592765575958,
"learning_rate": 3.885328941124014e-08,
"logits/chosen": -1.9911415576934814,
"logits/rejected": -1.9865388870239258,
"logps/chosen": -0.9653146862983704,
"logps/rejected": -1.1030223369598389,
"loss": 1.5978,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.9306293725967407,
"rewards/margins": 0.27541524171829224,
"rewards/rejected": -2.2060446739196777,
"step": 1590
},
{
"epoch": 1.1527377521613833,
"grad_norm": 24.233182948016438,
"learning_rate": 3.867832141277539e-08,
"logits/chosen": -2.0305354595184326,
"logits/rejected": -2.0217463970184326,
"logps/chosen": -1.0669244527816772,
"logps/rejected": -1.1823561191558838,
"loss": 1.6418,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.1338489055633545,
"rewards/margins": 0.23086321353912354,
"rewards/rejected": -2.3647122383117676,
"step": 1600
},
{
"epoch": 1.159942363112392,
"grad_norm": 24.72356012381405,
"learning_rate": 3.850239188479606e-08,
"logits/chosen": -1.9773706197738647,
"logits/rejected": -1.9808155298233032,
"logps/chosen": -1.0085049867630005,
"logps/rejected": -1.1022659540176392,
"loss": 1.6744,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.017009973526001,
"rewards/margins": 0.18752184510231018,
"rewards/rejected": -2.2045319080352783,
"step": 1610
},
{
"epoch": 1.1671469740634006,
"grad_norm": 24.775721760516777,
"learning_rate": 3.832551319442151e-08,
"logits/chosen": -2.0583930015563965,
"logits/rejected": -2.0596446990966797,
"logps/chosen": -1.0561758279800415,
"logps/rejected": -1.186408281326294,
"loss": 1.6182,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.112351655960083,
"rewards/margins": 0.2604648470878601,
"rewards/rejected": -2.372816562652588,
"step": 1620
},
{
"epoch": 1.1743515850144093,
"grad_norm": 20.23661779110279,
"learning_rate": 3.81476977754933e-08,
"logits/chosen": -1.951172113418579,
"logits/rejected": -1.9476385116577148,
"logps/chosen": -1.0267951488494873,
"logps/rejected": -1.1004960536956787,
"loss": 1.7001,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0535902976989746,
"rewards/margins": 0.14740177989006042,
"rewards/rejected": -2.2009921073913574,
"step": 1630
},
{
"epoch": 1.181556195965418,
"grad_norm": 19.554149170089566,
"learning_rate": 3.796895812770114e-08,
"logits/chosen": -1.9799621105194092,
"logits/rejected": -1.9810926914215088,
"logps/chosen": -1.0158445835113525,
"logps/rejected": -1.111926794052124,
"loss": 1.6759,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.031689167022705,
"rewards/margins": 0.1921643614768982,
"rewards/rejected": -2.223853588104248,
"step": 1640
},
{
"epoch": 1.1887608069164266,
"grad_norm": 25.592129333950247,
"learning_rate": 3.7789306815704216e-08,
"logits/chosen": -2.003976345062256,
"logits/rejected": -2.0018844604492188,
"logps/chosen": -1.0064101219177246,
"logps/rejected": -1.08089280128479,
"loss": 1.7033,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.012820243835449,
"rewards/margins": 0.14896517992019653,
"rewards/rejected": -2.16178560256958,
"step": 1650
},
{
"epoch": 1.195965417867435,
"grad_norm": 21.28393320695919,
"learning_rate": 3.760875646824795e-08,
"logits/chosen": -1.9339759349822998,
"logits/rejected": -1.937636375427246,
"logps/chosen": -0.9738245010375977,
"logps/rejected": -1.0825483798980713,
"loss": 1.6558,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.9476490020751953,
"rewards/margins": 0.21744783222675323,
"rewards/rejected": -2.1650967597961426,
"step": 1660
},
{
"epoch": 1.2031700288184437,
"grad_norm": 26.447583069644832,
"learning_rate": 3.742731977727623e-08,
"logits/chosen": -2.0301713943481445,
"logits/rejected": -2.0270252227783203,
"logps/chosen": -1.0391323566436768,
"logps/rejected": -1.179983377456665,
"loss": 1.6045,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0782647132873535,
"rewards/margins": 0.2817017734050751,
"rewards/rejected": -2.35996675491333,
"step": 1670
},
{
"epoch": 1.2103746397694524,
"grad_norm": 23.740367749765426,
"learning_rate": 3.7245009497039244e-08,
"logits/chosen": -1.9730199575424194,
"logits/rejected": -1.9651565551757812,
"logps/chosen": -1.011291265487671,
"logps/rejected": -1.1528630256652832,
"loss": 1.5979,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.022582530975342,
"rewards/margins": 0.28314343094825745,
"rewards/rejected": -2.3057260513305664,
"step": 1680
},
{
"epoch": 1.217579250720461,
"grad_norm": 21.613562706441108,
"learning_rate": 3.7061838443196886e-08,
"logits/chosen": -2.0166778564453125,
"logits/rejected": -2.018479585647583,
"logps/chosen": -1.0249133110046387,
"logps/rejected": -1.1527760028839111,
"loss": 1.6175,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.0498266220092773,
"rewards/margins": 0.25572583079338074,
"rewards/rejected": -2.3055520057678223,
"step": 1690
},
{
"epoch": 1.2247838616714697,
"grad_norm": 26.59797525557739,
"learning_rate": 3.68778194919179e-08,
"logits/chosen": -1.98636794090271,
"logits/rejected": -1.9875138998031616,
"logps/chosen": -1.076690673828125,
"logps/rejected": -1.204590916633606,
"loss": 1.6207,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.15338134765625,
"rewards/margins": 0.25580060482025146,
"rewards/rejected": -2.409181833267212,
"step": 1700
},
{
"epoch": 1.2319884726224783,
"grad_norm": 23.52633825308599,
"learning_rate": 3.66929655789747e-08,
"logits/chosen": -2.0286943912506104,
"logits/rejected": -2.0175669193267822,
"logps/chosen": -0.9385663866996765,
"logps/rejected": -1.0960757732391357,
"loss": 1.5837,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.877132773399353,
"rewards/margins": 0.31501883268356323,
"rewards/rejected": -2.1921515464782715,
"step": 1710
},
{
"epoch": 1.239193083573487,
"grad_norm": 19.78110352441659,
"learning_rate": 3.6507289698834064e-08,
"logits/chosen": -1.976352334022522,
"logits/rejected": -1.9729232788085938,
"logps/chosen": -0.984195351600647,
"logps/rejected": -1.120203971862793,
"loss": 1.6215,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.968390703201294,
"rewards/margins": 0.27201730012893677,
"rewards/rejected": -2.240407943725586,
"step": 1720
},
{
"epoch": 1.2463976945244957,
"grad_norm": 29.100393569917596,
"learning_rate": 3.6320804903743684e-08,
"logits/chosen": -2.019465923309326,
"logits/rejected": -2.019134283065796,
"logps/chosen": -1.035032033920288,
"logps/rejected": -1.1646802425384521,
"loss": 1.6245,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.070064067840576,
"rewards/margins": 0.25929638743400574,
"rewards/rejected": -2.3293604850769043,
"step": 1730
},
{
"epoch": 1.2536023054755043,
"grad_norm": 19.649258256134964,
"learning_rate": 3.61335243028146e-08,
"logits/chosen": -2.007497787475586,
"logits/rejected": -2.012320041656494,
"logps/chosen": -1.0912740230560303,
"logps/rejected": -1.2261770963668823,
"loss": 1.6174,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1825480461120605,
"rewards/margins": 0.26980629563331604,
"rewards/rejected": -2.4523541927337646,
"step": 1740
},
{
"epoch": 1.260806916426513,
"grad_norm": 21.835797474283304,
"learning_rate": 3.5945461061099736e-08,
"logits/chosen": -1.9699938297271729,
"logits/rejected": -1.9564844369888306,
"logps/chosen": -1.0432616472244263,
"logps/rejected": -1.1263704299926758,
"loss": 1.7047,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.0865232944488525,
"rewards/margins": 0.1662176102399826,
"rewards/rejected": -2.2527408599853516,
"step": 1750
},
{
"epoch": 1.2680115273775217,
"grad_norm": 23.15042193876464,
"learning_rate": 3.5756628398668446e-08,
"logits/chosen": -2.0515687465667725,
"logits/rejected": -2.056640148162842,
"logps/chosen": -1.1328608989715576,
"logps/rejected": -1.2376948595046997,
"loss": 1.672,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.2657217979431152,
"rewards/margins": 0.20966771245002747,
"rewards/rejected": -2.4753897190093994,
"step": 1760
},
{
"epoch": 1.2752161383285303,
"grad_norm": 21.73032924866429,
"learning_rate": 3.556703958967716e-08,
"logits/chosen": -2.0368218421936035,
"logits/rejected": -2.0322813987731934,
"logps/chosen": -1.0513852834701538,
"logps/rejected": -1.188612699508667,
"loss": 1.6166,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.1027705669403076,
"rewards/margins": 0.27445459365844727,
"rewards/rejected": -2.377225399017334,
"step": 1770
},
{
"epoch": 1.282420749279539,
"grad_norm": 27.991096896336643,
"learning_rate": 3.5376707961436297e-08,
"logits/chosen": -2.0231380462646484,
"logits/rejected": -2.0172154903411865,
"logps/chosen": -1.1404026746749878,
"logps/rejected": -1.2050397396087646,
"loss": 1.716,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.2808053493499756,
"rewards/margins": 0.12927410006523132,
"rewards/rejected": -2.4100794792175293,
"step": 1780
},
{
"epoch": 1.2896253602305476,
"grad_norm": 15.526485228812097,
"learning_rate": 3.51856468934734e-08,
"logits/chosen": -1.9784328937530518,
"logits/rejected": -1.9799473285675049,
"logps/chosen": -0.9755992889404297,
"logps/rejected": -1.0713945627212524,
"loss": 1.6629,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.9511985778808594,
"rewards/margins": 0.19159065186977386,
"rewards/rejected": -2.142789125442505,
"step": 1790
},
{
"epoch": 1.2968299711815563,
"grad_norm": 23.420786518726864,
"learning_rate": 3.499386981659262e-08,
"logits/chosen": -2.0592575073242188,
"logits/rejected": -2.0537567138671875,
"logps/chosen": -1.0179462432861328,
"logps/rejected": -1.2126991748809814,
"loss": 1.5379,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.0358924865722656,
"rewards/margins": 0.38950610160827637,
"rewards/rejected": -2.425398349761963,
"step": 1800
},
{
"epoch": 1.304034582132565,
"grad_norm": 25.260344669251943,
"learning_rate": 3.480139021193057e-08,
"logits/chosen": -1.9808191061019897,
"logits/rejected": -1.9826618432998657,
"logps/chosen": -0.9968624114990234,
"logps/rejected": -1.1215671300888062,
"loss": 1.6386,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.9937248229980469,
"rewards/margins": 0.2494095265865326,
"rewards/rejected": -2.2431342601776123,
"step": 1810
},
{
"epoch": 1.3112391930835736,
"grad_norm": 31.402984893149874,
"learning_rate": 3.4608221610008666e-08,
"logits/chosen": -2.014035224914551,
"logits/rejected": -2.009444236755371,
"logps/chosen": -0.9725703001022339,
"logps/rejected": -1.1262907981872559,
"loss": 1.59,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.9451406002044678,
"rewards/margins": 0.30744099617004395,
"rewards/rejected": -2.2525815963745117,
"step": 1820
},
{
"epoch": 1.318443804034582,
"grad_norm": 18.601121296835238,
"learning_rate": 3.4414377589782e-08,
"logits/chosen": -1.986240029335022,
"logits/rejected": -1.995163917541504,
"logps/chosen": -1.018169641494751,
"logps/rejected": -1.155580997467041,
"loss": 1.6207,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.036339282989502,
"rewards/margins": 0.27482283115386963,
"rewards/rejected": -2.311161994934082,
"step": 1830
},
{
"epoch": 1.3256484149855907,
"grad_norm": 21.14665341971537,
"learning_rate": 3.4219871777684745e-08,
"logits/chosen": -1.9896999597549438,
"logits/rejected": -1.9774585962295532,
"logps/chosen": -0.9940080642700195,
"logps/rejected": -1.1199101209640503,
"loss": 1.6367,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.988016128540039,
"rewards/margins": 0.2518041729927063,
"rewards/rejected": -2.2398202419281006,
"step": 1840
},
{
"epoch": 1.3328530259365994,
"grad_norm": 21.10945671522239,
"learning_rate": 3.4024717846672364e-08,
"logits/chosen": -2.0322022438049316,
"logits/rejected": -2.0256874561309814,
"logps/chosen": -0.9947555661201477,
"logps/rejected": -1.1280686855316162,
"loss": 1.6224,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9895111322402954,
"rewards/margins": 0.26662617921829224,
"rewards/rejected": -2.2561373710632324,
"step": 1850
},
{
"epoch": 1.340057636887608,
"grad_norm": 20.96016297776703,
"learning_rate": 3.382892951526036e-08,
"logits/chosen": -2.0151829719543457,
"logits/rejected": -2.0123233795166016,
"logps/chosen": -1.0515539646148682,
"logps/rejected": -1.2059379816055298,
"loss": 1.5835,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.1031079292297363,
"rewards/margins": 0.3087681829929352,
"rewards/rejected": -2.4118759632110596,
"step": 1860
},
{
"epoch": 1.3472622478386167,
"grad_norm": 24.446492024491263,
"learning_rate": 3.3632520546559974e-08,
"logits/chosen": -1.9829241037368774,
"logits/rejected": -1.9711620807647705,
"logps/chosen": -0.9253702163696289,
"logps/rejected": -1.1010020971298218,
"loss": 1.5437,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.8507404327392578,
"rewards/margins": 0.35126370191574097,
"rewards/rejected": -2.2020041942596436,
"step": 1870
},
{
"epoch": 1.3544668587896254,
"grad_norm": 22.257431028743113,
"learning_rate": 3.34355047473107e-08,
"logits/chosen": -1.9958999156951904,
"logits/rejected": -1.991745948791504,
"logps/chosen": -1.0290558338165283,
"logps/rejected": -1.1228206157684326,
"loss": 1.6814,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0581116676330566,
"rewards/margins": 0.18752947449684143,
"rewards/rejected": -2.2456412315368652,
"step": 1880
},
{
"epoch": 1.361671469740634,
"grad_norm": 26.52808003756447,
"learning_rate": 3.323789596690971e-08,
"logits/chosen": -1.9655685424804688,
"logits/rejected": -1.966602087020874,
"logps/chosen": -1.0216814279556274,
"logps/rejected": -1.1607329845428467,
"loss": 1.6043,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.043362855911255,
"rewards/margins": 0.27810320258140564,
"rewards/rejected": -2.3214659690856934,
"step": 1890
},
{
"epoch": 1.3688760806916427,
"grad_norm": 18.35881071746021,
"learning_rate": 3.303970809643828e-08,
"logits/chosen": -1.9993665218353271,
"logits/rejected": -2.0040156841278076,
"logps/chosen": -1.0356684923171997,
"logps/rejected": -1.169182300567627,
"loss": 1.6201,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0713369846343994,
"rewards/margins": 0.26702752709388733,
"rewards/rejected": -2.338364601135254,
"step": 1900
},
{
"epoch": 1.3760806916426513,
"grad_norm": 24.62927635906183,
"learning_rate": 3.2840955067685356e-08,
"logits/chosen": -2.027216672897339,
"logits/rejected": -2.0315163135528564,
"logps/chosen": -1.0546417236328125,
"logps/rejected": -1.2095133066177368,
"loss": 1.5822,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.109283447265625,
"rewards/margins": 0.30974280834198,
"rewards/rejected": -2.4190266132354736,
"step": 1910
},
{
"epoch": 1.38328530259366,
"grad_norm": 20.42510454332798,
"learning_rate": 3.264165085216817e-08,
"logits/chosen": -2.034405469894409,
"logits/rejected": -2.034414768218994,
"logps/chosen": -0.9350569844245911,
"logps/rejected": -1.109073281288147,
"loss": 1.5563,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.8701139688491821,
"rewards/margins": 0.34803250432014465,
"rewards/rejected": -2.218146562576294,
"step": 1920
},
{
"epoch": 1.3904899135446687,
"grad_norm": 21.638674305028665,
"learning_rate": 3.244180946015008e-08,
"logits/chosen": -1.9656782150268555,
"logits/rejected": -1.9663848876953125,
"logps/chosen": -1.034440517425537,
"logps/rejected": -1.102476716041565,
"loss": 1.714,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.068881034851074,
"rewards/margins": 0.1360722780227661,
"rewards/rejected": -2.20495343208313,
"step": 1930
},
{
"epoch": 1.397694524495677,
"grad_norm": 18.917305957392887,
"learning_rate": 3.224144493965578e-08,
"logits/chosen": -2.0499091148376465,
"logits/rejected": -2.053539276123047,
"logps/chosen": -0.9922255277633667,
"logps/rejected": -1.1013872623443604,
"loss": 1.6507,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9844510555267334,
"rewards/margins": 0.21832342445850372,
"rewards/rejected": -2.2027745246887207,
"step": 1940
},
{
"epoch": 1.4048991354466858,
"grad_norm": 20.55745064569853,
"learning_rate": 3.204057137548371e-08,
"logits/chosen": -2.0113415718078613,
"logits/rejected": -2.005919933319092,
"logps/chosen": -0.9785380363464355,
"logps/rejected": -1.0900627374649048,
"loss": 1.6464,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.957076072692871,
"rewards/margins": 0.22304920852184296,
"rewards/rejected": -2.1801254749298096,
"step": 1950
},
{
"epoch": 1.4121037463976944,
"grad_norm": 22.56028475204301,
"learning_rate": 3.183920288821597e-08,
"logits/chosen": -1.9948104619979858,
"logits/rejected": -1.9915351867675781,
"logps/chosen": -1.0019850730895996,
"logps/rejected": -1.1697156429290771,
"loss": 1.5662,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.003970146179199,
"rewards/margins": 0.3354611098766327,
"rewards/rejected": -2.3394312858581543,
"step": 1960
},
{
"epoch": 1.419308357348703,
"grad_norm": 27.42995413253607,
"learning_rate": 3.1637353633225735e-08,
"logits/chosen": -2.037505626678467,
"logits/rejected": -2.031297206878662,
"logps/chosen": -1.0297725200653076,
"logps/rejected": -1.1842756271362305,
"loss": 1.5879,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0595450401306152,
"rewards/margins": 0.3090066909790039,
"rewards/rejected": -2.368551254272461,
"step": 1970
},
{
"epoch": 1.4265129682997117,
"grad_norm": 23.08165784353334,
"learning_rate": 3.143503779968213e-08,
"logits/chosen": -2.0119919776916504,
"logits/rejected": -2.0123164653778076,
"logps/chosen": -1.018511176109314,
"logps/rejected": -1.1586663722991943,
"loss": 1.623,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.037022352218628,
"rewards/margins": 0.2803104817867279,
"rewards/rejected": -2.3173327445983887,
"step": 1980
},
{
"epoch": 1.4337175792507204,
"grad_norm": 20.570737291684164,
"learning_rate": 3.1232269609552875e-08,
"logits/chosen": -1.9909594058990479,
"logits/rejected": -1.9883991479873657,
"logps/chosen": -0.9996950030326843,
"logps/rejected": -1.126483678817749,
"loss": 1.6305,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.9993900060653687,
"rewards/margins": 0.25357744097709656,
"rewards/rejected": -2.252967357635498,
"step": 1990
},
{
"epoch": 1.440922190201729,
"grad_norm": 18.876341063543773,
"learning_rate": 3.102906331660444e-08,
"logits/chosen": -2.051898956298828,
"logits/rejected": -2.043545722961426,
"logps/chosen": -0.9927466511726379,
"logps/rejected": -1.1729360818862915,
"loss": 1.5476,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.9854933023452759,
"rewards/margins": 0.36037883162498474,
"rewards/rejected": -2.345872163772583,
"step": 2000
},
{
"epoch": 1.4481268011527377,
"grad_norm": 18.989404751105003,
"learning_rate": 3.082543320540015e-08,
"logits/chosen": -1.9971063137054443,
"logits/rejected": -1.9903011322021484,
"logps/chosen": -1.0080406665802002,
"logps/rejected": -1.1570656299591064,
"loss": 1.5924,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0160813331604004,
"rewards/margins": 0.2980501055717468,
"rewards/rejected": -2.314131259918213,
"step": 2010
},
{
"epoch": 1.4553314121037464,
"grad_norm": 21.76127346796002,
"learning_rate": 3.062139359029599e-08,
"logits/chosen": -2.0260488986968994,
"logits/rejected": -2.0259416103363037,
"logps/chosen": -1.0300636291503906,
"logps/rejected": -1.1194459199905396,
"loss": 1.6832,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.0601272583007812,
"rewards/margins": 0.1787644922733307,
"rewards/rejected": -2.238891839981079,
"step": 2020
},
{
"epoch": 1.462536023054755,
"grad_norm": 22.315708105619215,
"learning_rate": 3.041695881443437e-08,
"logits/chosen": -2.0457310676574707,
"logits/rejected": -2.041228771209717,
"logps/chosen": -0.9756488800048828,
"logps/rejected": -1.1131236553192139,
"loss": 1.6062,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.9512977600097656,
"rewards/margins": 0.2749495506286621,
"rewards/rejected": -2.2262473106384277,
"step": 2030
},
{
"epoch": 1.4697406340057637,
"grad_norm": 26.741622993897,
"learning_rate": 3.0212143248735886e-08,
"logits/chosen": -2.0314245223999023,
"logits/rejected": -2.0318028926849365,
"logps/chosen": -1.000044345855713,
"logps/rejected": -1.1435314416885376,
"loss": 1.6005,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.000088691711426,
"rewards/margins": 0.28697413206100464,
"rewards/rejected": -2.287062883377075,
"step": 2040
},
{
"epoch": 1.4769452449567724,
"grad_norm": 23.377353479890957,
"learning_rate": 3.0006961290889077e-08,
"logits/chosen": -2.018728494644165,
"logits/rejected": -2.0098907947540283,
"logps/chosen": -1.1194074153900146,
"logps/rejected": -1.2942084074020386,
"loss": 1.5797,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.2388148307800293,
"rewards/margins": 0.3496018946170807,
"rewards/rejected": -2.588416814804077,
"step": 2050
},
{
"epoch": 1.484149855907781,
"grad_norm": 24.66862006119205,
"learning_rate": 2.980142736433833e-08,
"logits/chosen": -2.0064892768859863,
"logits/rejected": -1.9998804330825806,
"logps/chosen": -1.0339871644973755,
"logps/rejected": -1.1035549640655518,
"loss": 1.7141,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.067974328994751,
"rewards/margins": 0.1391356885433197,
"rewards/rejected": -2.2071099281311035,
"step": 2060
},
{
"epoch": 1.4913544668587897,
"grad_norm": 28.916182952269878,
"learning_rate": 2.9595555917269997e-08,
"logits/chosen": -2.0349533557891846,
"logits/rejected": -2.020362615585327,
"logps/chosen": -1.1436104774475098,
"logps/rejected": -1.247326135635376,
"loss": 1.6472,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.2872209548950195,
"rewards/margins": 0.2074313908815384,
"rewards/rejected": -2.494652271270752,
"step": 2070
},
{
"epoch": 1.4985590778097984,
"grad_norm": 21.48652689473404,
"learning_rate": 2.9389361421596725e-08,
"logits/chosen": -1.9501157999038696,
"logits/rejected": -1.952577829360962,
"logps/chosen": -1.0596842765808105,
"logps/rejected": -1.2010711431503296,
"loss": 1.604,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.119368553161621,
"rewards/margins": 0.2827739119529724,
"rewards/rejected": -2.402142286300659,
"step": 2080
},
{
"epoch": 1.505763688760807,
"grad_norm": 23.45025608625662,
"learning_rate": 2.9182858371940126e-08,
"logits/chosen": -2.0330328941345215,
"logits/rejected": -2.0277328491210938,
"logps/chosen": -1.049903154373169,
"logps/rejected": -1.1856356859207153,
"loss": 1.6123,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.099806308746338,
"rewards/margins": 0.2714654207229614,
"rewards/rejected": -2.3712713718414307,
"step": 2090
},
{
"epoch": 1.5129682997118157,
"grad_norm": 21.70840796820339,
"learning_rate": 2.8976061284611908e-08,
"logits/chosen": -1.9831184148788452,
"logits/rejected": -1.9919109344482422,
"logps/chosen": -0.9365741014480591,
"logps/rejected": -1.0736796855926514,
"loss": 1.6176,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.8731482028961182,
"rewards/margins": 0.27421125769615173,
"rewards/rejected": -2.1473593711853027,
"step": 2100
},
{
"epoch": 1.5201729106628243,
"grad_norm": 25.188424132543844,
"learning_rate": 2.8768984696593384e-08,
"logits/chosen": -1.9801161289215088,
"logits/rejected": -1.9709136486053467,
"logps/chosen": -1.0179340839385986,
"logps/rejected": -1.1427983045578003,
"loss": 1.6414,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.0358681678771973,
"rewards/margins": 0.24972863495349884,
"rewards/rejected": -2.2855966091156006,
"step": 2110
},
{
"epoch": 1.527377521613833,
"grad_norm": 21.634699280051397,
"learning_rate": 2.8561643164513637e-08,
"logits/chosen": -1.901829481124878,
"logits/rejected": -1.8984495401382446,
"logps/chosen": -1.0501524209976196,
"logps/rejected": -1.173863172531128,
"loss": 1.6306,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.1003048419952393,
"rewards/margins": 0.24742154777050018,
"rewards/rejected": -2.347726345062256,
"step": 2120
},
{
"epoch": 1.5345821325648417,
"grad_norm": 23.61665024363779,
"learning_rate": 2.8354051263626227e-08,
"logits/chosen": -1.9798393249511719,
"logits/rejected": -1.9855964183807373,
"logps/chosen": -1.0607976913452148,
"logps/rejected": -1.181445837020874,
"loss": 1.6332,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1215953826904297,
"rewards/margins": 0.2412964552640915,
"rewards/rejected": -2.362891674041748,
"step": 2130
},
{
"epoch": 1.54178674351585,
"grad_norm": 23.186120463820025,
"learning_rate": 2.8146223586784573e-08,
"logits/chosen": -1.9751768112182617,
"logits/rejected": -1.9669930934906006,
"logps/chosen": -1.0672011375427246,
"logps/rejected": -1.2088868618011475,
"loss": 1.6089,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.134402275085449,
"rewards/margins": 0.2833711504936218,
"rewards/rejected": -2.417773723602295,
"step": 2140
},
{
"epoch": 1.5489913544668588,
"grad_norm": 30.08589108385336,
"learning_rate": 2.7938174743416205e-08,
"logits/chosen": -1.9352929592132568,
"logits/rejected": -1.93215811252594,
"logps/chosen": -1.0522792339324951,
"logps/rejected": -1.1689748764038086,
"loss": 1.644,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1045584678649902,
"rewards/margins": 0.23339109122753143,
"rewards/rejected": -2.337949752807617,
"step": 2150
},
{
"epoch": 1.5561959654178674,
"grad_norm": 23.105545692556433,
"learning_rate": 2.7729919358495728e-08,
"logits/chosen": -1.998682975769043,
"logits/rejected": -1.9997676610946655,
"logps/chosen": -1.1142494678497314,
"logps/rejected": -1.200404167175293,
"loss": 1.6925,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.228498935699463,
"rewards/margins": 0.1723092645406723,
"rewards/rejected": -2.400808334350586,
"step": 2160
},
{
"epoch": 1.563400576368876,
"grad_norm": 22.35367179960909,
"learning_rate": 2.7521472071516772e-08,
"logits/chosen": -1.9958610534667969,
"logits/rejected": -1.9947010278701782,
"logps/chosen": -0.9450345039367676,
"logps/rejected": -1.0654414892196655,
"loss": 1.6368,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.8900690078735352,
"rewards/margins": 0.2408140003681183,
"rewards/rejected": -2.130882978439331,
"step": 2170
},
{
"epoch": 1.5706051873198847,
"grad_norm": 25.688467296830364,
"learning_rate": 2.731284753546289e-08,
"logits/chosen": -1.986534833908081,
"logits/rejected": -1.9845256805419922,
"logps/chosen": -1.0848838090896606,
"logps/rejected": -1.2309720516204834,
"loss": 1.5979,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.1697676181793213,
"rewards/margins": 0.29217639565467834,
"rewards/rejected": -2.461944103240967,
"step": 2180
},
{
"epoch": 1.5778097982708934,
"grad_norm": 25.40374296681987,
"learning_rate": 2.710406041577751e-08,
"logits/chosen": -2.0504424571990967,
"logits/rejected": -2.047253370285034,
"logps/chosen": -1.0334125757217407,
"logps/rejected": -1.1928437948226929,
"loss": 1.5819,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0668251514434814,
"rewards/margins": 0.31886276602745056,
"rewards/rejected": -2.3856875896453857,
"step": 2190
},
{
"epoch": 1.585014409221902,
"grad_norm": 21.131392600700334,
"learning_rate": 2.6895125389333017e-08,
"logits/chosen": -2.0092031955718994,
"logits/rejected": -2.0050158500671387,
"logps/chosen": -1.0256198644638062,
"logps/rejected": -1.1862627267837524,
"loss": 1.5762,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.0512397289276123,
"rewards/margins": 0.3212856650352478,
"rewards/rejected": -2.372525453567505,
"step": 2200
},
{
"epoch": 1.5922190201729105,
"grad_norm": 20.15547075877672,
"learning_rate": 2.6686057143399028e-08,
"logits/chosen": -2.0078885555267334,
"logits/rejected": -2.0095622539520264,
"logps/chosen": -1.0626325607299805,
"logps/rejected": -1.167283296585083,
"loss": 1.6767,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.125265121459961,
"rewards/margins": 0.20930185914039612,
"rewards/rejected": -2.334566593170166,
"step": 2210
},
{
"epoch": 1.5994236311239192,
"grad_norm": 22.817140386866217,
"learning_rate": 2.647687037460996e-08,
"logits/chosen": -2.009531021118164,
"logits/rejected": -2.009129762649536,
"logps/chosen": -1.0871093273162842,
"logps/rejected": -1.292332649230957,
"loss": 1.5324,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.1742186546325684,
"rewards/margins": 0.41044631600379944,
"rewards/rejected": -2.584665298461914,
"step": 2220
},
{
"epoch": 1.6066282420749278,
"grad_norm": 24.027811416370326,
"learning_rate": 2.626757978793187e-08,
"logits/chosen": -2.017061710357666,
"logits/rejected": -2.010791301727295,
"logps/chosen": -1.0892200469970703,
"logps/rejected": -1.2187515497207642,
"loss": 1.6311,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.1784400939941406,
"rewards/margins": 0.25906291604042053,
"rewards/rejected": -2.4375030994415283,
"step": 2230
},
{
"epoch": 1.6138328530259365,
"grad_norm": 27.47539280675231,
"learning_rate": 2.6058200095628797e-08,
"logits/chosen": -1.9886070489883423,
"logits/rejected": -1.9921098947525024,
"logps/chosen": -0.9186259508132935,
"logps/rejected": -1.098120093345642,
"loss": 1.5554,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.837251901626587,
"rewards/margins": 0.3589881956577301,
"rewards/rejected": -2.196240186691284,
"step": 2240
},
{
"epoch": 1.6210374639769451,
"grad_norm": 22.62589611549699,
"learning_rate": 2.584874601622854e-08,
"logits/chosen": -2.052700996398926,
"logits/rejected": -2.043602228164673,
"logps/chosen": -1.0874931812286377,
"logps/rejected": -1.2247369289398193,
"loss": 1.6309,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1749863624572754,
"rewards/margins": 0.2744874358177185,
"rewards/rejected": -2.4494738578796387,
"step": 2250
},
{
"epoch": 1.6282420749279538,
"grad_norm": 25.630077086272028,
"learning_rate": 2.5639232273487993e-08,
"logits/chosen": -1.9821510314941406,
"logits/rejected": -1.972529649734497,
"logps/chosen": -0.9799386262893677,
"logps/rejected": -1.1089693307876587,
"loss": 1.6268,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.9598772525787354,
"rewards/margins": 0.2580614686012268,
"rewards/rejected": -2.2179386615753174,
"step": 2260
},
{
"epoch": 1.6354466858789625,
"grad_norm": 25.461200932800743,
"learning_rate": 2.5429673595358142e-08,
"logits/chosen": -2.0126545429229736,
"logits/rejected": -2.0112295150756836,
"logps/chosen": -1.0471864938735962,
"logps/rejected": -1.1747767925262451,
"loss": 1.626,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0943729877471924,
"rewards/margins": 0.25518038868904114,
"rewards/rejected": -2.3495535850524902,
"step": 2270
},
{
"epoch": 1.6426512968299711,
"grad_norm": 27.724134933705333,
"learning_rate": 2.5220084712948764e-08,
"logits/chosen": -1.9738900661468506,
"logits/rejected": -1.9630699157714844,
"logps/chosen": -1.1224391460418701,
"logps/rejected": -1.2453981637954712,
"loss": 1.6223,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.2448782920837402,
"rewards/margins": 0.24591811001300812,
"rewards/rejected": -2.4907963275909424,
"step": 2280
},
{
"epoch": 1.6498559077809798,
"grad_norm": 22.449014899141954,
"learning_rate": 2.5010480359492838e-08,
"logits/chosen": -1.9610662460327148,
"logits/rejected": -1.9581550359725952,
"logps/chosen": -1.0534158945083618,
"logps/rejected": -1.121906042098999,
"loss": 1.7249,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.1068317890167236,
"rewards/margins": 0.13698022067546844,
"rewards/rejected": -2.243812084197998,
"step": 2290
},
{
"epoch": 1.6570605187319885,
"grad_norm": 24.99779388268088,
"learning_rate": 2.480087526931091e-08,
"logits/chosen": -2.0083067417144775,
"logits/rejected": -1.9961440563201904,
"logps/chosen": -1.005535364151001,
"logps/rejected": -1.1289100646972656,
"loss": 1.6406,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.011070728302002,
"rewards/margins": 0.24674968421459198,
"rewards/rejected": -2.2578201293945312,
"step": 2300
},
{
"epoch": 1.6642651296829971,
"grad_norm": 21.986886108291046,
"learning_rate": 2.4591284176775326e-08,
"logits/chosen": -1.967773199081421,
"logits/rejected": -1.9639675617218018,
"logps/chosen": -1.0770162343978882,
"logps/rejected": -1.1647992134094238,
"loss": 1.6916,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.1540324687957764,
"rewards/margins": 0.17556606233119965,
"rewards/rejected": -2.3295984268188477,
"step": 2310
},
{
"epoch": 1.6714697406340058,
"grad_norm": 26.838033792582817,
"learning_rate": 2.4381721815274443e-08,
"logits/chosen": -2.0369725227355957,
"logits/rejected": -2.0373129844665527,
"logps/chosen": -1.0236698389053345,
"logps/rejected": -1.163711667060852,
"loss": 1.6162,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.047339677810669,
"rewards/margins": 0.2800835967063904,
"rewards/rejected": -2.327423334121704,
"step": 2320
},
{
"epoch": 1.6786743515850144,
"grad_norm": 23.085774593086864,
"learning_rate": 2.4172202916176936e-08,
"logits/chosen": -2.0389699935913086,
"logits/rejected": -2.041323184967041,
"logps/chosen": -0.971706211566925,
"logps/rejected": -1.1475776433944702,
"loss": 1.5717,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.94341242313385,
"rewards/margins": 0.35174277424812317,
"rewards/rejected": -2.2951552867889404,
"step": 2330
},
{
"epoch": 1.685878962536023,
"grad_norm": 22.57399934210468,
"learning_rate": 2.3962742207796268e-08,
"logits/chosen": -1.9822057485580444,
"logits/rejected": -1.9802274703979492,
"logps/chosen": -0.9580032229423523,
"logps/rejected": -1.131255865097046,
"loss": 1.572,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.9160064458847046,
"rewards/margins": 0.3465050458908081,
"rewards/rejected": -2.262511730194092,
"step": 2340
},
{
"epoch": 1.6930835734870318,
"grad_norm": 26.337436430641183,
"learning_rate": 2.3753354414355334e-08,
"logits/chosen": -1.9447540044784546,
"logits/rejected": -1.9341723918914795,
"logps/chosen": -1.0690467357635498,
"logps/rejected": -1.191696047782898,
"loss": 1.6426,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1380934715270996,
"rewards/margins": 0.24529826641082764,
"rewards/rejected": -2.383392095565796,
"step": 2350
},
{
"epoch": 1.7002881844380404,
"grad_norm": 21.802361149754983,
"learning_rate": 2.3544054254951408e-08,
"logits/chosen": -1.9793498516082764,
"logits/rejected": -1.97073233127594,
"logps/chosen": -0.9383029937744141,
"logps/rejected": -1.1460812091827393,
"loss": 1.5183,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.8766059875488281,
"rewards/margins": 0.4155563414096832,
"rewards/rejected": -2.2921624183654785,
"step": 2360
},
{
"epoch": 1.707492795389049,
"grad_norm": 21.6845132948567,
"learning_rate": 2.3334856442521435e-08,
"logits/chosen": -2.0341391563415527,
"logits/rejected": -2.0266478061676025,
"logps/chosen": -1.0988128185272217,
"logps/rejected": -1.1753710508346558,
"loss": 1.707,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.1976256370544434,
"rewards/margins": 0.15311647951602936,
"rewards/rejected": -2.3507421016693115,
"step": 2370
},
{
"epoch": 1.7146974063400577,
"grad_norm": 22.160651360455354,
"learning_rate": 2.3125775682807826e-08,
"logits/chosen": -2.0457205772399902,
"logits/rejected": -2.045448064804077,
"logps/chosen": -1.1700584888458252,
"logps/rejected": -1.2783877849578857,
"loss": 1.6602,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.3401169776916504,
"rewards/margins": 0.21665871143341064,
"rewards/rejected": -2.5567755699157715,
"step": 2380
},
{
"epoch": 1.7219020172910664,
"grad_norm": 24.211503707909046,
"learning_rate": 2.291682667332464e-08,
"logits/chosen": -2.0610098838806152,
"logits/rejected": -2.056089401245117,
"logps/chosen": -1.0519418716430664,
"logps/rejected": -1.1878581047058105,
"loss": 1.6179,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.103883743286133,
"rewards/margins": 0.2718324065208435,
"rewards/rejected": -2.375716209411621,
"step": 2390
},
{
"epoch": 1.729106628242075,
"grad_norm": 19.198310748892013,
"learning_rate": 2.2708024102324454e-08,
"logits/chosen": -2.0232059955596924,
"logits/rejected": -2.0173497200012207,
"logps/chosen": -1.0365447998046875,
"logps/rejected": -1.2240616083145142,
"loss": 1.5606,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.073089599609375,
"rewards/margins": 0.3750336468219757,
"rewards/rejected": -2.4481232166290283,
"step": 2400
},
{
"epoch": 1.7363112391930837,
"grad_norm": 26.19091650904539,
"learning_rate": 2.2499382647765797e-08,
"logits/chosen": -2.0218329429626465,
"logits/rejected": -2.01828932762146,
"logps/chosen": -1.0763471126556396,
"logps/rejected": -1.1715179681777954,
"loss": 1.6805,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1526942253112793,
"rewards/margins": 0.19034144282341003,
"rewards/rejected": -2.343035936355591,
"step": 2410
},
{
"epoch": 1.7435158501440924,
"grad_norm": 25.129161862706873,
"learning_rate": 2.2290916976281427e-08,
"logits/chosen": -2.001932144165039,
"logits/rejected": -1.9959653615951538,
"logps/chosen": -1.0039644241333008,
"logps/rejected": -1.14584219455719,
"loss": 1.6336,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0079288482666016,
"rewards/margins": 0.2837551236152649,
"rewards/rejected": -2.29168438911438,
"step": 2420
},
{
"epoch": 1.7507204610951008,
"grad_norm": 21.814621079989646,
"learning_rate": 2.2082641742147238e-08,
"logits/chosen": -1.9791107177734375,
"logits/rejected": -1.9726155996322632,
"logps/chosen": -1.0205609798431396,
"logps/rejected": -1.221226453781128,
"loss": 1.5273,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.0411219596862793,
"rewards/margins": 0.40133076906204224,
"rewards/rejected": -2.442452907562256,
"step": 2430
},
{
"epoch": 1.7579250720461095,
"grad_norm": 23.142401741726285,
"learning_rate": 2.1874571586252177e-08,
"logits/chosen": -2.0248005390167236,
"logits/rejected": -2.0180201530456543,
"logps/chosen": -1.032070279121399,
"logps/rejected": -1.116600513458252,
"loss": 1.6925,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.064140558242798,
"rewards/margins": 0.1690603494644165,
"rewards/rejected": -2.233201026916504,
"step": 2440
},
{
"epoch": 1.7651296829971181,
"grad_norm": 23.772650056451177,
"learning_rate": 2.1666721135069037e-08,
"logits/chosen": -2.0153000354766846,
"logits/rejected": -2.0120010375976562,
"logps/chosen": -1.1153454780578613,
"logps/rejected": -1.216801404953003,
"loss": 1.6773,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.2306909561157227,
"rewards/margins": 0.20291194319725037,
"rewards/rejected": -2.433602809906006,
"step": 2450
},
{
"epoch": 1.7723342939481268,
"grad_norm": 18.806401369499547,
"learning_rate": 2.145910499962628e-08,
"logits/chosen": -2.0624032020568848,
"logits/rejected": -2.054765224456787,
"logps/chosen": -0.9628511667251587,
"logps/rejected": -1.1157209873199463,
"loss": 1.6013,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.9257023334503174,
"rewards/margins": 0.30573925375938416,
"rewards/rejected": -2.2314419746398926,
"step": 2460
},
{
"epoch": 1.7795389048991355,
"grad_norm": 26.89395254493913,
"learning_rate": 2.1251737774480915e-08,
"logits/chosen": -2.0389955043792725,
"logits/rejected": -2.029460906982422,
"logps/chosen": -1.1734710931777954,
"logps/rejected": -1.2716848850250244,
"loss": 1.7022,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.346942186355591,
"rewards/margins": 0.19642747938632965,
"rewards/rejected": -2.543369770050049,
"step": 2470
},
{
"epoch": 1.7867435158501441,
"grad_norm": 20.139202633479794,
"learning_rate": 2.104463403669264e-08,
"logits/chosen": -1.9957828521728516,
"logits/rejected": -1.9933998584747314,
"logps/chosen": -1.0485862493515015,
"logps/rejected": -1.203293800354004,
"loss": 1.5981,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.097172498703003,
"rewards/margins": 0.3094151020050049,
"rewards/rejected": -2.406587600708008,
"step": 2480
},
{
"epoch": 1.7939481268011528,
"grad_norm": 20.172677879731765,
"learning_rate": 2.0837808344799028e-08,
"logits/chosen": -1.9755080938339233,
"logits/rejected": -1.971238136291504,
"logps/chosen": -0.9422762989997864,
"logps/rejected": -1.0841569900512695,
"loss": 1.604,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.8845525979995728,
"rewards/margins": 0.28376150131225586,
"rewards/rejected": -2.168313980102539,
"step": 2490
},
{
"epoch": 1.8011527377521612,
"grad_norm": 22.349389968529955,
"learning_rate": 2.063127523779219e-08,
"logits/chosen": -1.9753835201263428,
"logits/rejected": -1.9711145162582397,
"logps/chosen": -1.0105019807815552,
"logps/rejected": -1.208362340927124,
"loss": 1.5202,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.0210039615631104,
"rewards/margins": 0.3957210183143616,
"rewards/rejected": -2.416724681854248,
"step": 2500
},
{
"epoch": 1.8083573487031699,
"grad_norm": 24.22087521172144,
"learning_rate": 2.0425049234096737e-08,
"logits/chosen": -1.9858808517456055,
"logits/rejected": -1.9803756475448608,
"logps/chosen": -1.0131169557571411,
"logps/rejected": -1.1394225358963013,
"loss": 1.6405,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0262339115142822,
"rewards/margins": 0.2526114284992218,
"rewards/rejected": -2.2788450717926025,
"step": 2510
},
{
"epoch": 1.8155619596541785,
"grad_norm": 22.958910412289878,
"learning_rate": 2.0219144830549163e-08,
"logits/chosen": -1.9544498920440674,
"logits/rejected": -1.9534263610839844,
"logps/chosen": -1.0182510614395142,
"logps/rejected": -1.177302598953247,
"loss": 1.5993,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0365021228790283,
"rewards/margins": 0.31810325384140015,
"rewards/rejected": -2.354605197906494,
"step": 2520
},
{
"epoch": 1.8227665706051872,
"grad_norm": 21.62907044306249,
"learning_rate": 2.0013576501378823e-08,
"logits/chosen": -1.9767091274261475,
"logits/rejected": -1.9701858758926392,
"logps/chosen": -1.0102381706237793,
"logps/rejected": -1.1591678857803345,
"loss": 1.6085,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0204763412475586,
"rewards/margins": 0.2978593409061432,
"rewards/rejected": -2.318335771560669,
"step": 2530
},
{
"epoch": 1.8299711815561959,
"grad_norm": 24.093400959011806,
"learning_rate": 1.9808358697190426e-08,
"logits/chosen": -1.9653217792510986,
"logits/rejected": -1.9615375995635986,
"logps/chosen": -0.936444878578186,
"logps/rejected": -1.0819050073623657,
"loss": 1.6163,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.872889757156372,
"rewards/margins": 0.2909203767776489,
"rewards/rejected": -2.1638100147247314,
"step": 2540
},
{
"epoch": 1.8371757925072045,
"grad_norm": 25.800243545689497,
"learning_rate": 1.9603505843948214e-08,
"logits/chosen": -2.0147223472595215,
"logits/rejected": -2.004798412322998,
"logps/chosen": -0.9533373713493347,
"logps/rejected": -1.1323670148849487,
"loss": 1.5547,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.9066747426986694,
"rewards/margins": 0.3580593168735504,
"rewards/rejected": -2.2647340297698975,
"step": 2550
},
{
"epoch": 1.8443804034582132,
"grad_norm": 24.254590998607096,
"learning_rate": 1.9399032341961886e-08,
"logits/chosen": -1.9753398895263672,
"logits/rejected": -1.9594964981079102,
"logps/chosen": -0.9942548871040344,
"logps/rejected": -1.0747997760772705,
"loss": 1.7057,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -1.9885097742080688,
"rewards/margins": 0.1610899567604065,
"rewards/rejected": -2.149599552154541,
"step": 2560
},
{
"epoch": 1.8515850144092219,
"grad_norm": 30.793726543288717,
"learning_rate": 1.9194952564874323e-08,
"logits/chosen": -2.0176210403442383,
"logits/rejected": -2.011768341064453,
"logps/chosen": -1.0701502561569214,
"logps/rejected": -1.2221211194992065,
"loss": 1.5861,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.1403005123138428,
"rewards/margins": 0.30394163727760315,
"rewards/rejected": -2.444242238998413,
"step": 2570
},
{
"epoch": 1.8587896253602305,
"grad_norm": 23.34142631480501,
"learning_rate": 1.8991280858651157e-08,
"logits/chosen": -1.974826455116272,
"logits/rejected": -1.968951940536499,
"logps/chosen": -1.0685060024261475,
"logps/rejected": -1.1608997583389282,
"loss": 1.684,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.137012004852295,
"rewards/margins": 0.1847873032093048,
"rewards/rejected": -2.3217995166778564,
"step": 2580
},
{
"epoch": 1.8659942363112392,
"grad_norm": 20.078022144210134,
"learning_rate": 1.8788031540572327e-08,
"logits/chosen": -1.9810216426849365,
"logits/rejected": -1.9728477001190186,
"logps/chosen": -1.0035111904144287,
"logps/rejected": -1.1611225605010986,
"loss": 1.5875,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0070223808288574,
"rewards/margins": 0.31522226333618164,
"rewards/rejected": -2.3222451210021973,
"step": 2590
},
{
"epoch": 1.8731988472622478,
"grad_norm": 20.268887631284258,
"learning_rate": 1.858521889822565e-08,
"logits/chosen": -1.9952818155288696,
"logits/rejected": -1.9979689121246338,
"logps/chosen": -0.977008044719696,
"logps/rejected": -1.093328595161438,
"loss": 1.6528,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -1.954016089439392,
"rewards/margins": 0.23264119029045105,
"rewards/rejected": -2.186657190322876,
"step": 2600
},
{
"epoch": 1.8804034582132565,
"grad_norm": 20.65067514824257,
"learning_rate": 1.8382857188502422e-08,
"logits/chosen": -1.9850199222564697,
"logits/rejected": -1.9801909923553467,
"logps/chosen": -0.9883711934089661,
"logps/rejected": -1.1270935535430908,
"loss": 1.6028,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.9767423868179321,
"rewards/margins": 0.2774447500705719,
"rewards/rejected": -2.2541871070861816,
"step": 2610
},
{
"epoch": 1.8876080691642652,
"grad_norm": 26.07557235786858,
"learning_rate": 1.8180960636595234e-08,
"logits/chosen": -1.9607089757919312,
"logits/rejected": -1.9584417343139648,
"logps/chosen": -1.0416334867477417,
"logps/rejected": -1.1955784559249878,
"loss": 1.5974,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.0832669734954834,
"rewards/margins": 0.3078901767730713,
"rewards/rejected": -2.3911569118499756,
"step": 2620
},
{
"epoch": 1.8948126801152738,
"grad_norm": 23.473893472630195,
"learning_rate": 1.7979543434998015e-08,
"logits/chosen": -2.029837131500244,
"logits/rejected": -2.034827709197998,
"logps/chosen": -1.1301052570343018,
"logps/rejected": -1.2223666906356812,
"loss": 1.6764,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.2602105140686035,
"rewards/margins": 0.18452298641204834,
"rewards/rejected": -2.4447333812713623,
"step": 2630
},
{
"epoch": 1.9020172910662825,
"grad_norm": 31.463222217407818,
"learning_rate": 1.7778619742508345e-08,
"logits/chosen": -1.9924646615982056,
"logits/rejected": -1.9858839511871338,
"logps/chosen": -1.1004579067230225,
"logps/rejected": -1.2025467157363892,
"loss": 1.6833,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.200915813446045,
"rewards/margins": 0.20417769253253937,
"rewards/rejected": -2.4050934314727783,
"step": 2640
},
{
"epoch": 1.9092219020172911,
"grad_norm": 27.42682717849318,
"learning_rate": 1.757820368323213e-08,
"logits/chosen": -1.9845936298370361,
"logits/rejected": -1.9750339984893799,
"logps/chosen": -1.110480785369873,
"logps/rejected": -1.2791945934295654,
"loss": 1.5742,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.220961570739746,
"rewards/margins": 0.33742767572402954,
"rewards/rejected": -2.558389186859131,
"step": 2650
},
{
"epoch": 1.9164265129682998,
"grad_norm": 25.747238627810656,
"learning_rate": 1.7378309345590803e-08,
"logits/chosen": -2.001929759979248,
"logits/rejected": -2.0115175247192383,
"logps/chosen": -1.091799020767212,
"logps/rejected": -1.242598533630371,
"loss": 1.6023,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.183598041534424,
"rewards/margins": 0.30159884691238403,
"rewards/rejected": -2.485197067260742,
"step": 2660
},
{
"epoch": 1.9236311239193085,
"grad_norm": 23.56149851612972,
"learning_rate": 1.717895078133088e-08,
"logits/chosen": -2.049691915512085,
"logits/rejected": -2.0456790924072266,
"logps/chosen": -1.0638725757598877,
"logps/rejected": -1.2155239582061768,
"loss": 1.5985,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.1277451515197754,
"rewards/margins": 0.3033028244972229,
"rewards/rejected": -2.4310479164123535,
"step": 2670
},
{
"epoch": 1.9308357348703171,
"grad_norm": 25.342673498874493,
"learning_rate": 1.698014200453624e-08,
"logits/chosen": -2.0035059452056885,
"logits/rejected": -2.0112483501434326,
"logps/chosen": -1.036697506904602,
"logps/rejected": -1.171596884727478,
"loss": 1.6083,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.073395013809204,
"rewards/margins": 0.26979896426200867,
"rewards/rejected": -2.343193769454956,
"step": 2680
},
{
"epoch": 1.9380403458213258,
"grad_norm": 29.2866917522338,
"learning_rate": 1.6781896990642964e-08,
"logits/chosen": -1.9394210577011108,
"logits/rejected": -1.9367926120758057,
"logps/chosen": -1.1517702341079712,
"logps/rejected": -1.2503753900527954,
"loss": 1.6757,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.3035404682159424,
"rewards/margins": 0.19721055030822754,
"rewards/rejected": -2.500750780105591,
"step": 2690
},
{
"epoch": 1.9452449567723344,
"grad_norm": 27.895712891853,
"learning_rate": 1.658422967545693e-08,
"logits/chosen": -2.044224262237549,
"logits/rejected": -2.031148672103882,
"logps/chosen": -1.0125457048416138,
"logps/rejected": -1.1346945762634277,
"loss": 1.6456,
"rewards/accuracies": 0.5,
"rewards/chosen": -2.0250914096832275,
"rewards/margins": 0.24429789185523987,
"rewards/rejected": -2.2693891525268555,
"step": 2700
},
{
"epoch": 1.952449567723343,
"grad_norm": 25.265909243180825,
"learning_rate": 1.638715395417418e-08,
"logits/chosen": -2.0184195041656494,
"logits/rejected": -2.016491413116455,
"logps/chosen": -1.0754286050796509,
"logps/rejected": -1.220522165298462,
"loss": 1.6055,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.1508572101593018,
"rewards/margins": 0.2901870906352997,
"rewards/rejected": -2.441044330596924,
"step": 2710
},
{
"epoch": 1.9596541786743515,
"grad_norm": 26.14092408303687,
"learning_rate": 1.619068368040416e-08,
"logits/chosen": -2.014909267425537,
"logits/rejected": -2.0107204914093018,
"logps/chosen": -1.0064074993133545,
"logps/rejected": -1.1946723461151123,
"loss": 1.5392,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.012814998626709,
"rewards/margins": 0.37652960419654846,
"rewards/rejected": -2.3893446922302246,
"step": 2720
},
{
"epoch": 1.9668587896253602,
"grad_norm": 20.308279158480953,
"learning_rate": 1.5994832665195853e-08,
"logits/chosen": -1.9590470790863037,
"logits/rejected": -1.9595496654510498,
"logps/chosen": -1.0389412641525269,
"logps/rejected": -1.1585092544555664,
"loss": 1.6408,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.0778825283050537,
"rewards/margins": 0.23913617432117462,
"rewards/rejected": -2.317018508911133,
"step": 2730
},
{
"epoch": 1.9740634005763689,
"grad_norm": 24.52856159698312,
"learning_rate": 1.5799614676066906e-08,
"logits/chosen": -2.0611324310302734,
"logits/rejected": -2.0581214427948,
"logps/chosen": -0.9563993215560913,
"logps/rejected": -1.104172706604004,
"loss": 1.5929,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.9127986431121826,
"rewards/margins": 0.2955467998981476,
"rewards/rejected": -2.208345413208008,
"step": 2740
},
{
"epoch": 1.9812680115273775,
"grad_norm": 18.949757053781095,
"learning_rate": 1.560504343603587e-08,
"logits/chosen": -1.973333716392517,
"logits/rejected": -1.9741255044937134,
"logps/chosen": -1.0730575323104858,
"logps/rejected": -1.2376629114151,
"loss": 1.576,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.1461150646209717,
"rewards/margins": 0.32921046018600464,
"rewards/rejected": -2.4753258228302,
"step": 2750
},
{
"epoch": 1.9884726224783862,
"grad_norm": 22.302877835841354,
"learning_rate": 1.541113262265748e-08,
"logits/chosen": -2.061547040939331,
"logits/rejected": -2.0595688819885254,
"logps/chosen": -1.0353928804397583,
"logps/rejected": -1.1622211933135986,
"loss": 1.6306,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0707857608795166,
"rewards/margins": 0.2536565363407135,
"rewards/rejected": -2.3244423866271973,
"step": 2760
},
{
"epoch": 1.9956772334293948,
"grad_norm": 30.115588938614223,
"learning_rate": 1.5217895867061227e-08,
"logits/chosen": -2.0031933784484863,
"logits/rejected": -1.9973207712173462,
"logps/chosen": -1.0904591083526611,
"logps/rejected": -1.1972274780273438,
"loss": 1.6758,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.1809182167053223,
"rewards/margins": 0.21353654563426971,
"rewards/rejected": -2.3944549560546875,
"step": 2770
},
{
"epoch": 2.0028818443804033,
"grad_norm": 26.508187485308607,
"learning_rate": 1.5025346752993098e-08,
"logits/chosen": -1.9899861812591553,
"logits/rejected": -1.9919507503509521,
"logps/chosen": -1.0795637369155884,
"logps/rejected": -1.2126731872558594,
"loss": 1.6285,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.1591274738311768,
"rewards/margins": 0.26621872186660767,
"rewards/rejected": -2.4253463745117188,
"step": 2780
},
{
"epoch": 2.010086455331412,
"grad_norm": 28.64222881063969,
"learning_rate": 1.4833498815860756e-08,
"logits/chosen": -2.044374942779541,
"logits/rejected": -2.0468382835388184,
"logps/chosen": -1.0054826736450195,
"logps/rejected": -1.2021020650863647,
"loss": 1.5559,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.010965347290039,
"rewards/margins": 0.3932386338710785,
"rewards/rejected": -2.4042041301727295,
"step": 2790
},
{
"epoch": 2.0172910662824206,
"grad_norm": 21.462192948449626,
"learning_rate": 1.4642365541781993e-08,
"logits/chosen": -1.9534351825714111,
"logits/rejected": -1.9449405670166016,
"logps/chosen": -1.0361294746398926,
"logps/rejected": -1.2111116647720337,
"loss": 1.5658,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.072258949279785,
"rewards/margins": 0.34996432065963745,
"rewards/rejected": -2.4222233295440674,
"step": 2800
},
{
"epoch": 2.0244956772334293,
"grad_norm": 20.76859057795762,
"learning_rate": 1.4451960366636745e-08,
"logits/chosen": -2.0219428539276123,
"logits/rejected": -2.033188819885254,
"logps/chosen": -1.0458028316497803,
"logps/rejected": -1.1884043216705322,
"loss": 1.605,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0916056632995605,
"rewards/margins": 0.28520235419273376,
"rewards/rejected": -2.3768086433410645,
"step": 2810
},
{
"epoch": 2.031700288184438,
"grad_norm": 23.353169437189916,
"learning_rate": 1.4262296675122592e-08,
"logits/chosen": -2.0090036392211914,
"logits/rejected": -2.005481243133545,
"logps/chosen": -1.0371921062469482,
"logps/rejected": -1.2073792219161987,
"loss": 1.5646,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.0743842124938965,
"rewards/margins": 0.3403744101524353,
"rewards/rejected": -2.4147584438323975,
"step": 2820
},
{
"epoch": 2.0389048991354466,
"grad_norm": 23.121114498464102,
"learning_rate": 1.407338779981389e-08,
"logits/chosen": -1.9874942302703857,
"logits/rejected": -1.9855642318725586,
"logps/chosen": -0.9219571948051453,
"logps/rejected": -1.1140168905258179,
"loss": 1.5213,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.8439143896102905,
"rewards/margins": 0.3841188848018646,
"rewards/rejected": -2.2280337810516357,
"step": 2830
},
{
"epoch": 2.0461095100864553,
"grad_norm": 25.25341321880136,
"learning_rate": 1.3885247020224534e-08,
"logits/chosen": -1.9960615634918213,
"logits/rejected": -1.9914722442626953,
"logps/chosen": -1.009881854057312,
"logps/rejected": -1.1496226787567139,
"loss": 1.6128,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.019763708114624,
"rewards/margins": 0.2794816493988037,
"rewards/rejected": -2.2992453575134277,
"step": 2840
},
{
"epoch": 2.053314121037464,
"grad_norm": 20.971301133081738,
"learning_rate": 1.369788756187445e-08,
"logits/chosen": -2.0032501220703125,
"logits/rejected": -2.0000967979431152,
"logps/chosen": -1.0347437858581543,
"logps/rejected": -1.1370530128479004,
"loss": 1.6687,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.0694875717163086,
"rewards/margins": 0.20461849868297577,
"rewards/rejected": -2.274106025695801,
"step": 2850
},
{
"epoch": 2.0605187319884726,
"grad_norm": 21.9775203889279,
"learning_rate": 1.3511322595359925e-08,
"logits/chosen": -2.033581256866455,
"logits/rejected": -2.0254645347595215,
"logps/chosen": -0.944128692150116,
"logps/rejected": -1.125150442123413,
"loss": 1.5476,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.888257384300232,
"rewards/margins": 0.3620434105396271,
"rewards/rejected": -2.250300884246826,
"step": 2860
},
{
"epoch": 2.0677233429394812,
"grad_norm": 21.063191206550247,
"learning_rate": 1.3325565235427716e-08,
"logits/chosen": -2.0236756801605225,
"logits/rejected": -2.0223705768585205,
"logps/chosen": -0.9903309941291809,
"logps/rejected": -1.145707607269287,
"loss": 1.5929,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.9806619882583618,
"rewards/margins": 0.31075319647789,
"rewards/rejected": -2.291415214538574,
"step": 2870
},
{
"epoch": 2.07492795389049,
"grad_norm": 21.584640327110584,
"learning_rate": 1.3140628540053218e-08,
"logits/chosen": -1.986659049987793,
"logits/rejected": -1.9890410900115967,
"logps/chosen": -0.9793311953544617,
"logps/rejected": -1.123835802078247,
"loss": 1.6021,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.9586623907089233,
"rewards/margins": 0.289009153842926,
"rewards/rejected": -2.247671604156494,
"step": 2880
},
{
"epoch": 2.0821325648414986,
"grad_norm": 22.942871845377027,
"learning_rate": 1.2956525509522451e-08,
"logits/chosen": -1.971453309059143,
"logits/rejected": -1.971011757850647,
"logps/chosen": -1.1158349514007568,
"logps/rejected": -1.2264450788497925,
"loss": 1.6636,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.2316699028015137,
"rewards/margins": 0.22122013568878174,
"rewards/rejected": -2.452890157699585,
"step": 2890
},
{
"epoch": 2.089337175792507,
"grad_norm": 23.820159972911735,
"learning_rate": 1.2773269085518267e-08,
"logits/chosen": -2.0029759407043457,
"logits/rejected": -2.004696846008301,
"logps/chosen": -1.0825508832931519,
"logps/rejected": -1.220155119895935,
"loss": 1.6086,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.1651017665863037,
"rewards/margins": 0.2752082049846649,
"rewards/rejected": -2.44031023979187,
"step": 2900
},
{
"epoch": 2.096541786743516,
"grad_norm": 26.46761426332852,
"learning_rate": 1.2590872150210574e-08,
"logits/chosen": -2.0609946250915527,
"logits/rejected": -2.054304599761963,
"logps/chosen": -1.067638635635376,
"logps/rejected": -1.1893014907836914,
"loss": 1.6485,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.135277271270752,
"rewards/margins": 0.24332574009895325,
"rewards/rejected": -2.378602981567383,
"step": 2910
},
{
"epoch": 2.1037463976945245,
"grad_norm": 24.717381658629428,
"learning_rate": 1.2409347525350775e-08,
"logits/chosen": -2.024118185043335,
"logits/rejected": -2.014434337615967,
"logps/chosen": -1.114039659500122,
"logps/rejected": -1.2736414670944214,
"loss": 1.5805,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.228079319000244,
"rewards/margins": 0.31920376420021057,
"rewards/rejected": -2.5472829341888428,
"step": 2920
},
{
"epoch": 2.110951008645533,
"grad_norm": 26.147515087768657,
"learning_rate": 1.2228707971370421e-08,
"logits/chosen": -2.016010284423828,
"logits/rejected": -2.009105682373047,
"logps/chosen": -0.9981368184089661,
"logps/rejected": -1.1219263076782227,
"loss": 1.6463,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.9962736368179321,
"rewards/margins": 0.24757930636405945,
"rewards/rejected": -2.2438526153564453,
"step": 2930
},
{
"epoch": 2.118155619596542,
"grad_norm": 25.906757042346783,
"learning_rate": 1.2048966186484282e-08,
"logits/chosen": -2.010725498199463,
"logits/rejected": -1.994248628616333,
"logps/chosen": -1.1241233348846436,
"logps/rejected": -1.2454028129577637,
"loss": 1.6413,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.248246669769287,
"rewards/margins": 0.2425590455532074,
"rewards/rejected": -2.4908056259155273,
"step": 2940
},
{
"epoch": 2.1253602305475505,
"grad_norm": 34.07975709431921,
"learning_rate": 1.187013480579762e-08,
"logits/chosen": -2.0079312324523926,
"logits/rejected": -2.010749340057373,
"logps/chosen": -1.0515167713165283,
"logps/rejected": -1.1958991289138794,
"loss": 1.6215,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.1030335426330566,
"rewards/margins": 0.2887645661830902,
"rewards/rejected": -2.391798257827759,
"step": 2950
},
{
"epoch": 2.132564841498559,
"grad_norm": 48.68658598001818,
"learning_rate": 1.1692226400418073e-08,
"logits/chosen": -1.9440683126449585,
"logits/rejected": -1.9426988363265991,
"logps/chosen": -1.0936279296875,
"logps/rejected": -1.2332271337509155,
"loss": 1.6411,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.187255859375,
"rewards/margins": 0.27919843792915344,
"rewards/rejected": -2.466454267501831,
"step": 2960
},
{
"epoch": 2.139769452449568,
"grad_norm": 20.37382547923442,
"learning_rate": 1.1515253476571923e-08,
"logits/chosen": -1.9710067510604858,
"logits/rejected": -1.9651463031768799,
"logps/chosen": -1.017547845840454,
"logps/rejected": -1.2117187976837158,
"loss": 1.5272,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.035095691680908,
"rewards/margins": 0.3883420526981354,
"rewards/rejected": -2.4234375953674316,
"step": 2970
},
{
"epoch": 2.1469740634005765,
"grad_norm": 23.62593123554341,
"learning_rate": 1.133922847472496e-08,
"logits/chosen": -1.9900035858154297,
"logits/rejected": -1.9909718036651611,
"logps/chosen": -1.1151740550994873,
"logps/rejected": -1.224129319190979,
"loss": 1.6777,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.2303481101989746,
"rewards/margins": 0.2179100066423416,
"rewards/rejected": -2.448258638381958,
"step": 2980
},
{
"epoch": 2.154178674351585,
"grad_norm": 27.11626352654653,
"learning_rate": 1.1164163768707952e-08,
"logits/chosen": -1.9934008121490479,
"logits/rejected": -1.9880987405776978,
"logps/chosen": -1.01137375831604,
"logps/rejected": -1.1608995199203491,
"loss": 1.6045,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.02274751663208,
"rewards/margins": 0.29905155301094055,
"rewards/rejected": -2.3217990398406982,
"step": 2990
},
{
"epoch": 2.161383285302594,
"grad_norm": 21.186985987415447,
"learning_rate": 1.0990071664846861e-08,
"logits/chosen": -1.9735314846038818,
"logits/rejected": -1.972663164138794,
"logps/chosen": -1.0251822471618652,
"logps/rejected": -1.2192916870117188,
"loss": 1.5629,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0503644943237305,
"rewards/margins": 0.38821902871131897,
"rewards/rejected": -2.4385833740234375,
"step": 3000
},
{
"epoch": 2.1685878962536025,
"grad_norm": 22.398067779662192,
"learning_rate": 1.0816964401097739e-08,
"logits/chosen": -1.9556314945220947,
"logits/rejected": -1.9524948596954346,
"logps/chosen": -0.9635698199272156,
"logps/rejected": -1.0958101749420166,
"loss": 1.6309,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9271396398544312,
"rewards/margins": 0.2644805312156677,
"rewards/rejected": -2.191620349884033,
"step": 3010
},
{
"epoch": 2.175792507204611,
"grad_norm": 24.597936445600215,
"learning_rate": 1.0644854146186406e-08,
"logits/chosen": -2.0202081203460693,
"logits/rejected": -2.0140926837921143,
"logps/chosen": -1.034292459487915,
"logps/rejected": -1.203952431678772,
"loss": 1.5761,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.06858491897583,
"rewards/margins": 0.33932000398635864,
"rewards/rejected": -2.407904863357544,
"step": 3020
},
{
"epoch": 2.18299711815562,
"grad_norm": 22.562346833112592,
"learning_rate": 1.0473752998753114e-08,
"logits/chosen": -1.9993441104888916,
"logits/rejected": -1.9910335540771484,
"logps/chosen": -1.0249227285385132,
"logps/rejected": -1.1977559328079224,
"loss": 1.5632,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.0498454570770264,
"rewards/margins": 0.3456666171550751,
"rewards/rejected": -2.3955118656158447,
"step": 3030
},
{
"epoch": 2.1902017291066285,
"grad_norm": 23.766859260147648,
"learning_rate": 1.030367298650201e-08,
"logits/chosen": -2.01485013961792,
"logits/rejected": -2.0149147510528564,
"logps/chosen": -1.0467994213104248,
"logps/rejected": -1.208519697189331,
"loss": 1.5783,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.0935988426208496,
"rewards/margins": 0.32344070076942444,
"rewards/rejected": -2.417039394378662,
"step": 3040
},
{
"epoch": 2.1974063400576367,
"grad_norm": 25.521958874894615,
"learning_rate": 1.0134626065355675e-08,
"logits/chosen": -2.0649075508117676,
"logits/rejected": -2.0617613792419434,
"logps/chosen": -1.0284671783447266,
"logps/rejected": -1.1858323812484741,
"loss": 1.6044,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.056934356689453,
"rewards/margins": 0.31473028659820557,
"rewards/rejected": -2.3716647624969482,
"step": 3050
},
{
"epoch": 2.2046109510086453,
"grad_norm": 23.646119810658234,
"learning_rate": 9.966624118614611e-09,
"logits/chosen": -2.0089523792266846,
"logits/rejected": -2.004260778427124,
"logps/chosen": -1.0687427520751953,
"logps/rejected": -1.2249877452850342,
"loss": 1.6068,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.1374855041503906,
"rewards/margins": 0.31248974800109863,
"rewards/rejected": -2.4499754905700684,
"step": 3060
},
{
"epoch": 2.211815561959654,
"grad_norm": 18.119200799487714,
"learning_rate": 9.799678956121976e-09,
"logits/chosen": -1.9656593799591064,
"logits/rejected": -1.961615800857544,
"logps/chosen": -1.0386877059936523,
"logps/rejected": -1.1543315649032593,
"loss": 1.6312,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.0773754119873047,
"rewards/margins": 0.2312876284122467,
"rewards/rejected": -2.3086631298065186,
"step": 3070
},
{
"epoch": 2.2190201729106627,
"grad_norm": 27.943065266424814,
"learning_rate": 9.633802313433314e-09,
"logits/chosen": -1.9396989345550537,
"logits/rejected": -1.9456799030303955,
"logps/chosen": -1.026071310043335,
"logps/rejected": -1.1421881914138794,
"loss": 1.6325,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.05214262008667,
"rewards/margins": 0.23223355412483215,
"rewards/rejected": -2.284376382827759,
"step": 3080
},
{
"epoch": 2.2262247838616713,
"grad_norm": 24.15226654416461,
"learning_rate": 9.469005850991705e-09,
"logits/chosen": -2.003962278366089,
"logits/rejected": -1.9982340335845947,
"logps/chosen": -1.0210195779800415,
"logps/rejected": -1.1483935117721558,
"loss": 1.6568,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.042039155960083,
"rewards/margins": 0.25474798679351807,
"rewards/rejected": -2.2967870235443115,
"step": 3090
},
{
"epoch": 2.23342939481268,
"grad_norm": 23.10826240141973,
"learning_rate": 9.305301153307949e-09,
"logits/chosen": -2.0031771659851074,
"logits/rejected": -2.0111162662506104,
"logps/chosen": -0.9573473930358887,
"logps/rejected": -1.1340956687927246,
"loss": 1.5703,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.9146947860717773,
"rewards/margins": 0.35349661111831665,
"rewards/rejected": -2.268191337585449,
"step": 3100
},
{
"epoch": 2.2406340057636887,
"grad_norm": 22.536059648116893,
"learning_rate": 9.142699728146336e-09,
"logits/chosen": -1.9767051935195923,
"logits/rejected": -1.9701921939849854,
"logps/chosen": -1.040269136428833,
"logps/rejected": -1.1842563152313232,
"loss": 1.6209,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.080538272857666,
"rewards/margins": 0.28797417879104614,
"rewards/rejected": -2.3685126304626465,
"step": 3110
},
{
"epoch": 2.2478386167146973,
"grad_norm": 21.32782505584108,
"learning_rate": 8.981213005715627e-09,
"logits/chosen": -1.9988371133804321,
"logits/rejected": -2.0019822120666504,
"logps/chosen": -1.000981092453003,
"logps/rejected": -1.1834334135055542,
"loss": 1.5615,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.001962184906006,
"rewards/margins": 0.36490458250045776,
"rewards/rejected": -2.3668668270111084,
"step": 3120
},
{
"epoch": 2.255043227665706,
"grad_norm": 26.386255833154394,
"learning_rate": 8.820852337865611e-09,
"logits/chosen": -2.0248231887817383,
"logits/rejected": -2.0212666988372803,
"logps/chosen": -1.0024456977844238,
"logps/rejected": -1.1622812747955322,
"loss": 1.5876,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0048913955688477,
"rewards/margins": 0.3196714520454407,
"rewards/rejected": -2.3245625495910645,
"step": 3130
},
{
"epoch": 2.2622478386167146,
"grad_norm": 21.331730880197714,
"learning_rate": 8.661628997289044e-09,
"logits/chosen": -1.9632513523101807,
"logits/rejected": -1.9592435359954834,
"logps/chosen": -1.0223203897476196,
"logps/rejected": -1.1876834630966187,
"loss": 1.5872,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.0446407794952393,
"rewards/margins": 0.3307264447212219,
"rewards/rejected": -2.3753669261932373,
"step": 3140
},
{
"epoch": 2.2694524495677233,
"grad_norm": 19.742128897565276,
"learning_rate": 8.503554176729341e-09,
"logits/chosen": -1.9689233303070068,
"logits/rejected": -1.9675970077514648,
"logps/chosen": -1.0326251983642578,
"logps/rejected": -1.2057253122329712,
"loss": 1.5801,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0652503967285156,
"rewards/margins": 0.34619995951652527,
"rewards/rejected": -2.4114506244659424,
"step": 3150
},
{
"epoch": 2.276657060518732,
"grad_norm": 28.382864078102244,
"learning_rate": 8.346638988193636e-09,
"logits/chosen": -1.9992077350616455,
"logits/rejected": -1.9941097497940063,
"logps/chosen": -0.9314563870429993,
"logps/rejected": -1.094285249710083,
"loss": 1.5924,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.8629127740859985,
"rewards/margins": 0.3256576359272003,
"rewards/rejected": -2.188570499420166,
"step": 3160
},
{
"epoch": 2.2838616714697406,
"grad_norm": 27.633265489696246,
"learning_rate": 8.19089446217176e-09,
"logits/chosen": -1.9723091125488281,
"logits/rejected": -1.9623254537582397,
"logps/chosen": -1.0091623067855835,
"logps/rejected": -1.2127363681793213,
"loss": 1.5228,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.018324613571167,
"rewards/margins": 0.40714770555496216,
"rewards/rejected": -2.4254727363586426,
"step": 3170
},
{
"epoch": 2.2910662824207493,
"grad_norm": 20.132916808178752,
"learning_rate": 8.036331546860777e-09,
"logits/chosen": -1.9806181192398071,
"logits/rejected": -1.9802868366241455,
"logps/chosen": -0.9595286250114441,
"logps/rejected": -1.0544369220733643,
"loss": 1.6841,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.9190572500228882,
"rewards/margins": 0.18981659412384033,
"rewards/rejected": -2.1088738441467285,
"step": 3180
},
{
"epoch": 2.298270893371758,
"grad_norm": 26.876429643756662,
"learning_rate": 7.882961107395416e-09,
"logits/chosen": -1.986196517944336,
"logits/rejected": -1.9807474613189697,
"logps/chosen": -1.1408748626708984,
"logps/rejected": -1.1967402696609497,
"loss": 1.7545,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.281749725341797,
"rewards/margins": 0.11173073202371597,
"rewards/rejected": -2.3934805393218994,
"step": 3190
},
{
"epoch": 2.3054755043227666,
"grad_norm": 30.561781481040974,
"learning_rate": 7.73079392508428e-09,
"logits/chosen": -1.961582899093628,
"logits/rejected": -1.9609956741333008,
"logps/chosen": -1.0990729331970215,
"logps/rejected": -1.302329659461975,
"loss": 1.5607,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.198145866394043,
"rewards/margins": 0.4065133035182953,
"rewards/rejected": -2.60465931892395,
"step": 3200
},
{
"epoch": 2.3126801152737753,
"grad_norm": 26.156879570027353,
"learning_rate": 7.579840696651938e-09,
"logits/chosen": -1.9922736883163452,
"logits/rejected": -1.989356279373169,
"logps/chosen": -1.057533621788025,
"logps/rejected": -1.1920627355575562,
"loss": 1.63,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.11506724357605,
"rewards/margins": 0.2690581679344177,
"rewards/rejected": -2.3841254711151123,
"step": 3210
},
{
"epoch": 2.319884726224784,
"grad_norm": 27.66473676438466,
"learning_rate": 7.43011203348704e-09,
"logits/chosen": -1.9100353717803955,
"logits/rejected": -1.9067933559417725,
"logps/chosen": -1.061374306678772,
"logps/rejected": -1.1472381353378296,
"loss": 1.702,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.122748613357544,
"rewards/margins": 0.17172771692276,
"rewards/rejected": -2.294476270675659,
"step": 3220
},
{
"epoch": 2.3270893371757926,
"grad_norm": 22.452700867913006,
"learning_rate": 7.281618460896344e-09,
"logits/chosen": -1.9864110946655273,
"logits/rejected": -1.9839799404144287,
"logps/chosen": -0.9730486869812012,
"logps/rejected": -1.126781702041626,
"loss": 1.59,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9460973739624023,
"rewards/margins": 0.30746573209762573,
"rewards/rejected": -2.253563404083252,
"step": 3230
},
{
"epoch": 2.3342939481268012,
"grad_norm": 24.011321706312618,
"learning_rate": 7.134370417364849e-09,
"logits/chosen": -1.9599173069000244,
"logits/rejected": -1.9596458673477173,
"logps/chosen": -1.0112863779067993,
"logps/rejected": -1.159676194190979,
"loss": 1.6227,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.0225727558135986,
"rewards/margins": 0.29677945375442505,
"rewards/rejected": -2.319352388381958,
"step": 3240
},
{
"epoch": 2.34149855907781,
"grad_norm": 27.917867695174607,
"learning_rate": 6.988378253821981e-09,
"logits/chosen": -1.966631293296814,
"logits/rejected": -1.9656906127929688,
"logps/chosen": -1.0303471088409424,
"logps/rejected": -1.1579290628433228,
"loss": 1.6334,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0606942176818848,
"rewards/margins": 0.2551640570163727,
"rewards/rejected": -2.3158581256866455,
"step": 3250
},
{
"epoch": 2.3487031700288186,
"grad_norm": 24.28298696614824,
"learning_rate": 6.8436522329140186e-09,
"logits/chosen": -1.9699004888534546,
"logits/rejected": -1.9766439199447632,
"logps/chosen": -1.0417693853378296,
"logps/rejected": -1.175391435623169,
"loss": 1.6358,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.083538770675659,
"rewards/margins": 0.2672441601753235,
"rewards/rejected": -2.350782871246338,
"step": 3260
},
{
"epoch": 2.3559077809798272,
"grad_norm": 26.189687917994434,
"learning_rate": 6.700202528282603e-09,
"logits/chosen": -1.9680284261703491,
"logits/rejected": -1.9585533142089844,
"logps/chosen": -1.0375484228134155,
"logps/rejected": -1.165076494216919,
"loss": 1.6363,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.075096845626831,
"rewards/margins": 0.25505581498146057,
"rewards/rejected": -2.330152988433838,
"step": 3270
},
{
"epoch": 2.363112391930836,
"grad_norm": 27.52676939865246,
"learning_rate": 6.558039223849668e-09,
"logits/chosen": -2.0244338512420654,
"logits/rejected": -2.014971971511841,
"logps/chosen": -1.044699788093567,
"logps/rejected": -1.2688946723937988,
"loss": 1.5174,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.089399576187134,
"rewards/margins": 0.4483897089958191,
"rewards/rejected": -2.5377893447875977,
"step": 3280
},
{
"epoch": 2.3703170028818445,
"grad_norm": 26.223585423550503,
"learning_rate": 6.417172313108471e-09,
"logits/chosen": -1.9503448009490967,
"logits/rejected": -1.9449748992919922,
"logps/chosen": -0.9975979924201965,
"logps/rejected": -1.136448860168457,
"loss": 1.6211,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.995195984840393,
"rewards/margins": 0.2777020335197449,
"rewards/rejected": -2.272897720336914,
"step": 3290
},
{
"epoch": 2.377521613832853,
"grad_norm": 25.653460005188553,
"learning_rate": 6.277611698421179e-09,
"logits/chosen": -2.0117154121398926,
"logits/rejected": -2.003696918487549,
"logps/chosen": -0.9123650789260864,
"logps/rejected": -1.1209781169891357,
"loss": 1.5311,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.8247301578521729,
"rewards/margins": 0.4172258973121643,
"rewards/rejected": -2.2419562339782715,
"step": 3300
},
{
"epoch": 2.3847262247838614,
"grad_norm": 27.5322217958583,
"learning_rate": 6.139367190322714e-09,
"logits/chosen": -2.0015804767608643,
"logits/rejected": -2.0014004707336426,
"logps/chosen": -1.0666191577911377,
"logps/rejected": -1.2348202466964722,
"loss": 1.579,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.1332383155822754,
"rewards/margins": 0.3364020586013794,
"rewards/rejected": -2.4696404933929443,
"step": 3310
},
{
"epoch": 2.39193083573487,
"grad_norm": 19.961391280944266,
"learning_rate": 6.002448506831171e-09,
"logits/chosen": -1.9937254190444946,
"logits/rejected": -1.9888588190078735,
"logps/chosen": -0.9895680546760559,
"logps/rejected": -1.1448198556900024,
"loss": 1.5893,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.9791361093521118,
"rewards/margins": 0.31050366163253784,
"rewards/rejected": -2.289639711380005,
"step": 3320
},
{
"epoch": 2.3991354466858787,
"grad_norm": 21.81418154842677,
"learning_rate": 5.866865272764607e-09,
"logits/chosen": -2.01529598236084,
"logits/rejected": -2.015200138092041,
"logps/chosen": -1.0282292366027832,
"logps/rejected": -1.182253360748291,
"loss": 1.5975,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0564584732055664,
"rewards/margins": 0.3080483078956604,
"rewards/rejected": -2.364506721496582,
"step": 3330
},
{
"epoch": 2.4063400576368874,
"grad_norm": 28.637110711539222,
"learning_rate": 5.7326270190645595e-09,
"logits/chosen": -1.8906781673431396,
"logits/rejected": -1.8924648761749268,
"logps/chosen": -1.0696254968643188,
"logps/rejected": -1.1889064311981201,
"loss": 1.6439,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1392509937286377,
"rewards/margins": 0.23856201767921448,
"rewards/rejected": -2.3778128623962402,
"step": 3340
},
{
"epoch": 2.413544668587896,
"grad_norm": 21.957493188360655,
"learning_rate": 5.599743182125938e-09,
"logits/chosen": -2.0409464836120605,
"logits/rejected": -2.0412380695343018,
"logps/chosen": -1.0565509796142578,
"logps/rejected": -1.20218825340271,
"loss": 1.6005,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.1131019592285156,
"rewards/margins": 0.2912743389606476,
"rewards/rejected": -2.40437650680542,
"step": 3350
},
{
"epoch": 2.4207492795389047,
"grad_norm": 24.603451670604123,
"learning_rate": 5.46822310313379e-09,
"logits/chosen": -2.0415310859680176,
"logits/rejected": -2.0513522624969482,
"logps/chosen": -1.1003459692001343,
"logps/rejected": -1.2101898193359375,
"loss": 1.6691,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.2006919384002686,
"rewards/margins": 0.21968770027160645,
"rewards/rejected": -2.420379638671875,
"step": 3360
},
{
"epoch": 2.4279538904899134,
"grad_norm": 24.665272996536196,
"learning_rate": 5.33807602740658e-09,
"logits/chosen": -2.0148816108703613,
"logits/rejected": -2.008512258529663,
"logps/chosen": -0.9646922945976257,
"logps/rejected": -1.1866120100021362,
"loss": 1.5074,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.9293845891952515,
"rewards/margins": 0.44383955001831055,
"rewards/rejected": -2.3732240200042725,
"step": 3370
},
{
"epoch": 2.435158501440922,
"grad_norm": 24.796160501653265,
"learning_rate": 5.209311103746334e-09,
"logits/chosen": -1.9915498495101929,
"logits/rejected": -1.9920551776885986,
"logps/chosen": -1.060661792755127,
"logps/rejected": -1.2435554265975952,
"loss": 1.5707,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.121323585510254,
"rewards/margins": 0.3657872676849365,
"rewards/rejected": -2.4871108531951904,
"step": 3380
},
{
"epoch": 2.4423631123919307,
"grad_norm": 29.011430928388492,
"learning_rate": 5.081937383795484e-09,
"logits/chosen": -1.9661788940429688,
"logits/rejected": -1.9654123783111572,
"logps/chosen": -0.9790254831314087,
"logps/rejected": -1.157768964767456,
"loss": 1.5575,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.9580509662628174,
"rewards/margins": 0.35748690366744995,
"rewards/rejected": -2.315537929534912,
"step": 3390
},
{
"epoch": 2.4495677233429394,
"grad_norm": 22.287201293983628,
"learning_rate": 4.955963821400599e-09,
"logits/chosen": -2.015843152999878,
"logits/rejected": -2.010300397872925,
"logps/chosen": -1.03963041305542,
"logps/rejected": -1.1898655891418457,
"loss": 1.6122,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.07926082611084,
"rewards/margins": 0.30047017335891724,
"rewards/rejected": -2.3797311782836914,
"step": 3400
},
{
"epoch": 2.456772334293948,
"grad_norm": 18.680236697184657,
"learning_rate": 4.831399271982928e-09,
"logits/chosen": -1.9459985494613647,
"logits/rejected": -1.9379857778549194,
"logps/chosen": -1.0498692989349365,
"logps/rejected": -1.19268000125885,
"loss": 1.631,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.099738597869873,
"rewards/margins": 0.2856215536594391,
"rewards/rejected": -2.3853600025177,
"step": 3410
},
{
"epoch": 2.4639769452449567,
"grad_norm": 30.249393666448327,
"learning_rate": 4.708252491915951e-09,
"logits/chosen": -2.021432399749756,
"logits/rejected": -2.015430450439453,
"logps/chosen": -1.0544407367706299,
"logps/rejected": -1.2139813899993896,
"loss": 1.6108,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.1088814735412598,
"rewards/margins": 0.31908124685287476,
"rewards/rejected": -2.4279627799987793,
"step": 3420
},
{
"epoch": 2.4711815561959654,
"grad_norm": 29.931099930216636,
"learning_rate": 4.58653213790981e-09,
"logits/chosen": -2.000617265701294,
"logits/rejected": -1.993115782737732,
"logps/chosen": -1.0341386795043945,
"logps/rejected": -1.194247841835022,
"loss": 1.5948,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.068277359008789,
"rewards/margins": 0.32021820545196533,
"rewards/rejected": -2.388495683670044,
"step": 3430
},
{
"epoch": 2.478386167146974,
"grad_norm": 22.79733837458826,
"learning_rate": 4.466246766402773e-09,
"logits/chosen": -1.9823423624038696,
"logits/rejected": -1.9762611389160156,
"logps/chosen": -1.047199010848999,
"logps/rejected": -1.2147526741027832,
"loss": 1.5942,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.094398021697998,
"rewards/margins": 0.33510738611221313,
"rewards/rejected": -2.4295053482055664,
"step": 3440
},
{
"epoch": 2.4855907780979827,
"grad_norm": 27.618677755187758,
"learning_rate": 4.347404832959775e-09,
"logits/chosen": -2.0297904014587402,
"logits/rejected": -2.0301997661590576,
"logps/chosen": -1.0434116125106812,
"logps/rejected": -1.2150681018829346,
"loss": 1.5743,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.0868232250213623,
"rewards/margins": 0.3433128595352173,
"rewards/rejected": -2.430136203765869,
"step": 3450
},
{
"epoch": 2.4927953890489913,
"grad_norm": 37.06042245982399,
"learning_rate": 4.230014691678016e-09,
"logits/chosen": -1.9870338439941406,
"logits/rejected": -1.9875684976577759,
"logps/chosen": -1.070052146911621,
"logps/rejected": -1.1430634260177612,
"loss": 1.7119,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.140104293823242,
"rewards/margins": 0.14602291584014893,
"rewards/rejected": -2.2861268520355225,
"step": 3460
},
{
"epoch": 2.5,
"grad_norm": 21.559521464152546,
"learning_rate": 4.114084594599707e-09,
"logits/chosen": -1.9837043285369873,
"logits/rejected": -1.9837700128555298,
"logps/chosen": -1.0196425914764404,
"logps/rejected": -1.254660725593567,
"loss": 1.4967,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.039285182952881,
"rewards/margins": 0.47003644704818726,
"rewards/rejected": -2.509321451187134,
"step": 3470
},
{
"epoch": 2.5072046109510087,
"grad_norm": 25.359081837639064,
"learning_rate": 3.9996226911319546e-09,
"logits/chosen": -1.9840672016143799,
"logits/rejected": -1.9717687368392944,
"logps/chosen": -1.0251834392547607,
"logps/rejected": -1.1647298336029053,
"loss": 1.6113,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0503668785095215,
"rewards/margins": 0.2790928781032562,
"rewards/rejected": -2.3294596672058105,
"step": 3480
},
{
"epoch": 2.5144092219020173,
"grad_norm": 23.21571489418676,
"learning_rate": 3.886637027473949e-09,
"logits/chosen": -1.9924989938735962,
"logits/rejected": -1.9947795867919922,
"logps/chosen": -1.085985779762268,
"logps/rejected": -1.2627681493759155,
"loss": 1.5681,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.171971559524536,
"rewards/margins": 0.35356515645980835,
"rewards/rejected": -2.525536298751831,
"step": 3490
},
{
"epoch": 2.521613832853026,
"grad_norm": 23.50855743390047,
"learning_rate": 3.775135546051295e-09,
"logits/chosen": -1.9302011728286743,
"logits/rejected": -1.9312467575073242,
"logps/chosen": -1.0346893072128296,
"logps/rejected": -1.173813819885254,
"loss": 1.6196,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.069378614425659,
"rewards/margins": 0.27824902534484863,
"rewards/rejected": -2.347627639770508,
"step": 3500
},
{
"epoch": 2.5288184438040346,
"grad_norm": 29.164941549948743,
"learning_rate": 3.665126084957723e-09,
"logits/chosen": -1.9749339818954468,
"logits/rejected": -1.9792273044586182,
"logps/chosen": -1.1444737911224365,
"logps/rejected": -1.2538254261016846,
"loss": 1.6869,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.288947582244873,
"rewards/margins": 0.218703031539917,
"rewards/rejected": -2.507650852203369,
"step": 3510
},
{
"epoch": 2.5360230547550433,
"grad_norm": 24.755536040730522,
"learning_rate": 3.556616377404101e-09,
"logits/chosen": -2.0026092529296875,
"logits/rejected": -2.0009195804595947,
"logps/chosen": -1.090807557106018,
"logps/rejected": -1.2610208988189697,
"loss": 1.5663,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.181615114212036,
"rewards/margins": 0.34042656421661377,
"rewards/rejected": -2.5220417976379395,
"step": 3520
},
{
"epoch": 2.543227665706052,
"grad_norm": 24.281136701238548,
"learning_rate": 3.4496140511748125e-09,
"logits/chosen": -1.9856958389282227,
"logits/rejected": -1.98044753074646,
"logps/chosen": -1.0673859119415283,
"logps/rejected": -1.222153663635254,
"loss": 1.5935,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.1347718238830566,
"rewards/margins": 0.30953553318977356,
"rewards/rejected": -2.444307327270508,
"step": 3530
},
{
"epoch": 2.5504322766570606,
"grad_norm": 36.508959716968505,
"learning_rate": 3.3441266280915427e-09,
"logits/chosen": -1.9778625965118408,
"logits/rejected": -1.978600263595581,
"logps/chosen": -1.1016578674316406,
"logps/rejected": -1.225250005722046,
"loss": 1.6389,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.2033157348632812,
"rewards/margins": 0.24718408286571503,
"rewards/rejected": -2.450500011444092,
"step": 3540
},
{
"epoch": 2.5576368876080693,
"grad_norm": 28.5877047659264,
"learning_rate": 3.2401615234845693e-09,
"logits/chosen": -2.0043270587921143,
"logits/rejected": -1.9988540410995483,
"logps/chosen": -1.10341477394104,
"logps/rejected": -1.2606594562530518,
"loss": 1.6032,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.20682954788208,
"rewards/margins": 0.31448912620544434,
"rewards/rejected": -2.5213189125061035,
"step": 3550
},
{
"epoch": 2.564841498559078,
"grad_norm": 20.7773010532923,
"learning_rate": 3.1377260456714375e-09,
"logits/chosen": -1.8968908786773682,
"logits/rejected": -1.8885040283203125,
"logps/chosen": -1.0720723867416382,
"logps/rejected": -1.2278337478637695,
"loss": 1.5831,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1441447734832764,
"rewards/margins": 0.3115227222442627,
"rewards/rejected": -2.455667495727539,
"step": 3560
},
{
"epoch": 2.5720461095100866,
"grad_norm": 21.319035140390216,
"learning_rate": 3.0368273954432698e-09,
"logits/chosen": -2.021033525466919,
"logits/rejected": -2.012899398803711,
"logps/chosen": -1.059136152267456,
"logps/rejected": -1.1730144023895264,
"loss": 1.6541,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.118272304534912,
"rewards/margins": 0.22775661945343018,
"rewards/rejected": -2.3460288047790527,
"step": 3570
},
{
"epoch": 2.5792507204610953,
"grad_norm": 21.491101038274447,
"learning_rate": 2.937472665558541e-09,
"logits/chosen": -2.019315242767334,
"logits/rejected": -2.020840644836426,
"logps/chosen": -1.0477993488311768,
"logps/rejected": -1.1753723621368408,
"loss": 1.6434,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0955986976623535,
"rewards/margins": 0.255145788192749,
"rewards/rejected": -2.3507447242736816,
"step": 3580
},
{
"epoch": 2.586455331412104,
"grad_norm": 25.855015008453492,
"learning_rate": 2.8396688402445053e-09,
"logits/chosen": -2.0597152709960938,
"logits/rejected": -2.05248761177063,
"logps/chosen": -1.0254089832305908,
"logps/rejected": -1.2497340440750122,
"loss": 1.5075,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.0508179664611816,
"rewards/margins": 0.44864988327026367,
"rewards/rejected": -2.4994680881500244,
"step": 3590
},
{
"epoch": 2.5936599423631126,
"grad_norm": 27.89177787255965,
"learning_rate": 2.7434227947062324e-09,
"logits/chosen": -1.9996436834335327,
"logits/rejected": -1.9933786392211914,
"logps/chosen": -1.1413462162017822,
"logps/rejected": -1.2593032121658325,
"loss": 1.658,
"rewards/accuracies": 0.5,
"rewards/chosen": -2.2826924324035645,
"rewards/margins": 0.23591408133506775,
"rewards/rejected": -2.518606424331665,
"step": 3600
},
{
"epoch": 2.6008645533141213,
"grad_norm": 21.719736867269187,
"learning_rate": 2.6487412946432976e-09,
"logits/chosen": -1.9642817974090576,
"logits/rejected": -1.9591954946517944,
"logps/chosen": -1.0852771997451782,
"logps/rejected": -1.2340834140777588,
"loss": 1.609,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1705543994903564,
"rewards/margins": 0.29761233925819397,
"rewards/rejected": -2.4681668281555176,
"step": 3610
},
{
"epoch": 2.60806916426513,
"grad_norm": 27.728503047567223,
"learning_rate": 2.5556309957742024e-09,
"logits/chosen": -1.9747917652130127,
"logits/rejected": -1.969839096069336,
"logps/chosen": -1.0310755968093872,
"logps/rejected": -1.242746353149414,
"loss": 1.5168,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.0621511936187744,
"rewards/margins": 0.42334166169166565,
"rewards/rejected": -2.485492706298828,
"step": 3620
},
{
"epoch": 2.6152737752161386,
"grad_norm": 27.86461777769578,
"learning_rate": 2.4640984433684758e-09,
"logits/chosen": -2.0306482315063477,
"logits/rejected": -2.0319063663482666,
"logps/chosen": -1.1287659406661987,
"logps/rejected": -1.2553170919418335,
"loss": 1.6581,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.2575318813323975,
"rewards/margins": 0.2531023621559143,
"rewards/rejected": -2.510634183883667,
"step": 3630
},
{
"epoch": 2.6224783861671472,
"grad_norm": 22.019685163280375,
"learning_rate": 2.3741500717865987e-09,
"logits/chosen": -1.9839012622833252,
"logits/rejected": -1.9948337078094482,
"logps/chosen": -1.0139714479446411,
"logps/rejected": -1.170982003211975,
"loss": 1.5951,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0279428958892822,
"rewards/margins": 0.3140210211277008,
"rewards/rejected": -2.34196400642395,
"step": 3640
},
{
"epoch": 2.629682997118156,
"grad_norm": 22.05753538801505,
"learning_rate": 2.285792204027678e-09,
"logits/chosen": -1.9756708145141602,
"logits/rejected": -1.9732259511947632,
"logps/chosen": -1.0228625535964966,
"logps/rejected": -1.2338178157806396,
"loss": 1.5053,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.045725107192993,
"rewards/margins": 0.4219103455543518,
"rewards/rejected": -2.4676356315612793,
"step": 3650
},
{
"epoch": 2.636887608069164,
"grad_norm": 25.411892862416398,
"learning_rate": 2.199031051284972e-09,
"logits/chosen": -1.9979976415634155,
"logits/rejected": -1.9935623407363892,
"logps/chosen": -1.080296277999878,
"logps/rejected": -1.2212116718292236,
"loss": 1.634,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.160592555999756,
"rewards/margins": 0.28183117508888245,
"rewards/rejected": -2.4424233436584473,
"step": 3660
},
{
"epoch": 2.6440922190201728,
"grad_norm": 21.595154576529936,
"learning_rate": 2.113872712509254e-09,
"logits/chosen": -1.9839746952056885,
"logits/rejected": -1.9765218496322632,
"logps/chosen": -1.141282558441162,
"logps/rejected": -1.2657949924468994,
"loss": 1.6491,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.282565116882324,
"rewards/margins": 0.2490251064300537,
"rewards/rejected": -2.531589984893799,
"step": 3670
},
{
"epoch": 2.6512968299711814,
"grad_norm": 17.436350715731248,
"learning_rate": 2.0303231739801143e-09,
"logits/chosen": -1.9631779193878174,
"logits/rejected": -1.9526808261871338,
"logps/chosen": -1.030444860458374,
"logps/rejected": -1.1799728870391846,
"loss": 1.6027,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.060889720916748,
"rewards/margins": 0.2990562319755554,
"rewards/rejected": -2.359945774078369,
"step": 3680
},
{
"epoch": 2.65850144092219,
"grad_norm": 28.26132547261304,
"learning_rate": 1.948388308885102e-09,
"logits/chosen": -2.030214786529541,
"logits/rejected": -2.021780490875244,
"logps/chosen": -1.0733340978622437,
"logps/rejected": -1.192913293838501,
"loss": 1.6427,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.1466681957244873,
"rewards/margins": 0.23915806412696838,
"rewards/rejected": -2.385826587677002,
"step": 3690
},
{
"epoch": 2.6657060518731988,
"grad_norm": 31.37293000800097,
"learning_rate": 1.86807387690692e-09,
"logits/chosen": -2.056385040283203,
"logits/rejected": -2.0531296730041504,
"logps/chosen": -1.0972204208374023,
"logps/rejected": -1.3030415773391724,
"loss": 1.5153,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.1944408416748047,
"rewards/margins": 0.4116426110267639,
"rewards/rejected": -2.6060831546783447,
"step": 3700
},
{
"epoch": 2.6729106628242074,
"grad_norm": 24.09942757976874,
"learning_rate": 1.789385523818493e-09,
"logits/chosen": -2.0195093154907227,
"logits/rejected": -2.0211291313171387,
"logps/chosen": -1.0491076707839966,
"logps/rejected": -1.2321463823318481,
"loss": 1.5545,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.098215341567993,
"rewards/margins": 0.3660774827003479,
"rewards/rejected": -2.4642927646636963,
"step": 3710
},
{
"epoch": 2.680115273775216,
"grad_norm": 30.753165172362362,
"learning_rate": 1.712328781086131e-09,
"logits/chosen": -2.0431981086730957,
"logits/rejected": -2.038255214691162,
"logps/chosen": -1.1346651315689087,
"logps/rejected": -1.2389873266220093,
"loss": 1.6718,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.2693302631378174,
"rewards/margins": 0.20864447951316833,
"rewards/rejected": -2.4779746532440186,
"step": 3720
},
{
"epoch": 2.6873198847262247,
"grad_norm": 25.62420725486474,
"learning_rate": 1.6369090654806543e-09,
"logits/chosen": -2.04803466796875,
"logits/rejected": -2.041686534881592,
"logps/chosen": -1.0324945449829102,
"logps/rejected": -1.1873632669448853,
"loss": 1.5862,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.0649890899658203,
"rewards/margins": 0.30973726511001587,
"rewards/rejected": -2.3747265338897705,
"step": 3730
},
{
"epoch": 2.6945244956772334,
"grad_norm": 23.41499250305915,
"learning_rate": 1.5631316786966498e-09,
"logits/chosen": -1.975886583328247,
"logits/rejected": -1.969620704650879,
"logps/chosen": -1.03388512134552,
"logps/rejected": -1.184274673461914,
"loss": 1.6153,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.06777024269104,
"rewards/margins": 0.3007793426513672,
"rewards/rejected": -2.368549346923828,
"step": 3740
},
{
"epoch": 2.701729106628242,
"grad_norm": 22.50489621056607,
"learning_rate": 1.491001806979772e-09,
"logits/chosen": -2.0254969596862793,
"logits/rejected": -2.0186665058135986,
"logps/chosen": -1.0850400924682617,
"logps/rejected": -1.246293067932129,
"loss": 1.5906,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.1700801849365234,
"rewards/margins": 0.3225058913230896,
"rewards/rejected": -2.492586135864258,
"step": 3750
},
{
"epoch": 2.7089337175792507,
"grad_norm": 33.572444771947154,
"learning_rate": 1.4205245207621508e-09,
"logits/chosen": -1.9717979431152344,
"logits/rejected": -1.9695411920547485,
"logps/chosen": -1.1266522407531738,
"logps/rejected": -1.310119390487671,
"loss": 1.5624,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.2533044815063477,
"rewards/margins": 0.3669341802597046,
"rewards/rejected": -2.620238780975342,
"step": 3760
},
{
"epoch": 2.7161383285302594,
"grad_norm": 22.152945911675502,
"learning_rate": 1.3517047743059978e-09,
"logits/chosen": -2.009321928024292,
"logits/rejected": -2.012770891189575,
"logps/chosen": -1.08241605758667,
"logps/rejected": -1.2561628818511963,
"loss": 1.5653,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.16483211517334,
"rewards/margins": 0.34749364852905273,
"rewards/rejected": -2.5123257637023926,
"step": 3770
},
{
"epoch": 2.723342939481268,
"grad_norm": 20.70722793532355,
"learning_rate": 1.2845474053553156e-09,
"logits/chosen": -2.006065845489502,
"logits/rejected": -2.0020499229431152,
"logps/chosen": -1.0431503057479858,
"logps/rejected": -1.1901763677597046,
"loss": 1.623,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.0863006114959717,
"rewards/margins": 0.29405173659324646,
"rewards/rejected": -2.380352735519409,
"step": 3780
},
{
"epoch": 2.7305475504322767,
"grad_norm": 26.64826043638499,
"learning_rate": 1.2190571347958422e-09,
"logits/chosen": -2.0325839519500732,
"logits/rejected": -2.033907413482666,
"logps/chosen": -0.9730801582336426,
"logps/rejected": -1.1865020990371704,
"loss": 1.5144,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.9461603164672852,
"rewards/margins": 0.42684406042099,
"rewards/rejected": -2.373004198074341,
"step": 3790
},
{
"epoch": 2.7377521613832854,
"grad_norm": 21.63344100089033,
"learning_rate": 1.1552385663231634e-09,
"logits/chosen": -1.9884811639785767,
"logits/rejected": -1.9790737628936768,
"logps/chosen": -1.1036136150360107,
"logps/rejected": -1.2067337036132812,
"loss": 1.6742,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.2072272300720215,
"rewards/margins": 0.20624017715454102,
"rewards/rejected": -2.4134674072265625,
"step": 3800
},
{
"epoch": 2.744956772334294,
"grad_norm": 23.102000729253543,
"learning_rate": 1.0930961861191302e-09,
"logits/chosen": -1.9492992162704468,
"logits/rejected": -1.95420241355896,
"logps/chosen": -1.0491759777069092,
"logps/rejected": -1.2016587257385254,
"loss": 1.6216,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.0983519554138184,
"rewards/margins": 0.30496540665626526,
"rewards/rejected": -2.403317451477051,
"step": 3810
},
{
"epoch": 2.7521613832853027,
"grad_norm": 20.452340499679426,
"learning_rate": 1.0326343625364608e-09,
"logits/chosen": -1.9641183614730835,
"logits/rejected": -1.959027886390686,
"logps/chosen": -1.051187515258789,
"logps/rejected": -1.2369552850723267,
"loss": 1.5457,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -2.102375030517578,
"rewards/margins": 0.3715355098247528,
"rewards/rejected": -2.4739105701446533,
"step": 3820
},
{
"epoch": 2.7593659942363113,
"grad_norm": 21.613705313436856,
"learning_rate": 9.738573457917066e-10,
"logits/chosen": -2.0394294261932373,
"logits/rejected": -2.038104772567749,
"logps/chosen": -1.060562252998352,
"logps/rejected": -1.2662984132766724,
"loss": 1.5136,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.121124505996704,
"rewards/margins": 0.4114726185798645,
"rewards/rejected": -2.5325968265533447,
"step": 3830
},
{
"epoch": 2.76657060518732,
"grad_norm": 24.920928975195235,
"learning_rate": 9.16769267666434e-10,
"logits/chosen": -2.002856731414795,
"logits/rejected": -2.000764846801758,
"logps/chosen": -1.084855318069458,
"logps/rejected": -1.165999174118042,
"loss": 1.7045,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.169710636138916,
"rewards/margins": 0.16228748857975006,
"rewards/rejected": -2.331998348236084,
"step": 3840
},
{
"epoch": 2.7737752161383287,
"grad_norm": 25.07731794836384,
"learning_rate": 8.613741412168113e-10,
"logits/chosen": -2.019688129425049,
"logits/rejected": -2.0192506313323975,
"logps/chosen": -1.0898449420928955,
"logps/rejected": -1.2301945686340332,
"loss": 1.6,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.179689884185791,
"rewards/margins": 0.2806992828845978,
"rewards/rejected": -2.4603891372680664,
"step": 3850
},
{
"epoch": 2.7809798270893373,
"grad_norm": 24.28344164460677,
"learning_rate": 8.076758604914802e-10,
"logits/chosen": -1.9492765665054321,
"logits/rejected": -1.9450442790985107,
"logps/chosen": -0.9892587661743164,
"logps/rejected": -1.1346309185028076,
"loss": 1.6167,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.9785175323486328,
"rewards/margins": 0.29074448347091675,
"rewards/rejected": -2.2692618370056152,
"step": 3860
},
{
"epoch": 2.7881844380403455,
"grad_norm": 28.329942268146258,
"learning_rate": 7.55678200257856e-10,
"logits/chosen": -1.9771426916122437,
"logits/rejected": -1.9707549810409546,
"logps/chosen": -1.0437920093536377,
"logps/rejected": -1.200476884841919,
"loss": 1.5889,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0875840187072754,
"rewards/margins": 0.3133697509765625,
"rewards/rejected": -2.400953769683838,
"step": 3870
},
{
"epoch": 2.795389048991354,
"grad_norm": 20.835071031526898,
"learning_rate": 7.053848157367315e-10,
"logits/chosen": -1.991943597793579,
"logits/rejected": -1.9867950677871704,
"logps/chosen": -1.0485520362854004,
"logps/rejected": -1.2103912830352783,
"loss": 1.5969,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.097104072570801,
"rewards/margins": 0.3236783444881439,
"rewards/rejected": -2.4207825660705566,
"step": 3880
},
{
"epoch": 2.802593659942363,
"grad_norm": 19.61293690218021,
"learning_rate": 6.567992423453794e-10,
"logits/chosen": -2.0088655948638916,
"logits/rejected": -2.007509708404541,
"logps/chosen": -0.9718767404556274,
"logps/rejected": -1.1004300117492676,
"loss": 1.6236,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.9437534809112549,
"rewards/margins": 0.2571064531803131,
"rewards/rejected": -2.200860023498535,
"step": 3890
},
{
"epoch": 2.8097982708933715,
"grad_norm": 23.907550449023642,
"learning_rate": 6.099248954489794e-10,
"logits/chosen": -1.9489400386810303,
"logits/rejected": -1.9466136693954468,
"logps/chosen": -1.0784661769866943,
"logps/rejected": -1.2528491020202637,
"loss": 1.5667,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.1569323539733887,
"rewards/margins": 0.34876567125320435,
"rewards/rejected": -2.5056982040405273,
"step": 3900
},
{
"epoch": 2.81700288184438,
"grad_norm": 28.578873657679054,
"learning_rate": 5.647650701205653e-10,
"logits/chosen": -2.0213611125946045,
"logits/rejected": -2.0133721828460693,
"logps/chosen": -1.1192106008529663,
"logps/rejected": -1.291589617729187,
"loss": 1.5866,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.2384212017059326,
"rewards/margins": 0.34475821256637573,
"rewards/rejected": -2.583179235458374,
"step": 3910
},
{
"epoch": 2.824207492795389,
"grad_norm": 19.41067753405638,
"learning_rate": 5.213229409093856e-10,
"logits/chosen": -2.0236122608184814,
"logits/rejected": -2.018137216567993,
"logps/chosen": -1.0645720958709717,
"logps/rejected": -1.2068941593170166,
"loss": 1.6192,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.1291441917419434,
"rewards/margins": 0.2846437692642212,
"rewards/rejected": -2.413788318634033,
"step": 3920
},
{
"epoch": 2.8314121037463975,
"grad_norm": 25.489467678642576,
"learning_rate": 4.796015616177401e-10,
"logits/chosen": -1.9871950149536133,
"logits/rejected": -1.9816499948501587,
"logps/chosen": -1.0776426792144775,
"logps/rejected": -1.1982210874557495,
"loss": 1.6416,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.155285358428955,
"rewards/margins": 0.24115705490112305,
"rewards/rejected": -2.396442174911499,
"step": 3930
},
{
"epoch": 2.838616714697406,
"grad_norm": 20.30364693658417,
"learning_rate": 4.3960386508631595e-10,
"logits/chosen": -1.921939492225647,
"logits/rejected": -1.9144452810287476,
"logps/chosen": -0.979306697845459,
"logps/rejected": -1.107569694519043,
"loss": 1.6521,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -1.958613395690918,
"rewards/margins": 0.2565256953239441,
"rewards/rejected": -2.215139389038086,
"step": 3940
},
{
"epoch": 2.845821325648415,
"grad_norm": 42.519384522550766,
"learning_rate": 4.013326629880243e-10,
"logits/chosen": -1.9709304571151733,
"logits/rejected": -1.961439847946167,
"logps/chosen": -1.1202346086502075,
"logps/rejected": -1.2615710496902466,
"loss": 1.622,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.240469217300415,
"rewards/margins": 0.2826729416847229,
"rewards/rejected": -2.523142099380493,
"step": 3950
},
{
"epoch": 2.8530259365994235,
"grad_norm": 23.238678680474873,
"learning_rate": 3.64790645630339e-10,
"logits/chosen": -1.9292995929718018,
"logits/rejected": -1.9286911487579346,
"logps/chosen": -1.061901330947876,
"logps/rejected": -1.138238787651062,
"loss": 1.7022,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.123802661895752,
"rewards/margins": 0.1526748687028885,
"rewards/rejected": -2.276477575302124,
"step": 3960
},
{
"epoch": 2.860230547550432,
"grad_norm": 25.943257260205577,
"learning_rate": 3.2998038176619e-10,
"logits/chosen": -1.973475694656372,
"logits/rejected": -1.9651120901107788,
"logps/chosen": -1.0681226253509521,
"logps/rejected": -1.200909972190857,
"loss": 1.6307,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.1362452507019043,
"rewards/margins": 0.2655748426914215,
"rewards/rejected": -2.401819944381714,
"step": 3970
},
{
"epoch": 2.867435158501441,
"grad_norm": 25.046112858885824,
"learning_rate": 2.969043184133907e-10,
"logits/chosen": -2.0394976139068604,
"logits/rejected": -2.0383353233337402,
"logps/chosen": -0.9778817296028137,
"logps/rejected": -1.2098453044891357,
"loss": 1.4747,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.9557634592056274,
"rewards/margins": 0.46392711997032166,
"rewards/rejected": -2.4196906089782715,
"step": 3980
},
{
"epoch": 2.8746397694524495,
"grad_norm": 22.389990606212322,
"learning_rate": 2.6556478068261447e-10,
"logits/chosen": -1.964787244796753,
"logits/rejected": -1.9624879360198975,
"logps/chosen": -0.9804242849349976,
"logps/rejected": -1.1250919103622437,
"loss": 1.6234,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9608485698699951,
"rewards/margins": 0.28933483362197876,
"rewards/rejected": -2.2501838207244873,
"step": 3990
},
{
"epoch": 2.881844380403458,
"grad_norm": 24.0117386682406,
"learning_rate": 2.3596397161395607e-10,
"logits/chosen": -2.0428566932678223,
"logits/rejected": -2.031513214111328,
"logps/chosen": -1.0748224258422852,
"logps/rejected": -1.255406141281128,
"loss": 1.5662,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.1496448516845703,
"rewards/margins": 0.36116713285446167,
"rewards/rejected": -2.510812282562256,
"step": 4000
},
{
"epoch": 2.889048991354467,
"grad_norm": 32.362804953121234,
"learning_rate": 2.0810397202206399e-10,
"logits/chosen": -1.9443575143814087,
"logits/rejected": -1.9498094320297241,
"logps/chosen": -1.0714243650436401,
"logps/rejected": -1.2111313343048096,
"loss": 1.6118,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.1428487300872803,
"rewards/margins": 0.27941370010375977,
"rewards/rejected": -2.422262668609619,
"step": 4010
},
{
"epoch": 2.8962536023054755,
"grad_norm": 27.023698297841868,
"learning_rate": 1.819867403498737e-10,
"logits/chosen": -2.03318452835083,
"logits/rejected": -2.0310425758361816,
"logps/chosen": -1.0808742046356201,
"logps/rejected": -1.2229337692260742,
"loss": 1.6227,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.1617484092712402,
"rewards/margins": 0.284119576215744,
"rewards/rejected": -2.4458675384521484,
"step": 4020
},
{
"epoch": 2.903458213256484,
"grad_norm": 26.71011918530369,
"learning_rate": 1.5761411253092382e-10,
"logits/chosen": -1.9595407247543335,
"logits/rejected": -1.9497658014297485,
"logps/chosen": -0.9974473714828491,
"logps/rejected": -1.1304714679718018,
"loss": 1.6212,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.9948947429656982,
"rewards/margins": 0.2660483717918396,
"rewards/rejected": -2.2609429359436035,
"step": 4030
},
{
"epoch": 2.910662824207493,
"grad_norm": 24.25889585836036,
"learning_rate": 1.3498780186031455e-10,
"logits/chosen": -2.002887010574341,
"logits/rejected": -1.9993667602539062,
"logps/chosen": -1.1712630987167358,
"logps/rejected": -1.301621675491333,
"loss": 1.65,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.3425261974334717,
"rewards/margins": 0.26071739196777344,
"rewards/rejected": -2.603243350982666,
"step": 4040
},
{
"epoch": 2.9178674351585014,
"grad_norm": 19.7696756240332,
"learning_rate": 1.1410939887425141e-10,
"logits/chosen": -1.9911212921142578,
"logits/rejected": -1.993401288986206,
"logps/chosen": -1.0608654022216797,
"logps/rejected": -1.1971817016601562,
"loss": 1.6356,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.1217308044433594,
"rewards/margins": 0.2726329267024994,
"rewards/rejected": -2.3943634033203125,
"step": 4050
},
{
"epoch": 2.92507204610951,
"grad_norm": 23.18031609093465,
"learning_rate": 9.498037123825686e-11,
"logits/chosen": -2.007997989654541,
"logits/rejected": -2.0048532485961914,
"logps/chosen": -1.0325133800506592,
"logps/rejected": -1.1695791482925415,
"loss": 1.6193,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0650267601013184,
"rewards/margins": 0.2741314768791199,
"rewards/rejected": -2.339158296585083,
"step": 4060
},
{
"epoch": 2.9322766570605188,
"grad_norm": 25.713514771607556,
"learning_rate": 7.760206364398614e-11,
"logits/chosen": -2.058072328567505,
"logits/rejected": -2.0552659034729004,
"logps/chosen": -1.0893595218658447,
"logps/rejected": -1.244717001914978,
"loss": 1.6012,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1787190437316895,
"rewards/margins": 0.3107149004936218,
"rewards/rejected": -2.489434003829956,
"step": 4070
},
{
"epoch": 2.9394812680115274,
"grad_norm": 26.762519657648443,
"learning_rate": 6.19756977147029e-11,
"logits/chosen": -1.9917011260986328,
"logits/rejected": -1.9884628057479858,
"logps/chosen": -1.040903925895691,
"logps/rejected": -1.26100492477417,
"loss": 1.5122,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -2.081807851791382,
"rewards/margins": 0.4402018189430237,
"rewards/rejected": -2.52200984954834,
"step": 4080
},
{
"epoch": 2.946685878962536,
"grad_norm": 25.259734222354364,
"learning_rate": 4.810237191940625e-11,
"logits/chosen": -1.9693422317504883,
"logits/rejected": -1.9684251546859741,
"logps/chosen": -1.0511581897735596,
"logps/rejected": -1.1897703409194946,
"loss": 1.6405,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.102316379547119,
"rewards/margins": 0.27722451090812683,
"rewards/rejected": -2.3795406818389893,
"step": 4090
},
{
"epoch": 2.9538904899135447,
"grad_norm": 23.752154925872336,
"learning_rate": 3.5983061495617476e-11,
"logits/chosen": -2.024099349975586,
"logits/rejected": -2.0245394706726074,
"logps/chosen": -1.1337699890136719,
"logps/rejected": -1.2917104959487915,
"loss": 1.6,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.2675399780273438,
"rewards/margins": 0.3158808946609497,
"rewards/rejected": -2.583420991897583,
"step": 4100
},
{
"epoch": 2.9610951008645534,
"grad_norm": 26.090688744328226,
"learning_rate": 2.5618618380812694e-11,
"logits/chosen": -2.0161375999450684,
"logits/rejected": -2.005847930908203,
"logps/chosen": -1.0120784044265747,
"logps/rejected": -1.1921513080596924,
"loss": 1.5759,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0241568088531494,
"rewards/margins": 0.3601462244987488,
"rewards/rejected": -2.3843026161193848,
"step": 4110
},
{
"epoch": 2.968299711815562,
"grad_norm": 27.27849102688369,
"learning_rate": 1.700977115254576e-11,
"logits/chosen": -1.9999454021453857,
"logits/rejected": -1.9970006942749023,
"logps/chosen": -1.0086463689804077,
"logps/rejected": -1.1693059206008911,
"loss": 1.5846,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0172927379608154,
"rewards/margins": 0.32131898403167725,
"rewards/rejected": -2.3386118412017822,
"step": 4120
},
{
"epoch": 2.9755043227665707,
"grad_norm": 24.842873331046228,
"learning_rate": 1.0157124977230868e-11,
"logits/chosen": -1.9786033630371094,
"logits/rejected": -1.977442979812622,
"logps/chosen": -0.9792217016220093,
"logps/rejected": -1.137369990348816,
"loss": 1.5873,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.9584434032440186,
"rewards/margins": 0.3162967264652252,
"rewards/rejected": -2.274739980697632,
"step": 4130
},
{
"epoch": 2.9827089337175794,
"grad_norm": 25.86082420427953,
"learning_rate": 5.061161567596061e-12,
"logits/chosen": -1.9983164072036743,
"logits/rejected": -1.9942150115966797,
"logps/chosen": -1.0641577243804932,
"logps/rejected": -1.1579291820526123,
"loss": 1.6924,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.1283154487609863,
"rewards/margins": 0.18754300475120544,
"rewards/rejected": -2.3158583641052246,
"step": 4140
},
{
"epoch": 2.989913544668588,
"grad_norm": 25.593515700161753,
"learning_rate": 1.7222391488297406e-12,
"logits/chosen": -2.018054246902466,
"logits/rejected": -2.0146076679229736,
"logps/chosen": -1.1173573732376099,
"logps/rejected": -1.27747642993927,
"loss": 1.591,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.2347147464752197,
"rewards/margins": 0.32023778557777405,
"rewards/rejected": -2.55495285987854,
"step": 4150
},
{
"epoch": 2.9971181556195967,
"grad_norm": 23.587314737717417,
"learning_rate": 1.4059243338693238e-13,
"logits/chosen": -1.99405038356781,
"logits/rejected": -1.9872567653656006,
"logps/chosen": -1.06746244430542,
"logps/rejected": -1.2030757665634155,
"loss": 1.6158,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.13492488861084,
"rewards/margins": 0.27122682332992554,
"rewards/rejected": -2.406151533126831,
"step": 4160
},
{
"epoch": 3.0,
"step": 4164,
"total_flos": 0.0,
"train_loss": 1.6295961157904577,
"train_runtime": 5477.0573,
"train_samples_per_second": 12.161,
"train_steps_per_second": 0.76
}
],
"logging_steps": 10,
"max_steps": 4164,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}