llama2-Chinese-human_emotion / trainer_state.json
PengceWang's picture
Upload 12 files
30fe2a6 verified
raw
history blame contribute delete
No virus
103 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6468305304010349,
"eval_steps": 500.0,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.6667505502700806,
"learning_rate": 6.666666666666668e-08,
"logits/chosen": 0.6656537652015686,
"logits/rejected": 0.8323326110839844,
"logps/chosen": -105.4136962890625,
"logps/rejected": -80.00390625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 2.462697744369507,
"learning_rate": 6.666666666666667e-07,
"logits/chosen": 0.4595692455768585,
"logits/rejected": 0.40892449021339417,
"logps/chosen": -159.06524658203125,
"logps/rejected": -111.41619110107422,
"loss": 0.6921,
"rewards/accuracies": 0.37037038803100586,
"rewards/chosen": -0.005318281706422567,
"rewards/margins": 0.002280694665387273,
"rewards/rejected": -0.0075989761389791965,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 1.2759745121002197,
"learning_rate": 1.3333333333333334e-06,
"logits/chosen": 0.4103795886039734,
"logits/rejected": 0.3288261592388153,
"logps/chosen": -154.99789428710938,
"logps/rejected": -102.23339080810547,
"loss": 0.6872,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.011106210760772228,
"rewards/margins": 0.012265065684914589,
"rewards/rejected": -0.0011588542256504297,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 1.4769150018692017,
"learning_rate": 2.0000000000000003e-06,
"logits/chosen": 0.42967623472213745,
"logits/rejected": 0.3888375759124756,
"logps/chosen": -145.51222229003906,
"logps/rejected": -99.47395324707031,
"loss": 0.6943,
"rewards/accuracies": 0.43333330750465393,
"rewards/chosen": -0.0015143711352720857,
"rewards/margins": -0.001963119488209486,
"rewards/rejected": 0.0004487482365220785,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 2.340982675552368,
"learning_rate": 2.666666666666667e-06,
"logits/chosen": 0.31144046783447266,
"logits/rejected": 0.38159480690956116,
"logps/chosen": -221.299072265625,
"logps/rejected": -160.48690795898438,
"loss": 0.6928,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.00796764437109232,
"rewards/margins": 0.0012136328732594848,
"rewards/rejected": 0.006754010915756226,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": 1.5240986347198486,
"learning_rate": 3.3333333333333333e-06,
"logits/chosen": 0.5946449041366577,
"logits/rejected": 0.44660329818725586,
"logps/chosen": -190.49057006835938,
"logps/rejected": -139.8750457763672,
"loss": 0.6888,
"rewards/accuracies": 0.5666666626930237,
"rewards/chosen": 0.010610619559884071,
"rewards/margins": 0.009138532914221287,
"rewards/rejected": 0.0014720851322636008,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 1.5300071239471436,
"learning_rate": 4.000000000000001e-06,
"logits/chosen": 0.42198482155799866,
"logits/rejected": 0.33618858456611633,
"logps/chosen": -173.13516235351562,
"logps/rejected": -121.2822036743164,
"loss": 0.6925,
"rewards/accuracies": 0.5333333015441895,
"rewards/chosen": 0.00521390326321125,
"rewards/margins": 0.0015516221756115556,
"rewards/rejected": 0.0036622812040150166,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 1.9583663940429688,
"learning_rate": 4.666666666666667e-06,
"logits/chosen": 0.33364781737327576,
"logits/rejected": 0.2776263952255249,
"logps/chosen": -209.07211303710938,
"logps/rejected": -158.00839233398438,
"loss": 0.6827,
"rewards/accuracies": 0.6333333253860474,
"rewards/chosen": 0.026117617264389992,
"rewards/margins": 0.021206889301538467,
"rewards/rejected": 0.004910729825496674,
"step": 70
},
{
"epoch": 0.03,
"grad_norm": 1.7102651596069336,
"learning_rate": 5.333333333333334e-06,
"logits/chosen": 0.5094404816627502,
"logits/rejected": 0.4666718542575836,
"logps/chosen": -153.90780639648438,
"logps/rejected": -103.93388366699219,
"loss": 0.679,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": 0.038611821830272675,
"rewards/margins": 0.028847262263298035,
"rewards/rejected": 0.00976455770432949,
"step": 80
},
{
"epoch": 0.03,
"grad_norm": 2.9190127849578857,
"learning_rate": 6e-06,
"logits/chosen": 0.23412127792835236,
"logits/rejected": 0.16527244448661804,
"logps/chosen": -219.2425079345703,
"logps/rejected": -149.35684204101562,
"loss": 0.659,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": 0.07324406504631042,
"rewards/margins": 0.07025764882564545,
"rewards/rejected": 0.0029864185489714146,
"step": 90
},
{
"epoch": 0.03,
"grad_norm": 2.0400824546813965,
"learning_rate": 6.666666666666667e-06,
"logits/chosen": 0.3398872911930084,
"logits/rejected": 0.2985154092311859,
"logps/chosen": -185.22146606445312,
"logps/rejected": -121.82730865478516,
"loss": 0.6488,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10361306369304657,
"rewards/margins": 0.09121204167604446,
"rewards/rejected": 0.012401008978486061,
"step": 100
},
{
"epoch": 0.04,
"grad_norm": 2.192309617996216,
"learning_rate": 7.333333333333333e-06,
"logits/chosen": 0.4448312222957611,
"logits/rejected": 0.32321152091026306,
"logps/chosen": -189.84707641601562,
"logps/rejected": -124.48201751708984,
"loss": 0.6194,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.19106850028038025,
"rewards/margins": 0.1558096706867218,
"rewards/rejected": 0.03525884076952934,
"step": 110
},
{
"epoch": 0.04,
"grad_norm": 2.386159896850586,
"learning_rate": 8.000000000000001e-06,
"logits/chosen": 0.2993674874305725,
"logits/rejected": 0.3050278127193451,
"logps/chosen": -187.64498901367188,
"logps/rejected": -127.5892105102539,
"loss": 0.5682,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30387812852859497,
"rewards/margins": 0.2730325758457184,
"rewards/rejected": 0.030845556408166885,
"step": 120
},
{
"epoch": 0.04,
"grad_norm": 2.4128735065460205,
"learning_rate": 8.666666666666668e-06,
"logits/chosen": 0.470858097076416,
"logits/rejected": 0.3819553256034851,
"logps/chosen": -147.3748321533203,
"logps/rejected": -106.97740173339844,
"loss": 0.5524,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3712579309940338,
"rewards/margins": 0.31013599038124084,
"rewards/rejected": 0.06112197786569595,
"step": 130
},
{
"epoch": 0.05,
"grad_norm": 3.5698485374450684,
"learning_rate": 9.333333333333334e-06,
"logits/chosen": 0.027050381526350975,
"logits/rejected": 0.06544498354196548,
"logps/chosen": -225.6494140625,
"logps/rejected": -170.77578735351562,
"loss": 0.4873,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5885292291641235,
"rewards/margins": 0.4800504744052887,
"rewards/rejected": 0.10847876965999603,
"step": 140
},
{
"epoch": 0.05,
"grad_norm": 2.3326575756073,
"learning_rate": 1e-05,
"logits/chosen": 0.4391583800315857,
"logits/rejected": 0.4003582000732422,
"logps/chosen": -177.93197631835938,
"logps/rejected": -129.4811248779297,
"loss": 0.4473,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7490917444229126,
"rewards/margins": 0.597412645816803,
"rewards/rejected": 0.15167909860610962,
"step": 150
},
{
"epoch": 0.05,
"grad_norm": 1.741441011428833,
"learning_rate": 9.989042296734605e-06,
"logits/chosen": 0.4242062568664551,
"logits/rejected": 0.4251991808414459,
"logps/chosen": -178.76414489746094,
"logps/rejected": -133.4027557373047,
"loss": 0.4098,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": 0.8510511517524719,
"rewards/margins": 0.7112440466880798,
"rewards/rejected": 0.1398070752620697,
"step": 160
},
{
"epoch": 0.05,
"grad_norm": 1.9917546510696411,
"learning_rate": 9.97808459346921e-06,
"logits/chosen": 0.254020094871521,
"logits/rejected": 0.23922643065452576,
"logps/chosen": -161.81881713867188,
"logps/rejected": -125.13037109375,
"loss": 0.3277,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1077089309692383,
"rewards/margins": 0.9747198224067688,
"rewards/rejected": 0.13298924267292023,
"step": 170
},
{
"epoch": 0.06,
"grad_norm": 1.3671507835388184,
"learning_rate": 9.967126890203814e-06,
"logits/chosen": 0.2504037320613861,
"logits/rejected": 0.18875229358673096,
"logps/chosen": -147.4672088623047,
"logps/rejected": -104.1845474243164,
"loss": 0.2885,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.3119361400604248,
"rewards/margins": 1.1814303398132324,
"rewards/rejected": 0.13050585985183716,
"step": 180
},
{
"epoch": 0.06,
"grad_norm": 1.9124622344970703,
"learning_rate": 9.956169186938418e-06,
"logits/chosen": 0.2291211187839508,
"logits/rejected": 0.24650339782238007,
"logps/chosen": -147.00360107421875,
"logps/rejected": -115.02952575683594,
"loss": 0.2414,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5095247030258179,
"rewards/margins": 1.3837789297103882,
"rewards/rejected": 0.12574569880962372,
"step": 190
},
{
"epoch": 0.06,
"grad_norm": 1.6257745027542114,
"learning_rate": 9.945211483673022e-06,
"logits/chosen": 0.18027305603027344,
"logits/rejected": 0.17534476518630981,
"logps/chosen": -180.81214904785156,
"logps/rejected": -138.56167602539062,
"loss": 0.2117,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7416807413101196,
"rewards/margins": 1.576603889465332,
"rewards/rejected": 0.16507670283317566,
"step": 200
},
{
"epoch": 0.07,
"grad_norm": 1.6058803796768188,
"learning_rate": 9.934253780407628e-06,
"logits/chosen": 0.03618524968624115,
"logits/rejected": 0.06655623763799667,
"logps/chosen": -187.56057739257812,
"logps/rejected": -165.72299194335938,
"loss": 0.192,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6607027053833008,
"rewards/margins": 1.624441385269165,
"rewards/rejected": 0.03626134246587753,
"step": 210
},
{
"epoch": 0.07,
"grad_norm": 2.9326171875,
"learning_rate": 9.92329607714223e-06,
"logits/chosen": 0.15423543751239777,
"logits/rejected": 0.16546325385570526,
"logps/chosen": -122.26210021972656,
"logps/rejected": -105.48604583740234,
"loss": 0.262,
"rewards/accuracies": 0.9333333969116211,
"rewards/chosen": 1.8262674808502197,
"rewards/margins": 1.484720230102539,
"rewards/rejected": 0.3415472209453583,
"step": 220
},
{
"epoch": 0.07,
"grad_norm": 1.0585615634918213,
"learning_rate": 9.912338373876837e-06,
"logits/chosen": 0.0944317951798439,
"logits/rejected": 0.0521501824259758,
"logps/chosen": -183.9434051513672,
"logps/rejected": -139.52586364746094,
"loss": 0.1305,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.2049338817596436,
"rewards/margins": 2.331801652908325,
"rewards/rejected": -0.12686775624752045,
"step": 230
},
{
"epoch": 0.08,
"grad_norm": 1.2967203855514526,
"learning_rate": 9.90138067061144e-06,
"logits/chosen": 0.03958363085985184,
"logits/rejected": -0.04342944175004959,
"logps/chosen": -147.5401611328125,
"logps/rejected": -118.40291595458984,
"loss": 0.1008,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.246441602706909,
"rewards/margins": 2.4990291595458984,
"rewards/rejected": -0.25258734822273254,
"step": 240
},
{
"epoch": 0.08,
"grad_norm": 0.5034691095352173,
"learning_rate": 9.890422967346045e-06,
"logits/chosen": 0.03393579646945,
"logits/rejected": 0.003592267632484436,
"logps/chosen": -143.8810272216797,
"logps/rejected": -128.96066284179688,
"loss": 0.1339,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0601272583007812,
"rewards/margins": 2.3098137378692627,
"rewards/rejected": -0.24968591332435608,
"step": 250
},
{
"epoch": 0.08,
"grad_norm": 0.8256264328956604,
"learning_rate": 9.87946526408065e-06,
"logits/chosen": -0.09204194694757462,
"logits/rejected": -0.1390385925769806,
"logps/chosen": -165.94276428222656,
"logps/rejected": -140.0150909423828,
"loss": 0.1342,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.1594629287719727,
"rewards/margins": 2.515359401702881,
"rewards/rejected": -0.35589665174484253,
"step": 260
},
{
"epoch": 0.09,
"grad_norm": 0.17470860481262207,
"learning_rate": 9.868507560815254e-06,
"logits/chosen": -0.09904654324054718,
"logits/rejected": -0.21556393802165985,
"logps/chosen": -138.27383422851562,
"logps/rejected": -117.7011489868164,
"loss": 0.1108,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.1653850078582764,
"rewards/margins": 2.631995677947998,
"rewards/rejected": -0.4666108191013336,
"step": 270
},
{
"epoch": 0.09,
"grad_norm": 0.4645913541316986,
"learning_rate": 9.857549857549858e-06,
"logits/chosen": -0.014490666799247265,
"logits/rejected": -0.06418715417385101,
"logps/chosen": -180.47317504882812,
"logps/rejected": -161.57061767578125,
"loss": 0.0746,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.279376983642578,
"rewards/margins": 3.045623302459717,
"rewards/rejected": -0.766246497631073,
"step": 280
},
{
"epoch": 0.09,
"grad_norm": 0.6914668679237366,
"learning_rate": 9.846592154284462e-06,
"logits/chosen": -0.046889033168554306,
"logits/rejected": -0.1608402580022812,
"logps/chosen": -173.64353942871094,
"logps/rejected": -139.35154724121094,
"loss": 0.0832,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0064756870269775,
"rewards/margins": 2.9564709663391113,
"rewards/rejected": -0.9499956369400024,
"step": 290
},
{
"epoch": 0.1,
"grad_norm": 1.6635288000106812,
"learning_rate": 9.835634451019067e-06,
"logits/chosen": -0.017042959108948708,
"logits/rejected": -0.12096239626407623,
"logps/chosen": -159.43319702148438,
"logps/rejected": -139.1463623046875,
"loss": 0.0986,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9454389810562134,
"rewards/margins": 2.7836952209472656,
"rewards/rejected": -0.8382562398910522,
"step": 300
},
{
"epoch": 0.1,
"grad_norm": 1.5297130346298218,
"learning_rate": 9.824676747753673e-06,
"logits/chosen": -0.12943556904792786,
"logits/rejected": -0.2048647403717041,
"logps/chosen": -149.43092346191406,
"logps/rejected": -140.9419403076172,
"loss": 0.0967,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.958494782447815,
"rewards/margins": 3.017915964126587,
"rewards/rejected": -1.059421420097351,
"step": 310
},
{
"epoch": 0.1,
"grad_norm": 0.18703162670135498,
"learning_rate": 9.813719044488275e-06,
"logits/chosen": -0.044006917625665665,
"logits/rejected": -0.15224145352840424,
"logps/chosen": -153.7903594970703,
"logps/rejected": -128.21835327148438,
"loss": 0.0409,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.305603265762329,
"rewards/margins": 3.5984387397766113,
"rewards/rejected": -1.292834997177124,
"step": 320
},
{
"epoch": 0.11,
"grad_norm": 0.7461889982223511,
"learning_rate": 9.802761341222881e-06,
"logits/chosen": -0.14230886101722717,
"logits/rejected": -0.1584036648273468,
"logps/chosen": -131.12109375,
"logps/rejected": -111.15714263916016,
"loss": 0.0709,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7661842107772827,
"rewards/margins": 3.3135159015655518,
"rewards/rejected": -1.5473315715789795,
"step": 330
},
{
"epoch": 0.11,
"grad_norm": 1.7300301790237427,
"learning_rate": 9.791803637957486e-06,
"logits/chosen": -0.19161322712898254,
"logits/rejected": -0.2922630310058594,
"logps/chosen": -172.96087646484375,
"logps/rejected": -160.4773712158203,
"loss": 0.0519,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8651368618011475,
"rewards/margins": 3.7756409645080566,
"rewards/rejected": -1.9105039834976196,
"step": 340
},
{
"epoch": 0.11,
"grad_norm": 2.3742196559906006,
"learning_rate": 9.78084593469209e-06,
"logits/chosen": -0.3112620711326599,
"logits/rejected": -0.43214184045791626,
"logps/chosen": -158.68028259277344,
"logps/rejected": -136.09814453125,
"loss": 0.0671,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.1260719299316406,
"rewards/margins": 3.7196857929229736,
"rewards/rejected": -1.5936137437820435,
"step": 350
},
{
"epoch": 0.12,
"grad_norm": 2.4912428855895996,
"learning_rate": 9.769888231426694e-06,
"logits/chosen": -0.0563586950302124,
"logits/rejected": -0.20379754900932312,
"logps/chosen": -117.54966735839844,
"logps/rejected": -115.3812255859375,
"loss": 0.0837,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9740089178085327,
"rewards/margins": 3.207712173461914,
"rewards/rejected": -1.23370361328125,
"step": 360
},
{
"epoch": 0.12,
"grad_norm": 0.3334919512271881,
"learning_rate": 9.758930528161298e-06,
"logits/chosen": -0.16167142987251282,
"logits/rejected": -0.24221567809581757,
"logps/chosen": -138.83132934570312,
"logps/rejected": -141.4235076904297,
"loss": 0.1185,
"rewards/accuracies": 0.9333333969116211,
"rewards/chosen": 1.8540862798690796,
"rewards/margins": 3.4170851707458496,
"rewards/rejected": -1.5629991292953491,
"step": 370
},
{
"epoch": 0.12,
"grad_norm": 0.1451215147972107,
"learning_rate": 9.747972824895903e-06,
"logits/chosen": -0.1488262414932251,
"logits/rejected": -0.30983203649520874,
"logps/chosen": -146.93450927734375,
"logps/rejected": -145.71795654296875,
"loss": 0.0315,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.60548996925354,
"rewards/margins": 4.409090995788574,
"rewards/rejected": -2.803601026535034,
"step": 380
},
{
"epoch": 0.13,
"grad_norm": 0.11398794502019882,
"learning_rate": 9.737015121630507e-06,
"logits/chosen": -0.3392196297645569,
"logits/rejected": -0.4596717357635498,
"logps/chosen": -192.10316467285156,
"logps/rejected": -179.76910400390625,
"loss": 0.04,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8670743703842163,
"rewards/margins": 4.218127250671387,
"rewards/rejected": -2.351052761077881,
"step": 390
},
{
"epoch": 0.13,
"grad_norm": 1.373371958732605,
"learning_rate": 9.726057418365111e-06,
"logits/chosen": -0.2288927286863327,
"logits/rejected": -0.4092441499233246,
"logps/chosen": -134.75244140625,
"logps/rejected": -126.3078384399414,
"loss": 0.0498,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7074005603790283,
"rewards/margins": 4.082337856292725,
"rewards/rejected": -2.3749375343322754,
"step": 400
},
{
"epoch": 0.13,
"grad_norm": 0.6404663920402527,
"learning_rate": 9.715099715099716e-06,
"logits/chosen": -0.21756196022033691,
"logits/rejected": -0.3667296767234802,
"logps/chosen": -111.0381088256836,
"logps/rejected": -113.7396011352539,
"loss": 0.0291,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8885473012924194,
"rewards/margins": 4.248038291931152,
"rewards/rejected": -2.3594906330108643,
"step": 410
},
{
"epoch": 0.14,
"grad_norm": 0.2947131395339966,
"learning_rate": 9.70414201183432e-06,
"logits/chosen": -0.21332868933677673,
"logits/rejected": -0.39654839038848877,
"logps/chosen": -184.7637939453125,
"logps/rejected": -165.0076904296875,
"loss": 0.016,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1396678686141968,
"rewards/margins": 4.9540605545043945,
"rewards/rejected": -3.8143928050994873,
"step": 420
},
{
"epoch": 0.14,
"grad_norm": 0.2587800621986389,
"learning_rate": 9.693184308568924e-06,
"logits/chosen": -0.15772652626037598,
"logits/rejected": -0.2967797815799713,
"logps/chosen": -126.998046875,
"logps/rejected": -127.1830825805664,
"loss": 0.0436,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7573896646499634,
"rewards/margins": 4.652812957763672,
"rewards/rejected": -2.895423412322998,
"step": 430
},
{
"epoch": 0.14,
"grad_norm": 0.18937894701957703,
"learning_rate": 9.68222660530353e-06,
"logits/chosen": -0.24051830172538757,
"logits/rejected": -0.3756439685821533,
"logps/chosen": -132.031494140625,
"logps/rejected": -143.25439453125,
"loss": 0.0317,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8294754028320312,
"rewards/margins": 4.800570487976074,
"rewards/rejected": -2.971095561981201,
"step": 440
},
{
"epoch": 0.15,
"grad_norm": 0.796097457408905,
"learning_rate": 9.671268902038133e-06,
"logits/chosen": -0.19832219183444977,
"logits/rejected": -0.351583868265152,
"logps/chosen": -128.74539184570312,
"logps/rejected": -126.41670227050781,
"loss": 0.0183,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6307995319366455,
"rewards/margins": 5.1652398109436035,
"rewards/rejected": -3.534440517425537,
"step": 450
},
{
"epoch": 0.15,
"grad_norm": 1.0965434312820435,
"learning_rate": 9.660311198772739e-06,
"logits/chosen": -0.1457364708185196,
"logits/rejected": -0.33718162775039673,
"logps/chosen": -153.73037719726562,
"logps/rejected": -153.25807189941406,
"loss": 0.0491,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8024961352348328,
"rewards/margins": 4.706381797790527,
"rewards/rejected": -3.903886079788208,
"step": 460
},
{
"epoch": 0.15,
"grad_norm": 0.4351007342338562,
"learning_rate": 9.649353495507341e-06,
"logits/chosen": -0.29992926120758057,
"logits/rejected": -0.45157748460769653,
"logps/chosen": -141.03488159179688,
"logps/rejected": -147.01473999023438,
"loss": 0.0354,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1137398481369019,
"rewards/margins": 4.605474948883057,
"rewards/rejected": -3.491734743118286,
"step": 470
},
{
"epoch": 0.16,
"grad_norm": 0.21928514540195465,
"learning_rate": 9.638395792241947e-06,
"logits/chosen": -0.2734539210796356,
"logits/rejected": -0.42611390352249146,
"logps/chosen": -178.15843200683594,
"logps/rejected": -171.8292999267578,
"loss": 0.0296,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0601214170455933,
"rewards/margins": 5.109463214874268,
"rewards/rejected": -4.049341678619385,
"step": 480
},
{
"epoch": 0.16,
"grad_norm": 0.37911301851272583,
"learning_rate": 9.627438088976552e-06,
"logits/chosen": -0.38659173250198364,
"logits/rejected": -0.5436467528343201,
"logps/chosen": -187.867431640625,
"logps/rejected": -188.06741333007812,
"loss": 0.0142,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8567007780075073,
"rewards/margins": 5.4622673988342285,
"rewards/rejected": -4.605566501617432,
"step": 490
},
{
"epoch": 0.16,
"grad_norm": 0.09822113811969757,
"learning_rate": 9.616480385711156e-06,
"logits/chosen": -0.37623029947280884,
"logits/rejected": -0.48185300827026367,
"logps/chosen": -202.08529663085938,
"logps/rejected": -199.71482849121094,
"loss": 0.0211,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.24511973559856415,
"rewards/margins": 5.403919219970703,
"rewards/rejected": -5.158799171447754,
"step": 500
},
{
"epoch": 0.16,
"grad_norm": 0.3931962251663208,
"learning_rate": 9.60552268244576e-06,
"logits/chosen": -0.2853025794029236,
"logits/rejected": -0.413346529006958,
"logps/chosen": -124.1753921508789,
"logps/rejected": -140.43246459960938,
"loss": 0.0171,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.2122681140899658,
"rewards/margins": 5.2854719161987305,
"rewards/rejected": -4.0732035636901855,
"step": 510
},
{
"epoch": 0.17,
"grad_norm": 1.2813736200332642,
"learning_rate": 9.594564979180364e-06,
"logits/chosen": -0.2814486622810364,
"logits/rejected": -0.5304259657859802,
"logps/chosen": -143.65679931640625,
"logps/rejected": -140.1746368408203,
"loss": 0.0151,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1810171604156494,
"rewards/margins": 5.255087375640869,
"rewards/rejected": -4.074069976806641,
"step": 520
},
{
"epoch": 0.17,
"grad_norm": 0.6368138790130615,
"learning_rate": 9.583607275914969e-06,
"logits/chosen": -0.23208048939704895,
"logits/rejected": -0.41035908460617065,
"logps/chosen": -152.8085174560547,
"logps/rejected": -158.786376953125,
"loss": 0.0101,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0337063074111938,
"rewards/margins": 5.832049369812012,
"rewards/rejected": -4.798343181610107,
"step": 530
},
{
"epoch": 0.17,
"grad_norm": 0.9199265241622925,
"learning_rate": 9.572649572649575e-06,
"logits/chosen": -0.3010120391845703,
"logits/rejected": -0.4592529237270355,
"logps/chosen": -217.2087860107422,
"logps/rejected": -198.68173217773438,
"loss": 0.0252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.41687291860580444,
"rewards/margins": 5.248563289642334,
"rewards/rejected": -5.6654372215271,
"step": 540
},
{
"epoch": 0.18,
"grad_norm": 0.013650404289364815,
"learning_rate": 9.561691869384177e-06,
"logits/chosen": -0.4347296357154846,
"logits/rejected": -0.6051202416419983,
"logps/chosen": -190.7089080810547,
"logps/rejected": -205.96047973632812,
"loss": 0.0149,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1618773639202118,
"rewards/margins": 6.865809440612793,
"rewards/rejected": -6.703932762145996,
"step": 550
},
{
"epoch": 0.18,
"grad_norm": 0.06230132654309273,
"learning_rate": 9.550734166118783e-06,
"logits/chosen": -0.3882770240306854,
"logits/rejected": -0.6216930747032166,
"logps/chosen": -175.98199462890625,
"logps/rejected": -177.3306427001953,
"loss": 0.0488,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": 0.032947491854429245,
"rewards/margins": 5.509705066680908,
"rewards/rejected": -5.476758003234863,
"step": 560
},
{
"epoch": 0.18,
"grad_norm": 0.16349956393241882,
"learning_rate": 9.539776462853386e-06,
"logits/chosen": -0.3122726082801819,
"logits/rejected": -0.4245499074459076,
"logps/chosen": -228.5991668701172,
"logps/rejected": -225.5084686279297,
"loss": 0.0161,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.799796462059021,
"rewards/margins": 5.888455390930176,
"rewards/rejected": -6.688251495361328,
"step": 570
},
{
"epoch": 0.19,
"grad_norm": 0.12062845379114151,
"learning_rate": 9.528818759587992e-06,
"logits/chosen": -0.5392414927482605,
"logits/rejected": -0.6697270274162292,
"logps/chosen": -170.96298217773438,
"logps/rejected": -198.06301879882812,
"loss": 0.0466,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.008090650662779808,
"rewards/margins": 6.091658592224121,
"rewards/rejected": -6.0835676193237305,
"step": 580
},
{
"epoch": 0.19,
"grad_norm": 0.3678707778453827,
"learning_rate": 9.517861056322596e-06,
"logits/chosen": -0.4352661669254303,
"logits/rejected": -0.5481756925582886,
"logps/chosen": -174.7131805419922,
"logps/rejected": -171.861572265625,
"loss": 0.0116,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18627242743968964,
"rewards/margins": 6.163498878479004,
"rewards/rejected": -5.977226257324219,
"step": 590
},
{
"epoch": 0.19,
"grad_norm": 0.07159756124019623,
"learning_rate": 9.5069033530572e-06,
"logits/chosen": -0.20721349120140076,
"logits/rejected": -0.417985737323761,
"logps/chosen": -187.81405639648438,
"logps/rejected": -185.33331298828125,
"loss": 0.0098,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07214238494634628,
"rewards/margins": 5.932024002075195,
"rewards/rejected": -5.85988187789917,
"step": 600
},
{
"epoch": 0.2,
"grad_norm": 0.09394218772649765,
"learning_rate": 9.495945649791805e-06,
"logits/chosen": -0.24830050766468048,
"logits/rejected": -0.36983978748321533,
"logps/chosen": -129.41619873046875,
"logps/rejected": -145.60447692871094,
"loss": 0.0193,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3323943018913269,
"rewards/margins": 6.081761837005615,
"rewards/rejected": -5.749367713928223,
"step": 610
},
{
"epoch": 0.2,
"grad_norm": 0.561514675617218,
"learning_rate": 9.484987946526409e-06,
"logits/chosen": -0.3996688723564148,
"logits/rejected": -0.5800243020057678,
"logps/chosen": -202.79257202148438,
"logps/rejected": -203.62704467773438,
"loss": 0.0457,
"rewards/accuracies": 0.966666579246521,
"rewards/chosen": -0.36100807785987854,
"rewards/margins": 6.192732810974121,
"rewards/rejected": -6.553740501403809,
"step": 620
},
{
"epoch": 0.2,
"grad_norm": 0.08309807628393173,
"learning_rate": 9.474030243261013e-06,
"logits/chosen": -0.3754512667655945,
"logits/rejected": -0.5741219520568848,
"logps/chosen": -174.9481658935547,
"logps/rejected": -182.9730224609375,
"loss": 0.0058,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4041864275932312,
"rewards/margins": 6.596762180328369,
"rewards/rejected": -7.000948905944824,
"step": 630
},
{
"epoch": 0.21,
"grad_norm": 0.0921340063214302,
"learning_rate": 9.463072539995617e-06,
"logits/chosen": -0.30068254470825195,
"logits/rejected": -0.45074382424354553,
"logps/chosen": -173.031005859375,
"logps/rejected": -197.98251342773438,
"loss": 0.0166,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.10172281414270401,
"rewards/margins": 6.632701873779297,
"rewards/rejected": -6.734424591064453,
"step": 640
},
{
"epoch": 0.21,
"grad_norm": 0.037897739559412,
"learning_rate": 9.452114836730222e-06,
"logits/chosen": -0.42777299880981445,
"logits/rejected": -0.4529387354850769,
"logps/chosen": -238.9905242919922,
"logps/rejected": -260.50054931640625,
"loss": 0.0165,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3973451852798462,
"rewards/margins": 6.604907035827637,
"rewards/rejected": -8.002251625061035,
"step": 650
},
{
"epoch": 0.21,
"grad_norm": 0.19550499320030212,
"learning_rate": 9.441157133464826e-06,
"logits/chosen": -0.23329667747020721,
"logits/rejected": -0.44183143973350525,
"logps/chosen": -139.6894073486328,
"logps/rejected": -164.62615966796875,
"loss": 0.0121,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.24222126603126526,
"rewards/margins": 6.836798667907715,
"rewards/rejected": -6.594576835632324,
"step": 660
},
{
"epoch": 0.22,
"grad_norm": 0.3139669895172119,
"learning_rate": 9.430199430199432e-06,
"logits/chosen": -0.4021090865135193,
"logits/rejected": -0.5786430239677429,
"logps/chosen": -181.46817016601562,
"logps/rejected": -197.1630096435547,
"loss": 0.0187,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.535909116268158,
"rewards/margins": 6.705661773681641,
"rewards/rejected": -7.241570949554443,
"step": 670
},
{
"epoch": 0.22,
"grad_norm": 0.05340191349387169,
"learning_rate": 9.419241726934035e-06,
"logits/chosen": -0.42853325605392456,
"logits/rejected": -0.6143732070922852,
"logps/chosen": -168.3545379638672,
"logps/rejected": -188.1068878173828,
"loss": 0.0095,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3123434782028198,
"rewards/margins": 6.992964267730713,
"rewards/rejected": -7.305306911468506,
"step": 680
},
{
"epoch": 0.22,
"grad_norm": 2.9327523708343506,
"learning_rate": 9.40828402366864e-06,
"logits/chosen": -0.38912615180015564,
"logits/rejected": -0.6179540753364563,
"logps/chosen": -255.75888061523438,
"logps/rejected": -249.5757598876953,
"loss": 0.0143,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.6745103597640991,
"rewards/margins": 7.797115325927734,
"rewards/rejected": -9.471625328063965,
"step": 690
},
{
"epoch": 0.23,
"grad_norm": 0.03456411138176918,
"learning_rate": 9.397326320403243e-06,
"logits/chosen": -0.5495766997337341,
"logits/rejected": -0.6684257984161377,
"logps/chosen": -168.55702209472656,
"logps/rejected": -188.98629760742188,
"loss": 0.0119,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.858353316783905,
"rewards/margins": 6.359633445739746,
"rewards/rejected": -7.217986106872559,
"step": 700
},
{
"epoch": 0.23,
"grad_norm": 0.06367018818855286,
"learning_rate": 9.386368617137849e-06,
"logits/chosen": -0.4233613610267639,
"logits/rejected": -0.6099605560302734,
"logps/chosen": -175.0663604736328,
"logps/rejected": -189.76922607421875,
"loss": 0.0294,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.5615413188934326,
"rewards/margins": 6.164674282073975,
"rewards/rejected": -7.7262163162231445,
"step": 710
},
{
"epoch": 0.23,
"grad_norm": 2.003807306289673,
"learning_rate": 9.375410913872453e-06,
"logits/chosen": -0.508701741695404,
"logits/rejected": -0.6367843747138977,
"logps/chosen": -201.49270629882812,
"logps/rejected": -214.9327850341797,
"loss": 0.0095,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2017481327056885,
"rewards/margins": 7.067420959472656,
"rewards/rejected": -8.269168853759766,
"step": 720
},
{
"epoch": 0.24,
"grad_norm": 0.8275896310806274,
"learning_rate": 9.364453210607058e-06,
"logits/chosen": -0.42019587755203247,
"logits/rejected": -0.5408454537391663,
"logps/chosen": -204.19387817382812,
"logps/rejected": -225.56607055664062,
"loss": 0.0117,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.0568044185638428,
"rewards/margins": 7.597814083099365,
"rewards/rejected": -9.654618263244629,
"step": 730
},
{
"epoch": 0.24,
"grad_norm": 0.14503268897533417,
"learning_rate": 9.353495507341662e-06,
"logits/chosen": -0.39080002903938293,
"logits/rejected": -0.5268298983573914,
"logps/chosen": -184.9230194091797,
"logps/rejected": -208.14492797851562,
"loss": 0.0479,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": -2.121675968170166,
"rewards/margins": 7.246522426605225,
"rewards/rejected": -9.368197441101074,
"step": 740
},
{
"epoch": 0.24,
"grad_norm": 0.9217634797096252,
"learning_rate": 9.342537804076266e-06,
"logits/chosen": -0.37332993745803833,
"logits/rejected": -0.5701053142547607,
"logps/chosen": -175.7810516357422,
"logps/rejected": -201.13003540039062,
"loss": 0.0408,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": -1.581413745880127,
"rewards/margins": 6.754090309143066,
"rewards/rejected": -8.335504531860352,
"step": 750
},
{
"epoch": 0.25,
"grad_norm": 0.026290835812687874,
"learning_rate": 9.33158010081087e-06,
"logits/chosen": -0.4100477695465088,
"logits/rejected": -0.5200067162513733,
"logps/chosen": -233.86337280273438,
"logps/rejected": -258.96881103515625,
"loss": 0.0723,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": -2.9427809715270996,
"rewards/margins": 6.980525016784668,
"rewards/rejected": -9.923307418823242,
"step": 760
},
{
"epoch": 0.25,
"grad_norm": 0.37062278389930725,
"learning_rate": 9.320622397545477e-06,
"logits/chosen": -0.3395642936229706,
"logits/rejected": -0.4273925721645355,
"logps/chosen": -242.663818359375,
"logps/rejected": -268.10089111328125,
"loss": 0.0156,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.880597114562988,
"rewards/margins": 7.653998374938965,
"rewards/rejected": -12.534595489501953,
"step": 770
},
{
"epoch": 0.25,
"grad_norm": 0.0023701719474047422,
"learning_rate": 9.309664694280079e-06,
"logits/chosen": -0.4241916537284851,
"logits/rejected": -0.5316244959831238,
"logps/chosen": -182.79238891601562,
"logps/rejected": -211.8367156982422,
"loss": 0.0338,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.6987730264663696,
"rewards/margins": 7.190474510192871,
"rewards/rejected": -8.88924789428711,
"step": 780
},
{
"epoch": 0.26,
"grad_norm": 0.008612029254436493,
"learning_rate": 9.298706991014685e-06,
"logits/chosen": -0.4729032516479492,
"logits/rejected": -0.5640527606010437,
"logps/chosen": -238.87527465820312,
"logps/rejected": -276.45318603515625,
"loss": 0.0358,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": -4.509866237640381,
"rewards/margins": 8.078734397888184,
"rewards/rejected": -12.588600158691406,
"step": 790
},
{
"epoch": 0.26,
"grad_norm": 0.01925979182124138,
"learning_rate": 9.287749287749288e-06,
"logits/chosen": -0.42346763610839844,
"logits/rejected": -0.5876745581626892,
"logps/chosen": -217.486572265625,
"logps/rejected": -251.65774536132812,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.6943228244781494,
"rewards/margins": 9.3502197265625,
"rewards/rejected": -12.044544219970703,
"step": 800
},
{
"epoch": 0.26,
"grad_norm": 0.13818100094795227,
"learning_rate": 9.276791584483894e-06,
"logits/chosen": -0.4230351448059082,
"logits/rejected": -0.5674090385437012,
"logps/chosen": -217.064453125,
"logps/rejected": -245.08786010742188,
"loss": 0.0044,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.9776084423065186,
"rewards/margins": 8.626860618591309,
"rewards/rejected": -12.604470252990723,
"step": 810
},
{
"epoch": 0.27,
"grad_norm": 0.011727742850780487,
"learning_rate": 9.265833881218498e-06,
"logits/chosen": -0.4069291055202484,
"logits/rejected": -0.5598689913749695,
"logps/chosen": -172.05592346191406,
"logps/rejected": -200.79318237304688,
"loss": 0.0204,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.5719494819641113,
"rewards/margins": 7.285019874572754,
"rewards/rejected": -9.856969833374023,
"step": 820
},
{
"epoch": 0.27,
"grad_norm": 1.0111483335494995,
"learning_rate": 9.254876177953102e-06,
"logits/chosen": -0.4388393759727478,
"logits/rejected": -0.5350168347358704,
"logps/chosen": -306.37823486328125,
"logps/rejected": -342.58935546875,
"loss": 0.0042,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.371355056762695,
"rewards/margins": 10.12095832824707,
"rewards/rejected": -16.492313385009766,
"step": 830
},
{
"epoch": 0.27,
"grad_norm": 0.0097459452226758,
"learning_rate": 9.243918474687706e-06,
"logits/chosen": -0.3321714401245117,
"logits/rejected": -0.5004499554634094,
"logps/chosen": -181.58326721191406,
"logps/rejected": -213.7935333251953,
"loss": 0.0031,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.546564817428589,
"rewards/margins": 8.579878807067871,
"rewards/rejected": -11.126443862915039,
"step": 840
},
{
"epoch": 0.27,
"grad_norm": 0.0109526002779603,
"learning_rate": 9.23296077142231e-06,
"logits/chosen": -0.4445480704307556,
"logits/rejected": -0.5257662534713745,
"logps/chosen": -201.4680938720703,
"logps/rejected": -272.65118408203125,
"loss": 0.0059,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.2386157512664795,
"rewards/margins": 10.427217483520508,
"rewards/rejected": -13.665834426879883,
"step": 850
},
{
"epoch": 0.28,
"grad_norm": 4.875131607055664,
"learning_rate": 9.222003068156915e-06,
"logits/chosen": -0.4273023009300232,
"logits/rejected": -0.5373457670211792,
"logps/chosen": -263.1457824707031,
"logps/rejected": -278.6871032714844,
"loss": 0.0439,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.379282474517822,
"rewards/margins": 8.077393531799316,
"rewards/rejected": -12.456674575805664,
"step": 860
},
{
"epoch": 0.28,
"grad_norm": 0.012556626461446285,
"learning_rate": 9.21104536489152e-06,
"logits/chosen": -0.2668747007846832,
"logits/rejected": -0.44963914155960083,
"logps/chosen": -210.3493194580078,
"logps/rejected": -252.1433563232422,
"loss": 0.0131,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.3864264488220215,
"rewards/margins": 8.853643417358398,
"rewards/rejected": -12.240070343017578,
"step": 870
},
{
"epoch": 0.28,
"grad_norm": 0.024654850363731384,
"learning_rate": 9.200087661626124e-06,
"logits/chosen": -0.46519985795021057,
"logits/rejected": -0.5054847002029419,
"logps/chosen": -242.70095825195312,
"logps/rejected": -250.40383911132812,
"loss": 0.0063,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.888115406036377,
"rewards/margins": 9.060136795043945,
"rewards/rejected": -12.948251724243164,
"step": 880
},
{
"epoch": 0.29,
"grad_norm": 0.0028619980439543724,
"learning_rate": 9.189129958360728e-06,
"logits/chosen": -0.39419493079185486,
"logits/rejected": -0.4969407916069031,
"logps/chosen": -210.2559814453125,
"logps/rejected": -268.48870849609375,
"loss": 0.0542,
"rewards/accuracies": 0.966666579246521,
"rewards/chosen": -4.357173442840576,
"rewards/margins": 9.57945442199707,
"rewards/rejected": -13.936625480651855,
"step": 890
},
{
"epoch": 0.29,
"grad_norm": 0.5537806153297424,
"learning_rate": 9.178172255095332e-06,
"logits/chosen": -0.36690598726272583,
"logits/rejected": -0.5239487886428833,
"logps/chosen": -189.4766082763672,
"logps/rejected": -234.23739624023438,
"loss": 0.0072,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.8814499378204346,
"rewards/margins": 8.542583465576172,
"rewards/rejected": -12.424032211303711,
"step": 900
},
{
"epoch": 0.29,
"grad_norm": 0.0037552732974290848,
"learning_rate": 9.167214551829936e-06,
"logits/chosen": -0.2370346486568451,
"logits/rejected": -0.35934606194496155,
"logps/chosen": -254.87759399414062,
"logps/rejected": -300.49700927734375,
"loss": 0.0051,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.661134243011475,
"rewards/margins": 9.538171768188477,
"rewards/rejected": -16.19930648803711,
"step": 910
},
{
"epoch": 0.3,
"grad_norm": 0.016242943704128265,
"learning_rate": 9.156256848564542e-06,
"logits/chosen": -0.3371526598930359,
"logits/rejected": -0.4652339518070221,
"logps/chosen": -173.8836212158203,
"logps/rejected": -223.115478515625,
"loss": 0.0034,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.5612175464630127,
"rewards/margins": 9.151124954223633,
"rewards/rejected": -12.712343215942383,
"step": 920
},
{
"epoch": 0.3,
"grad_norm": 0.024046355858445168,
"learning_rate": 9.145299145299145e-06,
"logits/chosen": -0.3485228419303894,
"logits/rejected": -0.3939455449581146,
"logps/chosen": -290.3489990234375,
"logps/rejected": -343.43817138671875,
"loss": 0.0023,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.7552947998046875,
"rewards/margins": 10.854909896850586,
"rewards/rejected": -18.610204696655273,
"step": 930
},
{
"epoch": 0.3,
"grad_norm": 0.046042028814554214,
"learning_rate": 9.134341442033751e-06,
"logits/chosen": -0.31552404165267944,
"logits/rejected": -0.4237368106842041,
"logps/chosen": -241.5800323486328,
"logps/rejected": -283.7861328125,
"loss": 0.0021,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.1691718101501465,
"rewards/margins": 9.845176696777344,
"rewards/rejected": -15.014348030090332,
"step": 940
},
{
"epoch": 0.31,
"grad_norm": 2.655742883682251,
"learning_rate": 9.123383738768354e-06,
"logits/chosen": -0.28424203395843506,
"logits/rejected": -0.40134358406066895,
"logps/chosen": -250.8709716796875,
"logps/rejected": -309.8855285644531,
"loss": 0.0069,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.845712184906006,
"rewards/margins": 10.40029239654541,
"rewards/rejected": -16.24600601196289,
"step": 950
},
{
"epoch": 0.31,
"grad_norm": 0.00028849352383986115,
"learning_rate": 9.11242603550296e-06,
"logits/chosen": -0.2519022524356842,
"logits/rejected": -0.3044319748878479,
"logps/chosen": -256.7398986816406,
"logps/rejected": -308.69329833984375,
"loss": 0.0131,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.2683329582214355,
"rewards/margins": 10.402109146118164,
"rewards/rejected": -16.670442581176758,
"step": 960
},
{
"epoch": 0.31,
"grad_norm": 0.0053086057305336,
"learning_rate": 9.101468332237564e-06,
"logits/chosen": -0.2518306076526642,
"logits/rejected": -0.3324928879737854,
"logps/chosen": -247.19223022460938,
"logps/rejected": -299.892822265625,
"loss": 0.0088,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.235478401184082,
"rewards/margins": 10.305525779724121,
"rewards/rejected": -17.541004180908203,
"step": 970
},
{
"epoch": 0.32,
"grad_norm": 0.054621316492557526,
"learning_rate": 9.090510628972168e-06,
"logits/chosen": -0.18655958771705627,
"logits/rejected": -0.30609598755836487,
"logps/chosen": -212.97329711914062,
"logps/rejected": -268.6554260253906,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.168915748596191,
"rewards/margins": 10.391229629516602,
"rewards/rejected": -15.560145378112793,
"step": 980
},
{
"epoch": 0.32,
"grad_norm": 0.023384546861052513,
"learning_rate": 9.079552925706772e-06,
"logits/chosen": -0.3303782641887665,
"logits/rejected": -0.38056522607803345,
"logps/chosen": -213.8797149658203,
"logps/rejected": -288.4650573730469,
"loss": 0.0124,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.234591484069824,
"rewards/margins": 11.60734748840332,
"rewards/rejected": -16.84193992614746,
"step": 990
},
{
"epoch": 0.32,
"grad_norm": 0.028016259893774986,
"learning_rate": 9.068595222441378e-06,
"logits/chosen": -0.3271247446537018,
"logits/rejected": -0.3924880623817444,
"logps/chosen": -315.4078369140625,
"logps/rejected": -360.6003112792969,
"loss": 0.0054,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.352377891540527,
"rewards/margins": 10.600927352905273,
"rewards/rejected": -19.953306198120117,
"step": 1000
},
{
"epoch": 0.33,
"grad_norm": 0.039689064025878906,
"learning_rate": 9.057637519175981e-06,
"logits/chosen": -0.3742726743221283,
"logits/rejected": -0.5176577568054199,
"logps/chosen": -260.1962585449219,
"logps/rejected": -283.95538330078125,
"loss": 0.0131,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.897824764251709,
"rewards/margins": 8.730276107788086,
"rewards/rejected": -16.628101348876953,
"step": 1010
},
{
"epoch": 0.33,
"grad_norm": 0.950749933719635,
"learning_rate": 9.046679815910587e-06,
"logits/chosen": -0.29500845074653625,
"logits/rejected": -0.342989444732666,
"logps/chosen": -235.64990234375,
"logps/rejected": -296.71636962890625,
"loss": 0.0046,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.87273645401001,
"rewards/margins": 11.365694046020508,
"rewards/rejected": -17.23843002319336,
"step": 1020
},
{
"epoch": 0.33,
"grad_norm": 0.7492702603340149,
"learning_rate": 9.03572211264519e-06,
"logits/chosen": -0.29514080286026,
"logits/rejected": -0.3748084008693695,
"logps/chosen": -318.6951904296875,
"logps/rejected": -366.6182556152344,
"loss": 0.0025,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.613062858581543,
"rewards/margins": 12.006413459777832,
"rewards/rejected": -20.619478225708008,
"step": 1030
},
{
"epoch": 0.34,
"grad_norm": 0.025907784700393677,
"learning_rate": 9.024764409379796e-06,
"logits/chosen": -0.2838585674762726,
"logits/rejected": -0.290244460105896,
"logps/chosen": -312.25347900390625,
"logps/rejected": -396.3223571777344,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.871122360229492,
"rewards/margins": 13.518135070800781,
"rewards/rejected": -23.38925552368164,
"step": 1040
},
{
"epoch": 0.34,
"grad_norm": 3.519346864777617e-05,
"learning_rate": 9.0138067061144e-06,
"logits/chosen": -0.33469122648239136,
"logits/rejected": -0.4082818627357483,
"logps/chosen": -285.1815185546875,
"logps/rejected": -362.09332275390625,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.829544067382812,
"rewards/margins": 12.795137405395508,
"rewards/rejected": -21.624683380126953,
"step": 1050
},
{
"epoch": 0.34,
"grad_norm": 4.468189217732288e-05,
"learning_rate": 9.002849002849004e-06,
"logits/chosen": -0.1854289472103119,
"logits/rejected": -0.19475221633911133,
"logps/chosen": -247.6388397216797,
"logps/rejected": -315.0453186035156,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.3854804039001465,
"rewards/margins": 12.116926193237305,
"rewards/rejected": -19.50240707397461,
"step": 1060
},
{
"epoch": 0.35,
"grad_norm": 0.0009876694530248642,
"learning_rate": 8.991891299583608e-06,
"logits/chosen": -0.24933210015296936,
"logits/rejected": -0.28711989521980286,
"logps/chosen": -255.6931610107422,
"logps/rejected": -328.758056640625,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.750466346740723,
"rewards/margins": 12.743404388427734,
"rewards/rejected": -19.493867874145508,
"step": 1070
},
{
"epoch": 0.35,
"grad_norm": 14.395219802856445,
"learning_rate": 8.980933596318213e-06,
"logits/chosen": -0.28123170137405396,
"logits/rejected": -0.31940752267837524,
"logps/chosen": -244.03466796875,
"logps/rejected": -313.8268737792969,
"loss": 0.0115,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.721076965332031,
"rewards/margins": 10.604429244995117,
"rewards/rejected": -18.32550811767578,
"step": 1080
},
{
"epoch": 0.35,
"grad_norm": 0.04704693332314491,
"learning_rate": 8.969975893052817e-06,
"logits/chosen": -0.28245311975479126,
"logits/rejected": -0.3764280378818512,
"logps/chosen": -248.2042694091797,
"logps/rejected": -286.869873046875,
"loss": 0.0033,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.773460388183594,
"rewards/margins": 10.64108943939209,
"rewards/rejected": -17.414548873901367,
"step": 1090
},
{
"epoch": 0.36,
"grad_norm": 0.00043834373354911804,
"learning_rate": 8.959018189787421e-06,
"logits/chosen": -0.20289742946624756,
"logits/rejected": -0.2996065020561218,
"logps/chosen": -297.94512939453125,
"logps/rejected": -347.0626525878906,
"loss": 0.0139,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.10444164276123,
"rewards/margins": 12.04673957824707,
"rewards/rejected": -21.151180267333984,
"step": 1100
},
{
"epoch": 0.36,
"grad_norm": 0.12115999311208725,
"learning_rate": 8.948060486522026e-06,
"logits/chosen": -0.26539546251296997,
"logits/rejected": -0.33644038438796997,
"logps/chosen": -306.0853576660156,
"logps/rejected": -363.36517333984375,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.828388214111328,
"rewards/margins": 11.639673233032227,
"rewards/rejected": -20.468059539794922,
"step": 1110
},
{
"epoch": 0.36,
"grad_norm": 0.01209923718124628,
"learning_rate": 8.93710278325663e-06,
"logits/chosen": -0.20330910384655,
"logits/rejected": -0.2543596625328064,
"logps/chosen": -267.58465576171875,
"logps/rejected": -339.16552734375,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.954190254211426,
"rewards/margins": 12.7907133102417,
"rewards/rejected": -20.744905471801758,
"step": 1120
},
{
"epoch": 0.37,
"grad_norm": 0.017156679183244705,
"learning_rate": 8.926145079991234e-06,
"logits/chosen": -0.21037821471691132,
"logits/rejected": -0.335681676864624,
"logps/chosen": -238.2596435546875,
"logps/rejected": -287.96832275390625,
"loss": 0.1154,
"rewards/accuracies": 0.966666579246521,
"rewards/chosen": -6.764012813568115,
"rewards/margins": 10.85376262664795,
"rewards/rejected": -17.617774963378906,
"step": 1130
},
{
"epoch": 0.37,
"grad_norm": 0.01817765086889267,
"learning_rate": 8.915187376725838e-06,
"logits/chosen": -0.27532559633255005,
"logits/rejected": -0.3343648314476013,
"logps/chosen": -237.79660034179688,
"logps/rejected": -278.28424072265625,
"loss": 0.0056,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.450657844543457,
"rewards/margins": 9.784764289855957,
"rewards/rejected": -16.235422134399414,
"step": 1140
},
{
"epoch": 0.37,
"grad_norm": 0.3464580178260803,
"learning_rate": 8.904229673460444e-06,
"logits/chosen": -0.20038633048534393,
"logits/rejected": -0.34763047099113464,
"logps/chosen": -264.88238525390625,
"logps/rejected": -303.27056884765625,
"loss": 0.0065,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.401070594787598,
"rewards/margins": 10.594339370727539,
"rewards/rejected": -16.995410919189453,
"step": 1150
},
{
"epoch": 0.38,
"grad_norm": 0.010944131761789322,
"learning_rate": 8.893271970195047e-06,
"logits/chosen": -0.3168713450431824,
"logits/rejected": -0.3542029857635498,
"logps/chosen": -245.4883270263672,
"logps/rejected": -310.0267639160156,
"loss": 0.029,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.169398307800293,
"rewards/margins": 11.141454696655273,
"rewards/rejected": -17.310853958129883,
"step": 1160
},
{
"epoch": 0.38,
"grad_norm": 0.012095646932721138,
"learning_rate": 8.882314266929653e-06,
"logits/chosen": -0.2291782796382904,
"logits/rejected": -0.34653568267822266,
"logps/chosen": -222.2202606201172,
"logps/rejected": -258.69512939453125,
"loss": 0.0026,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.978119850158691,
"rewards/margins": 9.679037094116211,
"rewards/rejected": -14.657157897949219,
"step": 1170
},
{
"epoch": 0.38,
"grad_norm": 0.14866305887699127,
"learning_rate": 8.871356563664255e-06,
"logits/chosen": -0.21821892261505127,
"logits/rejected": -0.28071773052215576,
"logps/chosen": -199.72018432617188,
"logps/rejected": -272.10589599609375,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.8853073120117188,
"rewards/margins": 11.493281364440918,
"rewards/rejected": -15.37859058380127,
"step": 1180
},
{
"epoch": 0.38,
"grad_norm": 10.241555213928223,
"learning_rate": 8.860398860398861e-06,
"logits/chosen": -0.13861140608787537,
"logits/rejected": -0.20187318325042725,
"logps/chosen": -193.94729614257812,
"logps/rejected": -244.1735076904297,
"loss": 0.183,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": -4.192530632019043,
"rewards/margins": 9.562729835510254,
"rewards/rejected": -13.755261421203613,
"step": 1190
},
{
"epoch": 0.39,
"grad_norm": 0.001308279111981392,
"learning_rate": 8.849441157133466e-06,
"logits/chosen": -0.2188553512096405,
"logits/rejected": -0.27837082743644714,
"logps/chosen": -256.08392333984375,
"logps/rejected": -314.3865661621094,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.751282691955566,
"rewards/margins": 11.724446296691895,
"rewards/rejected": -16.47572898864746,
"step": 1200
},
{
"epoch": 0.39,
"grad_norm": 0.0011354751186445355,
"learning_rate": 8.83848345386807e-06,
"logits/chosen": -0.33862900733947754,
"logits/rejected": -0.3313067555427551,
"logps/chosen": -259.02044677734375,
"logps/rejected": -325.8959655761719,
"loss": 0.0244,
"rewards/accuracies": 0.966666579246521,
"rewards/chosen": -5.828982830047607,
"rewards/margins": 12.15399169921875,
"rewards/rejected": -17.982975006103516,
"step": 1210
},
{
"epoch": 0.39,
"grad_norm": 0.022255534306168556,
"learning_rate": 8.827525750602674e-06,
"logits/chosen": -0.1525467336177826,
"logits/rejected": -0.30764085054397583,
"logps/chosen": -164.30361938476562,
"logps/rejected": -220.8531494140625,
"loss": 0.0072,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.6489479541778564,
"rewards/margins": 9.622652053833008,
"rewards/rejected": -13.271600723266602,
"step": 1220
},
{
"epoch": 0.4,
"grad_norm": 0.00422575231641531,
"learning_rate": 8.816568047337279e-06,
"logits/chosen": -0.3136499524116516,
"logits/rejected": -0.3792189657688141,
"logps/chosen": -228.77914428710938,
"logps/rejected": -287.58441162109375,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.276397705078125,
"rewards/margins": 11.693455696105957,
"rewards/rejected": -16.969852447509766,
"step": 1230
},
{
"epoch": 0.4,
"grad_norm": 0.0019199317321181297,
"learning_rate": 8.805610344071883e-06,
"logits/chosen": -0.26560407876968384,
"logits/rejected": -0.3271743655204773,
"logps/chosen": -268.156982421875,
"logps/rejected": -299.9332580566406,
"loss": 0.0046,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.225447654724121,
"rewards/margins": 10.738948822021484,
"rewards/rejected": -16.964397430419922,
"step": 1240
},
{
"epoch": 0.4,
"grad_norm": 0.006867074873298407,
"learning_rate": 8.794652640806489e-06,
"logits/chosen": -0.15553151071071625,
"logits/rejected": -0.21395400166511536,
"logps/chosen": -241.8409423828125,
"logps/rejected": -289.00848388671875,
"loss": 0.0097,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.778923988342285,
"rewards/margins": 9.904958724975586,
"rewards/rejected": -16.683881759643555,
"step": 1250
},
{
"epoch": 0.41,
"grad_norm": 0.009473240934312344,
"learning_rate": 8.783694937541091e-06,
"logits/chosen": -0.23859646916389465,
"logits/rejected": -0.27141764760017395,
"logps/chosen": -215.42276000976562,
"logps/rejected": -278.4677429199219,
"loss": 0.0084,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.839932441711426,
"rewards/margins": 11.050471305847168,
"rewards/rejected": -16.890405654907227,
"step": 1260
},
{
"epoch": 0.41,
"grad_norm": 0.0002466948644723743,
"learning_rate": 8.772737234275697e-06,
"logits/chosen": -0.26885563135147095,
"logits/rejected": -0.31586629152297974,
"logps/chosen": -248.76797485351562,
"logps/rejected": -342.48406982421875,
"loss": 0.002,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.838107109069824,
"rewards/margins": 14.081052780151367,
"rewards/rejected": -20.919160842895508,
"step": 1270
},
{
"epoch": 0.41,
"grad_norm": 0.009549058973789215,
"learning_rate": 8.7617795310103e-06,
"logits/chosen": -0.10880441963672638,
"logits/rejected": -0.16234132647514343,
"logps/chosen": -250.2044677734375,
"logps/rejected": -319.41778564453125,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.536848545074463,
"rewards/margins": 12.7142972946167,
"rewards/rejected": -20.251144409179688,
"step": 1280
},
{
"epoch": 0.42,
"grad_norm": 0.006435507442802191,
"learning_rate": 8.750821827744906e-06,
"logits/chosen": -0.18445457518100739,
"logits/rejected": -0.18460145592689514,
"logps/chosen": -262.4994201660156,
"logps/rejected": -336.7874755859375,
"loss": 0.0339,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": -7.42992639541626,
"rewards/margins": 12.39716911315918,
"rewards/rejected": -19.82709503173828,
"step": 1290
},
{
"epoch": 0.42,
"grad_norm": 0.23167112469673157,
"learning_rate": 8.73986412447951e-06,
"logits/chosen": -0.14932586252689362,
"logits/rejected": -0.19626209139823914,
"logps/chosen": -247.1484832763672,
"logps/rejected": -318.3656921386719,
"loss": 0.0037,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.087119102478027,
"rewards/margins": 11.31184196472168,
"rewards/rejected": -18.39896011352539,
"step": 1300
},
{
"epoch": 0.42,
"grad_norm": 0.0005600708536803722,
"learning_rate": 8.728906421214115e-06,
"logits/chosen": -0.1394408792257309,
"logits/rejected": -0.14028649032115936,
"logps/chosen": -289.9681701660156,
"logps/rejected": -364.3168640136719,
"loss": 0.0081,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.336925506591797,
"rewards/margins": 12.717875480651855,
"rewards/rejected": -22.054800033569336,
"step": 1310
},
{
"epoch": 0.43,
"grad_norm": 0.14742839336395264,
"learning_rate": 8.717948717948719e-06,
"logits/chosen": -0.06897391378879547,
"logits/rejected": -0.10997577756643295,
"logps/chosen": -258.9363708496094,
"logps/rejected": -328.3492736816406,
"loss": 0.002,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.73633098602295,
"rewards/margins": 11.629599571228027,
"rewards/rejected": -20.365930557250977,
"step": 1320
},
{
"epoch": 0.43,
"grad_norm": 0.0032621161080896854,
"learning_rate": 8.706991014683323e-06,
"logits/chosen": 0.09344655275344849,
"logits/rejected": 0.04874902218580246,
"logps/chosen": -256.26904296875,
"logps/rejected": -334.4007263183594,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.622756958007812,
"rewards/margins": 12.946261405944824,
"rewards/rejected": -21.569019317626953,
"step": 1330
},
{
"epoch": 0.43,
"grad_norm": 0.00016526717809028924,
"learning_rate": 8.696033311417927e-06,
"logits/chosen": -0.011213278397917747,
"logits/rejected": -0.029890483245253563,
"logps/chosen": -285.08502197265625,
"logps/rejected": -352.0074462890625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.995809555053711,
"rewards/margins": 12.493630409240723,
"rewards/rejected": -22.489439010620117,
"step": 1340
},
{
"epoch": 0.44,
"grad_norm": 0.0005608515930362046,
"learning_rate": 8.685075608152532e-06,
"logits/chosen": -0.0565456822514534,
"logits/rejected": -0.04827792942523956,
"logps/chosen": -273.8840637207031,
"logps/rejected": -346.35858154296875,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.661928176879883,
"rewards/margins": 12.965700149536133,
"rewards/rejected": -21.627628326416016,
"step": 1350
},
{
"epoch": 0.44,
"grad_norm": 0.2561068832874298,
"learning_rate": 8.674117904887136e-06,
"logits/chosen": -0.14858858287334442,
"logits/rejected": -0.1448180228471756,
"logps/chosen": -297.3650207519531,
"logps/rejected": -404.10540771484375,
"loss": 0.0238,
"rewards/accuracies": 0.966666579246521,
"rewards/chosen": -10.200472831726074,
"rewards/margins": 15.200775146484375,
"rewards/rejected": -25.401248931884766,
"step": 1360
},
{
"epoch": 0.44,
"grad_norm": 0.6736346483230591,
"learning_rate": 8.66316020162174e-06,
"logits/chosen": 0.014827290549874306,
"logits/rejected": 0.013812633231282234,
"logps/chosen": -257.91583251953125,
"logps/rejected": -342.5897216796875,
"loss": 0.0117,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.798688888549805,
"rewards/margins": 14.296686172485352,
"rewards/rejected": -23.095375061035156,
"step": 1370
},
{
"epoch": 0.45,
"grad_norm": 5.739891093980987e-06,
"learning_rate": 8.652202498356346e-06,
"logits/chosen": -0.004905500914901495,
"logits/rejected": 0.008281905204057693,
"logps/chosen": -284.5867614746094,
"logps/rejected": -372.309326171875,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.582194328308105,
"rewards/margins": 13.63042163848877,
"rewards/rejected": -24.212615966796875,
"step": 1380
},
{
"epoch": 0.45,
"grad_norm": 10.869261741638184,
"learning_rate": 8.641244795090949e-06,
"logits/chosen": -0.06507638841867447,
"logits/rejected": -0.0575793981552124,
"logps/chosen": -294.3569641113281,
"logps/rejected": -353.50250244140625,
"loss": 0.0136,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.367159843444824,
"rewards/margins": 11.184629440307617,
"rewards/rejected": -22.551788330078125,
"step": 1390
},
{
"epoch": 0.45,
"grad_norm": 0.0022530544083565474,
"learning_rate": 8.630287091825555e-06,
"logits/chosen": 0.012960417196154594,
"logits/rejected": -0.018600907176733017,
"logps/chosen": -246.05801391601562,
"logps/rejected": -325.35577392578125,
"loss": 0.021,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.39463996887207,
"rewards/margins": 12.46562385559082,
"rewards/rejected": -20.86026382446289,
"step": 1400
},
{
"epoch": 0.46,
"grad_norm": 0.0010853647254407406,
"learning_rate": 8.619329388560157e-06,
"logits/chosen": -0.03639475628733635,
"logits/rejected": 0.0061216773465275764,
"logps/chosen": -252.0388641357422,
"logps/rejected": -349.8416442871094,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.54641056060791,
"rewards/margins": 13.886810302734375,
"rewards/rejected": -22.4332218170166,
"step": 1410
},
{
"epoch": 0.46,
"grad_norm": 0.5238139033317566,
"learning_rate": 8.608371685294763e-06,
"logits/chosen": -0.11145244538784027,
"logits/rejected": -0.12166018784046173,
"logps/chosen": -252.606689453125,
"logps/rejected": -314.7965087890625,
"loss": 0.1077,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": -7.48656702041626,
"rewards/margins": 11.383016586303711,
"rewards/rejected": -18.869583129882812,
"step": 1420
},
{
"epoch": 0.46,
"grad_norm": 0.383899062871933,
"learning_rate": 8.597413982029368e-06,
"logits/chosen": -0.0349181704223156,
"logits/rejected": -0.01999412663280964,
"logps/chosen": -250.32009887695312,
"logps/rejected": -321.7938537597656,
"loss": 0.0142,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.556001663208008,
"rewards/margins": 11.27842903137207,
"rewards/rejected": -19.834430694580078,
"step": 1430
},
{
"epoch": 0.47,
"grad_norm": 0.03882277384400368,
"learning_rate": 8.586456278763972e-06,
"logits/chosen": -0.041800715029239655,
"logits/rejected": 0.028328755870461464,
"logps/chosen": -264.7074890136719,
"logps/rejected": -359.95001220703125,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.131043434143066,
"rewards/margins": 14.964799880981445,
"rewards/rejected": -24.095842361450195,
"step": 1440
},
{
"epoch": 0.47,
"grad_norm": 0.005863294005393982,
"learning_rate": 8.575498575498576e-06,
"logits/chosen": -0.008469844236969948,
"logits/rejected": 0.002064249012619257,
"logps/chosen": -189.87020874023438,
"logps/rejected": -289.1915588378906,
"loss": 0.0173,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.485939979553223,
"rewards/margins": 12.884135246276855,
"rewards/rejected": -18.370075225830078,
"step": 1450
},
{
"epoch": 0.47,
"grad_norm": 0.09647411108016968,
"learning_rate": 8.56454087223318e-06,
"logits/chosen": 0.015825632959604263,
"logits/rejected": -0.02976151742041111,
"logps/chosen": -366.56439208984375,
"logps/rejected": -430.0389709472656,
"loss": 0.0042,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.442001342773438,
"rewards/margins": 12.91467571258545,
"rewards/rejected": -26.356678009033203,
"step": 1460
},
{
"epoch": 0.48,
"grad_norm": 0.10359703004360199,
"learning_rate": 8.553583168967785e-06,
"logits/chosen": -0.09159889072179794,
"logits/rejected": -0.06940022855997086,
"logps/chosen": -315.4118347167969,
"logps/rejected": -377.8108215332031,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.500893592834473,
"rewards/margins": 13.29771614074707,
"rewards/rejected": -23.79861068725586,
"step": 1470
},
{
"epoch": 0.48,
"grad_norm": 0.011625263839960098,
"learning_rate": 8.54262546570239e-06,
"logits/chosen": -0.1008826494216919,
"logits/rejected": -0.05810718610882759,
"logps/chosen": -264.00909423828125,
"logps/rejected": -356.61163330078125,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.526174545288086,
"rewards/margins": 14.310162544250488,
"rewards/rejected": -23.836336135864258,
"step": 1480
},
{
"epoch": 0.48,
"grad_norm": 0.024129299446940422,
"learning_rate": 8.531667762436993e-06,
"logits/chosen": 0.0249390359967947,
"logits/rejected": 0.004033858422189951,
"logps/chosen": -295.7063903808594,
"logps/rejected": -394.6076965332031,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.936558723449707,
"rewards/margins": 14.934048652648926,
"rewards/rejected": -25.87060546875,
"step": 1490
},
{
"epoch": 0.49,
"grad_norm": 0.006154273636639118,
"learning_rate": 8.5207100591716e-06,
"logits/chosen": 0.04360217973589897,
"logits/rejected": 0.03041175566613674,
"logps/chosen": -236.54794311523438,
"logps/rejected": -317.7098693847656,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.552594184875488,
"rewards/margins": 11.904518127441406,
"rewards/rejected": -20.457111358642578,
"step": 1500
},
{
"epoch": 0.49,
"grad_norm": 0.3130321502685547,
"learning_rate": 8.509752355906202e-06,
"logits/chosen": -0.07926555722951889,
"logits/rejected": -0.02901688776910305,
"logps/chosen": -248.68408203125,
"logps/rejected": -326.94940185546875,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.911336898803711,
"rewards/margins": 13.27922534942627,
"rewards/rejected": -22.190561294555664,
"step": 1510
},
{
"epoch": 0.49,
"grad_norm": 0.09558708965778351,
"learning_rate": 8.498794652640808e-06,
"logits/chosen": -0.0705290287733078,
"logits/rejected": -0.06556431949138641,
"logps/chosen": -289.7066955566406,
"logps/rejected": -359.8416748046875,
"loss": 0.0249,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": -10.718439102172852,
"rewards/margins": 12.26536750793457,
"rewards/rejected": -22.983808517456055,
"step": 1520
},
{
"epoch": 0.49,
"grad_norm": 0.02135203592479229,
"learning_rate": 8.487836949375412e-06,
"logits/chosen": 0.08843693137168884,
"logits/rejected": 0.10300026834011078,
"logps/chosen": -283.0185546875,
"logps/rejected": -395.1629333496094,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.282575607299805,
"rewards/margins": 15.90794563293457,
"rewards/rejected": -26.190521240234375,
"step": 1530
},
{
"epoch": 0.5,
"grad_norm": 0.02689371071755886,
"learning_rate": 8.476879246110016e-06,
"logits/chosen": -0.054816532880067825,
"logits/rejected": -0.028935739770531654,
"logps/chosen": -340.44329833984375,
"logps/rejected": -424.52105712890625,
"loss": 0.0361,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": -13.073100090026855,
"rewards/margins": 14.94751262664795,
"rewards/rejected": -28.020618438720703,
"step": 1540
},
{
"epoch": 0.5,
"grad_norm": 0.005785965360701084,
"learning_rate": 8.46592154284462e-06,
"logits/chosen": -0.014959866181015968,
"logits/rejected": 0.0628860592842102,
"logps/chosen": -238.8596954345703,
"logps/rejected": -339.1539611816406,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.337881088256836,
"rewards/margins": 14.204916000366211,
"rewards/rejected": -23.542797088623047,
"step": 1550
},
{
"epoch": 0.5,
"grad_norm": 0.1502775102853775,
"learning_rate": 8.454963839579225e-06,
"logits/chosen": 0.05573273450136185,
"logits/rejected": 0.09205415844917297,
"logps/chosen": -303.62701416015625,
"logps/rejected": -364.28204345703125,
"loss": 0.0096,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.095705032348633,
"rewards/margins": 12.889692306518555,
"rewards/rejected": -24.985401153564453,
"step": 1560
},
{
"epoch": 0.51,
"grad_norm": 0.01375632081180811,
"learning_rate": 8.44400613631383e-06,
"logits/chosen": -0.0028796226251870394,
"logits/rejected": 0.055630385875701904,
"logps/chosen": -247.17288208007812,
"logps/rejected": -334.44195556640625,
"loss": 0.0079,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.552030563354492,
"rewards/margins": 12.745870590209961,
"rewards/rejected": -21.297901153564453,
"step": 1570
},
{
"epoch": 0.51,
"grad_norm": 0.015819918364286423,
"learning_rate": 8.433048433048434e-06,
"logits/chosen": 0.029922613874077797,
"logits/rejected": 0.1054656133055687,
"logps/chosen": -290.9429626464844,
"logps/rejected": -389.000732421875,
"loss": 0.003,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.37781810760498,
"rewards/margins": 14.447565078735352,
"rewards/rejected": -25.82538414001465,
"step": 1580
},
{
"epoch": 0.51,
"grad_norm": 0.06199351325631142,
"learning_rate": 8.422090729783038e-06,
"logits/chosen": 0.034867942333221436,
"logits/rejected": 0.1408187299966812,
"logps/chosen": -304.77081298828125,
"logps/rejected": -402.5494079589844,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.836946487426758,
"rewards/margins": 15.10669994354248,
"rewards/rejected": -27.943645477294922,
"step": 1590
},
{
"epoch": 0.52,
"grad_norm": 0.011423285119235516,
"learning_rate": 8.411133026517642e-06,
"logits/chosen": 0.07202502340078354,
"logits/rejected": 0.15315476059913635,
"logps/chosen": -314.509033203125,
"logps/rejected": -408.44647216796875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.762028694152832,
"rewards/margins": 14.413556098937988,
"rewards/rejected": -27.175586700439453,
"step": 1600
},
{
"epoch": 0.52,
"grad_norm": 0.015212434343993664,
"learning_rate": 8.400175323252246e-06,
"logits/chosen": 0.07158254086971283,
"logits/rejected": 0.12116159498691559,
"logps/chosen": -301.7251892089844,
"logps/rejected": -424.9703674316406,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": -14.060445785522461,
"rewards/margins": 15.829069137573242,
"rewards/rejected": -29.889516830444336,
"step": 1610
},
{
"epoch": 0.52,
"grad_norm": 0.00037714597419835627,
"learning_rate": 8.38921761998685e-06,
"logits/chosen": 0.07507513463497162,
"logits/rejected": 0.1150326281785965,
"logps/chosen": -228.2713165283203,
"logps/rejected": -332.7132263183594,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.718751907348633,
"rewards/margins": 14.904324531555176,
"rewards/rejected": -23.623075485229492,
"step": 1620
},
{
"epoch": 0.53,
"grad_norm": 0.00014221732271835208,
"learning_rate": 8.378259916721457e-06,
"logits/chosen": 0.19829820096492767,
"logits/rejected": 0.1768883615732193,
"logps/chosen": -286.7662048339844,
"logps/rejected": -363.81463623046875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.576507568359375,
"rewards/margins": 13.473172187805176,
"rewards/rejected": -26.049678802490234,
"step": 1630
},
{
"epoch": 0.53,
"grad_norm": 0.2192688137292862,
"learning_rate": 8.36730221345606e-06,
"logits/chosen": 0.1942572295665741,
"logits/rejected": 0.22028391063213348,
"logps/chosen": -301.4276123046875,
"logps/rejected": -383.99432373046875,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.628840446472168,
"rewards/margins": 13.160855293273926,
"rewards/rejected": -26.789697647094727,
"step": 1640
},
{
"epoch": 0.53,
"grad_norm": 0.00010946116526611149,
"learning_rate": 8.356344510190665e-06,
"logits/chosen": 0.10659674555063248,
"logits/rejected": 0.1302722990512848,
"logps/chosen": -402.503173828125,
"logps/rejected": -484.22589111328125,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -16.998889923095703,
"rewards/margins": 16.14960479736328,
"rewards/rejected": -33.148494720458984,
"step": 1650
},
{
"epoch": 0.54,
"grad_norm": 0.017199842259287834,
"learning_rate": 8.34538680692527e-06,
"logits/chosen": 0.07519405335187912,
"logits/rejected": 0.15728269517421722,
"logps/chosen": -284.95184326171875,
"logps/rejected": -384.1084289550781,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.825132369995117,
"rewards/margins": 12.739798545837402,
"rewards/rejected": -26.564931869506836,
"step": 1660
},
{
"epoch": 0.54,
"grad_norm": 0.01719660870730877,
"learning_rate": 8.334429103659874e-06,
"logits/chosen": 0.07660949975252151,
"logits/rejected": 0.10592161118984222,
"logps/chosen": -231.68411254882812,
"logps/rejected": -322.4700012207031,
"loss": 0.0173,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.59976863861084,
"rewards/margins": 12.979411125183105,
"rewards/rejected": -22.579181671142578,
"step": 1670
},
{
"epoch": 0.54,
"grad_norm": 0.012668246403336525,
"learning_rate": 8.323471400394478e-06,
"logits/chosen": 0.11749809980392456,
"logits/rejected": 0.1774546504020691,
"logps/chosen": -322.8683776855469,
"logps/rejected": -432.86553955078125,
"loss": 0.0232,
"rewards/accuracies": 0.966666579246521,
"rewards/chosen": -12.755744934082031,
"rewards/margins": 16.210979461669922,
"rewards/rejected": -28.966724395751953,
"step": 1680
},
{
"epoch": 0.55,
"grad_norm": 0.007913627661764622,
"learning_rate": 8.312513697129082e-06,
"logits/chosen": 0.11255357414484024,
"logits/rejected": 0.1525614708662033,
"logps/chosen": -303.31060791015625,
"logps/rejected": -404.16766357421875,
"loss": 0.0255,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": -12.78972339630127,
"rewards/margins": 14.71977424621582,
"rewards/rejected": -27.509496688842773,
"step": 1690
},
{
"epoch": 0.55,
"grad_norm": 7.69473408581689e-05,
"learning_rate": 8.301555993863687e-06,
"logits/chosen": 0.07176389545202255,
"logits/rejected": 0.14741338789463043,
"logps/chosen": -381.74859619140625,
"logps/rejected": -479.43280029296875,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -15.687698364257812,
"rewards/margins": 17.027114868164062,
"rewards/rejected": -32.71481704711914,
"step": 1700
},
{
"epoch": 0.55,
"grad_norm": 0.020641475915908813,
"learning_rate": 8.290598290598293e-06,
"logits/chosen": 0.027815943583846092,
"logits/rejected": 0.06404396146535873,
"logps/chosen": -326.9677734375,
"logps/rejected": -426.5794982910156,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.755020141601562,
"rewards/margins": 15.451559066772461,
"rewards/rejected": -29.206579208374023,
"step": 1710
},
{
"epoch": 0.56,
"grad_norm": 1.0741193818830652e-06,
"learning_rate": 8.279640587332895e-06,
"logits/chosen": -0.04963821545243263,
"logits/rejected": 0.060973964631557465,
"logps/chosen": -282.719482421875,
"logps/rejected": -397.8350830078125,
"loss": 0.0378,
"rewards/accuracies": 0.966666579246521,
"rewards/chosen": -10.77167797088623,
"rewards/margins": 16.46950912475586,
"rewards/rejected": -27.241186141967773,
"step": 1720
},
{
"epoch": 0.56,
"grad_norm": 0.0007996432832442224,
"learning_rate": 8.268682884067501e-06,
"logits/chosen": 0.0820159837603569,
"logits/rejected": 0.05762636661529541,
"logps/chosen": -317.8184509277344,
"logps/rejected": -399.0465087890625,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.283085823059082,
"rewards/margins": 14.6398286819458,
"rewards/rejected": -26.922916412353516,
"step": 1730
},
{
"epoch": 0.56,
"grad_norm": 0.0067008682526648045,
"learning_rate": 8.257725180802104e-06,
"logits/chosen": 0.03673550486564636,
"logits/rejected": 0.06919295340776443,
"logps/chosen": -289.0475769042969,
"logps/rejected": -387.4378662109375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.43751049041748,
"rewards/margins": 16.087310791015625,
"rewards/rejected": -25.52482032775879,
"step": 1740
},
{
"epoch": 0.57,
"grad_norm": 1.0896865129470825,
"learning_rate": 8.24676747753671e-06,
"logits/chosen": 0.10127731412649155,
"logits/rejected": 0.16299596428871155,
"logps/chosen": -288.1521911621094,
"logps/rejected": -375.989501953125,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.477235794067383,
"rewards/margins": 15.621920585632324,
"rewards/rejected": -25.09915542602539,
"step": 1750
},
{
"epoch": 0.57,
"grad_norm": 0.21109241247177124,
"learning_rate": 8.235809774271314e-06,
"logits/chosen": 0.008235934190452099,
"logits/rejected": 0.028172463178634644,
"logps/chosen": -295.41619873046875,
"logps/rejected": -404.15533447265625,
"loss": 0.0034,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.374763488769531,
"rewards/margins": 15.838516235351562,
"rewards/rejected": -27.21327781677246,
"step": 1760
},
{
"epoch": 0.57,
"grad_norm": 0.341794490814209,
"learning_rate": 8.224852071005918e-06,
"logits/chosen": 0.1264527142047882,
"logits/rejected": 0.15065120160579681,
"logps/chosen": -292.97430419921875,
"logps/rejected": -380.7444152832031,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.181876182556152,
"rewards/margins": 14.194729804992676,
"rewards/rejected": -25.376605987548828,
"step": 1770
},
{
"epoch": 0.58,
"grad_norm": 0.007328469771891832,
"learning_rate": 8.213894367740523e-06,
"logits/chosen": 0.023362448439002037,
"logits/rejected": 0.03181435540318489,
"logps/chosen": -292.8009948730469,
"logps/rejected": -382.83343505859375,
"loss": 0.0041,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.59870719909668,
"rewards/margins": 12.62264347076416,
"rewards/rejected": -24.221351623535156,
"step": 1780
},
{
"epoch": 0.58,
"grad_norm": 0.27736029028892517,
"learning_rate": 8.202936664475127e-06,
"logits/chosen": -0.05786416679620743,
"logits/rejected": 0.0343763530254364,
"logps/chosen": -281.7183532714844,
"logps/rejected": -384.5142822265625,
"loss": 0.0044,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.055322647094727,
"rewards/margins": 14.507980346679688,
"rewards/rejected": -25.56330108642578,
"step": 1790
},
{
"epoch": 0.58,
"grad_norm": 0.00044713946408592165,
"learning_rate": 8.191978961209731e-06,
"logits/chosen": 0.04160480573773384,
"logits/rejected": 0.15656664967536926,
"logps/chosen": -255.13864135742188,
"logps/rejected": -372.533935546875,
"loss": 0.0065,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.455907821655273,
"rewards/margins": 16.43738555908203,
"rewards/rejected": -25.893295288085938,
"step": 1800
},
{
"epoch": 0.59,
"grad_norm": 0.15391205251216888,
"learning_rate": 8.181021257944335e-06,
"logits/chosen": 0.13832136988639832,
"logits/rejected": 0.21045103669166565,
"logps/chosen": -304.4212341308594,
"logps/rejected": -408.509033203125,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.320929527282715,
"rewards/margins": 14.877690315246582,
"rewards/rejected": -27.198617935180664,
"step": 1810
},
{
"epoch": 0.59,
"grad_norm": 0.1734129786491394,
"learning_rate": 8.17006355467894e-06,
"logits/chosen": -0.008830100297927856,
"logits/rejected": 0.08690972626209259,
"logps/chosen": -317.80670166015625,
"logps/rejected": -403.6852111816406,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.214055061340332,
"rewards/margins": 15.352259635925293,
"rewards/rejected": -28.566314697265625,
"step": 1820
},
{
"epoch": 0.59,
"grad_norm": 1.4723388064297183e-09,
"learning_rate": 8.159105851413544e-06,
"logits/chosen": 0.10790036618709564,
"logits/rejected": 0.20435258746147156,
"logps/chosen": -244.66781616210938,
"logps/rejected": -349.64190673828125,
"loss": 0.0391,
"rewards/accuracies": 0.966666579246521,
"rewards/chosen": -9.495941162109375,
"rewards/margins": 14.044004440307617,
"rewards/rejected": -23.539945602416992,
"step": 1830
},
{
"epoch": 0.6,
"grad_norm": 4.041919601149857e-05,
"learning_rate": 8.148148148148148e-06,
"logits/chosen": 0.1392899751663208,
"logits/rejected": 0.17356742918491364,
"logps/chosen": -271.93536376953125,
"logps/rejected": -376.3609313964844,
"loss": 0.0078,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.918790817260742,
"rewards/margins": 15.073224067687988,
"rewards/rejected": -26.992013931274414,
"step": 1840
},
{
"epoch": 0.6,
"grad_norm": 3.694919769259286e-06,
"learning_rate": 8.137190444882753e-06,
"logits/chosen": 0.2150353193283081,
"logits/rejected": 0.2910372018814087,
"logps/chosen": -296.4953918457031,
"logps/rejected": -429.4913635253906,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.478975296020508,
"rewards/margins": 17.906230926513672,
"rewards/rejected": -30.385208129882812,
"step": 1850
},
{
"epoch": 0.6,
"grad_norm": 14.997368812561035,
"learning_rate": 8.126232741617359e-06,
"logits/chosen": 0.13269153237342834,
"logits/rejected": 0.1637151837348938,
"logps/chosen": -393.65887451171875,
"logps/rejected": -488.2777404785156,
"loss": 0.0282,
"rewards/accuracies": 1.0,
"rewards/chosen": -17.513179779052734,
"rewards/margins": 16.430784225463867,
"rewards/rejected": -33.943965911865234,
"step": 1860
},
{
"epoch": 0.6,
"grad_norm": 0.0013834636192768812,
"learning_rate": 8.115275038351961e-06,
"logits/chosen": 0.07304046303033829,
"logits/rejected": 0.147117480635643,
"logps/chosen": -308.97161865234375,
"logps/rejected": -416.3505859375,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.166252136230469,
"rewards/margins": 15.148821830749512,
"rewards/rejected": -28.315073013305664,
"step": 1870
},
{
"epoch": 0.61,
"grad_norm": 0.005938555579632521,
"learning_rate": 8.104317335086567e-06,
"logits/chosen": 0.02907967008650303,
"logits/rejected": 0.11755422502756119,
"logps/chosen": -310.50933837890625,
"logps/rejected": -403.22247314453125,
"loss": 0.0225,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.069503784179688,
"rewards/margins": 15.674043655395508,
"rewards/rejected": -26.743549346923828,
"step": 1880
},
{
"epoch": 0.61,
"grad_norm": 0.3666568398475647,
"learning_rate": 8.09335963182117e-06,
"logits/chosen": -0.004576456733047962,
"logits/rejected": 0.07972874492406845,
"logps/chosen": -337.0972900390625,
"logps/rejected": -455.228759765625,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.475229263305664,
"rewards/margins": 17.417362213134766,
"rewards/rejected": -28.892593383789062,
"step": 1890
},
{
"epoch": 0.61,
"grad_norm": 0.050740547478199005,
"learning_rate": 8.082401928555776e-06,
"logits/chosen": 0.006110090762376785,
"logits/rejected": 0.045890793204307556,
"logps/chosen": -242.1782684326172,
"logps/rejected": -327.7618408203125,
"loss": 0.0055,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.432751655578613,
"rewards/margins": 15.302406311035156,
"rewards/rejected": -22.735158920288086,
"step": 1900
},
{
"epoch": 0.62,
"grad_norm": 0.05140925943851471,
"learning_rate": 8.07144422529038e-06,
"logits/chosen": -0.109877809882164,
"logits/rejected": -0.02618744969367981,
"logps/chosen": -255.05599975585938,
"logps/rejected": -364.76654052734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.201934814453125,
"rewards/margins": 16.78182029724121,
"rewards/rejected": -24.983755111694336,
"step": 1910
},
{
"epoch": 0.62,
"grad_norm": 3.2723581790924072,
"learning_rate": 8.060486522024984e-06,
"logits/chosen": -0.030681187286973,
"logits/rejected": 0.031235750764608383,
"logps/chosen": -223.58474731445312,
"logps/rejected": -289.45196533203125,
"loss": 0.0087,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.744879722595215,
"rewards/margins": 11.148603439331055,
"rewards/rejected": -18.893482208251953,
"step": 1920
},
{
"epoch": 0.62,
"grad_norm": 1.3139744997024536,
"learning_rate": 8.049528818759589e-06,
"logits/chosen": -0.010395990684628487,
"logits/rejected": 0.053827375173568726,
"logps/chosen": -260.6311340332031,
"logps/rejected": -343.34796142578125,
"loss": 0.0085,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.475730895996094,
"rewards/margins": 12.925222396850586,
"rewards/rejected": -21.400951385498047,
"step": 1930
},
{
"epoch": 0.63,
"grad_norm": 0.0011528899194672704,
"learning_rate": 8.038571115494193e-06,
"logits/chosen": 0.004040165338665247,
"logits/rejected": 0.09620045125484467,
"logps/chosen": -265.73260498046875,
"logps/rejected": -368.19976806640625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.830063819885254,
"rewards/margins": 14.222076416015625,
"rewards/rejected": -23.052141189575195,
"step": 1940
},
{
"epoch": 0.63,
"grad_norm": 3.633260348578915e-05,
"learning_rate": 8.027613412228797e-06,
"logits/chosen": 0.08991348743438721,
"logits/rejected": 0.13095875084400177,
"logps/chosen": -250.6635284423828,
"logps/rejected": -348.5271301269531,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.241789817810059,
"rewards/margins": 15.90589714050293,
"rewards/rejected": -24.147687911987305,
"step": 1950
},
{
"epoch": 0.63,
"grad_norm": 0.008139081299304962,
"learning_rate": 8.016655708963403e-06,
"logits/chosen": -0.05623581260442734,
"logits/rejected": 0.008502885699272156,
"logps/chosen": -312.2218017578125,
"logps/rejected": -426.6249084472656,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.09146785736084,
"rewards/margins": 17.29364013671875,
"rewards/rejected": -28.385107040405273,
"step": 1960
},
{
"epoch": 0.64,
"grad_norm": 0.0004545208648778498,
"learning_rate": 8.005698005698006e-06,
"logits/chosen": 0.013862645253539085,
"logits/rejected": 0.05150808021426201,
"logps/chosen": -273.6121826171875,
"logps/rejected": -365.71307373046875,
"loss": 0.0331,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": -8.933341026306152,
"rewards/margins": 15.150711059570312,
"rewards/rejected": -24.08405113220215,
"step": 1970
},
{
"epoch": 0.64,
"grad_norm": 0.07295417040586472,
"learning_rate": 7.994740302432612e-06,
"logits/chosen": 0.04458921402692795,
"logits/rejected": 0.10884840786457062,
"logps/chosen": -224.69967651367188,
"logps/rejected": -325.59356689453125,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.435521125793457,
"rewards/margins": 14.62103271484375,
"rewards/rejected": -22.05655288696289,
"step": 1980
},
{
"epoch": 0.64,
"grad_norm": 0.00015990910469554365,
"learning_rate": 7.983782599167214e-06,
"logits/chosen": -0.07026857882738113,
"logits/rejected": 0.06809535622596741,
"logps/chosen": -344.2391357421875,
"logps/rejected": -496.5973205566406,
"loss": 0.0023,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.867463111877441,
"rewards/margins": 20.219188690185547,
"rewards/rejected": -32.08665466308594,
"step": 1990
},
{
"epoch": 0.65,
"grad_norm": 0.023937899619340897,
"learning_rate": 7.97282489590182e-06,
"logits/chosen": 0.07235778868198395,
"logits/rejected": 0.1107616052031517,
"logps/chosen": -232.8976287841797,
"logps/rejected": -342.6751708984375,
"loss": 0.0496,
"rewards/accuracies": 0.9666666984558105,
"rewards/chosen": -8.303656578063965,
"rewards/margins": 14.42664909362793,
"rewards/rejected": -22.730304718017578,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 9276,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}