phi-2-gpo-v15-i1 / trainer_state.json
lole25's picture
Model save
d2ef5f5 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996190476190476,
"eval_steps": 500,
"global_step": 656,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 7.575757575757576e-08,
"logits/chosen": 0.07398031651973724,
"logits/rejected": 0.059482574462890625,
"logps/chosen": -279.7221984863281,
"logps/rejected": -295.30865478515625,
"loss": 2.4106,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"learning_rate": 7.575757575757576e-07,
"logits/chosen": 0.08179842680692673,
"logits/rejected": 0.2137567102909088,
"logps/chosen": -371.2894287109375,
"logps/rejected": -378.87701416015625,
"loss": 2.1369,
"rewards/accuracies": 0.3958333432674408,
"rewards/chosen": 0.000596717931330204,
"rewards/margins": 0.0007703733863309026,
"rewards/rejected": -0.00017365541134495288,
"step": 10
},
{
"epoch": 0.03,
"learning_rate": 1.5151515151515152e-06,
"logits/chosen": 0.13426382839679718,
"logits/rejected": 0.17069879174232483,
"logps/chosen": -337.7759704589844,
"logps/rejected": -351.1375427246094,
"loss": 2.1857,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.0013722162693738937,
"rewards/margins": -0.0006242281524464488,
"rewards/rejected": -0.0007479880005121231,
"step": 20
},
{
"epoch": 0.05,
"learning_rate": 2.2727272727272728e-06,
"logits/chosen": 0.11453332751989365,
"logits/rejected": 0.1672835648059845,
"logps/chosen": -343.336181640625,
"logps/rejected": -351.83966064453125,
"loss": 2.2006,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.0007104009273461998,
"rewards/margins": 0.0031625095289200544,
"rewards/rejected": -0.0024521087761968374,
"step": 30
},
{
"epoch": 0.06,
"learning_rate": 3.0303030303030305e-06,
"logits/chosen": 0.14377865195274353,
"logits/rejected": 0.23349857330322266,
"logps/chosen": -338.24847412109375,
"logps/rejected": -321.5999450683594,
"loss": 2.0523,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.00015345169231295586,
"rewards/margins": 0.0033258639741688967,
"rewards/rejected": -0.003479315433651209,
"step": 40
},
{
"epoch": 0.08,
"learning_rate": 3.7878787878787882e-06,
"logits/chosen": 0.12487177550792694,
"logits/rejected": 0.23440325260162354,
"logps/chosen": -385.6036682128906,
"logps/rejected": -353.2607727050781,
"loss": 2.0721,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.0014706759247928858,
"rewards/margins": 0.005332515574991703,
"rewards/rejected": -0.006803191266953945,
"step": 50
},
{
"epoch": 0.09,
"learning_rate": 4.5454545454545455e-06,
"logits/chosen": 0.1113414317369461,
"logits/rejected": 0.1453917920589447,
"logps/chosen": -375.4793701171875,
"logps/rejected": -355.79571533203125,
"loss": 2.0509,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.002245596144348383,
"rewards/margins": 0.019777730107307434,
"rewards/rejected": -0.02202332578599453,
"step": 60
},
{
"epoch": 0.11,
"learning_rate": 4.999432965739786e-06,
"logits/chosen": 0.13350918889045715,
"logits/rejected": 0.1631946861743927,
"logps/chosen": -323.2510986328125,
"logps/rejected": -328.84039306640625,
"loss": 2.0626,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.0012016391847282648,
"rewards/margins": 0.03956783190369606,
"rewards/rejected": -0.040769465267658234,
"step": 70
},
{
"epoch": 0.12,
"learning_rate": 4.9930567839810125e-06,
"logits/chosen": 0.0718630701303482,
"logits/rejected": 0.19179414212703705,
"logps/chosen": -378.56396484375,
"logps/rejected": -368.79949951171875,
"loss": 2.0501,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.003595351707190275,
"rewards/margins": 0.06342312693595886,
"rewards/rejected": -0.0598277822136879,
"step": 80
},
{
"epoch": 0.14,
"learning_rate": 4.979613761906212e-06,
"logits/chosen": 0.10020647943019867,
"logits/rejected": 0.21944165229797363,
"logps/chosen": -358.9504089355469,
"logps/rejected": -345.926513671875,
"loss": 1.8966,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.010454339906573296,
"rewards/margins": 0.09268515557050705,
"rewards/rejected": -0.1031394973397255,
"step": 90
},
{
"epoch": 0.15,
"learning_rate": 4.959142005221991e-06,
"logits/chosen": 0.1388009488582611,
"logits/rejected": 0.26329106092453003,
"logps/chosen": -337.170166015625,
"logps/rejected": -351.45172119140625,
"loss": 1.9484,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.009544052183628082,
"rewards/margins": 0.14409229159355164,
"rewards/rejected": -0.13454824686050415,
"step": 100
},
{
"epoch": 0.17,
"learning_rate": 4.931699543346854e-06,
"logits/chosen": 0.10444238036870956,
"logits/rejected": 0.20885030925273895,
"logps/chosen": -329.65020751953125,
"logps/rejected": -367.345458984375,
"loss": 1.7584,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.03416838496923447,
"rewards/margins": 0.19879209995269775,
"rewards/rejected": -0.23296049237251282,
"step": 110
},
{
"epoch": 0.18,
"learning_rate": 4.897364164920515e-06,
"logits/chosen": 0.10559381544589996,
"logits/rejected": 0.1961037516593933,
"logps/chosen": -354.2994079589844,
"logps/rejected": -344.41925048828125,
"loss": 1.7929,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.05705786868929863,
"rewards/margins": 0.17845068871974945,
"rewards/rejected": -0.23550856113433838,
"step": 120
},
{
"epoch": 0.2,
"learning_rate": 4.8562331973035396e-06,
"logits/chosen": 0.12288101017475128,
"logits/rejected": 0.22050254046916962,
"logps/chosen": -327.88311767578125,
"logps/rejected": -356.74407958984375,
"loss": 1.8191,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.07013808190822601,
"rewards/margins": 0.22160351276397705,
"rewards/rejected": -0.29174157977104187,
"step": 130
},
{
"epoch": 0.21,
"learning_rate": 4.808423230692374e-06,
"logits/chosen": 0.1860750913619995,
"logits/rejected": 0.18267032504081726,
"logps/chosen": -338.1869812011719,
"logps/rejected": -380.5123596191406,
"loss": 1.7142,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.07972263544797897,
"rewards/margins": 0.2994547486305237,
"rewards/rejected": -0.37917739152908325,
"step": 140
},
{
"epoch": 0.23,
"learning_rate": 4.754069787631761e-06,
"logits/chosen": 0.15666987001895905,
"logits/rejected": 0.24864494800567627,
"logps/chosen": -409.4656677246094,
"logps/rejected": -393.28466796875,
"loss": 1.6771,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.06894813477993011,
"rewards/margins": 0.3388601243495941,
"rewards/rejected": -0.40780824422836304,
"step": 150
},
{
"epoch": 0.24,
"learning_rate": 4.693326938861367e-06,
"logits/chosen": 0.12029329687356949,
"logits/rejected": 0.17621104419231415,
"logps/chosen": -326.62701416015625,
"logps/rejected": -357.37164306640625,
"loss": 1.74,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.045461155474185944,
"rewards/margins": 0.287504643201828,
"rewards/rejected": -0.33296579122543335,
"step": 160
},
{
"epoch": 0.26,
"learning_rate": 4.626366866585528e-06,
"logits/chosen": 0.17087192833423615,
"logits/rejected": 0.25556960701942444,
"logps/chosen": -338.80230712890625,
"logps/rejected": -350.237060546875,
"loss": 1.6582,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.02211242914199829,
"rewards/margins": 0.2152978479862213,
"rewards/rejected": -0.237410306930542,
"step": 170
},
{
"epoch": 0.27,
"learning_rate": 4.553379376404085e-06,
"logits/chosen": 0.14991971850395203,
"logits/rejected": 0.1636931598186493,
"logps/chosen": -308.0624084472656,
"logps/rejected": -344.2478942871094,
"loss": 1.6719,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.004089848138391972,
"rewards/margins": 0.2675096392631531,
"rewards/rejected": -0.2634198069572449,
"step": 180
},
{
"epoch": 0.29,
"learning_rate": 4.474571359287791e-06,
"logits/chosen": 0.15207740664482117,
"logits/rejected": 0.211543008685112,
"logps/chosen": -336.4064636230469,
"logps/rejected": -340.141357421875,
"loss": 1.6014,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.01036889385432005,
"rewards/margins": 0.2858952283859253,
"rewards/rejected": -0.296264111995697,
"step": 190
},
{
"epoch": 0.3,
"learning_rate": 4.3901662051233755e-06,
"logits/chosen": 0.1840183436870575,
"logits/rejected": 0.22152027487754822,
"logps/chosen": -404.16021728515625,
"logps/rejected": -356.364013671875,
"loss": 1.7463,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.03463779762387276,
"rewards/margins": 0.25136780738830566,
"rewards/rejected": -0.21672996878623962,
"step": 200
},
{
"epoch": 0.32,
"learning_rate": 4.30040316949064e-06,
"logits/chosen": 0.1487782895565033,
"logits/rejected": 0.20272579789161682,
"logps/chosen": -347.5115966796875,
"logps/rejected": -344.53753662109375,
"loss": 1.6848,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.019732611253857613,
"rewards/margins": 0.21993084251880646,
"rewards/rejected": -0.2001982480287552,
"step": 210
},
{
"epoch": 0.34,
"learning_rate": 4.205536695466524e-06,
"logits/chosen": 0.11921755224466324,
"logits/rejected": 0.16543138027191162,
"logps/chosen": -302.01861572265625,
"logps/rejected": -345.60491943359375,
"loss": 1.7084,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.0038991228211671114,
"rewards/margins": 0.31049156188964844,
"rewards/rejected": -0.3143906593322754,
"step": 220
},
{
"epoch": 0.35,
"learning_rate": 4.105835692378557e-06,
"logits/chosen": 0.13227376341819763,
"logits/rejected": 0.17412447929382324,
"logps/chosen": -337.3625793457031,
"logps/rejected": -365.3255615234375,
"loss": 1.6995,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.05614470690488815,
"rewards/margins": 0.28102201223373413,
"rewards/rejected": -0.3371667265892029,
"step": 230
},
{
"epoch": 0.37,
"learning_rate": 4.001582773552153e-06,
"logits/chosen": 0.13456036150455475,
"logits/rejected": 0.22667856514453888,
"logps/chosen": -403.4490966796875,
"logps/rejected": -408.80413818359375,
"loss": 1.4862,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.025508727878332138,
"rewards/margins": 0.3425619602203369,
"rewards/rejected": -0.36807072162628174,
"step": 240
},
{
"epoch": 0.38,
"learning_rate": 3.893073455212438e-06,
"logits/chosen": 0.13273295760154724,
"logits/rejected": 0.21381357312202454,
"logps/chosen": -335.8673095703125,
"logps/rejected": -351.50103759765625,
"loss": 1.5854,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.09447745978832245,
"rewards/margins": 0.27237147092819214,
"rewards/rejected": -0.3668489158153534,
"step": 250
},
{
"epoch": 0.4,
"learning_rate": 3.7806153188114027e-06,
"logits/chosen": 0.18772640824317932,
"logits/rejected": 0.20650401711463928,
"logps/chosen": -320.12945556640625,
"logps/rejected": -362.57281494140625,
"loss": 1.7323,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.056845515966415405,
"rewards/margins": 0.2855423092842102,
"rewards/rejected": -0.342387855052948,
"step": 260
},
{
"epoch": 0.41,
"learning_rate": 3.6645271391548542e-06,
"logits/chosen": 0.154958575963974,
"logits/rejected": 0.19303588569164276,
"logps/chosen": -360.35443115234375,
"logps/rejected": -359.91497802734375,
"loss": 1.6388,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.005552726797759533,
"rewards/margins": 0.27676287293434143,
"rewards/rejected": -0.2712101340293884,
"step": 270
},
{
"epoch": 0.43,
"learning_rate": 3.5451379808006014e-06,
"logits/chosen": 0.1470947563648224,
"logits/rejected": 0.19554203748703003,
"logps/chosen": -343.96466064453125,
"logps/rejected": -351.0820007324219,
"loss": 1.6004,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.018333502113819122,
"rewards/margins": 0.3293381631374359,
"rewards/rejected": -0.3110046684741974,
"step": 280
},
{
"epoch": 0.44,
"learning_rate": 3.4227862652892106e-06,
"logits/chosen": 0.18207962810993195,
"logits/rejected": 0.25772327184677124,
"logps/chosen": -379.73248291015625,
"logps/rejected": -392.091552734375,
"loss": 1.631,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.0037728759925812483,
"rewards/margins": 0.3134722113609314,
"rewards/rejected": -0.3172450661659241,
"step": 290
},
{
"epoch": 0.46,
"learning_rate": 3.2978188118513814e-06,
"logits/chosen": 0.18880879878997803,
"logits/rejected": 0.22835353016853333,
"logps/chosen": -318.1977844238281,
"logps/rejected": -360.68096923828125,
"loss": 1.6509,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.051277369260787964,
"rewards/margins": 0.28354349732398987,
"rewards/rejected": -0.33482086658477783,
"step": 300
},
{
"epoch": 0.47,
"learning_rate": 3.1705898543111576e-06,
"logits/chosen": 0.1640356034040451,
"logits/rejected": 0.2013256549835205,
"logps/chosen": -345.88494873046875,
"logps/rejected": -396.95489501953125,
"loss": 1.5511,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.03583758696913719,
"rewards/margins": 0.3143077492713928,
"rewards/rejected": -0.35014528036117554,
"step": 310
},
{
"epoch": 0.49,
"learning_rate": 3.041460036971664e-06,
"logits/chosen": 0.10814084857702255,
"logits/rejected": 0.17361339926719666,
"logps/chosen": -331.90240478515625,
"logps/rejected": -345.795166015625,
"loss": 1.6032,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.06420620530843735,
"rewards/margins": 0.19957861304283142,
"rewards/rejected": -0.26378482580184937,
"step": 320
},
{
"epoch": 0.5,
"learning_rate": 2.910795392329649e-06,
"logits/chosen": 0.13951388001441956,
"logits/rejected": 0.19447830319404602,
"logps/chosen": -364.3550720214844,
"logps/rejected": -359.9488830566406,
"loss": 1.595,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.010126419365406036,
"rewards/margins": 0.31530141830444336,
"rewards/rejected": -0.3254278302192688,
"step": 330
},
{
"epoch": 0.52,
"learning_rate": 2.7789663035166035e-06,
"logits/chosen": 0.1637967824935913,
"logits/rejected": 0.15295840799808502,
"logps/chosen": -340.40386962890625,
"logps/rejected": -370.7852478027344,
"loss": 1.5882,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.09541453421115875,
"rewards/margins": 0.31104880571365356,
"rewards/rejected": -0.4064633250236511,
"step": 340
},
{
"epoch": 0.53,
"learning_rate": 2.6463464544075344e-06,
"logits/chosen": 0.14287754893302917,
"logits/rejected": 0.21446409821510315,
"logps/chosen": -355.376220703125,
"logps/rejected": -388.11944580078125,
"loss": 1.4669,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.019069373607635498,
"rewards/margins": 0.37106311321258545,
"rewards/rejected": -0.39013251662254333,
"step": 350
},
{
"epoch": 0.55,
"learning_rate": 2.513311770373421e-06,
"logits/chosen": 0.13659325242042542,
"logits/rejected": 0.22452709078788757,
"logps/chosen": -303.7241516113281,
"logps/rejected": -364.5718688964844,
"loss": 1.5243,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.022251833230257034,
"rewards/margins": 0.3609643578529358,
"rewards/rejected": -0.38321617245674133,
"step": 360
},
{
"epoch": 0.56,
"learning_rate": 2.380239352679908e-06,
"logits/chosen": 0.13927368819713593,
"logits/rejected": 0.2216307818889618,
"logps/chosen": -325.48150634765625,
"logps/rejected": -382.0296325683594,
"loss": 1.5408,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.0005920141702517867,
"rewards/margins": 0.3364425599575043,
"rewards/rejected": -0.3358505666255951,
"step": 370
},
{
"epoch": 0.58,
"learning_rate": 2.247506409552795e-06,
"logits/chosen": 0.15144166350364685,
"logits/rejected": 0.20430748164653778,
"logps/chosen": -369.327880859375,
"logps/rejected": -383.3594055175781,
"loss": 1.6408,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.05453786998987198,
"rewards/margins": 0.41130223870277405,
"rewards/rejected": -0.46584004163742065,
"step": 380
},
{
"epoch": 0.59,
"learning_rate": 2.1154891869403436e-06,
"logits/chosen": 0.11367179453372955,
"logits/rejected": 0.20442676544189453,
"logps/chosen": -361.3442077636719,
"logps/rejected": -390.0636291503906,
"loss": 1.641,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.013566520996391773,
"rewards/margins": 0.45338043570518494,
"rewards/rejected": -0.4669469892978668,
"step": 390
},
{
"epoch": 0.61,
"learning_rate": 1.9845619020032552e-06,
"logits/chosen": 0.15614674985408783,
"logits/rejected": 0.20679621398448944,
"logps/chosen": -328.5157165527344,
"logps/rejected": -368.5887451171875,
"loss": 1.6521,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.036636289209127426,
"rewards/margins": 0.31485632061958313,
"rewards/rejected": -0.35149258375167847,
"step": 400
},
{
"epoch": 0.62,
"learning_rate": 1.8550956823554708e-06,
"logits/chosen": 0.12708225846290588,
"logits/rejected": 0.21543464064598083,
"logps/chosen": -377.3260498046875,
"logps/rejected": -364.48870849609375,
"loss": 1.5909,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.0568900890648365,
"rewards/margins": 0.30653566122055054,
"rewards/rejected": -0.36342576146125793,
"step": 410
},
{
"epoch": 0.64,
"learning_rate": 1.7274575140626318e-06,
"logits/chosen": 0.20352402329444885,
"logits/rejected": 0.27957600355148315,
"logps/chosen": -347.7279357910156,
"logps/rejected": -355.5521545410156,
"loss": 1.6226,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.06915120780467987,
"rewards/margins": 0.2660463750362396,
"rewards/rejected": -0.3351975977420807,
"step": 420
},
{
"epoch": 0.66,
"learning_rate": 1.6020092013802002e-06,
"logits/chosen": 0.14161694049835205,
"logits/rejected": 0.22023169696331024,
"logps/chosen": -323.6744689941406,
"logps/rejected": -365.5609130859375,
"loss": 1.5258,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.026104014366865158,
"rewards/margins": 0.40274888277053833,
"rewards/rejected": -0.4288528859615326,
"step": 430
},
{
"epoch": 0.67,
"learning_rate": 1.4791063411799938e-06,
"logits/chosen": 0.20196688175201416,
"logits/rejected": 0.22374701499938965,
"logps/chosen": -346.626220703125,
"logps/rejected": -398.343994140625,
"loss": 1.6026,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.14315533638000488,
"rewards/margins": 0.2796045243740082,
"rewards/rejected": -0.42275986075401306,
"step": 440
},
{
"epoch": 0.69,
"learning_rate": 1.3590973149722103e-06,
"logits/chosen": 0.16043411195278168,
"logits/rejected": 0.24400117993354797,
"logps/chosen": -350.2712097167969,
"logps/rejected": -377.76129150390625,
"loss": 1.6442,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.05976473540067673,
"rewards/margins": 0.2989664673805237,
"rewards/rejected": -0.3587311804294586,
"step": 450
},
{
"epoch": 0.7,
"learning_rate": 1.2423223013801946e-06,
"logits/chosen": 0.14352941513061523,
"logits/rejected": 0.24110408127307892,
"logps/chosen": -367.91851806640625,
"logps/rejected": -397.81072998046875,
"loss": 1.6837,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.08046244829893112,
"rewards/margins": 0.36456337571144104,
"rewards/rejected": -0.44502583146095276,
"step": 460
},
{
"epoch": 0.72,
"learning_rate": 1.1291123118671665e-06,
"logits/chosen": 0.0973966121673584,
"logits/rejected": 0.18068069219589233,
"logps/chosen": -339.65704345703125,
"logps/rejected": -340.48236083984375,
"loss": 1.6314,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.06265803426504135,
"rewards/margins": 0.30512723326683044,
"rewards/rejected": -0.3677853047847748,
"step": 470
},
{
"epoch": 0.73,
"learning_rate": 1.019788252448267e-06,
"logits/chosen": 0.17376969754695892,
"logits/rejected": 0.21862807869911194,
"logps/chosen": -355.0315856933594,
"logps/rejected": -376.3943786621094,
"loss": 1.5767,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.02587694302201271,
"rewards/margins": 0.3213956654071808,
"rewards/rejected": -0.3472725749015808,
"step": 480
},
{
"epoch": 0.75,
"learning_rate": 9.146600140475945e-07,
"logits/chosen": 0.1421867460012436,
"logits/rejected": 0.23107881844043732,
"logps/chosen": -391.0975341796875,
"logps/rejected": -383.6336975097656,
"loss": 1.7495,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.10866842418909073,
"rewards/margins": 0.2634437382221222,
"rewards/rejected": -0.3721121549606323,
"step": 490
},
{
"epoch": 0.76,
"learning_rate": 8.140255940787059e-07,
"logits/chosen": 0.13602428138256073,
"logits/rejected": 0.23974844813346863,
"logps/chosen": -341.78582763671875,
"logps/rejected": -399.82904052734375,
"loss": 1.5854,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.04695357754826546,
"rewards/margins": 0.3549908697605133,
"rewards/rejected": -0.40194445848464966,
"step": 500
},
{
"epoch": 0.78,
"learning_rate": 7.181702517385789e-07,
"logits/chosen": 0.170148104429245,
"logits/rejected": 0.21931186318397522,
"logps/chosen": -323.8975524902344,
"logps/rejected": -348.66766357421875,
"loss": 1.7339,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.07490874826908112,
"rewards/margins": 0.3289056718349457,
"rewards/rejected": -0.4038144052028656,
"step": 510
},
{
"epoch": 0.79,
"learning_rate": 6.273656994094232e-07,
"logits/chosen": 0.17631427943706512,
"logits/rejected": 0.23277851939201355,
"logps/chosen": -345.8653259277344,
"logps/rejected": -342.0807800292969,
"loss": 1.6504,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.02272309735417366,
"rewards/margins": 0.33489790558815,
"rewards/rejected": -0.3576210141181946,
"step": 520
},
{
"epoch": 0.81,
"learning_rate": 5.418693324604082e-07,
"logits/chosen": 0.1863461136817932,
"logits/rejected": 0.25381818413734436,
"logps/chosen": -358.6033630371094,
"logps/rejected": -392.04302978515625,
"loss": 1.542,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.001642666757106781,
"rewards/margins": 0.41695213317871094,
"rewards/rejected": -0.41530942916870117,
"step": 530
},
{
"epoch": 0.82,
"learning_rate": 4.619234996325314e-07,
"logits/chosen": 0.11545145511627197,
"logits/rejected": 0.20592764019966125,
"logps/chosen": -349.9122619628906,
"logps/rejected": -408.61590576171875,
"loss": 1.5374,
"rewards/accuracies": 0.75,
"rewards/chosen": 8.928254101192579e-05,
"rewards/margins": 0.40838712453842163,
"rewards/rejected": -0.4082978367805481,
"step": 540
},
{
"epoch": 0.84,
"learning_rate": 3.877548160747768e-07,
"logits/chosen": 0.12814117968082428,
"logits/rejected": 0.19134709239006042,
"logps/chosen": -337.3287658691406,
"logps/rejected": -354.94415283203125,
"loss": 1.6835,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.006277731154114008,
"rewards/margins": 0.3102794587612152,
"rewards/rejected": -0.30400174856185913,
"step": 550
},
{
"epoch": 0.85,
"learning_rate": 3.195735209788528e-07,
"logits/chosen": 0.1329401135444641,
"logits/rejected": 0.2162102907896042,
"logps/chosen": -341.5213928222656,
"logps/rejected": -338.03179931640625,
"loss": 1.6469,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.0008542388677597046,
"rewards/margins": 0.3375299572944641,
"rewards/rejected": -0.338384211063385,
"step": 560
},
{
"epoch": 0.87,
"learning_rate": 2.5757288163336806e-07,
"logits/chosen": 0.1493878811597824,
"logits/rejected": 0.20596106350421906,
"logps/chosen": -352.513916015625,
"logps/rejected": -391.4380798339844,
"loss": 1.6831,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.06621219962835312,
"rewards/margins": 0.2802005708217621,
"rewards/rejected": -0.3464128077030182,
"step": 570
},
{
"epoch": 0.88,
"learning_rate": 2.019286455866981e-07,
"logits/chosen": 0.1281604915857315,
"logits/rejected": 0.19645507633686066,
"logps/chosen": -302.35040283203125,
"logps/rejected": -354.00372314453125,
"loss": 1.6607,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.044999100267887115,
"rewards/margins": 0.3108140826225281,
"rewards/rejected": -0.3558131754398346,
"step": 580
},
{
"epoch": 0.9,
"learning_rate": 1.5279854247146703e-07,
"logits/chosen": 0.1600816547870636,
"logits/rejected": 0.2620231509208679,
"logps/chosen": -363.3172607421875,
"logps/rejected": -383.7359619140625,
"loss": 1.5077,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.035707950592041016,
"rewards/margins": 0.3358200192451477,
"rewards/rejected": -0.3715279698371887,
"step": 590
},
{
"epoch": 0.91,
"learning_rate": 1.1032183690276754e-07,
"logits/chosen": 0.1881883442401886,
"logits/rejected": 0.23025290668010712,
"logps/chosen": -348.2078552246094,
"logps/rejected": -356.3308410644531,
"loss": 1.4724,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.01992412842810154,
"rewards/margins": 0.35621514916419983,
"rewards/rejected": -0.37613925337791443,
"step": 600
},
{
"epoch": 0.93,
"learning_rate": 7.46189337174788e-08,
"logits/chosen": 0.16047361493110657,
"logits/rejected": 0.21806029975414276,
"logps/chosen": -338.9239196777344,
"logps/rejected": -370.13238525390625,
"loss": 1.5501,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.0008598908898420632,
"rewards/margins": 0.35586023330688477,
"rewards/rejected": -0.35672011971473694,
"step": 610
},
{
"epoch": 0.94,
"learning_rate": 4.579103667367385e-08,
"logits/chosen": 0.1737244576215744,
"logits/rejected": 0.2040444165468216,
"logps/chosen": -367.3244323730469,
"logps/rejected": -375.1661071777344,
"loss": 1.6325,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.08697754144668579,
"rewards/margins": 0.260633647441864,
"rewards/rejected": -0.3476111590862274,
"step": 620
},
{
"epoch": 0.96,
"learning_rate": 2.3919861577572924e-08,
"logits/chosen": 0.17082975804805756,
"logits/rejected": 0.2609696090221405,
"logps/chosen": -356.7315979003906,
"logps/rejected": -364.6842041015625,
"loss": 1.6992,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.03417596220970154,
"rewards/margins": 0.30746400356292725,
"rewards/rejected": -0.34163999557495117,
"step": 630
},
{
"epoch": 0.98,
"learning_rate": 9.067404651211808e-09,
"logits/chosen": 0.07360972464084625,
"logits/rejected": 0.17394272983074188,
"logps/chosen": -343.9101867675781,
"logps/rejected": -367.1744079589844,
"loss": 1.4701,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.053998060524463654,
"rewards/margins": 0.355236679315567,
"rewards/rejected": -0.4092347025871277,
"step": 640
},
{
"epoch": 0.99,
"learning_rate": 1.2757667974155896e-09,
"logits/chosen": 0.16294406354427338,
"logits/rejected": 0.23806321620941162,
"logps/chosen": -380.12554931640625,
"logps/rejected": -385.9973449707031,
"loss": 1.6559,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.054873187094926834,
"rewards/margins": 0.2882222533226013,
"rewards/rejected": -0.34309545159339905,
"step": 650
},
{
"epoch": 1.0,
"step": 656,
"total_flos": 0.0,
"train_loss": 1.6983561014256827,
"train_runtime": 7833.1099,
"train_samples_per_second": 2.681,
"train_steps_per_second": 0.084
}
],
"logging_steps": 10,
"max_steps": 656,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}