Jimmy19991222's picture
Upload folder using huggingface_hub
00c02a8 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9982631930527722,
"eval_steps": 400,
"global_step": 467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01068804275217101,
"grad_norm": 48.927791324930695,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": -1.0180829763412476,
"logits/rejected": -0.9883173704147339,
"logps/chosen": -0.2738715410232544,
"logps/rejected": -0.2716783285140991,
"loss": 3.0574,
"rewards/accuracies": 0.4375,
"rewards/chosen": -2.738715648651123,
"rewards/margins": -0.021932203322649002,
"rewards/rejected": -2.716783046722412,
"step": 5
},
{
"epoch": 0.02137608550434202,
"grad_norm": 39.813279548661036,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -1.0492197275161743,
"logits/rejected": -0.9815438985824585,
"logps/chosen": -0.2942040264606476,
"logps/rejected": -0.29975026845932007,
"loss": 3.0033,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.942039966583252,
"rewards/margins": 0.055462419986724854,
"rewards/rejected": -2.997502326965332,
"step": 10
},
{
"epoch": 0.03206412825651302,
"grad_norm": 54.64580630838249,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -0.9780637621879578,
"logits/rejected": -0.9978879690170288,
"logps/chosen": -0.2642993927001953,
"logps/rejected": -0.3006458878517151,
"loss": 2.9877,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.642993688583374,
"rewards/margins": 0.363465279340744,
"rewards/rejected": -3.0064589977264404,
"step": 15
},
{
"epoch": 0.04275217100868404,
"grad_norm": 78.63474777212464,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -0.9655851125717163,
"logits/rejected": -0.9391099810600281,
"logps/chosen": -0.2776910662651062,
"logps/rejected": -0.291360080242157,
"loss": 2.9252,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.7769107818603516,
"rewards/margins": 0.13669000566005707,
"rewards/rejected": -2.9136006832122803,
"step": 20
},
{
"epoch": 0.053440213760855046,
"grad_norm": 53.858972431024775,
"learning_rate": 5.319148936170212e-07,
"logits/chosen": -1.0097562074661255,
"logits/rejected": -0.9812997579574585,
"logps/chosen": -0.2714676260948181,
"logps/rejected": -0.27822521328926086,
"loss": 3.0821,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -2.7146763801574707,
"rewards/margins": 0.06757592409849167,
"rewards/rejected": -2.782252073287964,
"step": 25
},
{
"epoch": 0.06412825651302605,
"grad_norm": 44.312475927746796,
"learning_rate": 6.382978723404255e-07,
"logits/chosen": -0.9986146688461304,
"logits/rejected": -0.9536568522453308,
"logps/chosen": -0.27314493060112,
"logps/rejected": -0.27925461530685425,
"loss": 2.937,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -2.731449604034424,
"rewards/margins": 0.06109660863876343,
"rewards/rejected": -2.792546033859253,
"step": 30
},
{
"epoch": 0.07481629926519706,
"grad_norm": 55.321940182511284,
"learning_rate": 7.446808510638297e-07,
"logits/chosen": -1.0669300556182861,
"logits/rejected": -0.9896968603134155,
"logps/chosen": -0.29428571462631226,
"logps/rejected": -0.3205253481864929,
"loss": 2.905,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.942857265472412,
"rewards/margins": 0.26239633560180664,
"rewards/rejected": -3.2052536010742188,
"step": 35
},
{
"epoch": 0.08550434201736808,
"grad_norm": 53.68098989474069,
"learning_rate": 8.51063829787234e-07,
"logits/chosen": -1.0166269540786743,
"logits/rejected": -0.9719806909561157,
"logps/chosen": -0.2796934247016907,
"logps/rejected": -0.32216984033584595,
"loss": 2.916,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.796934127807617,
"rewards/margins": 0.42476367950439453,
"rewards/rejected": -3.221698045730591,
"step": 40
},
{
"epoch": 0.09619238476953908,
"grad_norm": 36.765236755711314,
"learning_rate": 9.574468085106384e-07,
"logits/chosen": -1.0554900169372559,
"logits/rejected": -1.0124839544296265,
"logps/chosen": -0.3013826012611389,
"logps/rejected": -0.3502373695373535,
"loss": 2.9447,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -3.0138256549835205,
"rewards/margins": 0.4885478913784027,
"rewards/rejected": -3.5023739337921143,
"step": 45
},
{
"epoch": 0.10688042752171009,
"grad_norm": 72.12342853911701,
"learning_rate": 9.998741174712533e-07,
"logits/chosen": -1.033050298690796,
"logits/rejected": -0.9839521646499634,
"logps/chosen": -0.3049773573875427,
"logps/rejected": -0.3382193446159363,
"loss": 2.976,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -3.0497734546661377,
"rewards/margins": 0.33241981267929077,
"rewards/rejected": -3.382193088531494,
"step": 50
},
{
"epoch": 0.11756847027388109,
"grad_norm": 67.04896260966717,
"learning_rate": 9.991050648838675e-07,
"logits/chosen": -1.0543005466461182,
"logits/rejected": -1.0193541049957275,
"logps/chosen": -0.2847168445587158,
"logps/rejected": -0.34575051069259644,
"loss": 2.7924,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.847168445587158,
"rewards/margins": 0.6103365421295166,
"rewards/rejected": -3.457504987716675,
"step": 55
},
{
"epoch": 0.1282565130260521,
"grad_norm": 53.20515583895435,
"learning_rate": 9.97637968732563e-07,
"logits/chosen": -1.1007188558578491,
"logits/rejected": -1.066847801208496,
"logps/chosen": -0.32495683431625366,
"logps/rejected": -0.3465155363082886,
"loss": 2.8738,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -3.249568462371826,
"rewards/margins": 0.215586856007576,
"rewards/rejected": -3.4651551246643066,
"step": 60
},
{
"epoch": 0.13894455577822312,
"grad_norm": 54.54015013992033,
"learning_rate": 9.954748808839674e-07,
"logits/chosen": -1.0070468187332153,
"logits/rejected": -0.9784091711044312,
"logps/chosen": -0.37832310795783997,
"logps/rejected": -0.43590840697288513,
"loss": 2.7895,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -3.783231258392334,
"rewards/margins": 0.5758528113365173,
"rewards/rejected": -4.359084129333496,
"step": 65
},
{
"epoch": 0.14963259853039412,
"grad_norm": 38.242775225934125,
"learning_rate": 9.926188266120295e-07,
"logits/chosen": -1.0234776735305786,
"logits/rejected": -0.9988471269607544,
"logps/chosen": -0.3544539511203766,
"logps/rejected": -0.4332161545753479,
"loss": 2.8516,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -3.5445396900177,
"rewards/margins": 0.7876222729682922,
"rewards/rejected": -4.332161903381348,
"step": 70
},
{
"epoch": 0.16032064128256512,
"grad_norm": 70.74640041136536,
"learning_rate": 9.890738003669027e-07,
"logits/chosen": -0.9775687456130981,
"logits/rejected": -0.9074035882949829,
"logps/chosen": -0.3704521059989929,
"logps/rejected": -0.42546525597572327,
"loss": 2.7815,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -3.704521656036377,
"rewards/margins": 0.5501310229301453,
"rewards/rejected": -4.254652500152588,
"step": 75
},
{
"epoch": 0.17100868403473615,
"grad_norm": 46.9909884312478,
"learning_rate": 9.848447601883433e-07,
"logits/chosen": -0.9548114538192749,
"logits/rejected": -0.94190514087677,
"logps/chosen": -0.35945671796798706,
"logps/rejected": -0.4592272639274597,
"loss": 2.7108,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -3.594567060470581,
"rewards/margins": 0.9977054595947266,
"rewards/rejected": -4.592272758483887,
"step": 80
},
{
"epoch": 0.18169672678690715,
"grad_norm": 54.79418392154241,
"learning_rate": 9.799376207714444e-07,
"logits/chosen": -0.9647032618522644,
"logits/rejected": -0.9432573318481445,
"logps/chosen": -0.3421172797679901,
"logps/rejected": -0.4004732072353363,
"loss": 2.6569,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -3.421172618865967,
"rewards/margins": 0.5835592746734619,
"rewards/rejected": -4.00473165512085,
"step": 85
},
{
"epoch": 0.19238476953907815,
"grad_norm": 63.87918692389446,
"learning_rate": 9.743592451943998e-07,
"logits/chosen": -1.0345466136932373,
"logits/rejected": -0.9992335438728333,
"logps/chosen": -0.4216434061527252,
"logps/rejected": -0.5047457218170166,
"loss": 2.8483,
"rewards/accuracies": 0.59375,
"rewards/chosen": -4.216434001922607,
"rewards/margins": 0.8310235142707825,
"rewards/rejected": -5.047457695007324,
"step": 90
},
{
"epoch": 0.20307281229124916,
"grad_norm": 53.84055400604519,
"learning_rate": 9.681174353198686e-07,
"logits/chosen": -1.0933572053909302,
"logits/rejected": -1.012095332145691,
"logps/chosen": -0.4486677050590515,
"logps/rejected": -0.4948577880859375,
"loss": 2.7206,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -4.4866766929626465,
"rewards/margins": 0.46190088987350464,
"rewards/rejected": -4.948577404022217,
"step": 95
},
{
"epoch": 0.21376085504342018,
"grad_norm": 69.39656295840837,
"learning_rate": 9.612209208833646e-07,
"logits/chosen": -0.9949450492858887,
"logits/rejected": -0.9710448384284973,
"logps/chosen": -0.42737340927124023,
"logps/rejected": -0.511344850063324,
"loss": 2.7353,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -4.273734092712402,
"rewards/margins": 0.8397142291069031,
"rewards/rejected": -5.113448143005371,
"step": 100
},
{
"epoch": 0.22444889779559118,
"grad_norm": 63.49627205534197,
"learning_rate": 9.536793472839324e-07,
"logits/chosen": -1.00840425491333,
"logits/rejected": -0.9560264348983765,
"logps/chosen": -0.4261465072631836,
"logps/rejected": -0.5318101644515991,
"loss": 2.6988,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -4.261464595794678,
"rewards/margins": 1.0566365718841553,
"rewards/rejected": -5.318101406097412,
"step": 105
},
{
"epoch": 0.23513694054776219,
"grad_norm": 67.07988857179406,
"learning_rate": 9.455032620941839e-07,
"logits/chosen": -0.9696318507194519,
"logits/rejected": -0.9108623266220093,
"logps/chosen": -0.48374947905540466,
"logps/rejected": -0.6151714324951172,
"loss": 2.6096,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -4.837494373321533,
"rewards/margins": 1.3142198324203491,
"rewards/rejected": -6.151714324951172,
"step": 110
},
{
"epoch": 0.2458249832999332,
"grad_norm": 80.4417839343177,
"learning_rate": 9.367041003085648e-07,
"logits/chosen": -1.033552646636963,
"logits/rejected": -0.9741662740707397,
"logps/chosen": -0.5227991938591003,
"logps/rejected": -0.5981119275093079,
"loss": 2.4723,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -5.227993011474609,
"rewards/margins": 0.7531263828277588,
"rewards/rejected": -5.981118202209473,
"step": 115
},
{
"epoch": 0.2565130260521042,
"grad_norm": 67.69889462049662,
"learning_rate": 9.272941683504808e-07,
"logits/chosen": -0.9916391372680664,
"logits/rejected": -0.9028812646865845,
"logps/chosen": -0.5420633554458618,
"logps/rejected": -0.7466092705726624,
"loss": 2.377,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -5.420632839202881,
"rewards/margins": 2.0454587936401367,
"rewards/rejected": -7.466092109680176,
"step": 120
},
{
"epoch": 0.26720106880427524,
"grad_norm": 64.90166370238528,
"learning_rate": 9.172866268606513e-07,
"logits/chosen": -1.0659786462783813,
"logits/rejected": -1.0236841440200806,
"logps/chosen": -0.6124440431594849,
"logps/rejected": -0.7124758958816528,
"loss": 2.2955,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -6.1244401931762695,
"rewards/margins": 1.0003182888031006,
"rewards/rejected": -7.124758720397949,
"step": 125
},
{
"epoch": 0.27788911155644624,
"grad_norm": 75.48258438787046,
"learning_rate": 9.066954722907638e-07,
"logits/chosen": -1.0887296199798584,
"logits/rejected": -1.0823543071746826,
"logps/chosen": -0.6110976934432983,
"logps/rejected": -0.8805627822875977,
"loss": 2.1296,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -6.110977649688721,
"rewards/margins": 2.694650888442993,
"rewards/rejected": -8.805627822875977,
"step": 130
},
{
"epoch": 0.28857715430861725,
"grad_norm": 62.13046213587147,
"learning_rate": 8.955355173281707e-07,
"logits/chosen": -1.061156153678894,
"logits/rejected": -1.0147919654846191,
"logps/chosen": -0.7112447023391724,
"logps/rejected": -0.8724945783615112,
"loss": 2.1133,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -7.1124467849731445,
"rewards/margins": 1.6124988794326782,
"rewards/rejected": -8.724946975708008,
"step": 135
},
{
"epoch": 0.29926519706078825,
"grad_norm": 79.80676489486827,
"learning_rate": 8.838223701790055e-07,
"logits/chosen": -1.1300795078277588,
"logits/rejected": -1.1087987422943115,
"logps/chosen": -0.8216513395309448,
"logps/rejected": -0.9944013357162476,
"loss": 2.0323,
"rewards/accuracies": 0.75,
"rewards/chosen": -8.216513633728027,
"rewards/margins": 1.7274997234344482,
"rewards/rejected": -9.944013595581055,
"step": 140
},
{
"epoch": 0.30995323981295925,
"grad_norm": 119.11117858285472,
"learning_rate": 8.71572412738697e-07,
"logits/chosen": -1.0456678867340088,
"logits/rejected": -1.0206925868988037,
"logps/chosen": -0.8874173164367676,
"logps/rejected": -1.1297991275787354,
"loss": 2.0077,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -8.874174118041992,
"rewards/margins": 2.423818588256836,
"rewards/rejected": -11.297992706298828,
"step": 145
},
{
"epoch": 0.32064128256513025,
"grad_norm": 80.60289814144,
"learning_rate": 8.588027776804058e-07,
"logits/chosen": -1.0754765272140503,
"logits/rejected": -1.0576502084732056,
"logps/chosen": -0.9953246116638184,
"logps/rejected": -1.2399874925613403,
"loss": 1.968,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -9.953246116638184,
"rewards/margins": 2.4466278553009033,
"rewards/rejected": -12.399874687194824,
"step": 150
},
{
"epoch": 0.33132932531730125,
"grad_norm": 79.65950829440058,
"learning_rate": 8.455313244934324e-07,
"logits/chosen": -1.080444097518921,
"logits/rejected": -1.0592705011367798,
"logps/chosen": -1.0582973957061768,
"logps/rejected": -1.3756240606307983,
"loss": 1.9981,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -10.582974433898926,
"rewards/margins": 3.173267364501953,
"rewards/rejected": -13.756240844726562,
"step": 155
},
{
"epoch": 0.3420173680694723,
"grad_norm": 81.44098785800907,
"learning_rate": 8.317766145051057e-07,
"logits/chosen": -1.0870612859725952,
"logits/rejected": -1.069802165031433,
"logps/chosen": -1.1801505088806152,
"logps/rejected": -1.5819157361984253,
"loss": 1.9469,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -11.801506042480469,
"rewards/margins": 4.017651557922363,
"rewards/rejected": -15.819157600402832,
"step": 160
},
{
"epoch": 0.3527054108216433,
"grad_norm": 61.9394419875011,
"learning_rate": 8.175578849210894e-07,
"logits/chosen": -1.0850841999053955,
"logits/rejected": -1.061554193496704,
"logps/chosen": -1.1361093521118164,
"logps/rejected": -1.5122711658477783,
"loss": 1.8308,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -11.361093521118164,
"rewards/margins": 3.761617660522461,
"rewards/rejected": -15.122709274291992,
"step": 165
},
{
"epoch": 0.3633934535738143,
"grad_norm": 86.89706327407258,
"learning_rate": 8.028950219204099e-07,
"logits/chosen": -1.0892133712768555,
"logits/rejected": -1.0675928592681885,
"logps/chosen": -1.1062101125717163,
"logps/rejected": -1.4951918125152588,
"loss": 1.7802,
"rewards/accuracies": 0.78125,
"rewards/chosen": -11.062100410461426,
"rewards/margins": 3.8898162841796875,
"rewards/rejected": -14.951919555664062,
"step": 170
},
{
"epoch": 0.3740814963259853,
"grad_norm": 96.87652305461658,
"learning_rate": 7.878085328428368e-07,
"logits/chosen": -1.1229137182235718,
"logits/rejected": -1.0774867534637451,
"logps/chosen": -1.1681886911392212,
"logps/rejected": -1.4487732648849487,
"loss": 1.6772,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -11.681886672973633,
"rewards/margins": 2.805846691131592,
"rewards/rejected": -14.487733840942383,
"step": 175
},
{
"epoch": 0.3847695390781563,
"grad_norm": 80.44938362402195,
"learning_rate": 7.723195175075135e-07,
"logits/chosen": -1.0816049575805664,
"logits/rejected": -1.0617396831512451,
"logps/chosen": -1.1198861598968506,
"logps/rejected": -1.4944720268249512,
"loss": 1.5945,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -11.198859214782715,
"rewards/margins": 3.7458598613739014,
"rewards/rejected": -14.944720268249512,
"step": 180
},
{
"epoch": 0.3954575818303273,
"grad_norm": 89.9964846943623,
"learning_rate": 7.564496387029531e-07,
"logits/chosen": -1.1142748594284058,
"logits/rejected": -1.061927080154419,
"logps/chosen": -1.1488279104232788,
"logps/rejected": -1.5771600008010864,
"loss": 1.6746,
"rewards/accuracies": 0.84375,
"rewards/chosen": -11.488279342651367,
"rewards/margins": 4.283320426940918,
"rewards/rejected": -15.771600723266602,
"step": 185
},
{
"epoch": 0.4061456245824983,
"grad_norm": 91.4567322928116,
"learning_rate": 7.402210918896689e-07,
"logits/chosen": -1.113872766494751,
"logits/rejected": -1.1223859786987305,
"logps/chosen": -1.2559322118759155,
"logps/rejected": -1.7311124801635742,
"loss": 1.5468,
"rewards/accuracies": 0.84375,
"rewards/chosen": -12.55932331085205,
"rewards/margins": 4.751800060272217,
"rewards/rejected": -17.31112289428711,
"step": 190
},
{
"epoch": 0.4168336673346693,
"grad_norm": 71.15679417803156,
"learning_rate": 7.236565741578162e-07,
"logits/chosen": -1.0601518154144287,
"logits/rejected": -1.043198823928833,
"logps/chosen": -1.2675104141235352,
"logps/rejected": -1.6440922021865845,
"loss": 1.6056,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -12.675103187561035,
"rewards/margins": 3.7658183574676514,
"rewards/rejected": -16.440921783447266,
"step": 195
},
{
"epoch": 0.42752171008684037,
"grad_norm": 88.98069899942548,
"learning_rate": 7.067792524832603e-07,
"logits/chosen": -1.0721577405929565,
"logits/rejected": -1.0621263980865479,
"logps/chosen": -1.3113422393798828,
"logps/rejected": -1.726875901222229,
"loss": 1.5055,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -13.113421440124512,
"rewards/margins": 4.155338287353516,
"rewards/rejected": -17.268760681152344,
"step": 200
},
{
"epoch": 0.43820975283901137,
"grad_norm": 96.85728294484134,
"learning_rate": 6.896127313264642e-07,
"logits/chosen": -1.10856032371521,
"logits/rejected": -1.059822916984558,
"logps/chosen": -1.3952258825302124,
"logps/rejected": -1.8415533304214478,
"loss": 1.716,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -13.952260971069336,
"rewards/margins": 4.4632720947265625,
"rewards/rejected": -18.415531158447266,
"step": 205
},
{
"epoch": 0.44889779559118237,
"grad_norm": 98.7584341258845,
"learning_rate": 6.721810196195174e-07,
"logits/chosen": -1.134487271308899,
"logits/rejected": -1.1236417293548584,
"logps/chosen": -1.4038760662078857,
"logps/rejected": -1.8213703632354736,
"loss": 1.5993,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -14.0387601852417,
"rewards/margins": 4.174942970275879,
"rewards/rejected": -18.213703155517578,
"step": 210
},
{
"epoch": 0.45958583834335337,
"grad_norm": 116.36934325190856,
"learning_rate": 6.545084971874736e-07,
"logits/chosen": -1.083676815032959,
"logits/rejected": -1.0672903060913086,
"logps/chosen": -1.3948618173599243,
"logps/rejected": -1.87642502784729,
"loss": 1.4766,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -13.948617935180664,
"rewards/margins": 4.8156328201293945,
"rewards/rejected": -18.76424789428711,
"step": 215
},
{
"epoch": 0.47027388109552437,
"grad_norm": 96.57054428988462,
"learning_rate": 6.3661988065096e-07,
"logits/chosen": -1.1386303901672363,
"logits/rejected": -1.1223524808883667,
"logps/chosen": -1.477141261100769,
"logps/rejected": -1.971549391746521,
"loss": 1.431,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -14.77141284942627,
"rewards/margins": 4.9440813064575195,
"rewards/rejected": -19.71549415588379,
"step": 220
},
{
"epoch": 0.48096192384769537,
"grad_norm": 72.99627339556893,
"learning_rate": 6.185401888577487e-07,
"logits/chosen": -1.1213773488998413,
"logits/rejected": -1.0908575057983398,
"logps/chosen": -1.5149943828582764,
"logps/rejected": -1.969143271446228,
"loss": 1.4658,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -15.149943351745605,
"rewards/margins": 4.541489601135254,
"rewards/rejected": -19.69143295288086,
"step": 225
},
{
"epoch": 0.4916499665998664,
"grad_norm": 75.07337643391894,
"learning_rate": 6.002947078916364e-07,
"logits/chosen": -1.1983073949813843,
"logits/rejected": -1.150994896888733,
"logps/chosen": -1.4561713933944702,
"logps/rejected": -1.9137779474258423,
"loss": 1.3907,
"rewards/accuracies": 0.78125,
"rewards/chosen": -14.561712265014648,
"rewards/margins": 4.5760674476623535,
"rewards/rejected": -19.137781143188477,
"step": 230
},
{
"epoch": 0.5023380093520374,
"grad_norm": 89.06305062801928,
"learning_rate": 5.819089557075688e-07,
"logits/chosen": -1.227797031402588,
"logits/rejected": -1.2002477645874023,
"logps/chosen": -1.4925800561904907,
"logps/rejected": -1.9937610626220703,
"loss": 1.4023,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -14.925801277160645,
"rewards/margins": 5.011811256408691,
"rewards/rejected": -19.937610626220703,
"step": 235
},
{
"epoch": 0.5130260521042084,
"grad_norm": 89.0732695289788,
"learning_rate": 5.634086464424742e-07,
"logits/chosen": -1.189410924911499,
"logits/rejected": -1.1908595561981201,
"logps/chosen": -1.4204081296920776,
"logps/rejected": -1.9320650100708008,
"loss": 1.4327,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -14.204083442687988,
"rewards/margins": 5.116568088531494,
"rewards/rejected": -19.320650100708008,
"step": 240
},
{
"epoch": 0.5237140948563794,
"grad_norm": 99.61038425380444,
"learning_rate": 5.448196544517167e-07,
"logits/chosen": -1.2636008262634277,
"logits/rejected": -1.2102385759353638,
"logps/chosen": -1.492004156112671,
"logps/rejected": -2.0921199321746826,
"loss": 1.3755,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -14.920039176940918,
"rewards/margins": 6.001158714294434,
"rewards/rejected": -20.921199798583984,
"step": 245
},
{
"epoch": 0.5344021376085505,
"grad_norm": 143.41066987990183,
"learning_rate": 5.26167978121472e-07,
"logits/chosen": -1.2205616235733032,
"logits/rejected": -1.2053756713867188,
"logps/chosen": -1.5569369792938232,
"logps/rejected": -2.1403331756591797,
"loss": 1.3485,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -15.569369316101074,
"rewards/margins": 5.833963394165039,
"rewards/rejected": -21.403331756591797,
"step": 250
},
{
"epoch": 0.5450901803607214,
"grad_norm": 71.93551703878607,
"learning_rate": 5.074797035076318e-07,
"logits/chosen": -1.2752165794372559,
"logits/rejected": -1.2500503063201904,
"logps/chosen": -1.6620187759399414,
"logps/rejected": -2.134455442428589,
"loss": 1.4857,
"rewards/accuracies": 0.8125,
"rewards/chosen": -16.620187759399414,
"rewards/margins": 4.724367141723633,
"rewards/rejected": -21.344552993774414,
"step": 255
},
{
"epoch": 0.5557782231128925,
"grad_norm": 85.67142749873541,
"learning_rate": 4.887809678520975e-07,
"logits/chosen": -1.2445859909057617,
"logits/rejected": -1.216204047203064,
"logps/chosen": -1.5793449878692627,
"logps/rejected": -2.078167676925659,
"loss": 1.4255,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.793449401855469,
"rewards/margins": 4.988225936889648,
"rewards/rejected": -20.781675338745117,
"step": 260
},
{
"epoch": 0.5664662658650634,
"grad_norm": 74.44253878678798,
"learning_rate": 4.700979230274829e-07,
"logits/chosen": -1.1956579685211182,
"logits/rejected": -1.1797969341278076,
"logps/chosen": -1.6723514795303345,
"logps/rejected": -2.195023536682129,
"loss": 1.3414,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -16.723514556884766,
"rewards/margins": 5.22672176361084,
"rewards/rejected": -21.95023536682129,
"step": 265
},
{
"epoch": 0.5771543086172345,
"grad_norm": 114.96460787224315,
"learning_rate": 4.514566989613559e-07,
"logits/chosen": -1.1912486553192139,
"logits/rejected": -1.1648938655853271,
"logps/chosen": -1.463266134262085,
"logps/rejected": -2.001335620880127,
"loss": 1.3473,
"rewards/accuracies": 0.84375,
"rewards/chosen": -14.632661819458008,
"rewards/margins": 5.3806915283203125,
"rewards/rejected": -20.013355255126953,
"step": 270
},
{
"epoch": 0.5878423513694054,
"grad_norm": 75.32343278326546,
"learning_rate": 4.328833670911724e-07,
"logits/chosen": -1.1563775539398193,
"logits/rejected": -1.1241414546966553,
"logps/chosen": -1.4626271724700928,
"logps/rejected": -1.924564003944397,
"loss": 1.4306,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -14.626272201538086,
"rewards/margins": 4.619367599487305,
"rewards/rejected": -19.24563980102539,
"step": 275
},
{
"epoch": 0.5985303941215765,
"grad_norm": 112.30854407154642,
"learning_rate": 4.144039039010124e-07,
"logits/chosen": -1.2617356777191162,
"logits/rejected": -1.2384282350540161,
"logps/chosen": -1.5061413049697876,
"logps/rejected": -2.0556976795196533,
"loss": 1.3243,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -15.06141185760498,
"rewards/margins": 5.495565891265869,
"rewards/rejected": -20.556978225708008,
"step": 280
},
{
"epoch": 0.6092184368737475,
"grad_norm": 102.49061452491978,
"learning_rate": 3.960441545911204e-07,
"logits/chosen": -1.2426598072052002,
"logits/rejected": -1.2112630605697632,
"logps/chosen": -1.5387237071990967,
"logps/rejected": -2.120283842086792,
"loss": 1.0685,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -15.387234687805176,
"rewards/margins": 5.815601825714111,
"rewards/rejected": -21.202838897705078,
"step": 285
},
{
"epoch": 0.6199064796259185,
"grad_norm": 86.79253258499234,
"learning_rate": 3.778297969310529e-07,
"logits/chosen": -1.2638859748840332,
"logits/rejected": -1.2220103740692139,
"logps/chosen": -1.564584493637085,
"logps/rejected": -2.053191661834717,
"loss": 1.3472,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -15.645845413208008,
"rewards/margins": 4.886073589324951,
"rewards/rejected": -20.531917572021484,
"step": 290
},
{
"epoch": 0.6305945223780896,
"grad_norm": 93.44397121318542,
"learning_rate": 3.5978630534699865e-07,
"logits/chosen": -1.2045689821243286,
"logits/rejected": -1.191235899925232,
"logps/chosen": -1.5795795917510986,
"logps/rejected": -2.093400239944458,
"loss": 1.1752,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -15.795794486999512,
"rewards/margins": 5.138205528259277,
"rewards/rejected": -20.934001922607422,
"step": 295
},
{
"epoch": 0.6412825651302605,
"grad_norm": 83.42376671175532,
"learning_rate": 3.4193891529348795e-07,
"logits/chosen": -1.128404974937439,
"logits/rejected": -1.1020969152450562,
"logps/chosen": -1.6557916402816772,
"logps/rejected": -2.1081161499023438,
"loss": 1.574,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -16.557918548583984,
"rewards/margins": 4.523244857788086,
"rewards/rejected": -21.081159591674805,
"step": 300
},
{
"epoch": 0.6519706078824316,
"grad_norm": 87.28007107027204,
"learning_rate": 3.243125879593286e-07,
"logits/chosen": -1.235114574432373,
"logits/rejected": -1.19254469871521,
"logps/chosen": -1.6206077337265015,
"logps/rejected": -2.079169750213623,
"loss": 1.3167,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -16.206077575683594,
"rewards/margins": 4.585621356964111,
"rewards/rejected": -20.791696548461914,
"step": 305
},
{
"epoch": 0.6626586506346025,
"grad_norm": 105.45685254547827,
"learning_rate": 3.069319753571269e-07,
"logits/chosen": -1.267155408859253,
"logits/rejected": -1.2484853267669678,
"logps/chosen": -1.6359084844589233,
"logps/rejected": -2.1494529247283936,
"loss": 1.3629,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -16.359085083007812,
"rewards/margins": 5.135441780090332,
"rewards/rejected": -21.49452781677246,
"step": 310
},
{
"epoch": 0.6733466933867736,
"grad_norm": 87.29974596975983,
"learning_rate": 2.898213858452173e-07,
"logits/chosen": -1.2641007900238037,
"logits/rejected": -1.2104285955429077,
"logps/chosen": -1.6273491382598877,
"logps/rejected": -2.1474812030792236,
"loss": 1.3491,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.273488998413086,
"rewards/margins": 5.201323509216309,
"rewards/rejected": -21.47481346130371,
"step": 315
},
{
"epoch": 0.6840347361389446,
"grad_norm": 93.70048699997521,
"learning_rate": 2.730047501302266e-07,
"logits/chosen": -1.247004747390747,
"logits/rejected": -1.2437224388122559,
"logps/chosen": -1.6495912075042725,
"logps/rejected": -2.273390293121338,
"loss": 1.2651,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -16.495912551879883,
"rewards/margins": 6.237987518310547,
"rewards/rejected": -22.73390007019043,
"step": 320
},
{
"epoch": 0.6947227788911156,
"grad_norm": 76.81018981722117,
"learning_rate": 2.5650558779781635e-07,
"logits/chosen": -1.26289701461792,
"logits/rejected": -1.2167103290557861,
"logps/chosen": -1.7046712636947632,
"logps/rejected": -2.372957944869995,
"loss": 1.3105,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -17.046714782714844,
"rewards/margins": 6.682864189147949,
"rewards/rejected": -23.72957992553711,
"step": 325
},
{
"epoch": 0.7054108216432866,
"grad_norm": 73.64401812634293,
"learning_rate": 2.403469744184154e-07,
"logits/chosen": -1.178143858909607,
"logits/rejected": -1.1377698183059692,
"logps/chosen": -1.6760982275009155,
"logps/rejected": -2.156362533569336,
"loss": 1.3515,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -16.760982513427734,
"rewards/margins": 4.802641868591309,
"rewards/rejected": -21.56362533569336,
"step": 330
},
{
"epoch": 0.7160988643954576,
"grad_norm": 100.69110505698991,
"learning_rate": 2.2455150927394878e-07,
"logits/chosen": -1.2217228412628174,
"logits/rejected": -1.20427405834198,
"logps/chosen": -1.6592464447021484,
"logps/rejected": -2.2141623497009277,
"loss": 1.1861,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.592464447021484,
"rewards/margins": 5.549159049987793,
"rewards/rejected": -22.141624450683594,
"step": 335
},
{
"epoch": 0.7267869071476286,
"grad_norm": 96.51234191429023,
"learning_rate": 2.0914128375069722e-07,
"logits/chosen": -1.2305556535720825,
"logits/rejected": -1.1986171007156372,
"logps/chosen": -1.5974411964416504,
"logps/rejected": -2.150116443634033,
"loss": 1.3127,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -15.974411010742188,
"rewards/margins": 5.526752948760986,
"rewards/rejected": -21.501163482666016,
"step": 340
},
{
"epoch": 0.7374749498997996,
"grad_norm": 82.01592774884807,
"learning_rate": 1.9413785044249676e-07,
"logits/chosen": -1.2591183185577393,
"logits/rejected": -1.2364073991775513,
"logps/chosen": -1.667109727859497,
"logps/rejected": -2.311634063720703,
"loss": 1.3624,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.671098709106445,
"rewards/margins": 6.445242404937744,
"rewards/rejected": -23.1163387298584,
"step": 345
},
{
"epoch": 0.7481629926519706,
"grad_norm": 122.79704197237824,
"learning_rate": 1.7956219300748792e-07,
"logits/chosen": -1.2474735975265503,
"logits/rejected": -1.2506452798843384,
"logps/chosen": -1.5353481769561768,
"logps/rejected": -2.0822863578796387,
"loss": 1.2838,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -15.353483200073242,
"rewards/margins": 5.4693803787231445,
"rewards/rejected": -20.822864532470703,
"step": 350
},
{
"epoch": 0.7588510354041417,
"grad_norm": 96.25560337558127,
"learning_rate": 1.6543469682057104e-07,
"logits/chosen": -1.1812379360198975,
"logits/rejected": -1.1956241130828857,
"logps/chosen": -1.5455963611602783,
"logps/rejected": -2.1094608306884766,
"loss": 1.1903,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -15.455963134765625,
"rewards/margins": 5.638647079467773,
"rewards/rejected": -21.0946102142334,
"step": 355
},
{
"epoch": 0.7695390781563126,
"grad_norm": 79.54472628433167,
"learning_rate": 1.5177512046261666e-07,
"logits/chosen": -1.2227225303649902,
"logits/rejected": -1.223512053489685,
"logps/chosen": -1.562652349472046,
"logps/rejected": -2.2054429054260254,
"loss": 1.3011,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -15.626523971557617,
"rewards/margins": 6.4279046058654785,
"rewards/rejected": -22.054428100585938,
"step": 360
},
{
"epoch": 0.7802271209084837,
"grad_norm": 67.31957818166626,
"learning_rate": 1.3860256808630427e-07,
"logits/chosen": -1.2667648792266846,
"logits/rejected": -1.204973816871643,
"logps/chosen": -1.611985206604004,
"logps/rejected": -2.2405993938446045,
"loss": 1.2638,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -16.119850158691406,
"rewards/margins": 6.286141872406006,
"rewards/rejected": -22.405994415283203,
"step": 365
},
{
"epoch": 0.7909151636606546,
"grad_norm": 96.4652631691847,
"learning_rate": 1.2593546269723647e-07,
"logits/chosen": -1.1908237934112549,
"logits/rejected": -1.1797075271606445,
"logps/chosen": -1.5888497829437256,
"logps/rejected": -2.0819642543792725,
"loss": 1.2368,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -15.888498306274414,
"rewards/margins": 4.931147575378418,
"rewards/rejected": -20.819644927978516,
"step": 370
},
{
"epoch": 0.8016032064128257,
"grad_norm": 77.32657538767864,
"learning_rate": 1.1379152038770029e-07,
"logits/chosen": -1.22549307346344,
"logits/rejected": -1.2306774854660034,
"logps/chosen": -1.7054897546768188,
"logps/rejected": -2.2909984588623047,
"loss": 1.2394,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.05489730834961,
"rewards/margins": 5.855085372924805,
"rewards/rejected": -22.909982681274414,
"step": 375
},
{
"epoch": 0.8122912491649966,
"grad_norm": 122.77103138361475,
"learning_rate": 1.0218772555910954e-07,
"logits/chosen": -1.2489427328109741,
"logits/rejected": -1.2302041053771973,
"logps/chosen": -1.59738028049469,
"logps/rejected": -2.1246509552001953,
"loss": 1.3954,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -15.973803520202637,
"rewards/margins": 5.272706031799316,
"rewards/rejected": -21.246509552001953,
"step": 380
},
{
"epoch": 0.8229792919171677,
"grad_norm": 73.11130573539627,
"learning_rate": 9.114030716778432e-08,
"logits/chosen": -1.231930136680603,
"logits/rejected": -1.2102787494659424,
"logps/chosen": -1.6145036220550537,
"logps/rejected": -2.3103325366973877,
"loss": 1.0955,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -16.145038604736328,
"rewards/margins": 6.958285331726074,
"rewards/rejected": -23.103322982788086,
"step": 385
},
{
"epoch": 0.8336673346693386,
"grad_norm": 80.68579596437256,
"learning_rate": 8.066471602728803e-08,
"logits/chosen": -1.2316021919250488,
"logits/rejected": -1.2188332080841064,
"logps/chosen": -1.6731784343719482,
"logps/rejected": -2.2686033248901367,
"loss": 1.2377,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -16.73178482055664,
"rewards/margins": 5.95424747467041,
"rewards/rejected": -22.686031341552734,
"step": 390
},
{
"epoch": 0.8443553774215097,
"grad_norm": 77.88673283635482,
"learning_rate": 7.077560319906694e-08,
"logits/chosen": -1.2400703430175781,
"logits/rejected": -1.2209936380386353,
"logps/chosen": -1.5812984704971313,
"logps/rejected": -2.1461308002471924,
"loss": 1.2574,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -15.812983512878418,
"rewards/margins": 5.648324012756348,
"rewards/rejected": -21.461307525634766,
"step": 395
},
{
"epoch": 0.8550434201736807,
"grad_norm": 64.27634143705052,
"learning_rate": 6.148679950161672e-08,
"logits/chosen": -1.2457908391952515,
"logits/rejected": -1.2307510375976562,
"logps/chosen": -1.6303634643554688,
"logps/rejected": -2.144191265106201,
"loss": 1.2352,
"rewards/accuracies": 0.8125,
"rewards/chosen": -16.303632736206055,
"rewards/margins": 5.138282775878906,
"rewards/rejected": -21.441913604736328,
"step": 400
},
{
"epoch": 0.8550434201736807,
"eval_logits/chosen": -1.4050133228302002,
"eval_logits/rejected": -1.4148539304733276,
"eval_logps/chosen": -1.6315457820892334,
"eval_logps/rejected": -2.184220314025879,
"eval_loss": 1.3035991191864014,
"eval_rewards/accuracies": 0.8313007950782776,
"eval_rewards/chosen": -16.315458297729492,
"eval_rewards/margins": 5.526745319366455,
"eval_rewards/rejected": -21.842201232910156,
"eval_runtime": 114.1272,
"eval_samples_per_second": 17.183,
"eval_steps_per_second": 1.078,
"step": 400
},
{
"epoch": 0.8657314629258517,
"grad_norm": 93.3112085508996,
"learning_rate": 5.2811296166831666e-08,
"logits/chosen": -1.2086267471313477,
"logits/rejected": -1.2275283336639404,
"logps/chosen": -1.705394983291626,
"logps/rejected": -2.2604918479919434,
"loss": 1.2335,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -17.053951263427734,
"rewards/margins": 5.550968647003174,
"rewards/rejected": -22.604917526245117,
"step": 405
},
{
"epoch": 0.8764195056780227,
"grad_norm": 147.49347048623574,
"learning_rate": 4.4761226670592066e-08,
"logits/chosen": -1.224875569343567,
"logits/rejected": -1.2125729322433472,
"logps/chosen": -1.6484178304672241,
"logps/rejected": -2.173166036605835,
"loss": 1.3786,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -16.48417854309082,
"rewards/margins": 5.247479438781738,
"rewards/rejected": -21.731660842895508,
"step": 410
},
{
"epoch": 0.8871075484301937,
"grad_norm": 72.56853127664434,
"learning_rate": 3.734784976300165e-08,
"logits/chosen": -1.216326355934143,
"logits/rejected": -1.1681609153747559,
"logps/chosen": -1.584081768989563,
"logps/rejected": -2.2398409843444824,
"loss": 1.3224,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -15.84081745147705,
"rewards/margins": 6.557589530944824,
"rewards/rejected": -22.398406982421875,
"step": 415
},
{
"epoch": 0.8977955911823647,
"grad_norm": 96.60767749787689,
"learning_rate": 3.058153372200695e-08,
"logits/chosen": -1.252618432044983,
"logits/rejected": -1.206176996231079,
"logps/chosen": -1.533342719078064,
"logps/rejected": -2.1350560188293457,
"loss": 1.2256,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -15.333427429199219,
"rewards/margins": 6.0171332359313965,
"rewards/rejected": -21.350561141967773,
"step": 420
},
{
"epoch": 0.9084836339345357,
"grad_norm": 102.43117197696006,
"learning_rate": 2.4471741852423233e-08,
"logits/chosen": -1.2422844171524048,
"logits/rejected": -1.2342640161514282,
"logps/chosen": -1.7160053253173828,
"logps/rejected": -2.2498655319213867,
"loss": 1.4539,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -17.160053253173828,
"rewards/margins": 5.338602542877197,
"rewards/rejected": -22.498653411865234,
"step": 425
},
{
"epoch": 0.9191716766867067,
"grad_norm": 95.6241453357728,
"learning_rate": 1.9027019250647036e-08,
"logits/chosen": -1.232668161392212,
"logits/rejected": -1.2184712886810303,
"logps/chosen": -1.7022215127944946,
"logps/rejected": -2.2985284328460693,
"loss": 1.267,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.022212982177734,
"rewards/margins": 5.963072299957275,
"rewards/rejected": -22.985288619995117,
"step": 430
},
{
"epoch": 0.9298597194388778,
"grad_norm": 76.99966381399814,
"learning_rate": 1.4254980853566246e-08,
"logits/chosen": -1.182472825050354,
"logits/rejected": -1.1473052501678467,
"logps/chosen": -1.5595757961273193,
"logps/rejected": -2.154953956604004,
"loss": 1.2015,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -15.595758438110352,
"rewards/margins": 5.953780174255371,
"rewards/rejected": -21.54953956604004,
"step": 435
},
{
"epoch": 0.9405477621910487,
"grad_norm": 84.23154902337001,
"learning_rate": 1.016230078838226e-08,
"logits/chosen": -1.2160289287567139,
"logits/rejected": -1.1602892875671387,
"logps/chosen": -1.6690679788589478,
"logps/rejected": -2.205056667327881,
"loss": 1.245,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -16.690677642822266,
"rewards/margins": 5.359889984130859,
"rewards/rejected": -22.050569534301758,
"step": 440
},
{
"epoch": 0.9512358049432198,
"grad_norm": 75.27496517042923,
"learning_rate": 6.754703038239329e-09,
"logits/chosen": -1.1662180423736572,
"logits/rejected": -1.1511404514312744,
"logps/chosen": -1.6556246280670166,
"logps/rejected": -2.3011534214019775,
"loss": 1.1055,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.556243896484375,
"rewards/margins": 6.4552903175354,
"rewards/rejected": -23.011533737182617,
"step": 445
},
{
"epoch": 0.9619238476953907,
"grad_norm": 92.06659067628235,
"learning_rate": 4.036953436716895e-09,
"logits/chosen": -1.278028130531311,
"logits/rejected": -1.2591049671173096,
"logps/chosen": -1.6093896627426147,
"logps/rejected": -2.1693015098571777,
"loss": 1.3206,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -16.093896865844727,
"rewards/margins": 5.599120140075684,
"rewards/rejected": -21.693017959594727,
"step": 450
},
{
"epoch": 0.9726118904475618,
"grad_norm": 100.7331017689662,
"learning_rate": 2.0128530023804656e-09,
"logits/chosen": -1.2293764352798462,
"logits/rejected": -1.1971036195755005,
"logps/chosen": -1.6129881143569946,
"logps/rejected": -2.264960765838623,
"loss": 1.0669,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -16.129878997802734,
"rewards/margins": 6.5197248458862305,
"rewards/rejected": -22.649606704711914,
"step": 455
},
{
"epoch": 0.9832999331997327,
"grad_norm": 88.7868280064186,
"learning_rate": 6.852326227130833e-10,
"logits/chosen": -1.241369366645813,
"logits/rejected": -1.2309256792068481,
"logps/chosen": -1.6784422397613525,
"logps/rejected": -2.295506000518799,
"loss": 1.1912,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -16.784420013427734,
"rewards/margins": 6.170637607574463,
"rewards/rejected": -22.95505714416504,
"step": 460
},
{
"epoch": 0.9939879759519038,
"grad_norm": 89.2084840240269,
"learning_rate": 5.594909486328348e-11,
"logits/chosen": -1.2126357555389404,
"logits/rejected": -1.2189154624938965,
"logps/chosen": -1.7046855688095093,
"logps/rejected": -2.3294055461883545,
"loss": 1.3123,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.046857833862305,
"rewards/margins": 6.247200965881348,
"rewards/rejected": -23.294055938720703,
"step": 465
},
{
"epoch": 0.9982631930527722,
"step": 467,
"total_flos": 0.0,
"train_loss": 1.7982526555561662,
"train_runtime": 17001.7268,
"train_samples_per_second": 3.522,
"train_steps_per_second": 0.027
}
],
"logging_steps": 5,
"max_steps": 467,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}