zephyr-7b-dpo-full / trainer_state.json
RikkiXu's picture
Model save
05f301d verified
raw
history blame
No virus
51.8 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9994242947610823,
"eval_steps": 100,
"global_step": 868,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011514104778353484,
"grad_norm": 35.91765211885503,
"learning_rate": 5.747126436781609e-09,
"logits/chosen": -2.086653709411621,
"logits/rejected": -2.069509267807007,
"logps/chosen": -361.22979736328125,
"logps/rejected": -328.4201354980469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.011514104778353483,
"grad_norm": 37.62574042925476,
"learning_rate": 5.747126436781609e-08,
"logits/chosen": -2.192697763442993,
"logits/rejected": -2.1893699169158936,
"logps/chosen": -346.8982238769531,
"logps/rejected": -305.4053039550781,
"loss": 0.6929,
"rewards/accuracies": 0.4652777910232544,
"rewards/chosen": 0.00022573958267457783,
"rewards/margins": 0.00043605040991678834,
"rewards/rejected": -0.00021031053620390594,
"step": 10
},
{
"epoch": 0.023028209556706966,
"grad_norm": 33.76619596156607,
"learning_rate": 1.1494252873563217e-07,
"logits/chosen": -2.170515775680542,
"logits/rejected": -2.1960134506225586,
"logps/chosen": -322.89593505859375,
"logps/rejected": -279.732177734375,
"loss": 0.6923,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.006018324755132198,
"rewards/margins": 0.0009490437805652618,
"rewards/rejected": 0.005069280508905649,
"step": 20
},
{
"epoch": 0.03454231433506045,
"grad_norm": 36.02949439768653,
"learning_rate": 1.7241379310344828e-07,
"logits/chosen": -2.226337194442749,
"logits/rejected": -2.215334415435791,
"logps/chosen": -343.44012451171875,
"logps/rejected": -305.6834411621094,
"loss": 0.6875,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.0371861457824707,
"rewards/margins": 0.012388146482408047,
"rewards/rejected": 0.02479800209403038,
"step": 30
},
{
"epoch": 0.04605641911341393,
"grad_norm": 30.794242683432575,
"learning_rate": 2.2988505747126435e-07,
"logits/chosen": -2.3109958171844482,
"logits/rejected": -2.272737979888916,
"logps/chosen": -313.8249206542969,
"logps/rejected": -281.3092956542969,
"loss": 0.6758,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.10629389435052872,
"rewards/margins": 0.035184551030397415,
"rewards/rejected": 0.071109339594841,
"step": 40
},
{
"epoch": 0.057570523891767415,
"grad_norm": 29.832104382822315,
"learning_rate": 2.873563218390804e-07,
"logits/chosen": -2.4144537448883057,
"logits/rejected": -2.4051060676574707,
"logps/chosen": -335.85626220703125,
"logps/rejected": -322.4024658203125,
"loss": 0.664,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.20595140755176544,
"rewards/margins": 0.058795731514692307,
"rewards/rejected": 0.14715565741062164,
"step": 50
},
{
"epoch": 0.0690846286701209,
"grad_norm": 27.97699348851217,
"learning_rate": 3.4482758620689656e-07,
"logits/chosen": -2.4252865314483643,
"logits/rejected": -2.4110381603240967,
"logps/chosen": -293.0983581542969,
"logps/rejected": -276.4584655761719,
"loss": 0.6437,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.26091432571411133,
"rewards/margins": 0.12072187662124634,
"rewards/rejected": 0.140192449092865,
"step": 60
},
{
"epoch": 0.08059873344847437,
"grad_norm": 26.14817360357517,
"learning_rate": 4.0229885057471266e-07,
"logits/chosen": -2.5252156257629395,
"logits/rejected": -2.488867998123169,
"logps/chosen": -341.91156005859375,
"logps/rejected": -308.27032470703125,
"loss": 0.6192,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.3610069155693054,
"rewards/margins": 0.20518210530281067,
"rewards/rejected": 0.15582481026649475,
"step": 70
},
{
"epoch": 0.09211283822682786,
"grad_norm": 26.01503586020309,
"learning_rate": 4.597701149425287e-07,
"logits/chosen": -2.443207263946533,
"logits/rejected": -2.4321365356445312,
"logps/chosen": -303.1759948730469,
"logps/rejected": -293.99212646484375,
"loss": 0.5946,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.2370336949825287,
"rewards/margins": 0.22374853491783142,
"rewards/rejected": 0.013285147957503796,
"step": 80
},
{
"epoch": 0.10362694300518134,
"grad_norm": 28.597789728089687,
"learning_rate": 4.999817969178237e-07,
"logits/chosen": -2.468017578125,
"logits/rejected": -2.45894718170166,
"logps/chosen": -341.286376953125,
"logps/rejected": -346.0598449707031,
"loss": 0.5438,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.2997075915336609,
"rewards/margins": 0.4598621726036072,
"rewards/rejected": -0.16015461087226868,
"step": 90
},
{
"epoch": 0.11514104778353483,
"grad_norm": 31.239635888342793,
"learning_rate": 4.996582603056428e-07,
"logits/chosen": -2.290760040283203,
"logits/rejected": -2.2722649574279785,
"logps/chosen": -325.2711181640625,
"logps/rejected": -352.16949462890625,
"loss": 0.5118,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0031170793808996677,
"rewards/margins": 0.5678674578666687,
"rewards/rejected": -0.5709845423698425,
"step": 100
},
{
"epoch": 0.11514104778353483,
"eval_logits/chosen": -2.2212953567504883,
"eval_logits/rejected": -2.1984219551086426,
"eval_logps/chosen": -390.5766296386719,
"eval_logps/rejected": -417.6701354980469,
"eval_loss": 0.592314600944519,
"eval_rewards/accuracies": 0.70703125,
"eval_rewards/chosen": -0.11199207603931427,
"eval_rewards/margins": 0.3385947644710541,
"eval_rewards/rejected": -0.45058679580688477,
"eval_runtime": 98.608,
"eval_samples_per_second": 20.282,
"eval_steps_per_second": 0.325,
"step": 100
},
{
"epoch": 0.1266551525618883,
"grad_norm": 28.87850245767613,
"learning_rate": 4.989308132738126e-07,
"logits/chosen": -2.224853754043579,
"logits/rejected": -2.1996631622314453,
"logps/chosen": -334.91888427734375,
"logps/rejected": -380.91668701171875,
"loss": 0.4719,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.0493912398815155,
"rewards/margins": 0.8100606203079224,
"rewards/rejected": -0.7606694102287292,
"step": 110
},
{
"epoch": 0.1381692573402418,
"grad_norm": 29.398659404338673,
"learning_rate": 4.978006327248536e-07,
"logits/chosen": -2.199742555618286,
"logits/rejected": -2.1492202281951904,
"logps/chosen": -314.296142578125,
"logps/rejected": -369.991455078125,
"loss": 0.4704,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.09014983475208282,
"rewards/margins": 0.9132173657417297,
"rewards/rejected": -0.8230674862861633,
"step": 120
},
{
"epoch": 0.1496833621185953,
"grad_norm": 30.44019666597221,
"learning_rate": 4.962695471250032e-07,
"logits/chosen": -2.1790311336517334,
"logits/rejected": -2.1547985076904297,
"logps/chosen": -302.8690490722656,
"logps/rejected": -415.23095703125,
"loss": 0.4555,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.09897075593471527,
"rewards/margins": 1.2424136400222778,
"rewards/rejected": -1.1434428691864014,
"step": 130
},
{
"epoch": 0.16119746689694875,
"grad_norm": 33.58601902040164,
"learning_rate": 4.94340033546025e-07,
"logits/chosen": -2.2502989768981934,
"logits/rejected": -2.2536580562591553,
"logps/chosen": -325.1845397949219,
"logps/rejected": -431.7062072753906,
"loss": 0.4345,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.23212842643260956,
"rewards/margins": 1.2962288856506348,
"rewards/rejected": -1.0641005039215088,
"step": 140
},
{
"epoch": 0.17271157167530224,
"grad_norm": 32.120902840689595,
"learning_rate": 4.920152136576705e-07,
"logits/chosen": -2.44754958152771,
"logits/rejected": -2.4280953407287598,
"logps/chosen": -325.13916015625,
"logps/rejected": -465.1835021972656,
"loss": 0.4604,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.07414035499095917,
"rewards/margins": 1.312412977218628,
"rewards/rejected": -1.2382725477218628,
"step": 150
},
{
"epoch": 0.18422567645365573,
"grad_norm": 27.142754060910285,
"learning_rate": 4.892988486772756e-07,
"logits/chosen": -2.7220418453216553,
"logits/rejected": -2.731748342514038,
"logps/chosen": -341.7224426269531,
"logps/rejected": -451.0387268066406,
"loss": 0.4331,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.20678754150867462,
"rewards/margins": 1.3990733623504639,
"rewards/rejected": -1.1922857761383057,
"step": 160
},
{
"epoch": 0.19573978123200922,
"grad_norm": 29.918359187167102,
"learning_rate": 4.861953332846629e-07,
"logits/chosen": -2.796257495880127,
"logits/rejected": -2.810292959213257,
"logps/chosen": -360.57257080078125,
"logps/rejected": -441.2469787597656,
"loss": 0.4495,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.04986714571714401,
"rewards/margins": 1.442570447921753,
"rewards/rejected": -1.3927034139633179,
"step": 170
},
{
"epoch": 0.20725388601036268,
"grad_norm": 28.18581518610586,
"learning_rate": 4.827096885121953e-07,
"logits/chosen": -2.9461441040039062,
"logits/rejected": -2.936654567718506,
"logps/chosen": -342.01666259765625,
"logps/rejected": -421.7103576660156,
"loss": 0.435,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.026858195662498474,
"rewards/margins": 1.3959574699401855,
"rewards/rejected": -1.4228156805038452,
"step": 180
},
{
"epoch": 0.21876799078871617,
"grad_norm": 35.53737142925795,
"learning_rate": 4.788475536214821e-07,
"logits/chosen": -3.022078275680542,
"logits/rejected": -3.0052285194396973,
"logps/chosen": -336.94830322265625,
"logps/rejected": -493.62359619140625,
"loss": 0.4228,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.03777497634291649,
"rewards/margins": 1.5011249780654907,
"rewards/rejected": -1.4633500576019287,
"step": 190
},
{
"epoch": 0.23028209556706966,
"grad_norm": 32.357788149040054,
"learning_rate": 4.746151769798818e-07,
"logits/chosen": -3.098576545715332,
"logits/rejected": -3.122755527496338,
"logps/chosen": -350.237060546875,
"logps/rejected": -476.60345458984375,
"loss": 0.4206,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.029024356976151466,
"rewards/margins": 1.669870376586914,
"rewards/rejected": -1.6408460140228271,
"step": 200
},
{
"epoch": 0.23028209556706966,
"eval_logits/chosen": -3.164449691772461,
"eval_logits/rejected": -3.2280213832855225,
"eval_logps/chosen": -408.5089416503906,
"eval_logps/rejected": -480.46405029296875,
"eval_loss": 0.5054616928100586,
"eval_rewards/accuracies": 0.80078125,
"eval_rewards/chosen": -0.2913154363632202,
"eval_rewards/margins": 0.7872099280357361,
"eval_rewards/rejected": -1.078525424003601,
"eval_runtime": 98.2744,
"eval_samples_per_second": 20.351,
"eval_steps_per_second": 0.326,
"step": 200
},
{
"epoch": 0.24179620034542315,
"grad_norm": 33.674165033906036,
"learning_rate": 4.7001940595156055e-07,
"logits/chosen": -3.1950924396514893,
"logits/rejected": -3.276893138885498,
"logps/chosen": -364.2984313964844,
"logps/rejected": -458.85418701171875,
"loss": 0.4096,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.04986700415611267,
"rewards/margins": 1.6173715591430664,
"rewards/rejected": -1.6672385931015015,
"step": 210
},
{
"epoch": 0.2533103051237766,
"grad_norm": 33.42353087043008,
"learning_rate": 4.650676758194623e-07,
"logits/chosen": -3.289186477661133,
"logits/rejected": -3.4233367443084717,
"logps/chosen": -340.89410400390625,
"logps/rejected": -531.8297729492188,
"loss": 0.417,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.3013092875480652,
"rewards/margins": 2.0576224327087402,
"rewards/rejected": -2.35893177986145,
"step": 220
},
{
"epoch": 0.26482440990213013,
"grad_norm": 28.030706610514635,
"learning_rate": 4.5976799775611215e-07,
"logits/chosen": -3.4384427070617676,
"logits/rejected": -3.6002049446105957,
"logps/chosen": -357.27099609375,
"logps/rejected": -521.6351318359375,
"loss": 0.4404,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.060726016759872437,
"rewards/margins": 2.054325580596924,
"rewards/rejected": -2.115051746368408,
"step": 230
},
{
"epoch": 0.2763385146804836,
"grad_norm": 30.164608033500873,
"learning_rate": 4.5412894586271543e-07,
"logits/chosen": -3.5104153156280518,
"logits/rejected": -3.591907024383545,
"logps/chosen": -341.6837463378906,
"logps/rejected": -471.0796813964844,
"loss": 0.4392,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.23911134898662567,
"rewards/margins": 1.611322045326233,
"rewards/rejected": -1.850433588027954,
"step": 240
},
{
"epoch": 0.28785261945883706,
"grad_norm": 31.949435858685035,
"learning_rate": 4.481596432975201e-07,
"logits/chosen": -3.528832197189331,
"logits/rejected": -3.651289463043213,
"logps/chosen": -336.5597229003906,
"logps/rejected": -484.8773498535156,
"loss": 0.425,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.026675838977098465,
"rewards/margins": 1.7153713703155518,
"rewards/rejected": -1.6886956691741943,
"step": 250
},
{
"epoch": 0.2993667242371906,
"grad_norm": 27.939909687462926,
"learning_rate": 4.41869747515886e-07,
"logits/chosen": -3.489166736602783,
"logits/rejected": -3.7278106212615967,
"logps/chosen": -356.98907470703125,
"logps/rejected": -521.9197387695312,
"loss": 0.4148,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.07938538491725922,
"rewards/margins": 2.32578706741333,
"rewards/rejected": -2.24640154838562,
"step": 260
},
{
"epoch": 0.31088082901554404,
"grad_norm": 34.336437982786,
"learning_rate": 4.352694346459396e-07,
"logits/chosen": -3.69819974899292,
"logits/rejected": -3.856245756149292,
"logps/chosen": -312.3550109863281,
"logps/rejected": -512.3087768554688,
"loss": 0.3868,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.007610364351421595,
"rewards/margins": 2.3179588317871094,
"rewards/rejected": -2.3103487491607666,
"step": 270
},
{
"epoch": 0.3223949337938975,
"grad_norm": 31.93422033932675,
"learning_rate": 4.2836938302509256e-07,
"logits/chosen": -3.8322901725769043,
"logits/rejected": -4.021459579467773,
"logps/chosen": -364.43157958984375,
"logps/rejected": -556.7454223632812,
"loss": 0.3795,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -0.3510279357433319,
"rewards/margins": 2.118349075317383,
"rewards/rejected": -2.469377040863037,
"step": 280
},
{
"epoch": 0.333909038572251,
"grad_norm": 43.67643614347539,
"learning_rate": 4.2118075592405874e-07,
"logits/chosen": -4.014069080352783,
"logits/rejected": -4.166284561157227,
"logps/chosen": -366.17498779296875,
"logps/rejected": -511.95806884765625,
"loss": 0.4028,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.3753136992454529,
"rewards/margins": 1.9316318035125732,
"rewards/rejected": -2.306945562362671,
"step": 290
},
{
"epoch": 0.3454231433506045,
"grad_norm": 33.05155256360138,
"learning_rate": 4.137151834863213e-07,
"logits/chosen": -3.932748794555664,
"logits/rejected": -4.1272077560424805,
"logps/chosen": -338.482666015625,
"logps/rejected": -491.4756774902344,
"loss": 0.4144,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.12368359416723251,
"rewards/margins": 1.6778045892715454,
"rewards/rejected": -1.8014881610870361,
"step": 300
},
{
"epoch": 0.3454231433506045,
"eval_logits/chosen": -3.886050224304199,
"eval_logits/rejected": -4.0962815284729,
"eval_logps/chosen": -410.2217712402344,
"eval_logps/rejected": -499.97003173828125,
"eval_loss": 0.45044589042663574,
"eval_rewards/accuracies": 0.77734375,
"eval_rewards/chosen": -0.3084433674812317,
"eval_rewards/margins": 0.9651419520378113,
"eval_rewards/rejected": -1.273585319519043,
"eval_runtime": 99.0297,
"eval_samples_per_second": 20.196,
"eval_steps_per_second": 0.323,
"step": 300
},
{
"epoch": 0.356937248128958,
"grad_norm": 30.758950038626843,
"learning_rate": 4.059847439122671e-07,
"logits/chosen": -4.072343826293945,
"logits/rejected": -4.278454780578613,
"logps/chosen": -332.38323974609375,
"logps/rejected": -486.20587158203125,
"loss": 0.4126,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.11183549463748932,
"rewards/margins": 1.9423106908798218,
"rewards/rejected": -1.8304752111434937,
"step": 310
},
{
"epoch": 0.36845135290731146,
"grad_norm": 35.899670349090925,
"learning_rate": 3.98001943918432e-07,
"logits/chosen": -4.233328819274902,
"logits/rejected": -4.456056594848633,
"logps/chosen": -370.2253723144531,
"logps/rejected": -577.809814453125,
"loss": 0.3732,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.1710590422153473,
"rewards/margins": 2.226891279220581,
"rewards/rejected": -2.3979504108428955,
"step": 320
},
{
"epoch": 0.3799654576856649,
"grad_norm": 31.506974249108822,
"learning_rate": 3.8977969850346866e-07,
"logits/chosen": -4.291365146636963,
"logits/rejected": -4.589537143707275,
"logps/chosen": -402.2667541503906,
"logps/rejected": -580.32080078125,
"loss": 0.4158,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.40963658690452576,
"rewards/margins": 2.1939713954925537,
"rewards/rejected": -2.6036081314086914,
"step": 330
},
{
"epoch": 0.39147956246401844,
"grad_norm": 42.312479747132286,
"learning_rate": 3.8133131005357465e-07,
"logits/chosen": -4.51456356048584,
"logits/rejected": -4.711074352264404,
"logps/chosen": -356.7383117675781,
"logps/rejected": -599.3222045898438,
"loss": 0.3868,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.3934357762336731,
"rewards/margins": 2.4568190574645996,
"rewards/rejected": -2.850255012512207,
"step": 340
},
{
"epoch": 0.4029936672423719,
"grad_norm": 34.94322397599626,
"learning_rate": 3.7267044682118435e-07,
"logits/chosen": -4.381545066833496,
"logits/rejected": -4.7945661544799805,
"logps/chosen": -396.62408447265625,
"logps/rejected": -617.2008666992188,
"loss": 0.3886,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -0.23957356810569763,
"rewards/margins": 2.6808698177337646,
"rewards/rejected": -2.920443296432495,
"step": 350
},
{
"epoch": 0.41450777202072536,
"grad_norm": 35.153895155661694,
"learning_rate": 3.638111208117425e-07,
"logits/chosen": -4.376262664794922,
"logits/rejected": -4.689536094665527,
"logps/chosen": -387.55474853515625,
"logps/rejected": -586.8858642578125,
"loss": 0.4037,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.23621347546577454,
"rewards/margins": 2.256948232650757,
"rewards/rejected": -2.493161678314209,
"step": 360
},
{
"epoch": 0.4260218767990789,
"grad_norm": 30.56527510711544,
"learning_rate": 3.5476766511433605e-07,
"logits/chosen": -4.566588878631592,
"logits/rejected": -4.897808074951172,
"logps/chosen": -381.00604248046875,
"logps/rejected": -585.059814453125,
"loss": 0.3902,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.14318397641181946,
"rewards/margins": 2.517329692840576,
"rewards/rejected": -2.6605141162872314,
"step": 370
},
{
"epoch": 0.43753598157743234,
"grad_norm": 34.017679923693805,
"learning_rate": 3.455547107128602e-07,
"logits/chosen": -4.60725736618042,
"logits/rejected": -5.102498531341553,
"logps/chosen": -385.83770751953125,
"logps/rejected": -623.3347778320312,
"loss": 0.3929,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.5362241268157959,
"rewards/margins": 2.6802401542663574,
"rewards/rejected": -3.2164645195007324,
"step": 380
},
{
"epoch": 0.44905008635578586,
"grad_norm": 33.15867623899776,
"learning_rate": 3.361871628152338e-07,
"logits/chosen": -4.563677787780762,
"logits/rejected": -4.989599227905273,
"logps/chosen": -367.84814453125,
"logps/rejected": -567.6351318359375,
"loss": 0.4213,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.3700157105922699,
"rewards/margins": 2.4626548290252686,
"rewards/rejected": -2.8326706886291504,
"step": 390
},
{
"epoch": 0.4605641911341393,
"grad_norm": 35.10207305823101,
"learning_rate": 3.2668017673896077e-07,
"logits/chosen": -4.686192035675049,
"logits/rejected": -5.130132675170898,
"logps/chosen": -351.6319885253906,
"logps/rejected": -523.5940551757812,
"loss": 0.4011,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.2101125717163086,
"rewards/margins": 2.3180549144744873,
"rewards/rejected": -2.528167247772217,
"step": 400
},
{
"epoch": 0.4605641911341393,
"eval_logits/chosen": -4.5018205642700195,
"eval_logits/rejected": -4.837046146392822,
"eval_logps/chosen": -421.8441162109375,
"eval_logps/rejected": -525.9361572265625,
"eval_loss": 0.4135480225086212,
"eval_rewards/accuracies": 0.80859375,
"eval_rewards/chosen": -0.42466747760772705,
"eval_rewards/margins": 1.1085797548294067,
"eval_rewards/rejected": -1.5332471132278442,
"eval_runtime": 98.3292,
"eval_samples_per_second": 20.34,
"eval_steps_per_second": 0.325,
"step": 400
},
{
"epoch": 0.4720782959124928,
"grad_norm": 33.086992992339596,
"learning_rate": 3.1704913339205103e-07,
"logits/chosen": -4.71237850189209,
"logits/rejected": -5.09951639175415,
"logps/chosen": -392.43292236328125,
"logps/rejected": -596.8004150390625,
"loss": 0.3894,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.45191723108291626,
"rewards/margins": 2.4984166622161865,
"rewards/rejected": -2.950334072113037,
"step": 410
},
{
"epoch": 0.4835924006908463,
"grad_norm": 36.9499485623677,
"learning_rate": 3.0730961438896885e-07,
"logits/chosen": -4.71737003326416,
"logits/rejected": -5.089630603790283,
"logps/chosen": -371.7138977050781,
"logps/rejected": -539.5205078125,
"loss": 0.3986,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.6353754997253418,
"rewards/margins": 1.956162452697754,
"rewards/rejected": -2.591538190841675,
"step": 420
},
{
"epoch": 0.49510650546919976,
"grad_norm": 28.416064555595714,
"learning_rate": 2.9747737684186795e-07,
"logits/chosen": -4.5956220626831055,
"logits/rejected": -5.009639263153076,
"logps/chosen": -388.5729064941406,
"logps/rejected": -566.389892578125,
"loss": 0.3953,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.5186115503311157,
"rewards/margins": 2.118881940841675,
"rewards/rejected": -2.63749361038208,
"step": 430
},
{
"epoch": 0.5066206102475532,
"grad_norm": 35.02068361332514,
"learning_rate": 2.8756832786789663e-07,
"logits/chosen": -4.5723748207092285,
"logits/rejected": -5.229958534240723,
"logps/chosen": -344.8235778808594,
"logps/rejected": -562.1149291992188,
"loss": 0.3753,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.18356148898601532,
"rewards/margins": 2.6801793575286865,
"rewards/rejected": -2.863740921020508,
"step": 440
},
{
"epoch": 0.5181347150259067,
"grad_norm": 29.90766637224572,
"learning_rate": 2.7759849885381747e-07,
"logits/chosen": -4.58120059967041,
"logits/rejected": -5.108014106750488,
"logps/chosen": -380.8218688964844,
"logps/rejected": -558.5294189453125,
"loss": 0.395,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.36003825068473816,
"rewards/margins": 2.234218120574951,
"rewards/rejected": -2.594256639480591,
"step": 450
},
{
"epoch": 0.5296488198042603,
"grad_norm": 43.539308942722826,
"learning_rate": 2.675840195195762e-07,
"logits/chosen": -4.849000453948975,
"logits/rejected": -5.308794975280762,
"logps/chosen": -353.55523681640625,
"logps/rejected": -619.9716796875,
"loss": 0.3685,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.29138100147247314,
"rewards/margins": 2.825038433074951,
"rewards/rejected": -3.116419553756714,
"step": 460
},
{
"epoch": 0.5411629245826137,
"grad_norm": 33.774855687056665,
"learning_rate": 2.575410918227829e-07,
"logits/chosen": -4.863161087036133,
"logits/rejected": -5.457709312438965,
"logps/chosen": -411.6463317871094,
"logps/rejected": -598.97314453125,
"loss": 0.3821,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.46561044454574585,
"rewards/margins": 2.4459636211395264,
"rewards/rejected": -2.911574602127075,
"step": 470
},
{
"epoch": 0.5526770293609672,
"grad_norm": 33.53580470090372,
"learning_rate": 2.474859637463226e-07,
"logits/chosen": -5.079291343688965,
"logits/rejected": -5.424225807189941,
"logps/chosen": -389.027099609375,
"logps/rejected": -587.9437255859375,
"loss": 0.3962,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.4632614254951477,
"rewards/margins": 2.3001296520233154,
"rewards/rejected": -2.7633910179138184,
"step": 480
},
{
"epoch": 0.5641911341393206,
"grad_norm": 32.1453411001328,
"learning_rate": 2.3743490301150355e-07,
"logits/chosen": -5.007067680358887,
"logits/rejected": -5.361691474914551,
"logps/chosen": -343.4484558105469,
"logps/rejected": -570.6577758789062,
"loss": 0.3902,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.14810001850128174,
"rewards/margins": 2.4624667167663574,
"rewards/rejected": -2.6105666160583496,
"step": 490
},
{
"epoch": 0.5757052389176741,
"grad_norm": 32.90845084744282,
"learning_rate": 2.274041707592724e-07,
"logits/chosen": -4.921438694000244,
"logits/rejected": -5.355481147766113,
"logps/chosen": -339.01129150390625,
"logps/rejected": -556.4103393554688,
"loss": 0.3915,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.14777924120426178,
"rewards/margins": 2.432879686355591,
"rewards/rejected": -2.5806591510772705,
"step": 500
},
{
"epoch": 0.5757052389176741,
"eval_logits/chosen": -4.767510890960693,
"eval_logits/rejected": -5.187655925750732,
"eval_logps/chosen": -418.29376220703125,
"eval_logps/rejected": -544.0393676757812,
"eval_loss": 0.37398749589920044,
"eval_rewards/accuracies": 0.8515625,
"eval_rewards/chosen": -0.389164000749588,
"eval_rewards/margins": 1.3251150846481323,
"eval_rewards/rejected": -1.7142791748046875,
"eval_runtime": 98.0381,
"eval_samples_per_second": 20.4,
"eval_steps_per_second": 0.326,
"step": 500
},
{
"epoch": 0.5872193436960277,
"grad_norm": 31.42761305876207,
"learning_rate": 2.17409995242075e-07,
"logits/chosen": -5.038609504699707,
"logits/rejected": -5.722345352172852,
"logps/chosen": -372.905517578125,
"logps/rejected": -569.4352416992188,
"loss": 0.376,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -0.27033573389053345,
"rewards/margins": 2.4031970500946045,
"rewards/rejected": -2.6735329627990723,
"step": 510
},
{
"epoch": 0.5987334484743811,
"grad_norm": 29.61275457382243,
"learning_rate": 2.0746854556892544e-07,
"logits/chosen": -5.438863754272461,
"logits/rejected": -5.798094749450684,
"logps/chosen": -407.27008056640625,
"logps/rejected": -620.6509399414062,
"loss": 0.3645,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.43467459082603455,
"rewards/margins": 2.4455971717834473,
"rewards/rejected": -2.8802719116210938,
"step": 520
},
{
"epoch": 0.6102475532527346,
"grad_norm": 27.24117353879226,
"learning_rate": 1.9759590554616173e-07,
"logits/chosen": -5.715832710266113,
"logits/rejected": -6.058187961578369,
"logps/chosen": -397.95849609375,
"logps/rejected": -609.6741943359375,
"loss": 0.3968,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.6830942034721375,
"rewards/margins": 2.4185569286346436,
"rewards/rejected": -3.101651191711426,
"step": 530
},
{
"epoch": 0.6217616580310881,
"grad_norm": 30.859422948077256,
"learning_rate": 1.8780804765620746e-07,
"logits/chosen": -5.4331769943237305,
"logits/rejected": -5.7857160568237305,
"logps/chosen": -373.3824462890625,
"logps/rejected": -528.5029296875,
"loss": 0.4178,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.4058764576911926,
"rewards/margins": 1.9241279363632202,
"rewards/rejected": -2.3300044536590576,
"step": 540
},
{
"epoch": 0.6332757628094415,
"grad_norm": 35.78902948656132,
"learning_rate": 1.7812080721643973e-07,
"logits/chosen": -5.20429801940918,
"logits/rejected": -5.622688293457031,
"logps/chosen": -401.1048889160156,
"logps/rejected": -605.438232421875,
"loss": 0.3956,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.27011531591415405,
"rewards/margins": 2.323632001876831,
"rewards/rejected": -2.593747615814209,
"step": 550
},
{
"epoch": 0.644789867587795,
"grad_norm": 31.09337668064834,
"learning_rate": 1.6854985675997063e-07,
"logits/chosen": -5.3274736404418945,
"logits/rejected": -5.779025554656982,
"logps/chosen": -370.87823486328125,
"logps/rejected": -599.370361328125,
"loss": 0.377,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.30361196398735046,
"rewards/margins": 2.5692386627197266,
"rewards/rejected": -2.8728506565093994,
"step": 560
},
{
"epoch": 0.6563039723661486,
"grad_norm": 31.49748801480019,
"learning_rate": 1.5911068067978818e-07,
"logits/chosen": -5.422667503356934,
"logits/rejected": -5.991160869598389,
"logps/chosen": -363.42791748046875,
"logps/rejected": -606.8687744140625,
"loss": 0.3651,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -0.3893515467643738,
"rewards/margins": 2.7044646739959717,
"rewards/rejected": -3.093816041946411,
"step": 570
},
{
"epoch": 0.667818077144502,
"grad_norm": 40.80686884426901,
"learning_rate": 1.4981855017728197e-07,
"logits/chosen": -5.2194623947143555,
"logits/rejected": -5.8604302406311035,
"logps/chosen": -378.5892028808594,
"logps/rejected": -623.4224853515625,
"loss": 0.3681,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.4009127616882324,
"rewards/margins": 2.839203357696533,
"rewards/rejected": -3.2401161193847656,
"step": 580
},
{
"epoch": 0.6793321819228555,
"grad_norm": 35.637123676945,
"learning_rate": 1.406884985556804e-07,
"logits/chosen": -5.340333461761475,
"logits/rejected": -5.9213457107543945,
"logps/chosen": -366.98126220703125,
"logps/rejected": -646.6055297851562,
"loss": 0.3892,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.3502456843852997,
"rewards/margins": 3.1350584030151367,
"rewards/rejected": -3.4853038787841797,
"step": 590
},
{
"epoch": 0.690846286701209,
"grad_norm": 38.133176182262396,
"learning_rate": 1.3173529689837354e-07,
"logits/chosen": -5.227208137512207,
"logits/rejected": -5.730982780456543,
"logps/chosen": -406.6194152832031,
"logps/rejected": -642.0016479492188,
"loss": 0.3726,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.19344040751457214,
"rewards/margins": 2.756740093231201,
"rewards/rejected": -2.9501805305480957,
"step": 600
},
{
"epoch": 0.690846286701209,
"eval_logits/chosen": -5.146116256713867,
"eval_logits/rejected": -5.624752044677734,
"eval_logps/chosen": -427.4439392089844,
"eval_logps/rejected": -561.528564453125,
"eval_loss": 0.3467547297477722,
"eval_rewards/accuracies": 0.84375,
"eval_rewards/chosen": -0.4806651175022125,
"eval_rewards/margins": 1.408505916595459,
"eval_rewards/rejected": -1.8891710042953491,
"eval_runtime": 98.3003,
"eval_samples_per_second": 20.346,
"eval_steps_per_second": 0.326,
"step": 600
},
{
"epoch": 0.7023603914795624,
"grad_norm": 35.76369238749813,
"learning_rate": 1.2297343017146726e-07,
"logits/chosen": -5.63295316696167,
"logits/rejected": -6.0680012702941895,
"logps/chosen": -352.22650146484375,
"logps/rejected": -569.6236572265625,
"loss": 0.3654,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.400468111038208,
"rewards/margins": 2.252286672592163,
"rewards/rejected": -2.65275502204895,
"step": 610
},
{
"epoch": 0.713874496257916,
"grad_norm": 42.53908245265289,
"learning_rate": 1.1441707378923474e-07,
"logits/chosen": -5.555817604064941,
"logits/rejected": -5.891648292541504,
"logps/chosen": -372.3026123046875,
"logps/rejected": -608.4457397460938,
"loss": 0.3719,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.5105666518211365,
"rewards/margins": 2.334003448486328,
"rewards/rejected": -2.844569683074951,
"step": 620
},
{
"epoch": 0.7253886010362695,
"grad_norm": 33.40462593975916,
"learning_rate": 1.06080070680377e-07,
"logits/chosen": -5.389917850494385,
"logits/rejected": -5.883559226989746,
"logps/chosen": -380.6363525390625,
"logps/rejected": -589.5970458984375,
"loss": 0.3608,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.4320860803127289,
"rewards/margins": 2.423119068145752,
"rewards/rejected": -2.8552052974700928,
"step": 630
},
{
"epoch": 0.7369027058146229,
"grad_norm": 40.31781331240861,
"learning_rate": 9.797590889219587e-08,
"logits/chosen": -5.418898582458496,
"logits/rejected": -6.029601097106934,
"logps/chosen": -331.7992248535156,
"logps/rejected": -644.7623291015625,
"loss": 0.4071,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.26965656876564026,
"rewards/margins": 3.317509174346924,
"rewards/rejected": -3.5871658325195312,
"step": 640
},
{
"epoch": 0.7484168105929764,
"grad_norm": 30.964195430126203,
"learning_rate": 9.011769976891367e-08,
"logits/chosen": -5.33644962310791,
"logits/rejected": -5.905170440673828,
"logps/chosen": -370.828369140625,
"logps/rejected": -630.619140625,
"loss": 0.3809,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.340393990278244,
"rewards/margins": 2.9275107383728027,
"rewards/rejected": -3.267904758453369,
"step": 650
},
{
"epoch": 0.7599309153713298,
"grad_norm": 34.09027033994428,
"learning_rate": 8.251815673944218e-08,
"logits/chosen": -5.566973686218262,
"logits/rejected": -5.901907444000244,
"logps/chosen": -373.8709411621094,
"logps/rejected": -626.88720703125,
"loss": 0.3664,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.31639060378074646,
"rewards/margins": 2.5317635536193848,
"rewards/rejected": -2.848154067993164,
"step": 660
},
{
"epoch": 0.7714450201496834,
"grad_norm": 33.748663190230474,
"learning_rate": 7.518957474892148e-08,
"logits/chosen": -5.544904708862305,
"logits/rejected": -6.055120468139648,
"logps/chosen": -366.33306884765625,
"logps/rejected": -662.8927001953125,
"loss": 0.3675,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -0.4155319333076477,
"rewards/margins": 3.206387758255005,
"rewards/rejected": -3.621919631958008,
"step": 670
},
{
"epoch": 0.7829591249280369,
"grad_norm": 33.43366335799461,
"learning_rate": 6.814381036730274e-08,
"logits/chosen": -5.3579840660095215,
"logits/rejected": -5.930968284606934,
"logps/chosen": -384.45245361328125,
"logps/rejected": -620.3960571289062,
"loss": 0.3748,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.3938636779785156,
"rewards/margins": 2.738201856613159,
"rewards/rejected": -3.132065773010254,
"step": 680
},
{
"epoch": 0.7944732297063903,
"grad_norm": 31.210525154632403,
"learning_rate": 6.139226260715872e-08,
"logits/chosen": -5.434956073760986,
"logits/rejected": -5.966610908508301,
"logps/chosen": -387.60162353515625,
"logps/rejected": -664.8744506835938,
"loss": 0.355,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4180675446987152,
"rewards/margins": 2.967360019683838,
"rewards/rejected": -3.385427474975586,
"step": 690
},
{
"epoch": 0.8059873344847438,
"grad_norm": 33.963445753535076,
"learning_rate": 5.4945854481754734e-08,
"logits/chosen": -5.527676105499268,
"logits/rejected": -5.960885047912598,
"logps/chosen": -374.95916748046875,
"logps/rejected": -630.1693725585938,
"loss": 0.3522,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.3166103959083557,
"rewards/margins": 2.8152191638946533,
"rewards/rejected": -3.1318297386169434,
"step": 700
},
{
"epoch": 0.8059873344847438,
"eval_logits/chosen": -5.210726261138916,
"eval_logits/rejected": -5.681924343109131,
"eval_logps/chosen": -433.6905517578125,
"eval_logps/rejected": -577.3692016601562,
"eval_loss": 0.32489100098609924,
"eval_rewards/accuracies": 0.87890625,
"eval_rewards/chosen": -0.5431313514709473,
"eval_rewards/margins": 1.5044457912445068,
"eval_rewards/rejected": -2.047577142715454,
"eval_runtime": 98.0334,
"eval_samples_per_second": 20.401,
"eval_steps_per_second": 0.326,
"step": 700
},
{
"epoch": 0.8175014392630973,
"grad_norm": 32.382102785679976,
"learning_rate": 4.881501533321605e-08,
"logits/chosen": -5.631700038909912,
"logits/rejected": -6.175845146179199,
"logps/chosen": -364.59674072265625,
"logps/rejected": -615.4799194335938,
"loss": 0.3861,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.4184879660606384,
"rewards/margins": 2.884592294692993,
"rewards/rejected": -3.3030803203582764,
"step": 710
},
{
"epoch": 0.8290155440414507,
"grad_norm": 29.844564520231344,
"learning_rate": 4.300966395938377e-08,
"logits/chosen": -5.579652309417725,
"logits/rejected": -6.021969795227051,
"logps/chosen": -410.3070373535156,
"logps/rejected": -654.1072387695312,
"loss": 0.3805,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -0.40225619077682495,
"rewards/margins": 2.8050906658172607,
"rewards/rejected": -3.2073471546173096,
"step": 720
},
{
"epoch": 0.8405296488198043,
"grad_norm": 34.64605949847163,
"learning_rate": 3.7539192566655246e-08,
"logits/chosen": -5.749828338623047,
"logits/rejected": -6.230714321136475,
"logps/chosen": -372.4962463378906,
"logps/rejected": -620.4830932617188,
"loss": 0.3701,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -0.3709852397441864,
"rewards/margins": 2.7844834327697754,
"rewards/rejected": -3.155468702316284,
"step": 730
},
{
"epoch": 0.8520437535981578,
"grad_norm": 38.917435902608844,
"learning_rate": 3.24124515747731e-08,
"logits/chosen": -5.770384311676025,
"logits/rejected": -6.440248966217041,
"logps/chosen": -377.38360595703125,
"logps/rejected": -670.9470825195312,
"loss": 0.3725,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.46737533807754517,
"rewards/margins": 3.3466858863830566,
"rewards/rejected": -3.814060926437378,
"step": 740
},
{
"epoch": 0.8635578583765112,
"grad_norm": 35.39576347923302,
"learning_rate": 2.763773529814506e-08,
"logits/chosen": -5.80182409286499,
"logits/rejected": -6.183619976043701,
"logps/chosen": -363.37359619140625,
"logps/rejected": -643.1031494140625,
"loss": 0.3736,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.513085663318634,
"rewards/margins": 2.7367725372314453,
"rewards/rejected": -3.2498581409454346,
"step": 750
},
{
"epoch": 0.8750719631548647,
"grad_norm": 35.82536365897154,
"learning_rate": 2.3222768526860698e-08,
"logits/chosen": -5.800836563110352,
"logits/rejected": -6.234482288360596,
"logps/chosen": -365.31903076171875,
"logps/rejected": -579.0399169921875,
"loss": 0.3663,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -0.5012763738632202,
"rewards/margins": 2.1673426628112793,
"rewards/rejected": -2.668619394302368,
"step": 760
},
{
"epoch": 0.8865860679332181,
"grad_norm": 37.880330092886545,
"learning_rate": 1.9174694029115146e-08,
"logits/chosen": -5.784181594848633,
"logits/rejected": -6.484677314758301,
"logps/chosen": -376.74908447265625,
"logps/rejected": -637.3211059570312,
"loss": 0.38,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.3697873055934906,
"rewards/margins": 3.116102933883667,
"rewards/rejected": -3.4858901500701904,
"step": 770
},
{
"epoch": 0.8981001727115717,
"grad_norm": 37.173154353795034,
"learning_rate": 1.5500060995258134e-08,
"logits/chosen": -5.590546607971191,
"logits/rejected": -6.252056121826172,
"logps/chosen": -404.06219482421875,
"logps/rejected": -671.0790405273438,
"loss": 0.3644,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.4821314811706543,
"rewards/margins": 2.973552703857422,
"rewards/rejected": -3.455684185028076,
"step": 780
},
{
"epoch": 0.9096142774899252,
"grad_norm": 38.483209821819536,
"learning_rate": 1.2204814442165812e-08,
"logits/chosen": -5.847277641296387,
"logits/rejected": -6.545414924621582,
"logps/chosen": -402.4599609375,
"logps/rejected": -618.3992309570312,
"loss": 0.3744,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.5441657900810242,
"rewards/margins": 2.8156542778015137,
"rewards/rejected": -3.3598198890686035,
"step": 790
},
{
"epoch": 0.9211283822682786,
"grad_norm": 36.88952100776894,
"learning_rate": 9.294285595075669e-09,
"logits/chosen": -5.882547378540039,
"logits/rejected": -6.232880115509033,
"logps/chosen": -359.8563537597656,
"logps/rejected": -655.06787109375,
"loss": 0.3643,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.496969074010849,
"rewards/margins": 2.759918689727783,
"rewards/rejected": -3.256887912750244,
"step": 800
},
{
"epoch": 0.9211283822682786,
"eval_logits/chosen": -5.403136253356934,
"eval_logits/rejected": -5.885165214538574,
"eval_logps/chosen": -439.6992492675781,
"eval_logps/rejected": -584.2129516601562,
"eval_loss": 0.31831786036491394,
"eval_rewards/accuracies": 0.87109375,
"eval_rewards/chosen": -0.6032183170318604,
"eval_rewards/margins": 1.5127967596054077,
"eval_rewards/rejected": -2.1160147190093994,
"eval_runtime": 98.1126,
"eval_samples_per_second": 20.385,
"eval_steps_per_second": 0.326,
"step": 800
},
{
"epoch": 0.9326424870466321,
"grad_norm": 43.94120514478602,
"learning_rate": 6.773183262446914e-09,
"logits/chosen": -5.6489362716674805,
"logits/rejected": -6.28032112121582,
"logps/chosen": -353.1646423339844,
"logps/rejected": -609.9522705078125,
"loss": 0.3848,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.40985745191574097,
"rewards/margins": 2.7903153896331787,
"rewards/rejected": -3.2001731395721436,
"step": 810
},
{
"epoch": 0.9441565918249856,
"grad_norm": 33.525448706821926,
"learning_rate": 4.645586217799452e-09,
"logits/chosen": -5.750053882598877,
"logits/rejected": -6.382951259613037,
"logps/chosen": -408.31915283203125,
"logps/rejected": -624.9613037109375,
"loss": 0.3682,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.44873589277267456,
"rewards/margins": 2.5182459354400635,
"rewards/rejected": -2.966981887817383,
"step": 820
},
{
"epoch": 0.9556706966033391,
"grad_norm": 32.59312352646331,
"learning_rate": 2.9149366008568987e-09,
"logits/chosen": -5.68507194519043,
"logits/rejected": -6.2285284996032715,
"logps/chosen": -345.0586853027344,
"logps/rejected": -635.7188720703125,
"loss": 0.3761,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -0.39172735810279846,
"rewards/margins": 2.9998083114624023,
"rewards/rejected": -3.391535520553589,
"step": 830
},
{
"epoch": 0.9671848013816926,
"grad_norm": 37.49243505993372,
"learning_rate": 1.5840343486700215e-09,
"logits/chosen": -5.730424404144287,
"logits/rejected": -6.221343040466309,
"logps/chosen": -356.298583984375,
"logps/rejected": -621.7361450195312,
"loss": 0.3928,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.4219423830509186,
"rewards/margins": 2.8504931926727295,
"rewards/rejected": -3.272435426712036,
"step": 840
},
{
"epoch": 0.9786989061600461,
"grad_norm": 33.08948980944996,
"learning_rate": 6.550326657293881e-10,
"logits/chosen": -5.9162678718566895,
"logits/rejected": -6.479850769042969,
"logps/chosen": -360.3614196777344,
"logps/rejected": -608.4212646484375,
"loss": 0.3596,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.3865709900856018,
"rewards/margins": 2.8733856678009033,
"rewards/rejected": -3.2599568367004395,
"step": 850
},
{
"epoch": 0.9902130109383995,
"grad_norm": 33.68247028780298,
"learning_rate": 1.2943454039654467e-10,
"logits/chosen": -5.6706414222717285,
"logits/rejected": -6.1612443923950195,
"logps/chosen": -388.79510498046875,
"logps/rejected": -634.7048950195312,
"loss": 0.3777,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.3972472846508026,
"rewards/margins": 2.8383138179779053,
"rewards/rejected": -3.2355613708496094,
"step": 860
},
{
"epoch": 0.9994242947610823,
"step": 868,
"total_flos": 0.0,
"train_loss": 0.4218231642850533,
"train_runtime": 14967.0092,
"train_samples_per_second": 7.425,
"train_steps_per_second": 0.058
}
],
"logging_steps": 10,
"max_steps": 868,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}