zephyr-7b-dpo-full / trainer_state.json
wzhouad's picture
Model save
8a4c724 verified
raw
history blame
No virus
26 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1.0416666666666666e-08,
"logits/chosen": -2.8099329471588135,
"logits/rejected": -2.7572641372680664,
"logps/chosen": -241.48843383789062,
"logps/rejected": -197.4517822265625,
"loss": 271.7943,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -2.83237361907959,
"logits/rejected": -2.808957815170288,
"logps/chosen": -292.6072692871094,
"logps/rejected": -278.4604797363281,
"loss": 286.0386,
"rewards/accuracies": 0.4166666567325592,
"rewards/chosen": 0.0008353570010513067,
"rewards/margins": -0.0004216647648718208,
"rewards/rejected": 0.0012570219114422798,
"step": 10
},
{
"epoch": 0.04,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -2.81878662109375,
"logits/rejected": -2.7905821800231934,
"logps/chosen": -286.19378662109375,
"logps/rejected": -286.7618103027344,
"loss": 264.9378,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.04205816239118576,
"rewards/margins": 0.0025776384864002466,
"rewards/rejected": 0.03948052600026131,
"step": 20
},
{
"epoch": 0.06,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.7921807765960693,
"logits/rejected": -2.7616498470306396,
"logps/chosen": -232.4526824951172,
"logps/rejected": -212.8272705078125,
"loss": 266.1199,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.13909421861171722,
"rewards/margins": 0.004568194039165974,
"rewards/rejected": 0.13452602922916412,
"step": 30
},
{
"epoch": 0.08,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -2.8503639698028564,
"logits/rejected": -2.819370985031128,
"logps/chosen": -280.4808654785156,
"logps/rejected": -243.86935424804688,
"loss": 258.2878,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.19040736556053162,
"rewards/margins": 0.01646682806313038,
"rewards/rejected": 0.1739405393600464,
"step": 40
},
{
"epoch": 0.1,
"learning_rate": 4.999733114418725e-07,
"logits/chosen": -2.764444589614868,
"logits/rejected": -2.7427210807800293,
"logps/chosen": -254.5093231201172,
"logps/rejected": -240.6798858642578,
"loss": 244.2991,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.22921113669872284,
"rewards/margins": -0.0009602505015209317,
"rewards/rejected": 0.23017136752605438,
"step": 50
},
{
"epoch": 0.13,
"learning_rate": 4.990398100856366e-07,
"logits/chosen": -2.7702980041503906,
"logits/rejected": -2.742863893508911,
"logps/chosen": -235.5380401611328,
"logps/rejected": -211.11770629882812,
"loss": 255.6714,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.23538246750831604,
"rewards/margins": 0.020409177988767624,
"rewards/rejected": 0.2149733006954193,
"step": 60
},
{
"epoch": 0.15,
"learning_rate": 4.967775735898179e-07,
"logits/chosen": -2.7509877681732178,
"logits/rejected": -2.7182183265686035,
"logps/chosen": -239.36514282226562,
"logps/rejected": -210.6224822998047,
"loss": 253.836,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.29057446122169495,
"rewards/margins": 0.04796246066689491,
"rewards/rejected": 0.24261197447776794,
"step": 70
},
{
"epoch": 0.17,
"learning_rate": 4.931986719649298e-07,
"logits/chosen": -2.699618339538574,
"logits/rejected": -2.697359800338745,
"logps/chosen": -241.5849609375,
"logps/rejected": -241.43533325195312,
"loss": 261.2479,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.29146844148635864,
"rewards/margins": 0.030574629083275795,
"rewards/rejected": 0.2608937919139862,
"step": 80
},
{
"epoch": 0.19,
"learning_rate": 4.883222001996351e-07,
"logits/chosen": -2.7679967880249023,
"logits/rejected": -2.7326865196228027,
"logps/chosen": -225.57156372070312,
"logps/rejected": -208.76693725585938,
"loss": 252.1535,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.29392915964126587,
"rewards/margins": 0.03465462103486061,
"rewards/rejected": 0.25927454233169556,
"step": 90
},
{
"epoch": 0.21,
"learning_rate": 4.821741763807186e-07,
"logits/chosen": -2.819113254547119,
"logits/rejected": -2.783386707305908,
"logps/chosen": -260.7179260253906,
"logps/rejected": -238.54434204101562,
"loss": 256.7688,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.28721123933792114,
"rewards/margins": -0.005946027580648661,
"rewards/rejected": 0.29315727949142456,
"step": 100
},
{
"epoch": 0.21,
"eval_logits/chosen": -2.7863247394561768,
"eval_logits/rejected": -2.767011880874634,
"eval_logps/chosen": -227.56491088867188,
"eval_logps/rejected": -230.47303771972656,
"eval_loss": 245.72764587402344,
"eval_rewards/accuracies": 0.57421875,
"eval_rewards/chosen": 0.29474756121635437,
"eval_rewards/margins": 0.025945277884602547,
"eval_rewards/rejected": 0.268802285194397,
"eval_runtime": 53.5253,
"eval_samples_per_second": 37.365,
"eval_steps_per_second": 0.598,
"step": 100
},
{
"epoch": 0.23,
"learning_rate": 4.747874028753375e-07,
"logits/chosen": -2.6783273220062256,
"logits/rejected": -2.644078016281128,
"logps/chosen": -237.9336395263672,
"logps/rejected": -188.83895874023438,
"loss": 240.9402,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.32163843512535095,
"rewards/margins": -0.008066670037806034,
"rewards/rejected": 0.3297051787376404,
"step": 110
},
{
"epoch": 0.25,
"learning_rate": 4.662012913161997e-07,
"logits/chosen": -2.746994733810425,
"logits/rejected": -2.724551200866699,
"logps/chosen": -240.14242553710938,
"logps/rejected": -237.6074981689453,
"loss": 245.7042,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.2870863080024719,
"rewards/margins": 0.03465163707733154,
"rewards/rejected": 0.25243470072746277,
"step": 120
},
{
"epoch": 0.27,
"learning_rate": 4.5646165232345103e-07,
"logits/chosen": -2.7405877113342285,
"logits/rejected": -2.7226414680480957,
"logps/chosen": -242.31460571289062,
"logps/rejected": -222.0249786376953,
"loss": 239.1022,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.3218019902706146,
"rewards/margins": 0.03552493453025818,
"rewards/rejected": 0.28627708554267883,
"step": 130
},
{
"epoch": 0.29,
"learning_rate": 4.456204510851956e-07,
"logits/chosen": -2.7363457679748535,
"logits/rejected": -2.710092067718506,
"logps/chosen": -248.31982421875,
"logps/rejected": -242.44064331054688,
"loss": 240.1779,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.32314616441726685,
"rewards/margins": 0.03991778939962387,
"rewards/rejected": 0.2832283675670624,
"step": 140
},
{
"epoch": 0.31,
"learning_rate": 4.337355301007335e-07,
"logits/chosen": -2.6783087253570557,
"logits/rejected": -2.707118511199951,
"logps/chosen": -196.87429809570312,
"logps/rejected": -205.2587890625,
"loss": 236.8216,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.2980460226535797,
"rewards/margins": -0.012227327562868595,
"rewards/rejected": 0.31027334928512573,
"step": 150
},
{
"epoch": 0.33,
"learning_rate": 4.2087030056579986e-07,
"logits/chosen": -2.7062594890594482,
"logits/rejected": -2.6792685985565186,
"logps/chosen": -224.0284881591797,
"logps/rejected": -215.8988494873047,
"loss": 233.851,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.31943365931510925,
"rewards/margins": 0.01964866928756237,
"rewards/rejected": 0.29978498816490173,
"step": 160
},
{
"epoch": 0.36,
"learning_rate": 4.070934040463998e-07,
"logits/chosen": -2.6613574028015137,
"logits/rejected": -2.6377127170562744,
"logps/chosen": -249.140869140625,
"logps/rejected": -229.2135009765625,
"loss": 238.7709,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.3302595019340515,
"rewards/margins": 0.02071164920926094,
"rewards/rejected": 0.30954790115356445,
"step": 170
},
{
"epoch": 0.38,
"learning_rate": 3.9247834624635404e-07,
"logits/chosen": -2.697937488555908,
"logits/rejected": -2.6982693672180176,
"logps/chosen": -237.61752319335938,
"logps/rejected": -222.94534301757812,
"loss": 240.8094,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.3156043589115143,
"rewards/margins": -0.00877897534519434,
"rewards/rejected": 0.32438334822654724,
"step": 180
},
{
"epoch": 0.4,
"learning_rate": 3.7710310482256523e-07,
"logits/chosen": -2.716660499572754,
"logits/rejected": -2.6782338619232178,
"logps/chosen": -256.08953857421875,
"logps/rejected": -233.2887725830078,
"loss": 241.1514,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.35306206345558167,
"rewards/margins": 0.0451970100402832,
"rewards/rejected": 0.3078650236129761,
"step": 190
},
{
"epoch": 0.42,
"learning_rate": 3.610497133404795e-07,
"logits/chosen": -2.6975326538085938,
"logits/rejected": -2.715744733810425,
"logps/chosen": -260.04095458984375,
"logps/rejected": -232.90744018554688,
"loss": 239.4804,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.3612653315067291,
"rewards/margins": 0.047182150185108185,
"rewards/rejected": 0.31408315896987915,
"step": 200
},
{
"epoch": 0.42,
"eval_logits/chosen": -2.7289865016937256,
"eval_logits/rejected": -2.7082109451293945,
"eval_logps/chosen": -223.84622192382812,
"eval_logps/rejected": -226.99331665039062,
"eval_loss": 241.5078125,
"eval_rewards/accuracies": 0.57421875,
"eval_rewards/chosen": 0.33193421363830566,
"eval_rewards/margins": 0.028334595263004303,
"eval_rewards/rejected": 0.30359959602355957,
"eval_runtime": 53.4363,
"eval_samples_per_second": 37.428,
"eval_steps_per_second": 0.599,
"step": 200
},
{
"epoch": 0.44,
"learning_rate": 3.4440382358952115e-07,
"logits/chosen": -2.653046131134033,
"logits/rejected": -2.639833927154541,
"logps/chosen": -236.7248992919922,
"logps/rejected": -215.8306427001953,
"loss": 232.2403,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.3630830645561218,
"rewards/margins": 0.002032138407230377,
"rewards/rejected": 0.36105093359947205,
"step": 210
},
{
"epoch": 0.46,
"learning_rate": 3.272542485937368e-07,
"logits/chosen": -2.6577677726745605,
"logits/rejected": -2.610652446746826,
"logps/chosen": -234.62515258789062,
"logps/rejected": -224.5327911376953,
"loss": 236.0736,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.403405100107193,
"rewards/margins": 0.025665929540991783,
"rewards/rejected": 0.3777391314506531,
"step": 220
},
{
"epoch": 0.48,
"learning_rate": 3.096924887558854e-07,
"logits/chosen": -2.7138988971710205,
"logits/rejected": -2.6695990562438965,
"logps/chosen": -238.20166015625,
"logps/rejected": -238.32510375976562,
"loss": 243.5431,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.36346226930618286,
"rewards/margins": 0.04655776172876358,
"rewards/rejected": 0.31690454483032227,
"step": 230
},
{
"epoch": 0.5,
"learning_rate": 2.9181224366319943e-07,
"logits/chosen": -2.6672616004943848,
"logits/rejected": -2.6338298320770264,
"logps/chosen": -245.7289581298828,
"logps/rejected": -222.80068969726562,
"loss": 241.8247,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.4044111371040344,
"rewards/margins": 0.09945273399353027,
"rewards/rejected": 0.30495840311050415,
"step": 240
},
{
"epoch": 0.52,
"learning_rate": 2.7370891215954565e-07,
"logits/chosen": -2.711887836456299,
"logits/rejected": -2.660736083984375,
"logps/chosen": -236.48410034179688,
"logps/rejected": -220.5592803955078,
"loss": 232.2589,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.3508817255496979,
"rewards/margins": 0.012243595905601978,
"rewards/rejected": 0.3386381268501282,
"step": 250
},
{
"epoch": 0.54,
"learning_rate": 2.55479083351317e-07,
"logits/chosen": -2.6873819828033447,
"logits/rejected": -2.665743827819824,
"logps/chosen": -226.0983428955078,
"logps/rejected": -222.646484375,
"loss": 249.1208,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.33049115538597107,
"rewards/margins": -0.025500113144516945,
"rewards/rejected": 0.35599130392074585,
"step": 260
},
{
"epoch": 0.56,
"learning_rate": 2.3722002126275822e-07,
"logits/chosen": -2.684255599975586,
"logits/rejected": -2.6710708141326904,
"logps/chosen": -250.91659545898438,
"logps/rejected": -232.743896484375,
"loss": 232.0498,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.3753630220890045,
"rewards/margins": 0.05215846374630928,
"rewards/rejected": 0.32320457696914673,
"step": 270
},
{
"epoch": 0.59,
"learning_rate": 2.19029145890313e-07,
"logits/chosen": -2.642585515975952,
"logits/rejected": -2.6435678005218506,
"logps/chosen": -271.9496154785156,
"logps/rejected": -232.7229461669922,
"loss": 242.3832,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.3579340875148773,
"rewards/margins": 0.029489517211914062,
"rewards/rejected": 0.32844457030296326,
"step": 280
},
{
"epoch": 0.61,
"learning_rate": 2.0100351342479216e-07,
"logits/chosen": -2.6854400634765625,
"logits/rejected": -2.6508984565734863,
"logps/chosen": -238.96533203125,
"logps/rejected": -214.7186737060547,
"loss": 237.8058,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.38798388838768005,
"rewards/margins": 0.002142349723726511,
"rewards/rejected": 0.3858415484428406,
"step": 290
},
{
"epoch": 0.63,
"learning_rate": 1.8323929841460178e-07,
"logits/chosen": -2.6979777812957764,
"logits/rejected": -2.6885297298431396,
"logps/chosen": -218.0729522705078,
"logps/rejected": -206.6052703857422,
"loss": 240.5041,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.3353363871574402,
"rewards/margins": -0.021952930837869644,
"rewards/rejected": 0.35728925466537476,
"step": 300
},
{
"epoch": 0.63,
"eval_logits/chosen": -2.7041783332824707,
"eval_logits/rejected": -2.680725574493408,
"eval_logps/chosen": -222.30435180664062,
"eval_logps/rejected": -225.71856689453125,
"eval_loss": 239.8052978515625,
"eval_rewards/accuracies": 0.5625,
"eval_rewards/chosen": 0.3473527431488037,
"eval_rewards/margins": 0.03100587986409664,
"eval_rewards/rejected": 0.3163468837738037,
"eval_runtime": 53.5009,
"eval_samples_per_second": 37.383,
"eval_steps_per_second": 0.598,
"step": 300
},
{
"epoch": 0.65,
"learning_rate": 1.6583128063291573e-07,
"logits/chosen": -2.681898832321167,
"logits/rejected": -2.647827625274658,
"logps/chosen": -249.84933471679688,
"logps/rejected": -221.06466674804688,
"loss": 244.5816,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.37856870889663696,
"rewards/margins": 0.031483568251132965,
"rewards/rejected": 0.347085177898407,
"step": 310
},
{
"epoch": 0.67,
"learning_rate": 1.488723393865766e-07,
"logits/chosen": -2.6419777870178223,
"logits/rejected": -2.6089978218078613,
"logps/chosen": -247.5414581298828,
"logps/rejected": -228.00442504882812,
"loss": 227.1146,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.38319897651672363,
"rewards/margins": 0.010034086182713509,
"rewards/rejected": 0.3731648921966553,
"step": 320
},
{
"epoch": 0.69,
"learning_rate": 1.3245295796480788e-07,
"logits/chosen": -2.6546921730041504,
"logits/rejected": -2.619576930999756,
"logps/chosen": -235.1280059814453,
"logps/rejected": -228.36550903320312,
"loss": 235.3102,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.3890138268470764,
"rewards/margins": 0.030328262597322464,
"rewards/rejected": 0.35868555307388306,
"step": 330
},
{
"epoch": 0.71,
"learning_rate": 1.1666074087171627e-07,
"logits/chosen": -2.6908440589904785,
"logits/rejected": -2.6842880249023438,
"logps/chosen": -221.61752319335938,
"logps/rejected": -221.1806182861328,
"loss": 226.7886,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.3297863006591797,
"rewards/margins": 0.012335492298007011,
"rewards/rejected": 0.3174508213996887,
"step": 340
},
{
"epoch": 0.73,
"learning_rate": 1.0157994641835734e-07,
"logits/chosen": -2.6660337448120117,
"logits/rejected": -2.6440846920013428,
"logps/chosen": -235.5915069580078,
"logps/rejected": -227.4123077392578,
"loss": 235.1443,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.4086208939552307,
"rewards/margins": 0.052063293755054474,
"rewards/rejected": 0.35655760765075684,
"step": 350
},
{
"epoch": 0.75,
"learning_rate": 8.729103716819111e-08,
"logits/chosen": -2.631333112716675,
"logits/rejected": -2.605966806411743,
"logps/chosen": -223.34115600585938,
"logps/rejected": -207.91500854492188,
"loss": 246.5647,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.35043418407440186,
"rewards/margins": -0.0040539586916565895,
"rewards/rejected": 0.35448816418647766,
"step": 360
},
{
"epoch": 0.77,
"learning_rate": 7.387025063449081e-08,
"logits/chosen": -2.667217254638672,
"logits/rejected": -2.6636977195739746,
"logps/chosen": -240.44247436523438,
"logps/rejected": -231.893798828125,
"loss": 236.9863,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.37617605924606323,
"rewards/margins": 0.05858853459358215,
"rewards/rejected": 0.3175875246524811,
"step": 370
},
{
"epoch": 0.79,
"learning_rate": 6.138919252022435e-08,
"logits/chosen": -2.6146702766418457,
"logits/rejected": -2.611351728439331,
"logps/chosen": -189.4939422607422,
"logps/rejected": -206.30081176757812,
"loss": 233.4487,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.3334718644618988,
"rewards/margins": -0.03314907103776932,
"rewards/rejected": 0.3666209578514099,
"step": 380
},
{
"epoch": 0.82,
"learning_rate": 4.991445467064689e-08,
"logits/chosen": -2.663173198699951,
"logits/rejected": -2.634059429168701,
"logps/chosen": -221.12240600585938,
"logps/rejected": -204.8622589111328,
"loss": 241.2531,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.30437472462654114,
"rewards/margins": -0.002047918038442731,
"rewards/rejected": 0.3064226508140564,
"step": 390
},
{
"epoch": 0.84,
"learning_rate": 3.9507259776993954e-08,
"logits/chosen": -2.646238088607788,
"logits/rejected": -2.6048245429992676,
"logps/chosen": -250.01693725585938,
"logps/rejected": -215.4422607421875,
"loss": 236.8453,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.35686779022216797,
"rewards/margins": 0.03598792105913162,
"rewards/rejected": 0.3208799362182617,
"step": 400
},
{
"epoch": 0.84,
"eval_logits/chosen": -2.697472333908081,
"eval_logits/rejected": -2.673011541366577,
"eval_logps/chosen": -221.73487854003906,
"eval_logps/rejected": -225.18995666503906,
"eval_loss": 239.1992950439453,
"eval_rewards/accuracies": 0.5625,
"eval_rewards/chosen": 0.3530477285385132,
"eval_rewards/margins": 0.031414665281772614,
"eval_rewards/rejected": 0.3216330409049988,
"eval_runtime": 53.4218,
"eval_samples_per_second": 37.438,
"eval_steps_per_second": 0.599,
"step": 400
},
{
"epoch": 0.86,
"learning_rate": 3.022313472693447e-08,
"logits/chosen": -2.684819459915161,
"logits/rejected": -2.622917652130127,
"logps/chosen": -226.3255157470703,
"logps/rejected": -246.3280487060547,
"loss": 234.8678,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.3726418912410736,
"rewards/margins": 0.02128712832927704,
"rewards/rejected": 0.3513546884059906,
"step": 410
},
{
"epoch": 0.88,
"learning_rate": 2.2111614344599684e-08,
"logits/chosen": -2.6737873554229736,
"logits/rejected": -2.638714551925659,
"logps/chosen": -250.0021514892578,
"logps/rejected": -255.8992156982422,
"loss": 246.8406,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.3681090474128723,
"rewards/margins": 0.0030112355016171932,
"rewards/rejected": 0.3650978207588196,
"step": 420
},
{
"epoch": 0.9,
"learning_rate": 1.521597710086439e-08,
"logits/chosen": -2.6430375576019287,
"logits/rejected": -2.6089277267456055,
"logps/chosen": -255.19985961914062,
"logps/rejected": -234.4241485595703,
"loss": 234.6739,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.3505329489707947,
"rewards/margins": -0.049122948199510574,
"rewards/rejected": 0.39965590834617615,
"step": 430
},
{
"epoch": 0.92,
"learning_rate": 9.57301420397924e-09,
"logits/chosen": -2.665576934814453,
"logits/rejected": -2.637112617492676,
"logps/chosen": -278.3708190917969,
"logps/rejected": -230.59423828125,
"loss": 242.5247,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.36680158972740173,
"rewards/margins": 0.01681143045425415,
"rewards/rejected": 0.34999018907546997,
"step": 440
},
{
"epoch": 0.94,
"learning_rate": 5.212833302556258e-09,
"logits/chosen": -2.6472713947296143,
"logits/rejected": -2.6246514320373535,
"logps/chosen": -242.03775024414062,
"logps/rejected": -210.9549560546875,
"loss": 239.1569,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.3881983160972595,
"rewards/margins": -0.0041311681270599365,
"rewards/rejected": 0.39232948422431946,
"step": 450
},
{
"epoch": 0.96,
"learning_rate": 2.158697848236607e-09,
"logits/chosen": -2.6596500873565674,
"logits/rejected": -2.6326732635498047,
"logps/chosen": -259.3081359863281,
"logps/rejected": -237.2527618408203,
"loss": 246.6495,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.4107862412929535,
"rewards/margins": 0.07695204019546509,
"rewards/rejected": 0.3338342308998108,
"step": 460
},
{
"epoch": 0.98,
"learning_rate": 4.269029751107489e-10,
"logits/chosen": -2.6772663593292236,
"logits/rejected": -2.6103363037109375,
"logps/chosen": -256.0906677246094,
"logps/rejected": -202.92539978027344,
"loss": 248.9801,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.42115721106529236,
"rewards/margins": 0.12000129371881485,
"rewards/rejected": 0.3011559247970581,
"step": 470
},
{
"epoch": 1.0,
"step": 478,
"total_flos": 0.0,
"train_loss": 243.0746703846185,
"train_runtime": 4321.456,
"train_samples_per_second": 14.147,
"train_steps_per_second": 0.111
}
],
"logging_steps": 10,
"max_steps": 478,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"trial_name": null,
"trial_params": null
}