zephyr-7b-dpo-full / trainer_state.json
wzhouad's picture
Model save
8ca0574 verified
raw
history blame
No virus
25.9 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1.0416666666666666e-08,
"logits/chosen": -2.7386245727539062,
"logits/rejected": -2.7273669242858887,
"logps/chosen": -262.8376159667969,
"logps/rejected": -255.88758850097656,
"loss": 0.1038,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -2.7419047355651855,
"logits/rejected": -2.7360031604766846,
"logps/chosen": -305.9395446777344,
"logps/rejected": -270.57177734375,
"loss": 0.1063,
"rewards/accuracies": 0.5138888955116272,
"rewards/chosen": 0.0002741153002716601,
"rewards/margins": 0.0006307306466624141,
"rewards/rejected": -0.00035661537549458444,
"step": 10
},
{
"epoch": 0.04,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -2.7987372875213623,
"logits/rejected": -2.779291868209839,
"logps/chosen": -296.0432434082031,
"logps/rejected": -258.17041015625,
"loss": 0.1055,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 2.8045265935361385e-05,
"rewards/margins": 0.0010506389662623405,
"rewards/rejected": -0.001022593816742301,
"step": 20
},
{
"epoch": 0.06,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.83036470413208,
"logits/rejected": -2.802358627319336,
"logps/chosen": -300.7704162597656,
"logps/rejected": -259.5246276855469,
"loss": 0.1044,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.0002994390088133514,
"rewards/margins": 0.0064557394944131374,
"rewards/rejected": -0.006755178328603506,
"step": 30
},
{
"epoch": 0.08,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -2.7836341857910156,
"logits/rejected": -2.757286310195923,
"logps/chosen": -257.14385986328125,
"logps/rejected": -248.82925415039062,
"loss": 0.0977,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.001975560560822487,
"rewards/margins": 0.018592100590467453,
"rewards/rejected": -0.02056765928864479,
"step": 40
},
{
"epoch": 0.1,
"learning_rate": 4.999733114418725e-07,
"logits/chosen": -2.773463726043701,
"logits/rejected": -2.75862979888916,
"logps/chosen": -255.62783813476562,
"logps/rejected": -247.96707153320312,
"loss": 0.092,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.023576391860842705,
"rewards/margins": 0.053500402718782425,
"rewards/rejected": -0.07707679271697998,
"step": 50
},
{
"epoch": 0.13,
"learning_rate": 4.990398100856366e-07,
"logits/chosen": -2.7168498039245605,
"logits/rejected": -2.684145450592041,
"logps/chosen": -265.1424255371094,
"logps/rejected": -254.54867553710938,
"loss": 0.0899,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.08215885609388351,
"rewards/margins": 0.07760664075613022,
"rewards/rejected": -0.15976549685001373,
"step": 60
},
{
"epoch": 0.15,
"learning_rate": 4.967775735898179e-07,
"logits/chosen": -2.741403341293335,
"logits/rejected": -2.7200100421905518,
"logps/chosen": -289.6435852050781,
"logps/rejected": -287.03662109375,
"loss": 0.0779,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.1611556112766266,
"rewards/margins": 0.1475805938243866,
"rewards/rejected": -0.3087361752986908,
"step": 70
},
{
"epoch": 0.17,
"learning_rate": 4.931986719649298e-07,
"logits/chosen": -2.7668230533599854,
"logits/rejected": -2.7418100833892822,
"logps/chosen": -306.257568359375,
"logps/rejected": -304.8079833984375,
"loss": 0.0713,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.28854140639305115,
"rewards/margins": 0.16743852198123932,
"rewards/rejected": -0.45597997307777405,
"step": 80
},
{
"epoch": 0.19,
"learning_rate": 4.883222001996351e-07,
"logits/chosen": -2.7547733783721924,
"logits/rejected": -2.7255868911743164,
"logps/chosen": -280.0272216796875,
"logps/rejected": -278.74127197265625,
"loss": 0.0594,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3458347022533417,
"rewards/margins": 0.24216556549072266,
"rewards/rejected": -0.5880002975463867,
"step": 90
},
{
"epoch": 0.21,
"learning_rate": 4.821741763807186e-07,
"logits/chosen": -2.6905529499053955,
"logits/rejected": -2.6901133060455322,
"logps/chosen": -340.79461669921875,
"logps/rejected": -338.3218688964844,
"loss": 0.0486,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6042592525482178,
"rewards/margins": 0.2673302888870239,
"rewards/rejected": -0.8715896606445312,
"step": 100
},
{
"epoch": 0.21,
"eval_logits/chosen": -2.732572317123413,
"eval_logits/rejected": -2.717289686203003,
"eval_logps/chosen": -324.24517822265625,
"eval_logps/rejected": -360.8448791503906,
"eval_loss": 0.04314277693629265,
"eval_rewards/accuracies": 0.6875,
"eval_rewards/chosen": -0.6720553636550903,
"eval_rewards/margins": 0.3628607988357544,
"eval_rewards/rejected": -1.0349161624908447,
"eval_runtime": 53.2647,
"eval_samples_per_second": 37.548,
"eval_steps_per_second": 0.601,
"step": 100
},
{
"epoch": 0.23,
"learning_rate": 4.747874028753375e-07,
"logits/chosen": -2.720991373062134,
"logits/rejected": -2.688431978225708,
"logps/chosen": -372.272705078125,
"logps/rejected": -368.93316650390625,
"loss": 0.0429,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.7657500505447388,
"rewards/margins": 0.34390324354171753,
"rewards/rejected": -1.109653353691101,
"step": 110
},
{
"epoch": 0.25,
"learning_rate": 4.662012913161997e-07,
"logits/chosen": -2.6478002071380615,
"logits/rejected": -2.6571507453918457,
"logps/chosen": -302.8161315917969,
"logps/rejected": -345.29022216796875,
"loss": 0.0369,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7161726951599121,
"rewards/margins": 0.4097130298614502,
"rewards/rejected": -1.1258857250213623,
"step": 120
},
{
"epoch": 0.27,
"learning_rate": 4.5646165232345103e-07,
"logits/chosen": -2.6481852531433105,
"logits/rejected": -2.6323132514953613,
"logps/chosen": -315.4286804199219,
"logps/rejected": -349.3882751464844,
"loss": 0.0366,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7872930765151978,
"rewards/margins": 0.33242180943489075,
"rewards/rejected": -1.1197148561477661,
"step": 130
},
{
"epoch": 0.29,
"learning_rate": 4.456204510851956e-07,
"logits/chosen": -2.6520018577575684,
"logits/rejected": -2.6314806938171387,
"logps/chosen": -349.292236328125,
"logps/rejected": -358.19696044921875,
"loss": 0.0328,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.8543764352798462,
"rewards/margins": 0.41757732629776,
"rewards/rejected": -1.271953821182251,
"step": 140
},
{
"epoch": 0.31,
"learning_rate": 4.337355301007335e-07,
"logits/chosen": -2.5678088665008545,
"logits/rejected": -2.561540126800537,
"logps/chosen": -360.6986999511719,
"logps/rejected": -373.29876708984375,
"loss": 0.0346,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.9629270434379578,
"rewards/margins": 0.4645315110683441,
"rewards/rejected": -1.4274585247039795,
"step": 150
},
{
"epoch": 0.33,
"learning_rate": 4.2087030056579986e-07,
"logits/chosen": -2.510655641555786,
"logits/rejected": -2.5043094158172607,
"logps/chosen": -390.89556884765625,
"logps/rejected": -418.712646484375,
"loss": 0.0284,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.16335129737854,
"rewards/margins": 0.5290186405181885,
"rewards/rejected": -1.692370057106018,
"step": 160
},
{
"epoch": 0.36,
"learning_rate": 4.070934040463998e-07,
"logits/chosen": -2.5995492935180664,
"logits/rejected": -2.5737595558166504,
"logps/chosen": -412.41259765625,
"logps/rejected": -393.64605712890625,
"loss": 0.0321,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.8894079327583313,
"rewards/margins": 0.48425012826919556,
"rewards/rejected": -1.3736579418182373,
"step": 170
},
{
"epoch": 0.38,
"learning_rate": 3.9247834624635404e-07,
"logits/chosen": -2.5762248039245605,
"logits/rejected": -2.5727803707122803,
"logps/chosen": -357.29132080078125,
"logps/rejected": -409.08453369140625,
"loss": 0.0305,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.9876778721809387,
"rewards/margins": 0.4671412408351898,
"rewards/rejected": -1.4548190832138062,
"step": 180
},
{
"epoch": 0.4,
"learning_rate": 3.7710310482256523e-07,
"logits/chosen": -2.4623260498046875,
"logits/rejected": -2.4131171703338623,
"logps/chosen": -391.2403564453125,
"logps/rejected": -413.74554443359375,
"loss": 0.0281,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.1796742677688599,
"rewards/margins": 0.40752944350242615,
"rewards/rejected": -1.5872037410736084,
"step": 190
},
{
"epoch": 0.42,
"learning_rate": 3.610497133404795e-07,
"logits/chosen": -2.451063394546509,
"logits/rejected": -2.4568967819213867,
"logps/chosen": -348.5596618652344,
"logps/rejected": -400.4520263671875,
"loss": 0.027,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.1439893245697021,
"rewards/margins": 0.4998703896999359,
"rewards/rejected": -1.64385986328125,
"step": 200
},
{
"epoch": 0.42,
"eval_logits/chosen": -2.5091135501861572,
"eval_logits/rejected": -2.4957656860351562,
"eval_logps/chosen": -366.6208190917969,
"eval_logps/rejected": -425.7963562011719,
"eval_loss": 0.02966611087322235,
"eval_rewards/accuracies": 0.6953125,
"eval_rewards/chosen": -1.0958118438720703,
"eval_rewards/margins": 0.5886186957359314,
"eval_rewards/rejected": -1.6844305992126465,
"eval_runtime": 53.1867,
"eval_samples_per_second": 37.603,
"eval_steps_per_second": 0.602,
"step": 200
},
{
"epoch": 0.44,
"learning_rate": 3.4440382358952115e-07,
"logits/chosen": -2.4863812923431396,
"logits/rejected": -2.407597780227661,
"logps/chosen": -417.18634033203125,
"logps/rejected": -401.4176025390625,
"loss": 0.0316,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.203452467918396,
"rewards/margins": 0.41743287444114685,
"rewards/rejected": -1.6208854913711548,
"step": 210
},
{
"epoch": 0.46,
"learning_rate": 3.272542485937368e-07,
"logits/chosen": -2.5017166137695312,
"logits/rejected": -2.444180965423584,
"logps/chosen": -391.02996826171875,
"logps/rejected": -405.67987060546875,
"loss": 0.029,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.1130907535552979,
"rewards/margins": 0.5619007349014282,
"rewards/rejected": -1.6749913692474365,
"step": 220
},
{
"epoch": 0.48,
"learning_rate": 3.096924887558854e-07,
"logits/chosen": -2.5100924968719482,
"logits/rejected": -2.4785008430480957,
"logps/chosen": -392.33062744140625,
"logps/rejected": -428.2881774902344,
"loss": 0.029,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0476332902908325,
"rewards/margins": 0.6075866222381592,
"rewards/rejected": -1.6552197933197021,
"step": 230
},
{
"epoch": 0.5,
"learning_rate": 2.9181224366319943e-07,
"logits/chosen": -2.4946725368499756,
"logits/rejected": -2.490581512451172,
"logps/chosen": -392.0195007324219,
"logps/rejected": -388.649169921875,
"loss": 0.0315,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.875754177570343,
"rewards/margins": 0.5238613486289978,
"rewards/rejected": -1.3996155261993408,
"step": 240
},
{
"epoch": 0.52,
"learning_rate": 2.7370891215954565e-07,
"logits/chosen": -2.431591272354126,
"logits/rejected": -2.410667896270752,
"logps/chosen": -353.4183654785156,
"logps/rejected": -399.400146484375,
"loss": 0.0286,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0008418560028076,
"rewards/margins": 0.5946453809738159,
"rewards/rejected": -1.595487356185913,
"step": 250
},
{
"epoch": 0.54,
"learning_rate": 2.55479083351317e-07,
"logits/chosen": -2.4323439598083496,
"logits/rejected": -2.4124226570129395,
"logps/chosen": -417.405029296875,
"logps/rejected": -445.43707275390625,
"loss": 0.0298,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.1992970705032349,
"rewards/margins": 0.47717300057411194,
"rewards/rejected": -1.676470160484314,
"step": 260
},
{
"epoch": 0.56,
"learning_rate": 2.3722002126275822e-07,
"logits/chosen": -2.475048542022705,
"logits/rejected": -2.4506657123565674,
"logps/chosen": -392.7640075683594,
"logps/rejected": -430.6897888183594,
"loss": 0.0253,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.3913904428482056,
"rewards/margins": 0.3153776526451111,
"rewards/rejected": -1.7067680358886719,
"step": 270
},
{
"epoch": 0.59,
"learning_rate": 2.19029145890313e-07,
"logits/chosen": -2.4361844062805176,
"logits/rejected": -2.4216103553771973,
"logps/chosen": -388.96063232421875,
"logps/rejected": -430.0042419433594,
"loss": 0.0261,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2630523443222046,
"rewards/margins": 0.5633870363235474,
"rewards/rejected": -1.8264392614364624,
"step": 280
},
{
"epoch": 0.61,
"learning_rate": 2.0100351342479216e-07,
"logits/chosen": -2.500619411468506,
"logits/rejected": -2.46304988861084,
"logps/chosen": -409.1497497558594,
"logps/rejected": -441.10198974609375,
"loss": 0.0304,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0858951807022095,
"rewards/margins": 0.46196287870407104,
"rewards/rejected": -1.5478579998016357,
"step": 290
},
{
"epoch": 0.63,
"learning_rate": 1.8323929841460178e-07,
"logits/chosen": -2.4326109886169434,
"logits/rejected": -2.4290225505828857,
"logps/chosen": -392.1640930175781,
"logps/rejected": -414.16351318359375,
"loss": 0.0267,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.1948238611221313,
"rewards/margins": 0.47188109159469604,
"rewards/rejected": -1.6667048931121826,
"step": 300
},
{
"epoch": 0.63,
"eval_logits/chosen": -2.471911668777466,
"eval_logits/rejected": -2.458660840988159,
"eval_logps/chosen": -367.22003173828125,
"eval_logps/rejected": -428.8892517089844,
"eval_loss": 0.02865579165518284,
"eval_rewards/accuracies": 0.734375,
"eval_rewards/chosen": -1.1018041372299194,
"eval_rewards/margins": 0.6135556101799011,
"eval_rewards/rejected": -1.7153598070144653,
"eval_runtime": 53.2357,
"eval_samples_per_second": 37.569,
"eval_steps_per_second": 0.601,
"step": 300
},
{
"epoch": 0.65,
"learning_rate": 1.6583128063291573e-07,
"logits/chosen": -2.4415957927703857,
"logits/rejected": -2.4284932613372803,
"logps/chosen": -407.1797790527344,
"logps/rejected": -432.66436767578125,
"loss": 0.0263,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1042468547821045,
"rewards/margins": 0.6383775472640991,
"rewards/rejected": -1.7426245212554932,
"step": 310
},
{
"epoch": 0.67,
"learning_rate": 1.488723393865766e-07,
"logits/chosen": -2.4184253215789795,
"logits/rejected": -2.3910305500030518,
"logps/chosen": -378.49432373046875,
"logps/rejected": -415.8314514160156,
"loss": 0.0269,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.379817247390747,
"rewards/margins": 0.4037790894508362,
"rewards/rejected": -1.783596396446228,
"step": 320
},
{
"epoch": 0.69,
"learning_rate": 1.3245295796480788e-07,
"logits/chosen": -2.390575885772705,
"logits/rejected": -2.3807907104492188,
"logps/chosen": -402.01800537109375,
"logps/rejected": -452.57598876953125,
"loss": 0.0257,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.1732884645462036,
"rewards/margins": 0.4634523391723633,
"rewards/rejected": -1.6367409229278564,
"step": 330
},
{
"epoch": 0.71,
"learning_rate": 1.1666074087171627e-07,
"logits/chosen": -2.4380133152008057,
"logits/rejected": -2.4201126098632812,
"logps/chosen": -409.64093017578125,
"logps/rejected": -423.3724670410156,
"loss": 0.0238,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.243157982826233,
"rewards/margins": 0.5037888288497925,
"rewards/rejected": -1.7469466924667358,
"step": 340
},
{
"epoch": 0.73,
"learning_rate": 1.0157994641835734e-07,
"logits/chosen": -2.4198803901672363,
"logits/rejected": -2.369533061981201,
"logps/chosen": -408.26910400390625,
"logps/rejected": -487.27435302734375,
"loss": 0.0235,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.5223808288574219,
"rewards/margins": 0.7387748956680298,
"rewards/rejected": -2.261155843734741,
"step": 350
},
{
"epoch": 0.75,
"learning_rate": 8.729103716819111e-08,
"logits/chosen": -2.3843138217926025,
"logits/rejected": -2.3418803215026855,
"logps/chosen": -431.44854736328125,
"logps/rejected": -472.52813720703125,
"loss": 0.0225,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4586023092269897,
"rewards/margins": 0.6385560035705566,
"rewards/rejected": -2.097158432006836,
"step": 360
},
{
"epoch": 0.77,
"learning_rate": 7.387025063449081e-08,
"logits/chosen": -2.4029147624969482,
"logits/rejected": -2.3779168128967285,
"logps/chosen": -383.9462890625,
"logps/rejected": -458.59735107421875,
"loss": 0.0216,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.391872763633728,
"rewards/margins": 0.7170418500900269,
"rewards/rejected": -2.108914375305176,
"step": 370
},
{
"epoch": 0.79,
"learning_rate": 6.138919252022435e-08,
"logits/chosen": -2.3961706161499023,
"logits/rejected": -2.36126708984375,
"logps/chosen": -409.074951171875,
"logps/rejected": -450.57452392578125,
"loss": 0.0207,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.4532839059829712,
"rewards/margins": 0.6719815731048584,
"rewards/rejected": -2.125265598297119,
"step": 380
},
{
"epoch": 0.82,
"learning_rate": 4.991445467064689e-08,
"logits/chosen": -2.337531328201294,
"logits/rejected": -2.3046772480010986,
"logps/chosen": -442.0828552246094,
"logps/rejected": -477.5122985839844,
"loss": 0.0231,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.542133092880249,
"rewards/margins": 0.5396715402603149,
"rewards/rejected": -2.0818047523498535,
"step": 390
},
{
"epoch": 0.84,
"learning_rate": 3.9507259776993954e-08,
"logits/chosen": -2.4024269580841064,
"logits/rejected": -2.3905534744262695,
"logps/chosen": -429.47369384765625,
"logps/rejected": -499.0704650878906,
"loss": 0.0208,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.6260488033294678,
"rewards/margins": 0.7209797501564026,
"rewards/rejected": -2.3470287322998047,
"step": 400
},
{
"epoch": 0.84,
"eval_logits/chosen": -2.407222270965576,
"eval_logits/rejected": -2.3938333988189697,
"eval_logps/chosen": -404.4031677246094,
"eval_logps/rejected": -479.36236572265625,
"eval_loss": 0.022896816954016685,
"eval_rewards/accuracies": 0.71484375,
"eval_rewards/chosen": -1.4736356735229492,
"eval_rewards/margins": 0.746455192565918,
"eval_rewards/rejected": -2.220090866088867,
"eval_runtime": 53.1792,
"eval_samples_per_second": 37.609,
"eval_steps_per_second": 0.602,
"step": 400
},
{
"epoch": 0.86,
"learning_rate": 3.022313472693447e-08,
"logits/chosen": -2.379589796066284,
"logits/rejected": -2.371739387512207,
"logps/chosen": -441.45648193359375,
"logps/rejected": -446.16265869140625,
"loss": 0.0207,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.5874555110931396,
"rewards/margins": 0.516998291015625,
"rewards/rejected": -2.1044538021087646,
"step": 410
},
{
"epoch": 0.88,
"learning_rate": 2.2111614344599684e-08,
"logits/chosen": -2.4584357738494873,
"logits/rejected": -2.4399354457855225,
"logps/chosen": -441.65179443359375,
"logps/rejected": -459.64208984375,
"loss": 0.0232,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.469987392425537,
"rewards/margins": 0.5242463946342468,
"rewards/rejected": -1.9942338466644287,
"step": 420
},
{
"epoch": 0.9,
"learning_rate": 1.521597710086439e-08,
"logits/chosen": -2.4622554779052734,
"logits/rejected": -2.4207379817962646,
"logps/chosen": -428.8905334472656,
"logps/rejected": -432.05108642578125,
"loss": 0.021,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.411439299583435,
"rewards/margins": 0.6273307800292969,
"rewards/rejected": -2.0387701988220215,
"step": 430
},
{
"epoch": 0.92,
"learning_rate": 9.57301420397924e-09,
"logits/chosen": -2.443328857421875,
"logits/rejected": -2.4111621379852295,
"logps/chosen": -445.6102600097656,
"logps/rejected": -446.654052734375,
"loss": 0.0227,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4215877056121826,
"rewards/margins": 0.5511332154273987,
"rewards/rejected": -1.972720742225647,
"step": 440
},
{
"epoch": 0.94,
"learning_rate": 5.212833302556258e-09,
"logits/chosen": -2.457451581954956,
"logits/rejected": -2.4286131858825684,
"logps/chosen": -411.8194885253906,
"logps/rejected": -525.404296875,
"loss": 0.0243,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.3488930463790894,
"rewards/margins": 0.9843934774398804,
"rewards/rejected": -2.333286762237549,
"step": 450
},
{
"epoch": 0.96,
"learning_rate": 2.158697848236607e-09,
"logits/chosen": -2.392413377761841,
"logits/rejected": -2.3623125553131104,
"logps/chosen": -452.897216796875,
"logps/rejected": -460.3353576660156,
"loss": 0.0226,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4256120920181274,
"rewards/margins": 0.5736899375915527,
"rewards/rejected": -1.9993021488189697,
"step": 460
},
{
"epoch": 0.98,
"learning_rate": 4.269029751107489e-10,
"logits/chosen": -2.4287571907043457,
"logits/rejected": -2.387329578399658,
"logps/chosen": -451.8321228027344,
"logps/rejected": -473.54388427734375,
"loss": 0.0243,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.4879666566848755,
"rewards/margins": 0.6231549978256226,
"rewards/rejected": -2.111121654510498,
"step": 470
},
{
"epoch": 1.0,
"step": 478,
"total_flos": 0.0,
"train_loss": 0.03937680171373998,
"train_runtime": 4352.8265,
"train_samples_per_second": 14.045,
"train_steps_per_second": 0.11
}
],
"logging_steps": 10,
"max_steps": 478,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"trial_name": null,
"trial_params": null
}