{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -2.8099329471588135, "logits/rejected": -2.7572641372680664, "logps/chosen": -241.48843383789062, "logps/rejected": -197.4517822265625, "loss": 271.7943, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.83237361907959, "logits/rejected": -2.808957815170288, "logps/chosen": -292.6072692871094, "logps/rejected": -278.4604797363281, "loss": 286.0386, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": 0.0008353570010513067, "rewards/margins": -0.0004216647648718208, "rewards/rejected": 0.0012570219114422798, "step": 10 }, { "epoch": 0.04, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.81878662109375, "logits/rejected": -2.7905821800231934, "logps/chosen": -286.19378662109375, "logps/rejected": -286.7618103027344, "loss": 264.9378, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.04205816239118576, "rewards/margins": 0.0025776384864002466, "rewards/rejected": 0.03948052600026131, "step": 20 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.7921807765960693, "logits/rejected": -2.7616498470306396, "logps/chosen": -232.4526824951172, "logps/rejected": -212.8272705078125, "loss": 266.1199, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.13909421861171722, "rewards/margins": 0.004568194039165974, "rewards/rejected": 0.13452602922916412, "step": 30 }, { "epoch": 0.08, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.8503639698028564, "logits/rejected": -2.819370985031128, "logps/chosen": -280.4808654785156, "logps/rejected": -243.86935424804688, "loss": 258.2878, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.19040736556053162, "rewards/margins": 0.01646682806313038, "rewards/rejected": 0.1739405393600464, "step": 40 }, { "epoch": 0.1, "learning_rate": 4.999733114418725e-07, "logits/chosen": -2.764444589614868, "logits/rejected": -2.7427210807800293, "logps/chosen": -254.5093231201172, "logps/rejected": -240.6798858642578, "loss": 244.2991, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.22921113669872284, "rewards/margins": -0.0009602505015209317, "rewards/rejected": 0.23017136752605438, "step": 50 }, { "epoch": 0.13, "learning_rate": 4.990398100856366e-07, "logits/chosen": -2.7702980041503906, "logits/rejected": -2.742863893508911, "logps/chosen": -235.5380401611328, "logps/rejected": -211.11770629882812, "loss": 255.6714, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.23538246750831604, "rewards/margins": 0.020409177988767624, "rewards/rejected": 0.2149733006954193, "step": 60 }, { "epoch": 0.15, "learning_rate": 4.967775735898179e-07, "logits/chosen": -2.7509877681732178, "logits/rejected": -2.7182183265686035, "logps/chosen": -239.36514282226562, "logps/rejected": -210.6224822998047, "loss": 253.836, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.29057446122169495, "rewards/margins": 0.04796246066689491, "rewards/rejected": 0.24261197447776794, "step": 70 }, { "epoch": 0.17, "learning_rate": 4.931986719649298e-07, "logits/chosen": -2.699618339538574, "logits/rejected": -2.697359800338745, "logps/chosen": -241.5849609375, "logps/rejected": -241.43533325195312, "loss": 261.2479, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.29146844148635864, "rewards/margins": 0.030574629083275795, "rewards/rejected": 0.2608937919139862, "step": 80 }, { "epoch": 0.19, "learning_rate": 4.883222001996351e-07, "logits/chosen": -2.7679967880249023, "logits/rejected": -2.7326865196228027, "logps/chosen": -225.57156372070312, "logps/rejected": -208.76693725585938, "loss": 252.1535, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.29392915964126587, "rewards/margins": 0.03465462103486061, "rewards/rejected": 0.25927454233169556, "step": 90 }, { "epoch": 0.21, "learning_rate": 4.821741763807186e-07, "logits/chosen": -2.819113254547119, "logits/rejected": -2.783386707305908, "logps/chosen": -260.7179260253906, "logps/rejected": -238.54434204101562, "loss": 256.7688, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.28721123933792114, "rewards/margins": -0.005946027580648661, "rewards/rejected": 0.29315727949142456, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": -2.7863247394561768, "eval_logits/rejected": -2.767011880874634, "eval_logps/chosen": -227.56491088867188, "eval_logps/rejected": -230.47303771972656, "eval_loss": 245.72764587402344, "eval_rewards/accuracies": 0.57421875, "eval_rewards/chosen": 0.29474756121635437, "eval_rewards/margins": 0.025945277884602547, "eval_rewards/rejected": 0.268802285194397, "eval_runtime": 53.5253, "eval_samples_per_second": 37.365, "eval_steps_per_second": 0.598, "step": 100 }, { "epoch": 0.23, "learning_rate": 4.747874028753375e-07, "logits/chosen": -2.6783273220062256, "logits/rejected": -2.644078016281128, "logps/chosen": -237.9336395263672, "logps/rejected": -188.83895874023438, "loss": 240.9402, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.32163843512535095, "rewards/margins": -0.008066670037806034, "rewards/rejected": 0.3297051787376404, "step": 110 }, { "epoch": 0.25, "learning_rate": 4.662012913161997e-07, "logits/chosen": -2.746994733810425, "logits/rejected": -2.724551200866699, "logps/chosen": -240.14242553710938, "logps/rejected": -237.6074981689453, "loss": 245.7042, "rewards/accuracies": 0.53125, "rewards/chosen": 0.2870863080024719, "rewards/margins": 0.03465163707733154, "rewards/rejected": 0.25243470072746277, "step": 120 }, { "epoch": 0.27, "learning_rate": 4.5646165232345103e-07, "logits/chosen": -2.7405877113342285, "logits/rejected": -2.7226414680480957, "logps/chosen": -242.31460571289062, "logps/rejected": -222.0249786376953, "loss": 239.1022, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.3218019902706146, "rewards/margins": 0.03552493453025818, "rewards/rejected": 0.28627708554267883, "step": 130 }, { "epoch": 0.29, "learning_rate": 4.456204510851956e-07, "logits/chosen": -2.7363457679748535, "logits/rejected": -2.710092067718506, "logps/chosen": -248.31982421875, "logps/rejected": -242.44064331054688, "loss": 240.1779, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.32314616441726685, "rewards/margins": 0.03991778939962387, "rewards/rejected": 0.2832283675670624, "step": 140 }, { "epoch": 0.31, "learning_rate": 4.337355301007335e-07, "logits/chosen": -2.6783087253570557, "logits/rejected": -2.707118511199951, "logps/chosen": -196.87429809570312, "logps/rejected": -205.2587890625, "loss": 236.8216, "rewards/accuracies": 0.46875, "rewards/chosen": 0.2980460226535797, "rewards/margins": -0.012227327562868595, "rewards/rejected": 0.31027334928512573, "step": 150 }, { "epoch": 0.33, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -2.7062594890594482, "logits/rejected": -2.6792685985565186, "logps/chosen": -224.0284881591797, "logps/rejected": -215.8988494873047, "loss": 233.851, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.31943365931510925, "rewards/margins": 0.01964866928756237, "rewards/rejected": 0.29978498816490173, "step": 160 }, { "epoch": 0.36, "learning_rate": 4.070934040463998e-07, "logits/chosen": -2.6613574028015137, "logits/rejected": -2.6377127170562744, "logps/chosen": -249.140869140625, "logps/rejected": -229.2135009765625, "loss": 238.7709, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.3302595019340515, "rewards/margins": 0.02071164920926094, "rewards/rejected": 0.30954790115356445, "step": 170 }, { "epoch": 0.38, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -2.697937488555908, "logits/rejected": -2.6982693672180176, "logps/chosen": -237.61752319335938, "logps/rejected": -222.94534301757812, "loss": 240.8094, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.3156043589115143, "rewards/margins": -0.00877897534519434, "rewards/rejected": 0.32438334822654724, "step": 180 }, { "epoch": 0.4, "learning_rate": 3.7710310482256523e-07, "logits/chosen": -2.716660499572754, "logits/rejected": -2.6782338619232178, "logps/chosen": -256.08953857421875, "logps/rejected": -233.2887725830078, "loss": 241.1514, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.35306206345558167, "rewards/margins": 0.0451970100402832, "rewards/rejected": 0.3078650236129761, "step": 190 }, { "epoch": 0.42, "learning_rate": 3.610497133404795e-07, "logits/chosen": -2.6975326538085938, "logits/rejected": -2.715744733810425, "logps/chosen": -260.04095458984375, "logps/rejected": -232.90744018554688, "loss": 239.4804, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.3612653315067291, "rewards/margins": 0.047182150185108185, "rewards/rejected": 0.31408315896987915, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": -2.7289865016937256, "eval_logits/rejected": -2.7082109451293945, "eval_logps/chosen": -223.84622192382812, "eval_logps/rejected": -226.99331665039062, "eval_loss": 241.5078125, "eval_rewards/accuracies": 0.57421875, "eval_rewards/chosen": 0.33193421363830566, "eval_rewards/margins": 0.028334595263004303, "eval_rewards/rejected": 0.30359959602355957, "eval_runtime": 53.4363, "eval_samples_per_second": 37.428, "eval_steps_per_second": 0.599, "step": 200 }, { "epoch": 0.44, "learning_rate": 3.4440382358952115e-07, "logits/chosen": -2.653046131134033, "logits/rejected": -2.639833927154541, "logps/chosen": -236.7248992919922, "logps/rejected": -215.8306427001953, "loss": 232.2403, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.3630830645561218, "rewards/margins": 0.002032138407230377, "rewards/rejected": 0.36105093359947205, "step": 210 }, { "epoch": 0.46, "learning_rate": 3.272542485937368e-07, "logits/chosen": -2.6577677726745605, "logits/rejected": -2.610652446746826, "logps/chosen": -234.62515258789062, "logps/rejected": -224.5327911376953, "loss": 236.0736, "rewards/accuracies": 0.5625, "rewards/chosen": 0.403405100107193, "rewards/margins": 0.025665929540991783, "rewards/rejected": 0.3777391314506531, "step": 220 }, { "epoch": 0.48, "learning_rate": 3.096924887558854e-07, "logits/chosen": -2.7138988971710205, "logits/rejected": -2.6695990562438965, "logps/chosen": -238.20166015625, "logps/rejected": -238.32510375976562, "loss": 243.5431, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.36346226930618286, "rewards/margins": 0.04655776172876358, "rewards/rejected": 0.31690454483032227, "step": 230 }, { "epoch": 0.5, "learning_rate": 2.9181224366319943e-07, "logits/chosen": -2.6672616004943848, "logits/rejected": -2.6338298320770264, "logps/chosen": -245.7289581298828, "logps/rejected": -222.80068969726562, "loss": 241.8247, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.4044111371040344, "rewards/margins": 0.09945273399353027, "rewards/rejected": 0.30495840311050415, "step": 240 }, { "epoch": 0.52, "learning_rate": 2.7370891215954565e-07, "logits/chosen": -2.711887836456299, "logits/rejected": -2.660736083984375, "logps/chosen": -236.48410034179688, "logps/rejected": -220.5592803955078, "loss": 232.2589, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.3508817255496979, "rewards/margins": 0.012243595905601978, "rewards/rejected": 0.3386381268501282, "step": 250 }, { "epoch": 0.54, "learning_rate": 2.55479083351317e-07, "logits/chosen": -2.6873819828033447, "logits/rejected": -2.665743827819824, "logps/chosen": -226.0983428955078, "logps/rejected": -222.646484375, "loss": 249.1208, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.33049115538597107, "rewards/margins": -0.025500113144516945, "rewards/rejected": 0.35599130392074585, "step": 260 }, { "epoch": 0.56, "learning_rate": 2.3722002126275822e-07, "logits/chosen": -2.684255599975586, "logits/rejected": -2.6710708141326904, "logps/chosen": -250.91659545898438, "logps/rejected": -232.743896484375, "loss": 232.0498, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3753630220890045, "rewards/margins": 0.05215846374630928, "rewards/rejected": 0.32320457696914673, "step": 270 }, { "epoch": 0.59, "learning_rate": 2.19029145890313e-07, "logits/chosen": -2.642585515975952, "logits/rejected": -2.6435678005218506, "logps/chosen": -271.9496154785156, "logps/rejected": -232.7229461669922, "loss": 242.3832, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3579340875148773, "rewards/margins": 0.029489517211914062, "rewards/rejected": 0.32844457030296326, "step": 280 }, { "epoch": 0.61, "learning_rate": 2.0100351342479216e-07, "logits/chosen": -2.6854400634765625, "logits/rejected": -2.6508984565734863, "logps/chosen": -238.96533203125, "logps/rejected": -214.7186737060547, "loss": 237.8058, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.38798388838768005, "rewards/margins": 0.002142349723726511, "rewards/rejected": 0.3858415484428406, "step": 290 }, { "epoch": 0.63, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -2.6979777812957764, "logits/rejected": -2.6885297298431396, "logps/chosen": -218.0729522705078, "logps/rejected": -206.6052703857422, "loss": 240.5041, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.3353363871574402, "rewards/margins": -0.021952930837869644, "rewards/rejected": 0.35728925466537476, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": -2.7041783332824707, "eval_logits/rejected": -2.680725574493408, "eval_logps/chosen": -222.30435180664062, "eval_logps/rejected": -225.71856689453125, "eval_loss": 239.8052978515625, "eval_rewards/accuracies": 0.5625, "eval_rewards/chosen": 0.3473527431488037, "eval_rewards/margins": 0.03100587986409664, "eval_rewards/rejected": 0.3163468837738037, "eval_runtime": 53.5009, "eval_samples_per_second": 37.383, "eval_steps_per_second": 0.598, "step": 300 }, { "epoch": 0.65, "learning_rate": 1.6583128063291573e-07, "logits/chosen": -2.681898832321167, "logits/rejected": -2.647827625274658, "logps/chosen": -249.84933471679688, "logps/rejected": -221.06466674804688, "loss": 244.5816, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.37856870889663696, "rewards/margins": 0.031483568251132965, "rewards/rejected": 0.347085177898407, "step": 310 }, { "epoch": 0.67, "learning_rate": 1.488723393865766e-07, "logits/chosen": -2.6419777870178223, "logits/rejected": -2.6089978218078613, "logps/chosen": -247.5414581298828, "logps/rejected": -228.00442504882812, "loss": 227.1146, "rewards/accuracies": 0.5625, "rewards/chosen": 0.38319897651672363, "rewards/margins": 0.010034086182713509, "rewards/rejected": 0.3731648921966553, "step": 320 }, { "epoch": 0.69, "learning_rate": 1.3245295796480788e-07, "logits/chosen": -2.6546921730041504, "logits/rejected": -2.619576930999756, "logps/chosen": -235.1280059814453, "logps/rejected": -228.36550903320312, "loss": 235.3102, "rewards/accuracies": 0.59375, "rewards/chosen": 0.3890138268470764, "rewards/margins": 0.030328262597322464, "rewards/rejected": 0.35868555307388306, "step": 330 }, { "epoch": 0.71, "learning_rate": 1.1666074087171627e-07, "logits/chosen": -2.6908440589904785, "logits/rejected": -2.6842880249023438, "logps/chosen": -221.61752319335938, "logps/rejected": -221.1806182861328, "loss": 226.7886, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.3297863006591797, "rewards/margins": 0.012335492298007011, "rewards/rejected": 0.3174508213996887, "step": 340 }, { "epoch": 0.73, "learning_rate": 1.0157994641835734e-07, "logits/chosen": -2.6660337448120117, "logits/rejected": -2.6440846920013428, "logps/chosen": -235.5915069580078, "logps/rejected": -227.4123077392578, "loss": 235.1443, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.4086208939552307, "rewards/margins": 0.052063293755054474, "rewards/rejected": 0.35655760765075684, "step": 350 }, { "epoch": 0.75, "learning_rate": 8.729103716819111e-08, "logits/chosen": -2.631333112716675, "logits/rejected": -2.605966806411743, "logps/chosen": -223.34115600585938, "logps/rejected": -207.91500854492188, "loss": 246.5647, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.35043418407440186, "rewards/margins": -0.0040539586916565895, "rewards/rejected": 0.35448816418647766, "step": 360 }, { "epoch": 0.77, "learning_rate": 7.387025063449081e-08, "logits/chosen": -2.667217254638672, "logits/rejected": -2.6636977195739746, "logps/chosen": -240.44247436523438, "logps/rejected": -231.893798828125, "loss": 236.9863, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.37617605924606323, "rewards/margins": 0.05858853459358215, "rewards/rejected": 0.3175875246524811, "step": 370 }, { "epoch": 0.79, "learning_rate": 6.138919252022435e-08, "logits/chosen": -2.6146702766418457, "logits/rejected": -2.611351728439331, "logps/chosen": -189.4939422607422, "logps/rejected": -206.30081176757812, "loss": 233.4487, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.3334718644618988, "rewards/margins": -0.03314907103776932, "rewards/rejected": 0.3666209578514099, "step": 380 }, { "epoch": 0.82, "learning_rate": 4.991445467064689e-08, "logits/chosen": -2.663173198699951, "logits/rejected": -2.634059429168701, "logps/chosen": -221.12240600585938, "logps/rejected": -204.8622589111328, "loss": 241.2531, "rewards/accuracies": 0.53125, "rewards/chosen": 0.30437472462654114, "rewards/margins": -0.002047918038442731, "rewards/rejected": 0.3064226508140564, "step": 390 }, { "epoch": 0.84, "learning_rate": 3.9507259776993954e-08, "logits/chosen": -2.646238088607788, "logits/rejected": -2.6048245429992676, "logps/chosen": -250.01693725585938, "logps/rejected": -215.4422607421875, "loss": 236.8453, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.35686779022216797, "rewards/margins": 0.03598792105913162, "rewards/rejected": 0.3208799362182617, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": -2.697472333908081, "eval_logits/rejected": -2.673011541366577, "eval_logps/chosen": -221.73487854003906, "eval_logps/rejected": -225.18995666503906, "eval_loss": 239.1992950439453, "eval_rewards/accuracies": 0.5625, "eval_rewards/chosen": 0.3530477285385132, "eval_rewards/margins": 0.031414665281772614, "eval_rewards/rejected": 0.3216330409049988, "eval_runtime": 53.4218, "eval_samples_per_second": 37.438, "eval_steps_per_second": 0.599, "step": 400 }, { "epoch": 0.86, "learning_rate": 3.022313472693447e-08, "logits/chosen": -2.684819459915161, "logits/rejected": -2.622917652130127, "logps/chosen": -226.3255157470703, "logps/rejected": -246.3280487060547, "loss": 234.8678, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.3726418912410736, "rewards/margins": 0.02128712832927704, "rewards/rejected": 0.3513546884059906, "step": 410 }, { "epoch": 0.88, "learning_rate": 2.2111614344599684e-08, "logits/chosen": -2.6737873554229736, "logits/rejected": -2.638714551925659, "logps/chosen": -250.0021514892578, "logps/rejected": -255.8992156982422, "loss": 246.8406, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.3681090474128723, "rewards/margins": 0.0030112355016171932, "rewards/rejected": 0.3650978207588196, "step": 420 }, { "epoch": 0.9, "learning_rate": 1.521597710086439e-08, "logits/chosen": -2.6430375576019287, "logits/rejected": -2.6089277267456055, "logps/chosen": -255.19985961914062, "logps/rejected": -234.4241485595703, "loss": 234.6739, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.3505329489707947, "rewards/margins": -0.049122948199510574, "rewards/rejected": 0.39965590834617615, "step": 430 }, { "epoch": 0.92, "learning_rate": 9.57301420397924e-09, "logits/chosen": -2.665576934814453, "logits/rejected": -2.637112617492676, "logps/chosen": -278.3708190917969, "logps/rejected": -230.59423828125, "loss": 242.5247, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.36680158972740173, "rewards/margins": 0.01681143045425415, "rewards/rejected": 0.34999018907546997, "step": 440 }, { "epoch": 0.94, "learning_rate": 5.212833302556258e-09, "logits/chosen": -2.6472713947296143, "logits/rejected": -2.6246514320373535, "logps/chosen": -242.03775024414062, "logps/rejected": -210.9549560546875, "loss": 239.1569, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3881983160972595, "rewards/margins": -0.0041311681270599365, "rewards/rejected": 0.39232948422431946, "step": 450 }, { "epoch": 0.96, "learning_rate": 2.158697848236607e-09, "logits/chosen": -2.6596500873565674, "logits/rejected": -2.6326732635498047, "logps/chosen": -259.3081359863281, "logps/rejected": -237.2527618408203, "loss": 246.6495, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.4107862412929535, "rewards/margins": 0.07695204019546509, "rewards/rejected": 0.3338342308998108, "step": 460 }, { "epoch": 0.98, "learning_rate": 4.269029751107489e-10, "logits/chosen": -2.6772663593292236, "logits/rejected": -2.6103363037109375, "logps/chosen": -256.0906677246094, "logps/rejected": -202.92539978027344, "loss": 248.9801, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.42115721106529236, "rewards/margins": 0.12000129371881485, "rewards/rejected": 0.3011559247970581, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 243.0746703846185, "train_runtime": 4321.456, "train_samples_per_second": 14.147, "train_steps_per_second": 0.111 } ], "logging_steps": 10, "max_steps": 478, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "trial_name": null, "trial_params": null }