{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9976019184652278, "eval_steps": 500, "global_step": 208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.3809523809523811e-07, "logits/chosen": 0.15803536772727966, "logits/rejected": 0.08697354793548584, "logps/chosen": -431.6365661621094, "logps/rejected": -312.2266845703125, "loss": 0.3497, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "learning_rate": 2.380952380952381e-06, "logits/chosen": 0.08879230171442032, "logits/rejected": 0.23703241348266602, "logps/chosen": -334.3096008300781, "logps/rejected": -325.03387451171875, "loss": 0.3916, "rewards/accuracies": 0.4513888955116272, "rewards/chosen": 0.011717023327946663, "rewards/margins": 0.0023966077715158463, "rewards/rejected": 0.009320415556430817, "step": 10 }, { "epoch": 0.1, "learning_rate": 4.761904761904762e-06, "logits/chosen": 0.10182257741689682, "logits/rejected": 0.21816711127758026, "logps/chosen": -337.0960388183594, "logps/rejected": -311.6546936035156, "loss": 0.3814, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.020739946514368057, "rewards/margins": 0.02319016307592392, "rewards/rejected": -0.04393010586500168, "step": 20 }, { "epoch": 0.14, "learning_rate": 4.97147773390341e-06, "logits/chosen": 0.10252387821674347, "logits/rejected": 0.20911017060279846, "logps/chosen": -333.58074951171875, "logps/rejected": -314.22686767578125, "loss": 0.3406, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.03605792671442032, "rewards/margins": 0.06152229756116867, "rewards/rejected": -0.0254643764346838, "step": 30 }, { "epoch": 0.19, "learning_rate": 4.873717504456219e-06, "logits/chosen": 0.1256047487258911, "logits/rejected": 0.18428723514080048, "logps/chosen": -361.2447509765625, "logps/rejected": -337.5652770996094, "loss": 0.3075, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1919463872909546, "rewards/margins": 0.11055928468704224, "rewards/rejected": 0.08138711750507355, "step": 40 }, { "epoch": 0.24, "learning_rate": 4.709119209978242e-06, "logits/chosen": 0.13542751967906952, "logits/rejected": 0.17227646708488464, "logps/chosen": -339.5179138183594, "logps/rejected": -339.02984619140625, "loss": 0.3253, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.29387253522872925, "rewards/margins": 0.18539837002754211, "rewards/rejected": 0.10847418010234833, "step": 50 }, { "epoch": 0.29, "learning_rate": 4.482317534878901e-06, "logits/chosen": 0.14710070192813873, "logits/rejected": 0.1668437272310257, "logps/chosen": -338.6156311035156, "logps/rejected": -317.2750549316406, "loss": 0.3199, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.2612247169017792, "rewards/margins": 0.14398948848247528, "rewards/rejected": 0.1172352284193039, "step": 60 }, { "epoch": 0.34, "learning_rate": 4.199698658255298e-06, "logits/chosen": 0.10598815977573395, "logits/rejected": 0.15820710361003876, "logps/chosen": -339.61456298828125, "logps/rejected": -329.31536865234375, "loss": 0.2793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3697130084037781, "rewards/margins": 0.20087119936943054, "rewards/rejected": 0.16884183883666992, "step": 70 }, { "epoch": 0.38, "learning_rate": 3.869220434746509e-06, "logits/chosen": 0.07073510438203812, "logits/rejected": 0.15016858279705048, "logps/chosen": -314.06280517578125, "logps/rejected": -323.94268798828125, "loss": 0.2928, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3701319396495819, "rewards/margins": 0.16388371586799622, "rewards/rejected": 0.2062481939792633, "step": 80 }, { "epoch": 0.43, "learning_rate": 3.5001883208580668e-06, "logits/chosen": 0.12372653186321259, "logits/rejected": 0.20820951461791992, "logps/chosen": -373.20831298828125, "logps/rejected": -339.1376037597656, "loss": 0.2895, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.33848610520362854, "rewards/margins": 0.1839863359928131, "rewards/rejected": 0.15449976921081543, "step": 90 }, { "epoch": 0.48, "learning_rate": 3.102993356121938e-06, "logits/chosen": 0.11043532192707062, "logits/rejected": 0.21166983246803284, "logps/chosen": -332.3284606933594, "logps/rejected": -328.7100524902344, "loss": 0.3023, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.28233280777931213, "rewards/margins": 0.1811174899339676, "rewards/rejected": 0.10121532529592514, "step": 100 }, { "epoch": 0.53, "learning_rate": 2.6888195769001147e-06, "logits/chosen": 0.12863442301750183, "logits/rejected": 0.2186942994594574, "logps/chosen": -322.6536560058594, "logps/rejected": -326.0906066894531, "loss": 0.2747, "rewards/accuracies": 0.75, "rewards/chosen": 0.3540286421775818, "rewards/margins": 0.2065799981355667, "rewards/rejected": 0.14744864404201508, "step": 110 }, { "epoch": 0.58, "learning_rate": 2.269329101341745e-06, "logits/chosen": 0.11101800203323364, "logits/rejected": 0.21909146010875702, "logps/chosen": -386.37506103515625, "logps/rejected": -338.628662109375, "loss": 0.2594, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.32755765318870544, "rewards/margins": 0.19837772846221924, "rewards/rejected": 0.1291799247264862, "step": 120 }, { "epoch": 0.62, "learning_rate": 1.856333752729311e-06, "logits/chosen": 0.108365498483181, "logits/rejected": 0.21159549057483673, "logps/chosen": -354.3942565917969, "logps/rejected": -350.58026123046875, "loss": 0.2845, "rewards/accuracies": 0.6875, "rewards/chosen": 0.32795828580856323, "rewards/margins": 0.22649607062339783, "rewards/rejected": 0.10146218538284302, "step": 130 }, { "epoch": 0.67, "learning_rate": 1.4614624674952843e-06, "logits/chosen": 0.13152627646923065, "logits/rejected": 0.1252177655696869, "logps/chosen": -310.82440185546875, "logps/rejected": -306.65057373046875, "loss": 0.3034, "rewards/accuracies": 0.625, "rewards/chosen": 0.28935328125953674, "rewards/margins": 0.17421108484268188, "rewards/rejected": 0.11514218151569366, "step": 140 }, { "epoch": 0.72, "learning_rate": 1.0958338528840893e-06, "logits/chosen": 0.14218227565288544, "logits/rejected": 0.17058388888835907, "logps/chosen": -365.53497314453125, "logps/rejected": -336.30670166015625, "loss": 0.279, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.34200993180274963, "rewards/margins": 0.18471720814704895, "rewards/rejected": 0.1572926938533783, "step": 150 }, { "epoch": 0.77, "learning_rate": 7.697431142327633e-07, "logits/chosen": 0.1493266373872757, "logits/rejected": 0.1491091400384903, "logps/chosen": -363.55841064453125, "logps/rejected": -332.8086853027344, "loss": 0.2657, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3300887942314148, "rewards/margins": 0.20138521492481232, "rewards/rejected": 0.12870360910892487, "step": 160 }, { "epoch": 0.82, "learning_rate": 4.923721672305148e-07, "logits/chosen": 0.09402619302272797, "logits/rejected": 0.1920977234840393, "logps/chosen": -334.378662109375, "logps/rejected": -331.62322998046875, "loss": 0.2685, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3077259361743927, "rewards/margins": 0.17248141765594482, "rewards/rejected": 0.13524451851844788, "step": 170 }, { "epoch": 0.86, "learning_rate": 2.7153109768518926e-07, "logits/chosen": 0.11278879642486572, "logits/rejected": 0.18854503333568573, "logps/chosen": -388.7989501953125, "logps/rejected": -336.02105712890625, "loss": 0.2612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.34268996119499207, "rewards/margins": 0.21422357857227325, "rewards/rejected": 0.12846639752388, "step": 180 }, { "epoch": 0.91, "learning_rate": 1.1343824865573422e-07, "logits/chosen": 0.09929057955741882, "logits/rejected": 0.15045389533042908, "logps/chosen": -343.9807434082031, "logps/rejected": -318.48028564453125, "loss": 0.2662, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.29964178800582886, "rewards/margins": 0.17701460421085358, "rewards/rejected": 0.1226271539926529, "step": 190 }, { "epoch": 0.96, "learning_rate": 2.2545127157831416e-08, "logits/chosen": 0.07950419932603836, "logits/rejected": 0.178897887468338, "logps/chosen": -297.21661376953125, "logps/rejected": -294.276611328125, "loss": 0.2553, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.3100071847438812, "rewards/margins": 0.16973480582237244, "rewards/rejected": 0.14027239382266998, "step": 200 }, { "epoch": 1.0, "step": 208, "total_flos": 0.0, "train_loss": 0.29713730256144816, "train_runtime": 2891.8659, "train_samples_per_second": 3.458, "train_steps_per_second": 0.072 } ], "logging_steps": 10, "max_steps": 208, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }