{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.8, "eval_steps": 900, "global_step": 12600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00022222222222222223, "grad_norm": 13.25, "learning_rate": 2.222222222222222e-09, "logits/chosen": -3.4569907188415527, "logits/rejected": -3.426312208175659, "logps/chosen": -237.60638427734375, "logps/rejected": -149.21551513671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.2, "grad_norm": 6.96875, "learning_rate": 9.655172413793103e-07, "logits/chosen": -3.2259371280670166, "logits/rejected": -3.237309217453003, "logps/chosen": -157.7607879638672, "logps/rejected": -135.18301391601562, "loss": 0.6984, "rewards/accuracies": 0.4966629445552826, "rewards/chosen": -0.002921752631664276, "rewards/margins": -0.0014779088087379932, "rewards/rejected": -0.0014438438229262829, "step": 900 }, { "epoch": 0.2, "eval_logits/chosen": -3.013286590576172, "eval_logits/rejected": -3.027402639389038, "eval_logps/chosen": -157.49420166015625, "eval_logps/rejected": -135.04640197753906, "eval_loss": 0.6940267086029053, "eval_rewards/accuracies": 0.5178571343421936, "eval_rewards/chosen": 0.005377354100346565, "eval_rewards/margins": 0.006346767768263817, "eval_rewards/rejected": -0.0009694137261249125, "eval_runtime": 621.0759, "eval_samples_per_second": 1.61, "eval_steps_per_second": 0.101, "step": 900 }, { "epoch": 0.4, "grad_norm": 5.15625, "learning_rate": 8.96551724137931e-07, "logits/chosen": -3.2368738651275635, "logits/rejected": -3.2453866004943848, "logps/chosen": -156.84564208984375, "logps/rejected": -132.406005859375, "loss": 0.6979, "rewards/accuracies": 0.4933333396911621, "rewards/chosen": 0.0008400398073717952, "rewards/margins": -0.0006429057684727013, "rewards/rejected": 0.0014829455176368356, "step": 1800 }, { "epoch": 0.4, "eval_logits/chosen": -3.013197898864746, "eval_logits/rejected": -3.0273756980895996, "eval_logps/chosen": -157.50088500976562, "eval_logps/rejected": -135.04953002929688, "eval_loss": 0.6951879858970642, "eval_rewards/accuracies": 0.5267857313156128, "eval_rewards/chosen": 0.0020369377452880144, "eval_rewards/margins": 0.004572988487780094, "eval_rewards/rejected": -0.0025360507424920797, "eval_runtime": 617.5747, "eval_samples_per_second": 1.619, "eval_steps_per_second": 0.102, "step": 1800 }, { "epoch": 0.6, "grad_norm": 5.90625, "learning_rate": 8.275862068965517e-07, "logits/chosen": -3.228177547454834, "logits/rejected": -3.2369861602783203, "logps/chosen": -155.81085205078125, "logps/rejected": -132.52528381347656, "loss": 0.6937, "rewards/accuracies": 0.5061110854148865, "rewards/chosen": 0.007656366564333439, "rewards/margins": 0.006786289159208536, "rewards/rejected": 0.0008700773469172418, "step": 2700 }, { "epoch": 0.6, "eval_logits/chosen": -3.013126850128174, "eval_logits/rejected": -3.027287483215332, "eval_logps/chosen": -157.47140502929688, "eval_logps/rejected": -135.03514099121094, "eval_loss": 0.6915069818496704, "eval_rewards/accuracies": 0.52182537317276, "eval_rewards/chosen": 0.016774658113718033, "eval_rewards/margins": 0.012120993807911873, "eval_rewards/rejected": 0.004653665702790022, "eval_runtime": 616.7891, "eval_samples_per_second": 1.621, "eval_steps_per_second": 0.102, "step": 2700 }, { "epoch": 0.8, "grad_norm": 4.75, "learning_rate": 7.586206896551724e-07, "logits/chosen": -3.235166549682617, "logits/rejected": -3.2431561946868896, "logps/chosen": -156.77923583984375, "logps/rejected": -133.5782012939453, "loss": 0.6962, "rewards/accuracies": 0.5005555748939514, "rewards/chosen": 0.00986157450824976, "rewards/margins": 0.0027794605121016502, "rewards/rejected": 0.007082113530486822, "step": 3600 }, { "epoch": 0.8, "eval_logits/chosen": -3.013155937194824, "eval_logits/rejected": -3.027271032333374, "eval_logps/chosen": -157.477294921875, "eval_logps/rejected": -135.04197692871094, "eval_loss": 0.6910666823387146, "eval_rewards/accuracies": 0.52182537317276, "eval_rewards/chosen": 0.013831890188157558, "eval_rewards/margins": 0.012587492354214191, "eval_rewards/rejected": 0.0012443973682820797, "eval_runtime": 617.039, "eval_samples_per_second": 1.621, "eval_steps_per_second": 0.102, "step": 3600 }, { "epoch": 1.0, "grad_norm": 7.53125, "learning_rate": 6.896551724137931e-07, "logits/chosen": -3.2322757244110107, "logits/rejected": -3.242344379425049, "logps/chosen": -158.4110107421875, "logps/rejected": -134.92042541503906, "loss": 0.7006, "rewards/accuracies": 0.4866666793823242, "rewards/chosen": 0.014087031595408916, "rewards/margins": -0.005671821068972349, "rewards/rejected": 0.01975885219871998, "step": 4500 }, { "epoch": 1.0, "eval_logits/chosen": -3.013131618499756, "eval_logits/rejected": -3.027305841445923, "eval_logps/chosen": -157.4510498046875, "eval_logps/rejected": -135.02581787109375, "eval_loss": 0.688616931438446, "eval_rewards/accuracies": 0.5248016119003296, "eval_rewards/chosen": 0.026958029717206955, "eval_rewards/margins": 0.01763634942471981, "eval_rewards/rejected": 0.009321682155132294, "eval_runtime": 616.636, "eval_samples_per_second": 1.622, "eval_steps_per_second": 0.102, "step": 4500 }, { "epoch": 1.2, "grad_norm": 5.25, "learning_rate": 6.206896551724138e-07, "logits/chosen": -3.237251043319702, "logits/rejected": -3.2465975284576416, "logps/chosen": -157.82240295410156, "logps/rejected": -135.47181701660156, "loss": 0.6947, "rewards/accuracies": 0.5116666555404663, "rewards/chosen": 0.023857368156313896, "rewards/margins": 0.005588435102254152, "rewards/rejected": 0.01826893351972103, "step": 5400 }, { "epoch": 1.2, "eval_logits/chosen": -3.012895107269287, "eval_logits/rejected": -3.027073383331299, "eval_logps/chosen": -157.4395751953125, "eval_logps/rejected": -135.0121307373047, "eval_loss": 0.6891811490058899, "eval_rewards/accuracies": 0.5466269850730896, "eval_rewards/chosen": 0.032687753438949585, "eval_rewards/margins": 0.016526944935321808, "eval_rewards/rejected": 0.016160808503627777, "eval_runtime": 617.1923, "eval_samples_per_second": 1.62, "eval_steps_per_second": 0.102, "step": 5400 }, { "epoch": 1.4, "grad_norm": 10.8125, "learning_rate": 5.517241379310344e-07, "logits/chosen": -3.2299792766571045, "logits/rejected": -3.239255666732788, "logps/chosen": -157.90017700195312, "logps/rejected": -133.9901580810547, "loss": 0.6936, "rewards/accuracies": 0.5111111402511597, "rewards/chosen": 0.02419172413647175, "rewards/margins": 0.007937068119645119, "rewards/rejected": 0.01625465415418148, "step": 6300 }, { "epoch": 1.4, "eval_logits/chosen": -3.013338565826416, "eval_logits/rejected": -3.027513027191162, "eval_logps/chosen": -157.42361450195312, "eval_logps/rejected": -135.00498962402344, "eval_loss": 0.6873784065246582, "eval_rewards/accuracies": 0.5416666865348816, "eval_rewards/chosen": 0.04067719727754593, "eval_rewards/margins": 0.020947163924574852, "eval_rewards/rejected": 0.019730033352971077, "eval_runtime": 616.84, "eval_samples_per_second": 1.621, "eval_steps_per_second": 0.102, "step": 6300 }, { "epoch": 1.6, "grad_norm": 7.09375, "learning_rate": 4.827586206896552e-07, "logits/chosen": -3.2270286083221436, "logits/rejected": -3.2372705936431885, "logps/chosen": -156.0079345703125, "logps/rejected": -133.16619873046875, "loss": 0.6928, "rewards/accuracies": 0.523888885974884, "rewards/chosen": 0.026351599022746086, "rewards/margins": 0.009284625761210918, "rewards/rejected": 0.017066972330212593, "step": 7200 }, { "epoch": 1.6, "eval_logits/chosen": -3.0130512714385986, "eval_logits/rejected": -3.027252674102783, "eval_logps/chosen": -157.42318725585938, "eval_logps/rejected": -135.01333618164062, "eval_loss": 0.6849371194839478, "eval_rewards/accuracies": 0.5535714030265808, "eval_rewards/chosen": 0.040884003043174744, "eval_rewards/margins": 0.02531503513455391, "eval_rewards/rejected": 0.015568966045975685, "eval_runtime": 616.6049, "eval_samples_per_second": 1.622, "eval_steps_per_second": 0.102, "step": 7200 }, { "epoch": 1.8, "grad_norm": 3.59375, "learning_rate": 4.1379310344827586e-07, "logits/chosen": -3.233133316040039, "logits/rejected": -3.241081476211548, "logps/chosen": -156.33836364746094, "logps/rejected": -131.24473571777344, "loss": 0.692, "rewards/accuracies": 0.5299999713897705, "rewards/chosen": 0.026600120589137077, "rewards/margins": 0.011210680939257145, "rewards/rejected": 0.015389441512525082, "step": 8100 }, { "epoch": 1.8, "eval_logits/chosen": -3.0134432315826416, "eval_logits/rejected": -3.0276236534118652, "eval_logps/chosen": -157.4323272705078, "eval_logps/rejected": -135.00303649902344, "eval_loss": 0.6896921992301941, "eval_rewards/accuracies": 0.5208333134651184, "eval_rewards/chosen": 0.03631395846605301, "eval_rewards/margins": 0.01561205368489027, "eval_rewards/rejected": 0.020701901987195015, "eval_runtime": 616.6729, "eval_samples_per_second": 1.622, "eval_steps_per_second": 0.102, "step": 8100 }, { "epoch": 2.0, "grad_norm": 3.46875, "learning_rate": 3.4482758620689656e-07, "logits/chosen": -3.2326736450195312, "logits/rejected": -3.2417149543762207, "logps/chosen": -157.4275360107422, "logps/rejected": -134.63137817382812, "loss": 0.6947, "rewards/accuracies": 0.5133333206176758, "rewards/chosen": 0.029039518907666206, "rewards/margins": 0.006200558505952358, "rewards/rejected": 0.022838961333036423, "step": 9000 }, { "epoch": 2.0, "eval_logits/chosen": -3.0129990577697754, "eval_logits/rejected": -3.0272481441497803, "eval_logps/chosen": -157.42913818359375, "eval_logps/rejected": -135.008056640625, "eval_loss": 0.687827467918396, "eval_rewards/accuracies": 0.5367063283920288, "eval_rewards/chosen": 0.03790082782506943, "eval_rewards/margins": 0.019700102508068085, "eval_rewards/rejected": 0.018200723454356194, "eval_runtime": 617.771, "eval_samples_per_second": 1.619, "eval_steps_per_second": 0.102, "step": 9000 }, { "epoch": 2.2, "grad_norm": 5.53125, "learning_rate": 2.758620689655172e-07, "logits/chosen": -3.2307889461517334, "logits/rejected": -3.2417702674865723, "logps/chosen": -156.54052734375, "logps/rejected": -133.07623291015625, "loss": 0.6933, "rewards/accuracies": 0.5266666412353516, "rewards/chosen": 0.028480403125286102, "rewards/margins": 0.008195818401873112, "rewards/rejected": 0.020284580066800117, "step": 9900 }, { "epoch": 2.2, "eval_logits/chosen": -3.0130786895751953, "eval_logits/rejected": -3.0272867679595947, "eval_logps/chosen": -157.4261016845703, "eval_logps/rejected": -135.01654052734375, "eval_loss": 0.6852558851242065, "eval_rewards/accuracies": 0.5585317611694336, "eval_rewards/chosen": 0.039425503462553024, "eval_rewards/margins": 0.025462908670306206, "eval_rewards/rejected": 0.013962591998279095, "eval_runtime": 616.6357, "eval_samples_per_second": 1.622, "eval_steps_per_second": 0.102, "step": 9900 }, { "epoch": 2.4, "grad_norm": 5.78125, "learning_rate": 2.0689655172413793e-07, "logits/chosen": -3.2307872772216797, "logits/rejected": -3.240145444869995, "logps/chosen": -159.01539611816406, "logps/rejected": -133.73573303222656, "loss": 0.6887, "rewards/accuracies": 0.5527777671813965, "rewards/chosen": 0.036232445389032364, "rewards/margins": 0.01768229715526104, "rewards/rejected": 0.018550144508481026, "step": 10800 }, { "epoch": 2.4, "eval_logits/chosen": -3.01315975189209, "eval_logits/rejected": -3.0273892879486084, "eval_logps/chosen": -157.42787170410156, "eval_logps/rejected": -135.01727294921875, "eval_loss": 0.6853997707366943, "eval_rewards/accuracies": 0.5486111044883728, "eval_rewards/chosen": 0.038546331226825714, "eval_rewards/margins": 0.024960007518529892, "eval_rewards/rejected": 0.013586324639618397, "eval_runtime": 616.6757, "eval_samples_per_second": 1.622, "eval_steps_per_second": 0.102, "step": 10800 }, { "epoch": 2.6, "grad_norm": 6.25, "learning_rate": 1.379310344827586e-07, "logits/chosen": -3.236450672149658, "logits/rejected": -3.243511199951172, "logps/chosen": -157.64993286132812, "logps/rejected": -134.20144653320312, "loss": 0.6883, "rewards/accuracies": 0.527222216129303, "rewards/chosen": 0.03525533899664879, "rewards/margins": 0.019149743020534515, "rewards/rejected": 0.016105594113469124, "step": 11700 }, { "epoch": 2.6, "eval_logits/chosen": -3.012953996658325, "eval_logits/rejected": -3.027122974395752, "eval_logps/chosen": -157.4271240234375, "eval_logps/rejected": -135.00865173339844, "eval_loss": 0.6870063543319702, "eval_rewards/accuracies": 0.5446428656578064, "eval_rewards/chosen": 0.0389074869453907, "eval_rewards/margins": 0.021000539883971214, "eval_rewards/rejected": 0.017906947061419487, "eval_runtime": 616.7063, "eval_samples_per_second": 1.622, "eval_steps_per_second": 0.102, "step": 11700 }, { "epoch": 2.8, "grad_norm": 6.9375, "learning_rate": 6.89655172413793e-08, "logits/chosen": -3.2290825843811035, "logits/rejected": -3.238609552383423, "logps/chosen": -155.42266845703125, "logps/rejected": -132.62841796875, "loss": 0.6884, "rewards/accuracies": 0.5483333468437195, "rewards/chosen": 0.03833283483982086, "rewards/margins": 0.018470091745257378, "rewards/rejected": 0.019862744957208633, "step": 12600 }, { "epoch": 2.8, "eval_logits/chosen": -3.013180732727051, "eval_logits/rejected": -3.0272974967956543, "eval_logps/chosen": -157.42144775390625, "eval_logps/rejected": -134.997314453125, "eval_loss": 0.6886058449745178, "eval_rewards/accuracies": 0.54067462682724, "eval_rewards/chosen": 0.041760578751564026, "eval_rewards/margins": 0.018192334100604057, "eval_rewards/rejected": 0.02356824465095997, "eval_runtime": 617.3224, "eval_samples_per_second": 1.62, "eval_steps_per_second": 0.102, "step": 12600 } ], "logging_steps": 900, "max_steps": 13500, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 900, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }