{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9905956112852664, "eval_steps": 500, "global_step": 79, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 6.25e-08, "logits/chosen": -2.457242012023926, "logits/rejected": -2.4024434089660645, "logps/chosen": -202.63397216796875, "logps/pi_response": -193.8072509765625, "logps/ref_response": -193.8072509765625, "logps/rejected": -294.0563659667969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.13, "learning_rate": 4.990217055187362e-07, "logits/chosen": -2.7324564456939697, "logits/rejected": -2.677666425704956, "logps/chosen": -255.35565185546875, "logps/pi_response": -165.3203125, "logps/ref_response": -165.90737915039062, "logps/rejected": -332.898193359375, "loss": 0.6729, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": -0.024918710812926292, "rewards/margins": 0.045975301414728165, "rewards/rejected": -0.07089401036500931, "step": 10 }, { "epoch": 0.25, "learning_rate": 4.655786431300069e-07, "logits/chosen": -2.6490871906280518, "logits/rejected": -2.573331832885742, "logps/chosen": -296.2843933105469, "logps/pi_response": -163.0939483642578, "logps/ref_response": -150.6802215576172, "logps/rejected": -442.4339904785156, "loss": 0.5922, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.6179492473602295, "rewards/margins": 0.6742614507675171, "rewards/rejected": -1.2922108173370361, "step": 20 }, { "epoch": 0.38, "learning_rate": 3.9061232191019517e-07, "logits/chosen": -2.638205051422119, "logits/rejected": -2.593559741973877, "logps/chosen": -291.2914733886719, "logps/pi_response": -157.58370971679688, "logps/ref_response": -143.96243286132812, "logps/rejected": -447.7432556152344, "loss": 0.5521, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6356722116470337, "rewards/margins": 0.8006154894828796, "rewards/rejected": -1.4362876415252686, "step": 30 }, { "epoch": 0.5, "learning_rate": 2.8856223324132555e-07, "logits/chosen": -2.6024458408355713, "logits/rejected": -2.533780336380005, "logps/chosen": -275.9115905761719, "logps/pi_response": -154.2270965576172, "logps/ref_response": -132.5021514892578, "logps/rejected": -433.9896545410156, "loss": 0.5214, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.5712152123451233, "rewards/margins": 0.7623130679130554, "rewards/rejected": -1.3335282802581787, "step": 40 }, { "epoch": 0.63, "learning_rate": 1.7908455541642582e-07, "logits/chosen": -2.5458426475524902, "logits/rejected": -2.504864454269409, "logps/chosen": -285.959716796875, "logps/pi_response": -164.4518280029297, "logps/ref_response": -142.59683227539062, "logps/rejected": -467.680419921875, "loss": 0.5024, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.5851860046386719, "rewards/margins": 0.8030134439468384, "rewards/rejected": -1.3881994485855103, "step": 50 }, { "epoch": 0.75, "learning_rate": 8.32661172908373e-08, "logits/chosen": -2.49611234664917, "logits/rejected": -2.4642136096954346, "logps/chosen": -285.41571044921875, "logps/pi_response": -171.76272583007812, "logps/ref_response": -150.0667266845703, "logps/rejected": -459.05035400390625, "loss": 0.4991, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6012312173843384, "rewards/margins": 0.7958024144172668, "rewards/rejected": -1.39703369140625, "step": 60 }, { "epoch": 0.88, "learning_rate": 1.956279997278043e-08, "logits/chosen": -2.526583671569824, "logits/rejected": -2.4693140983581543, "logps/chosen": -286.43743896484375, "logps/pi_response": -172.4198760986328, "logps/ref_response": -153.6392822265625, "logps/rejected": -480.53826904296875, "loss": 0.4835, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5305203199386597, "rewards/margins": 0.9574079513549805, "rewards/rejected": -1.4879281520843506, "step": 70 }, { "epoch": 0.99, "step": 79, "total_flos": 0.0, "train_loss": 0.536045919490766, "train_runtime": 4637.5848, "train_samples_per_second": 4.394, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 79, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }