{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9874476987447699, "eval_steps": 500, "global_step": 59, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 1.6666666666666664e-08, "logits/chosen": -2.3145697116851807, "logits/rejected": -2.255990743637085, "logps/chosen": -240.6372833251953, "logps/pi_response": -133.796875, "logps/ref_response": -133.796875, "logps/rejected": -520.8372192382812, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.17, "learning_rate": 9.860114570402053e-08, "logits/chosen": -2.33054780960083, "logits/rejected": -2.218600273132324, "logps/chosen": -279.32952880859375, "logps/pi_response": -142.03379821777344, "logps/ref_response": -141.8440399169922, "logps/rejected": -580.2430419921875, "loss": 0.6855, "rewards/accuracies": 0.5763888955116272, "rewards/chosen": -0.008015867322683334, "rewards/margins": 0.018631011247634888, "rewards/rejected": -0.026646876707673073, "step": 10 }, { "epoch": 0.33, "learning_rate": 8.374915007591053e-08, "logits/chosen": -2.273662567138672, "logits/rejected": -2.187835931777954, "logps/chosen": -318.18304443359375, "logps/pi_response": -160.6199951171875, "logps/ref_response": -153.63629150390625, "logps/rejected": -610.4090576171875, "loss": 0.6098, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.20712228119373322, "rewards/margins": 0.21442191302776337, "rewards/rejected": -0.4215441644191742, "step": 20 }, { "epoch": 0.5, "learning_rate": 5.738232820012406e-08, "logits/chosen": -2.2064340114593506, "logits/rejected": -2.1401524543762207, "logps/chosen": -325.96112060546875, "logps/pi_response": -152.35244750976562, "logps/ref_response": -142.01828002929688, "logps/rejected": -649.8997802734375, "loss": 0.5528, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3760668635368347, "rewards/margins": 0.47578781843185425, "rewards/rejected": -0.8518548011779785, "step": 30 }, { "epoch": 0.67, "learning_rate": 2.8496739886173992e-08, "logits/chosen": -2.1908040046691895, "logits/rejected": -2.1057353019714355, "logps/chosen": -340.5452880859375, "logps/pi_response": -159.50143432617188, "logps/ref_response": -146.47113037109375, "logps/rejected": -705.52978515625, "loss": 0.507, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5094891786575317, "rewards/margins": 0.7861413955688477, "rewards/rejected": -1.295630693435669, "step": 40 }, { "epoch": 0.84, "learning_rate": 6.947819411632222e-09, "logits/chosen": -2.184225082397461, "logits/rejected": -2.1084225177764893, "logps/chosen": -371.94671630859375, "logps/pi_response": -157.12008666992188, "logps/ref_response": -142.58538818359375, "logps/rejected": -764.1049194335938, "loss": 0.4925, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.6093698143959045, "rewards/margins": 0.9578849077224731, "rewards/rejected": -1.5672547817230225, "step": 50 }, { "epoch": 0.99, "step": 59, "total_flos": 0.0, "train_loss": 0.5555570570089049, "train_runtime": 3425.432, "train_samples_per_second": 4.462, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 59, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }