{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9874476987447699, "eval_steps": 500, "global_step": 59, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 1.6666666666666667e-09, "logits/chosen": -2.5598204135894775, "logits/rejected": -2.54148006439209, "logps/chosen": -231.18460083007812, "logps/pi_response": -153.6551971435547, "logps/ref_response": -153.6551971435547, "logps/rejected": -481.5467529296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.17, "learning_rate": 9.860114570402054e-09, "logits/chosen": -2.579503297805786, "logits/rejected": -2.5002052783966064, "logps/chosen": -276.4703369140625, "logps/pi_response": -135.45098876953125, "logps/ref_response": -135.50714111328125, "logps/rejected": -483.57708740234375, "loss": 0.6932, "rewards/accuracies": 0.4756944477558136, "rewards/chosen": 0.0008863827679306269, "rewards/margins": 0.00028209862648509443, "rewards/rejected": 0.0006042842287570238, "step": 10 }, { "epoch": 0.33, "learning_rate": 8.374915007591053e-09, "logits/chosen": -2.543732166290283, "logits/rejected": -2.499950408935547, "logps/chosen": -286.89471435546875, "logps/pi_response": -147.18540954589844, "logps/ref_response": -147.16122436523438, "logps/rejected": -460.33807373046875, "loss": 0.6919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0019976082257926464, "rewards/margins": 0.0029969532042741776, "rewards/rejected": -0.004994561430066824, "step": 20 }, { "epoch": 0.5, "learning_rate": 5.738232820012407e-09, "logits/chosen": -2.52839732170105, "logits/rejected": -2.4818997383117676, "logps/chosen": -271.01190185546875, "logps/pi_response": -141.57553100585938, "logps/ref_response": -141.3872528076172, "logps/rejected": -487.7052307128906, "loss": 0.69, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005116280633956194, "rewards/margins": 0.007777981460094452, "rewards/rejected": -0.012894262559711933, "step": 30 }, { "epoch": 0.67, "learning_rate": 2.8496739886173994e-09, "logits/chosen": -2.5488953590393066, "logits/rejected": -2.505760908126831, "logps/chosen": -280.8428955078125, "logps/pi_response": -146.02664184570312, "logps/ref_response": -145.77267456054688, "logps/rejected": -494.5501403808594, "loss": 0.6877, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.013240592554211617, "rewards/margins": 0.008154135197401047, "rewards/rejected": -0.021394727751612663, "step": 40 }, { "epoch": 0.84, "learning_rate": 6.947819411632222e-10, "logits/chosen": -2.562298536300659, "logits/rejected": -2.5050930976867676, "logps/chosen": -302.87677001953125, "logps/pi_response": -149.38168334960938, "logps/ref_response": -148.97946166992188, "logps/rejected": -509.5029296875, "loss": 0.6847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.017983367666602135, "rewards/margins": 0.016902435570955276, "rewards/rejected": -0.03488580137491226, "step": 50 }, { "epoch": 0.99, "step": 59, "total_flos": 0.0, "train_loss": 0.6886599670022221, "train_runtime": 3488.7106, "train_samples_per_second": 4.381, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 59, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }