{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9952153110047847, "eval_steps": 500, "global_step": 52, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 8.333333333333333e-08, "logits/chosen": -2.8187968730926514, "logits/rejected": -2.8237557411193848, "logps/chosen": -257.11737060546875, "logps/pi_response": -65.15000915527344, "logps/ref_response": -65.15000915527344, "logps/rejected": -166.6063995361328, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.19, "learning_rate": 4.907293218369498e-07, "logits/chosen": -2.834031105041504, "logits/rejected": -2.794311046600342, "logps/chosen": -247.26991271972656, "logps/pi_response": -71.50384521484375, "logps/ref_response": -71.02489471435547, "logps/rejected": -163.82879638671875, "loss": 0.6885, "rewards/accuracies": 0.5972222089767456, "rewards/chosen": 0.007015190087258816, "rewards/margins": 0.0075297304429113865, "rewards/rejected": -0.0005145410541445017, "step": 10 }, { "epoch": 0.38, "learning_rate": 3.941700805287168e-07, "logits/chosen": -2.7059969902038574, "logits/rejected": -2.682796001434326, "logps/chosen": -233.1520538330078, "logps/pi_response": -87.71420288085938, "logps/ref_response": -74.39585876464844, "logps/rejected": -170.5820770263672, "loss": 0.6558, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.023435983806848526, "rewards/margins": 0.09170379489660263, "rewards/rejected": -0.0682678073644638, "step": 20 }, { "epoch": 0.57, "learning_rate": 2.3293939665883228e-07, "logits/chosen": -2.607896327972412, "logits/rejected": -2.5777342319488525, "logps/chosen": -247.3583526611328, "logps/pi_response": -125.3393325805664, "logps/ref_response": -79.46585845947266, "logps/rejected": -196.52218627929688, "loss": 0.6311, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.16959059238433838, "rewards/margins": 0.12048976123332977, "rewards/rejected": -0.29008033871650696, "step": 30 }, { "epoch": 0.77, "learning_rate": 7.936171419533652e-08, "logits/chosen": -2.627960443496704, "logits/rejected": -2.5933032035827637, "logps/chosen": -272.2423400878906, "logps/pi_response": -141.93287658691406, "logps/ref_response": -77.87845611572266, "logps/rejected": -232.07913208007812, "loss": 0.5952, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.19411739706993103, "rewards/margins": 0.2757735848426819, "rewards/rejected": -0.4698909819126129, "step": 40 }, { "epoch": 0.96, "learning_rate": 2.328513490917311e-09, "logits/chosen": -2.6459906101226807, "logits/rejected": -2.6088039875030518, "logps/chosen": -273.8865051269531, "logps/pi_response": -136.73643493652344, "logps/ref_response": -74.40654754638672, "logps/rejected": -227.901123046875, "loss": 0.5864, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.17903554439544678, "rewards/margins": 0.36069053411483765, "rewards/rejected": -0.5397260785102844, "step": 50 }, { "epoch": 1.0, "step": 52, "total_flos": 0.0, "train_loss": 0.6293867803536929, "train_runtime": 3116.1807, "train_samples_per_second": 4.279, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 52, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }