{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 6.25e-08, "logits/chosen": -2.8984241485595703, "logits/rejected": -2.9192605018615723, "logps/chosen": -223.2982635498047, "logps/pi_response": -113.8905029296875, "logps/ref_response": -113.8905029296875, "logps/rejected": -200.18243408203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.12, "learning_rate": 4.990486745229364e-07, "logits/chosen": -2.7623088359832764, "logits/rejected": -2.7416441440582275, "logps/chosen": -217.3842010498047, "logps/pi_response": -115.41315460205078, "logps/ref_response": -115.2744140625, "logps/rejected": -178.284423828125, "loss": 0.6927, "rewards/accuracies": 0.5138888955116272, "rewards/chosen": 0.006774479523301125, "rewards/margins": 0.005107560195028782, "rewards/rejected": 0.0016669193282723427, "step": 10 }, { "epoch": 0.25, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -2.7812881469726562, "logits/rejected": -2.734970808029175, "logps/chosen": -220.35397338867188, "logps/pi_response": -135.1557159423828, "logps/ref_response": -125.4894027709961, "logps/rejected": -214.3368682861328, "loss": 0.6815, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.04864302650094032, "rewards/margins": 0.025030791759490967, "rewards/rejected": -0.07367382198572159, "step": 20 }, { "epoch": 0.38, "learning_rate": 3.933941090877615e-07, "logits/chosen": -2.618950366973877, "logits/rejected": -2.6074278354644775, "logps/chosen": -220.7846221923828, "logps/pi_response": -125.65977478027344, "logps/ref_response": -107.89570617675781, "logps/rejected": -210.37643432617188, "loss": 0.66, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0882532075047493, "rewards/margins": 0.10665383189916611, "rewards/rejected": -0.1949070394039154, "step": 30 }, { "epoch": 0.5, "learning_rate": 2.934120444167326e-07, "logits/chosen": -2.648308277130127, "logits/rejected": -2.6359620094299316, "logps/chosen": -236.2592010498047, "logps/pi_response": -141.26565551757812, "logps/ref_response": -108.48822021484375, "logps/rejected": -221.27908325195312, "loss": 0.6481, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.21243707835674286, "rewards/margins": 0.1420426070690155, "rewards/rejected": -0.35447970032691956, "step": 40 }, { "epoch": 0.62, "learning_rate": 1.8529523872436977e-07, "logits/chosen": -2.5701727867126465, "logits/rejected": -2.566606044769287, "logps/chosen": -217.37423706054688, "logps/pi_response": -137.55845642089844, "logps/ref_response": -103.4356918334961, "logps/rejected": -214.7988739013672, "loss": 0.6323, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.18530786037445068, "rewards/margins": 0.15332159399986267, "rewards/rejected": -0.33862942457199097, "step": 50 }, { "epoch": 0.75, "learning_rate": 8.930309757836516e-08, "logits/chosen": -2.591656446456909, "logits/rejected": -2.5513439178466797, "logps/chosen": -255.78726196289062, "logps/pi_response": -154.42214965820312, "logps/ref_response": -109.31465148925781, "logps/rejected": -229.62564086914062, "loss": 0.6312, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.30983734130859375, "rewards/margins": 0.17774702608585358, "rewards/rejected": -0.48758435249328613, "step": 60 }, { "epoch": 0.88, "learning_rate": 2.3423053240837514e-08, "logits/chosen": -2.5838165283203125, "logits/rejected": -2.5525498390197754, "logps/chosen": -240.0748291015625, "logps/pi_response": -156.8232879638672, "logps/ref_response": -100.11727142333984, "logps/rejected": -239.44021606445312, "loss": 0.6211, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.39128464460372925, "rewards/margins": 0.22986645996570587, "rewards/rejected": -0.6211511492729187, "step": 70 }, { "epoch": 1.0, "learning_rate": 0.0, "logits/chosen": -2.63145112991333, "logits/rejected": -2.6038479804992676, "logps/chosen": -247.91909790039062, "logps/pi_response": -164.45394897460938, "logps/ref_response": -113.96580505371094, "logps/rejected": -237.40869140625, "loss": 0.6236, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.37112683057785034, "rewards/margins": 0.1667536199092865, "rewards/rejected": -0.5378804802894592, "step": 80 }, { "epoch": 1.0, "step": 80, "total_flos": 0.0, "train_loss": 0.6488030612468719, "train_runtime": 2263.2556, "train_samples_per_second": 4.502, "train_steps_per_second": 0.035 } ], "logging_steps": 10, "max_steps": 80, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }