{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9874476987447699, "eval_steps": 500, "global_step": 59, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016736401673640166, "grad_norm": 73.53016894281072, "learning_rate": 1.6666666666666664e-08, "logits/chosen": -2.377516984939575, "logits/rejected": -2.342954158782959, "logps/chosen": -267.31927490234375, "logps/pi_response": -149.59169006347656, "logps/ref_response": -149.59169006347656, "logps/rejected": -539.9591674804688, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.16736401673640167, "grad_norm": 80.4074524611749, "learning_rate": 9.860114570402053e-08, "logits/chosen": -2.3693184852600098, "logits/rejected": -2.286440372467041, "logps/chosen": -298.61932373046875, "logps/pi_response": -132.10630798339844, "logps/ref_response": -131.78355407714844, "logps/rejected": -554.6019897460938, "loss": 0.6859, "rewards/accuracies": 0.5555555820465088, "rewards/chosen": -0.00936645083129406, "rewards/margins": 0.014261982403695583, "rewards/rejected": -0.023628434166312218, "step": 10 }, { "epoch": 0.33472803347280333, "grad_norm": 46.77444881870846, "learning_rate": 8.374915007591053e-08, "logits/chosen": -2.3324151039123535, "logits/rejected": -2.2493882179260254, "logps/chosen": -298.06707763671875, "logps/pi_response": -146.1650390625, "logps/ref_response": -140.46481323242188, "logps/rejected": -580.0590209960938, "loss": 0.6135, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.18148620426654816, "rewards/margins": 0.20174245536327362, "rewards/rejected": -0.3832286298274994, "step": 20 }, { "epoch": 0.502092050209205, "grad_norm": 42.64368867415699, "learning_rate": 5.738232820012406e-08, "logits/chosen": -2.2694995403289795, "logits/rejected": -2.207530975341797, "logps/chosen": -327.8448181152344, "logps/pi_response": -139.88900756835938, "logps/ref_response": -131.197509765625, "logps/rejected": -638.9929809570312, "loss": 0.545, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -0.371069997549057, "rewards/margins": 0.4864231050014496, "rewards/rejected": -0.8574931025505066, "step": 30 }, { "epoch": 0.6694560669456067, "grad_norm": 40.23447563253812, "learning_rate": 2.8496739886173992e-08, "logits/chosen": -2.242140054702759, "logits/rejected": -2.160322666168213, "logps/chosen": -358.05889892578125, "logps/pi_response": -157.74525451660156, "logps/ref_response": -145.66329956054688, "logps/rejected": -707.8968505859375, "loss": 0.5184, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5855440497398376, "rewards/margins": 0.7927876114845276, "rewards/rejected": -1.3783317804336548, "step": 40 }, { "epoch": 0.8368200836820083, "grad_norm": 82.30441781177622, "learning_rate": 6.947819411632222e-09, "logits/chosen": -2.2379250526428223, "logits/rejected": -2.158501386642456, "logps/chosen": -376.53204345703125, "logps/pi_response": -153.90878295898438, "logps/ref_response": -141.55398559570312, "logps/rejected": -747.1853637695312, "loss": 0.4999, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6662783622741699, "rewards/margins": 0.9346103668212891, "rewards/rejected": -1.6008888483047485, "step": 50 }, { "epoch": 0.9874476987447699, "step": 59, "total_flos": 0.0, "train_loss": 0.5589375010991501, "train_runtime": 2582.4535, "train_samples_per_second": 5.918, "train_steps_per_second": 0.023 } ], "logging_steps": 10, "max_steps": 59, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }