{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9874476987447699, "eval_steps": 500, "global_step": 59, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 8.333333333333333e-08, "logits/chosen": -2.696837902069092, "logits/rejected": -2.596374273300171, "logps/chosen": -215.154052734375, "logps/pi_response": -176.18202209472656, "logps/ref_response": -176.18202209472656, "logps/rejected": -289.2363586425781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.17, "learning_rate": 4.930057285201027e-07, "logits/chosen": -2.621392250061035, "logits/rejected": -2.592118978500366, "logps/chosen": -235.2295684814453, "logps/pi_response": -177.52685546875, "logps/ref_response": -178.44647216796875, "logps/rejected": -303.8392333984375, "loss": 0.6804, "rewards/accuracies": 0.6076388955116272, "rewards/chosen": -0.016610508784651756, "rewards/margins": 0.039140839129686356, "rewards/rejected": -0.05575134977698326, "step": 10 }, { "epoch": 0.33, "learning_rate": 4.187457503795526e-07, "logits/chosen": -2.651366710662842, "logits/rejected": -2.6005194187164307, "logps/chosen": -272.60650634765625, "logps/pi_response": -166.6388397216797, "logps/ref_response": -174.33192443847656, "logps/rejected": -367.46075439453125, "loss": 0.6014, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.24459533393383026, "rewards/margins": 0.45078325271606445, "rewards/rejected": -0.6953786015510559, "step": 20 }, { "epoch": 0.5, "learning_rate": 2.8691164100062034e-07, "logits/chosen": -2.671945333480835, "logits/rejected": -2.617596387863159, "logps/chosen": -301.997802734375, "logps/pi_response": -184.88941955566406, "logps/ref_response": -172.4146270751953, "logps/rejected": -417.10992431640625, "loss": 0.5494, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5642356872558594, "rewards/margins": 0.7100368738174438, "rewards/rejected": -1.2742725610733032, "step": 30 }, { "epoch": 0.67, "learning_rate": 1.4248369943086995e-07, "logits/chosen": -2.594676971435547, "logits/rejected": -2.551079750061035, "logps/chosen": -301.282470703125, "logps/pi_response": -201.1599884033203, "logps/ref_response": -164.72154235839844, "logps/rejected": -445.9717712402344, "loss": 0.5072, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.7301871180534363, "rewards/margins": 0.8498421907424927, "rewards/rejected": -1.5800292491912842, "step": 40 }, { "epoch": 0.84, "learning_rate": 3.473909705816111e-08, "logits/chosen": -2.5517385005950928, "logits/rejected": -2.5155932903289795, "logps/chosen": -337.0827331542969, "logps/pi_response": -220.1709442138672, "logps/ref_response": -174.74102783203125, "logps/rejected": -440.63677978515625, "loss": 0.4993, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9003208875656128, "rewards/margins": 0.711205780506134, "rewards/rejected": -1.6115267276763916, "step": 50 }, { "epoch": 0.99, "step": 59, "total_flos": 0.0, "train_loss": 0.5582792960991294, "train_runtime": 3533.8117, "train_samples_per_second": 4.325, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 59, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }