{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9757287474082205, "eval_steps": 1000, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.24, "learning_rate": 4.30097988740375e-05, "logits/chosen": 0.4760148227214813, "logits/rejected": 0.18961936235427856, "logps/chosen": -138.3477783203125, "logps/rejected": -107.43299865722656, "loss": 0.0534, "rewards/accuracies": 0.9904999732971191, "rewards/chosen": -1.208303689956665, "rewards/margins": 6.7370734214782715, "rewards/rejected": -7.945377349853516, "step": 1000 }, { "epoch": 0.24, "eval_logits/chosen": 0.5734850168228149, "eval_logits/rejected": 0.30642059445381165, "eval_logps/chosen": -140.49411010742188, "eval_logps/rejected": -130.59207153320312, "eval_loss": 0.021729281172156334, "eval_rewards/accuracies": 0.994514524936676, "eval_rewards/chosen": -1.671390414237976, "eval_rewards/margins": 8.56452751159668, "eval_rewards/rejected": -10.235918045043945, "eval_runtime": 379.8081, "eval_samples_per_second": 9.597, "eval_steps_per_second": 4.8, "step": 1000 }, { "epoch": 0.49, "learning_rate": 2.594822843866259e-05, "logits/chosen": 0.9377780556678772, "logits/rejected": 0.5601473450660706, "logps/chosen": -137.52035522460938, "logps/rejected": -132.42645263671875, "loss": 0.0182, "rewards/accuracies": 0.9955000281333923, "rewards/chosen": -1.4818373918533325, "rewards/margins": 8.971912384033203, "rewards/rejected": -10.453749656677246, "step": 2000 }, { "epoch": 0.49, "eval_logits/chosen": 1.070937156677246, "eval_logits/rejected": 0.6229708194732666, "eval_logps/chosen": -139.24871826171875, "eval_logps/rejected": -137.83485412597656, "eval_loss": 0.01748650148510933, "eval_rewards/accuracies": 0.995063066482544, "eval_rewards/chosen": -1.5468522310256958, "eval_rewards/margins": 9.413342475891113, "eval_rewards/rejected": -10.96019458770752, "eval_runtime": 382.2871, "eval_samples_per_second": 9.535, "eval_steps_per_second": 4.769, "step": 2000 }, { "epoch": 0.73, "learning_rate": 8.35639340331897e-06, "logits/chosen": 1.0557544231414795, "logits/rejected": 0.6803003549575806, "logps/chosen": -137.27392578125, "logps/rejected": -142.61241149902344, "loss": 0.0162, "rewards/accuracies": 0.9959999918937683, "rewards/chosen": -1.7070084810256958, "rewards/margins": 9.696061134338379, "rewards/rejected": -11.403070449829102, "step": 3000 }, { "epoch": 0.73, "eval_logits/chosen": 1.0047662258148193, "eval_logits/rejected": 0.6324675679206848, "eval_logps/chosen": -139.297607421875, "eval_logps/rejected": -142.67721557617188, "eval_loss": 0.017071055248379707, "eval_rewards/accuracies": 0.9961601495742798, "eval_rewards/chosen": -1.5517414808273315, "eval_rewards/margins": 9.89268970489502, "eval_rewards/rejected": -11.444432258605957, "eval_runtime": 382.4643, "eval_samples_per_second": 9.53, "eval_steps_per_second": 4.766, "step": 3000 }, { "epoch": 0.98, "learning_rate": 7.193097375108037e-08, "logits/chosen": 0.9620022773742676, "logits/rejected": 0.5997971892356873, "logps/chosen": -140.70065307617188, "logps/rejected": -143.5229949951172, "loss": 0.0154, "rewards/accuracies": 0.9947500228881836, "rewards/chosen": -1.644036054611206, "rewards/margins": 9.865877151489258, "rewards/rejected": -11.509913444519043, "step": 4000 }, { "epoch": 0.98, "eval_logits/chosen": 0.9728695154190063, "eval_logits/rejected": 0.6051246523857117, "eval_logps/chosen": -139.5214080810547, "eval_logps/rejected": -142.23641967773438, "eval_loss": 0.016766143962740898, "eval_rewards/accuracies": 0.9956116080284119, "eval_rewards/chosen": -1.5741225481033325, "eval_rewards/margins": 9.8262300491333, "eval_rewards/rejected": -11.40035343170166, "eval_runtime": 383.0087, "eval_samples_per_second": 9.517, "eval_steps_per_second": 4.76, "step": 4000 } ], "logging_steps": 1000, "max_steps": 4099, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 1.7962050589084877e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }