{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 6.5958876428735564, "learning_rate": 2.941176470588235e-08, "logits/chosen": -1.6130714416503906, "logits/rejected": -1.7848026752471924, "logps/chosen": -143.55209350585938, "logps/rejected": -137.43441772460938, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "grad_norm": 5.967532383605112, "learning_rate": 2.941176470588235e-07, "logits/chosen": -1.8283494710922241, "logits/rejected": -1.7852643728256226, "logps/chosen": -158.81536865234375, "logps/rejected": -151.6327362060547, "loss": 0.693, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 8.654648991068825e-05, "rewards/margins": 0.0005829257424920797, "rewards/rejected": -0.0004963793326169252, "step": 10 }, { "epoch": 0.12, "grad_norm": 5.606818404653461, "learning_rate": 4.994863481875841e-07, "logits/chosen": -1.8151414394378662, "logits/rejected": -1.7734615802764893, "logps/chosen": -151.97584533691406, "logps/rejected": -164.20437622070312, "loss": 0.6923, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.003920617047697306, "rewards/margins": 0.0024364024866372347, "rewards/rejected": 0.001484214561060071, "step": 20 }, { "epoch": 0.18, "grad_norm": 6.452038531330129, "learning_rate": 4.904133592102591e-07, "logits/chosen": -1.8305763006210327, "logits/rejected": -1.7172702550888062, "logps/chosen": -154.3677520751953, "logps/rejected": -148.50753784179688, "loss": 0.6882, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.016027290374040604, "rewards/margins": 0.00950100552290678, "rewards/rejected": 0.006526285316795111, "step": 30 }, { "epoch": 0.24, "grad_norm": 6.2953570308846825, "learning_rate": 4.704015606870022e-07, "logits/chosen": -1.7697455883026123, "logits/rejected": -1.7966588735580444, "logps/chosen": -143.58848571777344, "logps/rejected": -166.49522399902344, "loss": 0.6829, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.03490619733929634, "rewards/margins": 0.02003355883061886, "rewards/rejected": 0.014872634783387184, "step": 40 }, { "epoch": 0.3, "grad_norm": 6.274119591898531, "learning_rate": 4.4036148959228356e-07, "logits/chosen": -1.7394487857818604, "logits/rejected": -1.804693579673767, "logps/chosen": -159.61492919921875, "logps/rejected": -136.1581268310547, "loss": 0.6763, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.06023404002189636, "rewards/margins": 0.042321957647800446, "rewards/rejected": 0.017912080511450768, "step": 50 }, { "epoch": 0.37, "grad_norm": 6.180992532830828, "learning_rate": 4.016599693735638e-07, "logits/chosen": -1.6605278253555298, "logits/rejected": -1.724905252456665, "logps/chosen": -146.7899932861328, "logps/rejected": -148.02505493164062, "loss": 0.6733, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.0686994269490242, "rewards/margins": 0.04312276840209961, "rewards/rejected": 0.02557666040956974, "step": 60 }, { "epoch": 0.43, "grad_norm": 5.590599679916071, "learning_rate": 3.5605791947475926e-07, "logits/chosen": -1.7533237934112549, "logits/rejected": -1.702845811843872, "logps/chosen": -146.6136474609375, "logps/rejected": -140.97921752929688, "loss": 0.6631, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.07920090854167938, "rewards/margins": 0.053236376494169235, "rewards/rejected": 0.0259645227342844, "step": 70 }, { "epoch": 0.49, "grad_norm": 5.096416269116106, "learning_rate": 3.056302334890786e-07, "logits/chosen": -1.616193413734436, "logits/rejected": -1.6094154119491577, "logps/chosen": -142.79188537597656, "logps/rejected": -140.85447692871094, "loss": 0.6609, "rewards/accuracies": 0.78125, "rewards/chosen": 0.09861920028924942, "rewards/margins": 0.0706188827753067, "rewards/rejected": 0.028000324964523315, "step": 80 }, { "epoch": 0.55, "grad_norm": 5.517912420297569, "learning_rate": 2.526713714858433e-07, "logits/chosen": -1.608278512954712, "logits/rejected": -1.5585658550262451, "logps/chosen": -132.39981079101562, "logps/rejected": -143.10488891601562, "loss": 0.6557, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.1142318844795227, "rewards/margins": 0.07896542549133301, "rewards/rejected": 0.0352664515376091, "step": 90 }, { "epoch": 0.61, "grad_norm": 5.179137970855667, "learning_rate": 1.9959096206109175e-07, "logits/chosen": -1.5899827480316162, "logits/rejected": -1.5742290019989014, "logps/chosen": -136.0356903076172, "logps/rejected": -162.7815704345703, "loss": 0.6508, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.11762702465057373, "rewards/margins": 0.08622404932975769, "rewards/rejected": 0.03140297532081604, "step": 100 }, { "epoch": 0.61, "eval_logits/chosen": -1.6967989206314087, "eval_logits/rejected": -1.6722551584243774, "eval_logps/chosen": -158.87005615234375, "eval_logps/rejected": -170.24278259277344, "eval_loss": 0.6690559983253479, "eval_rewards/accuracies": 0.6940954923629761, "eval_rewards/chosen": 0.07056128978729248, "eval_rewards/margins": 0.050339534878730774, "eval_rewards/rejected": 0.020221758633852005, "eval_runtime": 1977.6877, "eval_samples_per_second": 9.659, "eval_steps_per_second": 0.302, "step": 100 }, { "epoch": 0.67, "grad_norm": 5.410829812028072, "learning_rate": 1.4880416421940154e-07, "logits/chosen": -1.6502714157104492, "logits/rejected": -1.6523603200912476, "logps/chosen": -134.38687133789062, "logps/rejected": -157.00936889648438, "loss": 0.6512, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.11995081603527069, "rewards/margins": 0.09394902735948563, "rewards/rejected": 0.026001790538430214, "step": 110 }, { "epoch": 0.73, "grad_norm": 5.845780336717107, "learning_rate": 1.0262177762208507e-07, "logits/chosen": -1.565212607383728, "logits/rejected": -1.6423566341400146, "logps/chosen": -143.96304321289062, "logps/rejected": -149.28546142578125, "loss": 0.6496, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.13488885760307312, "rewards/margins": 0.10831846296787262, "rewards/rejected": 0.026570383459329605, "step": 120 }, { "epoch": 0.79, "grad_norm": 5.76403048084688, "learning_rate": 6.31451011862412e-08, "logits/chosen": -1.6332323551177979, "logits/rejected": -1.6044152975082397, "logps/chosen": -137.62985229492188, "logps/rejected": -159.90980529785156, "loss": 0.6439, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.13013367354869843, "rewards/margins": 0.10071909427642822, "rewards/rejected": 0.02941458486020565, "step": 130 }, { "epoch": 0.85, "grad_norm": 5.119446644831888, "learning_rate": 3.217032396915265e-08, "logits/chosen": -1.569746971130371, "logits/rejected": -1.6146259307861328, "logps/chosen": -130.83258056640625, "logps/rejected": -160.59701538085938, "loss": 0.6439, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1322535276412964, "rewards/margins": 0.10249896347522736, "rewards/rejected": 0.029754554852843285, "step": 140 }, { "epoch": 0.91, "grad_norm": 5.590191167835734, "learning_rate": 1.1106798553464802e-08, "logits/chosen": -1.6109774112701416, "logits/rejected": -1.607143759727478, "logps/chosen": -145.5422821044922, "logps/rejected": -155.8082733154297, "loss": 0.6426, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.14719954133033752, "rewards/margins": 0.11081697046756744, "rewards/rejected": 0.03638254478573799, "step": 150 }, { "epoch": 0.98, "grad_norm": 5.417981503927173, "learning_rate": 9.129154946982687e-10, "logits/chosen": -1.5755327939987183, "logits/rejected": -1.6533405780792236, "logps/chosen": -144.75936889648438, "logps/rejected": -150.3732452392578, "loss": 0.6439, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.1261114478111267, "rewards/margins": 0.10229575634002686, "rewards/rejected": 0.023815687745809555, "step": 160 }, { "epoch": 1.0, "step": 164, "total_flos": 0.0, "train_loss": 0.2519006322069866, "train_runtime": 787.0698, "train_samples_per_second": 13.311, "train_steps_per_second": 0.208 } ], "logging_steps": 10, "max_steps": 164, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }