{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 1061.4292758130512, "learning_rate": 3.125e-09, "logits/chosen": -3.9499800205230713, "logits/rejected": -4.237819194793701, "logps/chosen": -300.693115234375, "logps/rejected": -249.96307373046875, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "grad_norm": 1100.0352396547798, "learning_rate": 3.125e-08, "logits/chosen": -4.129236221313477, "logits/rejected": -4.351477146148682, "logps/chosen": -351.48150634765625, "logps/rejected": -308.8130187988281, "loss": 0.7282, "rewards/accuracies": 0.3611111044883728, "rewards/chosen": -0.027542730793356895, "rewards/margins": -0.10096076130867004, "rewards/rejected": 0.073418028652668, "step": 10 }, { "epoch": 0.13, "grad_norm": 1023.2979189429582, "learning_rate": 4.9899357349880975e-08, "logits/chosen": -4.195385932922363, "logits/rejected": -4.382458686828613, "logps/chosen": -334.9761962890625, "logps/rejected": -293.8690185546875, "loss": 0.6994, "rewards/accuracies": 0.5625, "rewards/chosen": 0.18794658780097961, "rewards/margins": 0.09464438259601593, "rewards/rejected": 0.09330219030380249, "step": 20 }, { "epoch": 0.19, "grad_norm": 716.4790257172086, "learning_rate": 4.877641290737884e-08, "logits/chosen": -4.231568813323975, "logits/rejected": -4.364924430847168, "logps/chosen": -327.92413330078125, "logps/rejected": -295.37176513671875, "loss": 0.6055, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 0.8660896420478821, "rewards/margins": 0.425116628408432, "rewards/rejected": 0.44097304344177246, "step": 30 }, { "epoch": 0.26, "grad_norm": 820.682467230606, "learning_rate": 4.646121984004665e-08, "logits/chosen": -4.1439104080200195, "logits/rejected": -4.345718860626221, "logps/chosen": -330.7722473144531, "logps/rejected": -288.60272216796875, "loss": 0.5304, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 1.6021575927734375, "rewards/margins": 0.956769585609436, "rewards/rejected": 0.6453880071640015, "step": 40 }, { "epoch": 0.32, "grad_norm": 587.6344375884266, "learning_rate": 4.3069871595684784e-08, "logits/chosen": -4.235655784606934, "logits/rejected": -4.4143476486206055, "logps/chosen": -330.8318786621094, "logps/rejected": -291.75067138671875, "loss": 0.525, "rewards/accuracies": 0.8125, "rewards/chosen": 2.010468006134033, "rewards/margins": 1.2047243118286133, "rewards/rejected": 0.8057435750961304, "step": 50 }, { "epoch": 0.38, "grad_norm": 586.1895231882175, "learning_rate": 3.8772424536302564e-08, "logits/chosen": -4.247714519500732, "logits/rejected": -4.417937278747559, "logps/chosen": -322.380126953125, "logps/rejected": -291.76654052734375, "loss": 0.517, "rewards/accuracies": 0.8218749761581421, "rewards/chosen": 2.19193696975708, "rewards/margins": 1.311265230178833, "rewards/rejected": 0.8806716203689575, "step": 60 }, { "epoch": 0.45, "grad_norm": 528.8947001003231, "learning_rate": 3.378437060203357e-08, "logits/chosen": -4.170197486877441, "logits/rejected": -4.359009742736816, "logps/chosen": -322.4336242675781, "logps/rejected": -289.197998046875, "loss": 0.5096, "rewards/accuracies": 0.815625011920929, "rewards/chosen": 2.203657865524292, "rewards/margins": 1.3729288578033447, "rewards/rejected": 0.8307291269302368, "step": 70 }, { "epoch": 0.51, "grad_norm": 544.8408089721645, "learning_rate": 2.8355831645441387e-08, "logits/chosen": -4.038945198059082, "logits/rejected": -4.326292991638184, "logps/chosen": -348.65472412109375, "logps/rejected": -308.1936950683594, "loss": 0.5115, "rewards/accuracies": 0.875, "rewards/chosen": 2.3145015239715576, "rewards/margins": 1.5040271282196045, "rewards/rejected": 0.8104745149612427, "step": 80 }, { "epoch": 0.58, "grad_norm": 526.2097671297138, "learning_rate": 2.2759017277414164e-08, "logits/chosen": -4.170694351196289, "logits/rejected": -4.379502296447754, "logps/chosen": -335.43402099609375, "logps/rejected": -295.8087463378906, "loss": 0.5066, "rewards/accuracies": 0.859375, "rewards/chosen": 1.9496158361434937, "rewards/margins": 1.3254307508468628, "rewards/rejected": 0.6241849660873413, "step": 90 }, { "epoch": 0.64, "grad_norm": 595.4206000749634, "learning_rate": 1.7274575140626317e-08, "logits/chosen": -4.159605979919434, "logits/rejected": -4.377494812011719, "logps/chosen": -333.1622314453125, "logps/rejected": -286.3451232910156, "loss": 0.5142, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 2.0654988288879395, "rewards/margins": 1.4395055770874023, "rewards/rejected": 0.6259931921958923, "step": 100 }, { "epoch": 0.7, "grad_norm": 528.6243658840883, "learning_rate": 1.217751806485235e-08, "logits/chosen": -4.138543128967285, "logits/rejected": -4.379323959350586, "logps/chosen": -314.89556884765625, "logps/rejected": -277.0162658691406, "loss": 0.5071, "rewards/accuracies": 0.840624988079071, "rewards/chosen": 2.0163025856018066, "rewards/margins": 1.3896347284317017, "rewards/rejected": 0.6266676783561707, "step": 110 }, { "epoch": 0.77, "grad_norm": 557.9046270518194, "learning_rate": 7.723433775328384e-09, "logits/chosen": -4.136676788330078, "logits/rejected": -4.345128536224365, "logps/chosen": -328.94482421875, "logps/rejected": -281.4302673339844, "loss": 0.5052, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 1.9894015789031982, "rewards/margins": 1.363441824913025, "rewards/rejected": 0.625959575176239, "step": 120 }, { "epoch": 0.83, "grad_norm": 523.8022952711281, "learning_rate": 4.135668656967433e-09, "logits/chosen": -4.222664833068848, "logits/rejected": -4.383507251739502, "logps/chosen": -334.4266357421875, "logps/rejected": -287.63067626953125, "loss": 0.5059, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.026325225830078, "rewards/margins": 1.3963022232055664, "rewards/rejected": 0.6300228834152222, "step": 130 }, { "epoch": 0.9, "grad_norm": 601.0204054485596, "learning_rate": 1.5941282340065698e-09, "logits/chosen": -4.176892280578613, "logits/rejected": -4.3910746574401855, "logps/chosen": -335.8815612792969, "logps/rejected": -304.35723876953125, "loss": 0.5101, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 1.9034252166748047, "rewards/margins": 1.3076425790786743, "rewards/rejected": 0.5957827568054199, "step": 140 }, { "epoch": 0.96, "grad_norm": 496.1092972793449, "learning_rate": 2.262559558016325e-10, "logits/chosen": -4.114329814910889, "logits/rejected": -4.342306137084961, "logps/chosen": -342.4503479003906, "logps/rejected": -296.07086181640625, "loss": 0.506, "rewards/accuracies": 0.840624988079071, "rewards/chosen": 2.0278801918029785, "rewards/margins": 1.378021001815796, "rewards/rejected": 0.6498591303825378, "step": 150 }, { "epoch": 1.0, "step": 156, "total_flos": 0.0, "train_loss": 0.5437000210468586, "train_runtime": 4622.4032, "train_samples_per_second": 8.641, "train_steps_per_second": 0.034 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }