{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9960474308300395, "eval_steps": 500, "global_step": 189, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 26.908694644642612, "learning_rate": 2.6315789473684208e-08, "logits/chosen": -4.638427734375, "logits/rejected": -4.891327857971191, "logps/chosen": -198.52749633789062, "logps/rejected": -147.3392791748047, "loss": 0.6929, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "grad_norm": 27.07664091604203, "learning_rate": 2.631578947368421e-07, "logits/chosen": -4.496448993682861, "logits/rejected": -4.815927028656006, "logps/chosen": -224.28125, "logps/rejected": -167.94735717773438, "loss": 0.6915, "rewards/accuracies": 0.5034722089767456, "rewards/chosen": 0.0034646072890609503, "rewards/margins": 0.002639756305143237, "rewards/rejected": 0.0008248506928794086, "step": 10 }, { "epoch": 0.11, "grad_norm": 26.30402064096193, "learning_rate": 4.999573126145131e-07, "logits/chosen": -4.625959873199463, "logits/rejected": -4.94482421875, "logps/chosen": -231.04525756835938, "logps/rejected": -196.3661651611328, "loss": 0.6465, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04837086424231529, "rewards/margins": 0.11359457671642303, "rewards/rejected": -0.16196544468402863, "step": 20 }, { "epoch": 0.16, "grad_norm": 25.70165553073792, "learning_rate": 4.948524419003415e-07, "logits/chosen": -4.869608402252197, "logits/rejected": -5.148451805114746, "logps/chosen": -273.7060241699219, "logps/rejected": -259.2108154296875, "loss": 0.5717, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.5196550488471985, "rewards/margins": 0.3445150554180145, "rewards/rejected": -0.8641700744628906, "step": 30 }, { "epoch": 0.21, "grad_norm": 32.38040367732233, "learning_rate": 4.81409414945389e-07, "logits/chosen": -4.95624303817749, "logits/rejected": -5.334275245666504, "logps/chosen": -321.26739501953125, "logps/rejected": -317.9222106933594, "loss": 0.5311, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9283856153488159, "rewards/margins": 0.48562851548194885, "rewards/rejected": -1.4140141010284424, "step": 40 }, { "epoch": 0.26, "grad_norm": 27.107807886309228, "learning_rate": 4.6008601790947314e-07, "logits/chosen": -5.323241233825684, "logits/rejected": -5.817015171051025, "logps/chosen": -357.8787536621094, "logps/rejected": -385.47576904296875, "loss": 0.4831, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.411299467086792, "rewards/margins": 0.7530988454818726, "rewards/rejected": -2.164398193359375, "step": 50 }, { "epoch": 0.32, "grad_norm": 32.232061879934236, "learning_rate": 4.3160839350405605e-07, "logits/chosen": -5.831389904022217, "logits/rejected": -6.2499542236328125, "logps/chosen": -395.7707824707031, "logps/rejected": -446.3265686035156, "loss": 0.4294, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -1.821434736251831, "rewards/margins": 0.9432821273803711, "rewards/rejected": -2.7647171020507812, "step": 60 }, { "epoch": 0.37, "grad_norm": 35.03072007251475, "learning_rate": 3.9694631307311825e-07, "logits/chosen": -6.090306758880615, "logits/rejected": -6.541258335113525, "logps/chosen": -430.2369689941406, "logps/rejected": -496.2119140625, "loss": 0.424, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.1488282680511475, "rewards/margins": 1.0692826509475708, "rewards/rejected": -3.218111038208008, "step": 70 }, { "epoch": 0.42, "grad_norm": 30.667469826354093, "learning_rate": 3.572801521931522e-07, "logits/chosen": -6.3887619972229, "logits/rejected": -6.877404689788818, "logps/chosen": -439.2911071777344, "logps/rejected": -526.5487060546875, "loss": 0.4001, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -2.3115358352661133, "rewards/margins": 1.2141239643096924, "rewards/rejected": -3.5256600379943848, "step": 80 }, { "epoch": 0.47, "grad_norm": 37.36819911889553, "learning_rate": 3.139606943986089e-07, "logits/chosen": -6.5696258544921875, "logits/rejected": -7.1035637855529785, "logps/chosen": -458.3387756347656, "logps/rejected": -556.1650390625, "loss": 0.3875, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -2.5067451000213623, "rewards/margins": 1.3494058847427368, "rewards/rejected": -3.8561508655548096, "step": 90 }, { "epoch": 0.53, "grad_norm": 33.15053822353323, "learning_rate": 2.684631318687185e-07, "logits/chosen": -6.621747016906738, "logits/rejected": -7.236710548400879, "logps/chosen": -467.0467834472656, "logps/rejected": -582.046142578125, "loss": 0.3867, "rewards/accuracies": 0.796875, "rewards/chosen": -2.4837827682495117, "rewards/margins": 1.5418504476547241, "rewards/rejected": -4.025633811950684, "step": 100 }, { "epoch": 0.58, "grad_norm": 37.45830028947681, "learning_rate": 2.2233682952712483e-07, "logits/chosen": -6.568659782409668, "logits/rejected": -7.284300327301025, "logps/chosen": -460.4766540527344, "logps/rejected": -578.6600341796875, "loss": 0.3771, "rewards/accuracies": 0.846875011920929, "rewards/chosen": -2.3609726428985596, "rewards/margins": 1.647943139076233, "rewards/rejected": -4.008915901184082, "step": 110 }, { "epoch": 0.63, "grad_norm": 33.8427535333109, "learning_rate": 1.7715256327766884e-07, "logits/chosen": -6.796021461486816, "logits/rejected": -7.497170925140381, "logps/chosen": -504.50543212890625, "logps/rejected": -621.22314453125, "loss": 0.3508, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -2.883434295654297, "rewards/margins": 1.6248239278793335, "rewards/rejected": -4.50825834274292, "step": 120 }, { "epoch": 0.69, "grad_norm": 35.353347844932394, "learning_rate": 1.3444902911492174e-07, "logits/chosen": -6.833544731140137, "logits/rejected": -7.472651481628418, "logps/chosen": -521.9656372070312, "logps/rejected": -659.3110961914062, "loss": 0.3705, "rewards/accuracies": 0.840624988079071, "rewards/chosen": -2.989759922027588, "rewards/margins": 1.8119176626205444, "rewards/rejected": -4.801677227020264, "step": 130 }, { "epoch": 0.74, "grad_norm": 32.6045025544378, "learning_rate": 9.56804446775518e-08, "logits/chosen": -6.738868713378906, "logits/rejected": -7.498864650726318, "logps/chosen": -470.77337646484375, "logps/rejected": -584.4710083007812, "loss": 0.3591, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.558176040649414, "rewards/margins": 1.6057535409927368, "rewards/rejected": -4.163929462432861, "step": 140 }, { "epoch": 0.79, "grad_norm": 32.49183208247093, "learning_rate": 6.216702761078166e-08, "logits/chosen": -7.049106597900391, "logits/rejected": -7.772597312927246, "logps/chosen": -487.25726318359375, "logps/rejected": -619.6534423828125, "loss": 0.3576, "rewards/accuracies": 0.8343750238418579, "rewards/chosen": -2.82848858833313, "rewards/margins": 1.7726719379425049, "rewards/rejected": -4.601161003112793, "step": 150 }, { "epoch": 0.84, "grad_norm": 37.26747220029015, "learning_rate": 3.5050037137906885e-08, "logits/chosen": -6.9701337814331055, "logits/rejected": -7.731366157531738, "logps/chosen": -494.5716247558594, "logps/rejected": -623.4630737304688, "loss": 0.3502, "rewards/accuracies": 0.84375, "rewards/chosen": -2.750206470489502, "rewards/margins": 1.7980148792266846, "rewards/rejected": -4.548220634460449, "step": 160 }, { "epoch": 0.9, "grad_norm": 31.918546112926368, "learning_rate": 1.5252909846235894e-08, "logits/chosen": -7.007571220397949, "logits/rejected": -7.6982011795043945, "logps/chosen": -509.54388427734375, "logps/rejected": -666.7489624023438, "loss": 0.3631, "rewards/accuracies": 0.890625, "rewards/chosen": -2.9002063274383545, "rewards/margins": 2.0059866905212402, "rewards/rejected": -4.906193733215332, "step": 170 }, { "epoch": 0.95, "grad_norm": 29.32551345390984, "learning_rate": 3.4498131616493565e-09, "logits/chosen": -6.939836025238037, "logits/rejected": -7.576680660247803, "logps/chosen": -514.7128295898438, "logps/rejected": -656.9924926757812, "loss": 0.3518, "rewards/accuracies": 0.8031250238418579, "rewards/chosen": -2.932391881942749, "rewards/margins": 1.7769733667373657, "rewards/rejected": -4.709364891052246, "step": 180 }, { "epoch": 1.0, "step": 189, "total_flos": 0.0, "train_loss": 0.42979541909757746, "train_runtime": 5368.3646, "train_samples_per_second": 9.04, "train_steps_per_second": 0.035 } ], "logging_steps": 10, "max_steps": 189, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }