{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9748953974895398, "eval_steps": 500, "global_step": 118, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016736401673640166, "grad_norm": 118.10906601297802, "learning_rate": 8.333333333333332e-09, "logits/chosen": -2.0019314289093018, "logits/rejected": -1.9766970872879028, "logps/chosen": -309.98992919921875, "logps/pi_response": -172.89300537109375, "logps/ref_response": -172.89300537109375, "logps/rejected": -664.0586547851562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.16736401673640167, "grad_norm": 119.81663055217236, "learning_rate": 8.333333333333334e-08, "logits/chosen": -1.9912662506103516, "logits/rejected": -1.9100477695465088, "logps/chosen": -339.4380187988281, "logps/pi_response": -145.0324249267578, "logps/ref_response": -144.99766540527344, "logps/rejected": -690.2503051757812, "loss": 0.6896, "rewards/accuracies": 0.5208333134651184, "rewards/chosen": -0.005152760073542595, "rewards/margins": 0.007620910182595253, "rewards/rejected": -0.012773669324815273, "step": 10 }, { "epoch": 0.33472803347280333, "grad_norm": 61.89513911795292, "learning_rate": 9.860114570402053e-08, "logits/chosen": -1.942497968673706, "logits/rejected": -1.850992202758789, "logps/chosen": -344.79351806640625, "logps/pi_response": -157.76609802246094, "logps/ref_response": -152.63552856445312, "logps/rejected": -725.7340698242188, "loss": 0.6134, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.1949012279510498, "rewards/margins": 0.2145640105009079, "rewards/rejected": -0.4094652235507965, "step": 20 }, { "epoch": 0.502092050209205, "grad_norm": 41.75290331160563, "learning_rate": 9.305218058836777e-08, "logits/chosen": -1.8073937892913818, "logits/rejected": -1.748516321182251, "logps/chosen": -373.4971923828125, "logps/pi_response": -154.7528839111328, "logps/ref_response": -136.81463623046875, "logps/rejected": -826.9029541015625, "loss": 0.5314, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5469005703926086, "rewards/margins": 0.8107492327690125, "rewards/rejected": -1.357649803161621, "step": 30 }, { "epoch": 0.6694560669456067, "grad_norm": 60.21218903804784, "learning_rate": 8.374915007591053e-08, "logits/chosen": -1.7726547718048096, "logits/rejected": -1.6935310363769531, "logps/chosen": -415.587890625, "logps/pi_response": -174.06515502929688, "logps/ref_response": -150.7990264892578, "logps/rejected": -888.4119262695312, "loss": 0.5313, "rewards/accuracies": 0.75, "rewards/chosen": -0.7993820905685425, "rewards/margins": 1.1050827503204346, "rewards/rejected": -1.9044649600982666, "step": 40 }, { "epoch": 0.8368200836820083, "grad_norm": 60.309635521609, "learning_rate": 7.150326011382603e-08, "logits/chosen": -1.7549054622650146, "logits/rejected": -1.6928848028182983, "logps/chosen": -417.91644287109375, "logps/pi_response": -171.1042938232422, "logps/ref_response": -149.61317443847656, "logps/rejected": -928.3387451171875, "loss": 0.5012, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -0.7739541530609131, "rewards/margins": 1.2372756004333496, "rewards/rejected": -2.0112297534942627, "step": 50 }, { "epoch": 1.00418410041841, "grad_norm": 43.55054669917714, "learning_rate": 5.738232820012406e-08, "logits/chosen": -1.7201545238494873, "logits/rejected": -1.660457968711853, "logps/chosen": -363.9744873046875, "logps/pi_response": -158.1030731201172, "logps/ref_response": -138.2023468017578, "logps/rejected": -817.6099853515625, "loss": 0.4936, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6088830232620239, "rewards/margins": 1.028526782989502, "rewards/rejected": -1.6374098062515259, "step": 60 }, { "epoch": 1.1715481171548117, "grad_norm": 39.95163711230341, "learning_rate": 4.2617671799875946e-08, "logits/chosen": -1.768969178199768, "logits/rejected": -1.6774520874023438, "logps/chosen": -409.07965087890625, "logps/pi_response": -178.50694274902344, "logps/ref_response": -157.27854919433594, "logps/rejected": -837.8878784179688, "loss": 0.4784, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6469767689704895, "rewards/margins": 1.0017244815826416, "rewards/rejected": -1.6487010717391968, "step": 70 }, { "epoch": 1.3389121338912133, "grad_norm": 54.62109419599635, "learning_rate": 2.8496739886173992e-08, "logits/chosen": -1.7565717697143555, "logits/rejected": -1.6744270324707031, "logps/chosen": -414.45989990234375, "logps/pi_response": -171.05670166015625, "logps/ref_response": -150.3345947265625, "logps/rejected": -869.4351806640625, "loss": 0.4682, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6550418138504028, "rewards/margins": 1.0387578010559082, "rewards/rejected": -1.693799614906311, "step": 80 }, { "epoch": 1.506276150627615, "grad_norm": 47.344285151326375, "learning_rate": 1.6250849924089483e-08, "logits/chosen": -1.7577073574066162, "logits/rejected": -1.6519912481307983, "logps/chosen": -414.08984375, "logps/pi_response": -160.68191528320312, "logps/ref_response": -142.70220947265625, "logps/rejected": -822.3709716796875, "loss": 0.4783, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6929503083229065, "rewards/margins": 0.93775475025177, "rewards/rejected": -1.6307048797607422, "step": 90 }, { "epoch": 1.6736401673640167, "grad_norm": 54.4733158152604, "learning_rate": 6.947819411632222e-09, "logits/chosen": -1.8023641109466553, "logits/rejected": -1.725804328918457, "logps/chosen": -395.9789733886719, "logps/pi_response": -165.1318817138672, "logps/ref_response": -145.55709838867188, "logps/rejected": -855.1980590820312, "loss": 0.4809, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -0.6401264071464539, "rewards/margins": 1.0448520183563232, "rewards/rejected": -1.6849782466888428, "step": 100 }, { "epoch": 1.8410041841004183, "grad_norm": 38.619103871977664, "learning_rate": 1.3988542959794625e-09, "logits/chosen": -1.7393662929534912, "logits/rejected": -1.651567816734314, "logps/chosen": -383.98895263671875, "logps/pi_response": -170.61387634277344, "logps/ref_response": -149.04595947265625, "logps/rejected": -837.4610595703125, "loss": 0.4635, "rewards/accuracies": 0.7906249761581421, "rewards/chosen": -0.6302980780601501, "rewards/margins": 1.0058103799819946, "rewards/rejected": -1.6361083984375, "step": 110 }, { "epoch": 1.9748953974895398, "step": 118, "total_flos": 0.0, "train_loss": 0.5197100033194332, "train_runtime": 5205.1722, "train_samples_per_second": 5.872, "train_steps_per_second": 0.023 } ], "logging_steps": 10, "max_steps": 118, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }