{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 3.125e-08, "logits/chosen": -2.9335875511169434, "logits/rejected": -2.708059310913086, "logps/chosen": -367.5562744140625, "logps/rejected": -316.3052062988281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.7960729598999023, "logits/rejected": -2.7260401248931885, "logps/chosen": -274.3499755859375, "logps/rejected": -233.05935668945312, "loss": 0.6928, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 7.348848885158077e-05, "rewards/margins": 0.00036154486588202417, "rewards/rejected": -0.0002880564716178924, "step": 10 }, { "epoch": 0.13, "learning_rate": 4.989935734988097e-07, "logits/chosen": -2.7711944580078125, "logits/rejected": -2.701967716217041, "logps/chosen": -303.2231140136719, "logps/rejected": -258.30401611328125, "loss": 0.6874, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.02738715335726738, "rewards/margins": 0.014244809746742249, "rewards/rejected": 0.013142342679202557, "step": 20 }, { "epoch": 0.19, "learning_rate": 4.877641290737883e-07, "logits/chosen": -2.702401638031006, "logits/rejected": -2.6644959449768066, "logps/chosen": -298.130859375, "logps/rejected": -281.53778076171875, "loss": 0.6707, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0661591961979866, "rewards/margins": 0.05363141745328903, "rewards/rejected": 0.012527775950729847, "step": 30 }, { "epoch": 0.26, "learning_rate": 4.646121984004665e-07, "logits/chosen": -2.713608741760254, "logits/rejected": -2.580498218536377, "logps/chosen": -287.88385009765625, "logps/rejected": -264.4161376953125, "loss": 0.648, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0008304916555061936, "rewards/margins": 0.12442084401845932, "rewards/rejected": -0.12359035015106201, "step": 40 }, { "epoch": 0.32, "learning_rate": 4.3069871595684787e-07, "logits/chosen": -2.5537896156311035, "logits/rejected": -2.501384735107422, "logps/chosen": -269.0035705566406, "logps/rejected": -289.73883056640625, "loss": 0.6372, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.021835196763277054, "rewards/margins": 0.18140152096748352, "rewards/rejected": -0.20323672890663147, "step": 50 }, { "epoch": 0.38, "learning_rate": 3.877242453630256e-07, "logits/chosen": -2.577971935272217, "logits/rejected": -2.5247650146484375, "logps/chosen": -304.6644592285156, "logps/rejected": -324.11370849609375, "loss": 0.608, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.12026672065258026, "rewards/margins": 0.289926141500473, "rewards/rejected": -0.4101928174495697, "step": 60 }, { "epoch": 0.45, "learning_rate": 3.378437060203357e-07, "logits/chosen": -2.5257785320281982, "logits/rejected": -2.4621853828430176, "logps/chosen": -316.51153564453125, "logps/rejected": -312.25164794921875, "loss": 0.6138, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1019580215215683, "rewards/margins": 0.32222574949264526, "rewards/rejected": -0.42418378591537476, "step": 70 }, { "epoch": 0.51, "learning_rate": 2.8355831645441387e-07, "logits/chosen": -2.629840135574341, "logits/rejected": -2.492844343185425, "logps/chosen": -348.1588439941406, "logps/rejected": -312.782470703125, "loss": 0.5935, "rewards/accuracies": 0.71875, "rewards/chosen": -0.14375950396060944, "rewards/margins": 0.3931245803833008, "rewards/rejected": -0.5368840098381042, "step": 80 }, { "epoch": 0.58, "learning_rate": 2.2759017277414164e-07, "logits/chosen": -2.368922710418701, "logits/rejected": -2.346982479095459, "logps/chosen": -337.67193603515625, "logps/rejected": -351.6241760253906, "loss": 0.5782, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3990400731563568, "rewards/margins": 0.3560941815376282, "rewards/rejected": -0.7551342844963074, "step": 90 }, { "epoch": 0.64, "learning_rate": 1.7274575140626315e-07, "logits/chosen": -2.164156675338745, "logits/rejected": -2.03346586227417, "logps/chosen": -310.364013671875, "logps/rejected": -301.4464111328125, "loss": 0.5887, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2869732081890106, "rewards/margins": 0.44102102518081665, "rewards/rejected": -0.7279942035675049, "step": 100 }, { "epoch": 0.7, "learning_rate": 1.2177518064852348e-07, "logits/chosen": -1.8856881856918335, "logits/rejected": -1.5883822441101074, "logps/chosen": -327.2856750488281, "logps/rejected": -338.85137939453125, "loss": 0.5656, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3426055610179901, "rewards/margins": 0.4826560616493225, "rewards/rejected": -0.8252617120742798, "step": 110 }, { "epoch": 0.77, "learning_rate": 7.723433775328384e-08, "logits/chosen": -1.5672389268875122, "logits/rejected": -1.4351329803466797, "logps/chosen": -334.88446044921875, "logps/rejected": -357.7071838378906, "loss": 0.5718, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.46880921721458435, "rewards/margins": 0.29977938532829285, "rewards/rejected": -0.768588662147522, "step": 120 }, { "epoch": 0.83, "learning_rate": 4.1356686569674335e-08, "logits/chosen": -1.5653654336929321, "logits/rejected": -1.2208584547042847, "logps/chosen": -330.20635986328125, "logps/rejected": -355.11126708984375, "loss": 0.5632, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.34367313981056213, "rewards/margins": 0.5312191247940063, "rewards/rejected": -0.8748922348022461, "step": 130 }, { "epoch": 0.9, "learning_rate": 1.5941282340065697e-08, "logits/chosen": -1.5312021970748901, "logits/rejected": -1.239989995956421, "logps/chosen": -325.05908203125, "logps/rejected": -337.0903015136719, "loss": 0.5504, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2957434356212616, "rewards/margins": 0.4055696427822113, "rewards/rejected": -0.7013131380081177, "step": 140 }, { "epoch": 0.96, "learning_rate": 2.2625595580163247e-09, "logits/chosen": -1.4374409914016724, "logits/rejected": -1.128422498703003, "logps/chosen": -334.95135498046875, "logps/rejected": -365.4905700683594, "loss": 0.5687, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.3784746527671814, "rewards/margins": 0.64166659116745, "rewards/rejected": -1.020141363143921, "step": 150 }, { "epoch": 1.0, "step": 156, "total_flos": 0.0, "train_loss": 0.3960995582433847, "train_runtime": 6783.1442, "train_samples_per_second": 2.948, "train_steps_per_second": 0.023 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }