{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.957345971563981, "eval_steps": 100, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 265.9657020142556, "learning_rate": 3.125e-08, "logits/chosen": 123.11854553222656, "logits/rejected": 97.00198364257812, "logps/chosen": -425.18585205078125, "logps/rejected": -424.1869201660156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.19, "grad_norm": 372.96475604860683, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 117.39414978027344, "logits/rejected": 136.32305908203125, "logps/chosen": -442.28045654296875, "logps/rejected": -524.1576538085938, "loss": 0.7748, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.1107947826385498, "rewards/margins": 0.115071140229702, "rewards/rejected": -0.004276359919458628, "step": 10 }, { "epoch": 0.38, "grad_norm": 213.17360776034892, "learning_rate": 4.989935734988097e-07, "logits/chosen": 125.45426177978516, "logits/rejected": 133.0935821533203, "logps/chosen": -426.3124084472656, "logps/rejected": -493.6737365722656, "loss": 0.6502, "rewards/accuracies": 0.625, "rewards/chosen": -0.004780772142112255, "rewards/margins": 0.6115384101867676, "rewards/rejected": -0.6163192987442017, "step": 20 }, { "epoch": 0.57, "grad_norm": 204.32015271324505, "learning_rate": 4.877641290737883e-07, "logits/chosen": 123.5244369506836, "logits/rejected": 126.99784851074219, "logps/chosen": -466.4947814941406, "logps/rejected": -534.4097900390625, "loss": 0.6377, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.137466549873352, "rewards/margins": 1.5820324420928955, "rewards/rejected": -2.719499111175537, "step": 30 }, { "epoch": 0.76, "grad_norm": 200.5277184123756, "learning_rate": 4.646121984004665e-07, "logits/chosen": 126.0523910522461, "logits/rejected": 120.98759460449219, "logps/chosen": -489.78033447265625, "logps/rejected": -514.3892211914062, "loss": 0.6145, "rewards/accuracies": 0.6875, "rewards/chosen": -1.659891128540039, "rewards/margins": 1.3025611639022827, "rewards/rejected": -2.9624521732330322, "step": 40 }, { "epoch": 0.95, "grad_norm": 205.88684610219255, "learning_rate": 4.3069871595684787e-07, "logits/chosen": 134.4938201904297, "logits/rejected": 134.79849243164062, "logps/chosen": -502.4814453125, "logps/rejected": -523.4627685546875, "loss": 0.5901, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9658725261688232, "rewards/margins": 1.2555662393569946, "rewards/rejected": -3.2214386463165283, "step": 50 }, { "epoch": 1.14, "grad_norm": 87.33372866728267, "learning_rate": 3.877242453630256e-07, "logits/chosen": 133.0808563232422, "logits/rejected": 136.4200897216797, "logps/chosen": -467.0462951660156, "logps/rejected": -506.5567321777344, "loss": 0.2517, "rewards/accuracies": 0.875, "rewards/chosen": -1.0280801057815552, "rewards/margins": 3.1976516246795654, "rewards/rejected": -4.22573184967041, "step": 60 }, { "epoch": 1.33, "grad_norm": 71.00765502932312, "learning_rate": 3.378437060203357e-07, "logits/chosen": 128.8972625732422, "logits/rejected": 129.42971801757812, "logps/chosen": -440.6957092285156, "logps/rejected": -552.7991943359375, "loss": 0.1231, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1781926155090332, "rewards/margins": 3.8879973888397217, "rewards/rejected": -5.066189765930176, "step": 70 }, { "epoch": 1.52, "grad_norm": 68.54852270837435, "learning_rate": 2.8355831645441387e-07, "logits/chosen": 133.30160522460938, "logits/rejected": 134.1454620361328, "logps/chosen": -499.032958984375, "logps/rejected": -585.234130859375, "loss": 0.1193, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9932767748832703, "rewards/margins": 4.856534481048584, "rewards/rejected": -5.849810600280762, "step": 80 }, { "epoch": 1.71, "grad_norm": 96.90359229935069, "learning_rate": 2.2759017277414164e-07, "logits/chosen": 120.51957702636719, "logits/rejected": 123.01716613769531, "logps/chosen": -477.44366455078125, "logps/rejected": -548.8724365234375, "loss": 0.1182, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.5276060104370117, "rewards/margins": 4.922442436218262, "rewards/rejected": -6.450047969818115, "step": 90 }, { "epoch": 1.9, "grad_norm": 57.0241100183833, "learning_rate": 1.7274575140626315e-07, "logits/chosen": 134.63583374023438, "logits/rejected": 125.1494140625, "logps/chosen": -492.84930419921875, "logps/rejected": -590.646728515625, "loss": 0.1371, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4459283351898193, "rewards/margins": 5.340741157531738, "rewards/rejected": -6.7866692543029785, "step": 100 }, { "epoch": 1.9, "eval_logits/chosen": 102.37699127197266, "eval_logits/rejected": 96.37848663330078, "eval_logps/chosen": -483.2002868652344, "eval_logps/rejected": -502.7766418457031, "eval_loss": 0.5052046775817871, "eval_rewards/accuracies": 0.7708333134651184, "eval_rewards/chosen": -2.9795796871185303, "eval_rewards/margins": 2.143129587173462, "eval_rewards/rejected": -5.122709274291992, "eval_runtime": 48.3488, "eval_samples_per_second": 15.512, "eval_steps_per_second": 0.496, "step": 100 }, { "epoch": 2.09, "grad_norm": 49.58918461406247, "learning_rate": 1.2177518064852348e-07, "logits/chosen": 113.58479309082031, "logits/rejected": 127.19537353515625, "logps/chosen": -511.4662170410156, "logps/rejected": -620.75537109375, "loss": 0.0929, "rewards/accuracies": 0.96875, "rewards/chosen": -1.6378005743026733, "rewards/margins": 5.002486228942871, "rewards/rejected": -6.640286922454834, "step": 110 }, { "epoch": 2.27, "grad_norm": 30.33881666826054, "learning_rate": 7.723433775328384e-08, "logits/chosen": 124.79753112792969, "logits/rejected": 125.82017517089844, "logps/chosen": -493.6553649902344, "logps/rejected": -574.6585693359375, "loss": 0.053, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6378300189971924, "rewards/margins": 5.504368305206299, "rewards/rejected": -7.142198085784912, "step": 120 }, { "epoch": 2.46, "grad_norm": 23.06668849787155, "learning_rate": 4.1356686569674335e-08, "logits/chosen": 129.1642608642578, "logits/rejected": 134.2584228515625, "logps/chosen": -508.4425354003906, "logps/rejected": -594.6097412109375, "loss": 0.0541, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4683927297592163, "rewards/margins": 5.420845985412598, "rewards/rejected": -6.8892388343811035, "step": 130 }, { "epoch": 2.65, "grad_norm": 38.07392153670512, "learning_rate": 1.5941282340065697e-08, "logits/chosen": 114.87281799316406, "logits/rejected": 115.1664047241211, "logps/chosen": -465.3794860839844, "logps/rejected": -584.0101318359375, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": -1.9863094091415405, "rewards/margins": 5.705286979675293, "rewards/rejected": -7.691596984863281, "step": 140 }, { "epoch": 2.84, "grad_norm": 46.62238024751895, "learning_rate": 2.2625595580163247e-09, "logits/chosen": 121.98884582519531, "logits/rejected": 135.7744598388672, "logps/chosen": -492.438720703125, "logps/rejected": -580.8795166015625, "loss": 0.0589, "rewards/accuracies": 0.96875, "rewards/chosen": -1.7609163522720337, "rewards/margins": 5.512405872344971, "rewards/rejected": -7.273321628570557, "step": 150 }, { "epoch": 2.96, "step": 156, "total_flos": 0.0, "train_loss": 0.27917395799587935, "train_runtime": 1799.8719, "train_samples_per_second": 11.251, "train_steps_per_second": 0.087 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }