{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 121, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 44.33979039021003, "learning_rate": 3.846153846153846e-08, "logits/chosen": -3.751237154006958, "logits/rejected": -3.652125358581543, "logps/chosen": -995.5263671875, "logps/rejected": -1318.9669189453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.08, "grad_norm": 35.835351014179686, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -3.71555233001709, "logits/rejected": -3.6416714191436768, "logps/chosen": -873.4622192382812, "logps/rejected": -1458.0814208984375, "loss": 0.6871, "rewards/accuracies": 0.6875, "rewards/chosen": -0.000241430607275106, "rewards/margins": 0.014367452822625637, "rewards/rejected": -0.014608883298933506, "step": 10 }, { "epoch": 0.17, "grad_norm": 29.834055642040035, "learning_rate": 4.948351554413879e-07, "logits/chosen": -3.8361048698425293, "logits/rejected": -3.7403998374938965, "logps/chosen": -946.3114013671875, "logps/rejected": -1385.838134765625, "loss": 0.5664, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.045135729014873505, "rewards/margins": 0.27529585361480713, "rewards/rejected": -0.32043159008026123, "step": 20 }, { "epoch": 0.25, "grad_norm": 19.851574913053625, "learning_rate": 4.700503477950277e-07, "logits/chosen": -4.062998294830322, "logits/rejected": -4.014255046844482, "logps/chosen": -949.5398559570312, "logps/rejected": -1611.4945068359375, "loss": 0.3144, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.3165217936038971, "rewards/margins": 1.7194368839263916, "rewards/rejected": -2.035958766937256, "step": 30 }, { "epoch": 0.33, "grad_norm": 20.09319706827758, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -4.247502326965332, "logits/rejected": -4.206511974334717, "logps/chosen": -1011.1369018554688, "logps/rejected": -1838.808349609375, "loss": 0.2629, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.8711185455322266, "rewards/margins": 3.5932796001434326, "rewards/rejected": -4.464398384094238, "step": 40 }, { "epoch": 0.41, "grad_norm": 17.14618338383071, "learning_rate": 3.686500924369101e-07, "logits/chosen": -4.1785712242126465, "logits/rejected": -4.129928112030029, "logps/chosen": -1003.81005859375, "logps/rejected": -1773.5804443359375, "loss": 0.2915, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.6944042444229126, "rewards/margins": 3.5044853687286377, "rewards/rejected": -4.19888973236084, "step": 50 }, { "epoch": 0.5, "grad_norm": 22.001296504737557, "learning_rate": 3.005543930830095e-07, "logits/chosen": -4.102808475494385, "logits/rejected": -4.0734052658081055, "logps/chosen": -1009.1546020507812, "logps/rejected": -1869.724365234375, "loss": 0.1679, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.5237128138542175, "rewards/margins": 4.181941509246826, "rewards/rejected": -4.705655097961426, "step": 60 }, { "epoch": 0.58, "grad_norm": 22.911683114341198, "learning_rate": 2.2821106431308543e-07, "logits/chosen": -4.085400104522705, "logits/rejected": -4.0569257736206055, "logps/chosen": -1070.66650390625, "logps/rejected": -1899.2978515625, "loss": 0.1352, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6565460562705994, "rewards/margins": 4.237053871154785, "rewards/rejected": -4.893599987030029, "step": 70 }, { "epoch": 0.66, "grad_norm": 13.574682459891369, "learning_rate": 1.5769846317182892e-07, "logits/chosen": -4.115840435028076, "logits/rejected": -4.082175254821777, "logps/chosen": -976.5784912109375, "logps/rejected": -2003.0341796875, "loss": 0.119, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6749869585037231, "rewards/margins": 5.019448757171631, "rewards/rejected": -5.694436073303223, "step": 80 }, { "epoch": 0.74, "grad_norm": 17.137536022407392, "learning_rate": 9.494112718293502e-08, "logits/chosen": -4.116685390472412, "logits/rejected": -4.105366230010986, "logps/chosen": -969.3150634765625, "logps/rejected": -2016.6741943359375, "loss": 0.105, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6770291328430176, "rewards/margins": 5.554581642150879, "rewards/rejected": -6.231610298156738, "step": 90 }, { "epoch": 0.83, "grad_norm": 19.248054573741623, "learning_rate": 4.521198892775202e-08, "logits/chosen": -4.130964279174805, "logits/rejected": -4.107068061828613, "logps/chosen": -997.1163940429688, "logps/rejected": -2062.62109375, "loss": 0.1232, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8503810167312622, "rewards/margins": 5.699716091156006, "rewards/rejected": -6.550097465515137, "step": 100 }, { "epoch": 0.83, "eval_logits/chosen": -4.124736309051514, "eval_logits/rejected": -4.084261894226074, "eval_logps/chosen": -456.256591796875, "eval_logps/rejected": -691.86181640625, "eval_loss": 0.46714693307876587, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -0.502022922039032, "eval_rewards/margins": 0.8244528770446777, "eval_rewards/rejected": -1.326475739479065, "eval_runtime": 14.3746, "eval_samples_per_second": 5.287, "eval_steps_per_second": 0.209, "step": 100 }, { "epoch": 0.91, "grad_norm": 12.936607152276345, "learning_rate": 1.2689339106741526e-08, "logits/chosen": -4.11704158782959, "logits/rejected": -4.1085028648376465, "logps/chosen": -983.2184448242188, "logps/rejected": -2123.88623046875, "loss": 0.1405, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.7897412776947021, "rewards/margins": 5.98792028427124, "rewards/rejected": -6.7776618003845215, "step": 110 }, { "epoch": 0.99, "grad_norm": 18.334577187135583, "learning_rate": 1.0576247944985018e-10, "logits/chosen": -4.1415205001831055, "logits/rejected": -4.129425048828125, "logps/chosen": -978.7972412109375, "logps/rejected": -2053.748046875, "loss": 0.1121, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.8511762619018555, "rewards/margins": 5.528683185577393, "rewards/rejected": -6.37985897064209, "step": 120 }, { "epoch": 1.0, "step": 121, "total_flos": 0.0, "train_loss": 0.021612109477854958, "train_runtime": 343.2656, "train_samples_per_second": 22.536, "train_steps_per_second": 0.352 } ], "logging_steps": 10, "max_steps": 121, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }