{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00034021127119941485, "eval_steps": 500, "global_step": 10, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.4021127119941484e-05, "grad_norm": 33.091712951660156, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -3.133824586868286, "logits/rejected": -3.1356313228607178, "logps/chosen": -317.8847961425781, "logps/rejected": -306.8866271972656, "loss": 0.8544, "rewards/accuracies": 0.75, "rewards/chosen": -0.10576057434082031, "rewards/margins": 0.2207910716533661, "rewards/rejected": -0.3265516459941864, "step": 1 }, { "epoch": 6.804225423988297e-05, "grad_norm": 38.49541473388672, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -3.1433358192443848, "logits/rejected": -3.1124918460845947, "logps/chosen": -212.93820190429688, "logps/rejected": -198.22511291503906, "loss": 1.02, "rewards/accuracies": 0.5, "rewards/chosen": -0.28638574481010437, "rewards/margins": -0.37813109159469604, "rewards/rejected": 0.0917452871799469, "step": 2 }, { "epoch": 0.00010206338135982445, "grad_norm": 27.203516006469727, "learning_rate": 3e-06, "logits/chosen": -3.2104451656341553, "logits/rejected": -3.2103281021118164, "logps/chosen": -235.89254760742188, "logps/rejected": -206.0758514404297, "loss": 0.605, "rewards/accuracies": 0.75, "rewards/chosen": 0.40017586946487427, "rewards/margins": 0.24887371063232422, "rewards/rejected": 0.15130215883255005, "step": 3 }, { "epoch": 0.00013608450847976594, "grad_norm": 26.42736053466797, "learning_rate": 4.000000000000001e-06, "logits/chosen": -3.0741634368896484, "logits/rejected": -2.953117847442627, "logps/chosen": -428.7364196777344, "logps/rejected": -192.357666015625, "loss": 0.4914, "rewards/accuracies": 0.75, "rewards/chosen": 0.4332427978515625, "rewards/margins": 0.5804362893104553, "rewards/rejected": -0.1471935510635376, "step": 4 }, { "epoch": 0.00017010563559970743, "grad_norm": 35.623634338378906, "learning_rate": 5e-06, "logits/chosen": -2.985285758972168, "logits/rejected": -2.8854246139526367, "logps/chosen": -342.38604736328125, "logps/rejected": -211.78237915039062, "loss": 0.8273, "rewards/accuracies": 0.25, "rewards/chosen": -0.3475755751132965, "rewards/margins": -0.19371166825294495, "rewards/rejected": -0.15386392176151276, "step": 5 }, { "epoch": 0.0002041267627196489, "grad_norm": 31.70409393310547, "learning_rate": 6e-06, "logits/chosen": -3.074589967727661, "logits/rejected": -2.85581636428833, "logps/chosen": -593.6435546875, "logps/rejected": -259.5284729003906, "loss": 0.5473, "rewards/accuracies": 0.75, "rewards/chosen": 0.2495414763689041, "rewards/margins": 0.35888367891311646, "rewards/rejected": -0.10934218764305115, "step": 6 }, { "epoch": 0.00023814788983959038, "grad_norm": 32.41214370727539, "learning_rate": 7.000000000000001e-06, "logits/chosen": -2.976675033569336, "logits/rejected": -2.978133201599121, "logps/chosen": -218.11688232421875, "logps/rejected": -231.7698974609375, "loss": 0.7124, "rewards/accuracies": 0.25, "rewards/chosen": -0.08671779930591583, "rewards/margins": 0.21950224041938782, "rewards/rejected": -0.30622005462646484, "step": 7 }, { "epoch": 0.00027216901695953187, "grad_norm": 27.637351989746094, "learning_rate": 8.000000000000001e-06, "logits/chosen": -2.997600793838501, "logits/rejected": -3.1078853607177734, "logps/chosen": -165.71444702148438, "logps/rejected": -335.5938720703125, "loss": 0.6085, "rewards/accuracies": 0.75, "rewards/chosen": -0.057393088936805725, "rewards/margins": 0.34109365940093994, "rewards/rejected": -0.3984867036342621, "step": 8 }, { "epoch": 0.00030619014407947336, "grad_norm": 32.246559143066406, "learning_rate": 9e-06, "logits/chosen": -3.0760390758514404, "logits/rejected": -2.938197135925293, "logps/chosen": -463.1333312988281, "logps/rejected": -173.99951171875, "loss": 0.6398, "rewards/accuracies": 0.5, "rewards/chosen": 0.5581258535385132, "rewards/margins": 0.7653559446334839, "rewards/rejected": -0.20723000168800354, "step": 9 }, { "epoch": 0.00034021127119941485, "grad_norm": 39.84626388549805, "learning_rate": 1e-05, "logits/chosen": -3.192596673965454, "logits/rejected": -2.9751501083374023, "logps/chosen": -473.14532470703125, "logps/rejected": -110.90155029296875, "loss": 0.9794, "rewards/accuracies": 0.25, "rewards/chosen": -0.3494076132774353, "rewards/margins": -0.31546899676322937, "rewards/rejected": -0.03393859416246414, "step": 10 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }