{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9905956112852664, "eval_steps": 500, "global_step": 79, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012539184952978056, "grad_norm": 17.60484197489494, "learning_rate": 1.25e-08, "logits/chosen": -2.501312494277954, "logits/rejected": -2.4659743309020996, "logps/chosen": -237.65167236328125, "logps/pi_response": -87.47057342529297, "logps/ref_response": -87.47057342529297, "logps/rejected": -259.50030517578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.12539184952978055, "grad_norm": 18.455497790052533, "learning_rate": 9.980434110374723e-08, "logits/chosen": -2.823746681213379, "logits/rejected": -2.7818007469177246, "logps/chosen": -247.26499938964844, "logps/pi_response": -128.94358825683594, "logps/ref_response": -128.91282653808594, "logps/rejected": -296.67608642578125, "loss": 0.6924, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": -0.0013697165995836258, "rewards/margins": 0.0020414802711457014, "rewards/rejected": -0.0034111966378986835, "step": 10 }, { "epoch": 0.2507836990595611, "grad_norm": 14.481290926955113, "learning_rate": 9.311572862600138e-08, "logits/chosen": -2.7733101844787598, "logits/rejected": -2.699920177459717, "logps/chosen": -233.1176300048828, "logps/pi_response": -114.28636169433594, "logps/ref_response": -113.6148452758789, "logps/rejected": -288.5294189453125, "loss": 0.6789, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.035814352333545685, "rewards/margins": 0.035103101283311844, "rewards/rejected": -0.07091744989156723, "step": 20 }, { "epoch": 0.3761755485893417, "grad_norm": 14.68893281635145, "learning_rate": 7.812246438203904e-08, "logits/chosen": -2.7502968311309814, "logits/rejected": -2.703037977218628, "logps/chosen": -231.7689208984375, "logps/pi_response": -110.3453369140625, "logps/ref_response": -107.25315856933594, "logps/rejected": -274.8539733886719, "loss": 0.6614, "rewards/accuracies": 0.625, "rewards/chosen": -0.12032069265842438, "rewards/margins": 0.06732077896595001, "rewards/rejected": -0.1876414716243744, "step": 30 }, { "epoch": 0.5015673981191222, "grad_norm": 17.44332771197273, "learning_rate": 5.771244664826511e-08, "logits/chosen": -2.75547456741333, "logits/rejected": -2.694068193435669, "logps/chosen": -232.6242218017578, "logps/pi_response": -101.28120422363281, "logps/ref_response": -101.38029479980469, "logps/rejected": -279.0029296875, "loss": 0.6389, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.15924027562141418, "rewards/margins": 0.11631258577108383, "rewards/rejected": -0.2755528390407562, "step": 40 }, { "epoch": 0.6269592476489029, "grad_norm": 16.899777837177197, "learning_rate": 3.581691108328516e-08, "logits/chosen": -2.7600324153900146, "logits/rejected": -2.7391488552093506, "logps/chosen": -246.18197631835938, "logps/pi_response": -99.16355895996094, "logps/ref_response": -100.3008804321289, "logps/rejected": -325.86993408203125, "loss": 0.6351, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.2081223726272583, "rewards/margins": 0.16830840706825256, "rewards/rejected": -0.3764307498931885, "step": 50 }, { "epoch": 0.7523510971786834, "grad_norm": 14.91990933947179, "learning_rate": 1.665322345816746e-08, "logits/chosen": -2.7647507190704346, "logits/rejected": -2.728092670440674, "logps/chosen": -233.04061889648438, "logps/pi_response": -109.2228012084961, "logps/ref_response": -110.14857482910156, "logps/rejected": -318.4779052734375, "loss": 0.6294, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.19416303932666779, "rewards/margins": 0.18744848668575287, "rewards/rejected": -0.38161152601242065, "step": 60 }, { "epoch": 0.877742946708464, "grad_norm": 14.025892286123561, "learning_rate": 3.912559994556086e-09, "logits/chosen": -2.8212244510650635, "logits/rejected": -2.7732884883880615, "logps/chosen": -252.75625610351562, "logps/pi_response": -115.48026275634766, "logps/ref_response": -117.57647705078125, "logps/rejected": -331.065673828125, "loss": 0.6235, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.1880345642566681, "rewards/margins": 0.22486238181591034, "rewards/rejected": -0.4128969609737396, "step": 70 }, { "epoch": 0.9905956112852664, "step": 79, "total_flos": 0.0, "train_loss": 0.6487136973610407, "train_runtime": 3558.6618, "train_samples_per_second": 5.726, "train_steps_per_second": 0.022 } ], "logging_steps": 10, "max_steps": 79, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }