{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9962825278810409, "eval_steps": 100, "global_step": 134, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 7.393402431483719, "learning_rate": 3.571428571428571e-08, "logits/chosen": -0.5970903635025024, "logits/rejected": -0.02967279776930809, "logps/chosen": -254.73361206054688, "logps/rejected": -449.335693359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.07, "grad_norm": 6.97808175581924, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -0.502315104007721, "logits/rejected": -0.24948199093341827, "logps/chosen": -339.6366271972656, "logps/rejected": -657.9154663085938, "loss": 0.6929, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0001597129157744348, "rewards/margins": 0.00033807966974563897, "rewards/rejected": -0.0004977926146239042, "step": 10 }, { "epoch": 0.15, "grad_norm": 7.671337358423795, "learning_rate": 4.969220851487844e-07, "logits/chosen": -0.5676344037055969, "logits/rejected": -0.3287120759487152, "logps/chosen": -378.62664794921875, "logps/rejected": -670.591552734375, "loss": 0.6827, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0052469945512712, "rewards/margins": 0.02313617616891861, "rewards/rejected": -0.028383171185851097, "step": 20 }, { "epoch": 0.22, "grad_norm": 6.338334841496394, "learning_rate": 4.783863644106502e-07, "logits/chosen": -0.5812798738479614, "logits/rejected": -0.34431666135787964, "logps/chosen": -353.8559875488281, "logps/rejected": -717.6322021484375, "loss": 0.6376, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.018838122487068176, "rewards/margins": 0.11194054782390594, "rewards/rejected": -0.13077868521213531, "step": 30 }, { "epoch": 0.3, "grad_norm": 5.540070129895064, "learning_rate": 4.442864903642427e-07, "logits/chosen": -0.4757254719734192, "logits/rejected": -0.3771602213382721, "logps/chosen": -361.31365966796875, "logps/rejected": -770.7361450195312, "loss": 0.5666, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.04972491413354874, "rewards/margins": 0.309120774269104, "rewards/rejected": -0.35884565114974976, "step": 40 }, { "epoch": 0.37, "grad_norm": 6.028650135342188, "learning_rate": 3.9694631307311825e-07, "logits/chosen": -0.5247567892074585, "logits/rejected": -0.45507222414016724, "logps/chosen": -293.3636169433594, "logps/rejected": -845.0416870117188, "loss": 0.4561, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.1027413159608841, "rewards/margins": 0.9158787727355957, "rewards/rejected": -1.0186201333999634, "step": 50 }, { "epoch": 0.45, "grad_norm": 5.965290480598111, "learning_rate": 3.39591987386325e-07, "logits/chosen": -0.5522093772888184, "logits/rejected": -0.4290170669555664, "logps/chosen": -363.2244873046875, "logps/rejected": -951.5051879882812, "loss": 0.3684, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.2073672115802765, "rewards/margins": 2.0674309730529785, "rewards/rejected": -2.2747981548309326, "step": 60 }, { "epoch": 0.52, "grad_norm": 5.3960424532927345, "learning_rate": 2.761321158169134e-07, "logits/chosen": -0.47125476598739624, "logits/rejected": -0.4486091136932373, "logps/chosen": -341.56646728515625, "logps/rejected": -1014.1658935546875, "loss": 0.338, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.219242125749588, "rewards/margins": 3.1757972240448, "rewards/rejected": -3.3950393199920654, "step": 70 }, { "epoch": 0.59, "grad_norm": 5.338829151255127, "learning_rate": 2.1089138373994222e-07, "logits/chosen": -0.5217522382736206, "logits/rejected": -0.5397945642471313, "logps/chosen": -321.6473083496094, "logps/rejected": -1235.2142333984375, "loss": 0.2868, "rewards/accuracies": 0.90625, "rewards/chosen": -0.31155842542648315, "rewards/margins": 4.753512382507324, "rewards/rejected": -5.065071105957031, "step": 80 }, { "epoch": 0.67, "grad_norm": 4.274164409019951, "learning_rate": 1.4831583923104998e-07, "logits/chosen": -0.44177961349487305, "logits/rejected": -0.528927743434906, "logps/chosen": -327.2131042480469, "logps/rejected": -1242.676513671875, "loss": 0.2479, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.2681874930858612, "rewards/margins": 5.103245735168457, "rewards/rejected": -5.371432781219482, "step": 90 }, { "epoch": 0.74, "grad_norm": 3.847681923589658, "learning_rate": 9.266990223754067e-08, "logits/chosen": -0.3910934329032898, "logits/rejected": -0.5766850709915161, "logps/chosen": -417.525390625, "logps/rejected": -1386.257080078125, "loss": 0.2497, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.5247961282730103, "rewards/margins": 5.741795539855957, "rewards/rejected": -6.266592979431152, "step": 100 }, { "epoch": 0.74, "eval_logits/chosen": -0.7826768159866333, "eval_logits/rejected": -0.5636682510375977, "eval_logps/chosen": -311.038330078125, "eval_logps/rejected": -748.6944580078125, "eval_loss": 0.3023545444011688, "eval_rewards/accuracies": 0.949999988079071, "eval_rewards/chosen": -0.08790449053049088, "eval_rewards/margins": 1.8343137502670288, "eval_rewards/rejected": -1.9222180843353271, "eval_runtime": 15.884, "eval_samples_per_second": 9.569, "eval_steps_per_second": 0.315, "step": 100 }, { "epoch": 0.82, "grad_norm": 4.194490591135143, "learning_rate": 4.774575140626316e-08, "logits/chosen": -0.42816129326820374, "logits/rejected": -0.4562205374240875, "logps/chosen": -361.47930908203125, "logps/rejected": -1313.421630859375, "loss": 0.2458, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.5410887002944946, "rewards/margins": 5.888722896575928, "rewards/rejected": -6.429811954498291, "step": 110 }, { "epoch": 0.89, "grad_norm": 3.9693871459743573, "learning_rate": 1.6604893375699592e-08, "logits/chosen": -0.4562758803367615, "logits/rejected": -0.5703433156013489, "logps/chosen": -393.4559631347656, "logps/rejected": -1514.4947509765625, "loss": 0.2112, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.518619954586029, "rewards/margins": 7.754377841949463, "rewards/rejected": -8.272997856140137, "step": 120 }, { "epoch": 0.97, "grad_norm": 4.192543496278868, "learning_rate": 1.3695261579316775e-09, "logits/chosen": -0.39360299706459045, "logits/rejected": -0.4867175221443176, "logps/chosen": -386.438232421875, "logps/rejected": -1469.9609375, "loss": 0.2181, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.6078141927719116, "rewards/margins": 7.127106666564941, "rewards/rejected": -7.734920501708984, "step": 130 }, { "epoch": 1.0, "step": 134, "total_flos": 0.0, "train_loss": 0.3946254751575527, "train_runtime": 1910.8543, "train_samples_per_second": 4.489, "train_steps_per_second": 0.07 } ], "logging_steps": 10, "max_steps": 134, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }