{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9748953974895398, "eval_steps": 500, "global_step": 118, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016736401673640166, "grad_norm": 254.03603103806802, "learning_rate": 8.333333333333332e-09, "logits/chosen": 0.40769851207733154, "logits/rejected": 0.6983045935630798, "logps/chosen": -597.6331176757812, "logps/pi_response": -454.7916259765625, "logps/ref_response": -454.7916259765625, "logps/rejected": -933.78369140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.16736401673640167, "grad_norm": 223.89448476390103, "learning_rate": 8.333333333333334e-08, "logits/chosen": 0.3617916703224182, "logits/rejected": 0.8798990249633789, "logps/chosen": -520.3926391601562, "logps/pi_response": -373.90179443359375, "logps/ref_response": -369.4568176269531, "logps/rejected": -942.7015991210938, "loss": 0.6814, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": -0.003062439849600196, "rewards/margins": 0.031748898327350616, "rewards/rejected": -0.034811343997716904, "step": 10 }, { "epoch": 0.33472803347280333, "grad_norm": 275.758299798461, "learning_rate": 9.860114570402053e-08, "logits/chosen": 0.41715487837791443, "logits/rejected": 0.8884197473526001, "logps/chosen": -551.1149291992188, "logps/pi_response": -498.0843200683594, "logps/ref_response": -378.30291748046875, "logps/rejected": -1005.5443115234375, "loss": 0.5604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09888347238302231, "rewards/margins": 1.0031945705413818, "rewards/rejected": -1.1020780801773071, "step": 20 }, { "epoch": 0.502092050209205, "grad_norm": 151.4646821692468, "learning_rate": 9.305218058836777e-08, "logits/chosen": 0.5644534826278687, "logits/rejected": 0.9862662553787231, "logps/chosen": -576.3702392578125, "logps/pi_response": -784.0135498046875, "logps/ref_response": -373.52685546875, "logps/rejected": -1288.76220703125, "loss": 0.4456, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5222317576408386, "rewards/margins": 3.5375170707702637, "rewards/rejected": -4.059748649597168, "step": 30 }, { "epoch": 0.6694560669456067, "grad_norm": 99.31936965278155, "learning_rate": 8.374915007591053e-08, "logits/chosen": 0.6754584312438965, "logits/rejected": 1.3761770725250244, "logps/chosen": -637.95263671875, "logps/pi_response": -662.8466796875, "logps/ref_response": -352.9124450683594, "logps/rejected": -1316.7098388671875, "loss": 0.3955, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1482101678848267, "rewards/margins": 2.7251365184783936, "rewards/rejected": -3.8733463287353516, "step": 40 }, { "epoch": 0.8368200836820083, "grad_norm": 81.4813373754742, "learning_rate": 7.150326011382603e-08, "logits/chosen": 0.8304530382156372, "logits/rejected": 1.3797433376312256, "logps/chosen": -703.5577392578125, "logps/pi_response": -740.5267944335938, "logps/ref_response": -367.9375305175781, "logps/rejected": -1414.14453125, "loss": 0.389, "rewards/accuracies": 0.8343750238418579, "rewards/chosen": -1.441198468208313, "rewards/margins": 3.4418697357177734, "rewards/rejected": -4.883068084716797, "step": 50 }, { "epoch": 1.00418410041841, "grad_norm": 68.59895920394845, "learning_rate": 5.738232820012406e-08, "logits/chosen": 0.8587929606437683, "logits/rejected": 1.2634632587432861, "logps/chosen": -588.1586303710938, "logps/pi_response": -831.5247192382812, "logps/ref_response": -366.019287109375, "logps/rejected": -1391.0379638671875, "loss": 0.3563, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0079401731491089, "rewards/margins": 4.322751045227051, "rewards/rejected": -5.330691337585449, "step": 60 }, { "epoch": 1.1715481171548117, "grad_norm": 78.3617679229796, "learning_rate": 4.2617671799875946e-08, "logits/chosen": 0.7778602838516235, "logits/rejected": 1.3484394550323486, "logps/chosen": -664.4850463867188, "logps/pi_response": -791.7720947265625, "logps/ref_response": -386.8902893066406, "logps/rejected": -1374.6849365234375, "loss": 0.3516, "rewards/accuracies": 0.846875011920929, "rewards/chosen": -1.1506381034851074, "rewards/margins": 3.6012942790985107, "rewards/rejected": -4.751932621002197, "step": 70 }, { "epoch": 1.3389121338912133, "grad_norm": 54.85306379624032, "learning_rate": 2.8496739886173992e-08, "logits/chosen": 0.8767589330673218, "logits/rejected": 1.290276288986206, "logps/chosen": -640.9560546875, "logps/pi_response": -780.9495239257812, "logps/ref_response": -370.62164306640625, "logps/rejected": -1382.840576171875, "loss": 0.3358, "rewards/accuracies": 0.875, "rewards/chosen": -1.0898466110229492, "rewards/margins": 3.9194438457489014, "rewards/rejected": -5.0092902183532715, "step": 80 }, { "epoch": 1.506276150627615, "grad_norm": 88.99295644400407, "learning_rate": 1.6250849924089483e-08, "logits/chosen": 0.710281491279602, "logits/rejected": 1.3649709224700928, "logps/chosen": -608.7789306640625, "logps/pi_response": -748.4505615234375, "logps/ref_response": -349.5060729980469, "logps/rejected": -1372.438232421875, "loss": 0.3777, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.0407220125198364, "rewards/margins": 3.8326003551483154, "rewards/rejected": -4.873322486877441, "step": 90 }, { "epoch": 1.6736401673640167, "grad_norm": 63.93568226735586, "learning_rate": 6.947819411632222e-09, "logits/chosen": 0.664978563785553, "logits/rejected": 1.2786552906036377, "logps/chosen": -617.9248657226562, "logps/pi_response": -712.550048828125, "logps/ref_response": -370.06866455078125, "logps/rejected": -1386.747314453125, "loss": 0.3394, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.01349937915802, "rewards/margins": 3.325646162033081, "rewards/rejected": -4.339145183563232, "step": 100 }, { "epoch": 1.8410041841004183, "grad_norm": 48.91170773014705, "learning_rate": 1.3988542959794625e-09, "logits/chosen": 0.6376602649688721, "logits/rejected": 1.2136269807815552, "logps/chosen": -611.235595703125, "logps/pi_response": -798.3106689453125, "logps/ref_response": -364.35662841796875, "logps/rejected": -1384.1715087890625, "loss": 0.3528, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.1132670640945435, "rewards/margins": 3.8225269317626953, "rewards/rejected": -4.935793876647949, "step": 110 }, { "epoch": 1.9748953974895398, "step": 118, "total_flos": 0.0, "train_loss": 0.4142875792616505, "train_runtime": 5502.1117, "train_samples_per_second": 5.555, "train_steps_per_second": 0.021 } ], "logging_steps": 10, "max_steps": 118, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }