{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9968652037617555, "eval_steps": 500, "global_step": 159, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 3.125e-09, "logits/chosen": -2.6344056129455566, "logits/rejected": -2.5906338691711426, "logps/chosen": -158.64126586914062, "logps/pi_response": -86.09246826171875, "logps/ref_response": -86.09246826171875, "logps/rejected": -148.42047119140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "learning_rate": 3.125e-08, "logits/chosen": -2.6937406063079834, "logits/rejected": -2.697803258895874, "logps/chosen": -232.31423950195312, "logps/pi_response": -118.80592346191406, "logps/ref_response": -118.89913940429688, "logps/rejected": -251.14076232910156, "loss": 0.6932, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": 0.00028485539951361716, "rewards/margins": -0.0006583214271813631, "rewards/rejected": 0.0009431770886294544, "step": 10 }, { "epoch": 0.13, "learning_rate": 4.990353313429303e-08, "logits/chosen": -2.7444348335266113, "logits/rejected": -2.718367338180542, "logps/chosen": -232.554443359375, "logps/pi_response": -131.306884765625, "logps/ref_response": -131.12376403808594, "logps/rejected": -254.0548553466797, "loss": 0.6928, "rewards/accuracies": 0.53125, "rewards/chosen": -0.00253383070230484, "rewards/margins": 0.0006815333035774529, "rewards/rejected": -0.003215363947674632, "step": 20 }, { "epoch": 0.19, "learning_rate": 4.8826812513685484e-08, "logits/chosen": -2.6866040229797363, "logits/rejected": -2.6289615631103516, "logps/chosen": -227.64889526367188, "logps/pi_response": -114.9122085571289, "logps/ref_response": -113.8835220336914, "logps/rejected": -221.958251953125, "loss": 0.6908, "rewards/accuracies": 0.53125, "rewards/chosen": -0.013358505442738533, "rewards/margins": 0.004771661013364792, "rewards/rejected": -0.018130164593458176, "step": 30 }, { "epoch": 0.25, "learning_rate": 4.6604720940421204e-08, "logits/chosen": -2.658876895904541, "logits/rejected": -2.6415140628814697, "logps/chosen": -206.0329132080078, "logps/pi_response": -113.687744140625, "logps/ref_response": -110.53662109375, "logps/rejected": -263.42254638671875, "loss": 0.6875, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03745666891336441, "rewards/margins": 0.013648083433508873, "rewards/rejected": -0.051104746758937836, "step": 40 }, { "epoch": 0.31, "learning_rate": 4.3344075855595095e-08, "logits/chosen": -2.6783342361450195, "logits/rejected": -2.6550586223602295, "logps/chosen": -232.83877563476562, "logps/pi_response": -125.37564849853516, "logps/ref_response": -120.1254653930664, "logps/rejected": -245.05154418945312, "loss": 0.6848, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0704035758972168, "rewards/margins": 0.005661136005073786, "rewards/rejected": -0.07606470584869385, "step": 50 }, { "epoch": 0.38, "learning_rate": 3.920161866827889e-08, "logits/chosen": -2.647156238555908, "logits/rejected": -2.6198782920837402, "logps/chosen": -222.61935424804688, "logps/pi_response": -114.81854248046875, "logps/ref_response": -107.5267333984375, "logps/rejected": -237.662109375, "loss": 0.6811, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08966623246669769, "rewards/margins": 0.0316656231880188, "rewards/rejected": -0.12133185565471649, "step": 60 }, { "epoch": 0.44, "learning_rate": 3.437648009023905e-08, "logits/chosen": -2.644777536392212, "logits/rejected": -2.615166664123535, "logps/chosen": -226.83267211914062, "logps/pi_response": -119.3505859375, "logps/ref_response": -112.31196594238281, "logps/rejected": -212.1949920654297, "loss": 0.6752, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08632274717092514, "rewards/margins": 0.04537486657500267, "rewards/rejected": -0.1316976249217987, "step": 70 }, { "epoch": 0.5, "learning_rate": 2.9100607788275543e-08, "logits/chosen": -2.640345573425293, "logits/rejected": -2.597229480743408, "logps/chosen": -205.22775268554688, "logps/pi_response": -108.34332275390625, "logps/ref_response": -99.59819793701172, "logps/rejected": -236.5616912841797, "loss": 0.6718, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.09881611168384552, "rewards/margins": 0.06056177616119385, "rewards/rejected": -0.15937790274620056, "step": 80 }, { "epoch": 0.56, "learning_rate": 2.362761650339181e-08, "logits/chosen": -2.681976556777954, "logits/rejected": -2.656203508377075, "logps/chosen": -221.44039916992188, "logps/pi_response": -112.46112060546875, "logps/ref_response": -100.86174011230469, "logps/rejected": -234.1958465576172, "loss": 0.6698, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.13516300916671753, "rewards/margins": 0.055410224944353104, "rewards/rejected": -0.19057324528694153, "step": 90 }, { "epoch": 0.63, "learning_rate": 1.8220596619089574e-08, "logits/chosen": -2.590778112411499, "logits/rejected": -2.5902531147003174, "logps/chosen": -220.1934814453125, "logps/pi_response": -122.29930114746094, "logps/ref_response": -112.6884765625, "logps/rejected": -272.1497802734375, "loss": 0.6681, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.11862450838088989, "rewards/margins": 0.06530530005693436, "rewards/rejected": -0.18392980098724365, "step": 100 }, { "epoch": 0.69, "learning_rate": 1.3139467229135997e-08, "logits/chosen": -2.6394224166870117, "logits/rejected": -2.660050392150879, "logps/chosen": -230.77310180664062, "logps/pi_response": -112.18680572509766, "logps/ref_response": -100.60249328613281, "logps/rejected": -226.88265991210938, "loss": 0.6736, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.14069566130638123, "rewards/margins": 0.02159450575709343, "rewards/rejected": -0.16229018568992615, "step": 110 }, { "epoch": 0.75, "learning_rate": 8.628481651367876e-09, "logits/chosen": -2.631422519683838, "logits/rejected": -2.5770680904388428, "logps/chosen": -250.26956176757812, "logps/pi_response": -141.0382537841797, "logps/ref_response": -130.39340209960938, "logps/rejected": -256.5991516113281, "loss": 0.6699, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.12506115436553955, "rewards/margins": 0.058550190180540085, "rewards/rejected": -0.18361134827136993, "step": 120 }, { "epoch": 0.82, "learning_rate": 4.904486005914027e-09, "logits/chosen": -2.669673442840576, "logits/rejected": -2.6708672046661377, "logps/chosen": -225.036865234375, "logps/pi_response": -123.17768859863281, "logps/ref_response": -113.8158950805664, "logps/rejected": -251.93417358398438, "loss": 0.67, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.11341943591833115, "rewards/margins": 0.06858594715595245, "rewards/rejected": -0.182005375623703, "step": 130 }, { "epoch": 0.88, "learning_rate": 2.1464952759020853e-09, "logits/chosen": -2.6793627738952637, "logits/rejected": -2.6371798515319824, "logps/chosen": -242.1341552734375, "logps/pi_response": -131.11148071289062, "logps/ref_response": -119.49796295166016, "logps/rejected": -253.3504180908203, "loss": 0.6699, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13750413060188293, "rewards/margins": 0.06477151811122894, "rewards/rejected": -0.20227563381195068, "step": 140 }, { "epoch": 0.94, "learning_rate": 4.870879364444108e-10, "logits/chosen": -2.6344664096832275, "logits/rejected": -2.61822509765625, "logps/chosen": -228.68997192382812, "logps/pi_response": -112.9517593383789, "logps/ref_response": -101.72931671142578, "logps/rejected": -230.86636352539062, "loss": 0.6685, "rewards/accuracies": 0.625, "rewards/chosen": -0.13467349112033844, "rewards/margins": 0.053079742938280106, "rewards/rejected": -0.18775323033332825, "step": 150 }, { "epoch": 1.0, "step": 159, "total_flos": 0.0, "train_loss": 0.6778795974059675, "train_runtime": 4527.8424, "train_samples_per_second": 4.501, "train_steps_per_second": 0.035 } ], "logging_steps": 10, "max_steps": 159, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }