{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9973333333333333, "eval_steps": 100, "global_step": 187, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 19.109572167610484, "learning_rate": 2.6315789473684208e-08, "logits/chosen": -2.964515209197998, "logits/rejected": -2.865140914916992, "logps/chosen": -485.6763916015625, "logps/rejected": -1249.7501220703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "grad_norm": 18.895223645335697, "learning_rate": 2.631578947368421e-07, "logits/chosen": -2.7736825942993164, "logits/rejected": -2.7408108711242676, "logps/chosen": -604.7006225585938, "logps/rejected": -1056.1942138671875, "loss": 0.6926, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.0012125401990488172, "rewards/margins": 0.001352548599243164, "rewards/rejected": -0.00014000837109051645, "step": 10 }, { "epoch": 0.11, "grad_norm": 19.562748691217283, "learning_rate": 4.999562902281866e-07, "logits/chosen": -2.7962822914123535, "logits/rejected": -2.8271851539611816, "logps/chosen": -571.3375854492188, "logps/rejected": -971.5126953125, "loss": 0.6749, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.03103743866086006, "rewards/margins": 0.03241748735308647, "rewards/rejected": -0.0013800484593957663, "step": 20 }, { "epoch": 0.16, "grad_norm": 23.57935669375875, "learning_rate": 4.947295864744121e-07, "logits/chosen": -2.859532117843628, "logits/rejected": -2.8859381675720215, "logps/chosen": -529.7252197265625, "logps/rejected": -1093.7412109375, "loss": 0.6296, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.047512348741292953, "rewards/margins": 0.15334269404411316, "rewards/rejected": -0.10583032667636871, "step": 30 }, { "epoch": 0.21, "grad_norm": 74.03794269111636, "learning_rate": 4.809698831278217e-07, "logits/chosen": -3.1058590412139893, "logits/rejected": -3.105548143386841, "logps/chosen": -631.2692260742188, "logps/rejected": -1100.1131591796875, "loss": 0.5067, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.4612053334712982, "rewards/margins": 0.6213432550430298, "rewards/rejected": -1.0825484991073608, "step": 40 }, { "epoch": 0.27, "grad_norm": 80.08928437177174, "learning_rate": 4.591569405016049e-07, "logits/chosen": -3.1383297443389893, "logits/rejected": -3.338413953781128, "logps/chosen": -614.7294921875, "logps/rejected": -1324.274658203125, "loss": 0.3007, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.7203965187072754, "rewards/margins": 2.4570107460021973, "rewards/rejected": -3.1774070262908936, "step": 50 }, { "epoch": 0.32, "grad_norm": 52.8412534701194, "learning_rate": 4.3005131163403164e-07, "logits/chosen": -3.232844829559326, "logits/rejected": -3.4020397663116455, "logps/chosen": -607.4974365234375, "logps/rejected": -1571.42578125, "loss": 0.2467, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.6835159063339233, "rewards/margins": 4.190090656280518, "rewards/rejected": -4.8736066818237305, "step": 60 }, { "epoch": 0.37, "grad_norm": 45.803944170508274, "learning_rate": 3.946678240449515e-07, "logits/chosen": -3.016165256500244, "logits/rejected": -3.2087910175323486, "logps/chosen": -602.6742553710938, "logps/rejected": -1499.858154296875, "loss": 0.2227, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.6613572239875793, "rewards/margins": 4.315842628479004, "rewards/rejected": -4.977200031280518, "step": 70 }, { "epoch": 0.43, "grad_norm": 33.74568647416123, "learning_rate": 3.5424019569033206e-07, "logits/chosen": -2.980517864227295, "logits/rejected": -2.997511863708496, "logps/chosen": -698.8486328125, "logps/rejected": -1709.7763671875, "loss": 0.2216, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.0458548069000244, "rewards/margins": 5.626683712005615, "rewards/rejected": -6.672537803649902, "step": 80 }, { "epoch": 0.48, "grad_norm": 32.76518067019826, "learning_rate": 3.1017801885224326e-07, "logits/chosen": -3.0111451148986816, "logits/rejected": -3.0090878009796143, "logps/chosen": -650.3148193359375, "logps/rejected": -1498.55419921875, "loss": 0.2021, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8722761869430542, "rewards/margins": 4.187361717224121, "rewards/rejected": -5.059638023376465, "step": 90 }, { "epoch": 0.53, "grad_norm": 64.24324243411806, "learning_rate": 2.640176118092979e-07, "logits/chosen": -2.9020493030548096, "logits/rejected": -2.935757875442505, "logps/chosen": -751.5125732421875, "logps/rejected": -1689.5228271484375, "loss": 0.1645, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.2032721042633057, "rewards/margins": 4.921408653259277, "rewards/rejected": -6.124680995941162, "step": 100 }, { "epoch": 0.53, "eval_logits/chosen": -3.0544369220733643, "eval_logits/rejected": -2.793405294418335, "eval_logps/chosen": -725.9426879882812, "eval_logps/rejected": -1452.9771728515625, "eval_loss": 0.25031739473342896, "eval_rewards/accuracies": 0.831250011920929, "eval_rewards/chosen": -1.6025804281234741, "eval_rewards/margins": 3.9000518321990967, "eval_rewards/rejected": -5.502632141113281, "eval_runtime": 65.7537, "eval_samples_per_second": 9.368, "eval_steps_per_second": 0.304, "step": 100 }, { "epoch": 0.59, "grad_norm": 41.59873680369454, "learning_rate": 2.1736845194498716e-07, "logits/chosen": -2.9784274101257324, "logits/rejected": -2.980086088180542, "logps/chosen": -600.6064453125, "logps/rejected": -1670.901611328125, "loss": 0.1595, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.918400764465332, "rewards/margins": 6.283780574798584, "rewards/rejected": -7.202181339263916, "step": 110 }, { "epoch": 0.64, "grad_norm": 28.23680644032835, "learning_rate": 1.718570580135889e-07, "logits/chosen": -3.0252156257629395, "logits/rejected": -3.080897569656372, "logps/chosen": -611.710693359375, "logps/rejected": -1694.8226318359375, "loss": 0.1391, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8532626032829285, "rewards/margins": 5.446272850036621, "rewards/rejected": -6.299535751342773, "step": 120 }, { "epoch": 0.69, "grad_norm": 40.906944468121836, "learning_rate": 1.2907027822369005e-07, "logits/chosen": -2.9933369159698486, "logits/rejected": -3.124406576156616, "logps/chosen": -700.328125, "logps/rejected": -1804.997802734375, "loss": 0.1477, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.007743000984192, "rewards/margins": 6.478204250335693, "rewards/rejected": -7.485948085784912, "step": 130 }, { "epoch": 0.75, "grad_norm": 22.754078194499957, "learning_rate": 9.049996151674788e-08, "logits/chosen": -3.086073875427246, "logits/rejected": -3.1164612770080566, "logps/chosen": -631.7467651367188, "logps/rejected": -1740.2171630859375, "loss": 0.1821, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.0232716798782349, "rewards/margins": 5.889337539672852, "rewards/rejected": -6.912609100341797, "step": 140 }, { "epoch": 0.8, "grad_norm": 20.144359719952234, "learning_rate": 5.74909411901843e-08, "logits/chosen": -2.9675424098968506, "logits/rejected": -2.990185499191284, "logps/chosen": -617.1038818359375, "logps/rejected": -1656.051513671875, "loss": 0.1413, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9472934603691101, "rewards/margins": 5.291378974914551, "rewards/rejected": -6.238672733306885, "step": 150 }, { "epoch": 0.85, "grad_norm": 26.642508471840806, "learning_rate": 3.119414452281158e-08, "logits/chosen": -2.9869649410247803, "logits/rejected": -3.0431644916534424, "logps/chosen": -662.4171142578125, "logps/rejected": -1831.9390869140625, "loss": 0.1189, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8813556432723999, "rewards/margins": 6.860285758972168, "rewards/rejected": -7.741641044616699, "step": 160 }, { "epoch": 0.91, "grad_norm": 18.842250875900756, "learning_rate": 1.2526463331788501e-08, "logits/chosen": -3.083080291748047, "logits/rejected": -2.9783942699432373, "logps/chosen": -638.3408203125, "logps/rejected": -1725.673583984375, "loss": 0.1265, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9777008891105652, "rewards/margins": 6.12181282043457, "rewards/rejected": -7.099513053894043, "step": 170 }, { "epoch": 0.96, "grad_norm": 34.250119439829845, "learning_rate": 2.1387846565474044e-09, "logits/chosen": -3.0460267066955566, "logits/rejected": -2.9695019721984863, "logps/chosen": -608.745849609375, "logps/rejected": -1744.884521484375, "loss": 0.1257, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.041512131690979, "rewards/margins": 6.2788825035095215, "rewards/rejected": -7.320394992828369, "step": 180 }, { "epoch": 1.0, "step": 187, "total_flos": 0.0, "train_loss": 0.2699868052719749, "train_runtime": 2833.2764, "train_samples_per_second": 4.234, "train_steps_per_second": 0.066 } ], "logging_steps": 10, "max_steps": 187, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }