{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9974424552429667, "eval_steps": 500, "global_step": 195, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 1310.024749740419, "learning_rate": 2.5e-08, "logits/chosen": -5.0504608154296875, "logits/rejected": -5.35328483581543, "logps/chosen": -242.7239990234375, "logps/rejected": -185.90835571289062, "loss": 0.6893, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "grad_norm": 1343.8700325036616, "learning_rate": 2.5e-07, "logits/chosen": -4.959235191345215, "logits/rejected": -5.051504135131836, "logps/chosen": -226.43630981445312, "logps/rejected": -216.47547912597656, "loss": 0.7205, "rewards/accuracies": 0.4479166567325592, "rewards/chosen": 0.07974544167518616, "rewards/margins": 0.013408761471509933, "rewards/rejected": 0.06633666902780533, "step": 10 }, { "epoch": 0.1, "grad_norm": 1443.7667771719773, "learning_rate": 5e-07, "logits/chosen": -4.906929969787598, "logits/rejected": -5.0118937492370605, "logps/chosen": -240.65188598632812, "logps/rejected": -220.84378051757812, "loss": 0.6926, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7429171204566956, "rewards/margins": 1.1278517246246338, "rewards/rejected": -0.38493460416793823, "step": 20 }, { "epoch": 0.15, "grad_norm": 1641.6770420153719, "learning_rate": 4.959823971496574e-07, "logits/chosen": -4.913812637329102, "logits/rejected": -5.012935638427734, "logps/chosen": -238.8269805908203, "logps/rejected": -228.05404663085938, "loss": 0.8116, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 1.8061437606811523, "rewards/margins": 4.523256301879883, "rewards/rejected": -2.7171127796173096, "step": 30 }, { "epoch": 0.2, "grad_norm": 1382.4291689510926, "learning_rate": 4.840587176599343e-07, "logits/chosen": -4.964416980743408, "logits/rejected": -5.0027852058410645, "logps/chosen": -249.1742706298828, "logps/rejected": -235.87576293945312, "loss": 0.9983, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": 1.3685696125030518, "rewards/margins": 4.053561210632324, "rewards/rejected": -2.6849913597106934, "step": 40 }, { "epoch": 0.26, "grad_norm": 1428.1508779981239, "learning_rate": 4.646121984004665e-07, "logits/chosen": -4.990395545959473, "logits/rejected": -5.134562015533447, "logps/chosen": -251.7528076171875, "logps/rejected": -226.17306518554688, "loss": 0.9987, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": 2.2698659896850586, "rewards/margins": 5.616934299468994, "rewards/rejected": -3.3470687866210938, "step": 50 }, { "epoch": 0.31, "grad_norm": 1429.7364912941882, "learning_rate": 4.3826786650090273e-07, "logits/chosen": -5.023388385772705, "logits/rejected": -5.144254684448242, "logps/chosen": -250.6563720703125, "logps/rejected": -241.12484741210938, "loss": 0.993, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 1.217611312866211, "rewards/margins": 6.1895647048950195, "rewards/rejected": -4.97195291519165, "step": 60 }, { "epoch": 0.36, "grad_norm": 1385.9054301583744, "learning_rate": 4.058724504646834e-07, "logits/chosen": -4.992190361022949, "logits/rejected": -5.075345039367676, "logps/chosen": -256.97406005859375, "logps/rejected": -242.94003295898438, "loss": 1.1539, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 2.1734097003936768, "rewards/margins": 5.453003883361816, "rewards/rejected": -3.2795944213867188, "step": 70 }, { "epoch": 0.41, "grad_norm": 1267.3737422156325, "learning_rate": 3.6846716561824967e-07, "logits/chosen": -5.066686630249023, "logits/rejected": -5.165375709533691, "logps/chosen": -246.781982421875, "logps/rejected": -232.3020477294922, "loss": 1.1127, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 2.182149887084961, "rewards/margins": 6.110042095184326, "rewards/rejected": -3.927891492843628, "step": 80 }, { "epoch": 0.46, "grad_norm": 1414.9882610729042, "learning_rate": 3.272542485937368e-07, "logits/chosen": -5.056512355804443, "logits/rejected": -5.19997501373291, "logps/chosen": -236.23886108398438, "logps/rejected": -219.4969940185547, "loss": 1.1651, "rewards/accuracies": 0.59375, "rewards/chosen": 2.3071811199188232, "rewards/margins": 4.593169212341309, "rewards/rejected": -2.2859878540039062, "step": 90 }, { "epoch": 0.51, "grad_norm": 1730.7459110414102, "learning_rate": 2.8355831645441387e-07, "logits/chosen": -5.051321506500244, "logits/rejected": -5.197503089904785, "logps/chosen": -245.94680786132812, "logps/rejected": -224.7979278564453, "loss": 1.1049, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.0447471141815186, "rewards/margins": 3.989384412765503, "rewards/rejected": -1.9446370601654053, "step": 100 }, { "epoch": 0.56, "grad_norm": 1376.721155787266, "learning_rate": 2.3878379241237134e-07, "logits/chosen": -5.05279541015625, "logits/rejected": -5.2380499839782715, "logps/chosen": -231.46408081054688, "logps/rejected": -221.2686309814453, "loss": 1.0653, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.9433412551879883, "rewards/margins": 7.433489799499512, "rewards/rejected": -4.490148544311523, "step": 110 }, { "epoch": 0.61, "grad_norm": 1298.5481767381427, "learning_rate": 1.9436976651092142e-07, "logits/chosen": -4.989577293395996, "logits/rejected": -5.143449306488037, "logps/chosen": -250.3534698486328, "logps/rejected": -237.04074096679688, "loss": 1.0694, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 2.3243861198425293, "rewards/margins": 8.470600128173828, "rewards/rejected": -6.146214485168457, "step": 120 }, { "epoch": 0.66, "grad_norm": 1456.9702892975145, "learning_rate": 1.517437420865191e-07, "logits/chosen": -5.036610126495361, "logits/rejected": -5.181552886962891, "logps/chosen": -234.2519073486328, "logps/rejected": -226.05050659179688, "loss": 1.1374, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 2.612969160079956, "rewards/margins": 6.129396915435791, "rewards/rejected": -3.516427516937256, "step": 130 }, { "epoch": 0.72, "grad_norm": 1414.11944634508, "learning_rate": 1.1227575463697439e-07, "logits/chosen": -5.011117458343506, "logits/rejected": -5.0677995681762695, "logps/chosen": -246.2405242919922, "logps/rejected": -240.97647094726562, "loss": 1.0012, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.1312901973724365, "rewards/margins": 6.49268102645874, "rewards/rejected": -4.361390590667725, "step": 140 }, { "epoch": 0.77, "grad_norm": 1391.6252979817953, "learning_rate": 7.723433775328384e-08, "logits/chosen": -5.031737327575684, "logits/rejected": -5.141982078552246, "logps/chosen": -247.31640625, "logps/rejected": -245.01284790039062, "loss": 1.0468, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 3.413778781890869, "rewards/margins": 8.60617446899414, "rewards/rejected": -5.19239616394043, "step": 150 }, { "epoch": 0.82, "grad_norm": 1305.4800329449993, "learning_rate": 4.774575140626316e-08, "logits/chosen": -4.959289073944092, "logits/rejected": -5.040767192840576, "logps/chosen": -253.7027587890625, "logps/rejected": -250.91659545898438, "loss": 0.9992, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 3.046278476715088, "rewards/margins": 8.344175338745117, "rewards/rejected": -5.297896862030029, "step": 160 }, { "epoch": 0.87, "grad_norm": 1228.1104796269808, "learning_rate": 2.475778302439524e-08, "logits/chosen": -5.096159934997559, "logits/rejected": -5.178959369659424, "logps/chosen": -251.2628631591797, "logps/rejected": -233.06857299804688, "loss": 1.0057, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 2.824694871902466, "rewards/margins": 6.200740814208984, "rewards/rejected": -3.3760459423065186, "step": 170 }, { "epoch": 0.92, "grad_norm": 1348.827014256151, "learning_rate": 9.009284826036689e-09, "logits/chosen": -4.995651721954346, "logits/rejected": -5.102165222167969, "logps/chosen": -237.61990356445312, "logps/rejected": -232.7886962890625, "loss": 0.9321, "rewards/accuracies": 0.659375011920929, "rewards/chosen": 2.423119068145752, "rewards/margins": 4.8792009353637695, "rewards/rejected": -2.4560813903808594, "step": 180 }, { "epoch": 0.97, "grad_norm": 1117.1672982866971, "learning_rate": 1.0064265011902328e-09, "logits/chosen": -5.071808815002441, "logits/rejected": -5.110179901123047, "logps/chosen": -236.14224243164062, "logps/rejected": -233.5693359375, "loss": 0.9891, "rewards/accuracies": 0.640625, "rewards/chosen": 1.8652112483978271, "rewards/margins": 5.820201873779297, "rewards/rejected": -3.9549899101257324, "step": 190 }, { "epoch": 1.0, "step": 195, "total_flos": 0.0, "train_loss": 0.9975380127246564, "train_runtime": 5482.1546, "train_samples_per_second": 9.12, "train_steps_per_second": 0.036 } ], "logging_steps": 10, "max_steps": 195, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }