{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 100, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 647945.4912541932, "learning_rate": 1.5625e-08, "logits/chosen": -0.34773391485214233, "logits/rejected": -0.6075438261032104, "logps/chosen": -72.6761474609375, "logps/rejected": -90.11207580566406, "loss": 128855.9062, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.032, "grad_norm": 973324.1712020065, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -0.5611530542373657, "logits/rejected": -0.5887401103973389, "logps/chosen": -80.2381591796875, "logps/rejected": -83.50374603271484, "loss": 124005.5694, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": -0.0019423539051786065, "rewards/margins": 5.1506802265066653e-05, "rewards/rejected": -0.0019938608165830374, "step": 10 }, { "epoch": 0.064, "grad_norm": 619327.407060219, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -0.6772833466529846, "logits/rejected": -0.6759974360466003, "logps/chosen": -103.69559478759766, "logps/rejected": -107.43603515625, "loss": 124210.2125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004284867085516453, "rewards/margins": -3.467009082669392e-05, "rewards/rejected": -0.004250196740031242, "step": 20 }, { "epoch": 0.096, "grad_norm": 698173.4505162692, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -0.7464536428451538, "logits/rejected": -0.7253994345664978, "logps/chosen": -90.76727294921875, "logps/rejected": -93.79044342041016, "loss": 126548.2375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.003106231102719903, "rewards/margins": -0.0005979427369311452, "rewards/rejected": -0.0025082884822040796, "step": 30 }, { "epoch": 0.128, "grad_norm": 637174.9970357245, "learning_rate": 4.857142857142857e-07, "logits/chosen": -0.7085025906562805, "logits/rejected": -0.7023540139198303, "logps/chosen": -87.2509765625, "logps/rejected": -88.0642318725586, "loss": 124747.6875, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.007863897830247879, "rewards/margins": -0.0013397409347817302, "rewards/rejected": -0.006524157710373402, "step": 40 }, { "epoch": 0.16, "grad_norm": 759040.4009588562, "learning_rate": 4.6785714285714283e-07, "logits/chosen": -0.5708094835281372, "logits/rejected": -0.55577552318573, "logps/chosen": -99.05384826660156, "logps/rejected": -96.9248046875, "loss": 127056.3875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011573193594813347, "rewards/margins": -0.0007376443827524781, "rewards/rejected": -0.010835548862814903, "step": 50 }, { "epoch": 0.192, "grad_norm": 818448.4874125579, "learning_rate": 4.5e-07, "logits/chosen": -0.5234788060188293, "logits/rejected": -0.5684272646903992, "logps/chosen": -84.0132064819336, "logps/rejected": -89.70082092285156, "loss": 124101.0125, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.011890527792274952, "rewards/margins": 0.0017182690789923072, "rewards/rejected": -0.013608796522021294, "step": 60 }, { "epoch": 0.224, "grad_norm": 764315.259548912, "learning_rate": 4.3214285714285713e-07, "logits/chosen": -0.672571063041687, "logits/rejected": -0.6554594039916992, "logps/chosen": -102.6801986694336, "logps/rejected": -114.0815658569336, "loss": 125767.8, "rewards/accuracies": 0.625, "rewards/chosen": -0.013903990387916565, "rewards/margins": 0.0018995633581653237, "rewards/rejected": -0.01580355316400528, "step": 70 }, { "epoch": 0.256, "grad_norm": 792832.7721251897, "learning_rate": 4.142857142857143e-07, "logits/chosen": -0.6233155131340027, "logits/rejected": -0.6050644516944885, "logps/chosen": -89.83741760253906, "logps/rejected": -96.45980072021484, "loss": 126646.1, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.011440077796578407, "rewards/margins": -0.0004714619426522404, "rewards/rejected": -0.010968615300953388, "step": 80 }, { "epoch": 0.288, "grad_norm": 810791.4710150602, "learning_rate": 3.9642857142857137e-07, "logits/chosen": -0.5288355946540833, "logits/rejected": -0.507430911064148, "logps/chosen": -77.9104232788086, "logps/rejected": -74.20404052734375, "loss": 126600.7625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005952201783657074, "rewards/margins": -0.001560600707307458, "rewards/rejected": -0.004391600843518972, "step": 90 }, { "epoch": 0.32, "grad_norm": 612814.6572972395, "learning_rate": 3.785714285714285e-07, "logits/chosen": -0.6446259617805481, "logits/rejected": -0.6776315569877625, "logps/chosen": -92.22976684570312, "logps/rejected": -100.54733276367188, "loss": 124326.1, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.011474112048745155, "rewards/margins": 0.002196565503254533, "rewards/rejected": -0.013670678250491619, "step": 100 }, { "epoch": 0.352, "grad_norm": 769940.7880329042, "learning_rate": 3.607142857142857e-07, "logits/chosen": -0.5441879630088806, "logits/rejected": -0.5395065546035767, "logps/chosen": -64.47439575195312, "logps/rejected": -78.48651123046875, "loss": 127264.1375, "rewards/accuracies": 0.625, "rewards/chosen": -0.008155420422554016, "rewards/margins": 0.005157289560884237, "rewards/rejected": -0.013312709517776966, "step": 110 }, { "epoch": 0.384, "grad_norm": 781127.2959197527, "learning_rate": 3.4285714285714286e-07, "logits/chosen": -0.7074313759803772, "logits/rejected": -0.6893147230148315, "logps/chosen": -99.30326843261719, "logps/rejected": -100.26654815673828, "loss": 126373.0, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.006027103401720524, "rewards/margins": -0.0006245746044442058, "rewards/rejected": -0.005402528680860996, "step": 120 }, { "epoch": 0.416, "grad_norm": 942915.0070681617, "learning_rate": 3.25e-07, "logits/chosen": -0.5311844348907471, "logits/rejected": -0.5678432583808899, "logps/chosen": -89.84095001220703, "logps/rejected": -95.73307800292969, "loss": 126546.9625, "rewards/accuracies": 0.5, "rewards/chosen": -0.005261361598968506, "rewards/margins": -0.00025905706570483744, "rewards/rejected": -0.00500230398029089, "step": 130 }, { "epoch": 0.448, "grad_norm": 802161.2678528542, "learning_rate": 3.0714285714285716e-07, "logits/chosen": -0.6184743642807007, "logits/rejected": -0.6451131701469421, "logps/chosen": -109.21659088134766, "logps/rejected": -114.1061019897461, "loss": 125730.125, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.011759540066123009, "rewards/margins": 0.0014495229115709662, "rewards/rejected": -0.01320906262844801, "step": 140 }, { "epoch": 0.48, "grad_norm": 866428.7327389624, "learning_rate": 2.892857142857143e-07, "logits/chosen": -0.6030551195144653, "logits/rejected": -0.5557407140731812, "logps/chosen": -82.86506652832031, "logps/rejected": -85.31071472167969, "loss": 125425.025, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.010830635204911232, "rewards/margins": -6.357554957503453e-05, "rewards/rejected": -0.010767060332000256, "step": 150 }, { "epoch": 0.512, "grad_norm": 743330.5276750317, "learning_rate": 2.714285714285714e-07, "logits/chosen": -0.5015612840652466, "logits/rejected": -0.5147450566291809, "logps/chosen": -82.76224517822266, "logps/rejected": -91.91256713867188, "loss": 124215.3, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.010083668865263462, "rewards/margins": 0.0024900883436203003, "rewards/rejected": -0.012573758140206337, "step": 160 }, { "epoch": 0.544, "grad_norm": 863614.5495224567, "learning_rate": 2.5357142857142855e-07, "logits/chosen": -0.5797610878944397, "logits/rejected": -0.5199266672134399, "logps/chosen": -94.99356842041016, "logps/rejected": -96.22293090820312, "loss": 127004.7, "rewards/accuracies": 0.625, "rewards/chosen": -0.012711484916508198, "rewards/margins": 0.004797719419002533, "rewards/rejected": -0.017509203404188156, "step": 170 }, { "epoch": 0.576, "grad_norm": 831681.0077569862, "learning_rate": 2.357142857142857e-07, "logits/chosen": -0.6032494902610779, "logits/rejected": -0.579995334148407, "logps/chosen": -104.5300521850586, "logps/rejected": -108.78277587890625, "loss": 125979.4375, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.009229556657373905, "rewards/margins": 0.004828121047466993, "rewards/rejected": -0.014057678170502186, "step": 180 }, { "epoch": 0.608, "grad_norm": 780274.1467706825, "learning_rate": 2.1785714285714284e-07, "logits/chosen": -0.7121313810348511, "logits/rejected": -0.667202353477478, "logps/chosen": -115.69401550292969, "logps/rejected": -110.82621765136719, "loss": 124809.7, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.012815780937671661, "rewards/margins": -0.0001598205417394638, "rewards/rejected": -0.012655961327254772, "step": 190 }, { "epoch": 0.64, "grad_norm": 774598.0171325745, "learning_rate": 2e-07, "logits/chosen": -0.612346351146698, "logits/rejected": -0.6116153001785278, "logps/chosen": -91.24519348144531, "logps/rejected": -97.00153350830078, "loss": 123650.5375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01422748900949955, "rewards/margins": 0.0018453721422702074, "rewards/rejected": -0.01607285998761654, "step": 200 }, { "epoch": 0.672, "grad_norm": 1137683.0365726806, "learning_rate": 1.8214285714285714e-07, "logits/chosen": -0.6241598725318909, "logits/rejected": -0.6161590814590454, "logps/chosen": -82.91732788085938, "logps/rejected": -92.75973510742188, "loss": 125116.0125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01631699874997139, "rewards/margins": 0.002573491772636771, "rewards/rejected": -0.018890492618083954, "step": 210 }, { "epoch": 0.704, "grad_norm": 921161.3498685773, "learning_rate": 1.6428571428571429e-07, "logits/chosen": -0.6814984083175659, "logits/rejected": -0.6642488241195679, "logps/chosen": -134.07284545898438, "logps/rejected": -134.7923126220703, "loss": 125720.675, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01326170563697815, "rewards/margins": 0.0025010218378156424, "rewards/rejected": -0.015762727707624435, "step": 220 }, { "epoch": 0.736, "grad_norm": 813896.4945325998, "learning_rate": 1.4642857142857143e-07, "logits/chosen": -0.5411783456802368, "logits/rejected": -0.5778718590736389, "logps/chosen": -104.65946197509766, "logps/rejected": -107.73319244384766, "loss": 125973.8125, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.012594607658684254, "rewards/margins": 0.0011098148534074426, "rewards/rejected": -0.013704421930015087, "step": 230 }, { "epoch": 0.768, "grad_norm": 1031122.2282012746, "learning_rate": 1.2857142857142855e-07, "logits/chosen": -0.6678429841995239, "logits/rejected": -0.6291283369064331, "logps/chosen": -104.91682434082031, "logps/rejected": -111.02679443359375, "loss": 126001.475, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.008200669661164284, "rewards/margins": 0.0015530238160863519, "rewards/rejected": -0.009753693826496601, "step": 240 }, { "epoch": 0.8, "grad_norm": 858633.8039080129, "learning_rate": 1.107142857142857e-07, "logits/chosen": -0.6295119524002075, "logits/rejected": -0.6167672872543335, "logps/chosen": -123.36985778808594, "logps/rejected": -133.19418334960938, "loss": 126223.65, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011947548016905785, "rewards/margins": 0.006852240767329931, "rewards/rejected": -0.018799791112542152, "step": 250 }, { "epoch": 0.832, "grad_norm": 951847.1640935472, "learning_rate": 9.285714285714286e-08, "logits/chosen": -0.6834455728530884, "logits/rejected": -0.7226243615150452, "logps/chosen": -86.39234924316406, "logps/rejected": -95.36772155761719, "loss": 124640.2, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.01843985728919506, "rewards/margins": 0.003491448936983943, "rewards/rejected": -0.021931307390332222, "step": 260 }, { "epoch": 0.864, "grad_norm": 816825.5268517752, "learning_rate": 7.5e-08, "logits/chosen": -0.6084921956062317, "logits/rejected": -0.606655478477478, "logps/chosen": -95.06122589111328, "logps/rejected": -100.9395523071289, "loss": 126797.975, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.013821562752127647, "rewards/margins": 0.0025993138551712036, "rewards/rejected": -0.01642087660729885, "step": 270 }, { "epoch": 0.896, "grad_norm": 823903.2164322428, "learning_rate": 5.714285714285714e-08, "logits/chosen": -0.7316595315933228, "logits/rejected": -0.7817249298095703, "logps/chosen": -97.38008880615234, "logps/rejected": -122.05289459228516, "loss": 122803.6375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.012123498134315014, "rewards/margins": 0.006116434000432491, "rewards/rejected": -0.018239933997392654, "step": 280 }, { "epoch": 0.928, "grad_norm": 1213103.129361221, "learning_rate": 3.9285714285714285e-08, "logits/chosen": -0.7132126092910767, "logits/rejected": -0.7211403846740723, "logps/chosen": -115.4140853881836, "logps/rejected": -124.9251480102539, "loss": 125220.8875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.008542357943952084, "rewards/margins": 0.007235427852720022, "rewards/rejected": -0.01577778533101082, "step": 290 }, { "epoch": 0.96, "grad_norm": 826125.8509083999, "learning_rate": 2.142857142857143e-08, "logits/chosen": -0.4794866144657135, "logits/rejected": -0.48627161979675293, "logps/chosen": -106.44710540771484, "logps/rejected": -113.4127197265625, "loss": 124190.425, "rewards/accuracies": 0.625, "rewards/chosen": -0.016785580664873123, "rewards/margins": 0.002466305159032345, "rewards/rejected": -0.019251886755228043, "step": 300 }, { "epoch": 0.992, "grad_norm": 853168.6471782625, "learning_rate": 3.571428571428571e-09, "logits/chosen": -0.6391203999519348, "logits/rejected": -0.6226745843887329, "logps/chosen": -105.24736022949219, "logps/rejected": -109.426025390625, "loss": 122976.65, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01043027639389038, "rewards/margins": 0.003538835793733597, "rewards/rejected": -0.013969110324978828, "step": 310 }, { "epoch": 0.9984, "step": 312, "total_flos": 0.0, "train_loss": 125356.69771634616, "train_runtime": 2759.785, "train_samples_per_second": 7.245, "train_steps_per_second": 0.113 } ], "logging_steps": 10, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }