{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9991645781119465, "eval_steps": 10000000, "global_step": 299, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 94.51455806402136, "learning_rate": 3.333333333333333e-10, "logits/chosen": -1.693521499633789, "logits/rejected": -1.6753541231155396, "logps/chosen": -1.041430115699768, "logps/rejected": -0.9273841977119446, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 100.10048800952612, "learning_rate": 3.3333333333333334e-09, "logits/chosen": -1.7426362037658691, "logits/rejected": -1.7463488578796387, "logps/chosen": -1.0522818565368652, "logps/rejected": -1.0174607038497925, "loss": 0.6933, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": -0.0006895202095620334, "rewards/margins": -0.0014206942869350314, "rewards/rejected": 0.0007311741355806589, "step": 10 }, { "epoch": 0.07, "grad_norm": 100.2747717498339, "learning_rate": 6.666666666666667e-09, "logits/chosen": -1.876529335975647, "logits/rejected": -1.8286478519439697, "logps/chosen": -1.0717421770095825, "logps/rejected": -1.0434354543685913, "loss": 0.6934, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0013453494757413864, "rewards/margins": 0.001054912805557251, "rewards/rejected": 0.00029043667018413544, "step": 20 }, { "epoch": 0.1, "grad_norm": 91.78075458726622, "learning_rate": 1e-08, "logits/chosen": -1.7869594097137451, "logits/rejected": -1.7599306106567383, "logps/chosen": -1.066955327987671, "logps/rejected": -1.0448154211044312, "loss": 0.6926, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0009901206940412521, "rewards/margins": 0.005651786923408508, "rewards/rejected": -0.004661666229367256, "step": 30 }, { "epoch": 0.13, "grad_norm": 83.1668176080734, "learning_rate": 9.96594024562513e-09, "logits/chosen": -1.8331626653671265, "logits/rejected": -1.8235479593276978, "logps/chosen": -1.069947361946106, "logps/rejected": -1.0194944143295288, "loss": 0.6933, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00406806543469429, "rewards/margins": 0.006294439546763897, "rewards/rejected": -0.0022263741120696068, "step": 40 }, { "epoch": 0.17, "grad_norm": 97.3484905394127, "learning_rate": 9.86422500924775e-09, "logits/chosen": -1.8598533868789673, "logits/rejected": -1.8249794244766235, "logps/chosen": -1.0844773054122925, "logps/rejected": -1.0546021461486816, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.0010452494025230408, "rewards/margins": 0.003718096762895584, "rewards/rejected": -0.0026728473603725433, "step": 50 }, { "epoch": 0.2, "grad_norm": 89.15270505321199, "learning_rate": 9.696240049254743e-09, "logits/chosen": -1.8162918090820312, "logits/rejected": -1.8095808029174805, "logps/chosen": -1.1147956848144531, "logps/rejected": -1.10584557056427, "loss": 0.692, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.003025440499186516, "rewards/margins": 0.006887376308441162, "rewards/rejected": -0.0038619358092546463, "step": 60 }, { "epoch": 0.23, "grad_norm": 92.66102585547152, "learning_rate": 9.464273976236516e-09, "logits/chosen": -1.7991975545883179, "logits/rejected": -1.7613614797592163, "logps/chosen": -1.0598714351654053, "logps/rejected": -1.0530080795288086, "loss": 0.6927, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0042045507580041885, "rewards/margins": 0.0019652042537927628, "rewards/rejected": 0.002239346969872713, "step": 70 }, { "epoch": 0.27, "grad_norm": 194.03821825764928, "learning_rate": 9.171487073181198e-09, "logits/chosen": -1.8440383672714233, "logits/rejected": -1.8225574493408203, "logps/chosen": -1.0219953060150146, "logps/rejected": -1.0044893026351929, "loss": 0.6914, "rewards/accuracies": 0.625, "rewards/chosen": 0.0038851741701364517, "rewards/margins": 0.009898573160171509, "rewards/rejected": -0.006013398990035057, "step": 80 }, { "epoch": 0.3, "grad_norm": 89.9314461991181, "learning_rate": 8.821868240089676e-09, "logits/chosen": -1.8029550313949585, "logits/rejected": -1.7841434478759766, "logps/chosen": -1.045693039894104, "logps/rejected": -1.0079419612884521, "loss": 0.6912, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.005337424576282501, "rewards/margins": 0.004554492421448231, "rewards/rejected": 0.0007829321548342705, "step": 90 }, { "epoch": 0.33, "grad_norm": 100.68694545007727, "learning_rate": 8.42018064959393e-09, "logits/chosen": -1.8531291484832764, "logits/rejected": -1.8357194662094116, "logps/chosen": -1.078748345375061, "logps/rejected": -1.0813863277435303, "loss": 0.6903, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.007160305976867676, "rewards/margins": 0.007577991578727961, "rewards/rejected": -0.0004176851361989975, "step": 100 }, { "epoch": 0.37, "grad_norm": 91.99753152902545, "learning_rate": 7.971896853961043e-09, "logits/chosen": -1.8346866369247437, "logits/rejected": -1.8052761554718018, "logps/chosen": -1.0582704544067383, "logps/rejected": -1.0159928798675537, "loss": 0.6901, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0007529235444962978, "rewards/margins": 0.0036198147572577, "rewards/rejected": -0.004372738301753998, "step": 110 }, { "epoch": 0.4, "grad_norm": 76.03854240063603, "learning_rate": 7.48312422757881e-09, "logits/chosen": -1.8829187154769897, "logits/rejected": -1.8501228094100952, "logps/chosen": -1.0292680263519287, "logps/rejected": -1.019852876663208, "loss": 0.6887, "rewards/accuracies": 0.6875, "rewards/chosen": 0.00865122489631176, "rewards/margins": 0.011260826140642166, "rewards/rejected": -0.0026096017099916935, "step": 120 }, { "epoch": 0.43, "grad_norm": 91.54881621872774, "learning_rate": 6.96052176068713e-09, "logits/chosen": -1.7626311779022217, "logits/rejected": -1.7299768924713135, "logps/chosen": -1.0229580402374268, "logps/rejected": -1.0213210582733154, "loss": 0.6874, "rewards/accuracies": 0.65625, "rewards/chosen": 0.010313736274838448, "rewards/margins": 0.01158633828163147, "rewards/rejected": -0.0012726020067930222, "step": 130 }, { "epoch": 0.47, "grad_norm": 88.02230042083127, "learning_rate": 6.4112093379492135e-09, "logits/chosen": -1.796229600906372, "logits/rejected": -1.7824671268463135, "logps/chosen": -1.078906774520874, "logps/rejected": -1.0356519222259521, "loss": 0.6887, "rewards/accuracies": 0.65625, "rewards/chosen": 0.015195205807685852, "rewards/margins": 0.011107077822089195, "rewards/rejected": 0.004088127985596657, "step": 140 }, { "epoch": 0.5, "grad_norm": 89.70971937959732, "learning_rate": 5.842670737842467e-09, "logits/chosen": -1.8206145763397217, "logits/rejected": -1.788490891456604, "logps/chosen": -1.0580933094024658, "logps/rejected": -1.0225986242294312, "loss": 0.6877, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004929685965180397, "rewards/margins": 0.005628877319395542, "rewards/rejected": -0.0006991913542151451, "step": 150 }, { "epoch": 0.53, "grad_norm": 81.17857900193553, "learning_rate": 5.262651674395798e-09, "logits/chosen": -1.8310163021087646, "logits/rejected": -1.8322261571884155, "logps/chosen": -1.0157067775726318, "logps/rejected": -1.0004805326461792, "loss": 0.6876, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.010493971407413483, "rewards/margins": 0.013428708538413048, "rewards/rejected": -0.0029347380623221397, "step": 160 }, { "epoch": 0.57, "grad_norm": 87.11260929204627, "learning_rate": 4.679054270342702e-09, "logits/chosen": -1.8449046611785889, "logits/rejected": -1.7946765422821045, "logps/chosen": -1.0548999309539795, "logps/rejected": -1.051992654800415, "loss": 0.6869, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.010436683893203735, "rewards/margins": 0.010068513453006744, "rewards/rejected": 0.00036817044019699097, "step": 170 }, { "epoch": 0.6, "grad_norm": 93.41415146032115, "learning_rate": 4.099829399377524e-09, "logits/chosen": -1.8277971744537354, "logits/rejected": -1.7856277227401733, "logps/chosen": -1.0608714818954468, "logps/rejected": -1.0331629514694214, "loss": 0.6855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.015542459674179554, "rewards/margins": 0.01706361211836338, "rewards/rejected": -0.0015211515128612518, "step": 180 }, { "epoch": 0.63, "grad_norm": 91.20164752256586, "learning_rate": 3.532868364233416e-09, "logits/chosen": -1.8144668340682983, "logits/rejected": -1.7934455871582031, "logps/chosen": -1.0488895177841187, "logps/rejected": -1.0484153032302856, "loss": 0.6869, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.015009616501629353, "rewards/margins": 0.011918185278773308, "rewards/rejected": 0.00309143029153347, "step": 190 }, { "epoch": 0.67, "grad_norm": 103.72123679470863, "learning_rate": 2.985895386349233e-09, "logits/chosen": -1.783926248550415, "logits/rejected": -1.7509727478027344, "logps/chosen": -1.033827543258667, "logps/rejected": -1.0074162483215332, "loss": 0.6856, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.018476296216249466, "rewards/margins": 0.019362105056643486, "rewards/rejected": -0.0008858110522851348, "step": 200 }, { "epoch": 0.7, "grad_norm": 86.58629109761597, "learning_rate": 2.4663623718355446e-09, "logits/chosen": -1.842024803161621, "logits/rejected": -1.8078495264053345, "logps/chosen": -1.0763428211212158, "logps/rejected": -1.0434763431549072, "loss": 0.6853, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.018699195235967636, "rewards/margins": 0.017539886757731438, "rewards/rejected": 0.0011593066155910492, "step": 210 }, { "epoch": 0.74, "grad_norm": 94.3943636617196, "learning_rate": 1.9813473874379397e-09, "logits/chosen": -1.732317328453064, "logits/rejected": -1.7312743663787842, "logps/chosen": -1.073425054550171, "logps/rejected": -1.0629937648773193, "loss": 0.6847, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.013539738953113556, "rewards/margins": 0.013838117942214012, "rewards/rejected": -0.00029837898910045624, "step": 220 }, { "epoch": 0.77, "grad_norm": 92.64370783949347, "learning_rate": 1.5374582296511053e-09, "logits/chosen": -1.7242523431777954, "logits/rejected": -1.6965806484222412, "logps/chosen": -1.0366003513336182, "logps/rejected": -0.9941840171813965, "loss": 0.6857, "rewards/accuracies": 0.625, "rewards/chosen": 0.021335098892450333, "rewards/margins": 0.015094568021595478, "rewards/rejected": 0.006240529473870993, "step": 230 }, { "epoch": 0.8, "grad_norm": 91.14853265672438, "learning_rate": 1.1407424007485927e-09, "logits/chosen": -1.881166696548462, "logits/rejected": -1.8595256805419922, "logps/chosen": -1.0768239498138428, "logps/rejected": -1.0254974365234375, "loss": 0.6848, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.021426241844892502, "rewards/margins": 0.021464312449097633, "rewards/rejected": -3.8067344576120377e-05, "step": 240 }, { "epoch": 0.84, "grad_norm": 88.07457325362287, "learning_rate": 7.966047182060226e-10, "logits/chosen": -1.8788058757781982, "logits/rejected": -1.881800889968872, "logps/chosen": -1.0638504028320312, "logps/rejected": -1.028044581413269, "loss": 0.6828, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.018116505816578865, "rewards/margins": 0.015344863757491112, "rewards/rejected": 0.0027716427575796843, "step": 250 }, { "epoch": 0.87, "grad_norm": 99.87255814500503, "learning_rate": 5.097336799988067e-10, "logits/chosen": -1.8842456340789795, "logits/rejected": -1.8642467260360718, "logps/chosen": -1.0772615671157837, "logps/rejected": -1.0665724277496338, "loss": 0.6848, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.023015262559056282, "rewards/margins": 0.02001366578042507, "rewards/rejected": 0.0030015967786312103, "step": 260 }, { "epoch": 0.9, "grad_norm": 107.74820920122671, "learning_rate": 2.840375889663871e-10, "logits/chosen": -1.8729417324066162, "logits/rejected": -1.8372119665145874, "logps/chosen": -0.9936866760253906, "logps/rejected": -1.0166294574737549, "loss": 0.6847, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.01751965843141079, "rewards/margins": 0.02036522701382637, "rewards/rejected": -0.0028455646242946386, "step": 270 }, { "epoch": 0.94, "grad_norm": 87.56807066875254, "learning_rate": 1.2259130647833627e-10, "logits/chosen": -1.7625566720962524, "logits/rejected": -1.7224514484405518, "logps/chosen": -1.093827724456787, "logps/rejected": -1.0462143421173096, "loss": 0.6834, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.01200016774237156, "rewards/margins": 0.019106844440102577, "rewards/rejected": -0.0071066757664084435, "step": 280 }, { "epoch": 0.97, "grad_norm": 81.16688874976322, "learning_rate": 2.7594360825166643e-11, "logits/chosen": -1.780118703842163, "logits/rejected": -1.7672898769378662, "logps/chosen": -1.0616685152053833, "logps/rejected": -1.0320245027542114, "loss": 0.6845, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.028133947402238846, "rewards/margins": 0.020768892019987106, "rewards/rejected": 0.007365054450929165, "step": 290 }, { "epoch": 1.0, "step": 299, "total_flos": 0.0, "train_loss": 0.6880149315033469, "train_runtime": 4603.368, "train_samples_per_second": 8.317, "train_steps_per_second": 0.065 } ], "logging_steps": 10, "max_steps": 299, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }