{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 5.960413904919439, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.06863687932491302, "logits/rejected": 0.14140453934669495, "logps/chosen": -1.7160797119140625, "logps/rejected": -1.8894574642181396, "loss": 0.7285, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.7160797119140625, "rewards/margins": 0.1733776032924652, "rewards/rejected": -1.8894574642181396, "sft_loss": 1.4685349464416504, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 10.283120910113055, "learning_rate": 1.7825311942959e-08, "logits/chosen": -0.007261426188051701, "logits/rejected": 0.11421868950128555, "logps/chosen": -1.8034416437149048, "logps/rejected": -1.8454005718231201, "loss": 0.8198, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8034416437149048, "rewards/margins": 0.041958972811698914, "rewards/rejected": -1.8454005718231201, "sft_loss": 1.5082828998565674, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 10.95594283910855, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.03530733659863472, "logits/rejected": 0.0647643432021141, "logps/chosen": -1.6338050365447998, "logps/rejected": -1.7646713256835938, "loss": 0.8026, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6338050365447998, "rewards/margins": 0.13086609542369843, "rewards/rejected": -1.7646713256835938, "sft_loss": 1.5001604557037354, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 6.14709723690152, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.046555109322071075, "logits/rejected": 0.0392180010676384, "logps/chosen": -1.7248073816299438, "logps/rejected": -1.8061683177947998, "loss": 0.8257, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7248073816299438, "rewards/margins": 0.08136085420846939, "rewards/rejected": -1.8061683177947998, "sft_loss": 1.5003635883331299, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 16.53309947885159, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.06515147536993027, "logits/rejected": 0.021384865045547485, "logps/chosen": -1.8690541982650757, "logps/rejected": -1.7782766819000244, "loss": 0.9645, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -1.8690541982650757, "rewards/margins": -0.09077732264995575, "rewards/rejected": -1.7782766819000244, "sft_loss": 1.5456018447875977, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 9.241174984603704, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.08728420734405518, "logits/rejected": 0.006704470608383417, "logps/chosen": -1.9081052541732788, "logps/rejected": -1.8311151266098022, "loss": 0.8797, "rewards/accuracies": 0.4375, "rewards/chosen": -1.9081052541732788, "rewards/margins": -0.07699020951986313, "rewards/rejected": -1.8311151266098022, "sft_loss": 1.6464630365371704, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 10.141825796126035, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.06290142983198166, "logits/rejected": 0.09779242426156998, "logps/chosen": -1.84554922580719, "logps/rejected": -1.9966566562652588, "loss": 0.8619, "rewards/accuracies": 0.5, "rewards/chosen": -1.84554922580719, "rewards/margins": 0.15110749006271362, "rewards/rejected": -1.9966566562652588, "sft_loss": 1.5615030527114868, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 9.60539728118551, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.018712041899561882, "logits/rejected": 0.19298934936523438, "logps/chosen": -1.8813413381576538, "logps/rejected": -1.7433903217315674, "loss": 0.907, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.8813413381576538, "rewards/margins": -0.1379513442516327, "rewards/rejected": -1.7433903217315674, "sft_loss": 1.51904296875, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 15.279796660776759, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.021166115999221802, "logits/rejected": 0.2210729420185089, "logps/chosen": -1.83597731590271, "logps/rejected": -1.8712654113769531, "loss": 0.8692, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.83597731590271, "rewards/margins": 0.03528793901205063, "rewards/rejected": -1.8712654113769531, "sft_loss": 1.5357192754745483, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 12.369890038233116, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.04084717482328415, "logits/rejected": 0.11164959520101547, "logps/chosen": -1.8961776494979858, "logps/rejected": -1.7771739959716797, "loss": 0.9233, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.8961776494979858, "rewards/margins": -0.11900367587804794, "rewards/rejected": -1.7771739959716797, "sft_loss": 1.5827200412750244, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 8.172530142387867, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.11221468448638916, "logits/rejected": 0.11364294588565826, "logps/chosen": -1.830999732017517, "logps/rejected": -1.8655872344970703, "loss": 0.8864, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.830999732017517, "rewards/margins": 0.034587521106004715, "rewards/rejected": -1.8655872344970703, "sft_loss": 1.582080602645874, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 7.708516139332287, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.0882994756102562, "logits/rejected": 0.10581319034099579, "logps/chosen": -1.786087989807129, "logps/rejected": -1.8903782367706299, "loss": 0.7979, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.786087989807129, "rewards/margins": 0.10429023206233978, "rewards/rejected": -1.8903782367706299, "sft_loss": 1.542878270149231, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 6.784833173876897, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.021900106221437454, "logits/rejected": 0.12525932490825653, "logps/chosen": -1.6341216564178467, "logps/rejected": -1.7630643844604492, "loss": 0.7555, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.6341216564178467, "rewards/margins": 0.128942608833313, "rewards/rejected": -1.7630643844604492, "sft_loss": 1.473035454750061, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 12.066547435002901, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.06697101891040802, "logits/rejected": 0.08831731230020523, "logps/chosen": -1.7613502740859985, "logps/rejected": -1.8077118396759033, "loss": 0.8785, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.7613502740859985, "rewards/margins": 0.04636149853467941, "rewards/rejected": -1.8077118396759033, "sft_loss": 1.6272966861724854, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 13.435717052234807, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.058324266225099564, "logits/rejected": 0.12552091479301453, "logps/chosen": -1.7716388702392578, "logps/rejected": -2.032058000564575, "loss": 0.7572, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.7716388702392578, "rewards/margins": 0.26041918992996216, "rewards/rejected": -2.032058000564575, "sft_loss": 1.5637356042861938, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 9.171672146207309, "learning_rate": 1.42602495543672e-07, "logits/chosen": -0.010003057308495045, "logits/rejected": 0.09709867835044861, "logps/chosen": -1.7084449529647827, "logps/rejected": -1.7419836521148682, "loss": 0.8303, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7084449529647827, "rewards/margins": 0.03353862091898918, "rewards/rejected": -1.7419836521148682, "sft_loss": 1.5209019184112549, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 6.253780556003053, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.12957318127155304, "logits/rejected": 0.1263594627380371, "logps/chosen": -1.7734349966049194, "logps/rejected": -1.947437047958374, "loss": 0.7969, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.7734349966049194, "rewards/margins": 0.17400197684764862, "rewards/rejected": -1.947437047958374, "sft_loss": 1.4895031452178955, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 15.574243557399985, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.10436828434467316, "logits/rejected": 0.06782497465610504, "logps/chosen": -1.7253191471099854, "logps/rejected": -1.758958101272583, "loss": 0.8626, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.7253191471099854, "rewards/margins": 0.03363896161317825, "rewards/rejected": -1.758958101272583, "sft_loss": 1.4510555267333984, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 6.8301329645871744, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.07905842363834381, "logits/rejected": 0.0742034763097763, "logps/chosen": -1.7728517055511475, "logps/rejected": -1.8891079425811768, "loss": 0.8211, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.7728517055511475, "rewards/margins": 0.1162562221288681, "rewards/rejected": -1.8891079425811768, "sft_loss": 1.513932228088379, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 5.552900149636991, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.038597553968429565, "logits/rejected": 0.027492264285683632, "logps/chosen": -1.6718080043792725, "logps/rejected": -1.7776873111724854, "loss": 0.7783, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.6718080043792725, "rewards/margins": 0.1058792918920517, "rewards/rejected": -1.7776873111724854, "sft_loss": 1.4830631017684937, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 10.33948626839002, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.03208022564649582, "logits/rejected": 0.06198335438966751, "logps/chosen": -1.6234149932861328, "logps/rejected": -1.7916975021362305, "loss": 0.75, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.6234149932861328, "rewards/margins": 0.16828235983848572, "rewards/rejected": -1.7916975021362305, "sft_loss": 1.4286878108978271, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 6.843753554089203, "learning_rate": 1.96078431372549e-07, "logits/chosen": -0.016793223097920418, "logits/rejected": 0.08014251291751862, "logps/chosen": -1.6443202495574951, "logps/rejected": -1.7000795602798462, "loss": 0.8239, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.6443202495574951, "rewards/margins": 0.05575937032699585, "rewards/rejected": -1.7000795602798462, "sft_loss": 1.4516596794128418, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 11.566384576174354, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.03288597613573074, "logits/rejected": 0.24429932236671448, "logps/chosen": -1.6173032522201538, "logps/rejected": -1.8868554830551147, "loss": 0.7148, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6173032522201538, "rewards/margins": 0.2695521414279938, "rewards/rejected": -1.8868554830551147, "sft_loss": 1.541107177734375, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 6.913730001102249, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.09573063254356384, "logits/rejected": 0.07752221822738647, "logps/chosen": -1.6756315231323242, "logps/rejected": -1.7910346984863281, "loss": 0.7763, "rewards/accuracies": 0.5, "rewards/chosen": -1.6756315231323242, "rewards/margins": 0.11540316045284271, "rewards/rejected": -1.7910346984863281, "sft_loss": 1.5277470350265503, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 6.1554867852846415, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.08294974267482758, "logits/rejected": 0.050920598208904266, "logps/chosen": -1.6054375171661377, "logps/rejected": -1.5668278932571411, "loss": 0.8437, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.6054375171661377, "rewards/margins": -0.03860952705144882, "rewards/rejected": -1.5668278932571411, "sft_loss": 1.4991239309310913, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 9.989171102195591, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.03443840518593788, "logits/rejected": 0.1720113605260849, "logps/chosen": -1.6433916091918945, "logps/rejected": -1.7639652490615845, "loss": 0.7318, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.6433916091918945, "rewards/margins": 0.12057371437549591, "rewards/rejected": -1.7639652490615845, "sft_loss": 1.5579571723937988, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 16.717284162597252, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.05925334617495537, "logits/rejected": 0.0586424246430397, "logps/chosen": -1.6929420232772827, "logps/rejected": -1.7211458683013916, "loss": 0.8289, "rewards/accuracies": 0.5, "rewards/chosen": -1.6929420232772827, "rewards/margins": 0.028203705325722694, "rewards/rejected": -1.7211458683013916, "sft_loss": 1.4951034784317017, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 10.788978294247835, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.04839114844799042, "logits/rejected": 0.11875492334365845, "logps/chosen": -1.6551433801651, "logps/rejected": -1.7841171026229858, "loss": 0.7597, "rewards/accuracies": 0.5, "rewards/chosen": -1.6551433801651, "rewards/margins": 0.12897393107414246, "rewards/rejected": -1.7841171026229858, "sft_loss": 1.5421087741851807, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 10.497061215269742, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.02363504469394684, "logits/rejected": 0.13335788249969482, "logps/chosen": -1.5652744770050049, "logps/rejected": -1.6764402389526367, "loss": 0.7541, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.5652744770050049, "rewards/margins": 0.1111656203866005, "rewards/rejected": -1.6764402389526367, "sft_loss": 1.4905664920806885, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 11.824220911593985, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.06564657390117645, "logits/rejected": 0.09819936007261276, "logps/chosen": -1.5161938667297363, "logps/rejected": -1.5157290697097778, "loss": 0.8118, "rewards/accuracies": 0.5, "rewards/chosen": -1.5161938667297363, "rewards/margins": -0.00046480895252898335, "rewards/rejected": -1.5157290697097778, "sft_loss": 1.3504148721694946, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 9.777524019352478, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.052200376987457275, "logits/rejected": 0.000967084604781121, "logps/chosen": -1.5276451110839844, "logps/rejected": -1.6252696514129639, "loss": 0.7564, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5276451110839844, "rewards/margins": 0.09762442111968994, "rewards/rejected": -1.6252696514129639, "sft_loss": 1.4340415000915527, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 10.105912104578321, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.15139448642730713, "logits/rejected": -0.006771622691303492, "logps/chosen": -1.638725996017456, "logps/rejected": -1.6153501272201538, "loss": 0.8533, "rewards/accuracies": 0.46875, "rewards/chosen": -1.638725996017456, "rewards/margins": -0.02337591163814068, "rewards/rejected": -1.6153501272201538, "sft_loss": 1.4840831756591797, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 9.03891423231266, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.08140228688716888, "logits/rejected": 0.08782564848661423, "logps/chosen": -1.4777276515960693, "logps/rejected": -1.598278284072876, "loss": 0.7631, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.4777276515960693, "rewards/margins": 0.12055070698261261, "rewards/rejected": -1.598278284072876, "sft_loss": 1.3730732202529907, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 17.13980495820537, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.08505090326070786, "logits/rejected": -0.03041454777121544, "logps/chosen": -1.5955283641815186, "logps/rejected": -1.6537210941314697, "loss": 0.7996, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.5955283641815186, "rewards/margins": 0.058192551136016846, "rewards/rejected": -1.6537210941314697, "sft_loss": 1.4852439165115356, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 9.642144891608261, "learning_rate": 3.1194295900178254e-07, "logits/chosen": 0.038266055285930634, "logits/rejected": 0.0400555320084095, "logps/chosen": -1.4493739604949951, "logps/rejected": -1.5499675273895264, "loss": 0.7664, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.4493739604949951, "rewards/margins": 0.10059352964162827, "rewards/rejected": -1.5499675273895264, "sft_loss": 1.4198567867279053, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 9.1649894632413, "learning_rate": 3.2085561497326203e-07, "logits/chosen": -0.012896222993731499, "logits/rejected": -0.012625837698578835, "logps/chosen": -1.4367154836654663, "logps/rejected": -1.6282215118408203, "loss": 0.745, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4367154836654663, "rewards/margins": 0.1915060132741928, "rewards/rejected": -1.6282215118408203, "sft_loss": 1.4140430688858032, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 8.447001689686841, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.17267711460590363, "logits/rejected": -0.08615531027317047, "logps/chosen": -1.3968846797943115, "logps/rejected": -1.459826946258545, "loss": 0.799, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3968846797943115, "rewards/margins": 0.06294231116771698, "rewards/rejected": -1.459826946258545, "sft_loss": 1.3795416355133057, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 10.92962736637001, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.09284885972738266, "logits/rejected": 0.022700410336256027, "logps/chosen": -1.3386284112930298, "logps/rejected": -1.4749207496643066, "loss": 0.7308, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3386284112930298, "rewards/margins": 0.13629236817359924, "rewards/rejected": -1.4749207496643066, "sft_loss": 1.3290035724639893, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 6.43306218127895, "learning_rate": 3.475935828877005e-07, "logits/chosen": -0.00904055405408144, "logits/rejected": 0.14529991149902344, "logps/chosen": -1.2881543636322021, "logps/rejected": -1.4579533338546753, "loss": 0.7028, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2881543636322021, "rewards/margins": 0.1697990745306015, "rewards/rejected": -1.4579533338546753, "sft_loss": 1.3130781650543213, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 16.45987301331402, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.11670553684234619, "logits/rejected": 0.017344793304800987, "logps/chosen": -1.4145252704620361, "logps/rejected": -1.4540348052978516, "loss": 0.7736, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4145252704620361, "rewards/margins": 0.03950957953929901, "rewards/rejected": -1.4540348052978516, "sft_loss": 1.4150238037109375, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 14.278219923361767, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.09433043003082275, "logits/rejected": 0.04523424059152603, "logps/chosen": -1.3298991918563843, "logps/rejected": -1.4006279706954956, "loss": 0.76, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3298991918563843, "rewards/margins": 0.07072871923446655, "rewards/rejected": -1.4006279706954956, "sft_loss": 1.3061649799346924, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 10.534106245791639, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.17358310520648956, "logits/rejected": 0.012704399414360523, "logps/chosen": -1.4094555377960205, "logps/rejected": -1.5506460666656494, "loss": 0.7436, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4094555377960205, "rewards/margins": 0.1411905735731125, "rewards/rejected": -1.5506460666656494, "sft_loss": 1.3683921098709106, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 9.099909343280414, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.19610531628131866, "logits/rejected": 0.04747116565704346, "logps/chosen": -1.4265471696853638, "logps/rejected": -1.502264142036438, "loss": 0.7316, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4265471696853638, "rewards/margins": 0.07571707665920258, "rewards/rejected": -1.502264142036438, "sft_loss": 1.3935734033584595, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 16.994371531459464, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.024409957230091095, "logits/rejected": 0.12192012369632721, "logps/chosen": -1.3803361654281616, "logps/rejected": -1.5626693964004517, "loss": 0.7115, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3803361654281616, "rewards/margins": 0.1823331117630005, "rewards/rejected": -1.5626693964004517, "sft_loss": 1.3772460222244263, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 6.560055258862408, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.11784350872039795, "logits/rejected": 0.04851361736655235, "logps/chosen": -1.3842450380325317, "logps/rejected": -1.5287258625030518, "loss": 0.706, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3842450380325317, "rewards/margins": 0.14448070526123047, "rewards/rejected": -1.5287258625030518, "sft_loss": 1.3530828952789307, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 6.752363954250565, "learning_rate": 4.09982174688057e-07, "logits/chosen": -0.03928910568356514, "logits/rejected": 0.036488309502601624, "logps/chosen": -1.4256141185760498, "logps/rejected": -1.5976355075836182, "loss": 0.7152, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4256141185760498, "rewards/margins": 0.17202138900756836, "rewards/rejected": -1.5976355075836182, "sft_loss": 1.3297736644744873, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 13.272456740531851, "learning_rate": 4.188948306595365e-07, "logits/chosen": -0.01150902546942234, "logits/rejected": 0.12112773954868317, "logps/chosen": -1.3811085224151611, "logps/rejected": -1.5527503490447998, "loss": 0.6883, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3811085224151611, "rewards/margins": 0.17164193093776703, "rewards/rejected": -1.5527503490447998, "sft_loss": 1.3338648080825806, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 5.611535575434795, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.04383648559451103, "logits/rejected": 0.07974930107593536, "logps/chosen": -1.3846567869186401, "logps/rejected": -1.582404613494873, "loss": 0.7098, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3846567869186401, "rewards/margins": 0.19774766266345978, "rewards/rejected": -1.582404613494873, "sft_loss": 1.3970749378204346, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 8.21392188565044, "learning_rate": 4.3672014260249554e-07, "logits/chosen": -0.00035706907510757446, "logits/rejected": 0.11412318050861359, "logps/chosen": -1.4996440410614014, "logps/rejected": -1.5583584308624268, "loss": 0.7687, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4996440410614014, "rewards/margins": 0.058714479207992554, "rewards/rejected": -1.5583584308624268, "sft_loss": 1.4782047271728516, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 16.18873353125527, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.07753191888332367, "logits/rejected": 0.0833100825548172, "logps/chosen": -1.424626111984253, "logps/rejected": -1.492189884185791, "loss": 0.7836, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.424626111984253, "rewards/margins": 0.06756364554166794, "rewards/rejected": -1.492189884185791, "sft_loss": 1.3677934408187866, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 10.420223868825996, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.06023973226547241, "logits/rejected": 0.07743427902460098, "logps/chosen": -1.359100341796875, "logps/rejected": -1.4964516162872314, "loss": 0.7151, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.359100341796875, "rewards/margins": 0.1373511254787445, "rewards/rejected": -1.4964516162872314, "sft_loss": 1.2914539575576782, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 7.087682456530805, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.2527022659778595, "logits/rejected": -0.1475326120853424, "logps/chosen": -1.4745581150054932, "logps/rejected": -1.6276257038116455, "loss": 0.6963, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4745581150054932, "rewards/margins": 0.15306781232357025, "rewards/rejected": -1.6276257038116455, "sft_loss": 1.4316717386245728, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 11.224112493004597, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.11539869010448456, "logits/rejected": -0.03174302354454994, "logps/chosen": -1.469599962234497, "logps/rejected": -1.6331703662872314, "loss": 0.7424, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.469599962234497, "rewards/margins": 0.16357028484344482, "rewards/rejected": -1.6331703662872314, "sft_loss": 1.4724925756454468, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 5.68586018826389, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.10837896913290024, "logits/rejected": 0.01979227364063263, "logps/chosen": -1.3989070653915405, "logps/rejected": -1.5062487125396729, "loss": 0.7324, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3989070653915405, "rewards/margins": 0.10734158754348755, "rewards/rejected": -1.5062487125396729, "sft_loss": 1.3805054426193237, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 9.169435724630166, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.03658795356750488, "logits/rejected": 0.060106754302978516, "logps/chosen": -1.3430955410003662, "logps/rejected": -1.5379518270492554, "loss": 0.7119, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3430955410003662, "rewards/margins": 0.19485625624656677, "rewards/rejected": -1.5379518270492554, "sft_loss": 1.3056552410125732, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 11.642343753910186, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.10649528354406357, "logits/rejected": 0.050676681101322174, "logps/chosen": -1.4193177223205566, "logps/rejected": -1.532428503036499, "loss": 0.744, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4193177223205566, "rewards/margins": 0.11311081796884537, "rewards/rejected": -1.532428503036499, "sft_loss": 1.3757294416427612, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 9.477273557866901, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.08700501173734665, "logits/rejected": 0.05256549268960953, "logps/chosen": -1.4146549701690674, "logps/rejected": -1.521597146987915, "loss": 0.7508, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4146549701690674, "rewards/margins": 0.10694216191768646, "rewards/rejected": -1.521597146987915, "sft_loss": 1.4378408193588257, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 7.858316458995477, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.1313084065914154, "logits/rejected": 0.16333989799022675, "logps/chosen": -1.4311192035675049, "logps/rejected": -1.5807135105133057, "loss": 0.7008, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4311192035675049, "rewards/margins": 0.14959420263767242, "rewards/rejected": -1.5807135105133057, "sft_loss": 1.408268928527832, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 10.722054726747858, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.07545675337314606, "logits/rejected": -0.0185849629342556, "logps/chosen": -1.356143832206726, "logps/rejected": -1.4903559684753418, "loss": 0.7084, "rewards/accuracies": 0.5625, "rewards/chosen": -1.356143832206726, "rewards/margins": 0.13421215116977692, "rewards/rejected": -1.4903559684753418, "sft_loss": 1.3286858797073364, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 8.396936759589622, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.07630442082881927, "logits/rejected": 0.09307421743869781, "logps/chosen": -1.3904763460159302, "logps/rejected": -1.4916353225708008, "loss": 0.7406, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3904763460159302, "rewards/margins": 0.10115914046764374, "rewards/rejected": -1.4916353225708008, "sft_loss": 1.4179452657699585, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 7.5461255699415615, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.021782396361231804, "logits/rejected": 0.0503949336707592, "logps/chosen": -1.5006964206695557, "logps/rejected": -1.5041887760162354, "loss": 0.8034, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.5006964206695557, "rewards/margins": 0.0034922778140753508, "rewards/rejected": -1.5041887760162354, "sft_loss": 1.470004916191101, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 10.379361247174433, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.20494525134563446, "logits/rejected": -0.11462219059467316, "logps/chosen": -1.4621378183364868, "logps/rejected": -1.5664069652557373, "loss": 0.7687, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.4621378183364868, "rewards/margins": 0.1042691320180893, "rewards/rejected": -1.5664069652557373, "sft_loss": 1.4343583583831787, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 12.206956445803208, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.0364091582596302, "logits/rejected": 0.12094012647867203, "logps/chosen": -1.4632132053375244, "logps/rejected": -1.6362049579620361, "loss": 0.7321, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4632132053375244, "rewards/margins": 0.17299169301986694, "rewards/rejected": -1.6362049579620361, "sft_loss": 1.4501442909240723, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 7.8218244456780175, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.06194036453962326, "logits/rejected": 0.0724940076470375, "logps/chosen": -1.4121302366256714, "logps/rejected": -1.473226547241211, "loss": 0.7475, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.4121302366256714, "rewards/margins": 0.06109621003270149, "rewards/rejected": -1.473226547241211, "sft_loss": 1.4019798040390015, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 10.966456850479648, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.13376030325889587, "logits/rejected": -0.014714968390762806, "logps/chosen": -1.4268195629119873, "logps/rejected": -1.7038514614105225, "loss": 0.6965, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4268195629119873, "rewards/margins": 0.2770320475101471, "rewards/rejected": -1.7038514614105225, "sft_loss": 1.4688560962677002, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 13.122909386225336, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.014597897417843342, "logits/rejected": 0.13579130172729492, "logps/chosen": -1.4340598583221436, "logps/rejected": -1.6640983819961548, "loss": 0.6872, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4340598583221436, "rewards/margins": 0.23003847897052765, "rewards/rejected": -1.6640983819961548, "sft_loss": 1.4217156171798706, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 14.708678526323988, "learning_rate": 5.971479500891266e-07, "logits/chosen": 0.042291101068258286, "logits/rejected": 0.14999577403068542, "logps/chosen": -1.467124581336975, "logps/rejected": -1.5221842527389526, "loss": 0.7524, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.467124581336975, "rewards/margins": 0.05505945533514023, "rewards/rejected": -1.5221842527389526, "sft_loss": 1.4110476970672607, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 13.006814045736004, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.030298244208097458, "logits/rejected": 0.11877082288265228, "logps/chosen": -1.5452629327774048, "logps/rejected": -1.641233205795288, "loss": 0.776, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.5452629327774048, "rewards/margins": 0.09597032517194748, "rewards/rejected": -1.641233205795288, "sft_loss": 1.4715862274169922, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 14.681577501181096, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.03941451385617256, "logits/rejected": 0.06996399164199829, "logps/chosen": -1.4516140222549438, "logps/rejected": -1.6157200336456299, "loss": 0.7219, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4516140222549438, "rewards/margins": 0.16410626471042633, "rewards/rejected": -1.6157200336456299, "sft_loss": 1.43868088722229, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 11.93474643394763, "learning_rate": 6.238859180035651e-07, "logits/chosen": 0.0037063672207295895, "logits/rejected": 0.09875715523958206, "logps/chosen": -1.3943617343902588, "logps/rejected": -1.522147536277771, "loss": 0.7419, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3943617343902588, "rewards/margins": 0.12778589129447937, "rewards/rejected": -1.522147536277771, "sft_loss": 1.4115302562713623, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 8.821079935796888, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.10563834756612778, "logits/rejected": 0.11631409823894501, "logps/chosen": -1.481509804725647, "logps/rejected": -1.5383491516113281, "loss": 0.7743, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.481509804725647, "rewards/margins": 0.05683939531445503, "rewards/rejected": -1.5383491516113281, "sft_loss": 1.471835732460022, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 9.718394718090156, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.07945707440376282, "logits/rejected": 0.0006481289747171104, "logps/chosen": -1.433421015739441, "logps/rejected": -1.5861304998397827, "loss": 0.7337, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.433421015739441, "rewards/margins": 0.1527094841003418, "rewards/rejected": -1.5861304998397827, "sft_loss": 1.376868486404419, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 15.56970534009352, "learning_rate": 6.506238859180035e-07, "logits/chosen": -0.010357332415878773, "logits/rejected": 0.07142248749732971, "logps/chosen": -1.4000120162963867, "logps/rejected": -1.4971468448638916, "loss": 0.7601, "rewards/accuracies": 0.625, "rewards/chosen": -1.4000120162963867, "rewards/margins": 0.09713465720415115, "rewards/rejected": -1.4971468448638916, "sft_loss": 1.339832067489624, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 11.018058444333253, "learning_rate": 6.59536541889483e-07, "logits/chosen": -0.04047941416501999, "logits/rejected": 0.05441926792263985, "logps/chosen": -1.3855236768722534, "logps/rejected": -1.4579277038574219, "loss": 0.7709, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3855236768722534, "rewards/margins": 0.07240404933691025, "rewards/rejected": -1.4579277038574219, "sft_loss": 1.3380168676376343, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 13.123972784133054, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.08183223009109497, "logits/rejected": 0.07531268894672394, "logps/chosen": -1.3839911222457886, "logps/rejected": -1.5580480098724365, "loss": 0.7248, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3839911222457886, "rewards/margins": 0.17405681312084198, "rewards/rejected": -1.5580480098724365, "sft_loss": 1.3942081928253174, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 6.492217241230478, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.06026136875152588, "logits/rejected": 0.022083550691604614, "logps/chosen": -1.417571783065796, "logps/rejected": -1.6119505167007446, "loss": 0.6757, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.417571783065796, "rewards/margins": 0.19437862932682037, "rewards/rejected": -1.6119505167007446, "sft_loss": 1.3851673603057861, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 5.285375790935912, "learning_rate": 6.862745098039216e-07, "logits/chosen": -0.004461781587451696, "logits/rejected": 0.0782126933336258, "logps/chosen": -1.5011136531829834, "logps/rejected": -1.5273463726043701, "loss": 0.7812, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.5011136531829834, "rewards/margins": 0.026232635602355003, "rewards/rejected": -1.5273463726043701, "sft_loss": 1.4751291275024414, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 12.608344518189268, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.048131298273801804, "logits/rejected": 0.21444062888622284, "logps/chosen": -1.5212723016738892, "logps/rejected": -1.623795747756958, "loss": 0.7575, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5212723016738892, "rewards/margins": 0.10252338647842407, "rewards/rejected": -1.623795747756958, "sft_loss": 1.47605299949646, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 7.482378689086901, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.0485985204577446, "logits/rejected": 0.11298723518848419, "logps/chosen": -1.476596474647522, "logps/rejected": -1.5395801067352295, "loss": 0.7418, "rewards/accuracies": 0.53125, "rewards/chosen": -1.476596474647522, "rewards/margins": 0.06298379600048065, "rewards/rejected": -1.5395801067352295, "sft_loss": 1.4238179922103882, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 8.087243389889734, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.06979383528232574, "logits/rejected": 0.1674462854862213, "logps/chosen": -1.4954628944396973, "logps/rejected": -1.612022042274475, "loss": 0.7198, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4954628944396973, "rewards/margins": 0.11655900627374649, "rewards/rejected": -1.612022042274475, "sft_loss": 1.4028011560440063, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.2917894423007965, "eval_logits/rejected": 0.3831002116203308, "eval_logps/chosen": -1.5116333961486816, "eval_logps/rejected": -1.6721185445785522, "eval_loss": 0.7212684750556946, "eval_rewards/accuracies": 0.5556379556655884, "eval_rewards/chosen": -1.5116333961486816, "eval_rewards/margins": 0.16048528254032135, "eval_rewards/rejected": -1.6721185445785522, "eval_runtime": 49.8544, "eval_samples_per_second": 26.979, "eval_sft_loss": 1.442051887512207, "eval_steps_per_second": 6.76, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 7.673976548115488, "learning_rate": 7.219251336898395e-07, "logits/chosen": 0.0018800109392032027, "logits/rejected": 0.09711170196533203, "logps/chosen": -1.5108472108840942, "logps/rejected": -1.6167656183242798, "loss": 0.7613, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.5108472108840942, "rewards/margins": 0.10591830313205719, "rewards/rejected": -1.6167656183242798, "sft_loss": 1.4360049962997437, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 16.658005859941053, "learning_rate": 7.30837789661319e-07, "logits/chosen": 0.012961970642209053, "logits/rejected": 0.14409925043582916, "logps/chosen": -1.442063570022583, "logps/rejected": -1.5798479318618774, "loss": 0.7233, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.442063570022583, "rewards/margins": 0.13778413832187653, "rewards/rejected": -1.5798479318618774, "sft_loss": 1.4210898876190186, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 7.1966242834238425, "learning_rate": 7.397504456327985e-07, "logits/chosen": -0.020494680851697922, "logits/rejected": 0.019982147961854935, "logps/chosen": -1.4275459051132202, "logps/rejected": -1.6003360748291016, "loss": 0.7133, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.4275459051132202, "rewards/margins": 0.172790065407753, "rewards/rejected": -1.6003360748291016, "sft_loss": 1.3976166248321533, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 8.634312077569724, "learning_rate": 7.486631016042781e-07, "logits/chosen": -0.03358606994152069, "logits/rejected": 0.1547229290008545, "logps/chosen": -1.3682522773742676, "logps/rejected": -1.496917963027954, "loss": 0.74, "rewards/accuracies": 0.5, "rewards/chosen": -1.3682522773742676, "rewards/margins": 0.12866561114788055, "rewards/rejected": -1.496917963027954, "sft_loss": 1.3897976875305176, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 9.656064150679228, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.07592150568962097, "logits/rejected": 0.11702374368906021, "logps/chosen": -1.4173481464385986, "logps/rejected": -1.6226460933685303, "loss": 0.6801, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4173481464385986, "rewards/margins": 0.2052977979183197, "rewards/rejected": -1.6226460933685303, "sft_loss": 1.4624885320663452, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 11.2083299615921, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.10108651965856552, "logits/rejected": 0.09405249357223511, "logps/chosen": -1.4314601421356201, "logps/rejected": -1.637756586074829, "loss": 0.6848, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4314601421356201, "rewards/margins": 0.20629656314849854, "rewards/rejected": -1.637756586074829, "sft_loss": 1.4554636478424072, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 9.669481078109088, "learning_rate": 7.754010695187165e-07, "logits/chosen": -0.019327210262417793, "logits/rejected": 0.07081412523984909, "logps/chosen": -1.322218656539917, "logps/rejected": -1.4697790145874023, "loss": 0.695, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.322218656539917, "rewards/margins": 0.1475602388381958, "rewards/rejected": -1.4697790145874023, "sft_loss": 1.3607923984527588, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 10.428163109914133, "learning_rate": 7.84313725490196e-07, "logits/chosen": -0.021354446187615395, "logits/rejected": 0.06936828047037125, "logps/chosen": -1.3904298543930054, "logps/rejected": -1.5355784893035889, "loss": 0.7124, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3904298543930054, "rewards/margins": 0.14514875411987305, "rewards/rejected": -1.5355784893035889, "sft_loss": 1.3935145139694214, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 11.07544413713983, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.07900739461183548, "logits/rejected": 0.03180045634508133, "logps/chosen": -1.4274473190307617, "logps/rejected": -1.6190907955169678, "loss": 0.7381, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4274473190307617, "rewards/margins": 0.19164356589317322, "rewards/rejected": -1.6190907955169678, "sft_loss": 1.435367465019226, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 12.53024445306653, "learning_rate": 8.02139037433155e-07, "logits/chosen": -0.020625609904527664, "logits/rejected": 0.10038242489099503, "logps/chosen": -1.449824571609497, "logps/rejected": -1.6178346872329712, "loss": 0.6905, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.449824571609497, "rewards/margins": 0.16801026463508606, "rewards/rejected": -1.6178346872329712, "sft_loss": 1.3960511684417725, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 16.569495110491154, "learning_rate": 8.110516934046346e-07, "logits/chosen": 0.010531505569815636, "logits/rejected": 0.10226383060216904, "logps/chosen": -1.4113497734069824, "logps/rejected": -1.6499805450439453, "loss": 0.6805, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4113497734069824, "rewards/margins": 0.23863062262535095, "rewards/rejected": -1.6499805450439453, "sft_loss": 1.3804184198379517, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 10.178410801248045, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.14100396633148193, "logits/rejected": -0.019715065136551857, "logps/chosen": -1.5282491445541382, "logps/rejected": -1.6294291019439697, "loss": 0.7417, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5282491445541382, "rewards/margins": 0.10117986053228378, "rewards/rejected": -1.6294291019439697, "sft_loss": 1.5199025869369507, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 13.275624616560746, "learning_rate": 8.288770053475936e-07, "logits/chosen": 0.1582765281200409, "logits/rejected": 0.180677130818367, "logps/chosen": -1.5120041370391846, "logps/rejected": -1.699751853942871, "loss": 0.7239, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5120041370391846, "rewards/margins": 0.18774764239788055, "rewards/rejected": -1.699751853942871, "sft_loss": 1.4534732103347778, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 9.492688236948386, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.16971933841705322, "logits/rejected": 0.12275818735361099, "logps/chosen": -1.4381402730941772, "logps/rejected": -1.641122579574585, "loss": 0.7046, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4381402730941772, "rewards/margins": 0.2029825747013092, "rewards/rejected": -1.641122579574585, "sft_loss": 1.4118239879608154, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 8.759624212285418, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.034986358135938644, "logits/rejected": 0.11416208744049072, "logps/chosen": -1.4491407871246338, "logps/rejected": -1.778569221496582, "loss": 0.6601, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4491407871246338, "rewards/margins": 0.329428493976593, "rewards/rejected": -1.778569221496582, "sft_loss": 1.4499163627624512, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 17.90670295613648, "learning_rate": 8.55614973262032e-07, "logits/chosen": -0.00017474293417762965, "logits/rejected": 0.2095792293548584, "logps/chosen": -1.4264090061187744, "logps/rejected": -1.5594055652618408, "loss": 0.7328, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4264090061187744, "rewards/margins": 0.13299642503261566, "rewards/rejected": -1.5594055652618408, "sft_loss": 1.4186687469482422, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 18.812647759724136, "learning_rate": 8.645276292335115e-07, "logits/chosen": 0.038185965269804, "logits/rejected": 0.08179084956645966, "logps/chosen": -1.5820934772491455, "logps/rejected": -1.686597466468811, "loss": 0.755, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5820934772491455, "rewards/margins": 0.10450379550457001, "rewards/rejected": -1.686597466468811, "sft_loss": 1.5076513290405273, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 8.799166305860778, "learning_rate": 8.734402852049911e-07, "logits/chosen": 0.045380812138319016, "logits/rejected": 0.1188635379076004, "logps/chosen": -1.5420925617218018, "logps/rejected": -1.652745008468628, "loss": 0.7556, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5420925617218018, "rewards/margins": 0.11065268516540527, "rewards/rejected": -1.652745008468628, "sft_loss": 1.461670994758606, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 14.31453780443302, "learning_rate": 8.823529411764705e-07, "logits/chosen": -0.010799932293593884, "logits/rejected": 0.011995360255241394, "logps/chosen": -1.5552622079849243, "logps/rejected": -1.6749614477157593, "loss": 0.7415, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5552622079849243, "rewards/margins": 0.11969934403896332, "rewards/rejected": -1.6749614477157593, "sft_loss": 1.533546805381775, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 9.790672917266049, "learning_rate": 8.912655971479501e-07, "logits/chosen": -0.021091187372803688, "logits/rejected": 0.08316578716039658, "logps/chosen": -1.4617230892181396, "logps/rejected": -1.651269555091858, "loss": 0.7205, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4617230892181396, "rewards/margins": 0.189546599984169, "rewards/rejected": -1.651269555091858, "sft_loss": 1.4185668230056763, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 9.993005038856948, "learning_rate": 9.001782531194295e-07, "logits/chosen": -0.0358644500374794, "logits/rejected": 0.11196329444646835, "logps/chosen": -1.5400495529174805, "logps/rejected": -1.6119331121444702, "loss": 0.7458, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5400495529174805, "rewards/margins": 0.0718836560845375, "rewards/rejected": -1.6119331121444702, "sft_loss": 1.4943927526474, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 7.301408856819212, "learning_rate": 9.09090909090909e-07, "logits/chosen": 0.12172931432723999, "logits/rejected": 0.186055988073349, "logps/chosen": -1.4746077060699463, "logps/rejected": -1.6851272583007812, "loss": 0.6833, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4746077060699463, "rewards/margins": 0.21051974594593048, "rewards/rejected": -1.6851272583007812, "sft_loss": 1.4081268310546875, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 8.16894501894189, "learning_rate": 9.180035650623885e-07, "logits/chosen": 0.08651556074619293, "logits/rejected": 0.18817943334579468, "logps/chosen": -1.3984525203704834, "logps/rejected": -1.5859407186508179, "loss": 0.6888, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3984525203704834, "rewards/margins": 0.18748828768730164, "rewards/rejected": -1.5859407186508179, "sft_loss": 1.3933144807815552, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 6.611398368137036, "learning_rate": 9.26916221033868e-07, "logits/chosen": -0.05394362285733223, "logits/rejected": 0.08871600031852722, "logps/chosen": -1.4527640342712402, "logps/rejected": -1.6061521768569946, "loss": 0.725, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4527640342712402, "rewards/margins": 0.15338827669620514, "rewards/rejected": -1.6061521768569946, "sft_loss": 1.5176591873168945, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 16.311152981957626, "learning_rate": 9.358288770053476e-07, "logits/chosen": 0.14393463730812073, "logits/rejected": 0.221922367811203, "logps/chosen": -1.4483121633529663, "logps/rejected": -1.6912574768066406, "loss": 0.7012, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4483121633529663, "rewards/margins": 0.24294531345367432, "rewards/rejected": -1.6912574768066406, "sft_loss": 1.4978748559951782, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 7.814584788227321, "learning_rate": 9.44741532976827e-07, "logits/chosen": 0.11198891699314117, "logits/rejected": 0.2009311467409134, "logps/chosen": -1.403903603553772, "logps/rejected": -1.5798251628875732, "loss": 0.7275, "rewards/accuracies": 0.5625, "rewards/chosen": -1.403903603553772, "rewards/margins": 0.17592160403728485, "rewards/rejected": -1.5798251628875732, "sft_loss": 1.3508328199386597, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 8.901087935063803, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.05171307176351547, "logits/rejected": 0.2174970805644989, "logps/chosen": -1.4211546182632446, "logps/rejected": -1.5635936260223389, "loss": 0.6988, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4211546182632446, "rewards/margins": 0.1424390822649002, "rewards/rejected": -1.5635936260223389, "sft_loss": 1.3733739852905273, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 6.850510093446501, "learning_rate": 9.62566844919786e-07, "logits/chosen": 0.06330250203609467, "logits/rejected": 0.1406676471233368, "logps/chosen": -1.565857172012329, "logps/rejected": -1.7019507884979248, "loss": 0.7262, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.565857172012329, "rewards/margins": 0.1360936015844345, "rewards/rejected": -1.7019507884979248, "sft_loss": 1.572394609451294, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 7.960687627957905, "learning_rate": 9.714795008912655e-07, "logits/chosen": -0.04025112837553024, "logits/rejected": 0.16977599263191223, "logps/chosen": -1.4802181720733643, "logps/rejected": -1.6751117706298828, "loss": 0.6705, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4802181720733643, "rewards/margins": 0.19489355385303497, "rewards/rejected": -1.6751117706298828, "sft_loss": 1.4630837440490723, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 8.11698133990825, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.12934860587120056, "logits/rejected": 0.20263417065143585, "logps/chosen": -1.5041999816894531, "logps/rejected": -1.6858787536621094, "loss": 0.6857, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5041999816894531, "rewards/margins": 0.18167902529239655, "rewards/rejected": -1.6858787536621094, "sft_loss": 1.4602586030960083, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 14.065711458465282, "learning_rate": 9.893048128342244e-07, "logits/chosen": -0.009817545302212238, "logits/rejected": 0.11904706805944443, "logps/chosen": -1.6011021137237549, "logps/rejected": -1.7125682830810547, "loss": 0.7452, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6011021137237549, "rewards/margins": 0.11146605014801025, "rewards/rejected": -1.7125682830810547, "sft_loss": 1.5662167072296143, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 11.494237494395797, "learning_rate": 9.98217468805704e-07, "logits/chosen": 0.10378079116344452, "logits/rejected": 0.12546811997890472, "logps/chosen": -1.4556339979171753, "logps/rejected": -1.6442264318466187, "loss": 0.6829, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4556339979171753, "rewards/margins": 0.1885923445224762, "rewards/rejected": -1.6442264318466187, "sft_loss": 1.5452286005020142, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 7.026135520570362, "learning_rate": 9.999984476788462e-07, "logits/chosen": 0.07938935607671738, "logits/rejected": 0.14415811002254486, "logps/chosen": -1.5560821294784546, "logps/rejected": -1.753104567527771, "loss": 0.6979, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5560821294784546, "rewards/margins": 0.19702255725860596, "rewards/rejected": -1.753104567527771, "sft_loss": 1.553784966468811, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 13.085706709126768, "learning_rate": 9.999921413906797e-07, "logits/chosen": -0.0006553709390573204, "logits/rejected": 0.23008818924427032, "logps/chosen": -1.5384807586669922, "logps/rejected": -1.6943981647491455, "loss": 0.718, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.5384807586669922, "rewards/margins": 0.15591737627983093, "rewards/rejected": -1.6943981647491455, "sft_loss": 1.565394401550293, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 7.562416716843187, "learning_rate": 9.999809841765644e-07, "logits/chosen": 0.01612495258450508, "logits/rejected": 0.07761206477880478, "logps/chosen": -1.4772237539291382, "logps/rejected": -1.6728931665420532, "loss": 0.7012, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4772237539291382, "rewards/margins": 0.19566944241523743, "rewards/rejected": -1.6728931665420532, "sft_loss": 1.4823495149612427, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 7.27759841113723, "learning_rate": 9.999649761447477e-07, "logits/chosen": 0.02421320043504238, "logits/rejected": 0.18904590606689453, "logps/chosen": -1.4877642393112183, "logps/rejected": -1.7540662288665771, "loss": 0.6545, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4877642393112183, "rewards/margins": 0.2663021683692932, "rewards/rejected": -1.7540662288665771, "sft_loss": 1.4846785068511963, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 8.545572857334468, "learning_rate": 9.999441174505398e-07, "logits/chosen": -0.03330926224589348, "logits/rejected": 0.07859676331281662, "logps/chosen": -1.694360375404358, "logps/rejected": -1.8174225091934204, "loss": 0.7547, "rewards/accuracies": 0.5625, "rewards/chosen": -1.694360375404358, "rewards/margins": 0.12306202948093414, "rewards/rejected": -1.8174225091934204, "sft_loss": 1.6499773263931274, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 20.068562800951632, "learning_rate": 9.999184082963116e-07, "logits/chosen": 0.02297041192650795, "logits/rejected": 0.15485970675945282, "logps/chosen": -1.6264880895614624, "logps/rejected": -1.7158950567245483, "loss": 0.737, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6264880895614624, "rewards/margins": 0.08940695226192474, "rewards/rejected": -1.7158950567245483, "sft_loss": 1.6207870244979858, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 11.548169014320758, "learning_rate": 9.998878489314937e-07, "logits/chosen": 0.08564107120037079, "logits/rejected": 0.22062385082244873, "logps/chosen": -1.5454473495483398, "logps/rejected": -1.7348639965057373, "loss": 0.6962, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5454473495483398, "rewards/margins": 0.18941658735275269, "rewards/rejected": -1.7348639965057373, "sft_loss": 1.540062665939331, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 7.903169561620104, "learning_rate": 9.99852439652573e-07, "logits/chosen": 0.03412085771560669, "logits/rejected": 0.19069956243038177, "logps/chosen": -1.5859081745147705, "logps/rejected": -1.7322183847427368, "loss": 0.7088, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.5859081745147705, "rewards/margins": 0.1463100016117096, "rewards/rejected": -1.7322183847427368, "sft_loss": 1.5760464668273926, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 12.78791682535348, "learning_rate": 9.998121808030904e-07, "logits/chosen": -0.016653526574373245, "logits/rejected": 0.07038216292858124, "logps/chosen": -1.7270981073379517, "logps/rejected": -1.93124520778656, "loss": 0.6967, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.7270981073379517, "rewards/margins": 0.20414705574512482, "rewards/rejected": -1.93124520778656, "sft_loss": 1.6932952404022217, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 19.968400745148678, "learning_rate": 9.997670727736379e-07, "logits/chosen": 0.10672630369663239, "logits/rejected": 0.27668988704681396, "logps/chosen": -1.6665118932724, "logps/rejected": -1.8740952014923096, "loss": 0.6986, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6665118932724, "rewards/margins": 0.20758314430713654, "rewards/rejected": -1.8740952014923096, "sft_loss": 1.6268879175186157, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 6.570304438637934, "learning_rate": 9.99717116001853e-07, "logits/chosen": 0.028351670131087303, "logits/rejected": 0.14044256508350372, "logps/chosen": -1.6373220682144165, "logps/rejected": -1.9211671352386475, "loss": 0.6661, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6373220682144165, "rewards/margins": 0.2838451862335205, "rewards/rejected": -1.9211671352386475, "sft_loss": 1.6178901195526123, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 8.128605670361818, "learning_rate": 9.996623109724173e-07, "logits/chosen": 0.1174500435590744, "logits/rejected": 0.18964903056621552, "logps/chosen": -1.7414772510528564, "logps/rejected": -1.9283758401870728, "loss": 0.7044, "rewards/accuracies": 0.59375, "rewards/chosen": -1.7414772510528564, "rewards/margins": 0.1868988275527954, "rewards/rejected": -1.9283758401870728, "sft_loss": 1.7075878381729126, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 11.036972242567145, "learning_rate": 9.996026582170488e-07, "logits/chosen": 0.1186809092760086, "logits/rejected": 0.2449244260787964, "logps/chosen": -1.6226027011871338, "logps/rejected": -1.9033540487289429, "loss": 0.6386, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6226027011871338, "rewards/margins": 0.28075140714645386, "rewards/rejected": -1.9033540487289429, "sft_loss": 1.6231905221939087, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 10.804970717169716, "learning_rate": 9.995381583144996e-07, "logits/chosen": 0.04840857535600662, "logits/rejected": 0.16239185631275177, "logps/chosen": -1.6699421405792236, "logps/rejected": -1.9459302425384521, "loss": 0.6439, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6699421405792236, "rewards/margins": 0.27598804235458374, "rewards/rejected": -1.9459302425384521, "sft_loss": 1.6243393421173096, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 7.257398978817098, "learning_rate": 9.994688118905471e-07, "logits/chosen": 0.04751784726977348, "logits/rejected": 0.2962645888328552, "logps/chosen": -1.761138677597046, "logps/rejected": -1.9825347661972046, "loss": 0.6997, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.761138677597046, "rewards/margins": 0.22139613330364227, "rewards/rejected": -1.9825347661972046, "sft_loss": 1.7507600784301758, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 18.5328736159243, "learning_rate": 9.993946196179912e-07, "logits/chosen": -0.047339897602796555, "logits/rejected": 0.159702330827713, "logps/chosen": -1.7461141347885132, "logps/rejected": -1.9744231700897217, "loss": 0.7026, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.7461141347885132, "rewards/margins": 0.22830912470817566, "rewards/rejected": -1.9744231700897217, "sft_loss": 1.7676417827606201, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 9.49519419022593, "learning_rate": 9.993155822166455e-07, "logits/chosen": -0.03701364994049072, "logits/rejected": 0.047867387533187866, "logps/chosen": -1.6813396215438843, "logps/rejected": -1.9668573141098022, "loss": 0.6555, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6813396215438843, "rewards/margins": 0.28551778197288513, "rewards/rejected": -1.9668573141098022, "sft_loss": 1.6320127248764038, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 14.110541145250972, "learning_rate": 9.992317004533313e-07, "logits/chosen": 0.023934122174978256, "logits/rejected": 0.16555899381637573, "logps/chosen": -1.8226124048233032, "logps/rejected": -2.125878095626831, "loss": 0.6608, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8226124048233032, "rewards/margins": 0.3032657504081726, "rewards/rejected": -2.125878095626831, "sft_loss": 1.8263565301895142, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 14.501783200578824, "learning_rate": 9.991429751418696e-07, "logits/chosen": 0.0873773917555809, "logits/rejected": 0.10016919672489166, "logps/chosen": -1.7935062646865845, "logps/rejected": -2.083219289779663, "loss": 0.6994, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.7935062646865845, "rewards/margins": 0.2897128462791443, "rewards/rejected": -2.083219289779663, "sft_loss": 1.787320852279663, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 11.901703209101552, "learning_rate": 9.99049407143074e-07, "logits/chosen": 0.058928169310092926, "logits/rejected": 0.18308427929878235, "logps/chosen": -1.7760273218154907, "logps/rejected": -1.9454076290130615, "loss": 0.704, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.7760273218154907, "rewards/margins": 0.16938039660453796, "rewards/rejected": -1.9454076290130615, "sft_loss": 1.7750753164291382, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 7.257840074108619, "learning_rate": 9.989509973647416e-07, "logits/chosen": 0.04965372011065483, "logits/rejected": 0.18989132344722748, "logps/chosen": -1.7178142070770264, "logps/rejected": -1.983058214187622, "loss": 0.6622, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7178142070770264, "rewards/margins": 0.26524391770362854, "rewards/rejected": -1.983058214187622, "sft_loss": 1.7471628189086914, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 9.43646865006057, "learning_rate": 9.988477467616445e-07, "logits/chosen": 0.02612454816699028, "logits/rejected": 0.22704274952411652, "logps/chosen": -1.760839819908142, "logps/rejected": -1.984262228012085, "loss": 0.6536, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.760839819908142, "rewards/margins": 0.22342228889465332, "rewards/rejected": -1.984262228012085, "sft_loss": 1.8557103872299194, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 12.75217129491488, "learning_rate": 9.987396563355205e-07, "logits/chosen": 0.03415621817111969, "logits/rejected": 0.11213034391403198, "logps/chosen": -1.7600212097167969, "logps/rejected": -2.105306625366211, "loss": 0.6401, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.7600212097167969, "rewards/margins": 0.3452851176261902, "rewards/rejected": -2.105306625366211, "sft_loss": 1.8122920989990234, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 14.136886280968675, "learning_rate": 9.986267271350631e-07, "logits/chosen": 0.12487022578716278, "logits/rejected": 0.2956189513206482, "logps/chosen": -1.863404631614685, "logps/rejected": -2.0956015586853027, "loss": 0.7424, "rewards/accuracies": 0.5625, "rewards/chosen": -1.863404631614685, "rewards/margins": 0.23219680786132812, "rewards/rejected": -2.0956015586853027, "sft_loss": 1.8058325052261353, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 23.573937335629136, "learning_rate": 9.985089602559123e-07, "logits/chosen": 0.08845379203557968, "logits/rejected": 0.24961254000663757, "logps/chosen": -1.875475525856018, "logps/rejected": -2.1358890533447266, "loss": 0.6855, "rewards/accuracies": 0.65625, "rewards/chosen": -1.875475525856018, "rewards/margins": 0.2604133188724518, "rewards/rejected": -2.1358890533447266, "sft_loss": 1.8293803930282593, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 10.229793160551953, "learning_rate": 9.983863568406428e-07, "logits/chosen": 0.10676054656505585, "logits/rejected": 0.14324404299259186, "logps/chosen": -1.8982778787612915, "logps/rejected": -2.15161395072937, "loss": 0.6836, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.8982778787612915, "rewards/margins": 0.25333622097969055, "rewards/rejected": -2.15161395072937, "sft_loss": 1.9185289144515991, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 8.294142865496967, "learning_rate": 9.982589180787532e-07, "logits/chosen": 0.06274578720331192, "logits/rejected": 0.1509443074464798, "logps/chosen": -1.7770349979400635, "logps/rejected": -2.099757194519043, "loss": 0.6382, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.7770349979400635, "rewards/margins": 0.3227222263813019, "rewards/rejected": -2.099757194519043, "sft_loss": 1.8025732040405273, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 12.13623780002283, "learning_rate": 9.981266452066553e-07, "logits/chosen": -0.0386006124317646, "logits/rejected": 0.1029144749045372, "logps/chosen": -1.9709434509277344, "logps/rejected": -2.2089548110961914, "loss": 0.6667, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9709434509277344, "rewards/margins": 0.2380111962556839, "rewards/rejected": -2.2089548110961914, "sft_loss": 1.9142709970474243, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 10.247038800111389, "learning_rate": 9.979895395076608e-07, "logits/chosen": -0.0016391247045248747, "logits/rejected": 0.16924506425857544, "logps/chosen": -1.9363353252410889, "logps/rejected": -2.3193938732147217, "loss": 0.6286, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9363353252410889, "rewards/margins": 0.38305872678756714, "rewards/rejected": -2.3193938732147217, "sft_loss": 1.9278072118759155, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 10.824747447247148, "learning_rate": 9.9784760231197e-07, "logits/chosen": 0.10746796429157257, "logits/rejected": 0.20378378033638, "logps/chosen": -1.9353691339492798, "logps/rejected": -2.2340164184570312, "loss": 0.6387, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9353691339492798, "rewards/margins": 0.2986472249031067, "rewards/rejected": -2.2340164184570312, "sft_loss": 1.8967138528823853, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 11.75047292485389, "learning_rate": 9.97700834996658e-07, "logits/chosen": 0.030619556084275246, "logits/rejected": 0.19418159127235413, "logps/chosen": -2.069227933883667, "logps/rejected": -2.339308261871338, "loss": 0.6662, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.069227933883667, "rewards/margins": 0.27007991075515747, "rewards/rejected": -2.339308261871338, "sft_loss": 1.9816768169403076, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 17.695396071152995, "learning_rate": 9.97549238985662e-07, "logits/chosen": 0.09588191658258438, "logits/rejected": 0.28006869554519653, "logps/chosen": -2.1513028144836426, "logps/rejected": -2.4341063499450684, "loss": 0.697, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1513028144836426, "rewards/margins": 0.2828032970428467, "rewards/rejected": -2.4341063499450684, "sft_loss": 2.1287317276000977, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 10.947508153385135, "learning_rate": 9.973928157497674e-07, "logits/chosen": -0.03072550520300865, "logits/rejected": 0.10771825164556503, "logps/chosen": -1.934647798538208, "logps/rejected": -2.3204731941223145, "loss": 0.6085, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.934647798538208, "rewards/margins": 0.3858256936073303, "rewards/rejected": -2.3204731941223145, "sft_loss": 1.9882686138153076, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 19.265460613648305, "learning_rate": 9.972315668065927e-07, "logits/chosen": -0.05715986341238022, "logits/rejected": 0.0929824709892273, "logps/chosen": -2.092118978500366, "logps/rejected": -2.363680124282837, "loss": 0.6823, "rewards/accuracies": 0.65625, "rewards/chosen": -2.092118978500366, "rewards/margins": 0.2715609073638916, "rewards/rejected": -2.363680124282837, "sft_loss": 2.1064469814300537, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 10.638788847537189, "learning_rate": 9.97065493720576e-07, "logits/chosen": -0.022910332307219505, "logits/rejected": 0.07507751882076263, "logps/chosen": -2.084163188934326, "logps/rejected": -2.3188323974609375, "loss": 0.6646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.084163188934326, "rewards/margins": 0.23466899991035461, "rewards/rejected": -2.3188323974609375, "sft_loss": 2.1011803150177, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 15.490968768923501, "learning_rate": 9.968945981029594e-07, "logits/chosen": -0.026397373527288437, "logits/rejected": 0.13944368064403534, "logps/chosen": -2.146524667739868, "logps/rejected": -2.450518846511841, "loss": 0.6567, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.146524667739868, "rewards/margins": 0.3039940893650055, "rewards/rejected": -2.450518846511841, "sft_loss": 2.1449663639068604, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 6.928646909317261, "learning_rate": 9.967188816117726e-07, "logits/chosen": 0.09138406813144684, "logits/rejected": 0.16776150465011597, "logps/chosen": -2.144055128097534, "logps/rejected": -2.5063459873199463, "loss": 0.6832, "rewards/accuracies": 0.625, "rewards/chosen": -2.144055128097534, "rewards/margins": 0.3622905910015106, "rewards/rejected": -2.5063459873199463, "sft_loss": 2.130777359008789, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 17.01523123010893, "learning_rate": 9.965383459518179e-07, "logits/chosen": 0.044910602271556854, "logits/rejected": 0.21089403331279755, "logps/chosen": -2.10158109664917, "logps/rejected": -2.466341972351074, "loss": 0.6458, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.10158109664917, "rewards/margins": 0.3647606372833252, "rewards/rejected": -2.466341972351074, "sft_loss": 2.08829927444458, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 9.971064318414882, "learning_rate": 9.963529928746533e-07, "logits/chosen": 0.0854736715555191, "logits/rejected": 0.22234101593494415, "logps/chosen": -2.100032091140747, "logps/rejected": -2.390237331390381, "loss": 0.6847, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.100032091140747, "rewards/margins": 0.29020509123802185, "rewards/rejected": -2.390237331390381, "sft_loss": 2.0968873500823975, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 6.942490028615991, "learning_rate": 9.961628241785746e-07, "logits/chosen": -0.019798316061496735, "logits/rejected": 0.06015967205166817, "logps/chosen": -2.1238718032836914, "logps/rejected": -2.4312360286712646, "loss": 0.6619, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.1238718032836914, "rewards/margins": 0.30736416578292847, "rewards/rejected": -2.4312360286712646, "sft_loss": 2.1534500122070312, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 8.678598883925492, "learning_rate": 9.959678417085998e-07, "logits/chosen": 0.03256041929125786, "logits/rejected": 0.12763187289237976, "logps/chosen": -2.0395474433898926, "logps/rejected": -2.3078253269195557, "loss": 0.6768, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0395474433898926, "rewards/margins": 0.26827770471572876, "rewards/rejected": -2.3078253269195557, "sft_loss": 2.047736883163452, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 10.086072034634013, "learning_rate": 9.957680473564493e-07, "logits/chosen": 0.1441255360841751, "logits/rejected": 0.2772209346294403, "logps/chosen": -2.025604724884033, "logps/rejected": -2.478121519088745, "loss": 0.6098, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.025604724884033, "rewards/margins": 0.45251646637916565, "rewards/rejected": -2.478121519088745, "sft_loss": 2.0043413639068604, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 6.9563669209630135, "learning_rate": 9.95563443060529e-07, "logits/chosen": -0.027880221605300903, "logits/rejected": 0.15025661885738373, "logps/chosen": -2.039132833480835, "logps/rejected": -2.3555057048797607, "loss": 0.6788, "rewards/accuracies": 0.625, "rewards/chosen": -2.039132833480835, "rewards/margins": 0.3163727819919586, "rewards/rejected": -2.3555057048797607, "sft_loss": 1.9834359884262085, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 10.1422790063844, "learning_rate": 9.95354030805911e-07, "logits/chosen": -0.08938129246234894, "logits/rejected": 0.06203896924853325, "logps/chosen": -2.033430576324463, "logps/rejected": -2.3355135917663574, "loss": 0.6575, "rewards/accuracies": 0.59375, "rewards/chosen": -2.033430576324463, "rewards/margins": 0.302082896232605, "rewards/rejected": -2.3355135917663574, "sft_loss": 2.0773210525512695, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 9.246731434824245, "learning_rate": 9.951398126243133e-07, "logits/chosen": 0.05133030563592911, "logits/rejected": 0.18070444464683533, "logps/chosen": -1.9821866750717163, "logps/rejected": -2.3778305053710938, "loss": 0.6272, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.9821866750717163, "rewards/margins": 0.39564332365989685, "rewards/rejected": -2.3778305053710938, "sft_loss": 1.9979397058486938, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 10.058741885647887, "learning_rate": 9.94920790594082e-07, "logits/chosen": -0.013221639208495617, "logits/rejected": 0.12137987464666367, "logps/chosen": -1.9625238180160522, "logps/rejected": -2.3376636505126953, "loss": 0.6178, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9625238180160522, "rewards/margins": 0.3751398026943207, "rewards/rejected": -2.3376636505126953, "sft_loss": 1.949650526046753, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 11.369308771538593, "learning_rate": 9.946969668401696e-07, "logits/chosen": -0.017232773825526237, "logits/rejected": 0.17944982647895813, "logps/chosen": -2.0125339031219482, "logps/rejected": -2.472736358642578, "loss": 0.6233, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.0125339031219482, "rewards/margins": 0.46020251512527466, "rewards/rejected": -2.472736358642578, "sft_loss": 2.0411736965179443, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 8.691476566345305, "learning_rate": 9.944683435341155e-07, "logits/chosen": 0.0012146488297730684, "logits/rejected": 0.08047197759151459, "logps/chosen": -2.0661959648132324, "logps/rejected": -2.431159496307373, "loss": 0.6256, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.0661959648132324, "rewards/margins": 0.3649637699127197, "rewards/rejected": -2.431159496307373, "sft_loss": 2.0064785480499268, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.3408060073852539, "eval_logits/rejected": 0.43282878398895264, "eval_logps/chosen": -2.1063621044158936, "eval_logps/rejected": -2.51962947845459, "eval_loss": 0.6210696697235107, "eval_rewards/accuracies": 0.6654302477836609, "eval_rewards/chosen": -2.1063621044158936, "eval_rewards/margins": 0.41326722502708435, "eval_rewards/rejected": -2.51962947845459, "eval_runtime": 51.7853, "eval_samples_per_second": 25.973, "eval_sft_loss": 2.064507246017456, "eval_steps_per_second": 6.508, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 11.272499184163516, "learning_rate": 9.942349228940236e-07, "logits/chosen": -0.028055086731910706, "logits/rejected": 0.14415912330150604, "logps/chosen": -2.0782713890075684, "logps/rejected": -2.608614444732666, "loss": 0.5881, "rewards/accuracies": 0.65625, "rewards/chosen": -2.0782713890075684, "rewards/margins": 0.5303429961204529, "rewards/rejected": -2.608614444732666, "sft_loss": 2.0765321254730225, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 10.196181196736118, "learning_rate": 9.939967071845424e-07, "logits/chosen": 0.07339145243167877, "logits/rejected": 0.1465933471918106, "logps/chosen": -2.146256923675537, "logps/rejected": -2.4748787879943848, "loss": 0.6468, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.146256923675537, "rewards/margins": 0.32862168550491333, "rewards/rejected": -2.4748787879943848, "sft_loss": 2.156471014022827, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 12.834641747300186, "learning_rate": 9.937536987168413e-07, "logits/chosen": 0.09053969383239746, "logits/rejected": 0.22762493789196014, "logps/chosen": -2.0983657836914062, "logps/rejected": -2.6281564235687256, "loss": 0.6087, "rewards/accuracies": 0.65625, "rewards/chosen": -2.0983657836914062, "rewards/margins": 0.5297908782958984, "rewards/rejected": -2.6281564235687256, "sft_loss": 2.1565418243408203, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 10.407568591424436, "learning_rate": 9.935058998485896e-07, "logits/chosen": 0.10789525508880615, "logits/rejected": 0.15350130200386047, "logps/chosen": -2.1824188232421875, "logps/rejected": -2.57214617729187, "loss": 0.6614, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1824188232421875, "rewards/margins": 0.3897269070148468, "rewards/rejected": -2.57214617729187, "sft_loss": 2.1535208225250244, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 16.32583749722319, "learning_rate": 9.932533129839333e-07, "logits/chosen": 0.04362216964364052, "logits/rejected": 0.16212865710258484, "logps/chosen": -2.0230681896209717, "logps/rejected": -2.4962611198425293, "loss": 0.6143, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0230681896209717, "rewards/margins": 0.47319287061691284, "rewards/rejected": -2.4962611198425293, "sft_loss": 2.1321260929107666, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 9.679449553639905, "learning_rate": 9.929959405734711e-07, "logits/chosen": 0.10882987082004547, "logits/rejected": 0.27502506971359253, "logps/chosen": -2.0700125694274902, "logps/rejected": -2.392230987548828, "loss": 0.653, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.0700125694274902, "rewards/margins": 0.3222183287143707, "rewards/rejected": -2.392230987548828, "sft_loss": 2.0672836303710938, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 11.783414410888401, "learning_rate": 9.927337851142314e-07, "logits/chosen": 0.06562662124633789, "logits/rejected": 0.19133488833904266, "logps/chosen": -2.0099849700927734, "logps/rejected": -2.356341600418091, "loss": 0.6516, "rewards/accuracies": 0.625, "rewards/chosen": -2.0099849700927734, "rewards/margins": 0.34635668992996216, "rewards/rejected": -2.356341600418091, "sft_loss": 2.1112921237945557, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 11.977442879693111, "learning_rate": 9.924668491496474e-07, "logits/chosen": 0.049711473286151886, "logits/rejected": 0.21187114715576172, "logps/chosen": -2.1671743392944336, "logps/rejected": -2.5274157524108887, "loss": 0.6755, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.1671743392944336, "rewards/margins": 0.36024102568626404, "rewards/rejected": -2.5274157524108887, "sft_loss": 2.248695135116577, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 6.468856534604243, "learning_rate": 9.92195135269533e-07, "logits/chosen": 0.1306043565273285, "logits/rejected": 0.19352035224437714, "logps/chosen": -2.1376121044158936, "logps/rejected": -2.382310628890991, "loss": 0.7009, "rewards/accuracies": 0.5625, "rewards/chosen": -2.1376121044158936, "rewards/margins": 0.24469847977161407, "rewards/rejected": -2.382310628890991, "sft_loss": 2.2619433403015137, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 9.605540692516087, "learning_rate": 9.919186461100574e-07, "logits/chosen": 0.0685216411948204, "logits/rejected": 0.1462893933057785, "logps/chosen": -2.1614136695861816, "logps/rejected": -2.4543886184692383, "loss": 0.6515, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1614136695861816, "rewards/margins": 0.2929750084877014, "rewards/rejected": -2.4543886184692383, "sft_loss": 2.1964187622070312, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 13.607347229557735, "learning_rate": 9.9163738435372e-07, "logits/chosen": 0.03243374079465866, "logits/rejected": 0.18373778462409973, "logps/chosen": -2.1962177753448486, "logps/rejected": -2.599600315093994, "loss": 0.6816, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.1962177753448486, "rewards/margins": 0.40338245034217834, "rewards/rejected": -2.599600315093994, "sft_loss": 2.2184200286865234, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 7.112280570191756, "learning_rate": 9.913513527293234e-07, "logits/chosen": -0.006338512059301138, "logits/rejected": 0.16377900540828705, "logps/chosen": -2.3264520168304443, "logps/rejected": -2.855248212814331, "loss": 0.5984, "rewards/accuracies": 0.65625, "rewards/chosen": -2.3264520168304443, "rewards/margins": 0.5287963151931763, "rewards/rejected": -2.855248212814331, "sft_loss": 2.329258918762207, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 18.411463377434387, "learning_rate": 9.910605540119474e-07, "logits/chosen": 0.06724239140748978, "logits/rejected": 0.16363653540611267, "logps/chosen": -2.276106357574463, "logps/rejected": -2.728752374649048, "loss": 0.6566, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.276106357574463, "rewards/margins": 0.45264577865600586, "rewards/rejected": -2.728752374649048, "sft_loss": 2.194448947906494, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 8.602773177915976, "learning_rate": 9.907649910229227e-07, "logits/chosen": -0.05895795673131943, "logits/rejected": 0.20239920914173126, "logps/chosen": -2.257357120513916, "logps/rejected": -2.714703321456909, "loss": 0.6051, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.257357120513916, "rewards/margins": 0.4573463499546051, "rewards/rejected": -2.714703321456909, "sft_loss": 2.27178692817688, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 11.413058495257092, "learning_rate": 9.90464666629803e-07, "logits/chosen": 0.0611516535282135, "logits/rejected": 0.14355552196502686, "logps/chosen": -2.314882278442383, "logps/rejected": -2.6507880687713623, "loss": 0.7265, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.314882278442383, "rewards/margins": 0.3359060287475586, "rewards/rejected": -2.6507880687713623, "sft_loss": 2.268735647201538, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 8.322484118302752, "learning_rate": 9.901595837463363e-07, "logits/chosen": 0.07006438076496124, "logits/rejected": 0.2505702078342438, "logps/chosen": -2.313701868057251, "logps/rejected": -2.7320985794067383, "loss": 0.6242, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.313701868057251, "rewards/margins": 0.41839662194252014, "rewards/rejected": -2.7320985794067383, "sft_loss": 2.212843179702759, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 11.110994213765201, "learning_rate": 9.898497453324384e-07, "logits/chosen": -0.003151841461658478, "logits/rejected": 0.0851263627409935, "logps/chosen": -2.2502312660217285, "logps/rejected": -2.7037360668182373, "loss": 0.5925, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.2502312660217285, "rewards/margins": 0.45350486040115356, "rewards/rejected": -2.7037360668182373, "sft_loss": 2.3255438804626465, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 8.042578958055598, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.07694842666387558, "logits/rejected": 0.054141778498888016, "logps/chosen": -2.2050156593322754, "logps/rejected": -2.605039119720459, "loss": 0.628, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.2050156593322754, "rewards/margins": 0.40002351999282837, "rewards/rejected": -2.605039119720459, "sft_loss": 2.29542875289917, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 10.772931639154226, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.12407276779413223, "logits/rejected": 0.22976458072662354, "logps/chosen": -2.0931012630462646, "logps/rejected": -2.411670446395874, "loss": 0.6527, "rewards/accuracies": 0.625, "rewards/chosen": -2.0931012630462646, "rewards/margins": 0.3185690641403198, "rewards/rejected": -2.411670446395874, "sft_loss": 2.15762996673584, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 11.709109730569184, "learning_rate": 9.88891727199209e-07, "logits/chosen": -0.031045908108353615, "logits/rejected": 0.046828627586364746, "logps/chosen": -2.079315662384033, "logps/rejected": -2.49480938911438, "loss": 0.6455, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.079315662384033, "rewards/margins": 0.41549381613731384, "rewards/rejected": -2.49480938911438, "sft_loss": 2.1446011066436768, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 11.282429312529446, "learning_rate": 9.885628971850641e-07, "logits/chosen": 0.06443838775157928, "logits/rejected": 0.2664431929588318, "logps/chosen": -2.190359354019165, "logps/rejected": -2.6365315914154053, "loss": 0.6507, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.190359354019165, "rewards/margins": 0.44617241621017456, "rewards/rejected": -2.6365315914154053, "sft_loss": 2.263021230697632, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 6.609977962565628, "learning_rate": 9.882293271315481e-07, "logits/chosen": 0.037000637501478195, "logits/rejected": 0.13551054894924164, "logps/chosen": -2.2155261039733887, "logps/rejected": -2.5495128631591797, "loss": 0.6795, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.2155261039733887, "rewards/margins": 0.3339867889881134, "rewards/rejected": -2.5495128631591797, "sft_loss": 2.206447124481201, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 8.953583728302595, "learning_rate": 9.878910202749589e-07, "logits/chosen": 0.007858863100409508, "logits/rejected": 0.20538286864757538, "logps/chosen": -2.1532692909240723, "logps/rejected": -2.579277515411377, "loss": 0.617, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1532692909240723, "rewards/margins": 0.42600807547569275, "rewards/rejected": -2.579277515411377, "sft_loss": 2.1827797889709473, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 11.136986263164165, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.1128419041633606, "logits/rejected": 0.2760482430458069, "logps/chosen": -2.1042284965515137, "logps/rejected": -2.5994343757629395, "loss": 0.6181, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1042284965515137, "rewards/margins": 0.49520596861839294, "rewards/rejected": -2.5994343757629395, "sft_loss": 2.201141595840454, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 11.552133892516244, "learning_rate": 9.87200209327504e-07, "logits/chosen": -0.020469993352890015, "logits/rejected": 0.1490965336561203, "logps/chosen": -2.3277976512908936, "logps/rejected": -2.7004261016845703, "loss": 0.6247, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.3277976512908936, "rewards/margins": 0.372628390789032, "rewards/rejected": -2.7004261016845703, "sft_loss": 2.2944016456604004, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 11.712552792289904, "learning_rate": 9.868477119388894e-07, "logits/chosen": -0.053301334381103516, "logits/rejected": 0.060594916343688965, "logps/chosen": -2.2250044345855713, "logps/rejected": -2.7405011653900146, "loss": 0.6411, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.2250044345855713, "rewards/margins": 0.5154968500137329, "rewards/rejected": -2.7405011653900146, "sft_loss": 2.2513134479522705, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 9.738315063652102, "learning_rate": 9.864904911516383e-07, "logits/chosen": 0.024926379323005676, "logits/rejected": 0.06959456950426102, "logps/chosen": -2.3082454204559326, "logps/rejected": -2.7023775577545166, "loss": 0.641, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.3082454204559326, "rewards/margins": 0.39413195848464966, "rewards/rejected": -2.7023775577545166, "sft_loss": 2.3833935260772705, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 10.449726981814146, "learning_rate": 9.861285504315084e-07, "logits/chosen": -0.009898416697978973, "logits/rejected": 0.10434658825397491, "logps/chosen": -2.2227025032043457, "logps/rejected": -2.627720594406128, "loss": 0.6159, "rewards/accuracies": 0.65625, "rewards/chosen": -2.2227025032043457, "rewards/margins": 0.4050180912017822, "rewards/rejected": -2.627720594406128, "sft_loss": 2.2642316818237305, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 10.750391532240872, "learning_rate": 9.857618932900502e-07, "logits/chosen": -0.05544097349047661, "logits/rejected": 0.08029208332300186, "logps/chosen": -2.1924877166748047, "logps/rejected": -2.6801917552948, "loss": 0.5819, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1924877166748047, "rewards/margins": 0.487703800201416, "rewards/rejected": -2.6801917552948, "sft_loss": 2.233238697052002, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 8.575901452586978, "learning_rate": 9.853905232845727e-07, "logits/chosen": -0.05543569475412369, "logits/rejected": 0.11257767677307129, "logps/chosen": -2.3032419681549072, "logps/rejected": -2.6663010120391846, "loss": 0.659, "rewards/accuracies": 0.625, "rewards/chosen": -2.3032419681549072, "rewards/margins": 0.3630586564540863, "rewards/rejected": -2.6663010120391846, "sft_loss": 2.2930896282196045, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 8.858314895755024, "learning_rate": 9.850144440181095e-07, "logits/chosen": -0.0004590839089360088, "logits/rejected": 0.21903236210346222, "logps/chosen": -2.3906993865966797, "logps/rejected": -2.7964766025543213, "loss": 0.6359, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.3906993865966797, "rewards/margins": 0.4057773947715759, "rewards/rejected": -2.7964766025543213, "sft_loss": 2.451749801635742, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 8.898233512349927, "learning_rate": 9.846336591393832e-07, "logits/chosen": -0.04713328927755356, "logits/rejected": 0.09548879414796829, "logps/chosen": -2.3923017978668213, "logps/rejected": -2.7829506397247314, "loss": 0.6556, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.3923017978668213, "rewards/margins": 0.3906486928462982, "rewards/rejected": -2.7829506397247314, "sft_loss": 2.457034111022949, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 10.236929503532881, "learning_rate": 9.842481723427704e-07, "logits/chosen": 0.045480988919734955, "logits/rejected": 0.05475884675979614, "logps/chosen": -2.4740447998046875, "logps/rejected": -2.9208970069885254, "loss": 0.6572, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.4740447998046875, "rewards/margins": 0.44685202836990356, "rewards/rejected": -2.9208970069885254, "sft_loss": 2.5296459197998047, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 10.991585515412387, "learning_rate": 9.838579873682658e-07, "logits/chosen": 0.05277659744024277, "logits/rejected": 0.06578487157821655, "logps/chosen": -2.321049451828003, "logps/rejected": -2.6783082485198975, "loss": 0.6606, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.321049451828003, "rewards/margins": 0.35725873708724976, "rewards/rejected": -2.6783082485198975, "sft_loss": 2.3697457313537598, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 9.505797728949158, "learning_rate": 9.834631080014457e-07, "logits/chosen": -0.07590442150831223, "logits/rejected": 0.11572281271219254, "logps/chosen": -2.364682197570801, "logps/rejected": -2.8034095764160156, "loss": 0.5979, "rewards/accuracies": 0.71875, "rewards/chosen": -2.364682197570801, "rewards/margins": 0.4387272894382477, "rewards/rejected": -2.8034095764160156, "sft_loss": 2.4401233196258545, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 18.395820893073818, "learning_rate": 9.830635380734312e-07, "logits/chosen": -0.08162043243646622, "logits/rejected": 0.10748519748449326, "logps/chosen": -2.491405963897705, "logps/rejected": -2.876349925994873, "loss": 0.6539, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.491405963897705, "rewards/margins": 0.3849434554576874, "rewards/rejected": -2.876349925994873, "sft_loss": 2.516740560531616, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 10.764029069829844, "learning_rate": 9.826592814608517e-07, "logits/chosen": 0.005502620246261358, "logits/rejected": 0.20565679669380188, "logps/chosen": -2.4020817279815674, "logps/rejected": -2.7886009216308594, "loss": 0.6282, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.4020817279815674, "rewards/margins": 0.38651904463768005, "rewards/rejected": -2.7886009216308594, "sft_loss": 2.452831745147705, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 9.926095190413394, "learning_rate": 9.822503420858067e-07, "logits/chosen": 0.08108798414468765, "logits/rejected": 0.10665123164653778, "logps/chosen": -2.2424654960632324, "logps/rejected": -2.74579119682312, "loss": 0.6075, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2424654960632324, "rewards/margins": 0.5033257603645325, "rewards/rejected": -2.74579119682312, "sft_loss": 2.4026360511779785, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 11.831281235348646, "learning_rate": 9.818367239158277e-07, "logits/chosen": 0.10297316312789917, "logits/rejected": 0.18497176468372345, "logps/chosen": -2.3841609954833984, "logps/rejected": -2.7636427879333496, "loss": 0.6799, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.3841609954833984, "rewards/margins": 0.37948182225227356, "rewards/rejected": -2.7636427879333496, "sft_loss": 2.5231597423553467, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 12.148857237997744, "learning_rate": 9.8141843096384e-07, "logits/chosen": 0.0838220939040184, "logits/rejected": 0.2115117609500885, "logps/chosen": -2.5137131214141846, "logps/rejected": -3.0223727226257324, "loss": 0.5939, "rewards/accuracies": 0.65625, "rewards/chosen": -2.5137131214141846, "rewards/margins": 0.5086593627929688, "rewards/rejected": -3.0223727226257324, "sft_loss": 2.5393052101135254, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 14.629770964322283, "learning_rate": 9.809954672881237e-07, "logits/chosen": 0.06634237617254257, "logits/rejected": 0.23861713707447052, "logps/chosen": -2.6201236248016357, "logps/rejected": -3.0741326808929443, "loss": 0.6586, "rewards/accuracies": 0.65625, "rewards/chosen": -2.6201236248016357, "rewards/margins": 0.45400896668434143, "rewards/rejected": -3.0741326808929443, "sft_loss": 2.6818976402282715, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 10.03296199140755, "learning_rate": 9.80567836992274e-07, "logits/chosen": 0.030392784625291824, "logits/rejected": 0.22696132957935333, "logps/chosen": -2.3675222396850586, "logps/rejected": -2.956252336502075, "loss": 0.5959, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.3675222396850586, "rewards/margins": 0.5887301564216614, "rewards/rejected": -2.956252336502075, "sft_loss": 2.461897850036621, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 10.073673041792262, "learning_rate": 9.801355442251625e-07, "logits/chosen": 0.027452487498521805, "logits/rejected": 0.20025476813316345, "logps/chosen": -2.364055633544922, "logps/rejected": -2.828199625015259, "loss": 0.6378, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.364055633544922, "rewards/margins": 0.464143842458725, "rewards/rejected": -2.828199625015259, "sft_loss": 2.470456600189209, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 12.618955481917041, "learning_rate": 9.796985931808949e-07, "logits/chosen": 0.029651781544089317, "logits/rejected": 0.18079546093940735, "logps/chosen": -2.3804421424865723, "logps/rejected": -2.89245867729187, "loss": 0.5858, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.3804421424865723, "rewards/margins": 0.5120163559913635, "rewards/rejected": -2.89245867729187, "sft_loss": 2.506648063659668, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 13.157924185810748, "learning_rate": 9.792569880987724e-07, "logits/chosen": -0.012971627525985241, "logits/rejected": 0.11660315841436386, "logps/chosen": -2.403261184692383, "logps/rejected": -2.9521636962890625, "loss": 0.6047, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.403261184692383, "rewards/margins": 0.5489023923873901, "rewards/rejected": -2.9521636962890625, "sft_loss": 2.4948441982269287, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 19.127254297505065, "learning_rate": 9.788107332632493e-07, "logits/chosen": 0.0366203635931015, "logits/rejected": 0.12366179376840591, "logps/chosen": -2.4076895713806152, "logps/rejected": -2.8217995166778564, "loss": 0.6607, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.4076895713806152, "rewards/margins": 0.41411009430885315, "rewards/rejected": -2.8217995166778564, "sft_loss": 2.4844412803649902, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 9.835627653069569, "learning_rate": 9.783598330038924e-07, "logits/chosen": -0.008364463225007057, "logits/rejected": 0.10912313312292099, "logps/chosen": -2.4262290000915527, "logps/rejected": -2.7828643321990967, "loss": 0.6512, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.4262290000915527, "rewards/margins": 0.35663533210754395, "rewards/rejected": -2.7828643321990967, "sft_loss": 2.4651858806610107, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 14.97168100471102, "learning_rate": 9.779042916953376e-07, "logits/chosen": 0.019193686544895172, "logits/rejected": 0.18119914829730988, "logps/chosen": -2.1704416275024414, "logps/rejected": -2.723491668701172, "loss": 0.5976, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1704416275024414, "rewards/margins": 0.55305016040802, "rewards/rejected": -2.723491668701172, "sft_loss": 2.256765604019165, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 8.608115838074811, "learning_rate": 9.774441137572487e-07, "logits/chosen": -0.05023932456970215, "logits/rejected": 0.09959036856889725, "logps/chosen": -2.277273416519165, "logps/rejected": -2.8040313720703125, "loss": 0.5946, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.277273416519165, "rewards/margins": 0.526758074760437, "rewards/rejected": -2.8040313720703125, "sft_loss": 2.397162914276123, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 18.060015330254146, "learning_rate": 9.76979303654274e-07, "logits/chosen": -0.07415231317281723, "logits/rejected": 0.0306253582239151, "logps/chosen": -2.4391984939575195, "logps/rejected": -2.969973087310791, "loss": 0.5954, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.4391984939575195, "rewards/margins": 0.5307744145393372, "rewards/rejected": -2.969973087310791, "sft_loss": 2.513673782348633, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 14.88256555882873, "learning_rate": 9.765098658960035e-07, "logits/chosen": -0.0031950809061527252, "logits/rejected": 0.07587514817714691, "logps/chosen": -2.466426372528076, "logps/rejected": -2.9754433631896973, "loss": 0.5991, "rewards/accuracies": 0.71875, "rewards/chosen": -2.466426372528076, "rewards/margins": 0.5090171694755554, "rewards/rejected": -2.9754433631896973, "sft_loss": 2.4860310554504395, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 14.577243346146625, "learning_rate": 9.76035805036924e-07, "logits/chosen": 0.044427402317523956, "logits/rejected": 0.2288268506526947, "logps/chosen": -2.6163885593414307, "logps/rejected": -3.047532320022583, "loss": 0.6489, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.6163885593414307, "rewards/margins": 0.4311438202857971, "rewards/rejected": -3.047532320022583, "sft_loss": 2.5802102088928223, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 13.460320440360055, "learning_rate": 9.755571256763764e-07, "logits/chosen": 0.037521276623010635, "logits/rejected": 0.17139963805675507, "logps/chosen": -2.5272560119628906, "logps/rejected": -3.0960605144500732, "loss": 0.587, "rewards/accuracies": 0.71875, "rewards/chosen": -2.5272560119628906, "rewards/margins": 0.5688048005104065, "rewards/rejected": -3.0960605144500732, "sft_loss": 2.6325743198394775, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 8.556134708359306, "learning_rate": 9.750738324585097e-07, "logits/chosen": -0.06615705788135529, "logits/rejected": 0.18242689967155457, "logps/chosen": -2.5637755393981934, "logps/rejected": -3.100074052810669, "loss": 0.5931, "rewards/accuracies": 0.65625, "rewards/chosen": -2.5637755393981934, "rewards/margins": 0.5362985134124756, "rewards/rejected": -3.100074052810669, "sft_loss": 2.6259491443634033, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 7.643138342066282, "learning_rate": 9.74585930072237e-07, "logits/chosen": 0.010040968656539917, "logits/rejected": 0.141854926943779, "logps/chosen": -2.3528027534484863, "logps/rejected": -2.9125170707702637, "loss": 0.6114, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.3528027534484863, "rewards/margins": 0.5597147345542908, "rewards/rejected": -2.9125170707702637, "sft_loss": 2.4413468837738037, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 9.667325714568168, "learning_rate": 9.740934232511892e-07, "logits/chosen": -0.0742809846997261, "logits/rejected": 0.04069014638662338, "logps/chosen": -2.461108684539795, "logps/rejected": -2.9574544429779053, "loss": 0.6113, "rewards/accuracies": 0.6875, "rewards/chosen": -2.461108684539795, "rewards/margins": 0.4963456690311432, "rewards/rejected": -2.9574544429779053, "sft_loss": 2.6110215187072754, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 12.00641733717943, "learning_rate": 9.735963167736698e-07, "logits/chosen": -0.002614037599414587, "logits/rejected": 0.16859155893325806, "logps/chosen": -2.38588285446167, "logps/rejected": -2.7110936641693115, "loss": 0.6804, "rewards/accuracies": 0.625, "rewards/chosen": -2.38588285446167, "rewards/margins": 0.3252108693122864, "rewards/rejected": -2.7110936641693115, "sft_loss": 2.4398577213287354, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 8.887866413388897, "learning_rate": 9.730946154626078e-07, "logits/chosen": 0.017975768074393272, "logits/rejected": 0.126893550157547, "logps/chosen": -2.442229986190796, "logps/rejected": -2.8189597129821777, "loss": 0.6606, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.442229986190796, "rewards/margins": 0.3767297863960266, "rewards/rejected": -2.8189597129821777, "sft_loss": 2.5147817134857178, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 13.634216782277669, "learning_rate": 9.725883241855117e-07, "logits/chosen": -0.11448929458856583, "logits/rejected": 0.04175081476569176, "logps/chosen": -2.4436726570129395, "logps/rejected": -2.931497097015381, "loss": 0.6069, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4436726570129395, "rewards/margins": 0.48782461881637573, "rewards/rejected": -2.931497097015381, "sft_loss": 2.564622402191162, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 14.502468297807804, "learning_rate": 9.720774478544218e-07, "logits/chosen": 0.013986131176352501, "logits/rejected": 0.1314467340707779, "logps/chosen": -2.3802475929260254, "logps/rejected": -2.9986507892608643, "loss": 0.5917, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.3802475929260254, "rewards/margins": 0.6184031963348389, "rewards/rejected": -2.9986507892608643, "sft_loss": 2.479921817779541, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 10.560583498259847, "learning_rate": 9.715619914258624e-07, "logits/chosen": -0.07381193339824677, "logits/rejected": 0.025053244084119797, "logps/chosen": -2.515423536300659, "logps/rejected": -2.880089521408081, "loss": 0.6627, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.515423536300659, "rewards/margins": 0.36466631293296814, "rewards/rejected": -2.880089521408081, "sft_loss": 2.5252633094787598, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 16.477565978507176, "learning_rate": 9.710419599007937e-07, "logits/chosen": -0.04238230735063553, "logits/rejected": 0.09166856110095978, "logps/chosen": -2.4855003356933594, "logps/rejected": -2.8636887073516846, "loss": 0.639, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.4855003356933594, "rewards/margins": 0.37818849086761475, "rewards/rejected": -2.8636887073516846, "sft_loss": 2.6017448902130127, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 22.795719086000116, "learning_rate": 9.705173583245643e-07, "logits/chosen": 0.009466012939810753, "logits/rejected": 0.1582622528076172, "logps/chosen": -2.2711384296417236, "logps/rejected": -2.729496479034424, "loss": 0.6323, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.2711384296417236, "rewards/margins": 0.45835790038108826, "rewards/rejected": -2.729496479034424, "sft_loss": 2.3448898792266846, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 8.150525288694316, "learning_rate": 9.699881917868609e-07, "logits/chosen": -0.14355149865150452, "logits/rejected": -0.022237544879317284, "logps/chosen": -2.287654399871826, "logps/rejected": -2.7592105865478516, "loss": 0.608, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.287654399871826, "rewards/margins": 0.47155603766441345, "rewards/rejected": -2.7592105865478516, "sft_loss": 2.4397618770599365, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 10.603609318641688, "learning_rate": 9.694544654216594e-07, "logits/chosen": -0.13433615863323212, "logits/rejected": 0.06298176944255829, "logps/chosen": -2.3329834938049316, "logps/rejected": -2.8423380851745605, "loss": 0.5815, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.3329834938049316, "rewards/margins": 0.5093545317649841, "rewards/rejected": -2.8423380851745605, "sft_loss": 2.368196487426758, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 14.037296058485492, "learning_rate": 9.689161844071755e-07, "logits/chosen": 0.04950593039393425, "logits/rejected": 0.12211354821920395, "logps/chosen": -2.235487461090088, "logps/rejected": -2.6551833152770996, "loss": 0.6361, "rewards/accuracies": 0.65625, "rewards/chosen": -2.235487461090088, "rewards/margins": 0.4196963906288147, "rewards/rejected": -2.6551833152770996, "sft_loss": 2.243699789047241, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 10.96100978751047, "learning_rate": 9.683733539658138e-07, "logits/chosen": -0.04126668721437454, "logits/rejected": 0.13179424405097961, "logps/chosen": -2.3691515922546387, "logps/rejected": -2.8386237621307373, "loss": 0.6283, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.3691515922546387, "rewards/margins": 0.46947216987609863, "rewards/rejected": -2.8386237621307373, "sft_loss": 2.3317360877990723, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 10.295095221983487, "learning_rate": 9.678259793641178e-07, "logits/chosen": -0.03282228857278824, "logits/rejected": 0.013936568982899189, "logps/chosen": -2.3662123680114746, "logps/rejected": -2.6578352451324463, "loss": 0.6672, "rewards/accuracies": 0.625, "rewards/chosen": -2.3662123680114746, "rewards/margins": 0.2916230261325836, "rewards/rejected": -2.6578352451324463, "sft_loss": 2.444215774536133, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 9.482162758527256, "learning_rate": 9.672740659127183e-07, "logits/chosen": -0.16914108395576477, "logits/rejected": -0.031090175732970238, "logps/chosen": -2.3121986389160156, "logps/rejected": -2.846667528152466, "loss": 0.6255, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.3121986389160156, "rewards/margins": 0.534468948841095, "rewards/rejected": -2.846667528152466, "sft_loss": 2.4279391765594482, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 10.838291866151629, "learning_rate": 9.667176189662818e-07, "logits/chosen": -0.13112396001815796, "logits/rejected": 0.006520587019622326, "logps/chosen": -2.263624429702759, "logps/rejected": -2.783017635345459, "loss": 0.5878, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.263624429702759, "rewards/margins": 0.519393265247345, "rewards/rejected": -2.783017635345459, "sft_loss": 2.3268685340881348, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 8.980724497171297, "learning_rate": 9.661566439234592e-07, "logits/chosen": -0.043813712894916534, "logits/rejected": 0.06382577121257782, "logps/chosen": -2.2976884841918945, "logps/rejected": -2.7014777660369873, "loss": 0.6231, "rewards/accuracies": 0.65625, "rewards/chosen": -2.2976884841918945, "rewards/margins": 0.4037889838218689, "rewards/rejected": -2.7014777660369873, "sft_loss": 2.4225406646728516, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 11.245760940970241, "learning_rate": 9.655911462268327e-07, "logits/chosen": 0.007918231189250946, "logits/rejected": 0.1136241927742958, "logps/chosen": -2.3954429626464844, "logps/rejected": -2.982637882232666, "loss": 0.5482, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.3954429626464844, "rewards/margins": 0.5871953368186951, "rewards/rejected": -2.982637882232666, "sft_loss": 2.5971601009368896, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 9.536133362700639, "learning_rate": 9.650211313628636e-07, "logits/chosen": -0.06791021674871445, "logits/rejected": 0.029560793191194534, "logps/chosen": -2.4827258586883545, "logps/rejected": -2.875556230545044, "loss": 0.6381, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4827258586883545, "rewards/margins": 0.39283058047294617, "rewards/rejected": -2.875556230545044, "sft_loss": 2.6223301887512207, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 11.297407838882163, "learning_rate": 9.644466048618386e-07, "logits/chosen": -0.04049893841147423, "logits/rejected": 0.11827345192432404, "logps/chosen": -2.772382974624634, "logps/rejected": -3.2351608276367188, "loss": 0.6649, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.772382974624634, "rewards/margins": 0.4627775251865387, "rewards/rejected": -3.2351608276367188, "sft_loss": 2.763463258743286, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 7.303873199808049, "learning_rate": 9.63867572297816e-07, "logits/chosen": -0.042949218302965164, "logits/rejected": 0.16490450501441956, "logps/chosen": -2.4589686393737793, "logps/rejected": -2.966370105743408, "loss": 0.6061, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.4589686393737793, "rewards/margins": 0.5074009895324707, "rewards/rejected": -2.966370105743408, "sft_loss": 2.608790159225464, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 10.367996592364596, "learning_rate": 9.632840392885727e-07, "logits/chosen": -0.06765934824943542, "logits/rejected": 0.09714097529649734, "logps/chosen": -2.715125560760498, "logps/rejected": -3.250112533569336, "loss": 0.6225, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.715125560760498, "rewards/margins": 0.5349869728088379, "rewards/rejected": -3.250112533569336, "sft_loss": 2.7810139656066895, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 9.58430608083217, "learning_rate": 9.626960114955483e-07, "logits/chosen": 0.001974116312339902, "logits/rejected": 0.15458719432353973, "logps/chosen": -2.6295740604400635, "logps/rejected": -3.2241318225860596, "loss": 0.5955, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.6295740604400635, "rewards/margins": 0.5945574045181274, "rewards/rejected": -3.2241318225860596, "sft_loss": 2.6677796840667725, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 11.885649791284852, "learning_rate": 9.621034946237909e-07, "logits/chosen": -0.07231882214546204, "logits/rejected": 0.08323998749256134, "logps/chosen": -2.6960177421569824, "logps/rejected": -3.336521625518799, "loss": 0.5726, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.6960177421569824, "rewards/margins": 0.6405037641525269, "rewards/rejected": -3.336521625518799, "sft_loss": 2.8467204570770264, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 9.78909696850363, "learning_rate": 9.615064944219021e-07, "logits/chosen": 0.022339412942528725, "logits/rejected": 0.15590617060661316, "logps/chosen": -2.4645371437072754, "logps/rejected": -3.0680432319641113, "loss": 0.5755, "rewards/accuracies": 0.6875, "rewards/chosen": -2.4645371437072754, "rewards/margins": 0.6035064458847046, "rewards/rejected": -3.0680432319641113, "sft_loss": 2.643650531768799, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 13.395794212531905, "learning_rate": 9.609050166819803e-07, "logits/chosen": -0.046702008694410324, "logits/rejected": 0.029883652925491333, "logps/chosen": -2.6166794300079346, "logps/rejected": -3.108447790145874, "loss": 0.6247, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.6166794300079346, "rewards/margins": 0.4917687475681305, "rewards/rejected": -3.108447790145874, "sft_loss": 2.6548516750335693, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.30381494760513306, "eval_logits/rejected": 0.40509033203125, "eval_logps/chosen": -2.514362335205078, "eval_logps/rejected": -3.0950636863708496, "eval_loss": 0.5880183577537537, "eval_rewards/accuracies": 0.6965875625610352, "eval_rewards/chosen": -2.514362335205078, "eval_rewards/margins": 0.5807018876075745, "eval_rewards/rejected": -3.0950636863708496, "eval_runtime": 52.2125, "eval_samples_per_second": 25.76, "eval_sft_loss": 2.6367032527923584, "eval_steps_per_second": 6.454, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 14.596912231575132, "learning_rate": 9.602990672395653e-07, "logits/chosen": -0.14092347025871277, "logits/rejected": 0.05596120283007622, "logps/chosen": -2.4769959449768066, "logps/rejected": -3.0365443229675293, "loss": 0.5774, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.4769959449768066, "rewards/margins": 0.5595483183860779, "rewards/rejected": -3.0365443229675293, "sft_loss": 2.5679807662963867, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 10.905534215006766, "learning_rate": 9.59688651973581e-07, "logits/chosen": -0.05486919730901718, "logits/rejected": 0.15231744945049286, "logps/chosen": -2.4324145317077637, "logps/rejected": -2.9520673751831055, "loss": 0.5815, "rewards/accuracies": 0.71875, "rewards/chosen": -2.4324145317077637, "rewards/margins": 0.5196529626846313, "rewards/rejected": -2.9520673751831055, "sft_loss": 2.52256441116333, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 9.633793347973821, "learning_rate": 9.590737768062792e-07, "logits/chosen": -0.11220196634531021, "logits/rejected": 0.012372380122542381, "logps/chosen": -2.554220676422119, "logps/rejected": -2.970442295074463, "loss": 0.6393, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.554220676422119, "rewards/margins": 0.4162214696407318, "rewards/rejected": -2.970442295074463, "sft_loss": 2.600206136703491, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 10.474965982793881, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.08466102927923203, "logits/rejected": 0.2119368016719818, "logps/chosen": -2.3087267875671387, "logps/rejected": -2.7697083950042725, "loss": 0.6234, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.3087267875671387, "rewards/margins": 0.46098190546035767, "rewards/rejected": -2.7697083950042725, "sft_loss": 2.337554931640625, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 10.567533632382021, "learning_rate": 9.578306706730215e-07, "logits/chosen": -0.14341183006763458, "logits/rejected": 0.08627209067344666, "logps/chosen": -2.5253093242645264, "logps/rejected": -2.9244608879089355, "loss": 0.6651, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.5253093242645264, "rewards/margins": 0.3991513252258301, "rewards/rejected": -2.9244608879089355, "sft_loss": 2.554520845413208, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 12.36533370644545, "learning_rate": 9.572024517676865e-07, "logits/chosen": -0.026950160041451454, "logits/rejected": 0.0841381847858429, "logps/chosen": -2.5097880363464355, "logps/rejected": -2.9253194332122803, "loss": 0.645, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.5097880363464355, "rewards/margins": 0.4155314564704895, "rewards/rejected": -2.9253194332122803, "sft_loss": 2.5107533931732178, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 11.395902605646452, "learning_rate": 9.565697970821593e-07, "logits/chosen": -0.04188716784119606, "logits/rejected": 0.09305311739444733, "logps/chosen": -2.451453685760498, "logps/rejected": -2.845390796661377, "loss": 0.6245, "rewards/accuracies": 0.65625, "rewards/chosen": -2.451453685760498, "rewards/margins": 0.393937349319458, "rewards/rejected": -2.845390796661377, "sft_loss": 2.5635507106781006, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 10.423991842768576, "learning_rate": 9.559327127544585e-07, "logits/chosen": -0.1326659470796585, "logits/rejected": 0.010229384526610374, "logps/chosen": -2.410856008529663, "logps/rejected": -2.8557980060577393, "loss": 0.6034, "rewards/accuracies": 0.65625, "rewards/chosen": -2.410856008529663, "rewards/margins": 0.4449416995048523, "rewards/rejected": -2.8557980060577393, "sft_loss": 2.5360636711120605, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 12.024052772828735, "learning_rate": 9.552912049655789e-07, "logits/chosen": -0.04977841302752495, "logits/rejected": 0.14385564625263214, "logps/chosen": -2.3730082511901855, "logps/rejected": -2.846153974533081, "loss": 0.6114, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.3730082511901855, "rewards/margins": 0.4731457829475403, "rewards/rejected": -2.846153974533081, "sft_loss": 2.4019570350646973, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 15.743615619080508, "learning_rate": 9.546452799394315e-07, "logits/chosen": -0.00495730247348547, "logits/rejected": 0.2099606990814209, "logps/chosen": -2.516322612762451, "logps/rejected": -2.905848979949951, "loss": 0.6711, "rewards/accuracies": 0.59375, "rewards/chosen": -2.516322612762451, "rewards/margins": 0.38952645659446716, "rewards/rejected": -2.905848979949951, "sft_loss": 2.5147805213928223, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 10.553637253359975, "learning_rate": 9.539949439427846e-07, "logits/chosen": -0.07317081093788147, "logits/rejected": 0.051881879568099976, "logps/chosen": -2.4117274284362793, "logps/rejected": -2.902822494506836, "loss": 0.6066, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.4117274284362793, "rewards/margins": 0.4910949766635895, "rewards/rejected": -2.902822494506836, "sft_loss": 2.575561046600342, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 9.959680874105779, "learning_rate": 9.533402032852002e-07, "logits/chosen": -0.061844177544116974, "logits/rejected": 0.0837896317243576, "logps/chosen": -2.44221568107605, "logps/rejected": -3.0768089294433594, "loss": 0.5839, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.44221568107605, "rewards/margins": 0.63459312915802, "rewards/rejected": -3.0768089294433594, "sft_loss": 2.600975751876831, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 13.05513480259246, "learning_rate": 9.526810643189754e-07, "logits/chosen": -0.03136907145380974, "logits/rejected": 0.12455201148986816, "logps/chosen": -2.47062349319458, "logps/rejected": -3.0895168781280518, "loss": 0.5719, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.47062349319458, "rewards/margins": 0.6188936829566956, "rewards/rejected": -3.0895168781280518, "sft_loss": 2.5600526332855225, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 11.902034042794147, "learning_rate": 9.52017533439079e-07, "logits/chosen": -0.10893243551254272, "logits/rejected": -0.019856825470924377, "logps/chosen": -2.5645415782928467, "logps/rejected": -3.071223735809326, "loss": 0.6129, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.5645415782928467, "rewards/margins": 0.5066820979118347, "rewards/rejected": -3.071223735809326, "sft_loss": 2.6555449962615967, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 9.803927227171691, "learning_rate": 9.513496170830909e-07, "logits/chosen": -0.07021275162696838, "logits/rejected": 0.04914160445332527, "logps/chosen": -2.699671745300293, "logps/rejected": -3.1752398014068604, "loss": 0.6551, "rewards/accuracies": 0.625, "rewards/chosen": -2.699671745300293, "rewards/margins": 0.4755678176879883, "rewards/rejected": -3.1752398014068604, "sft_loss": 2.689988136291504, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 13.390660658299637, "learning_rate": 9.506773217311382e-07, "logits/chosen": -0.047956183552742004, "logits/rejected": 0.12472472339868546, "logps/chosen": -2.5990281105041504, "logps/rejected": -3.0923752784729004, "loss": 0.6194, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5990281105041504, "rewards/margins": 0.49334701895713806, "rewards/rejected": -3.0923752784729004, "sft_loss": 2.686919689178467, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 12.692293278068258, "learning_rate": 9.500006539058334e-07, "logits/chosen": -0.026659566909074783, "logits/rejected": 0.12806643545627594, "logps/chosen": -2.487506866455078, "logps/rejected": -2.900245428085327, "loss": 0.6237, "rewards/accuracies": 0.6875, "rewards/chosen": -2.487506866455078, "rewards/margins": 0.4127384126186371, "rewards/rejected": -2.900245428085327, "sft_loss": 2.5144777297973633, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 12.475581061613873, "learning_rate": 9.493196201722109e-07, "logits/chosen": -0.15530245006084442, "logits/rejected": 0.007811339106410742, "logps/chosen": -2.631837844848633, "logps/rejected": -2.9907565116882324, "loss": 0.671, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.631837844848633, "rewards/margins": 0.3589186668395996, "rewards/rejected": -2.9907565116882324, "sft_loss": 2.694824457168579, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 9.920761501515237, "learning_rate": 9.486342271376628e-07, "logits/chosen": -0.03656279668211937, "logits/rejected": -0.03115103766322136, "logps/chosen": -2.5637030601501465, "logps/rejected": -3.136502742767334, "loss": 0.5941, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.5637030601501465, "rewards/margins": 0.5728000402450562, "rewards/rejected": -3.136502742767334, "sft_loss": 2.6105880737304688, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 13.549074636681373, "learning_rate": 9.479444814518755e-07, "logits/chosen": -0.07891203463077545, "logits/rejected": 0.19809429347515106, "logps/chosen": -2.4845290184020996, "logps/rejected": -3.1138668060302734, "loss": 0.5857, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.4845290184020996, "rewards/margins": 0.6293376684188843, "rewards/rejected": -3.1138668060302734, "sft_loss": 2.5858771800994873, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 8.42170392456347, "learning_rate": 9.472503898067645e-07, "logits/chosen": 0.04925096780061722, "logits/rejected": 0.09650204330682755, "logps/chosen": -2.5102059841156006, "logps/rejected": -2.9356038570404053, "loss": 0.652, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.5102059841156006, "rewards/margins": 0.42539793252944946, "rewards/rejected": -2.9356038570404053, "sft_loss": 2.518486499786377, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 10.651506064370555, "learning_rate": 9.465519589364099e-07, "logits/chosen": 0.046065930277109146, "logits/rejected": 0.1338791847229004, "logps/chosen": -2.371086597442627, "logps/rejected": -2.912052631378174, "loss": 0.6024, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.371086597442627, "rewards/margins": 0.5409659147262573, "rewards/rejected": -2.912052631378174, "sft_loss": 2.4932312965393066, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 11.904745337777952, "learning_rate": 9.458491956169914e-07, "logits/chosen": -0.046427834779024124, "logits/rejected": 0.1345222443342209, "logps/chosen": -2.43359375, "logps/rejected": -2.9828577041625977, "loss": 0.5972, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.43359375, "rewards/margins": 0.5492635369300842, "rewards/rejected": -2.9828577041625977, "sft_loss": 2.488131284713745, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 9.711103384168968, "learning_rate": 9.451421066667215e-07, "logits/chosen": -0.14108577370643616, "logits/rejected": 0.06483618170022964, "logps/chosen": -2.425170660018921, "logps/rejected": -2.924701690673828, "loss": 0.5948, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.425170660018921, "rewards/margins": 0.49953120946884155, "rewards/rejected": -2.924701690673828, "sft_loss": 2.5391621589660645, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 14.583305860389924, "learning_rate": 9.444306989457805e-07, "logits/chosen": -0.006739693693816662, "logits/rejected": 0.11771132797002792, "logps/chosen": -2.378840923309326, "logps/rejected": -2.857478618621826, "loss": 0.6699, "rewards/accuracies": 0.59375, "rewards/chosen": -2.378840923309326, "rewards/margins": 0.4786376357078552, "rewards/rejected": -2.857478618621826, "sft_loss": 2.3840348720550537, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 12.03553674525137, "learning_rate": 9.437149793562489e-07, "logits/chosen": -0.028889214619994164, "logits/rejected": 0.0867237076163292, "logps/chosen": -2.4030401706695557, "logps/rejected": -2.889843463897705, "loss": 0.6259, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.4030401706695557, "rewards/margins": 0.4868030548095703, "rewards/rejected": -2.889843463897705, "sft_loss": 2.4857749938964844, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 12.184739129523157, "learning_rate": 9.429949548420417e-07, "logits/chosen": -0.04891614243388176, "logits/rejected": 0.042119212448596954, "logps/chosen": -2.378790855407715, "logps/rejected": -2.828828811645508, "loss": 0.6266, "rewards/accuracies": 0.6875, "rewards/chosen": -2.378790855407715, "rewards/margins": 0.45003795623779297, "rewards/rejected": -2.828828811645508, "sft_loss": 2.418074131011963, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 16.356956717006213, "learning_rate": 9.422706323888396e-07, "logits/chosen": -0.05136380344629288, "logits/rejected": -0.009587121196091175, "logps/chosen": -2.2361412048339844, "logps/rejected": -2.6039490699768066, "loss": 0.6459, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2361412048339844, "rewards/margins": 0.3678080141544342, "rewards/rejected": -2.6039490699768066, "sft_loss": 2.2793796062469482, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 9.192707974512697, "learning_rate": 9.415420190240225e-07, "logits/chosen": 0.0014787286054342985, "logits/rejected": 0.20284879207611084, "logps/chosen": -2.2834632396698, "logps/rejected": -2.858455181121826, "loss": 0.5436, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2834632396698, "rewards/margins": 0.5749918222427368, "rewards/rejected": -2.858455181121826, "sft_loss": 2.40155029296875, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 13.138651117833307, "learning_rate": 9.408091218166002e-07, "logits/chosen": 0.007256925106048584, "logits/rejected": 0.08275376260280609, "logps/chosen": -2.3986923694610596, "logps/rejected": -2.7163703441619873, "loss": 0.6698, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.3986923694610596, "rewards/margins": 0.31767791509628296, "rewards/rejected": -2.7163703441619873, "sft_loss": 2.5431981086730957, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 11.667087496374497, "learning_rate": 9.400719478771449e-07, "logits/chosen": -0.056357402354478836, "logits/rejected": 0.24279463291168213, "logps/chosen": -2.555568218231201, "logps/rejected": -3.031388759613037, "loss": 0.6144, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.555568218231201, "rewards/margins": 0.4758206307888031, "rewards/rejected": -3.031388759613037, "sft_loss": 2.605769634246826, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 12.788470711956991, "learning_rate": 9.393305043577209e-07, "logits/chosen": -0.15414592623710632, "logits/rejected": -0.024874219670891762, "logps/chosen": -2.698474168777466, "logps/rejected": -3.311936616897583, "loss": 0.5811, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.698474168777466, "rewards/margins": 0.6134623289108276, "rewards/rejected": -3.311936616897583, "sft_loss": 2.8778061866760254, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 7.833984274422432, "learning_rate": 9.38584798451817e-07, "logits/chosen": -0.07727707922458649, "logits/rejected": 0.06534862518310547, "logps/chosen": -2.554556369781494, "logps/rejected": -3.1005702018737793, "loss": 0.5939, "rewards/accuracies": 0.6875, "rewards/chosen": -2.554556369781494, "rewards/margins": 0.5460137128829956, "rewards/rejected": -3.1005702018737793, "sft_loss": 2.639235258102417, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 36.94322657241421, "learning_rate": 9.37834837394275e-07, "logits/chosen": -0.043488435447216034, "logits/rejected": 0.08955095708370209, "logps/chosen": -2.6442129611968994, "logps/rejected": -3.3849635124206543, "loss": 0.579, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.6442129611968994, "rewards/margins": 0.7407506108283997, "rewards/rejected": -3.3849635124206543, "sft_loss": 2.7124366760253906, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 8.09388086134637, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.08221219480037689, "logits/rejected": 0.07584551721811295, "logps/chosen": -2.562751293182373, "logps/rejected": -3.230823516845703, "loss": 0.5687, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.562751293182373, "rewards/margins": 0.6680727005004883, "rewards/rejected": -3.230823516845703, "sft_loss": 2.666548490524292, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 10.802977372135773, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.12206602096557617, "logits/rejected": 0.014964615926146507, "logps/chosen": -2.536492109298706, "logps/rejected": -2.920431613922119, "loss": 0.6884, "rewards/accuracies": 0.625, "rewards/chosen": -2.536492109298706, "rewards/margins": 0.38393938541412354, "rewards/rejected": -2.920431613922119, "sft_loss": 2.5786898136138916, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 18.137912494220632, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.11852151155471802, "logits/rejected": 0.0182892344892025, "logps/chosen": -2.264341354370117, "logps/rejected": -2.783043384552002, "loss": 0.6116, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.264341354370117, "rewards/margins": 0.5187021493911743, "rewards/rejected": -2.783043384552002, "sft_loss": 2.3643131256103516, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 10.143001252686217, "learning_rate": 9.34792587788002e-07, "logits/chosen": -0.010053953155875206, "logits/rejected": 0.12595690786838531, "logps/chosen": -2.3592190742492676, "logps/rejected": -2.8065428733825684, "loss": 0.6213, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.3592190742492676, "rewards/margins": 0.4473237991333008, "rewards/rejected": -2.8065428733825684, "sft_loss": 2.4362292289733887, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 11.33999682092262, "learning_rate": 9.34021460937342e-07, "logits/chosen": 0.00567228440195322, "logits/rejected": 0.099876768887043, "logps/chosen": -2.412571668624878, "logps/rejected": -2.805781602859497, "loss": 0.6335, "rewards/accuracies": 0.65625, "rewards/chosen": -2.412571668624878, "rewards/margins": 0.39320993423461914, "rewards/rejected": -2.805781602859497, "sft_loss": 2.475083589553833, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 9.03668816129448, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.21119725704193115, "logits/rejected": -0.0663188025355339, "logps/chosen": -2.5495543479919434, "logps/rejected": -2.9691262245178223, "loss": 0.6221, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.5495543479919434, "rewards/margins": 0.41957202553749084, "rewards/rejected": -2.9691262245178223, "sft_loss": 2.6195390224456787, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 10.611564828740077, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.09098132699728012, "logits/rejected": 0.10673109441995621, "logps/chosen": -2.3393394947052, "logps/rejected": -2.9454898834228516, "loss": 0.6236, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.3393394947052, "rewards/margins": 0.6061506271362305, "rewards/rejected": -2.9454898834228516, "sft_loss": 2.4549148082733154, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 14.889925201224557, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.09605233371257782, "logits/rejected": 0.09647075086832047, "logps/chosen": -2.418492555618286, "logps/rejected": -2.98441743850708, "loss": 0.5642, "rewards/accuracies": 0.71875, "rewards/chosen": -2.418492555618286, "rewards/margins": 0.5659249424934387, "rewards/rejected": -2.98441743850708, "sft_loss": 2.5511069297790527, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 16.924932065414907, "learning_rate": 9.30894920180659e-07, "logits/chosen": -0.012868774123489857, "logits/rejected": 0.13494983315467834, "logps/chosen": -2.3770487308502197, "logps/rejected": -2.7323498725891113, "loss": 0.6441, "rewards/accuracies": 0.65625, "rewards/chosen": -2.3770487308502197, "rewards/margins": 0.3553008735179901, "rewards/rejected": -2.7323498725891113, "sft_loss": 2.36963152885437, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 8.08049445992655, "learning_rate": 9.301028145701543e-07, "logits/chosen": -0.00014423727407120168, "logits/rejected": 0.15110808610916138, "logps/chosen": -2.4371447563171387, "logps/rejected": -3.1043925285339355, "loss": 0.5964, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.4371447563171387, "rewards/margins": 0.6672475934028625, "rewards/rejected": -3.1043925285339355, "sft_loss": 2.590604066848755, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 8.365953003889633, "learning_rate": 9.293065361002563e-07, "logits/chosen": -0.0053975535556674, "logits/rejected": 0.10485513508319855, "logps/chosen": -2.463087558746338, "logps/rejected": -3.069497585296631, "loss": 0.5976, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.463087558746338, "rewards/margins": 0.606410026550293, "rewards/rejected": -3.069497585296631, "sft_loss": 2.5250258445739746, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 14.122983727549176, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.1040046364068985, "logits/rejected": 0.043417539447546005, "logps/chosen": -2.6348214149475098, "logps/rejected": -3.121133804321289, "loss": 0.6123, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.6348214149475098, "rewards/margins": 0.48631221055984497, "rewards/rejected": -3.121133804321289, "sft_loss": 2.6830124855041504, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 12.061809049765845, "learning_rate": 9.277014915246792e-07, "logits/chosen": 0.031954310834407806, "logits/rejected": 0.09408613294363022, "logps/chosen": -2.5333523750305176, "logps/rejected": -3.221738338470459, "loss": 0.5818, "rewards/accuracies": 0.75, "rewards/chosen": -2.5333523750305176, "rewards/margins": 0.6883862614631653, "rewards/rejected": -3.221738338470459, "sft_loss": 2.635631561279297, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 8.854422848321477, "learning_rate": 9.268927409911498e-07, "logits/chosen": -0.06054989621043205, "logits/rejected": 0.0525406114757061, "logps/chosen": -2.517436981201172, "logps/rejected": -2.995318651199341, "loss": 0.6267, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.517436981201172, "rewards/margins": 0.4778814911842346, "rewards/rejected": -2.995318651199341, "sft_loss": 2.6538000106811523, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 14.109188635463664, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.12987467646598816, "logits/rejected": 0.0863143727183342, "logps/chosen": -2.5347824096679688, "logps/rejected": -3.0305581092834473, "loss": 0.6116, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.5347824096679688, "rewards/margins": 0.49577564001083374, "rewards/rejected": -3.0305581092834473, "sft_loss": 2.6745293140411377, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 17.81440390033541, "learning_rate": 9.252628226650389e-07, "logits/chosen": 0.003881660057231784, "logits/rejected": 0.10796427726745605, "logps/chosen": -2.5523457527160645, "logps/rejected": -3.016080856323242, "loss": 0.6686, "rewards/accuracies": 0.65625, "rewards/chosen": -2.5523457527160645, "rewards/margins": 0.4637354016304016, "rewards/rejected": -3.016080856323242, "sft_loss": 2.617882013320923, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 11.42605708100968, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.06866137683391571, "logits/rejected": 0.09967400878667831, "logps/chosen": -2.4362640380859375, "logps/rejected": -2.9746646881103516, "loss": 0.6049, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.4362640380859375, "rewards/margins": 0.5384005308151245, "rewards/rejected": -2.9746646881103516, "sft_loss": 2.526332139968872, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 8.45931725262656, "learning_rate": 9.23616400771875e-07, "logits/chosen": -0.04088922217488289, "logits/rejected": 0.14248767495155334, "logps/chosen": -2.4679696559906006, "logps/rejected": -3.048508644104004, "loss": 0.5875, "rewards/accuracies": 0.65625, "rewards/chosen": -2.4679696559906006, "rewards/margins": 0.5805387496948242, "rewards/rejected": -3.048508644104004, "sft_loss": 2.4981863498687744, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 8.693422332707609, "learning_rate": 9.227870209296395e-07, "logits/chosen": -0.0389975979924202, "logits/rejected": 0.09865973144769669, "logps/chosen": -2.6156318187713623, "logps/rejected": -3.040283679962158, "loss": 0.638, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.6156318187713623, "rewards/margins": 0.4246513843536377, "rewards/rejected": -3.040283679962158, "sft_loss": 2.6994106769561768, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 9.618870707688409, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.11955811083316803, "logits/rejected": -0.08894553035497665, "logps/chosen": -2.564248561859131, "logps/rejected": -3.060483455657959, "loss": 0.6321, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.564248561859131, "rewards/margins": 0.4962351322174072, "rewards/rejected": -3.060483455657959, "sft_loss": 2.657533645629883, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 10.696480953576044, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.12024404108524323, "logits/rejected": 0.06615696847438812, "logps/chosen": -2.527562379837036, "logps/rejected": -3.061629295349121, "loss": 0.6019, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.527562379837036, "rewards/margins": 0.5340667963027954, "rewards/rejected": -3.061629295349121, "sft_loss": 2.5972416400909424, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 12.054364447689002, "learning_rate": 9.202743024992367e-07, "logits/chosen": -0.005958074238151312, "logits/rejected": 0.10832609236240387, "logps/chosen": -2.411243200302124, "logps/rejected": -3.0436832904815674, "loss": 0.595, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.411243200302124, "rewards/margins": 0.6324400901794434, "rewards/rejected": -3.0436832904815674, "sft_loss": 2.4762954711914062, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 13.297039351708401, "learning_rate": 9.194285638083293e-07, "logits/chosen": -0.003141486318781972, "logits/rejected": 0.16741812229156494, "logps/chosen": -2.579014539718628, "logps/rejected": -3.246927261352539, "loss": 0.5531, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.579014539718628, "rewards/margins": 0.6679128408432007, "rewards/rejected": -3.246927261352539, "sft_loss": 2.678422212600708, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 13.486736198358573, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.0727333202958107, "logits/rejected": 0.05342050641775131, "logps/chosen": -2.5079057216644287, "logps/rejected": -3.046525478363037, "loss": 0.6101, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.5079057216644287, "rewards/margins": 0.5386193990707397, "rewards/rejected": -3.046525478363037, "sft_loss": 2.6102094650268555, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 12.378423003736247, "learning_rate": 9.177248867780583e-07, "logits/chosen": -0.03273031860589981, "logits/rejected": 0.07781729847192764, "logps/chosen": -2.753040075302124, "logps/rejected": -3.1269867420196533, "loss": 0.6583, "rewards/accuracies": 0.625, "rewards/chosen": -2.753040075302124, "rewards/margins": 0.3739466071128845, "rewards/rejected": -3.1269867420196533, "sft_loss": 2.9218058586120605, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 11.683946013575603, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.08013884723186493, "logits/rejected": 0.034627676010131836, "logps/chosen": -2.6559319496154785, "logps/rejected": -3.1538009643554688, "loss": 0.6609, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.6559319496154785, "rewards/margins": 0.497869074344635, "rewards/rejected": -3.1538009643554688, "sft_loss": 2.8204636573791504, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 11.430235771854319, "learning_rate": 9.16004998712373e-07, "logits/chosen": 0.003335425164550543, "logits/rejected": 0.08415015041828156, "logps/chosen": -2.660719394683838, "logps/rejected": -3.2465882301330566, "loss": 0.5838, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.660719394683838, "rewards/margins": 0.5858690738677979, "rewards/rejected": -3.2465882301330566, "sft_loss": 2.7394461631774902, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 7.0444059398549035, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.05530339479446411, "logits/rejected": 0.2480275183916092, "logps/chosen": -2.5760626792907715, "logps/rejected": -3.228938341140747, "loss": 0.5571, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.5760626792907715, "rewards/margins": 0.6528751850128174, "rewards/rejected": -3.228938341140747, "sft_loss": 2.6400935649871826, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 10.242804568872272, "learning_rate": 9.142689663565577e-07, "logits/chosen": 0.029218971729278564, "logits/rejected": 0.10311850160360336, "logps/chosen": -2.518911123275757, "logps/rejected": -3.0885183811187744, "loss": 0.5873, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.518911123275757, "rewards/margins": 0.5696069598197937, "rewards/rejected": -3.0885183811187744, "sft_loss": 2.62086820602417, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 10.426215268521785, "learning_rate": 9.133949170991397e-07, "logits/chosen": -0.008902695961296558, "logits/rejected": 0.09115861356258392, "logps/chosen": -2.458341121673584, "logps/rejected": -3.0484235286712646, "loss": 0.5696, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.458341121673584, "rewards/margins": 0.5900823473930359, "rewards/rejected": -3.0484235286712646, "sft_loss": 2.6848063468933105, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 9.322478402993056, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.036038726568222046, "logits/rejected": 0.16355329751968384, "logps/chosen": -2.571803092956543, "logps/rejected": -3.060861587524414, "loss": 0.6206, "rewards/accuracies": 0.65625, "rewards/chosen": -2.571803092956543, "rewards/margins": 0.4890584945678711, "rewards/rejected": -3.060861587524414, "sft_loss": 2.6588542461395264, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 16.133013311367087, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.059060920029878616, "logits/rejected": 0.08883248269557953, "logps/chosen": -2.6606833934783936, "logps/rejected": -3.106626272201538, "loss": 0.6284, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.6606833934783936, "rewards/margins": 0.4459429383277893, "rewards/rejected": -3.106626272201538, "sft_loss": 2.746152400970459, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 10.74009173882518, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.07637111842632294, "logits/rejected": 0.12358202785253525, "logps/chosen": -2.522994041442871, "logps/rejected": -3.0614607334136963, "loss": 0.5745, "rewards/accuracies": 0.75, "rewards/chosen": -2.522994041442871, "rewards/margins": 0.5384668707847595, "rewards/rejected": -3.0614607334136963, "sft_loss": 2.630826234817505, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 12.589286926809567, "learning_rate": 9.098586978599673e-07, "logits/chosen": -0.007654297165572643, "logits/rejected": 0.16238531470298767, "logps/chosen": -2.5522336959838867, "logps/rejected": -3.288139820098877, "loss": 0.5539, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.5522336959838867, "rewards/margins": 0.7359061241149902, "rewards/rejected": -3.288139820098877, "sft_loss": 2.6166367530822754, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 10.204031227197607, "learning_rate": 9.089646803833588e-07, "logits/chosen": 0.0032329752575606108, "logits/rejected": 0.16776946187019348, "logps/chosen": -2.5196006298065186, "logps/rejected": -3.0311813354492188, "loss": 0.6056, "rewards/accuracies": 0.71875, "rewards/chosen": -2.5196006298065186, "rewards/margins": 0.5115808248519897, "rewards/rejected": -3.0311813354492188, "sft_loss": 2.6822352409362793, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 10.445154609189464, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.15377630293369293, "logits/rejected": 0.12954029440879822, "logps/chosen": -2.508655548095703, "logps/rejected": -3.2540974617004395, "loss": 0.5356, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.508655548095703, "rewards/margins": 0.7454419136047363, "rewards/rejected": -3.2540974617004395, "sft_loss": 2.5944552421569824, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 11.455076652369373, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.12354922294616699, "logits/rejected": 0.16556768119335175, "logps/chosen": -2.5879435539245605, "logps/rejected": -3.3230979442596436, "loss": 0.5937, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.5879435539245605, "rewards/margins": 0.7351543307304382, "rewards/rejected": -3.3230979442596436, "sft_loss": 2.6574831008911133, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 11.105711383512512, "learning_rate": 9.062588561782354e-07, "logits/chosen": 0.008125312626361847, "logits/rejected": 0.09454827010631561, "logps/chosen": -2.7672481536865234, "logps/rejected": -3.295419216156006, "loss": 0.6376, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.7672481536865234, "rewards/margins": 0.5281708836555481, "rewards/rejected": -3.295419216156006, "sft_loss": 2.950342893600464, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 9.102603916421376, "learning_rate": 9.053490200202358e-07, "logits/chosen": -0.00024118572764564306, "logits/rejected": 0.0962609052658081, "logps/chosen": -2.770569086074829, "logps/rejected": -3.313864231109619, "loss": 0.6154, "rewards/accuracies": 0.71875, "rewards/chosen": -2.770569086074829, "rewards/margins": 0.5432949066162109, "rewards/rejected": -3.313864231109619, "sft_loss": 2.8849356174468994, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 18.46276019459034, "learning_rate": 9.044352511642661e-07, "logits/chosen": 0.005351717583835125, "logits/rejected": 0.044967781752347946, "logps/chosen": -2.6325314044952393, "logps/rejected": -3.045335292816162, "loss": 0.6741, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.6325314044952393, "rewards/margins": 0.4128040671348572, "rewards/rejected": -3.045335292816162, "sft_loss": 2.838153123855591, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 9.744510912842921, "learning_rate": 9.03517558475716e-07, "logits/chosen": -0.0044286372140049934, "logits/rejected": 0.10475552082061768, "logps/chosen": -2.3398423194885254, "logps/rejected": -2.780120372772217, "loss": 0.6058, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.3398423194885254, "rewards/margins": 0.4402780532836914, "rewards/rejected": -2.780120372772217, "sft_loss": 2.416719436645508, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 10.136785199712552, "learning_rate": 9.025959508580436e-07, "logits/chosen": 0.029368087649345398, "logits/rejected": 0.2844516932964325, "logps/chosen": -2.4363322257995605, "logps/rejected": -2.983987808227539, "loss": 0.5794, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4363322257995605, "rewards/margins": 0.5476558208465576, "rewards/rejected": -2.983987808227539, "sft_loss": 2.530247688293457, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 8.083867650728607, "learning_rate": 9.016704372526905e-07, "logits/chosen": -0.01698513887822628, "logits/rejected": 0.15768086910247803, "logps/chosen": -2.3244481086730957, "logps/rejected": -2.885270118713379, "loss": 0.5732, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.3244481086730957, "rewards/margins": 0.5608222484588623, "rewards/rejected": -2.885270118713379, "sft_loss": 2.433962345123291, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 19.128381491713746, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.09792274236679077, "logits/rejected": -0.009612299501895905, "logps/chosen": -2.360170364379883, "logps/rejected": -2.749636650085449, "loss": 0.6318, "rewards/accuracies": 0.65625, "rewards/chosen": -2.360170364379883, "rewards/margins": 0.38946622610092163, "rewards/rejected": -2.749636650085449, "sft_loss": 2.4644882678985596, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 11.983132406771713, "learning_rate": 8.998077280340981e-07, "logits/chosen": -0.010349246673285961, "logits/rejected": 0.06283728778362274, "logps/chosen": -2.491295099258423, "logps/rejected": -2.836306095123291, "loss": 0.6653, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.491295099258423, "rewards/margins": 0.34501126408576965, "rewards/rejected": -2.836306095123291, "sft_loss": 2.5202624797821045, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 8.077069174188852, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.1035315990447998, "logits/rejected": 0.1056133285164833, "logps/chosen": -2.4259142875671387, "logps/rejected": -3.097588300704956, "loss": 0.5355, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.4259142875671387, "rewards/margins": 0.6716742515563965, "rewards/rejected": -3.097588300704956, "sft_loss": 2.545226573944092, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.3133206367492676, "eval_logits/rejected": 0.41915422677993774, "eval_logps/chosen": -2.4305131435394287, "eval_logps/rejected": -2.997382879257202, "eval_loss": 0.575062096118927, "eval_rewards/accuracies": 0.7062314748764038, "eval_rewards/chosen": -2.4305131435394287, "eval_rewards/margins": 0.5668694972991943, "eval_rewards/rejected": -2.997382879257202, "eval_runtime": 52.9695, "eval_samples_per_second": 25.392, "eval_sft_loss": 2.5634634494781494, "eval_steps_per_second": 6.362, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 10.914004888548867, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.0882820338010788, "logits/rejected": 0.163892924785614, "logps/chosen": -2.531283140182495, "logps/rejected": -3.127635955810547, "loss": 0.5666, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.531283140182495, "rewards/margins": 0.5963530540466309, "rewards/rejected": -3.127635955810547, "sft_loss": 2.6074588298797607, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 10.16876236997209, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.1145872101187706, "logits/rejected": 0.07419227063655853, "logps/chosen": -2.5505528450012207, "logps/rejected": -3.200885057449341, "loss": 0.5697, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.5505528450012207, "rewards/margins": 0.6503321528434753, "rewards/rejected": -3.200885057449341, "sft_loss": 2.7054452896118164, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 14.031412502648942, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.04301288723945618, "logits/rejected": 0.08894392102956772, "logps/chosen": -2.5927734375, "logps/rejected": -3.142228841781616, "loss": 0.6424, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.5927734375, "rewards/margins": 0.5494555234909058, "rewards/rejected": -3.142228841781616, "sft_loss": 2.692375898361206, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 10.380932361427314, "learning_rate": 8.950832333773184e-07, "logits/chosen": -0.026785671710968018, "logits/rejected": 0.14500652253627777, "logps/chosen": -2.513824224472046, "logps/rejected": -3.0432636737823486, "loss": 0.6333, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.513824224472046, "rewards/margins": 0.5294393301010132, "rewards/rejected": -3.0432636737823486, "sft_loss": 2.635474443435669, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 11.788160329897083, "learning_rate": 8.941267982915213e-07, "logits/chosen": 0.06771846115589142, "logits/rejected": 0.12368135154247284, "logps/chosen": -2.5657715797424316, "logps/rejected": -2.8500313758850098, "loss": 0.7095, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.5657715797424316, "rewards/margins": 0.284260094165802, "rewards/rejected": -2.8500313758850098, "sft_loss": 2.569617509841919, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 7.220376061788953, "learning_rate": 8.931665393857983e-07, "logits/chosen": 0.00953853689134121, "logits/rejected": 0.16754086315631866, "logps/chosen": -2.31351375579834, "logps/rejected": -2.8156228065490723, "loss": 0.5925, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.31351375579834, "rewards/margins": 0.502109169960022, "rewards/rejected": -2.8156228065490723, "sft_loss": 2.38759708404541, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 8.497784358932309, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.08759839832782745, "logits/rejected": 0.05009545758366585, "logps/chosen": -2.1406750679016113, "logps/rejected": -2.7028450965881348, "loss": 0.5683, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1406750679016113, "rewards/margins": 0.5621696710586548, "rewards/rejected": -2.7028450965881348, "sft_loss": 2.215157985687256, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 10.362610570766398, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.040607232600450516, "logits/rejected": 0.09515784680843353, "logps/chosen": -2.210148334503174, "logps/rejected": -2.790614604949951, "loss": 0.5847, "rewards/accuracies": 0.6875, "rewards/chosen": -2.210148334503174, "rewards/margins": 0.5804664492607117, "rewards/rejected": -2.790614604949951, "sft_loss": 2.3017220497131348, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 10.46273728676422, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.016619618982076645, "logits/rejected": 0.034366074949502945, "logps/chosen": -2.285428524017334, "logps/rejected": -2.6628036499023438, "loss": 0.6234, "rewards/accuracies": 0.65625, "rewards/chosen": -2.285428524017334, "rewards/margins": 0.37737521529197693, "rewards/rejected": -2.6628036499023438, "sft_loss": 2.3670566082000732, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 11.988584939935281, "learning_rate": 8.892874524469537e-07, "logits/chosen": 0.07678203284740448, "logits/rejected": 0.13864430785179138, "logps/chosen": -2.2349565029144287, "logps/rejected": -2.7476844787597656, "loss": 0.5722, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2349565029144287, "rewards/margins": 0.5127274990081787, "rewards/rejected": -2.7476844787597656, "sft_loss": 2.2399978637695312, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 10.684458877547073, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.03853264078497887, "logits/rejected": 0.08583366125822067, "logps/chosen": -2.3231568336486816, "logps/rejected": -2.8199455738067627, "loss": 0.5862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.3231568336486816, "rewards/margins": 0.4967884123325348, "rewards/rejected": -2.8199455738067627, "sft_loss": 2.355069875717163, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 10.614315640564005, "learning_rate": 8.873252100389377e-07, "logits/chosen": 0.009507464244961739, "logits/rejected": 0.06045049428939819, "logps/chosen": -2.2326788902282715, "logps/rejected": -2.739642381668091, "loss": 0.5685, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.2326788902282715, "rewards/margins": 0.5069635510444641, "rewards/rejected": -2.739642381668091, "sft_loss": 2.2458808422088623, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 12.65809689170843, "learning_rate": 8.863384473200411e-07, "logits/chosen": 0.013400441035628319, "logits/rejected": 0.08506964892148972, "logps/chosen": -2.543550968170166, "logps/rejected": -2.9200243949890137, "loss": 0.6529, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.543550968170166, "rewards/margins": 0.37647324800491333, "rewards/rejected": -2.9200243949890137, "sft_loss": 2.5819945335388184, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 10.470244663605284, "learning_rate": 8.853479363438342e-07, "logits/chosen": 0.05795196816325188, "logits/rejected": 0.2349679172039032, "logps/chosen": -2.51816987991333, "logps/rejected": -2.9122061729431152, "loss": 0.6769, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.51816987991333, "rewards/margins": 0.39403635263442993, "rewards/rejected": -2.9122061729431152, "sft_loss": 2.468186140060425, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 9.382088774982067, "learning_rate": 8.843536867202588e-07, "logits/chosen": 0.02824602648615837, "logits/rejected": 0.254594087600708, "logps/chosen": -2.4892547130584717, "logps/rejected": -3.094567060470581, "loss": 0.5948, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.4892547130584717, "rewards/margins": 0.6053122878074646, "rewards/rejected": -3.094567060470581, "sft_loss": 2.575155735015869, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 12.346178841407921, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.06833375990390778, "logits/rejected": 0.06057899445295334, "logps/chosen": -2.488093614578247, "logps/rejected": -2.9171090126037598, "loss": 0.6347, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.488093614578247, "rewards/margins": 0.42901507019996643, "rewards/rejected": -2.9171090126037598, "sft_loss": 2.5657894611358643, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 13.056931547739362, "learning_rate": 8.823540101520381e-07, "logits/chosen": 0.0010797411669045687, "logits/rejected": 0.24253420531749725, "logps/chosen": -2.361572742462158, "logps/rejected": -2.9080519676208496, "loss": 0.6233, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.361572742462158, "rewards/margins": 0.5464791059494019, "rewards/rejected": -2.9080519676208496, "sft_loss": 2.4400172233581543, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 9.03111975442011, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.0008448451990261674, "logits/rejected": 0.20967717468738556, "logps/chosen": -2.329864025115967, "logps/rejected": -2.955321788787842, "loss": 0.5453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.329864025115967, "rewards/margins": 0.6254577040672302, "rewards/rejected": -2.955321788787842, "sft_loss": 2.4391579627990723, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 17.373789805328247, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.1382381170988083, "logits/rejected": 0.0370989628136158, "logps/chosen": -2.4268882274627686, "logps/rejected": -2.9267404079437256, "loss": 0.601, "rewards/accuracies": 0.71875, "rewards/chosen": -2.4268882274627686, "rewards/margins": 0.49985241889953613, "rewards/rejected": -2.9267404079437256, "sft_loss": 2.5433390140533447, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 12.136056902371177, "learning_rate": 8.793266977736342e-07, "logits/chosen": 0.04478234052658081, "logits/rejected": -0.008689765818417072, "logps/chosen": -2.5137197971343994, "logps/rejected": -2.812690019607544, "loss": 0.6569, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.5137197971343994, "rewards/margins": 0.2989702820777893, "rewards/rejected": -2.812690019607544, "sft_loss": 2.6227807998657227, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 19.782307301050942, "learning_rate": 8.783102200993085e-07, "logits/chosen": 0.04610518366098404, "logits/rejected": 0.20527955889701843, "logps/chosen": -2.4842915534973145, "logps/rejected": -3.0323004722595215, "loss": 0.587, "rewards/accuracies": 0.71875, "rewards/chosen": -2.4842915534973145, "rewards/margins": 0.5480088591575623, "rewards/rejected": -3.0323004722595215, "sft_loss": 2.596231698989868, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 9.505496059640969, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.013093151152133942, "logits/rejected": 0.09648257493972778, "logps/chosen": -2.617058038711548, "logps/rejected": -3.034942150115967, "loss": 0.63, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.617058038711548, "rewards/margins": 0.4178840219974518, "rewards/rejected": -3.034942150115967, "sft_loss": 2.7770838737487793, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 11.789714973897295, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.015839237719774246, "logits/rejected": 0.19138559699058533, "logps/chosen": -2.698256015777588, "logps/rejected": -3.1933929920196533, "loss": 0.6454, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.698256015777588, "rewards/margins": 0.49513691663742065, "rewards/rejected": -3.1933929920196533, "sft_loss": 2.781946897506714, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 11.412043051469436, "learning_rate": 8.752388044972811e-07, "logits/chosen": -0.0009754031780175865, "logits/rejected": 0.08380867540836334, "logps/chosen": -2.5303540229797363, "logps/rejected": -3.1968514919281006, "loss": 0.5591, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.5303540229797363, "rewards/margins": 0.6664971113204956, "rewards/rejected": -3.1968514919281006, "sft_loss": 2.7090067863464355, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 8.995570834686255, "learning_rate": 8.74207704880141e-07, "logits/chosen": 0.018614167347550392, "logits/rejected": 0.14034906029701233, "logps/chosen": -2.7271647453308105, "logps/rejected": -3.4992728233337402, "loss": 0.5288, "rewards/accuracies": 0.75, "rewards/chosen": -2.7271647453308105, "rewards/margins": 0.7721078991889954, "rewards/rejected": -3.4992728233337402, "sft_loss": 2.928816318511963, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 12.144945793536012, "learning_rate": 8.731729746982068e-07, "logits/chosen": 0.05527348071336746, "logits/rejected": 0.13575479388237, "logps/chosen": -2.6572043895721436, "logps/rejected": -3.217125654220581, "loss": 0.5988, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.6572043895721436, "rewards/margins": 0.559921145439148, "rewards/rejected": -3.217125654220581, "sft_loss": 2.908268451690674, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 12.236971878685171, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.11078289896249771, "logits/rejected": 0.10976402461528778, "logps/chosen": -2.8110814094543457, "logps/rejected": -3.594407558441162, "loss": 0.5748, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8110814094543457, "rewards/margins": 0.7833271026611328, "rewards/rejected": -3.594407558441162, "sft_loss": 2.929760456085205, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 11.502499419546336, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.05445173382759094, "logits/rejected": 0.1365424394607544, "logps/chosen": -2.9095003604888916, "logps/rejected": -3.4668147563934326, "loss": 0.5874, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.9095003604888916, "rewards/margins": 0.5573142170906067, "rewards/rejected": -3.4668147563934326, "sft_loss": 3.011441946029663, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 8.443109951812232, "learning_rate": 8.700471013287424e-07, "logits/chosen": 0.06129883602261543, "logits/rejected": 0.09885939955711365, "logps/chosen": -2.6593096256256104, "logps/rejected": -3.227020263671875, "loss": 0.5862, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.6593096256256104, "rewards/margins": 0.5677107572555542, "rewards/rejected": -3.227020263671875, "sft_loss": 2.8268625736236572, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 16.493045146068436, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.0019575455226004124, "logits/rejected": 0.06592272222042084, "logps/chosen": -2.8948216438293457, "logps/rejected": -3.321528911590576, "loss": 0.6763, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.8948216438293457, "rewards/margins": 0.4267074167728424, "rewards/rejected": -3.321528911590576, "sft_loss": 3.0294787883758545, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 11.25773668875448, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.04980425536632538, "logits/rejected": 0.07690002769231796, "logps/chosen": -2.7664177417755127, "logps/rejected": -3.3948757648468018, "loss": 0.5577, "rewards/accuracies": 0.75, "rewards/chosen": -2.7664177417755127, "rewards/margins": 0.62845778465271, "rewards/rejected": -3.3948757648468018, "sft_loss": 2.962599277496338, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 13.445498840947511, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.024612322449684143, "logits/rejected": 0.12784752249717712, "logps/chosen": -2.734321117401123, "logps/rejected": -3.302389621734619, "loss": 0.5772, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.734321117401123, "rewards/margins": 0.5680681467056274, "rewards/rejected": -3.302389621734619, "sft_loss": 2.892714262008667, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 13.079442066176235, "learning_rate": 8.658290552963827e-07, "logits/chosen": 0.022269438952207565, "logits/rejected": 0.05613626167178154, "logps/chosen": -2.763192653656006, "logps/rejected": -3.321763515472412, "loss": 0.6194, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.763192653656006, "rewards/margins": 0.5585710406303406, "rewards/rejected": -3.321763515472412, "sft_loss": 2.930387496948242, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 10.737616747574133, "learning_rate": 8.647656449367966e-07, "logits/chosen": 0.007398630026727915, "logits/rejected": 0.1687568575143814, "logps/chosen": -2.7443089485168457, "logps/rejected": -3.232015609741211, "loss": 0.626, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.7443089485168457, "rewards/margins": 0.4877067506313324, "rewards/rejected": -3.232015609741211, "sft_loss": 2.9475789070129395, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 11.013113629479797, "learning_rate": 8.636986956193235e-07, "logits/chosen": -0.0652175024151802, "logits/rejected": 0.053606174886226654, "logps/chosen": -2.545154333114624, "logps/rejected": -3.148621082305908, "loss": 0.5846, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.545154333114624, "rewards/margins": 0.6034666895866394, "rewards/rejected": -3.148621082305908, "sft_loss": 2.7204573154449463, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 10.651167349747164, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.03264901041984558, "logits/rejected": 0.10116423666477203, "logps/chosen": -2.4588916301727295, "logps/rejected": -3.0085623264312744, "loss": 0.5825, "rewards/accuracies": 0.71875, "rewards/chosen": -2.4588916301727295, "rewards/margins": 0.5496702790260315, "rewards/rejected": -3.0085623264312744, "sft_loss": 2.547887086868286, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 15.623065206500607, "learning_rate": 8.615542215511389e-07, "logits/chosen": 0.04355254024267197, "logits/rejected": 0.1262408196926117, "logps/chosen": -2.4711506366729736, "logps/rejected": -2.809096336364746, "loss": 0.6557, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.4711506366729736, "rewards/margins": 0.33794572949409485, "rewards/rejected": -2.809096336364746, "sft_loss": 2.529564619064331, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 13.881284427213853, "learning_rate": 8.604767176061241e-07, "logits/chosen": 0.08312052488327026, "logits/rejected": 0.16090969741344452, "logps/chosen": -2.548485279083252, "logps/rejected": -2.9776816368103027, "loss": 0.6146, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.548485279083252, "rewards/margins": 0.42919617891311646, "rewards/rejected": -2.9776816368103027, "sft_loss": 2.6493735313415527, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 8.169881197230874, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.057188816368579865, "logits/rejected": 0.09091904014348984, "logps/chosen": -2.39776349067688, "logps/rejected": -2.983966588973999, "loss": 0.5808, "rewards/accuracies": 0.71875, "rewards/chosen": -2.39776349067688, "rewards/margins": 0.5862034559249878, "rewards/rejected": -2.983966588973999, "sft_loss": 2.574922800064087, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 9.207422884537966, "learning_rate": 8.58311228163888e-07, "logits/chosen": -0.02254396490752697, "logits/rejected": 0.06835971772670746, "logps/chosen": -2.5146327018737793, "logps/rejected": -2.956911563873291, "loss": 0.6064, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.5146327018737793, "rewards/margins": 0.44227901101112366, "rewards/rejected": -2.956911563873291, "sft_loss": 2.6073718070983887, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 11.647852185686641, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.1361158937215805, "logits/rejected": 0.005249606911092997, "logps/chosen": -2.355909824371338, "logps/rejected": -3.086822509765625, "loss": 0.5193, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.355909824371338, "rewards/margins": 0.7309123277664185, "rewards/rejected": -3.086822509765625, "sft_loss": 2.463071346282959, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 10.142898164937732, "learning_rate": 8.561318334069511e-07, "logits/chosen": -0.011810372583568096, "logits/rejected": 0.1351340115070343, "logps/chosen": -2.4389145374298096, "logps/rejected": -2.9896650314331055, "loss": 0.5704, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.4389145374298096, "rewards/margins": 0.5507505536079407, "rewards/rejected": -2.9896650314331055, "sft_loss": 2.5153541564941406, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 9.915497698492498, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.061279840767383575, "logits/rejected": 0.09167703241109848, "logps/chosen": -2.593907117843628, "logps/rejected": -3.216207504272461, "loss": 0.5646, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.593907117843628, "rewards/margins": 0.6223001480102539, "rewards/rejected": -3.216207504272461, "sft_loss": 2.7145023345947266, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 11.75845777405208, "learning_rate": 8.539386179130977e-07, "logits/chosen": -0.03900023549795151, "logits/rejected": 0.016491200774908066, "logps/chosen": -2.6452884674072266, "logps/rejected": -3.208444118499756, "loss": 0.5861, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.6452884674072266, "rewards/margins": 0.5631558299064636, "rewards/rejected": -3.208444118499756, "sft_loss": 2.685883045196533, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 10.256406392333993, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.1221277266740799, "logits/rejected": 0.047670863568782806, "logps/chosen": -2.4664437770843506, "logps/rejected": -3.2738850116729736, "loss": 0.541, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4664437770843506, "rewards/margins": 0.807441234588623, "rewards/rejected": -3.2738850116729736, "sft_loss": 2.6216683387756348, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 11.766825056658277, "learning_rate": 8.51731666796467e-07, "logits/chosen": 0.008026264607906342, "logits/rejected": 0.0632171481847763, "logps/chosen": -2.8379218578338623, "logps/rejected": -3.3935370445251465, "loss": 0.6042, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.8379218578338623, "rewards/margins": 0.5556154251098633, "rewards/rejected": -3.3935370445251465, "sft_loss": 2.949826955795288, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 12.179530743385463, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.1061452180147171, "logits/rejected": -0.022467706352472305, "logps/chosen": -2.8701460361480713, "logps/rejected": -3.3405520915985107, "loss": 0.6117, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.8701460361480713, "rewards/margins": 0.47040629386901855, "rewards/rejected": -3.3405520915985107, "sft_loss": 2.990164279937744, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 10.285677662244042, "learning_rate": 8.495110657042488e-07, "logits/chosen": -0.057100921869277954, "logits/rejected": 0.10320702940225601, "logps/chosen": -3.0298407077789307, "logps/rejected": -3.7158799171447754, "loss": 0.5614, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.0298407077789307, "rewards/margins": 0.6860392093658447, "rewards/rejected": -3.7158799171447754, "sft_loss": 3.214426040649414, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 18.13952521344405, "learning_rate": 8.483956733269799e-07, "logits/chosen": -0.08308350294828415, "logits/rejected": 0.027438625693321228, "logps/chosen": -3.064002513885498, "logps/rejected": -3.6987290382385254, "loss": 0.6068, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.064002513885498, "rewards/margins": 0.6347264051437378, "rewards/rejected": -3.6987290382385254, "sft_loss": 3.298839569091797, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 15.704432394955504, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.21905367076396942, "logits/rejected": -0.06774080544710159, "logps/chosen": -3.2466511726379395, "logps/rejected": -3.815436840057373, "loss": 0.6149, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.2466511726379395, "rewards/margins": 0.5687858462333679, "rewards/rejected": -3.815436840057373, "sft_loss": 3.38019061088562, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 13.349749187918693, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.11080940067768097, "logits/rejected": 0.0334743857383728, "logps/chosen": -3.068286418914795, "logps/rejected": -3.7758116722106934, "loss": 0.6029, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.068286418914795, "rewards/margins": 0.7075250744819641, "rewards/rejected": -3.7758116722106934, "sft_loss": 3.3414759635925293, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 16.693407798704268, "learning_rate": 8.450292588271014e-07, "logits/chosen": -0.0996282771229744, "logits/rejected": 0.02317485585808754, "logps/chosen": -3.228316068649292, "logps/rejected": -3.8254122734069824, "loss": 0.6024, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.228316068649292, "rewards/margins": 0.5970960259437561, "rewards/rejected": -3.8254122734069824, "sft_loss": 3.3384742736816406, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 12.2068079158362, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.09745468199253082, "logits/rejected": -0.00797833502292633, "logps/chosen": -2.809922695159912, "logps/rejected": -3.3745949268341064, "loss": 0.6085, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.809922695159912, "rewards/margins": 0.5646719932556152, "rewards/rejected": -3.3745949268341064, "sft_loss": 2.949306011199951, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 10.777854658083896, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.13089512288570404, "logits/rejected": 0.013808004558086395, "logps/chosen": -2.7855277061462402, "logps/rejected": -3.438579559326172, "loss": 0.5581, "rewards/accuracies": 0.75, "rewards/chosen": -2.7855277061462402, "rewards/margins": 0.6530521512031555, "rewards/rejected": -3.438579559326172, "sft_loss": 2.905142307281494, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 14.023255853880016, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.16114047169685364, "logits/rejected": -0.011397371999919415, "logps/chosen": -2.7088358402252197, "logps/rejected": -3.1518595218658447, "loss": 0.6186, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.7088358402252197, "rewards/margins": 0.44302353262901306, "rewards/rejected": -3.1518595218658447, "sft_loss": 2.7553300857543945, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 10.75467613467904, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.03138697147369385, "logits/rejected": 0.13168206810951233, "logps/chosen": -2.6109001636505127, "logps/rejected": -3.409217119216919, "loss": 0.5165, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.6109001636505127, "rewards/margins": 0.7983168959617615, "rewards/rejected": -3.409217119216919, "sft_loss": 2.7761216163635254, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 9.570772308359619, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.10052132606506348, "logits/rejected": -0.013578740879893303, "logps/chosen": -2.5562167167663574, "logps/rejected": -3.0964174270629883, "loss": 0.6024, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.5562167167663574, "rewards/margins": 0.54020094871521, "rewards/rejected": -3.0964174270629883, "sft_loss": 2.6829943656921387, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 13.743912767629043, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.10133364051580429, "logits/rejected": 0.12248275429010391, "logps/chosen": -2.7118899822235107, "logps/rejected": -3.332120418548584, "loss": 0.567, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.7118899822235107, "rewards/margins": 0.620230495929718, "rewards/rejected": -3.332120418548584, "sft_loss": 2.9557461738586426, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 10.624028115849335, "learning_rate": 8.370576437587742e-07, "logits/chosen": -0.05728140473365784, "logits/rejected": -0.016837697476148605, "logps/chosen": -2.6872398853302, "logps/rejected": -3.2197844982147217, "loss": 0.5828, "rewards/accuracies": 0.71875, "rewards/chosen": -2.6872398853302, "rewards/margins": 0.5325449109077454, "rewards/rejected": -3.2197844982147217, "sft_loss": 2.794914722442627, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 10.188213475469688, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.1692381203174591, "logits/rejected": 0.10236310958862305, "logps/chosen": -2.693068265914917, "logps/rejected": -3.3337535858154297, "loss": 0.5717, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.693068265914917, "rewards/margins": 0.6406850814819336, "rewards/rejected": -3.3337535858154297, "sft_loss": 2.888385772705078, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 9.126408253755638, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.07820995151996613, "logits/rejected": 0.06870969384908676, "logps/chosen": -2.7257328033447266, "logps/rejected": -3.2857375144958496, "loss": 0.6121, "rewards/accuracies": 0.65625, "rewards/chosen": -2.7257328033447266, "rewards/margins": 0.5600049495697021, "rewards/rejected": -3.2857375144958496, "sft_loss": 2.827327251434326, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 8.491548692705383, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.17100057005882263, "logits/rejected": -0.023897219449281693, "logps/chosen": -2.6321306228637695, "logps/rejected": -3.1332454681396484, "loss": 0.5968, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.6321306228637695, "rewards/margins": 0.5011148452758789, "rewards/rejected": -3.1332454681396484, "sft_loss": 2.7944517135620117, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 9.103701034722597, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.10997509956359863, "logits/rejected": -0.007670065853744745, "logps/chosen": -2.7183170318603516, "logps/rejected": -3.2243950366973877, "loss": 0.5872, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.7183170318603516, "rewards/margins": 0.5060782432556152, "rewards/rejected": -3.2243950366973877, "sft_loss": 2.8439135551452637, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 8.442057525715944, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.13612933456897736, "logits/rejected": -0.045088279992341995, "logps/chosen": -2.5174169540405273, "logps/rejected": -3.1568408012390137, "loss": 0.565, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.5174169540405273, "rewards/margins": 0.6394241452217102, "rewards/rejected": -3.1568408012390137, "sft_loss": 2.7010550498962402, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 12.833982485757991, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.25432294607162476, "logits/rejected": -0.04225042089819908, "logps/chosen": -2.7238693237304688, "logps/rejected": -3.3072547912597656, "loss": 0.5698, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7238693237304688, "rewards/margins": 0.5833858251571655, "rewards/rejected": -3.3072547912597656, "sft_loss": 2.861285924911499, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 13.724749293706079, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.17298512160778046, "logits/rejected": -0.03237203508615494, "logps/chosen": -2.628120183944702, "logps/rejected": -3.2568199634552, "loss": 0.5586, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.628120183944702, "rewards/margins": 0.6286996603012085, "rewards/rejected": -3.2568199634552, "sft_loss": 2.8256094455718994, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 14.119179582690768, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.2202630490064621, "logits/rejected": -0.010647903196513653, "logps/chosen": -2.709089994430542, "logps/rejected": -3.362189531326294, "loss": 0.5749, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.709089994430542, "rewards/margins": 0.6530997157096863, "rewards/rejected": -3.362189531326294, "sft_loss": 2.794367790222168, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 10.603355210152403, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.13367509841918945, "logits/rejected": -0.0059948088601231575, "logps/chosen": -2.6444592475891113, "logps/rejected": -3.3636856079101562, "loss": 0.5557, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.6444592475891113, "rewards/margins": 0.7192264795303345, "rewards/rejected": -3.3636856079101562, "sft_loss": 2.6509063243865967, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 7.671753470196667, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.22960379719734192, "logits/rejected": -0.06299453228712082, "logps/chosen": -2.6862707138061523, "logps/rejected": -3.4968085289001465, "loss": 0.5132, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.6862707138061523, "rewards/margins": 0.810538113117218, "rewards/rejected": -3.4968085289001465, "sft_loss": 2.782322406768799, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 10.997777173510148, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.13848312199115753, "logits/rejected": 0.09042768180370331, "logps/chosen": -2.704963207244873, "logps/rejected": -3.3764922618865967, "loss": 0.5578, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.704963207244873, "rewards/margins": 0.6715291738510132, "rewards/rejected": -3.3764922618865967, "sft_loss": 2.77587628364563, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 11.821301015944938, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.1645531952381134, "logits/rejected": -0.09662993997335434, "logps/chosen": -2.6595356464385986, "logps/rejected": -3.2250595092773438, "loss": 0.567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.6595356464385986, "rewards/margins": 0.5655234456062317, "rewards/rejected": -3.2250595092773438, "sft_loss": 2.7365801334381104, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 11.270848247245398, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.1551237255334854, "logits/rejected": -0.11444219201803207, "logps/chosen": -2.605210542678833, "logps/rejected": -3.154036045074463, "loss": 0.5935, "rewards/accuracies": 0.6875, "rewards/chosen": -2.605210542678833, "rewards/margins": 0.5488253235816956, "rewards/rejected": -3.154036045074463, "sft_loss": 2.6858043670654297, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 10.422792273873622, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.09268782287836075, "logits/rejected": -0.012595447711646557, "logps/chosen": -2.611571788787842, "logps/rejected": -3.4692039489746094, "loss": 0.5112, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.611571788787842, "rewards/margins": 0.8576324582099915, "rewards/rejected": -3.4692039489746094, "sft_loss": 2.760436534881592, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 9.90903026753714, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.16010849177837372, "logits/rejected": 0.008882254362106323, "logps/chosen": -2.4900741577148438, "logps/rejected": -3.0593371391296387, "loss": 0.5737, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.4900741577148438, "rewards/margins": 0.569263219833374, "rewards/rejected": -3.0593371391296387, "sft_loss": 2.582530975341797, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 11.780133581006648, "learning_rate": 8.182413576596385e-07, "logits/chosen": -0.017460066825151443, "logits/rejected": 0.07679884135723114, "logps/chosen": -2.45796275138855, "logps/rejected": -3.05059814453125, "loss": 0.5745, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.45796275138855, "rewards/margins": 0.5926356911659241, "rewards/rejected": -3.05059814453125, "sft_loss": 2.583042860031128, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 11.184196786986732, "learning_rate": 8.170386052085389e-07, "logits/chosen": -0.021548813208937645, "logits/rejected": 0.09033869951963425, "logps/chosen": -2.6186671257019043, "logps/rejected": -3.222067356109619, "loss": 0.6004, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.6186671257019043, "rewards/margins": 0.6034001111984253, "rewards/rejected": -3.222067356109619, "sft_loss": 2.7017316818237305, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 11.45281124333676, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.0954420417547226, "logits/rejected": 0.07028938829898834, "logps/chosen": -2.580390453338623, "logps/rejected": -3.0125155448913574, "loss": 0.6408, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.580390453338623, "rewards/margins": 0.4321257174015045, "rewards/rejected": -3.0125155448913574, "sft_loss": 2.6577117443084717, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 12.735874658213135, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.13700740039348602, "logits/rejected": -0.03511708974838257, "logps/chosen": -2.569897174835205, "logps/rejected": -3.0620884895324707, "loss": 0.5878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.569897174835205, "rewards/margins": 0.4921916127204895, "rewards/rejected": -3.0620884895324707, "sft_loss": 2.6316020488739014, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 12.016988520778332, "learning_rate": 8.134119392211476e-07, "logits/chosen": -0.04812765121459961, "logits/rejected": 0.11438647657632828, "logps/chosen": -2.4896674156188965, "logps/rejected": -3.2544426918029785, "loss": 0.5309, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.4896674156188965, "rewards/margins": 0.7647750973701477, "rewards/rejected": -3.2544426918029785, "sft_loss": 2.6088204383850098, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 15.774458155051652, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.1408371925354004, "logits/rejected": 0.027349501848220825, "logps/chosen": -2.6003634929656982, "logps/rejected": -3.204463243484497, "loss": 0.6075, "rewards/accuracies": 0.6875, "rewards/chosen": -2.6003634929656982, "rewards/margins": 0.6040997505187988, "rewards/rejected": -3.204463243484497, "sft_loss": 2.717322826385498, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.24547554552555084, "eval_logits/rejected": 0.3535785973072052, "eval_logps/chosen": -2.534740924835205, "eval_logps/rejected": -3.195641040802002, "eval_loss": 0.5675076842308044, "eval_rewards/accuracies": 0.716617226600647, "eval_rewards/chosen": -2.534740924835205, "eval_rewards/margins": 0.6609002351760864, "eval_rewards/rejected": -3.195641040802002, "eval_runtime": 49.7579, "eval_samples_per_second": 27.031, "eval_sft_loss": 2.677027702331543, "eval_steps_per_second": 6.773, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 12.550971047870846, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.09266819059848785, "logits/rejected": -0.02057764120399952, "logps/chosen": -2.560471773147583, "logps/rejected": -3.081770658493042, "loss": 0.6251, "rewards/accuracies": 0.65625, "rewards/chosen": -2.560471773147583, "rewards/margins": 0.5212991833686829, "rewards/rejected": -3.081770658493042, "sft_loss": 2.682894706726074, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 14.01923033463733, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.05897900462150574, "logits/rejected": 0.06726238876581192, "logps/chosen": -2.394794464111328, "logps/rejected": -2.990241289138794, "loss": 0.5569, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.394794464111328, "rewards/margins": 0.5954467058181763, "rewards/rejected": -2.990241289138794, "sft_loss": 2.538649559020996, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 10.839356405411152, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.1080300584435463, "logits/rejected": 0.028784072026610374, "logps/chosen": -2.469409465789795, "logps/rejected": -3.049816608428955, "loss": 0.5759, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.469409465789795, "rewards/margins": 0.5804071426391602, "rewards/rejected": -3.049816608428955, "sft_loss": 2.5252418518066406, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 13.273536157240635, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.2106810063123703, "logits/rejected": 0.010429046116769314, "logps/chosen": -2.5872609615325928, "logps/rejected": -3.283881664276123, "loss": 0.5248, "rewards/accuracies": 0.75, "rewards/chosen": -2.5872609615325928, "rewards/margins": 0.6966210007667542, "rewards/rejected": -3.283881664276123, "sft_loss": 2.6289219856262207, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 10.380937321819658, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.04295419156551361, "logits/rejected": 0.11656694114208221, "logps/chosen": -2.4731650352478027, "logps/rejected": -3.125133991241455, "loss": 0.5726, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.4731650352478027, "rewards/margins": 0.6519689559936523, "rewards/rejected": -3.125133991241455, "sft_loss": 2.6329140663146973, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 8.969741241293566, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.13495591282844543, "logits/rejected": -0.09867937862873077, "logps/chosen": -2.4684181213378906, "logps/rejected": -3.0391838550567627, "loss": 0.57, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.4684181213378906, "rewards/margins": 0.5707659721374512, "rewards/rejected": -3.0391838550567627, "sft_loss": 2.571349620819092, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 15.580026823869845, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.15621896088123322, "logits/rejected": 0.01897679828107357, "logps/chosen": -2.595720052719116, "logps/rejected": -3.1915364265441895, "loss": 0.5966, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.595720052719116, "rewards/margins": 0.5958161950111389, "rewards/rejected": -3.1915364265441895, "sft_loss": 2.727245330810547, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 22.842739177982196, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.22418169677257538, "logits/rejected": -0.1372956484556198, "logps/chosen": -2.629754066467285, "logps/rejected": -3.2748329639434814, "loss": 0.58, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.629754066467285, "rewards/margins": 0.645078718662262, "rewards/rejected": -3.2748329639434814, "sft_loss": 2.68192982673645, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 13.12177568543038, "learning_rate": 8.01127203488458e-07, "logits/chosen": -0.08619613200426102, "logits/rejected": -0.030371153727173805, "logps/chosen": -2.731705665588379, "logps/rejected": -3.35599946975708, "loss": 0.5613, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.731705665588379, "rewards/margins": 0.6242938041687012, "rewards/rejected": -3.35599946975708, "sft_loss": 2.846529960632324, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 13.03628061104775, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.17338579893112183, "logits/rejected": -0.01972215436398983, "logps/chosen": -2.701442003250122, "logps/rejected": -3.3588452339172363, "loss": 0.5609, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.701442003250122, "rewards/margins": 0.6574033498764038, "rewards/rejected": -3.3588452339172363, "sft_loss": 2.878345012664795, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 13.795442492625112, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.12506094574928284, "logits/rejected": 0.049610551446676254, "logps/chosen": -2.8180489540100098, "logps/rejected": -3.6988799571990967, "loss": 0.5293, "rewards/accuracies": 0.75, "rewards/chosen": -2.8180489540100098, "rewards/margins": 0.8808309435844421, "rewards/rejected": -3.6988799571990967, "sft_loss": 3.0852441787719727, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 18.801175453987884, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.09460072964429855, "logits/rejected": -0.019903432577848434, "logps/chosen": -2.8260114192962646, "logps/rejected": -3.6575844287872314, "loss": 0.5206, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.8260114192962646, "rewards/margins": 0.8315728306770325, "rewards/rejected": -3.6575844287872314, "sft_loss": 3.039412498474121, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 15.622175643204827, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.13542449474334717, "logits/rejected": 0.031426429748535156, "logps/chosen": -2.97768235206604, "logps/rejected": -3.8668739795684814, "loss": 0.5144, "rewards/accuracies": 0.78125, "rewards/chosen": -2.97768235206604, "rewards/margins": 0.8891918063163757, "rewards/rejected": -3.8668739795684814, "sft_loss": 3.1043858528137207, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 19.316503635391115, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.10220174491405487, "logits/rejected": 0.06846420466899872, "logps/chosen": -2.8224167823791504, "logps/rejected": -3.7612388134002686, "loss": 0.4854, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8224167823791504, "rewards/margins": 0.9388219714164734, "rewards/rejected": -3.7612388134002686, "sft_loss": 2.9477078914642334, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 18.5568200414671, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.1605706512928009, "logits/rejected": -0.02944830060005188, "logps/chosen": -3.0751566886901855, "logps/rejected": -3.7981693744659424, "loss": 0.5777, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.0751566886901855, "rewards/margins": 0.7230121493339539, "rewards/rejected": -3.7981693744659424, "sft_loss": 3.372133731842041, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 13.343185976912125, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.21381273865699768, "logits/rejected": -0.019857224076986313, "logps/chosen": -2.8909995555877686, "logps/rejected": -3.6995689868927, "loss": 0.5401, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8909995555877686, "rewards/margins": 0.8085689544677734, "rewards/rejected": -3.6995689868927, "sft_loss": 3.1301767826080322, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 17.805283689818808, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.14839546382427216, "logits/rejected": -0.04645346850156784, "logps/chosen": -2.9273176193237305, "logps/rejected": -3.6850764751434326, "loss": 0.5572, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.9273176193237305, "rewards/margins": 0.7577590942382812, "rewards/rejected": -3.6850764751434326, "sft_loss": 3.088479518890381, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 14.030737674244467, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.20233619213104248, "logits/rejected": -0.18629398941993713, "logps/chosen": -2.8093066215515137, "logps/rejected": -3.3424696922302246, "loss": 0.6199, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.8093066215515137, "rewards/margins": 0.5331630706787109, "rewards/rejected": -3.3424696922302246, "sft_loss": 3.0653176307678223, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 14.668997301854489, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.221414715051651, "logits/rejected": -0.0698011964559555, "logps/chosen": -2.9920241832733154, "logps/rejected": -3.6130146980285645, "loss": 0.5959, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9920241832733154, "rewards/margins": 0.6209903359413147, "rewards/rejected": -3.6130146980285645, "sft_loss": 3.0735440254211426, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 15.491495900754604, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.10512229055166245, "logits/rejected": -0.03650935739278793, "logps/chosen": -2.888662338256836, "logps/rejected": -3.501347064971924, "loss": 0.5887, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.888662338256836, "rewards/margins": 0.612684428691864, "rewards/rejected": -3.501347064971924, "sft_loss": 2.9852821826934814, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 14.576856389525108, "learning_rate": 7.860009773741896e-07, "logits/chosen": -0.08463722467422485, "logits/rejected": 0.07376393675804138, "logps/chosen": -2.693789482116699, "logps/rejected": -3.459043025970459, "loss": 0.5221, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.693789482116699, "rewards/margins": 0.7652538418769836, "rewards/rejected": -3.459043025970459, "sft_loss": 2.7604660987854004, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 16.062435436979722, "learning_rate": 7.84722128999652e-07, "logits/chosen": -0.13985982537269592, "logits/rejected": 0.019844168797135353, "logps/chosen": -2.6246771812438965, "logps/rejected": -3.4403557777404785, "loss": 0.5577, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.6246771812438965, "rewards/margins": 0.815678596496582, "rewards/rejected": -3.4403557777404785, "sft_loss": 2.7694716453552246, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 13.179507477842277, "learning_rate": 7.834405182497699e-07, "logits/chosen": -0.01281226146966219, "logits/rejected": 0.046844713389873505, "logps/chosen": -2.728203535079956, "logps/rejected": -3.4431490898132324, "loss": 0.5681, "rewards/accuracies": 0.71875, "rewards/chosen": -2.728203535079956, "rewards/margins": 0.7149455547332764, "rewards/rejected": -3.4431490898132324, "sft_loss": 2.8578782081604004, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 12.75446394519099, "learning_rate": 7.821561575587368e-07, "logits/chosen": -0.14359095692634583, "logits/rejected": -0.0907754972577095, "logps/chosen": -2.6554677486419678, "logps/rejected": -3.2389018535614014, "loss": 0.5536, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.6554677486419678, "rewards/margins": 0.5834343433380127, "rewards/rejected": -3.2389018535614014, "sft_loss": 2.838890790939331, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 10.722427530408353, "learning_rate": 7.808690593874254e-07, "logits/chosen": -0.13219062983989716, "logits/rejected": -0.03505239635705948, "logps/chosen": -2.730032444000244, "logps/rejected": -3.409017562866211, "loss": 0.5558, "rewards/accuracies": 0.6875, "rewards/chosen": -2.730032444000244, "rewards/margins": 0.6789852380752563, "rewards/rejected": -3.409017562866211, "sft_loss": 2.899423360824585, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 13.255145077665803, "learning_rate": 7.79579236223268e-07, "logits/chosen": -0.09360867738723755, "logits/rejected": 0.18282029032707214, "logps/chosen": -2.653068780899048, "logps/rejected": -3.4807612895965576, "loss": 0.526, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.653068780899048, "rewards/margins": 0.8276923894882202, "rewards/rejected": -3.4807612895965576, "sft_loss": 2.8217830657958984, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 13.204016973156914, "learning_rate": 7.782867005801346e-07, "logits/chosen": -0.10313459485769272, "logits/rejected": 0.11331169307231903, "logps/chosen": -2.686976909637451, "logps/rejected": -3.457796096801758, "loss": 0.5432, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.686976909637451, "rewards/margins": 0.7708194255828857, "rewards/rejected": -3.457796096801758, "sft_loss": 2.7527642250061035, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 17.918813310487703, "learning_rate": 7.769914649982117e-07, "logits/chosen": -0.09085582196712494, "logits/rejected": 0.07258772850036621, "logps/chosen": -2.6320650577545166, "logps/rejected": -3.3432857990264893, "loss": 0.5614, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.6320650577545166, "rewards/margins": 0.7112206220626831, "rewards/rejected": -3.3432857990264893, "sft_loss": 2.7670674324035645, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 13.3017856458366, "learning_rate": 7.756935420438803e-07, "logits/chosen": -0.1044057160615921, "logits/rejected": 0.009650531224906445, "logps/chosen": -2.556666135787964, "logps/rejected": -3.480056047439575, "loss": 0.5091, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.556666135787964, "rewards/margins": 0.923389732837677, "rewards/rejected": -3.480056047439575, "sft_loss": 2.6876208782196045, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 9.98772600061051, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.11959944665431976, "logits/rejected": -0.039528582245111465, "logps/chosen": -2.725451946258545, "logps/rejected": -3.507183790206909, "loss": 0.5099, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.725451946258545, "rewards/margins": 0.7817317843437195, "rewards/rejected": -3.507183790206909, "sft_loss": 2.7946159839630127, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 13.38597052407973, "learning_rate": 7.730896844137609e-07, "logits/chosen": -0.08520710468292236, "logits/rejected": 0.016001040115952492, "logps/chosen": -2.9222166538238525, "logps/rejected": -3.529139280319214, "loss": 0.6033, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.9222166538238525, "rewards/margins": 0.6069226861000061, "rewards/rejected": -3.529139280319214, "sft_loss": 3.0721733570098877, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 16.46895036226173, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.1309281587600708, "logits/rejected": -0.0354643315076828, "logps/chosen": -2.7004354000091553, "logps/rejected": -3.518594741821289, "loss": 0.5459, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7004354000091553, "rewards/margins": 0.8181589841842651, "rewards/rejected": -3.518594741821289, "sft_loss": 2.789832592010498, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 12.847039245754097, "learning_rate": 7.704752287400832e-07, "logits/chosen": -0.08045268058776855, "logits/rejected": 0.1180802583694458, "logps/chosen": -2.740691661834717, "logps/rejected": -3.5636367797851562, "loss": 0.551, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.740691661834717, "rewards/margins": 0.8229446411132812, "rewards/rejected": -3.5636367797851562, "sft_loss": 2.8766980171203613, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 9.856853536296434, "learning_rate": 7.691640583277004e-07, "logits/chosen": -0.07948831468820572, "logits/rejected": 0.099387027323246, "logps/chosen": -2.6131155490875244, "logps/rejected": -3.3903956413269043, "loss": 0.5483, "rewards/accuracies": 0.75, "rewards/chosen": -2.6131155490875244, "rewards/margins": 0.7772801518440247, "rewards/rejected": -3.3903956413269043, "sft_loss": 2.744523286819458, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 10.170757207302868, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.1351918876171112, "logits/rejected": 0.06504303961992264, "logps/chosen": -2.585547685623169, "logps/rejected": -3.219287157058716, "loss": 0.5553, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.585547685623169, "rewards/margins": 0.6337396502494812, "rewards/rejected": -3.219287157058716, "sft_loss": 2.7263667583465576, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 11.671673346294954, "learning_rate": 7.665338959566288e-07, "logits/chosen": -0.12973728775978088, "logits/rejected": -0.02659899927675724, "logps/chosen": -2.515537738800049, "logps/rejected": -3.344698667526245, "loss": 0.5147, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.515537738800049, "rewards/margins": 0.8291610479354858, "rewards/rejected": -3.344698667526245, "sft_loss": 2.676945924758911, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 13.71221290860054, "learning_rate": 7.652149295157868e-07, "logits/chosen": -0.047227486968040466, "logits/rejected": 0.1276738941669464, "logps/chosen": -2.6473500728607178, "logps/rejected": -3.255404233932495, "loss": 0.5629, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6473500728607178, "rewards/margins": 0.6080541610717773, "rewards/rejected": -3.255404233932495, "sft_loss": 2.734590530395508, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 12.438608083959897, "learning_rate": 7.638933899585354e-07, "logits/chosen": 0.027154380455613136, "logits/rejected": 0.06761490553617477, "logps/chosen": -2.582916259765625, "logps/rejected": -3.297471284866333, "loss": 0.5451, "rewards/accuracies": 0.78125, "rewards/chosen": -2.582916259765625, "rewards/margins": 0.7145551443099976, "rewards/rejected": -3.297471284866333, "sft_loss": 2.8727736473083496, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 11.917197558048063, "learning_rate": 7.625692901064573e-07, "logits/chosen": -0.04968901723623276, "logits/rejected": 0.06473871320486069, "logps/chosen": -2.7895896434783936, "logps/rejected": -3.569030284881592, "loss": 0.5934, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.7895896434783936, "rewards/margins": 0.7794402837753296, "rewards/rejected": -3.569030284881592, "sft_loss": 3.034745693206787, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 11.495106920955104, "learning_rate": 7.61242642805975e-07, "logits/chosen": -0.10112420469522476, "logits/rejected": -0.10614802688360214, "logps/chosen": -2.657148599624634, "logps/rejected": -3.3440566062927246, "loss": 0.5671, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.657148599624634, "rewards/margins": 0.6869081258773804, "rewards/rejected": -3.3440566062927246, "sft_loss": 2.8682680130004883, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 12.574700982249729, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.16951291263103485, "logits/rejected": 0.036975160241127014, "logps/chosen": -2.642416000366211, "logps/rejected": -3.3090901374816895, "loss": 0.5598, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.642416000366211, "rewards/margins": 0.6666739583015442, "rewards/rejected": -3.3090901374816895, "sft_loss": 2.740629196166992, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 13.447116701349463, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.2006872147321701, "logits/rejected": -0.06577114015817642, "logps/chosen": -2.430807113647461, "logps/rejected": -3.3179214000701904, "loss": 0.4972, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.430807113647461, "rewards/margins": 0.8871143460273743, "rewards/rejected": -3.3179214000701904, "sft_loss": 2.5946366786956787, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 13.341152863137065, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.15490292012691498, "logits/rejected": -0.05388979986310005, "logps/chosen": -2.692946195602417, "logps/rejected": -3.3634724617004395, "loss": 0.5757, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.692946195602417, "rewards/margins": 0.6705261468887329, "rewards/rejected": -3.3634724617004395, "sft_loss": 2.7373454570770264, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 13.048397341515892, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.1885337084531784, "logits/rejected": -0.05167509242892265, "logps/chosen": -2.5184414386749268, "logps/rejected": -3.193661689758301, "loss": 0.5845, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.5184414386749268, "rewards/margins": 0.6752203702926636, "rewards/rejected": -3.193661689758301, "sft_loss": 2.647007703781128, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 11.605630875359319, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.2051846981048584, "logits/rejected": 0.06742997467517853, "logps/chosen": -2.474386215209961, "logps/rejected": -3.319561004638672, "loss": 0.5138, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.474386215209961, "rewards/margins": 0.8451749682426453, "rewards/rejected": -3.319561004638672, "sft_loss": 2.6089258193969727, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 16.90516707069004, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.17644917964935303, "logits/rejected": 0.006234446074813604, "logps/chosen": -2.490910053253174, "logps/rejected": -3.3691134452819824, "loss": 0.5263, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.490910053253174, "rewards/margins": 0.8782032132148743, "rewards/rejected": -3.3691134452819824, "sft_loss": 2.5951409339904785, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 24.034250993795567, "learning_rate": 7.518858674054838e-07, "logits/chosen": -0.184208482503891, "logits/rejected": 0.034039318561553955, "logps/chosen": -2.527522563934326, "logps/rejected": -3.3448543548583984, "loss": 0.531, "rewards/accuracies": 0.75, "rewards/chosen": -2.527522563934326, "rewards/margins": 0.8173316717147827, "rewards/rejected": -3.3448543548583984, "sft_loss": 2.6108803749084473, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 11.839183488169162, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.15044936537742615, "logits/rejected": 0.0458507277071476, "logps/chosen": -2.6617484092712402, "logps/rejected": -3.3631751537323, "loss": 0.5553, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.6617484092712402, "rewards/margins": 0.7014267444610596, "rewards/rejected": -3.3631751537323, "sft_loss": 2.855072021484375, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 14.701365832973247, "learning_rate": 7.491903137181501e-07, "logits/chosen": -0.09620432555675507, "logits/rejected": -0.04263802617788315, "logps/chosen": -2.5127742290496826, "logps/rejected": -3.233936309814453, "loss": 0.5351, "rewards/accuracies": 0.75, "rewards/chosen": -2.5127742290496826, "rewards/margins": 0.7211618423461914, "rewards/rejected": -3.233936309814453, "sft_loss": 2.6880221366882324, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 13.0410833738944, "learning_rate": 7.478389038622441e-07, "logits/chosen": -0.036066021770238876, "logits/rejected": 0.009444182738661766, "logps/chosen": -2.5961806774139404, "logps/rejected": -3.322415590286255, "loss": 0.5558, "rewards/accuracies": 0.75, "rewards/chosen": -2.5961806774139404, "rewards/margins": 0.7262347936630249, "rewards/rejected": -3.322415590286255, "sft_loss": 2.7093794345855713, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 16.972296740618447, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.15286365151405334, "logits/rejected": -0.0631868839263916, "logps/chosen": -2.5808475017547607, "logps/rejected": -3.2137513160705566, "loss": 0.5988, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.5808475017547607, "rewards/margins": 0.6329033374786377, "rewards/rejected": -3.2137513160705566, "sft_loss": 2.66843581199646, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 12.446230685664725, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.09740933030843735, "logits/rejected": -0.09939023107290268, "logps/chosen": -2.4693121910095215, "logps/rejected": -3.026426315307617, "loss": 0.5838, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4693121910095215, "rewards/margins": 0.5571140646934509, "rewards/rejected": -3.026426315307617, "sft_loss": 2.6136717796325684, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 11.761230687897601, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.13074907660484314, "logits/rejected": -0.030215833336114883, "logps/chosen": -2.479123115539551, "logps/rejected": -3.2345752716064453, "loss": 0.53, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.479123115539551, "rewards/margins": 0.7554522752761841, "rewards/rejected": -3.2345752716064453, "sft_loss": 2.625739574432373, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 16.20048329992292, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.23178374767303467, "logits/rejected": -0.036921434104442596, "logps/chosen": -2.546027421951294, "logps/rejected": -3.364816665649414, "loss": 0.5121, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.546027421951294, "rewards/margins": 0.8187891244888306, "rewards/rejected": -3.364816665649414, "sft_loss": 2.596057891845703, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 13.237538300780992, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.15380941331386566, "logits/rejected": 0.012906426563858986, "logps/chosen": -2.4451916217803955, "logps/rejected": -3.311616897583008, "loss": 0.497, "rewards/accuracies": 0.78125, "rewards/chosen": -2.4451916217803955, "rewards/margins": 0.8664249181747437, "rewards/rejected": -3.311616897583008, "sft_loss": 2.575024127960205, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 13.956827265019383, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.22626054286956787, "logits/rejected": -0.012976361438632011, "logps/chosen": -2.4243850708007812, "logps/rejected": -3.1488213539123535, "loss": 0.5332, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4243850708007812, "rewards/margins": 0.7244361639022827, "rewards/rejected": -3.1488213539123535, "sft_loss": 2.582690477371216, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 10.327241906099365, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.19221287965774536, "logits/rejected": 0.026269104331731796, "logps/chosen": -2.713867425918579, "logps/rejected": -3.5505664348602295, "loss": 0.525, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.713867425918579, "rewards/margins": 0.8366985321044922, "rewards/rejected": -3.5505664348602295, "sft_loss": 2.8076956272125244, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 16.810249159493672, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.2720238268375397, "logits/rejected": -0.0906338170170784, "logps/chosen": -2.614173650741577, "logps/rejected": -3.446650266647339, "loss": 0.5747, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.614173650741577, "rewards/margins": 0.8324767351150513, "rewards/rejected": -3.446650266647339, "sft_loss": 2.7610104084014893, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 13.34223976398338, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.2368827760219574, "logits/rejected": -0.1460256278514862, "logps/chosen": -2.7980031967163086, "logps/rejected": -3.4800288677215576, "loss": 0.61, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.7980031967163086, "rewards/margins": 0.6820257306098938, "rewards/rejected": -3.4800288677215576, "sft_loss": 2.993485689163208, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 18.378836446546607, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.1851898729801178, "logits/rejected": -0.03527377173304558, "logps/chosen": -2.6871867179870605, "logps/rejected": -3.432643175125122, "loss": 0.5874, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.6871867179870605, "rewards/margins": 0.7454566359519958, "rewards/rejected": -3.432643175125122, "sft_loss": 2.8437156677246094, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 15.379182647175359, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.19557121396064758, "logits/rejected": -0.06361202895641327, "logps/chosen": -2.899049758911133, "logps/rejected": -3.6863160133361816, "loss": 0.5622, "rewards/accuracies": 0.75, "rewards/chosen": -2.899049758911133, "rewards/margins": 0.7872657775878906, "rewards/rejected": -3.6863160133361816, "sft_loss": 2.9786953926086426, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 19.226688805785205, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.15122616291046143, "logits/rejected": 0.1265804022550583, "logps/chosen": -2.5949044227600098, "logps/rejected": -3.4804539680480957, "loss": 0.5106, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.5949044227600098, "rewards/margins": 0.8855496644973755, "rewards/rejected": -3.4804539680480957, "sft_loss": 2.70125412940979, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 12.066918686519287, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.22271844744682312, "logits/rejected": 0.0023609772324562073, "logps/chosen": -2.7400269508361816, "logps/rejected": -3.5635292530059814, "loss": 0.5176, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7400269508361816, "rewards/margins": 0.8235026597976685, "rewards/rejected": -3.5635292530059814, "sft_loss": 2.8706085681915283, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 10.177395026790624, "learning_rate": 7.286726973755554e-07, "logits/chosen": -0.08435920625925064, "logits/rejected": -0.04384620115160942, "logps/chosen": -2.7082793712615967, "logps/rejected": -3.4856247901916504, "loss": 0.525, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7082793712615967, "rewards/margins": 0.7773455381393433, "rewards/rejected": -3.4856247901916504, "sft_loss": 2.8122072219848633, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 11.947349382256055, "learning_rate": 7.272866090342493e-07, "logits/chosen": -0.044284939765930176, "logits/rejected": 0.03796308487653732, "logps/chosen": -2.7693889141082764, "logps/rejected": -3.5747649669647217, "loss": 0.511, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7693889141082764, "rewards/margins": 0.8053762316703796, "rewards/rejected": -3.5747649669647217, "sft_loss": 2.8007047176361084, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 15.120074288295065, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.20141425728797913, "logits/rejected": -0.08690561354160309, "logps/chosen": -2.728034496307373, "logps/rejected": -3.516979932785034, "loss": 0.56, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.728034496307373, "rewards/margins": 0.7889455556869507, "rewards/rejected": -3.516979932785034, "sft_loss": 2.9466300010681152, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 12.277672817485561, "learning_rate": 7.245078304138335e-07, "logits/chosen": -0.12988251447677612, "logits/rejected": -0.03518949821591377, "logps/chosen": -2.8811938762664795, "logps/rejected": -3.6656360626220703, "loss": 0.542, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8811938762664795, "rewards/margins": 0.7844420075416565, "rewards/rejected": -3.6656360626220703, "sft_loss": 2.9895501136779785, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 11.712712073445397, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.28072601556777954, "logits/rejected": -0.0645415335893631, "logps/chosen": -2.8859238624572754, "logps/rejected": -3.632223606109619, "loss": 0.5675, "rewards/accuracies": 0.6875, "rewards/chosen": -2.8859238624572754, "rewards/margins": 0.7462996244430542, "rewards/rejected": -3.632223606109619, "sft_loss": 2.982220411300659, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 12.946250592719066, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.2070569097995758, "logits/rejected": -0.017365068197250366, "logps/chosen": -2.7785403728485107, "logps/rejected": -3.644423246383667, "loss": 0.5313, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.7785403728485107, "rewards/margins": 0.8658832311630249, "rewards/rejected": -3.644423246383667, "sft_loss": 2.9232311248779297, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 11.736503117054808, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.1762668788433075, "logits/rejected": 0.003911969251930714, "logps/chosen": -2.836501359939575, "logps/rejected": -3.646477460861206, "loss": 0.5416, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.836501359939575, "rewards/margins": 0.8099767565727234, "rewards/rejected": -3.646477460861206, "sft_loss": 2.9586517810821533, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 15.007043656895517, "learning_rate": 7.189242433016852e-07, "logits/chosen": -0.17761953175067902, "logits/rejected": -0.031355392187833786, "logps/chosen": -2.6332767009735107, "logps/rejected": -3.617527723312378, "loss": 0.5132, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.6332767009735107, "rewards/margins": 0.9842513799667358, "rewards/rejected": -3.617527723312378, "sft_loss": 2.784289836883545, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 16.31340823948697, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.20015859603881836, "logits/rejected": -0.02334914542734623, "logps/chosen": -2.89608097076416, "logps/rejected": -3.7705636024475098, "loss": 0.5374, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.89608097076416, "rewards/margins": 0.8744827508926392, "rewards/rejected": -3.7705636024475098, "sft_loss": 3.092453956604004, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 14.44861023226222, "learning_rate": 7.161196514973734e-07, "logits/chosen": -0.19069427251815796, "logits/rejected": -0.0180917177349329, "logps/chosen": -2.741619110107422, "logps/rejected": -3.618983745574951, "loss": 0.5324, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.741619110107422, "rewards/margins": 0.877364993095398, "rewards/rejected": -3.618983745574951, "sft_loss": 2.8957886695861816, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 16.48183837907669, "learning_rate": 7.147142035968483e-07, "logits/chosen": -0.13300219178199768, "logits/rejected": 0.04468105733394623, "logps/chosen": -2.9447832107543945, "logps/rejected": -3.7198548316955566, "loss": 0.5551, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9447832107543945, "rewards/margins": 0.7750714421272278, "rewards/rejected": -3.7198548316955566, "sft_loss": 3.105386257171631, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 14.999334086996232, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.29770466685295105, "logits/rejected": -0.10078287124633789, "logps/chosen": -2.7202506065368652, "logps/rejected": -3.578933000564575, "loss": 0.5286, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.7202506065368652, "rewards/margins": 0.8586821556091309, "rewards/rejected": -3.578933000564575, "sft_loss": 2.8595337867736816, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 14.321699959576042, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.21056774258613586, "logits/rejected": -0.02034085802733898, "logps/chosen": -2.89121675491333, "logps/rejected": -3.8506247997283936, "loss": 0.5274, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.89121675491333, "rewards/margins": 0.9594081044197083, "rewards/rejected": -3.8506247997283936, "sft_loss": 3.043997287750244, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 13.887602835497312, "learning_rate": 7.104854155899711e-07, "logits/chosen": -0.13619615137577057, "logits/rejected": 0.0005922317504882812, "logps/chosen": -2.908087968826294, "logps/rejected": -3.741673707962036, "loss": 0.5438, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.908087968826294, "rewards/margins": 0.8335857391357422, "rewards/rejected": -3.741673707962036, "sft_loss": 3.002686023712158, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 16.49687038495756, "learning_rate": 7.090717170722817e-07, "logits/chosen": -0.12130200862884521, "logits/rejected": -0.04878392815589905, "logps/chosen": -2.851966381072998, "logps/rejected": -3.7922961711883545, "loss": 0.4991, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.851966381072998, "rewards/margins": 0.9403297305107117, "rewards/rejected": -3.7922961711883545, "sft_loss": 2.9816746711730957, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 16.483275366685195, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.2887728214263916, "logits/rejected": -0.1400589495897293, "logps/chosen": -2.6632139682769775, "logps/rejected": -3.386286973953247, "loss": 0.5552, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.6632139682769775, "rewards/margins": 0.7230730056762695, "rewards/rejected": -3.386286973953247, "sft_loss": 2.812216281890869, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 23.26258200474679, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.2058527171611786, "logits/rejected": -0.06673066318035126, "logps/chosen": -2.7234292030334473, "logps/rejected": -3.306281328201294, "loss": 0.5886, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.7234292030334473, "rewards/margins": 0.5828520059585571, "rewards/rejected": -3.306281328201294, "sft_loss": 2.842212200164795, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.13509926199913025, "eval_logits/rejected": 0.24084508419036865, "eval_logps/chosen": -2.800790786743164, "eval_logps/rejected": -3.598574161529541, "eval_loss": 0.5600130558013916, "eval_rewards/accuracies": 0.7292284965515137, "eval_rewards/chosen": -2.800790786743164, "eval_rewards/margins": 0.7977828979492188, "eval_rewards/rejected": -3.598574161529541, "eval_runtime": 49.8522, "eval_samples_per_second": 26.98, "eval_sft_loss": 2.9405603408813477, "eval_steps_per_second": 6.76, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 9.178747966280907, "learning_rate": 7.048185059920193e-07, "logits/chosen": -0.19156606495380402, "logits/rejected": -0.04732166603207588, "logps/chosen": -2.7235498428344727, "logps/rejected": -3.664428234100342, "loss": 0.5177, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7235498428344727, "rewards/margins": 0.9408785700798035, "rewards/rejected": -3.664428234100342, "sft_loss": 2.8657774925231934, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 14.64997920224963, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.2830374240875244, "logits/rejected": -0.08339240401983261, "logps/chosen": -2.740020990371704, "logps/rejected": -3.4012451171875, "loss": 0.5545, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.740020990371704, "rewards/margins": 0.6612240076065063, "rewards/rejected": -3.4012451171875, "sft_loss": 2.873657703399658, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 11.524358523068935, "learning_rate": 7.019730732632681e-07, "logits/chosen": -0.12432912737131119, "logits/rejected": -0.029606735333800316, "logps/chosen": -2.6872119903564453, "logps/rejected": -3.5602688789367676, "loss": 0.5376, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.6872119903564453, "rewards/margins": 0.8730567097663879, "rewards/rejected": -3.5602688789367676, "sft_loss": 2.837043285369873, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 11.142509944407367, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.23667557537555695, "logits/rejected": -0.10876540839672089, "logps/chosen": -2.668792724609375, "logps/rejected": -3.506338119506836, "loss": 0.5018, "rewards/accuracies": 0.75, "rewards/chosen": -2.668792724609375, "rewards/margins": 0.8375449180603027, "rewards/rejected": -3.506338119506836, "sft_loss": 2.8608899116516113, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 10.733047141858474, "learning_rate": 6.991198023789577e-07, "logits/chosen": -0.17369894683361053, "logits/rejected": -0.07994101941585541, "logps/chosen": -2.5164730548858643, "logps/rejected": -3.185642957687378, "loss": 0.5352, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.5164730548858643, "rewards/margins": 0.6691699624061584, "rewards/rejected": -3.185642957687378, "sft_loss": 2.750406503677368, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 15.58574746404078, "learning_rate": 6.976902622196776e-07, "logits/chosen": -0.1767248660326004, "logits/rejected": -0.10301689803600311, "logps/chosen": -2.7023122310638428, "logps/rejected": -3.4687092304229736, "loss": 0.554, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7023122310638428, "rewards/margins": 0.7663971185684204, "rewards/rejected": -3.4687092304229736, "sft_loss": 2.8399150371551514, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 11.42669165967883, "learning_rate": 6.962588040686064e-07, "logits/chosen": -0.15778064727783203, "logits/rejected": -0.0029870569705963135, "logps/chosen": -2.656895160675049, "logps/rejected": -3.3034110069274902, "loss": 0.5897, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.656895160675049, "rewards/margins": 0.6465158462524414, "rewards/rejected": -3.3034110069274902, "sft_loss": 2.8100223541259766, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 14.465073990971277, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.21933957934379578, "logits/rejected": -0.08447906374931335, "logps/chosen": -2.6796069145202637, "logps/rejected": -3.46742582321167, "loss": 0.5622, "rewards/accuracies": 0.71875, "rewards/chosen": -2.6796069145202637, "rewards/margins": 0.7878190875053406, "rewards/rejected": -3.46742582321167, "sft_loss": 2.7700302600860596, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 19.255952750159327, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.2369045466184616, "logits/rejected": -0.08217814564704895, "logps/chosen": -2.7740702629089355, "logps/rejected": -3.428654909133911, "loss": 0.5859, "rewards/accuracies": 0.6875, "rewards/chosen": -2.7740702629089355, "rewards/margins": 0.654584527015686, "rewards/rejected": -3.428654909133911, "sft_loss": 2.906905174255371, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 16.646039882943093, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.16528551280498505, "logits/rejected": -0.008354656398296356, "logps/chosen": -2.638530969619751, "logps/rejected": -3.4399943351745605, "loss": 0.5305, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.638530969619751, "rewards/margins": 0.8014636039733887, "rewards/rejected": -3.4399943351745605, "sft_loss": 2.756185293197632, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 11.067511070480855, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.24355682730674744, "logits/rejected": 0.05653177946805954, "logps/chosen": -2.8080837726593018, "logps/rejected": -3.5968105792999268, "loss": 0.5213, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8080837726593018, "rewards/margins": 0.7887266874313354, "rewards/rejected": -3.5968105792999268, "sft_loss": 2.9028820991516113, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 18.68045619931817, "learning_rate": 6.890732301605904e-07, "logits/chosen": -0.16562731564044952, "logits/rejected": -0.04927445575594902, "logps/chosen": -2.7073352336883545, "logps/rejected": -3.3803048133850098, "loss": 0.5804, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.7073352336883545, "rewards/margins": 0.6729689836502075, "rewards/rejected": -3.3803048133850098, "sft_loss": 2.819136381149292, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 11.705893401506131, "learning_rate": 6.876305563489021e-07, "logits/chosen": -0.2004169523715973, "logits/rejected": -0.08777768909931183, "logps/chosen": -2.8763434886932373, "logps/rejected": -3.8222098350524902, "loss": 0.4966, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8763434886932373, "rewards/margins": 0.945866584777832, "rewards/rejected": -3.8222098350524902, "sft_loss": 2.9446804523468018, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 18.213731947297997, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.3117894232273102, "logits/rejected": -0.1837528645992279, "logps/chosen": -2.9130516052246094, "logps/rejected": -3.5416507720947266, "loss": 0.6002, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9130516052246094, "rewards/margins": 0.6285988092422485, "rewards/rejected": -3.5416507720947266, "sft_loss": 3.135199785232544, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 15.827545785560098, "learning_rate": 6.847397615625725e-07, "logits/chosen": -0.16902324557304382, "logits/rejected": -0.10092641413211823, "logps/chosen": -2.875767469406128, "logps/rejected": -3.5667643547058105, "loss": 0.558, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.875767469406128, "rewards/margins": 0.6909972429275513, "rewards/rejected": -3.5667643547058105, "sft_loss": 3.042696237564087, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 10.677099643746306, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.30803683400154114, "logits/rejected": -0.10283231735229492, "logps/chosen": -2.822262763977051, "logps/rejected": -3.7773125171661377, "loss": 0.4987, "rewards/accuracies": 0.78125, "rewards/chosen": -2.822262763977051, "rewards/margins": 0.9550496339797974, "rewards/rejected": -3.7773125171661377, "sft_loss": 3.107553005218506, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 13.727165695334431, "learning_rate": 6.818417974097246e-07, "logits/chosen": -0.09935744106769562, "logits/rejected": 0.07015521824359894, "logps/chosen": -2.6926915645599365, "logps/rejected": -3.6834826469421387, "loss": 0.5068, "rewards/accuracies": 0.75, "rewards/chosen": -2.6926915645599365, "rewards/margins": 0.9907909631729126, "rewards/rejected": -3.6834826469421387, "sft_loss": 2.978348731994629, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 12.6517428480068, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.22434866428375244, "logits/rejected": -0.1461108922958374, "logps/chosen": -2.745539665222168, "logps/rejected": -3.575504779815674, "loss": 0.5058, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.745539665222168, "rewards/margins": 0.8299651145935059, "rewards/rejected": -3.575504779815674, "sft_loss": 2.93871808052063, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 14.644970152963, "learning_rate": 6.789367763543292e-07, "logits/chosen": -0.1396378129720688, "logits/rejected": -0.10330124944448471, "logps/chosen": -2.7004261016845703, "logps/rejected": -3.378009796142578, "loss": 0.5875, "rewards/accuracies": 0.75, "rewards/chosen": -2.7004261016845703, "rewards/margins": 0.6775835752487183, "rewards/rejected": -3.378009796142578, "sft_loss": 2.8961141109466553, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 12.36729496599677, "learning_rate": 6.774816547081714e-07, "logits/chosen": -0.14708910882472992, "logits/rejected": 0.03477005660533905, "logps/chosen": -2.6921839714050293, "logps/rejected": -3.314579725265503, "loss": 0.5643, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.6921839714050293, "rewards/margins": 0.6223957538604736, "rewards/rejected": -3.314579725265503, "sft_loss": 2.88639235496521, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 12.37532525890674, "learning_rate": 6.760248111342211e-07, "logits/chosen": -0.156544491648674, "logits/rejected": 0.026922767981886864, "logps/chosen": -2.521768808364868, "logps/rejected": -3.3125717639923096, "loss": 0.5289, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.521768808364868, "rewards/margins": 0.7908032536506653, "rewards/rejected": -3.3125717639923096, "sft_loss": 2.6430845260620117, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 14.180483053003758, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.20356695353984833, "logits/rejected": -0.047584448009729385, "logps/chosen": -2.5231261253356934, "logps/rejected": -3.316211700439453, "loss": 0.5097, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5231261253356934, "rewards/margins": 0.7930856347084045, "rewards/rejected": -3.316211700439453, "sft_loss": 2.685520648956299, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 17.337359619768694, "learning_rate": 6.731060147567236e-07, "logits/chosen": -0.10451420396566391, "logits/rejected": 0.00366055965423584, "logps/chosen": -2.52323842048645, "logps/rejected": -3.3734488487243652, "loss": 0.5063, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.52323842048645, "rewards/margins": 0.8502107858657837, "rewards/rejected": -3.3734488487243652, "sft_loss": 2.73701810836792, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 14.455186650463212, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.22164861857891083, "logits/rejected": -0.136245459318161, "logps/chosen": -2.6677823066711426, "logps/rejected": -3.357440233230591, "loss": 0.5296, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.6677823066711426, "rewards/margins": 0.6896578073501587, "rewards/rejected": -3.357440233230591, "sft_loss": 2.6678340435028076, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 17.826150311848068, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.21637877821922302, "logits/rejected": -0.12764832377433777, "logps/chosen": -2.770840883255005, "logps/rejected": -3.4959206581115723, "loss": 0.5591, "rewards/accuracies": 0.71875, "rewards/chosen": -2.770840883255005, "rewards/margins": 0.7250800132751465, "rewards/rejected": -3.4959206581115723, "sft_loss": 2.990640163421631, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 23.893730691241025, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.18899592757225037, "logits/rejected": -0.11210862547159195, "logps/chosen": -2.7489566802978516, "logps/rejected": -3.3823254108428955, "loss": 0.6187, "rewards/accuracies": 0.6875, "rewards/chosen": -2.7489566802978516, "rewards/margins": 0.6333683133125305, "rewards/rejected": -3.3823254108428955, "sft_loss": 2.92044734954834, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 13.561641964424144, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.2757723331451416, "logits/rejected": -0.11094491183757782, "logps/chosen": -2.680462598800659, "logps/rejected": -3.4301047325134277, "loss": 0.5435, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.680462598800659, "rewards/margins": 0.7496423721313477, "rewards/rejected": -3.4301047325134277, "sft_loss": 2.8209385871887207, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 15.414372943397188, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.20649929344654083, "logits/rejected": -0.016391444951295853, "logps/chosen": -2.79091215133667, "logps/rejected": -3.4462196826934814, "loss": 0.5771, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.79091215133667, "rewards/margins": 0.6553074717521667, "rewards/rejected": -3.4462196826934814, "sft_loss": 2.9573159217834473, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 13.30450273991112, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.20889747142791748, "logits/rejected": 0.004860124550759792, "logps/chosen": -2.6875839233398438, "logps/rejected": -3.52405047416687, "loss": 0.5065, "rewards/accuracies": 0.75, "rewards/chosen": -2.6875839233398438, "rewards/margins": 0.8364666700363159, "rewards/rejected": -3.52405047416687, "sft_loss": 2.816736936569214, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 13.620162752755839, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.1965445727109909, "logits/rejected": -0.016790464520454407, "logps/chosen": -2.808445453643799, "logps/rejected": -3.509186267852783, "loss": 0.5488, "rewards/accuracies": 0.75, "rewards/chosen": -2.808445453643799, "rewards/margins": 0.7007406949996948, "rewards/rejected": -3.509186267852783, "sft_loss": 2.945175886154175, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 15.913301539563097, "learning_rate": 6.61364787005525e-07, "logits/chosen": -0.14413128793239594, "logits/rejected": -0.00657269824296236, "logps/chosen": -2.6809163093566895, "logps/rejected": -3.610265016555786, "loss": 0.5396, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.6809163093566895, "rewards/margins": 0.9293490648269653, "rewards/rejected": -3.610265016555786, "sft_loss": 2.8856360912323, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 18.466727374987098, "learning_rate": 6.598899386669395e-07, "logits/chosen": -0.1730526238679886, "logits/rejected": -0.04170035570859909, "logps/chosen": -2.8152554035186768, "logps/rejected": -3.526615619659424, "loss": 0.5729, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.8152554035186768, "rewards/margins": 0.7113600969314575, "rewards/rejected": -3.526615619659424, "sft_loss": 2.9285664558410645, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 33.35905874085635, "learning_rate": 6.584135390754679e-07, "logits/chosen": -0.15972650051116943, "logits/rejected": -0.0013029143447056413, "logps/chosen": -2.736224412918091, "logps/rejected": -3.581981658935547, "loss": 0.5385, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.736224412918091, "rewards/margins": 0.8457571864128113, "rewards/rejected": -3.581981658935547, "sft_loss": 2.8889448642730713, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 10.512825087959222, "learning_rate": 6.569356025551454e-07, "logits/chosen": -0.133263498544693, "logits/rejected": -0.04440528526902199, "logps/chosen": -2.628513813018799, "logps/rejected": -3.4317822456359863, "loss": 0.5477, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.628513813018799, "rewards/margins": 0.803268313407898, "rewards/rejected": -3.4317822456359863, "sft_loss": 2.7247188091278076, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 12.64341290727834, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.23653188347816467, "logits/rejected": -0.06919825822114944, "logps/chosen": -2.5829663276672363, "logps/rejected": -3.34773325920105, "loss": 0.5467, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.5829663276672363, "rewards/margins": 0.7647669911384583, "rewards/rejected": -3.34773325920105, "sft_loss": 2.7210841178894043, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 17.65401632433239, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.17679783701896667, "logits/rejected": -0.06925829499959946, "logps/chosen": -2.688565492630005, "logps/rejected": -3.24664568901062, "loss": 0.5949, "rewards/accuracies": 0.6875, "rewards/chosen": -2.688565492630005, "rewards/margins": 0.55808025598526, "rewards/rejected": -3.24664568901062, "sft_loss": 2.872016191482544, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 11.379800330719814, "learning_rate": 6.524927148842602e-07, "logits/chosen": -0.08219718188047409, "logits/rejected": 0.09790127724409103, "logps/chosen": -2.4927101135253906, "logps/rejected": -3.3262767791748047, "loss": 0.4998, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.4927101135253906, "rewards/margins": 0.8335663676261902, "rewards/rejected": -3.3262767791748047, "sft_loss": 2.616259813308716, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 17.147163336849538, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.17891012132167816, "logits/rejected": -0.036122821271419525, "logps/chosen": -2.4723801612854004, "logps/rejected": -3.185424327850342, "loss": 0.5609, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.4723801612854004, "rewards/margins": 0.7130442261695862, "rewards/rejected": -3.185424327850342, "sft_loss": 2.7231812477111816, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 16.501352248032468, "learning_rate": 6.495233683980012e-07, "logits/chosen": -0.16941580176353455, "logits/rejected": -0.11211607605218887, "logps/chosen": -2.6303224563598633, "logps/rejected": -3.272925853729248, "loss": 0.5667, "rewards/accuracies": 0.71875, "rewards/chosen": -2.6303224563598633, "rewards/margins": 0.6426035165786743, "rewards/rejected": -3.272925853729248, "sft_loss": 2.7140302658081055, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 17.074771053628144, "learning_rate": 6.480365119346011e-07, "logits/chosen": -0.07847137004137039, "logits/rejected": 0.0759764313697815, "logps/chosen": -2.61045503616333, "logps/rejected": -3.3174538612365723, "loss": 0.5387, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.61045503616333, "rewards/margins": 0.7069988250732422, "rewards/rejected": -3.3174538612365723, "sft_loss": 2.783203125, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 13.390652917894553, "learning_rate": 6.465482192203129e-07, "logits/chosen": -0.06304212659597397, "logits/rejected": 0.017494995146989822, "logps/chosen": -2.6455399990081787, "logps/rejected": -3.3666281700134277, "loss": 0.5324, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.6455399990081787, "rewards/margins": 0.721088171005249, "rewards/rejected": -3.3666281700134277, "sft_loss": 2.9030449390411377, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 23.949889372898472, "learning_rate": 6.45058504694559e-07, "logits/chosen": -0.0719418078660965, "logits/rejected": -0.009783153422176838, "logps/chosen": -2.728297472000122, "logps/rejected": -3.488001585006714, "loss": 0.5536, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.728297472000122, "rewards/margins": 0.7597039341926575, "rewards/rejected": -3.488001585006714, "sft_loss": 2.902074098587036, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 18.518272311030678, "learning_rate": 6.435673828105564e-07, "logits/chosen": -0.17964942753314972, "logits/rejected": -0.025340866297483444, "logps/chosen": -2.68434739112854, "logps/rejected": -3.5610384941101074, "loss": 0.5317, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.68434739112854, "rewards/margins": 0.8766916394233704, "rewards/rejected": -3.5610384941101074, "sft_loss": 2.9283835887908936, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 14.34158145926517, "learning_rate": 6.420748680351763e-07, "logits/chosen": -0.12653285264968872, "logits/rejected": -0.15354570746421814, "logps/chosen": -2.7683181762695312, "logps/rejected": -3.3777289390563965, "loss": 0.5983, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.7683181762695312, "rewards/margins": 0.6094110012054443, "rewards/rejected": -3.3777289390563965, "sft_loss": 3.0164825916290283, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 20.30195335913346, "learning_rate": 6.405809748488032e-07, "logits/chosen": -0.18265673518180847, "logits/rejected": -0.015449044294655323, "logps/chosen": -2.7602028846740723, "logps/rejected": -3.659242630004883, "loss": 0.5537, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7602028846740723, "rewards/margins": 0.8990398645401001, "rewards/rejected": -3.659242630004883, "sft_loss": 2.8773014545440674, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 13.652805417608606, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.27530530095100403, "logits/rejected": -0.07264344394207001, "logps/chosen": -2.845526933670044, "logps/rejected": -3.6497280597686768, "loss": 0.5391, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.845526933670044, "rewards/margins": 0.8042010068893433, "rewards/rejected": -3.6497280597686768, "sft_loss": 2.9927849769592285, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 15.842694719104523, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.2530178427696228, "logits/rejected": -0.13777250051498413, "logps/chosen": -2.893038272857666, "logps/rejected": -3.697848081588745, "loss": 0.5376, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.893038272857666, "rewards/margins": 0.8048097491264343, "rewards/rejected": -3.697848081588745, "sft_loss": 3.057915210723877, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 14.544232181041682, "learning_rate": 6.360911698273326e-07, "logits/chosen": -0.19055934250354767, "logits/rejected": -0.0527595654129982, "logps/chosen": -2.9984183311462402, "logps/rejected": -3.722759962081909, "loss": 0.5649, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9984183311462402, "rewards/margins": 0.7243413329124451, "rewards/rejected": -3.722759962081909, "sft_loss": 3.1255040168762207, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 16.115021801415836, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.18890123069286346, "logits/rejected": -0.09562461078166962, "logps/chosen": -2.700990676879883, "logps/rejected": -3.611769199371338, "loss": 0.5006, "rewards/accuracies": 0.8125, "rewards/chosen": -2.700990676879883, "rewards/margins": 0.9107785224914551, "rewards/rejected": -3.611769199371338, "sft_loss": 2.8132376670837402, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 14.469293489368207, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.2293945848941803, "logits/rejected": -0.07062532007694244, "logps/chosen": -2.8479835987091064, "logps/rejected": -3.846541166305542, "loss": 0.5125, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8479835987091064, "rewards/margins": 0.9985576868057251, "rewards/rejected": -3.846541166305542, "sft_loss": 2.973374605178833, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 17.29314917200767, "learning_rate": 6.315894816685838e-07, "logits/chosen": -0.18610408902168274, "logits/rejected": -0.009005597792565823, "logps/chosen": -2.818213939666748, "logps/rejected": -3.674471378326416, "loss": 0.4958, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.818213939666748, "rewards/margins": 0.856257438659668, "rewards/rejected": -3.674471378326416, "sft_loss": 3.050729274749756, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 15.722883885277431, "learning_rate": 6.300863461616657e-07, "logits/chosen": -0.14516502618789673, "logits/rejected": -0.044414542615413666, "logps/chosen": -2.8328747749328613, "logps/rejected": -3.519896984100342, "loss": 0.6077, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8328747749328613, "rewards/margins": 0.6870219111442566, "rewards/rejected": -3.519896984100342, "sft_loss": 3.001101493835449, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 12.264431130751412, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.2737159729003906, "logits/rejected": -0.10480846464633942, "logps/chosen": -2.8148722648620605, "logps/rejected": -3.628904342651367, "loss": 0.5078, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.8148722648620605, "rewards/margins": 0.8140321969985962, "rewards/rejected": -3.628904342651367, "sft_loss": 2.98403000831604, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 16.23830385932747, "learning_rate": 6.270763034485986e-07, "logits/chosen": -0.11874481290578842, "logits/rejected": 2.7514994144439697e-05, "logps/chosen": -2.9035863876342773, "logps/rejected": -3.6935629844665527, "loss": 0.5183, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.9035863876342773, "rewards/margins": 0.7899765968322754, "rewards/rejected": -3.6935629844665527, "sft_loss": 2.9855895042419434, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 20.915588392873623, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.17837285995483398, "logits/rejected": -0.0020327470265328884, "logps/chosen": -2.8230979442596436, "logps/rejected": -3.5790398120880127, "loss": 0.5848, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.8230979442596436, "rewards/margins": 0.7559415102005005, "rewards/rejected": -3.5790398120880127, "sft_loss": 2.8579752445220947, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 21.16032431937469, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.21279004216194153, "logits/rejected": -0.012899084016680717, "logps/chosen": -2.6492888927459717, "logps/rejected": -3.3431143760681152, "loss": 0.5916, "rewards/accuracies": 0.71875, "rewards/chosen": -2.6492888927459717, "rewards/margins": 0.693824827671051, "rewards/rejected": -3.3431143760681152, "sft_loss": 2.761671543121338, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 12.487601525148522, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.2572769522666931, "logits/rejected": -0.003753144294023514, "logps/chosen": -2.539226770401001, "logps/rejected": -3.389569044113159, "loss": 0.4963, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.539226770401001, "rewards/margins": 0.8503425717353821, "rewards/rejected": -3.389569044113159, "sft_loss": 2.672743320465088, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 22.048819759200136, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.3438642919063568, "logits/rejected": -0.05838317796587944, "logps/chosen": -2.6405534744262695, "logps/rejected": -3.430321216583252, "loss": 0.5653, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.6405534744262695, "rewards/margins": 0.7897677421569824, "rewards/rejected": -3.430321216583252, "sft_loss": 2.7079286575317383, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 17.818430875235002, "learning_rate": 6.195298770577415e-07, "logits/chosen": -0.1408831775188446, "logits/rejected": -0.12864689528942108, "logps/chosen": -2.626361131668091, "logps/rejected": -3.380760908126831, "loss": 0.5562, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.626361131668091, "rewards/margins": 0.7543995380401611, "rewards/rejected": -3.380760908126831, "sft_loss": 2.6994235515594482, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 11.370843110523085, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.23384025692939758, "logits/rejected": -0.03425057977437973, "logps/chosen": -2.668400526046753, "logps/rejected": -3.441761016845703, "loss": 0.5463, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.668400526046753, "rewards/margins": 0.7733603715896606, "rewards/rejected": -3.441761016845703, "sft_loss": 2.858882427215576, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 11.361594017102297, "learning_rate": 6.165030861624663e-07, "logits/chosen": -0.3020915389060974, "logits/rejected": -0.04475082457065582, "logps/chosen": -2.5850963592529297, "logps/rejected": -3.5558242797851562, "loss": 0.4789, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.5850963592529297, "rewards/margins": 0.9707280993461609, "rewards/rejected": -3.5558242797851562, "sft_loss": 2.647453784942627, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 16.554445963777106, "learning_rate": 6.149879879003876e-07, "logits/chosen": -0.14629390835762024, "logits/rejected": -0.12839782238006592, "logps/chosen": -2.6853654384613037, "logps/rejected": -3.4737114906311035, "loss": 0.5241, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.6853654384613037, "rewards/margins": 0.7883461117744446, "rewards/rejected": -3.4737114906311035, "sft_loss": 2.7871718406677246, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 10.852800163434761, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.3211224675178528, "logits/rejected": -0.1885753720998764, "logps/chosen": -2.6056792736053467, "logps/rejected": -3.3587448596954346, "loss": 0.5246, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.6056792736053467, "rewards/margins": 0.753065288066864, "rewards/rejected": -3.3587448596954346, "sft_loss": 2.79439115524292, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 13.470218832823608, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.26647713780403137, "logits/rejected": -0.12850052118301392, "logps/chosen": -2.6721959114074707, "logps/rejected": -3.3363571166992188, "loss": 0.5514, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.6721959114074707, "rewards/margins": 0.6641608476638794, "rewards/rejected": -3.3363571166992188, "sft_loss": 2.7512269020080566, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 16.435840602476095, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.21397753059864044, "logits/rejected": -0.10246507823467255, "logps/chosen": -2.6906704902648926, "logps/rejected": -3.434344530105591, "loss": 0.5419, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.6906704902648926, "rewards/margins": 0.7436736226081848, "rewards/rejected": -3.434344530105591, "sft_loss": 2.8569579124450684, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 18.18937708660994, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.21393327414989471, "logits/rejected": -0.010212997905910015, "logps/chosen": -2.658050537109375, "logps/rejected": -3.51568603515625, "loss": 0.543, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.658050537109375, "rewards/margins": 0.857635498046875, "rewards/rejected": -3.51568603515625, "sft_loss": 2.8359665870666504, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 10.561874113057621, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.21657970547676086, "logits/rejected": 0.0002588302013464272, "logps/chosen": -2.510268211364746, "logps/rejected": -3.424483060836792, "loss": 0.481, "rewards/accuracies": 0.78125, "rewards/chosen": -2.510268211364746, "rewards/margins": 0.9142149686813354, "rewards/rejected": -3.424483060836792, "sft_loss": 2.7040927410125732, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 12.939507861708492, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.24379563331604004, "logits/rejected": -0.11864738166332245, "logps/chosen": -2.693303346633911, "logps/rejected": -3.5887484550476074, "loss": 0.5047, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.693303346633911, "rewards/margins": 0.8954454660415649, "rewards/rejected": -3.5887484550476074, "sft_loss": 2.855567693710327, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 13.038429418395213, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.2585260272026062, "logits/rejected": -0.08726723492145538, "logps/chosen": -2.6370608806610107, "logps/rejected": -3.403134822845459, "loss": 0.5105, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.6370608806610107, "rewards/margins": 0.766074001789093, "rewards/rejected": -3.403134822845459, "sft_loss": 2.7392070293426514, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 18.37818975642997, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.17349588871002197, "logits/rejected": -0.04044419154524803, "logps/chosen": -2.7869231700897217, "logps/rejected": -3.616891860961914, "loss": 0.5208, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.7869231700897217, "rewards/margins": 0.8299688100814819, "rewards/rejected": -3.616891860961914, "sft_loss": 2.87677001953125, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 23.30245997826689, "learning_rate": 6.013036683579798e-07, "logits/chosen": -0.14427100121974945, "logits/rejected": -0.0082294512540102, "logps/chosen": -2.7531418800354004, "logps/rejected": -3.622286319732666, "loss": 0.5112, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.7531418800354004, "rewards/margins": 0.8691444396972656, "rewards/rejected": -3.622286319732666, "sft_loss": 2.935823440551758, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 14.765799932563043, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.2974587082862854, "logits/rejected": -0.08597452938556671, "logps/chosen": -2.8631033897399902, "logps/rejected": -3.757943630218506, "loss": 0.5084, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8631033897399902, "rewards/margins": 0.89484041929245, "rewards/rejected": -3.757943630218506, "sft_loss": 3.0459372997283936, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 18.81329740078329, "learning_rate": 5.982515179388486e-07, "logits/chosen": -0.1754823625087738, "logits/rejected": -0.03205886483192444, "logps/chosen": -2.784930944442749, "logps/rejected": -3.547107696533203, "loss": 0.5417, "rewards/accuracies": 0.75, "rewards/chosen": -2.784930944442749, "rewards/margins": 0.7621761560440063, "rewards/rejected": -3.547107696533203, "sft_loss": 3.011758804321289, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 14.856372988678277, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.2784285545349121, "logits/rejected": -0.19298048317432404, "logps/chosen": -2.7671022415161133, "logps/rejected": -3.4874374866485596, "loss": 0.5639, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7671022415161133, "rewards/margins": 0.7203353643417358, "rewards/rejected": -3.4874374866485596, "sft_loss": 2.97153902053833, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 16.34455536326885, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.22903160750865936, "logits/rejected": -0.1430944949388504, "logps/chosen": -2.869506359100342, "logps/rejected": -3.7298552989959717, "loss": 0.5389, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.869506359100342, "rewards/margins": 0.8603488802909851, "rewards/rejected": -3.7298552989959717, "sft_loss": 3.0238983631134033, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 12.018862895462558, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.1803596168756485, "logits/rejected": -0.058324117213487625, "logps/chosen": -2.9235939979553223, "logps/rejected": -3.5747509002685547, "loss": 0.6078, "rewards/accuracies": 0.65625, "rewards/chosen": -2.9235939979553223, "rewards/margins": 0.6511572599411011, "rewards/rejected": -3.5747509002685547, "sft_loss": 2.9814629554748535, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 12.871878981446248, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.25617536902427673, "logits/rejected": -0.0906013622879982, "logps/chosen": -2.8694026470184326, "logps/rejected": -3.690614700317383, "loss": 0.5259, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8694026470184326, "rewards/margins": 0.8212119936943054, "rewards/rejected": -3.690614700317383, "sft_loss": 2.922090530395508, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 17.058939346776455, "learning_rate": 5.906047197571541e-07, "logits/chosen": -0.2006787359714508, "logits/rejected": -0.21372473239898682, "logps/chosen": -2.7825400829315186, "logps/rejected": -3.514707565307617, "loss": 0.5655, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7825400829315186, "rewards/margins": 0.7321674823760986, "rewards/rejected": -3.514707565307617, "sft_loss": 3.0377678871154785, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 12.339324602189375, "learning_rate": 5.890726635828919e-07, "logits/chosen": -0.11219914257526398, "logits/rejected": -0.10450281947851181, "logps/chosen": -2.5796525478363037, "logps/rejected": -3.4286201000213623, "loss": 0.5323, "rewards/accuracies": 0.71875, "rewards/chosen": -2.5796525478363037, "rewards/margins": 0.8489675521850586, "rewards/rejected": -3.4286201000213623, "sft_loss": 2.6898679733276367, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 17.408938715499623, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.2657962143421173, "logits/rejected": -0.18797865509986877, "logps/chosen": -2.6418039798736572, "logps/rejected": -3.3556416034698486, "loss": 0.5549, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.6418039798736572, "rewards/margins": 0.713837742805481, "rewards/rejected": -3.3556416034698486, "sft_loss": 2.7874643802642822, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.1467505246400833, "eval_logits/rejected": 0.2546423673629761, "eval_logps/chosen": -2.7228782176971436, "eval_logps/rejected": -3.506152868270874, "eval_loss": 0.5573322176933289, "eval_rewards/accuracies": 0.7247774600982666, "eval_rewards/chosen": -2.7228782176971436, "eval_rewards/margins": 0.7832746505737305, "eval_rewards/rejected": -3.506152868270874, "eval_runtime": 49.9154, "eval_samples_per_second": 26.946, "eval_sft_loss": 2.8691623210906982, "eval_steps_per_second": 6.751, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 10.399875082771421, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.34869104623794556, "logits/rejected": -0.167296901345253, "logps/chosen": -2.527095079421997, "logps/rejected": -3.321505069732666, "loss": 0.5255, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.527095079421997, "rewards/margins": 0.7944096326828003, "rewards/rejected": -3.321505069732666, "sft_loss": 2.6784870624542236, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 16.63514429865691, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.22122304141521454, "logits/rejected": -0.1313764750957489, "logps/chosen": -2.628300189971924, "logps/rejected": -3.411324977874756, "loss": 0.538, "rewards/accuracies": 0.75, "rewards/chosen": -2.628300189971924, "rewards/margins": 0.7830251455307007, "rewards/rejected": -3.411324977874756, "sft_loss": 2.7826220989227295, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 10.66724424739637, "learning_rate": 5.829359458171714e-07, "logits/chosen": -0.16654136776924133, "logits/rejected": -0.042657412588596344, "logps/chosen": -2.652475357055664, "logps/rejected": -3.6194043159484863, "loss": 0.4705, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.652475357055664, "rewards/margins": 0.9669289588928223, "rewards/rejected": -3.6194043159484863, "sft_loss": 2.7372677326202393, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 14.064916607818864, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.24533767998218536, "logits/rejected": -0.014905953779816628, "logps/chosen": -2.8403804302215576, "logps/rejected": -3.5930869579315186, "loss": 0.5703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8403804302215576, "rewards/margins": 0.7527070045471191, "rewards/rejected": -3.5930869579315186, "sft_loss": 2.991934061050415, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 14.350919762086672, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.30641651153564453, "logits/rejected": -0.09210020303726196, "logps/chosen": -2.866778612136841, "logps/rejected": -3.7245547771453857, "loss": 0.5238, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.866778612136841, "rewards/margins": 0.8577759861946106, "rewards/rejected": -3.7245547771453857, "sft_loss": 2.979400634765625, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 13.547482917019332, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.24482688307762146, "logits/rejected": -0.10179238021373749, "logps/chosen": -2.7300779819488525, "logps/rejected": -3.591665744781494, "loss": 0.5218, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7300779819488525, "rewards/margins": 0.8615878820419312, "rewards/rejected": -3.591665744781494, "sft_loss": 2.8641223907470703, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 29.997519760061838, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.25270360708236694, "logits/rejected": -0.08831027895212173, "logps/chosen": -2.880031108856201, "logps/rejected": -3.8612148761749268, "loss": 0.5023, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.880031108856201, "rewards/margins": 0.9811837077140808, "rewards/rejected": -3.8612148761749268, "sft_loss": 3.011920213699341, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 18.971073746288894, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.2417343109846115, "logits/rejected": -0.15943384170532227, "logps/chosen": -2.898433208465576, "logps/rejected": -3.618417263031006, "loss": 0.5858, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.898433208465576, "rewards/margins": 0.7199840545654297, "rewards/rejected": -3.618417263031006, "sft_loss": 3.053682804107666, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 15.931686737504796, "learning_rate": 5.737070283532444e-07, "logits/chosen": -0.19866490364074707, "logits/rejected": -0.10472752153873444, "logps/chosen": -2.9091525077819824, "logps/rejected": -3.641115665435791, "loss": 0.6296, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.9091525077819824, "rewards/margins": 0.7319625616073608, "rewards/rejected": -3.641115665435791, "sft_loss": 2.9120960235595703, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 16.461024108369852, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.25154000520706177, "logits/rejected": -0.14929892122745514, "logps/chosen": -2.7859530448913574, "logps/rejected": -3.70951509475708, "loss": 0.4924, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7859530448913574, "rewards/margins": 0.9235623478889465, "rewards/rejected": -3.70951509475708, "sft_loss": 2.9241738319396973, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 12.408242747971943, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.2801755368709564, "logits/rejected": -0.0861397385597229, "logps/chosen": -2.93034029006958, "logps/rejected": -3.7381014823913574, "loss": 0.546, "rewards/accuracies": 0.71875, "rewards/chosen": -2.93034029006958, "rewards/margins": 0.8077613711357117, "rewards/rejected": -3.7381014823913574, "sft_loss": 3.0639588832855225, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 15.501732706021073, "learning_rate": 5.690827141431699e-07, "logits/chosen": -0.324562132358551, "logits/rejected": -0.11474557965993881, "logps/chosen": -2.8127694129943848, "logps/rejected": -3.550741195678711, "loss": 0.533, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.8127694129943848, "rewards/margins": 0.7379716634750366, "rewards/rejected": -3.550741195678711, "sft_loss": 2.8940277099609375, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 19.540289281178666, "learning_rate": 5.675399156447897e-07, "logits/chosen": -0.35332703590393066, "logits/rejected": -0.21106410026550293, "logps/chosen": -2.8680214881896973, "logps/rejected": -3.487658739089966, "loss": 0.5897, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.8680214881896973, "rewards/margins": 0.6196374893188477, "rewards/rejected": -3.487658739089966, "sft_loss": 3.0192437171936035, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 17.32744962552554, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.2802363634109497, "logits/rejected": -0.15609270334243774, "logps/chosen": -2.7928478717803955, "logps/rejected": -3.4934096336364746, "loss": 0.5619, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.7928478717803955, "rewards/margins": 0.7005618810653687, "rewards/rejected": -3.4934096336364746, "sft_loss": 2.8554327487945557, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 16.893818398379278, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.26265794038772583, "logits/rejected": -0.13234956562519073, "logps/chosen": -2.7641029357910156, "logps/rejected": -3.4810073375701904, "loss": 0.544, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7641029357910156, "rewards/margins": 0.7169044017791748, "rewards/rejected": -3.4810073375701904, "sft_loss": 2.81727933883667, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 15.252444812376583, "learning_rate": 5.629076484188952e-07, "logits/chosen": -0.1614963561296463, "logits/rejected": -0.04674742743372917, "logps/chosen": -2.6011624336242676, "logps/rejected": -3.4162681102752686, "loss": 0.5102, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.6011624336242676, "rewards/margins": 0.8151056170463562, "rewards/rejected": -3.4162681102752686, "sft_loss": 2.713378429412842, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 14.330731261415055, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.2792847752571106, "logits/rejected": -0.14449983835220337, "logps/chosen": -2.732344150543213, "logps/rejected": -3.5566773414611816, "loss": 0.5276, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.732344150543213, "rewards/margins": 0.8243331909179688, "rewards/rejected": -3.5566773414611816, "sft_loss": 2.828441619873047, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 12.995898173961194, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.30418476462364197, "logits/rejected": -0.10869389772415161, "logps/chosen": -2.7487494945526123, "logps/rejected": -3.7656631469726562, "loss": 0.4808, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7487494945526123, "rewards/margins": 1.0169135332107544, "rewards/rejected": -3.7656631469726562, "sft_loss": 2.8211586475372314, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 14.851715763505505, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.2987568974494934, "logits/rejected": -0.13216647505760193, "logps/chosen": -2.708491802215576, "logps/rejected": -3.4900200366973877, "loss": 0.5571, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.708491802215576, "rewards/margins": 0.7815281748771667, "rewards/rejected": -3.4900200366973877, "sft_loss": 2.7990097999572754, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 12.612914431649259, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.20559749007225037, "logits/rejected": -0.09073267877101898, "logps/chosen": -2.8077054023742676, "logps/rejected": -3.738248825073242, "loss": 0.4909, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.8077054023742676, "rewards/margins": 0.9305435419082642, "rewards/rejected": -3.738248825073242, "sft_loss": 2.9258337020874023, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 16.242664587469477, "learning_rate": 5.551751964760838e-07, "logits/chosen": -0.15259698033332825, "logits/rejected": -0.13122805953025818, "logps/chosen": -2.800950765609741, "logps/rejected": -3.6922545433044434, "loss": 0.5171, "rewards/accuracies": 0.75, "rewards/chosen": -2.800950765609741, "rewards/margins": 0.8913037180900574, "rewards/rejected": -3.6922545433044434, "sft_loss": 2.9571404457092285, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 18.94326857674209, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.2627522051334381, "logits/rejected": -0.14064963161945343, "logps/chosen": -2.9124884605407715, "logps/rejected": -3.6973648071289062, "loss": 0.5426, "rewards/accuracies": 0.75, "rewards/chosen": -2.9124884605407715, "rewards/margins": 0.7848763465881348, "rewards/rejected": -3.6973648071289062, "sft_loss": 3.141721725463867, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 24.740523928241057, "learning_rate": 5.520783634613667e-07, "logits/chosen": -0.21578022837638855, "logits/rejected": -0.0047931610606610775, "logps/chosen": -2.9035096168518066, "logps/rejected": -3.8408610820770264, "loss": 0.5311, "rewards/accuracies": 0.75, "rewards/chosen": -2.9035096168518066, "rewards/margins": 0.937351405620575, "rewards/rejected": -3.8408610820770264, "sft_loss": 3.172224521636963, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 20.55201507662613, "learning_rate": 5.505291815446082e-07, "logits/chosen": -0.23507125675678253, "logits/rejected": -0.11718054115772247, "logps/chosen": -3.0174198150634766, "logps/rejected": -3.8366706371307373, "loss": 0.5798, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.0174198150634766, "rewards/margins": 0.8192507028579712, "rewards/rejected": -3.8366706371307373, "sft_loss": 3.2097160816192627, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 17.222357571169933, "learning_rate": 5.489795093935089e-07, "logits/chosen": -0.2141776978969574, "logits/rejected": -0.10248160362243652, "logps/chosen": -2.8130953311920166, "logps/rejected": -3.6256022453308105, "loss": 0.5655, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8130953311920166, "rewards/margins": 0.812507152557373, "rewards/rejected": -3.6256022453308105, "sft_loss": 2.977510452270508, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 13.799444985142202, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.3303857743740082, "logits/rejected": -0.1394929587841034, "logps/chosen": -2.7024035453796387, "logps/rejected": -3.8372929096221924, "loss": 0.4763, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7024035453796387, "rewards/margins": 1.1348894834518433, "rewards/rejected": -3.8372929096221924, "sft_loss": 2.956207275390625, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 15.263193791635503, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.28155213594436646, "logits/rejected": -0.156253844499588, "logps/chosen": -2.8950753211975098, "logps/rejected": -3.7396531105041504, "loss": 0.5191, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.8950753211975098, "rewards/margins": 0.8445774912834167, "rewards/rejected": -3.7396531105041504, "sft_loss": 3.0152199268341064, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 14.100321215775855, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.2842947542667389, "logits/rejected": -0.10026898235082626, "logps/chosen": -2.8834152221679688, "logps/rejected": -3.8942084312438965, "loss": 0.5247, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.8834152221679688, "rewards/margins": 1.0107934474945068, "rewards/rejected": -3.8942084312438965, "sft_loss": 3.0497002601623535, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 19.66127515967987, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.23068693280220032, "logits/rejected": -0.0951416864991188, "logps/chosen": -2.798116445541382, "logps/rejected": -3.5930447578430176, "loss": 0.5406, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.798116445541382, "rewards/margins": 0.7949279546737671, "rewards/rejected": -3.5930447578430176, "sft_loss": 2.894116163253784, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 17.20298797419941, "learning_rate": 5.41224321503607e-07, "logits/chosen": -0.23539146780967712, "logits/rejected": 0.025868237018585205, "logps/chosen": -2.7686853408813477, "logps/rejected": -3.7298712730407715, "loss": 0.4766, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.7686853408813477, "rewards/margins": 0.9611856341362, "rewards/rejected": -3.7298712730407715, "sft_loss": 2.9039459228515625, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 18.947778675822796, "learning_rate": 5.396720238361637e-07, "logits/chosen": -0.18216149508953094, "logits/rejected": -0.05739528685808182, "logps/chosen": -2.887604236602783, "logps/rejected": -3.6991076469421387, "loss": 0.551, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.887604236602783, "rewards/margins": 0.811503529548645, "rewards/rejected": -3.6991076469421387, "sft_loss": 3.119185209274292, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 11.464756525255813, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.3169929087162018, "logits/rejected": -0.16190847754478455, "logps/chosen": -2.839843273162842, "logps/rejected": -3.680530548095703, "loss": 0.507, "rewards/accuracies": 0.78125, "rewards/chosen": -2.839843273162842, "rewards/margins": 0.8406869769096375, "rewards/rejected": -3.680530548095703, "sft_loss": 2.9576756954193115, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 12.331848177245327, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.2760445177555084, "logits/rejected": -0.1369338035583496, "logps/chosen": -2.9550209045410156, "logps/rejected": -3.916705369949341, "loss": 0.5183, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.9550209045410156, "rewards/margins": 0.9616841077804565, "rewards/rejected": -3.916705369949341, "sft_loss": 3.1083853244781494, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 24.02246912843936, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.2904479503631592, "logits/rejected": -0.08660642802715302, "logps/chosen": -2.9979000091552734, "logps/rejected": -3.875239849090576, "loss": 0.5522, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9979000091552734, "rewards/margins": 0.8773400187492371, "rewards/rejected": -3.875239849090576, "sft_loss": 3.12526798248291, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 18.30655578201634, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.21777451038360596, "logits/rejected": -0.04189714044332504, "logps/chosen": -2.8661983013153076, "logps/rejected": -3.6684353351593018, "loss": 0.5423, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8661983013153076, "rewards/margins": 0.8022370338439941, "rewards/rejected": -3.6684353351593018, "sft_loss": 3.0100347995758057, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 14.26079978751246, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.288684606552124, "logits/rejected": -0.09366115927696228, "logps/chosen": -2.9219727516174316, "logps/rejected": -3.6889071464538574, "loss": 0.522, "rewards/accuracies": 0.75, "rewards/chosen": -2.9219727516174316, "rewards/margins": 0.7669345140457153, "rewards/rejected": -3.6889071464538574, "sft_loss": 3.0620250701904297, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 18.763679318314267, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.26433873176574707, "logits/rejected": -0.02369770035147667, "logps/chosen": -2.94124174118042, "logps/rejected": -3.9786980152130127, "loss": 0.5133, "rewards/accuracies": 0.75, "rewards/chosen": -2.94124174118042, "rewards/margins": 1.0374559164047241, "rewards/rejected": -3.9786980152130127, "sft_loss": 3.0656442642211914, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 15.811139693762856, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.2913285493850708, "logits/rejected": -0.0641009658575058, "logps/chosen": -2.8629794120788574, "logps/rejected": -3.8095791339874268, "loss": 0.5101, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8629794120788574, "rewards/margins": 0.9465991854667664, "rewards/rejected": -3.8095791339874268, "sft_loss": 3.0161020755767822, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 18.022532012750336, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.2145463228225708, "logits/rejected": -0.06065446883440018, "logps/chosen": -3.002933979034424, "logps/rejected": -4.084083557128906, "loss": 0.4918, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.002933979034424, "rewards/margins": 1.081149697303772, "rewards/rejected": -4.084083557128906, "sft_loss": 3.2060210704803467, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 11.476591812970014, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.27649787068367004, "logits/rejected": -0.06862474977970123, "logps/chosen": -2.9410276412963867, "logps/rejected": -3.6895077228546143, "loss": 0.577, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.9410276412963867, "rewards/margins": 0.748479962348938, "rewards/rejected": -3.6895077228546143, "sft_loss": 3.015018939971924, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 28.570194060864278, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.3490707278251648, "logits/rejected": -0.15127266943454742, "logps/chosen": -2.9497642517089844, "logps/rejected": -3.570955276489258, "loss": 0.6157, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.9497642517089844, "rewards/margins": 0.6211908459663391, "rewards/rejected": -3.570955276489258, "sft_loss": 3.063214063644409, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 22.720628768475557, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.3165649473667145, "logits/rejected": -0.17648476362228394, "logps/chosen": -2.8999311923980713, "logps/rejected": -3.5884041786193848, "loss": 0.6102, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8999311923980713, "rewards/margins": 0.6884732246398926, "rewards/rejected": -3.5884041786193848, "sft_loss": 2.998579740524292, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 18.748980969575, "learning_rate": 5.210187404905735e-07, "logits/chosen": -0.07036960870027542, "logits/rejected": 0.014557006768882275, "logps/chosen": -2.9264798164367676, "logps/rejected": -3.6826775074005127, "loss": 0.5487, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.9264798164367676, "rewards/margins": 0.7561972737312317, "rewards/rejected": -3.6826775074005127, "sft_loss": 3.034297227859497, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 13.749498822383595, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.2745167315006256, "logits/rejected": -0.13641813397407532, "logps/chosen": -2.8453528881073, "logps/rejected": -3.5740790367126465, "loss": 0.5414, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.8453528881073, "rewards/margins": 0.7287262678146362, "rewards/rejected": -3.5740790367126465, "sft_loss": 3.005955219268799, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 14.094554426517202, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.25325021147727966, "logits/rejected": -0.06744090467691422, "logps/chosen": -2.703946590423584, "logps/rejected": -3.4337966442108154, "loss": 0.5563, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.703946590423584, "rewards/margins": 0.7298499345779419, "rewards/rejected": -3.4337966442108154, "sft_loss": 2.8957529067993164, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 14.384649491858822, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.20750777423381805, "logits/rejected": -0.047119565308094025, "logps/chosen": -2.895918369293213, "logps/rejected": -3.666656494140625, "loss": 0.5495, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.895918369293213, "rewards/margins": 0.7707374691963196, "rewards/rejected": -3.666656494140625, "sft_loss": 3.003587007522583, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 15.088461428091204, "learning_rate": 5.147931662540144e-07, "logits/chosen": -0.14871835708618164, "logits/rejected": 0.01115778274834156, "logps/chosen": -2.7989089488983154, "logps/rejected": -3.4743430614471436, "loss": 0.5458, "rewards/accuracies": 0.75, "rewards/chosen": -2.7989089488983154, "rewards/margins": 0.6754340529441833, "rewards/rejected": -3.4743430614471436, "sft_loss": 2.9447684288024902, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 12.180618978442498, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.23028519749641418, "logits/rejected": -0.1455889195203781, "logps/chosen": -2.748809337615967, "logps/rejected": -3.695582628250122, "loss": 0.5166, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.748809337615967, "rewards/margins": 0.9467732310295105, "rewards/rejected": -3.695582628250122, "sft_loss": 2.9013164043426514, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 26.879565279570176, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.22017955780029297, "logits/rejected": -0.1012934222817421, "logps/chosen": -2.6123456954956055, "logps/rejected": -3.3041110038757324, "loss": 0.5678, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.6123456954956055, "rewards/margins": 0.6917653679847717, "rewards/rejected": -3.3041110038757324, "sft_loss": 2.7556638717651367, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 11.349719271048372, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.22333073616027832, "logits/rejected": -0.06680073589086533, "logps/chosen": -2.8216347694396973, "logps/rejected": -3.7416367530822754, "loss": 0.4819, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8216347694396973, "rewards/margins": 0.9200018644332886, "rewards/rejected": -3.7416367530822754, "sft_loss": 3.018942356109619, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 13.77330629661692, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.285283625125885, "logits/rejected": -0.11413697898387909, "logps/chosen": -2.9083285331726074, "logps/rejected": -3.6721458435058594, "loss": 0.5478, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9083285331726074, "rewards/margins": 0.763817548751831, "rewards/rejected": -3.6721458435058594, "sft_loss": 3.0378081798553467, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 13.26939237550019, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.2616371214389801, "logits/rejected": -0.05170721560716629, "logps/chosen": -2.853921413421631, "logps/rejected": -3.6635985374450684, "loss": 0.5813, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.853921413421631, "rewards/margins": 0.809677243232727, "rewards/rejected": -3.6635985374450684, "sft_loss": 3.0633363723754883, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 14.822875458029475, "learning_rate": 5.0545080135113e-07, "logits/chosen": -0.19637763500213623, "logits/rejected": -0.10091104358434677, "logps/chosen": -2.7999989986419678, "logps/rejected": -3.6137301921844482, "loss": 0.5582, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7999989986419678, "rewards/margins": 0.8137310743331909, "rewards/rejected": -3.6137301921844482, "sft_loss": 2.9661269187927246, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 19.196992236377532, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.2853774428367615, "logits/rejected": -0.16362276673316956, "logps/chosen": -2.8738396167755127, "logps/rejected": -3.68664813041687, "loss": 0.564, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8738396167755127, "rewards/margins": 0.812808632850647, "rewards/rejected": -3.68664813041687, "sft_loss": 3.035071611404419, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 13.120713832451536, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.30218741297721863, "logits/rejected": -0.2097523957490921, "logps/chosen": -2.6479439735412598, "logps/rejected": -3.4495468139648438, "loss": 0.4967, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.6479439735412598, "rewards/margins": 0.8016031980514526, "rewards/rejected": -3.4495468139648438, "sft_loss": 2.778782367706299, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 13.395548570121615, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.3696254789829254, "logits/rejected": -0.17545874416828156, "logps/chosen": -2.555445432662964, "logps/rejected": -3.4399514198303223, "loss": 0.4687, "rewards/accuracies": 0.84375, "rewards/chosen": -2.555445432662964, "rewards/margins": 0.8845059275627136, "rewards/rejected": -3.4399514198303223, "sft_loss": 2.7573673725128174, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 16.649977896866492, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.17495186626911163, "logits/rejected": -0.14151157438755035, "logps/chosen": -2.7216289043426514, "logps/rejected": -3.3717918395996094, "loss": 0.5736, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.7216289043426514, "rewards/margins": 0.6501628160476685, "rewards/rejected": -3.3717918395996094, "sft_loss": 2.8681225776672363, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 16.772112988286686, "learning_rate": 4.976639045035036e-07, "logits/chosen": -0.18401309847831726, "logits/rejected": -0.10007508099079132, "logps/chosen": -2.6721231937408447, "logps/rejected": -3.324143648147583, "loss": 0.6245, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.6721231937408447, "rewards/margins": 0.6520205140113831, "rewards/rejected": -3.324143648147583, "sft_loss": 2.845592737197876, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 15.422309003619537, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.2133098542690277, "logits/rejected": -0.05000300332903862, "logps/chosen": -2.772364377975464, "logps/rejected": -3.53290057182312, "loss": 0.5361, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.772364377975464, "rewards/margins": 0.760536253452301, "rewards/rejected": -3.53290057182312, "sft_loss": 2.9012913703918457, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 15.57194234622625, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.3201986253261566, "logits/rejected": -0.17920561134815216, "logps/chosen": -2.6756997108459473, "logps/rejected": -3.4864468574523926, "loss": 0.5285, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.6756997108459473, "rewards/margins": 0.8107470273971558, "rewards/rejected": -3.4864468574523926, "sft_loss": 2.9155473709106445, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 18.956343721755545, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.31728580594062805, "logits/rejected": -0.12212977558374405, "logps/chosen": -2.669448137283325, "logps/rejected": -3.3638412952423096, "loss": 0.5545, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.669448137283325, "rewards/margins": 0.6943932771682739, "rewards/rejected": -3.3638412952423096, "sft_loss": 2.872934341430664, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 13.27250872198471, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.22764411568641663, "logits/rejected": -0.0870494470000267, "logps/chosen": -2.6941583156585693, "logps/rejected": -3.426682233810425, "loss": 0.5447, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.6941583156585693, "rewards/margins": 0.7325237989425659, "rewards/rejected": -3.426682233810425, "sft_loss": 2.724475383758545, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 12.557439037020663, "learning_rate": 4.898775742651013e-07, "logits/chosen": -0.1906013935804367, "logits/rejected": -0.047843921929597855, "logps/chosen": -2.722144365310669, "logps/rejected": -3.6132140159606934, "loss": 0.4835, "rewards/accuracies": 0.78125, "rewards/chosen": -2.722144365310669, "rewards/margins": 0.891069769859314, "rewards/rejected": -3.6132140159606934, "sft_loss": 2.923128604888916, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 11.984896003834054, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.32420215010643005, "logits/rejected": -0.1440507471561432, "logps/chosen": -2.875488042831421, "logps/rejected": -3.7107577323913574, "loss": 0.5353, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.875488042831421, "rewards/margins": 0.8352691531181335, "rewards/rejected": -3.7107577323913574, "sft_loss": 2.9838318824768066, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 13.906447094523328, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.24320106208324432, "logits/rejected": -0.10679192841053009, "logps/chosen": -2.88425612449646, "logps/rejected": -3.715994358062744, "loss": 0.53, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.88425612449646, "rewards/margins": 0.8317381739616394, "rewards/rejected": -3.715994358062744, "sft_loss": 2.966104030609131, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 14.976367657165467, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.19528812170028687, "logits/rejected": -0.009975330904126167, "logps/chosen": -3.02905535697937, "logps/rejected": -3.80255126953125, "loss": 0.5337, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.02905535697937, "rewards/margins": 0.7734959721565247, "rewards/rejected": -3.80255126953125, "sft_loss": 3.1802637577056885, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 15.548326598764058, "learning_rate": 4.8365018714728e-07, "logits/chosen": -0.17453578114509583, "logits/rejected": -0.07390455901622772, "logps/chosen": -3.150977611541748, "logps/rejected": -3.8425517082214355, "loss": 0.5693, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.150977611541748, "rewards/margins": 0.6915736794471741, "rewards/rejected": -3.8425517082214355, "sft_loss": 3.230241060256958, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 16.533383070336015, "learning_rate": 4.820936991745304e-07, "logits/chosen": -0.4218342900276184, "logits/rejected": -0.2783734202384949, "logps/chosen": -2.9137845039367676, "logps/rejected": -3.6819236278533936, "loss": 0.527, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9137845039367676, "rewards/margins": 0.7681390047073364, "rewards/rejected": -3.6819236278533936, "sft_loss": 3.0383102893829346, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 17.88192366513802, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.21219484508037567, "logits/rejected": -0.06534367054700851, "logps/chosen": -2.799802780151367, "logps/rejected": -3.7559711933135986, "loss": 0.519, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.799802780151367, "rewards/margins": 0.9561680555343628, "rewards/rejected": -3.7559711933135986, "sft_loss": 2.8882555961608887, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 14.847677829791488, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.3468804955482483, "logits/rejected": -0.21161310374736786, "logps/chosen": -2.8697726726531982, "logps/rejected": -3.809154987335205, "loss": 0.4897, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.8697726726531982, "rewards/margins": 0.9393820762634277, "rewards/rejected": -3.809154987335205, "sft_loss": 2.9497411251068115, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 12.793945840546476, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.3617003560066223, "logits/rejected": -0.21521353721618652, "logps/chosen": -2.7200443744659424, "logps/rejected": -3.6564979553222656, "loss": 0.49, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7200443744659424, "rewards/margins": 0.9364538192749023, "rewards/rejected": -3.6564979553222656, "sft_loss": 2.89874005317688, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 17.62398573548974, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.1891530603170395, "logits/rejected": -0.21747808158397675, "logps/chosen": -2.798849582672119, "logps/rejected": -3.6794886589050293, "loss": 0.5158, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.798849582672119, "rewards/margins": 0.880639374256134, "rewards/rejected": -3.6794886589050293, "sft_loss": 3.0444750785827637, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 14.920422420380982, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.3750852644443512, "logits/rejected": -0.21744951605796814, "logps/chosen": -2.9588398933410645, "logps/rejected": -3.6091854572296143, "loss": 0.6136, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.9588398933410645, "rewards/margins": 0.6503456830978394, "rewards/rejected": -3.6091854572296143, "sft_loss": 3.1452479362487793, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 13.725127922916966, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.2826746106147766, "logits/rejected": -0.17756345868110657, "logps/chosen": -2.858684778213501, "logps/rejected": -3.7989399433135986, "loss": 0.496, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.858684778213501, "rewards/margins": 0.9402546882629395, "rewards/rejected": -3.7989399433135986, "sft_loss": 2.9993367195129395, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 20.41563340319681, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.28491419553756714, "logits/rejected": -0.11838585138320923, "logps/chosen": -2.934544801712036, "logps/rejected": -3.5151214599609375, "loss": 0.6561, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.934544801712036, "rewards/margins": 0.58057701587677, "rewards/rejected": -3.5151214599609375, "sft_loss": 3.12235426902771, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 15.819865212335717, "learning_rate": 4.6964931766245905e-07, "logits/chosen": -0.16424153745174408, "logits/rejected": -0.09867256879806519, "logps/chosen": -2.845426559448242, "logps/rejected": -3.666630983352661, "loss": 0.5349, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.845426559448242, "rewards/margins": 0.8212043642997742, "rewards/rejected": -3.666630983352661, "sft_loss": 2.955371141433716, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 16.014390487503512, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.23135367035865784, "logits/rejected": -0.17494915425777435, "logps/chosen": -2.815685272216797, "logps/rejected": -3.2968387603759766, "loss": 0.635, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.815685272216797, "rewards/margins": 0.48115357756614685, "rewards/rejected": -3.2968387603759766, "sft_loss": 2.9847769737243652, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 11.69622838348128, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.32371675968170166, "logits/rejected": -0.12809544801712036, "logps/chosen": -2.8046722412109375, "logps/rejected": -3.664748430252075, "loss": 0.4901, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.8046722412109375, "rewards/margins": 0.8600761294364929, "rewards/rejected": -3.664748430252075, "sft_loss": 2.9333043098449707, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 11.61131870098387, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.29520028829574585, "logits/rejected": -0.16227586567401886, "logps/chosen": -2.6985669136047363, "logps/rejected": -3.5508294105529785, "loss": 0.5224, "rewards/accuracies": 0.71875, "rewards/chosen": -2.6985669136047363, "rewards/margins": 0.8522623777389526, "rewards/rejected": -3.5508294105529785, "sft_loss": 2.784538745880127, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 14.994844084135467, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.20806710422039032, "logits/rejected": -0.04904582351446152, "logps/chosen": -2.8403031826019287, "logps/rejected": -3.572239637374878, "loss": 0.5785, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.8403031826019287, "rewards/margins": 0.7319362759590149, "rewards/rejected": -3.572239637374878, "sft_loss": 2.9280142784118652, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.15309284627437592, "eval_logits/rejected": 0.2598608136177063, "eval_logps/chosen": -2.7303080558776855, "eval_logps/rejected": -3.5084829330444336, "eval_loss": 0.5548537373542786, "eval_rewards/accuracies": 0.7240356206893921, "eval_rewards/chosen": -2.7303080558776855, "eval_rewards/margins": 0.778174877166748, "eval_rewards/rejected": -3.5084829330444336, "eval_runtime": 50.0441, "eval_samples_per_second": 26.876, "eval_sft_loss": 2.8826727867126465, "eval_steps_per_second": 6.734, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 11.644774075451304, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.32497185468673706, "logits/rejected": -0.18863452970981598, "logps/chosen": -2.6446356773376465, "logps/rejected": -3.518303394317627, "loss": 0.5075, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.6446356773376465, "rewards/margins": 0.8736675381660461, "rewards/rejected": -3.518303394317627, "sft_loss": 2.7527430057525635, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 16.781271448266775, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.30310553312301636, "logits/rejected": -0.1773833930492401, "logps/chosen": -2.711203098297119, "logps/rejected": -3.3796043395996094, "loss": 0.5956, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.711203098297119, "rewards/margins": 0.6684012413024902, "rewards/rejected": -3.3796043395996094, "sft_loss": 2.832228899002075, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 16.686412798729457, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.22252731025218964, "logits/rejected": -0.1045379638671875, "logps/chosen": -2.7253940105438232, "logps/rejected": -3.514838457107544, "loss": 0.5377, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7253940105438232, "rewards/margins": 0.7894442677497864, "rewards/rejected": -3.514838457107544, "sft_loss": 2.815645217895508, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 13.158363402934352, "learning_rate": 4.572237807874979e-07, "logits/chosen": -0.2946009039878845, "logits/rejected": -0.022943900898098946, "logps/chosen": -2.9355263710021973, "logps/rejected": -3.752810001373291, "loss": 0.5893, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9355263710021973, "rewards/margins": 0.8172832727432251, "rewards/rejected": -3.752810001373291, "sft_loss": 2.941917896270752, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 12.757088855427256, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.24457335472106934, "logits/rejected": -0.10586385428905487, "logps/chosen": -2.697072744369507, "logps/rejected": -3.480059862136841, "loss": 0.5324, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.697072744369507, "rewards/margins": 0.7829869985580444, "rewards/rejected": -3.480059862136841, "sft_loss": 2.8617265224456787, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 21.939144034765643, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.27455341815948486, "logits/rejected": -0.10001333057880402, "logps/chosen": -2.7869138717651367, "logps/rejected": -3.8302619457244873, "loss": 0.5037, "rewards/accuracies": 0.75, "rewards/chosen": -2.7869138717651367, "rewards/margins": 1.0433475971221924, "rewards/rejected": -3.8302619457244873, "sft_loss": 2.9080328941345215, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 15.158390831944335, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.28310340642929077, "logits/rejected": -0.1736556589603424, "logps/chosen": -2.7446627616882324, "logps/rejected": -3.565007448196411, "loss": 0.5144, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7446627616882324, "rewards/margins": 0.8203444480895996, "rewards/rejected": -3.565007448196411, "sft_loss": 2.8578567504882812, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 13.765901993484606, "learning_rate": 4.510204906064911e-07, "logits/chosen": -0.227499321103096, "logits/rejected": -0.08456512540578842, "logps/chosen": -2.7374513149261475, "logps/rejected": -3.654040813446045, "loss": 0.5154, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7374513149261475, "rewards/margins": 0.9165895581245422, "rewards/rejected": -3.654040813446045, "sft_loss": 2.739924192428589, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 15.445753485493217, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.36142498254776, "logits/rejected": -0.21744556725025177, "logps/chosen": -2.792912244796753, "logps/rejected": -3.547822952270508, "loss": 0.5606, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.792912244796753, "rewards/margins": 0.7549106478691101, "rewards/rejected": -3.547822952270508, "sft_loss": 2.836681365966797, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 13.51159695538134, "learning_rate": 4.479216365386333e-07, "logits/chosen": -0.18365325033664703, "logits/rejected": 0.014891237020492554, "logps/chosen": -2.7513034343719482, "logps/rejected": -3.632906675338745, "loss": 0.5287, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7513034343719482, "rewards/margins": 0.8816030621528625, "rewards/rejected": -3.632906675338745, "sft_loss": 2.799229621887207, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 12.850674062353518, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.22592787444591522, "logits/rejected": -0.12565144896507263, "logps/chosen": -2.7363648414611816, "logps/rejected": -3.4756031036376953, "loss": 0.5525, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7363648414611816, "rewards/margins": 0.7392383813858032, "rewards/rejected": -3.4756031036376953, "sft_loss": 2.8443732261657715, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 17.764794839669747, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.29207858443260193, "logits/rejected": -0.1414732187986374, "logps/chosen": -2.731041193008423, "logps/rejected": -3.5729775428771973, "loss": 0.5142, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.731041193008423, "rewards/margins": 0.841936469078064, "rewards/rejected": -3.5729775428771973, "sft_loss": 2.848785400390625, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 20.729812971010393, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.20951029658317566, "logits/rejected": -0.0795585885643959, "logps/chosen": -2.695565700531006, "logps/rejected": -3.5492007732391357, "loss": 0.5336, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.695565700531006, "rewards/margins": 0.8536350131034851, "rewards/rejected": -3.5492007732391357, "sft_loss": 2.873015880584717, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 20.6746787536389, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.26037219166755676, "logits/rejected": -0.13233566284179688, "logps/chosen": -2.7734415531158447, "logps/rejected": -3.595205783843994, "loss": 0.5397, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.7734415531158447, "rewards/margins": 0.8217641711235046, "rewards/rejected": -3.595205783843994, "sft_loss": 2.8323311805725098, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 15.537877865607411, "learning_rate": 4.401836063512631e-07, "logits/chosen": -0.31725236773490906, "logits/rejected": 0.016078215092420578, "logps/chosen": -2.695134162902832, "logps/rejected": -3.618396282196045, "loss": 0.4915, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.695134162902832, "rewards/margins": 0.9232619404792786, "rewards/rejected": -3.618396282196045, "sft_loss": 2.8468456268310547, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 20.8433494712244, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.26147979497909546, "logits/rejected": -0.17856861650943756, "logps/chosen": -2.5003762245178223, "logps/rejected": -3.4275104999542236, "loss": 0.5124, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.5003762245178223, "rewards/margins": 0.9271339178085327, "rewards/rejected": -3.4275104999542236, "sft_loss": 2.689016342163086, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 15.440325896504861, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.28607049584388733, "logits/rejected": -0.051154326647520065, "logps/chosen": -2.7072086334228516, "logps/rejected": -3.6027824878692627, "loss": 0.505, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7072086334228516, "rewards/margins": 0.8955739736557007, "rewards/rejected": -3.6027824878692627, "sft_loss": 2.805786609649658, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 17.340041941861976, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.2097930908203125, "logits/rejected": -0.08719705045223236, "logps/chosen": -2.6455299854278564, "logps/rejected": -3.431236982345581, "loss": 0.5224, "rewards/accuracies": 0.75, "rewards/chosen": -2.6455299854278564, "rewards/margins": 0.7857068777084351, "rewards/rejected": -3.431236982345581, "sft_loss": 2.7738378047943115, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 16.365244845189448, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.2564403712749481, "logits/rejected": -0.14944425225257874, "logps/chosen": -2.9101572036743164, "logps/rejected": -3.7674922943115234, "loss": 0.5606, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.9101572036743164, "rewards/margins": 0.8573352098464966, "rewards/rejected": -3.7674922943115234, "sft_loss": 2.965453863143921, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 22.915254342116935, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.34263506531715393, "logits/rejected": -0.18292446434497833, "logps/chosen": -3.0182735919952393, "logps/rejected": -3.888817310333252, "loss": 0.5581, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.0182735919952393, "rewards/margins": 0.8705435991287231, "rewards/rejected": -3.888817310333252, "sft_loss": 3.1601388454437256, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 19.993626570967955, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.352492094039917, "logits/rejected": -0.15054023265838623, "logps/chosen": -2.95141339302063, "logps/rejected": -3.8508782386779785, "loss": 0.5433, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.95141339302063, "rewards/margins": 0.8994649052619934, "rewards/rejected": -3.8508782386779785, "sft_loss": 3.0732500553131104, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 14.797733830293513, "learning_rate": 4.293751575992455e-07, "logits/chosen": -0.14886128902435303, "logits/rejected": -0.09603078663349152, "logps/chosen": -2.9899444580078125, "logps/rejected": -3.8496603965759277, "loss": 0.5117, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9899444580078125, "rewards/margins": 0.859715461730957, "rewards/rejected": -3.8496603965759277, "sft_loss": 3.140385150909424, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 29.066924240582257, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.313045471906662, "logits/rejected": -0.12574300169944763, "logps/chosen": -2.9519901275634766, "logps/rejected": -3.7856521606445312, "loss": 0.549, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9519901275634766, "rewards/margins": 0.8336623311042786, "rewards/rejected": -3.7856521606445312, "sft_loss": 3.0585060119628906, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 14.007089002617244, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.28222742676734924, "logits/rejected": -0.036201111972332, "logps/chosen": -2.8928723335266113, "logps/rejected": -3.8801913261413574, "loss": 0.5264, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.8928723335266113, "rewards/margins": 0.9873189926147461, "rewards/rejected": -3.8801913261413574, "sft_loss": 2.9914212226867676, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 14.434540253014186, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.28749576210975647, "logits/rejected": -0.10014557838439941, "logps/chosen": -2.850964069366455, "logps/rejected": -3.6977570056915283, "loss": 0.5512, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.850964069366455, "rewards/margins": 0.846792995929718, "rewards/rejected": -3.6977570056915283, "sft_loss": 3.068385362625122, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 15.998222025322567, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.2363944947719574, "logits/rejected": -0.111115001142025, "logps/chosen": -2.7223422527313232, "logps/rejected": -3.6873691082000732, "loss": 0.4948, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7223422527313232, "rewards/margins": 0.9650264978408813, "rewards/rejected": -3.6873691082000732, "sft_loss": 2.9275765419006348, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 26.943749067338448, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.25617361068725586, "logits/rejected": -0.06846214830875397, "logps/chosen": -2.9510550498962402, "logps/rejected": -3.6926627159118652, "loss": 0.5636, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.9510550498962402, "rewards/margins": 0.741607666015625, "rewards/rejected": -3.6926627159118652, "sft_loss": 2.9970152378082275, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 37.80260581105259, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.22949638962745667, "logits/rejected": -0.05085518956184387, "logps/chosen": -2.757495641708374, "logps/rejected": -3.7478134632110596, "loss": 0.4968, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.757495641708374, "rewards/margins": 0.9903179407119751, "rewards/rejected": -3.7478134632110596, "sft_loss": 2.917391300201416, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 16.307364392346315, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.25462061166763306, "logits/rejected": -0.04345916956663132, "logps/chosen": -2.6675283908843994, "logps/rejected": -3.524871826171875, "loss": 0.508, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.6675283908843994, "rewards/margins": 0.8573434948921204, "rewards/rejected": -3.524871826171875, "sft_loss": 2.846837282180786, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 14.953511487287509, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.35961371660232544, "logits/rejected": -0.21522529423236847, "logps/chosen": -2.8548665046691895, "logps/rejected": -3.6475882530212402, "loss": 0.5401, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8548665046691895, "rewards/margins": 0.7927218079566956, "rewards/rejected": -3.6475882530212402, "sft_loss": 2.979081630706787, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 21.090179405489167, "learning_rate": 4.1552863054229116e-07, "logits/chosen": -0.1550268530845642, "logits/rejected": -0.09269969165325165, "logps/chosen": -2.971285581588745, "logps/rejected": -3.6374497413635254, "loss": 0.6211, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.971285581588745, "rewards/margins": 0.6661639213562012, "rewards/rejected": -3.6374497413635254, "sft_loss": 3.0060386657714844, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 17.823650021351096, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.2913093864917755, "logits/rejected": -0.08032882213592529, "logps/chosen": -2.660451650619507, "logps/rejected": -3.5529582500457764, "loss": 0.5203, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.660451650619507, "rewards/margins": 0.8925067782402039, "rewards/rejected": -3.5529582500457764, "sft_loss": 2.7594053745269775, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 14.755037676642225, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.3176870346069336, "logits/rejected": -0.1266889125108719, "logps/chosen": -2.8807005882263184, "logps/rejected": -3.641491413116455, "loss": 0.5444, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.8807005882263184, "rewards/margins": 0.7607907652854919, "rewards/rejected": -3.641491413116455, "sft_loss": 3.023757219314575, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 16.466658197863744, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.30670759081840515, "logits/rejected": -0.13745181262493134, "logps/chosen": -2.8676910400390625, "logps/rejected": -3.5250790119171143, "loss": 0.6141, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.8676910400390625, "rewards/margins": 0.6573879718780518, "rewards/rejected": -3.5250790119171143, "sft_loss": 2.9657163619995117, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 22.069927121834432, "learning_rate": 4.093952802428457e-07, "logits/chosen": -0.1460348516702652, "logits/rejected": -0.06519370526075363, "logps/chosen": -2.9532673358917236, "logps/rejected": -3.681931734085083, "loss": 0.6144, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.9532673358917236, "rewards/margins": 0.7286645770072937, "rewards/rejected": -3.681931734085083, "sft_loss": 3.024280071258545, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 12.642912115114475, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.3509058356285095, "logits/rejected": -0.15340352058410645, "logps/chosen": -2.7628684043884277, "logps/rejected": -3.520265579223633, "loss": 0.558, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.7628684043884277, "rewards/margins": 0.7573972940444946, "rewards/rejected": -3.520265579223633, "sft_loss": 2.7993674278259277, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 14.091249291129772, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.31110242009162903, "logits/rejected": -0.27701109647750854, "logps/chosen": -2.7618496417999268, "logps/rejected": -3.413426637649536, "loss": 0.5889, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7618496417999268, "rewards/margins": 0.6515769362449646, "rewards/rejected": -3.413426637649536, "sft_loss": 2.8429646492004395, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 20.278779594228684, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.28902843594551086, "logits/rejected": -0.1411515474319458, "logps/chosen": -2.742339611053467, "logps/rejected": -3.423797607421875, "loss": 0.5976, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.742339611053467, "rewards/margins": 0.6814578771591187, "rewards/rejected": -3.423797607421875, "sft_loss": 2.7904200553894043, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 15.678556690957445, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.3427790701389313, "logits/rejected": -0.18898403644561768, "logps/chosen": -2.5821168422698975, "logps/rejected": -3.509188413619995, "loss": 0.4932, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5821168422698975, "rewards/margins": 0.9270719289779663, "rewards/rejected": -3.509188413619995, "sft_loss": 2.699791431427002, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 16.2002282046552, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.2501813769340515, "logits/rejected": -0.1078735813498497, "logps/chosen": -2.6638801097869873, "logps/rejected": -3.426955461502075, "loss": 0.531, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.6638801097869873, "rewards/margins": 0.7630751729011536, "rewards/rejected": -3.426955461502075, "sft_loss": 2.7353997230529785, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 13.96144715875297, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.27894407510757446, "logits/rejected": -0.12392063438892365, "logps/chosen": -2.638472318649292, "logps/rejected": -3.3759467601776123, "loss": 0.5214, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.638472318649292, "rewards/margins": 0.7374745607376099, "rewards/rejected": -3.3759467601776123, "sft_loss": 2.7297282218933105, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 16.286571712916317, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.27272436022758484, "logits/rejected": -0.020682910457253456, "logps/chosen": -2.7578983306884766, "logps/rejected": -3.5018246173858643, "loss": 0.5205, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7578983306884766, "rewards/margins": 0.7439260482788086, "rewards/rejected": -3.5018246173858643, "sft_loss": 2.786332607269287, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 21.677066865913943, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.2839580774307251, "logits/rejected": -0.1497466266155243, "logps/chosen": -2.744100570678711, "logps/rejected": -3.567244052886963, "loss": 0.5608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.744100570678711, "rewards/margins": 0.8231437802314758, "rewards/rejected": -3.567244052886963, "sft_loss": 2.881943702697754, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 21.504461546621123, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.18854017555713654, "logits/rejected": -0.06854341924190521, "logps/chosen": -2.828106641769409, "logps/rejected": -3.6131012439727783, "loss": 0.6063, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.828106641769409, "rewards/margins": 0.7849944829940796, "rewards/rejected": -3.6131012439727783, "sft_loss": 2.945675849914551, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 13.004971249919047, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.2789008915424347, "logits/rejected": -0.21969743072986603, "logps/chosen": -2.7043042182922363, "logps/rejected": -3.5020999908447266, "loss": 0.5391, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7043042182922363, "rewards/margins": 0.7977955937385559, "rewards/rejected": -3.5020999908447266, "sft_loss": 2.7692298889160156, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 17.180145525706955, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.2672729790210724, "logits/rejected": -0.12829595804214478, "logps/chosen": -2.6477010250091553, "logps/rejected": -3.6499581336975098, "loss": 0.4954, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6477010250091553, "rewards/margins": 1.0022567510604858, "rewards/rejected": -3.6499581336975098, "sft_loss": 2.7258620262145996, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 17.2549891404319, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.2865196168422699, "logits/rejected": -0.15484482049942017, "logps/chosen": -2.614703416824341, "logps/rejected": -3.5150234699249268, "loss": 0.4677, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.614703416824341, "rewards/margins": 0.9003196954727173, "rewards/rejected": -3.5150234699249268, "sft_loss": 2.7431111335754395, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 18.690750195324476, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.29971033334732056, "logits/rejected": -0.1828007996082306, "logps/chosen": -2.6536471843719482, "logps/rejected": -3.5673458576202393, "loss": 0.5552, "rewards/accuracies": 0.71875, "rewards/chosen": -2.6536471843719482, "rewards/margins": 0.9136987924575806, "rewards/rejected": -3.5673458576202393, "sft_loss": 2.7793192863464355, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 14.611042814476118, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.3003006875514984, "logits/rejected": -0.07512084394693375, "logps/chosen": -2.7247841358184814, "logps/rejected": -3.641571521759033, "loss": 0.5117, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7247841358184814, "rewards/margins": 0.9167870283126831, "rewards/rejected": -3.641571521759033, "sft_loss": 2.859342098236084, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 16.380895729120613, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.3656696379184723, "logits/rejected": -0.17512385547161102, "logps/chosen": -2.7375683784484863, "logps/rejected": -3.6500372886657715, "loss": 0.5116, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7375683784484863, "rewards/margins": 0.9124690294265747, "rewards/rejected": -3.6500372886657715, "sft_loss": 2.8651626110076904, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 15.685753189854438, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.2697731852531433, "logits/rejected": -0.05836169049143791, "logps/chosen": -2.9335014820098877, "logps/rejected": -3.754599094390869, "loss": 0.5682, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.9335014820098877, "rewards/margins": 0.821097731590271, "rewards/rejected": -3.754599094390869, "sft_loss": 3.0067005157470703, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 18.491827651861172, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.16491341590881348, "logits/rejected": -0.03908557817339897, "logps/chosen": -2.7487964630126953, "logps/rejected": -3.60390043258667, "loss": 0.5333, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7487964630126953, "rewards/margins": 0.8551036715507507, "rewards/rejected": -3.60390043258667, "sft_loss": 2.7997887134552, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 13.212186209386868, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.2802397608757019, "logits/rejected": -0.15976294875144958, "logps/chosen": -2.7203943729400635, "logps/rejected": -3.5208003520965576, "loss": 0.5436, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7203943729400635, "rewards/margins": 0.8004060983657837, "rewards/rejected": -3.5208003520965576, "sft_loss": 2.802574634552002, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 15.71739129001188, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.2877870202064514, "logits/rejected": -0.18142752349376678, "logps/chosen": -2.883432388305664, "logps/rejected": -3.7455458641052246, "loss": 0.5411, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.883432388305664, "rewards/margins": 0.862113356590271, "rewards/rejected": -3.7455458641052246, "sft_loss": 2.970731496810913, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 18.59449367259967, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.274854838848114, "logits/rejected": -0.21188096702098846, "logps/chosen": -2.752173662185669, "logps/rejected": -3.550297975540161, "loss": 0.5405, "rewards/accuracies": 0.71875, "rewards/chosen": -2.752173662185669, "rewards/margins": 0.7981240749359131, "rewards/rejected": -3.550297975540161, "sft_loss": 2.8199985027313232, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 13.948609692358458, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.17315642535686493, "logits/rejected": -0.1392008364200592, "logps/chosen": -2.801203489303589, "logps/rejected": -3.6079413890838623, "loss": 0.5448, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.801203489303589, "rewards/margins": 0.8067380785942078, "rewards/rejected": -3.6079413890838623, "sft_loss": 2.931431531906128, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 17.599399947889275, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.23535147309303284, "logits/rejected": -0.07883518189191818, "logps/chosen": -2.6835312843322754, "logps/rejected": -3.50940203666687, "loss": 0.5464, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.6835312843322754, "rewards/margins": 0.8258708715438843, "rewards/rejected": -3.50940203666687, "sft_loss": 2.843803882598877, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 16.88396980667088, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.20262908935546875, "logits/rejected": -0.07145232707262039, "logps/chosen": -2.6471047401428223, "logps/rejected": -3.5645880699157715, "loss": 0.479, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6471047401428223, "rewards/margins": 0.9174835085868835, "rewards/rejected": -3.5645880699157715, "sft_loss": 2.833148717880249, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 13.969009365364938, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -0.33862629532814026, "logits/rejected": -0.15098699927330017, "logps/chosen": -2.782113552093506, "logps/rejected": -3.5242133140563965, "loss": 0.5138, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.782113552093506, "rewards/margins": 0.7420998811721802, "rewards/rejected": -3.5242133140563965, "sft_loss": 2.9555227756500244, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 14.151770729569519, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.19998544454574585, "logits/rejected": -0.02457023784518242, "logps/chosen": -2.7103729248046875, "logps/rejected": -3.6097068786621094, "loss": 0.5117, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7103729248046875, "rewards/margins": 0.8993337750434875, "rewards/rejected": -3.6097068786621094, "sft_loss": 2.882556200027466, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 23.44049071726931, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.30788227915763855, "logits/rejected": -0.13961350917816162, "logps/chosen": -2.7522499561309814, "logps/rejected": -3.6674914360046387, "loss": 0.5067, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7522499561309814, "rewards/margins": 0.9152417182922363, "rewards/rejected": -3.6674914360046387, "sft_loss": 2.9316442012786865, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 20.91735152160501, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.2761691212654114, "logits/rejected": -0.177979975938797, "logps/chosen": -2.6528451442718506, "logps/rejected": -3.4922308921813965, "loss": 0.4983, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6528451442718506, "rewards/margins": 0.839385986328125, "rewards/rejected": -3.4922308921813965, "sft_loss": 2.755087375640869, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 21.367028365960795, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.2699565291404724, "logits/rejected": -0.05399322509765625, "logps/chosen": -2.869797468185425, "logps/rejected": -3.72851300239563, "loss": 0.5309, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.869797468185425, "rewards/margins": 0.8587149381637573, "rewards/rejected": -3.72851300239563, "sft_loss": 2.976771116256714, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 13.942471281442186, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.3158034682273865, "logits/rejected": -0.1378587782382965, "logps/chosen": -2.7531518936157227, "logps/rejected": -3.6037163734436035, "loss": 0.5322, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.7531518936157227, "rewards/margins": 0.8505643606185913, "rewards/rejected": -3.6037163734436035, "sft_loss": 2.949075222015381, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 17.656841624151504, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.24910321831703186, "logits/rejected": -0.03514650836586952, "logps/chosen": -2.827221155166626, "logps/rejected": -3.6355996131896973, "loss": 0.5644, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.827221155166626, "rewards/margins": 0.8083783984184265, "rewards/rejected": -3.6355996131896973, "sft_loss": 2.9915034770965576, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 16.270674214554624, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.2466723918914795, "logits/rejected": -0.15336759388446808, "logps/chosen": -2.871335744857788, "logps/rejected": -3.689772367477417, "loss": 0.5225, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.871335744857788, "rewards/margins": 0.8184367418289185, "rewards/rejected": -3.689772367477417, "sft_loss": 3.0920445919036865, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 12.425537120157095, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -0.33766406774520874, "logits/rejected": -0.18579557538032532, "logps/chosen": -2.852994680404663, "logps/rejected": -3.759185791015625, "loss": 0.5285, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.852994680404663, "rewards/margins": 0.9061908721923828, "rewards/rejected": -3.759185791015625, "sft_loss": 3.0922865867614746, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 20.041944841505437, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -0.31889665126800537, "logits/rejected": -0.0808551162481308, "logps/chosen": -2.8681604862213135, "logps/rejected": -3.5937094688415527, "loss": 0.5752, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.8681604862213135, "rewards/margins": 0.7255493402481079, "rewards/rejected": -3.5937094688415527, "sft_loss": 3.0391573905944824, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 19.31081687769959, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -0.4244014620780945, "logits/rejected": -0.1237758994102478, "logps/chosen": -2.7603707313537598, "logps/rejected": -3.643112897872925, "loss": 0.4808, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7603707313537598, "rewards/margins": 0.8827424049377441, "rewards/rejected": -3.643112897872925, "sft_loss": 2.8689522743225098, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 19.63782385553952, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.1956585794687271, "logits/rejected": -0.09507829695940018, "logps/chosen": -2.851369857788086, "logps/rejected": -3.5696005821228027, "loss": 0.5755, "rewards/accuracies": 0.71875, "rewards/chosen": -2.851369857788086, "rewards/margins": 0.7182309627532959, "rewards/rejected": -3.5696005821228027, "sft_loss": 2.826671600341797, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 13.299258594267988, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -0.344322144985199, "logits/rejected": -0.22166843712329865, "logps/chosen": -2.7484452724456787, "logps/rejected": -3.57319974899292, "loss": 0.5759, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.7484452724456787, "rewards/margins": 0.8247542381286621, "rewards/rejected": -3.57319974899292, "sft_loss": 2.8463919162750244, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 20.45364215505241, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.27780282497406006, "logits/rejected": -0.15248972177505493, "logps/chosen": -2.6875338554382324, "logps/rejected": -3.4828343391418457, "loss": 0.5208, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.6875338554382324, "rewards/margins": 0.7953005433082581, "rewards/rejected": -3.4828343391418457, "sft_loss": 2.8021836280822754, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 15.0201163079026, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.2638780474662781, "logits/rejected": -0.15065474808216095, "logps/chosen": -2.8345370292663574, "logps/rejected": -3.8164570331573486, "loss": 0.5003, "rewards/accuracies": 0.75, "rewards/chosen": -2.8345370292663574, "rewards/margins": 0.9819199442863464, "rewards/rejected": -3.8164570331573486, "sft_loss": 2.997745990753174, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 13.740822823904988, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.32493412494659424, "logits/rejected": -0.12234127521514893, "logps/chosen": -2.6589608192443848, "logps/rejected": -3.5269927978515625, "loss": 0.4908, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.6589608192443848, "rewards/margins": 0.8680317997932434, "rewards/rejected": -3.5269927978515625, "sft_loss": 2.7340474128723145, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 12.774152902110295, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.21357354521751404, "logits/rejected": -0.07995374500751495, "logps/chosen": -2.7149415016174316, "logps/rejected": -3.6193466186523438, "loss": 0.5332, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7149415016174316, "rewards/margins": 0.9044052362442017, "rewards/rejected": -3.6193466186523438, "sft_loss": 2.7866055965423584, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 15.600413109989216, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.2804366648197174, "logits/rejected": -0.20661187171936035, "logps/chosen": -2.737346649169922, "logps/rejected": -3.6407647132873535, "loss": 0.497, "rewards/accuracies": 0.75, "rewards/chosen": -2.737346649169922, "rewards/margins": 0.9034177660942078, "rewards/rejected": -3.6407647132873535, "sft_loss": 2.8912928104400635, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 13.983136798774643, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.21731364727020264, "logits/rejected": -0.1391337811946869, "logps/chosen": -2.885457992553711, "logps/rejected": -3.7557575702667236, "loss": 0.5071, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.885457992553711, "rewards/margins": 0.8702995181083679, "rewards/rejected": -3.7557575702667236, "sft_loss": 3.06406569480896, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 15.846021214008868, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.2161417454481125, "logits/rejected": -0.1433815360069275, "logps/chosen": -2.861445426940918, "logps/rejected": -3.537444591522217, "loss": 0.5979, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.861445426940918, "rewards/margins": 0.6759993433952332, "rewards/rejected": -3.537444591522217, "sft_loss": 2.9814562797546387, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 10.787969643568355, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.3371451199054718, "logits/rejected": -0.11801593005657196, "logps/chosen": -2.8876984119415283, "logps/rejected": -3.7116875648498535, "loss": 0.5446, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8876984119415283, "rewards/margins": 0.82398921251297, "rewards/rejected": -3.7116875648498535, "sft_loss": 2.9036552906036377, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 17.28603451468424, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.21108360588550568, "logits/rejected": -0.030581191182136536, "logps/chosen": -2.9036574363708496, "logps/rejected": -3.7772459983825684, "loss": 0.5649, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.9036574363708496, "rewards/margins": 0.8735888600349426, "rewards/rejected": -3.7772459983825684, "sft_loss": 3.0899136066436768, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.0982469990849495, "eval_logits/rejected": 0.20617729425430298, "eval_logps/chosen": -2.8066318035125732, "eval_logps/rejected": -3.636258840560913, "eval_loss": 0.5509018301963806, "eval_rewards/accuracies": 0.7240356206893921, "eval_rewards/chosen": -2.8066318035125732, "eval_rewards/margins": 0.8296267986297607, "eval_rewards/rejected": -3.636258840560913, "eval_runtime": 50.0546, "eval_samples_per_second": 26.871, "eval_sft_loss": 2.9742238521575928, "eval_steps_per_second": 6.733, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 15.491129944188577, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.27998265624046326, "logits/rejected": -0.2484864443540573, "logps/chosen": -2.683946132659912, "logps/rejected": -3.4610118865966797, "loss": 0.5354, "rewards/accuracies": 0.71875, "rewards/chosen": -2.683946132659912, "rewards/margins": 0.7770653963088989, "rewards/rejected": -3.4610118865966797, "sft_loss": 2.8647847175598145, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 16.588855650315963, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.3101789355278015, "logits/rejected": -0.1497582644224167, "logps/chosen": -2.7263779640197754, "logps/rejected": -3.524534225463867, "loss": 0.51, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7263779640197754, "rewards/margins": 0.7981564998626709, "rewards/rejected": -3.524534225463867, "sft_loss": 2.9038267135620117, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 16.739263575752542, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.21048936247825623, "logits/rejected": -0.06801290065050125, "logps/chosen": -2.8764920234680176, "logps/rejected": -3.7195560932159424, "loss": 0.5268, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.8764920234680176, "rewards/margins": 0.84306401014328, "rewards/rejected": -3.7195560932159424, "sft_loss": 2.958024024963379, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 18.613139691100287, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.2533569037914276, "logits/rejected": -0.13912202417850494, "logps/chosen": -2.958700656890869, "logps/rejected": -3.6944549083709717, "loss": 0.5598, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.958700656890869, "rewards/margins": 0.7357538342475891, "rewards/rejected": -3.6944549083709717, "sft_loss": 3.131559371948242, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 17.01635297334278, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.15639057755470276, "logits/rejected": -0.16117417812347412, "logps/chosen": -2.762528896331787, "logps/rejected": -3.521737575531006, "loss": 0.5339, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.762528896331787, "rewards/margins": 0.7592090368270874, "rewards/rejected": -3.521737575531006, "sft_loss": 2.86425518989563, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 18.155698170544465, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.26868265867233276, "logits/rejected": -0.22519557178020477, "logps/chosen": -2.8326733112335205, "logps/rejected": -3.687572479248047, "loss": 0.5588, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8326733112335205, "rewards/margins": 0.8548991084098816, "rewards/rejected": -3.687572479248047, "sft_loss": 3.030932664871216, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 15.858042608953705, "learning_rate": 3.312847403747883e-07, "logits/chosen": -0.3185202479362488, "logits/rejected": -0.1940157264471054, "logps/chosen": -2.7259998321533203, "logps/rejected": -3.6320412158966064, "loss": 0.5033, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7259998321533203, "rewards/margins": 0.9060415029525757, "rewards/rejected": -3.6320412158966064, "sft_loss": 2.879058361053467, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 15.658328201630576, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.27361616492271423, "logits/rejected": -0.17969655990600586, "logps/chosen": -2.839315891265869, "logps/rejected": -3.5681090354919434, "loss": 0.5349, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.839315891265869, "rewards/margins": 0.7287934422492981, "rewards/rejected": -3.5681090354919434, "sft_loss": 2.993678569793701, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 14.139512973464816, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.2828649878501892, "logits/rejected": -0.14037677645683289, "logps/chosen": -2.81725811958313, "logps/rejected": -3.4038796424865723, "loss": 0.5832, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.81725811958313, "rewards/margins": 0.5866214036941528, "rewards/rejected": -3.4038796424865723, "sft_loss": 2.944801092147827, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 17.638167050426947, "learning_rate": 3.268939852432765e-07, "logits/chosen": -0.332253634929657, "logits/rejected": -0.22307121753692627, "logps/chosen": -2.9059665203094482, "logps/rejected": -3.5740585327148438, "loss": 0.5805, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.9059665203094482, "rewards/margins": 0.6680923700332642, "rewards/rejected": -3.5740585327148438, "sft_loss": 3.0662026405334473, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 20.879019950964675, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.26796287298202515, "logits/rejected": -0.145319402217865, "logps/chosen": -2.836381673812866, "logps/rejected": -3.6222052574157715, "loss": 0.5546, "rewards/accuracies": 0.71875, "rewards/chosen": -2.836381673812866, "rewards/margins": 0.7858238220214844, "rewards/rejected": -3.6222052574157715, "sft_loss": 2.9162559509277344, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 15.735016333049394, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.28387266397476196, "logits/rejected": -0.12554284930229187, "logps/chosen": -2.83955717086792, "logps/rejected": -3.6048247814178467, "loss": 0.5498, "rewards/accuracies": 0.71875, "rewards/chosen": -2.83955717086792, "rewards/margins": 0.7652674913406372, "rewards/rejected": -3.6048247814178467, "sft_loss": 3.0139026641845703, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 14.179116175137064, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.2856293320655823, "logits/rejected": -0.17432229220867157, "logps/chosen": -2.6420066356658936, "logps/rejected": -3.5150704383850098, "loss": 0.5291, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.6420066356658936, "rewards/margins": 0.8730632066726685, "rewards/rejected": -3.5150704383850098, "sft_loss": 2.7040858268737793, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 13.655424318476642, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.2945409119129181, "logits/rejected": -0.15200811624526978, "logps/chosen": -2.7398436069488525, "logps/rejected": -3.718191623687744, "loss": 0.4739, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7398436069488525, "rewards/margins": 0.978347897529602, "rewards/rejected": -3.718191623687744, "sft_loss": 2.9533541202545166, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 14.895341760219607, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.3262266516685486, "logits/rejected": -0.16349753737449646, "logps/chosen": -2.935656785964966, "logps/rejected": -3.808778762817383, "loss": 0.5753, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.935656785964966, "rewards/margins": 0.8731220960617065, "rewards/rejected": -3.808778762817383, "sft_loss": 3.0750489234924316, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 12.97760841860067, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.2657465636730194, "logits/rejected": -0.12304908037185669, "logps/chosen": -2.537490129470825, "logps/rejected": -3.3631515502929688, "loss": 0.4959, "rewards/accuracies": 0.78125, "rewards/chosen": -2.537490129470825, "rewards/margins": 0.825661301612854, "rewards/rejected": -3.3631515502929688, "sft_loss": 2.72019362449646, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 19.859032127404333, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.3163455128669739, "logits/rejected": -0.21008479595184326, "logps/chosen": -2.7686800956726074, "logps/rejected": -3.6506667137145996, "loss": 0.5407, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7686800956726074, "rewards/margins": 0.8819867968559265, "rewards/rejected": -3.6506667137145996, "sft_loss": 2.9922776222229004, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 13.46626137783446, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.27424660325050354, "logits/rejected": -0.0691673532128334, "logps/chosen": -2.855818271636963, "logps/rejected": -3.652209758758545, "loss": 0.5403, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.855818271636963, "rewards/margins": 0.7963913083076477, "rewards/rejected": -3.652209758758545, "sft_loss": 2.939704418182373, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 16.39196142560876, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.26725202798843384, "logits/rejected": -0.17296263575553894, "logps/chosen": -2.8052046298980713, "logps/rejected": -3.607107639312744, "loss": 0.5424, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8052046298980713, "rewards/margins": 0.8019029498100281, "rewards/rejected": -3.607107639312744, "sft_loss": 2.9598681926727295, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 14.226033481114372, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.22751012444496155, "logits/rejected": -0.08938495814800262, "logps/chosen": -2.6386122703552246, "logps/rejected": -3.4772655963897705, "loss": 0.5082, "rewards/accuracies": 0.75, "rewards/chosen": -2.6386122703552246, "rewards/margins": 0.8386530876159668, "rewards/rejected": -3.4772655963897705, "sft_loss": 2.7765183448791504, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 19.205617348945218, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.25414150953292847, "logits/rejected": -0.16614079475402832, "logps/chosen": -2.6993870735168457, "logps/rejected": -3.6588058471679688, "loss": 0.4926, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.6993870735168457, "rewards/margins": 0.9594192504882812, "rewards/rejected": -3.6588058471679688, "sft_loss": 2.820012092590332, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 14.684077770215215, "learning_rate": 3.094859304170293e-07, "logits/chosen": -0.12193117290735245, "logits/rejected": -0.07238110154867172, "logps/chosen": -2.7730677127838135, "logps/rejected": -3.5438296794891357, "loss": 0.5664, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7730677127838135, "rewards/margins": 0.7707620859146118, "rewards/rejected": -3.5438296794891357, "sft_loss": 2.9567716121673584, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 16.579906174692645, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.20543989539146423, "logits/rejected": -0.14003416895866394, "logps/chosen": -2.792309522628784, "logps/rejected": -3.7953097820281982, "loss": 0.4912, "rewards/accuracies": 0.78125, "rewards/chosen": -2.792309522628784, "rewards/margins": 1.0030001401901245, "rewards/rejected": -3.7953097820281982, "sft_loss": 2.9646449089050293, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 16.384158900838727, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.25673890113830566, "logits/rejected": -0.14675481617450714, "logps/chosen": -2.748898983001709, "logps/rejected": -3.506723403930664, "loss": 0.5451, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.748898983001709, "rewards/margins": 0.757824182510376, "rewards/rejected": -3.506723403930664, "sft_loss": 2.8113455772399902, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 15.376392346664371, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.32976996898651123, "logits/rejected": -0.22071564197540283, "logps/chosen": -2.7523465156555176, "logps/rejected": -3.6334640979766846, "loss": 0.5007, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7523465156555176, "rewards/margins": 0.8811177015304565, "rewards/rejected": -3.6334640979766846, "sft_loss": 2.9830379486083984, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 13.25572921129167, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.2685486674308777, "logits/rejected": -0.11623702943325043, "logps/chosen": -2.7400898933410645, "logps/rejected": -3.5789923667907715, "loss": 0.4804, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7400898933410645, "rewards/margins": 0.838902473449707, "rewards/rejected": -3.5789923667907715, "sft_loss": 2.8983848094940186, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 16.15165716924802, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.198993980884552, "logits/rejected": -0.10462522506713867, "logps/chosen": -2.9255807399749756, "logps/rejected": -3.707984447479248, "loss": 0.5792, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.9255807399749756, "rewards/margins": 0.7824038863182068, "rewards/rejected": -3.707984447479248, "sft_loss": 3.0322368144989014, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 16.0411973409057, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.1713380515575409, "logits/rejected": -0.09565655887126923, "logps/chosen": -3.05859637260437, "logps/rejected": -3.847487688064575, "loss": 0.5326, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.05859637260437, "rewards/margins": 0.7888910174369812, "rewards/rejected": -3.847487688064575, "sft_loss": 3.13173246383667, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 14.16234994601713, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.21377214789390564, "logits/rejected": -0.1134125143289566, "logps/chosen": -2.901991844177246, "logps/rejected": -3.988110065460205, "loss": 0.4429, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.901991844177246, "rewards/margins": 1.086118459701538, "rewards/rejected": -3.988110065460205, "sft_loss": 3.0303547382354736, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 12.244453619004945, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.306434690952301, "logits/rejected": -0.18747636675834656, "logps/chosen": -2.716519832611084, "logps/rejected": -3.7627689838409424, "loss": 0.4652, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.716519832611084, "rewards/margins": 1.0462491512298584, "rewards/rejected": -3.7627689838409424, "sft_loss": 2.938028335571289, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 15.423811405373074, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.2862057685852051, "logits/rejected": -0.11905969679355621, "logps/chosen": -2.9096438884735107, "logps/rejected": -3.999202251434326, "loss": 0.4579, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.9096438884735107, "rewards/margins": 1.0895582437515259, "rewards/rejected": -3.999202251434326, "sft_loss": 3.161621332168579, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 12.428807927419921, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.33346086740493774, "logits/rejected": -0.26415562629699707, "logps/chosen": -2.9282045364379883, "logps/rejected": -4.097114086151123, "loss": 0.4487, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9282045364379883, "rewards/margins": 1.1689093112945557, "rewards/rejected": -4.097114086151123, "sft_loss": 3.1366372108459473, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 22.392331618088818, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.2757893204689026, "logits/rejected": -0.04873190447688103, "logps/chosen": -3.1032681465148926, "logps/rejected": -4.14900016784668, "loss": 0.5084, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.1032681465148926, "rewards/margins": 1.0457319021224976, "rewards/rejected": -4.14900016784668, "sft_loss": 3.2254319190979004, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 23.304653348639043, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.33224955201148987, "logits/rejected": -0.1305333971977234, "logps/chosen": -2.9278311729431152, "logps/rejected": -4.174778938293457, "loss": 0.4505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9278311729431152, "rewards/margins": 1.246948003768921, "rewards/rejected": -4.174778938293457, "sft_loss": 3.146299123764038, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 19.897203015559125, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.22761209309101105, "logits/rejected": -0.16656561195850372, "logps/chosen": -3.047513484954834, "logps/rejected": -4.061963081359863, "loss": 0.4903, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.047513484954834, "rewards/margins": 1.0144492387771606, "rewards/rejected": -4.061963081359863, "sft_loss": 3.1401548385620117, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 13.776672506207479, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.24800440669059753, "logits/rejected": -0.20165736973285675, "logps/chosen": -2.960462808609009, "logps/rejected": -4.0827765464782715, "loss": 0.4562, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.960462808609009, "rewards/margins": 1.122314214706421, "rewards/rejected": -4.0827765464782715, "sft_loss": 3.118192195892334, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 11.773085259364048, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.2977316081523895, "logits/rejected": -0.11875119060277939, "logps/chosen": -3.0365774631500244, "logps/rejected": -4.105264663696289, "loss": 0.4775, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0365774631500244, "rewards/margins": 1.0686873197555542, "rewards/rejected": -4.105264663696289, "sft_loss": 3.1606414318084717, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 12.527830416654856, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.33617570996284485, "logits/rejected": -0.1405753344297409, "logps/chosen": -2.931879997253418, "logps/rejected": -4.046960830688477, "loss": 0.4693, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.931879997253418, "rewards/margins": 1.1150810718536377, "rewards/rejected": -4.046960830688477, "sft_loss": 3.1172118186950684, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 15.611435619917417, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.22904033958911896, "logits/rejected": -0.19225440919399261, "logps/chosen": -2.8065621852874756, "logps/rejected": -3.717280149459839, "loss": 0.5059, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8065621852874756, "rewards/margins": 0.9107178449630737, "rewards/rejected": -3.717280149459839, "sft_loss": 2.9993736743927, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 25.12690448505756, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.2554751932621002, "logits/rejected": -0.09961952269077301, "logps/chosen": -2.9748282432556152, "logps/rejected": -3.9946041107177734, "loss": 0.4708, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9748282432556152, "rewards/margins": 1.0197762250900269, "rewards/rejected": -3.9946041107177734, "sft_loss": 3.2063193321228027, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 22.659939278716454, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.26864659786224365, "logits/rejected": -0.11086989939212799, "logps/chosen": -2.8870584964752197, "logps/rejected": -3.908726930618286, "loss": 0.4519, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8870584964752197, "rewards/margins": 1.021668791770935, "rewards/rejected": -3.908726930618286, "sft_loss": 3.0277743339538574, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 15.111167904101759, "learning_rate": 2.81075756698315e-07, "logits/chosen": -0.1721217930316925, "logits/rejected": -0.06392903625965118, "logps/chosen": -2.877350330352783, "logps/rejected": -4.044107437133789, "loss": 0.4091, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.877350330352783, "rewards/margins": 1.1667568683624268, "rewards/rejected": -4.044107437133789, "sft_loss": 2.936011791229248, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 14.154684910410312, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.3511294722557068, "logits/rejected": -0.22026808559894562, "logps/chosen": -2.9797768592834473, "logps/rejected": -4.012446403503418, "loss": 0.4576, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9797768592834473, "rewards/margins": 1.0326696634292603, "rewards/rejected": -4.012446403503418, "sft_loss": 3.0182080268859863, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 16.20660707105562, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.3694307208061218, "logits/rejected": -0.10809771716594696, "logps/chosen": -3.0436954498291016, "logps/rejected": -4.178753852844238, "loss": 0.4598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0436954498291016, "rewards/margins": 1.1350584030151367, "rewards/rejected": -4.178753852844238, "sft_loss": 3.252185344696045, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 16.511892258173834, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.27332353591918945, "logits/rejected": -0.19080711901187897, "logps/chosen": -2.957089900970459, "logps/rejected": -4.055009365081787, "loss": 0.4476, "rewards/accuracies": 0.84375, "rewards/chosen": -2.957089900970459, "rewards/margins": 1.0979197025299072, "rewards/rejected": -4.055009365081787, "sft_loss": 3.1537868976593018, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 19.73959161900953, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.38870173692703247, "logits/rejected": -0.2058717906475067, "logps/chosen": -3.182220935821533, "logps/rejected": -4.362703323364258, "loss": 0.4582, "rewards/accuracies": 0.8125, "rewards/chosen": -3.182220935821533, "rewards/margins": 1.180482029914856, "rewards/rejected": -4.362703323364258, "sft_loss": 3.3152670860290527, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 15.048704309405935, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.28820163011550903, "logits/rejected": -0.08687031269073486, "logps/chosen": -3.045510768890381, "logps/rejected": -4.212818145751953, "loss": 0.4475, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.045510768890381, "rewards/margins": 1.1673071384429932, "rewards/rejected": -4.212818145751953, "sft_loss": 3.289644718170166, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 14.31299688757163, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.2321111410856247, "logits/rejected": -0.06788916885852814, "logps/chosen": -2.9039368629455566, "logps/rejected": -4.054717063903809, "loss": 0.4518, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.9039368629455566, "rewards/margins": 1.1507799625396729, "rewards/rejected": -4.054717063903809, "sft_loss": 3.1044628620147705, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 13.386343057348473, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.39123016595840454, "logits/rejected": -0.12693606317043304, "logps/chosen": -3.091785430908203, "logps/rejected": -4.274016380310059, "loss": 0.4142, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.091785430908203, "rewards/margins": 1.1822311878204346, "rewards/rejected": -4.274016380310059, "sft_loss": 3.207846164703369, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 16.329522011469113, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.32532352209091187, "logits/rejected": -0.10557621717453003, "logps/chosen": -3.1879098415374756, "logps/rejected": -4.119997024536133, "loss": 0.5105, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.1879098415374756, "rewards/margins": 0.9320871233940125, "rewards/rejected": -4.119997024536133, "sft_loss": 3.367783784866333, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 18.388836364712105, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.27799874544143677, "logits/rejected": -0.05749162286520004, "logps/chosen": -3.1412248611450195, "logps/rejected": -4.446188926696777, "loss": 0.4546, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1412248611450195, "rewards/margins": 1.304964542388916, "rewards/rejected": -4.446188926696777, "sft_loss": 3.2613327503204346, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 16.88347966335065, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.23603376746177673, "logits/rejected": -0.07529838383197784, "logps/chosen": -3.112762928009033, "logps/rejected": -4.320590019226074, "loss": 0.4491, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.112762928009033, "rewards/margins": 1.2078269720077515, "rewards/rejected": -4.320590019226074, "sft_loss": 3.2125473022460938, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 19.52573396013767, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.40236061811447144, "logits/rejected": -0.22329147160053253, "logps/chosen": -3.084240436553955, "logps/rejected": -4.158616065979004, "loss": 0.4653, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.084240436553955, "rewards/margins": 1.0743753910064697, "rewards/rejected": -4.158616065979004, "sft_loss": 3.236417293548584, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 16.765985019585877, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.3749734163284302, "logits/rejected": -0.2466563880443573, "logps/chosen": -2.95463490486145, "logps/rejected": -4.059712886810303, "loss": 0.4625, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.95463490486145, "rewards/margins": 1.1050784587860107, "rewards/rejected": -4.059712886810303, "sft_loss": 3.155550479888916, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 17.908687383871158, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.31655049324035645, "logits/rejected": -0.19131407141685486, "logps/chosen": -3.113335609436035, "logps/rejected": -4.204074859619141, "loss": 0.477, "rewards/accuracies": 0.75, "rewards/chosen": -3.113335609436035, "rewards/margins": 1.0907394886016846, "rewards/rejected": -4.204074859619141, "sft_loss": 3.320544719696045, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 18.39441900762111, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.3523116707801819, "logits/rejected": -0.15895763039588928, "logps/chosen": -2.8050246238708496, "logps/rejected": -3.9845402240753174, "loss": 0.4351, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.8050246238708496, "rewards/margins": 1.1795158386230469, "rewards/rejected": -3.9845402240753174, "sft_loss": 2.912895679473877, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 13.819702075148708, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.3338176906108856, "logits/rejected": -0.19842395186424255, "logps/chosen": -2.7931671142578125, "logps/rejected": -4.065673828125, "loss": 0.418, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.7931671142578125, "rewards/margins": 1.2725064754486084, "rewards/rejected": -4.065673828125, "sft_loss": 3.0094237327575684, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 18.610614155493714, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.23861142992973328, "logits/rejected": -0.12467072159051895, "logps/chosen": -3.0036704540252686, "logps/rejected": -4.0125226974487305, "loss": 0.4999, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0036704540252686, "rewards/margins": 1.0088523626327515, "rewards/rejected": -4.0125226974487305, "sft_loss": 3.170523166656494, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 15.426117287104132, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.3244778513908386, "logits/rejected": -0.062110286206007004, "logps/chosen": -3.009836196899414, "logps/rejected": -4.056250095367432, "loss": 0.4907, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.009836196899414, "rewards/margins": 1.0464141368865967, "rewards/rejected": -4.056250095367432, "sft_loss": 3.0853304862976074, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 14.699009607152908, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.2429141104221344, "logits/rejected": -0.20059914886951447, "logps/chosen": -3.0386226177215576, "logps/rejected": -4.1500349044799805, "loss": 0.4483, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.0386226177215576, "rewards/margins": 1.1114122867584229, "rewards/rejected": -4.1500349044799805, "sft_loss": 3.1930127143859863, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 15.620716890065738, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.24442140758037567, "logits/rejected": -0.1320722997188568, "logps/chosen": -3.105576276779175, "logps/rejected": -4.1901750564575195, "loss": 0.494, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.105576276779175, "rewards/margins": 1.0845987796783447, "rewards/rejected": -4.1901750564575195, "sft_loss": 3.2068934440612793, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 16.207557805252748, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.27643728256225586, "logits/rejected": -0.1240261048078537, "logps/chosen": -2.892338275909424, "logps/rejected": -3.966510772705078, "loss": 0.4534, "rewards/accuracies": 0.78125, "rewards/chosen": -2.892338275909424, "rewards/margins": 1.0741727352142334, "rewards/rejected": -3.966510772705078, "sft_loss": 3.089568614959717, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 21.10890673694221, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.3095929026603699, "logits/rejected": -0.13495635986328125, "logps/chosen": -3.2057957649230957, "logps/rejected": -4.216448783874512, "loss": 0.5132, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.2057957649230957, "rewards/margins": 1.0106537342071533, "rewards/rejected": -4.216448783874512, "sft_loss": 3.349108934402466, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 15.675139661938122, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.2906665503978729, "logits/rejected": -0.11295589059591293, "logps/chosen": -3.0297131538391113, "logps/rejected": -4.311059951782227, "loss": 0.4317, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.0297131538391113, "rewards/margins": 1.2813465595245361, "rewards/rejected": -4.311059951782227, "sft_loss": 3.1101412773132324, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 14.984845149493891, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.2796470522880554, "logits/rejected": -0.17265164852142334, "logps/chosen": -2.9086081981658936, "logps/rejected": -4.126116752624512, "loss": 0.4417, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9086081981658936, "rewards/margins": 1.2175090312957764, "rewards/rejected": -4.126116752624512, "sft_loss": 3.0992932319641113, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 12.670306936276564, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.37444189190864563, "logits/rejected": -0.1644412726163864, "logps/chosen": -3.0525717735290527, "logps/rejected": -4.331545829772949, "loss": 0.446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0525717735290527, "rewards/margins": 1.2789738178253174, "rewards/rejected": -4.331545829772949, "sft_loss": 3.174462080001831, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 13.338592047448122, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.284696489572525, "logits/rejected": -0.1664656102657318, "logps/chosen": -3.0490806102752686, "logps/rejected": -4.257328987121582, "loss": 0.4521, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.0490806102752686, "rewards/margins": 1.208248496055603, "rewards/rejected": -4.257328987121582, "sft_loss": 3.1560282707214355, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 22.165854005873395, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.2826927900314331, "logits/rejected": -0.18620052933692932, "logps/chosen": -3.007849931716919, "logps/rejected": -4.2933125495910645, "loss": 0.4446, "rewards/accuracies": 0.78125, "rewards/chosen": -3.007849931716919, "rewards/margins": 1.285462737083435, "rewards/rejected": -4.2933125495910645, "sft_loss": 3.0982885360717773, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 18.244244282787054, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.30995243787765503, "logits/rejected": -0.10001242160797119, "logps/chosen": -3.2020652294158936, "logps/rejected": -4.1528000831604, "loss": 0.5172, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.2020652294158936, "rewards/margins": 0.9507347941398621, "rewards/rejected": -4.1528000831604, "sft_loss": 3.400285005569458, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 25.00130870325128, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.2304377257823944, "logits/rejected": -0.05512354522943497, "logps/chosen": -3.052947759628296, "logps/rejected": -4.249530792236328, "loss": 0.4534, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.052947759628296, "rewards/margins": 1.1965830326080322, "rewards/rejected": -4.249530792236328, "sft_loss": 3.1837849617004395, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 17.54566506904545, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.341848224401474, "logits/rejected": -0.2598082721233368, "logps/chosen": -3.0490105152130127, "logps/rejected": -4.387795448303223, "loss": 0.4447, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0490105152130127, "rewards/margins": 1.33878493309021, "rewards/rejected": -4.387795448303223, "sft_loss": 3.2051093578338623, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 13.39085503782966, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.2547725439071655, "logits/rejected": -0.13449934124946594, "logps/chosen": -3.0368006229400635, "logps/rejected": -4.475960731506348, "loss": 0.3986, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.0368006229400635, "rewards/margins": 1.4391599893569946, "rewards/rejected": -4.475960731506348, "sft_loss": 3.2502083778381348, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 20.65963530181319, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.293484628200531, "logits/rejected": -0.12475170940160751, "logps/chosen": -3.2072854042053223, "logps/rejected": -4.5082550048828125, "loss": 0.4329, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2072854042053223, "rewards/margins": 1.3009698390960693, "rewards/rejected": -4.5082550048828125, "sft_loss": 3.502829074859619, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 16.427496657913935, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.2562063932418823, "logits/rejected": -0.1234719380736351, "logps/chosen": -3.096013307571411, "logps/rejected": -4.341220378875732, "loss": 0.496, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.096013307571411, "rewards/margins": 1.2452070713043213, "rewards/rejected": -4.341220378875732, "sft_loss": 3.4139180183410645, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 17.20178783513042, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.2581852078437805, "logits/rejected": -0.09759654849767685, "logps/chosen": -2.892961025238037, "logps/rejected": -4.1011962890625, "loss": 0.418, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.892961025238037, "rewards/margins": 1.2082349061965942, "rewards/rejected": -4.1011962890625, "sft_loss": 3.0654709339141846, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 15.628020604945597, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.3146493136882782, "logits/rejected": -0.20411427319049835, "logps/chosen": -2.870588779449463, "logps/rejected": -4.039918899536133, "loss": 0.4605, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.870588779449463, "rewards/margins": 1.1693298816680908, "rewards/rejected": -4.039918899536133, "sft_loss": 3.2099366188049316, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 23.53305266958996, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.3712599277496338, "logits/rejected": -0.23454952239990234, "logps/chosen": -2.9788217544555664, "logps/rejected": -4.112313747406006, "loss": 0.4674, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.9788217544555664, "rewards/margins": 1.1334917545318604, "rewards/rejected": -4.112313747406006, "sft_loss": 3.238032579421997, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 16.200415464439846, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.38899844884872437, "logits/rejected": -0.23052570223808289, "logps/chosen": -2.9085559844970703, "logps/rejected": -4.126476287841797, "loss": 0.4187, "rewards/accuracies": 0.84375, "rewards/chosen": -2.9085559844970703, "rewards/margins": 1.217919945716858, "rewards/rejected": -4.126476287841797, "sft_loss": 3.132606267929077, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 22.72340929322911, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.4118987023830414, "logits/rejected": -0.12402470409870148, "logps/chosen": -3.095353126525879, "logps/rejected": -4.235506534576416, "loss": 0.4774, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.095353126525879, "rewards/margins": 1.140153169631958, "rewards/rejected": -4.235506534576416, "sft_loss": 3.2529025077819824, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 21.212523140842713, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.31405314803123474, "logits/rejected": -0.21102198958396912, "logps/chosen": -2.9583473205566406, "logps/rejected": -4.192997932434082, "loss": 0.4683, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.9583473205566406, "rewards/margins": 1.2346506118774414, "rewards/rejected": -4.192997932434082, "sft_loss": 3.1199488639831543, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.025688041001558304, "eval_logits/rejected": 0.13502153754234314, "eval_logps/chosen": -3.158782958984375, "eval_logps/rejected": -4.109988212585449, "eval_loss": 0.5601091980934143, "eval_rewards/accuracies": 0.719584584236145, "eval_rewards/chosen": -3.158782958984375, "eval_rewards/margins": 0.9512055516242981, "eval_rewards/rejected": -4.109988212585449, "eval_runtime": 51.6293, "eval_samples_per_second": 26.051, "eval_sft_loss": 3.3500845432281494, "eval_steps_per_second": 6.527, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 16.63182385862367, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.2870718538761139, "logits/rejected": -0.05029254034161568, "logps/chosen": -3.2278294563293457, "logps/rejected": -4.264133453369141, "loss": 0.4955, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.2278294563293457, "rewards/margins": 1.0363037586212158, "rewards/rejected": -4.264133453369141, "sft_loss": 3.312840223312378, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 16.68456462998088, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.3568572402000427, "logits/rejected": -0.23159785568714142, "logps/chosen": -2.974052906036377, "logps/rejected": -3.96720814704895, "loss": 0.4973, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.974052906036377, "rewards/margins": 0.9931553602218628, "rewards/rejected": -3.96720814704895, "sft_loss": 3.1371753215789795, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 17.55953407548012, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.32833558320999146, "logits/rejected": -0.0644773468375206, "logps/chosen": -3.0109498500823975, "logps/rejected": -4.042330265045166, "loss": 0.5032, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.0109498500823975, "rewards/margins": 1.031380295753479, "rewards/rejected": -4.042330265045166, "sft_loss": 3.1866564750671387, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 12.996125025597934, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.4082667827606201, "logits/rejected": -0.2362288236618042, "logps/chosen": -3.1122148036956787, "logps/rejected": -4.284976005554199, "loss": 0.448, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1122148036956787, "rewards/margins": 1.1727612018585205, "rewards/rejected": -4.284976005554199, "sft_loss": 3.294731616973877, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 21.344050643418907, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.311269611120224, "logits/rejected": -0.1855960339307785, "logps/chosen": -2.9530930519104004, "logps/rejected": -3.987037181854248, "loss": 0.4904, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.9530930519104004, "rewards/margins": 1.0339438915252686, "rewards/rejected": -3.987037181854248, "sft_loss": 3.1352100372314453, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 14.0250481619352, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.36123383045196533, "logits/rejected": -0.24479889869689941, "logps/chosen": -2.9066200256347656, "logps/rejected": -4.128986358642578, "loss": 0.4004, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.9066200256347656, "rewards/margins": 1.2223665714263916, "rewards/rejected": -4.128986358642578, "sft_loss": 3.0532615184783936, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 21.136200769220174, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.3215072453022003, "logits/rejected": -0.2983446717262268, "logps/chosen": -2.8843977451324463, "logps/rejected": -3.8361854553222656, "loss": 0.4983, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.8843977451324463, "rewards/margins": 0.9517875909805298, "rewards/rejected": -3.8361854553222656, "sft_loss": 3.0769405364990234, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 18.318502665383225, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.31124424934387207, "logits/rejected": -0.2860221564769745, "logps/chosen": -2.903745412826538, "logps/rejected": -4.019559383392334, "loss": 0.4356, "rewards/accuracies": 0.8125, "rewards/chosen": -2.903745412826538, "rewards/margins": 1.115814447402954, "rewards/rejected": -4.019559383392334, "sft_loss": 3.0267748832702637, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 17.55774246027213, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.2943703532218933, "logits/rejected": -0.14766912162303925, "logps/chosen": -2.9912478923797607, "logps/rejected": -4.015740394592285, "loss": 0.4922, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.9912478923797607, "rewards/margins": 1.0244930982589722, "rewards/rejected": -4.015740394592285, "sft_loss": 3.1674442291259766, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 22.372909429762434, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.38501840829849243, "logits/rejected": -0.2358318269252777, "logps/chosen": -3.215186357498169, "logps/rejected": -4.143369674682617, "loss": 0.5279, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.215186357498169, "rewards/margins": 0.9281827807426453, "rewards/rejected": -4.143369674682617, "sft_loss": 3.4197921752929688, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 21.92536904942618, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.25042837858200073, "logits/rejected": -0.1560048609972, "logps/chosen": -3.093291997909546, "logps/rejected": -3.9603614807128906, "loss": 0.5049, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.093291997909546, "rewards/margins": 0.8670692443847656, "rewards/rejected": -3.9603614807128906, "sft_loss": 3.21748685836792, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 17.27114805230743, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.23693613708019257, "logits/rejected": -0.05825047567486763, "logps/chosen": -3.1077284812927246, "logps/rejected": -4.1357502937316895, "loss": 0.5128, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.1077284812927246, "rewards/margins": 1.0280214548110962, "rewards/rejected": -4.1357502937316895, "sft_loss": 3.373389482498169, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 17.350160855736583, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.35128170251846313, "logits/rejected": -0.25272828340530396, "logps/chosen": -3.0533649921417236, "logps/rejected": -4.274240493774414, "loss": 0.4319, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0533649921417236, "rewards/margins": 1.22087562084198, "rewards/rejected": -4.274240493774414, "sft_loss": 3.2763259410858154, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 24.237254411981112, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.2499866485595703, "logits/rejected": -0.024306219071149826, "logps/chosen": -3.124173164367676, "logps/rejected": -4.354640960693359, "loss": 0.4902, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.124173164367676, "rewards/margins": 1.2304680347442627, "rewards/rejected": -4.354640960693359, "sft_loss": 3.3544716835021973, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 16.256757634082003, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.31078004837036133, "logits/rejected": -0.0933656319975853, "logps/chosen": -3.0529751777648926, "logps/rejected": -4.145366668701172, "loss": 0.4455, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.0529751777648926, "rewards/margins": 1.0923912525177002, "rewards/rejected": -4.145366668701172, "sft_loss": 3.150514841079712, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 16.07676660011823, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.35007601976394653, "logits/rejected": -0.19697776436805725, "logps/chosen": -3.1284284591674805, "logps/rejected": -4.333003520965576, "loss": 0.4519, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.1284284591674805, "rewards/margins": 1.204574704170227, "rewards/rejected": -4.333003520965576, "sft_loss": 3.2495639324188232, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 14.043602735428511, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.27686479687690735, "logits/rejected": -0.19484971463680267, "logps/chosen": -2.907302141189575, "logps/rejected": -4.04062032699585, "loss": 0.4381, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.907302141189575, "rewards/margins": 1.1333180665969849, "rewards/rejected": -4.04062032699585, "sft_loss": 3.009895086288452, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 16.392123117224134, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.41479548811912537, "logits/rejected": -0.18581287562847137, "logps/chosen": -2.9701197147369385, "logps/rejected": -4.102685928344727, "loss": 0.4516, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.9701197147369385, "rewards/margins": 1.1325660943984985, "rewards/rejected": -4.102685928344727, "sft_loss": 3.1302144527435303, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 23.254534911552202, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.36941179633140564, "logits/rejected": -0.25071144104003906, "logps/chosen": -3.2064356803894043, "logps/rejected": -4.208579063415527, "loss": 0.4821, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.2064356803894043, "rewards/margins": 1.002143383026123, "rewards/rejected": -4.208579063415527, "sft_loss": 3.360086441040039, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 21.084331152299193, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.2859252691268921, "logits/rejected": -0.09964105486869812, "logps/chosen": -2.9344687461853027, "logps/rejected": -4.217326641082764, "loss": 0.3965, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.9344687461853027, "rewards/margins": 1.2828583717346191, "rewards/rejected": -4.217326641082764, "sft_loss": 3.188898801803589, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 16.571221681014507, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.2801959812641144, "logits/rejected": -0.0652405172586441, "logps/chosen": -3.313950300216675, "logps/rejected": -4.442086219787598, "loss": 0.4633, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.313950300216675, "rewards/margins": 1.1281362771987915, "rewards/rejected": -4.442086219787598, "sft_loss": 3.3639557361602783, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 26.337516275744846, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.28500619530677795, "logits/rejected": -0.04683919996023178, "logps/chosen": -3.06573486328125, "logps/rejected": -4.341249942779541, "loss": 0.4386, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.06573486328125, "rewards/margins": 1.2755151987075806, "rewards/rejected": -4.341249942779541, "sft_loss": 3.2408344745635986, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 18.375904597568343, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.26755040884017944, "logits/rejected": -0.1529216468334198, "logps/chosen": -3.136706590652466, "logps/rejected": -4.274568557739258, "loss": 0.4523, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.136706590652466, "rewards/margins": 1.1378618478775024, "rewards/rejected": -4.274568557739258, "sft_loss": 3.320249557495117, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 21.94889236681124, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.29934266209602356, "logits/rejected": -0.1676492989063263, "logps/chosen": -3.0635173320770264, "logps/rejected": -4.196096420288086, "loss": 0.4547, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.0635173320770264, "rewards/margins": 1.1325792074203491, "rewards/rejected": -4.196096420288086, "sft_loss": 3.3024845123291016, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 14.760395037648353, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.3397952616214752, "logits/rejected": -0.10662896931171417, "logps/chosen": -3.0589253902435303, "logps/rejected": -4.455246925354004, "loss": 0.4053, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.0589253902435303, "rewards/margins": 1.396321177482605, "rewards/rejected": -4.455246925354004, "sft_loss": 3.2559409141540527, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 16.322852975390248, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.2383042573928833, "logits/rejected": -0.1254793107509613, "logps/chosen": -3.2248597145080566, "logps/rejected": -4.4617018699646, "loss": 0.4697, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2248597145080566, "rewards/margins": 1.236842393875122, "rewards/rejected": -4.4617018699646, "sft_loss": 3.446223735809326, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 14.37152358991461, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.2022382915019989, "logits/rejected": -0.20282498002052307, "logps/chosen": -3.195924758911133, "logps/rejected": -4.3802595138549805, "loss": 0.5141, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.195924758911133, "rewards/margins": 1.1843347549438477, "rewards/rejected": -4.3802595138549805, "sft_loss": 3.360313892364502, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 17.521945693795125, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.4308474659919739, "logits/rejected": -0.34627005457878113, "logps/chosen": -3.1027534008026123, "logps/rejected": -4.180264472961426, "loss": 0.4843, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.1027534008026123, "rewards/margins": 1.077511191368103, "rewards/rejected": -4.180264472961426, "sft_loss": 3.237123966217041, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 16.896194855981122, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.18039865791797638, "logits/rejected": -0.026836853474378586, "logps/chosen": -3.3387451171875, "logps/rejected": -4.3841986656188965, "loss": 0.5017, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.3387451171875, "rewards/margins": 1.0454537868499756, "rewards/rejected": -4.3841986656188965, "sft_loss": 3.483057737350464, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 16.441757226523716, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.2857555150985718, "logits/rejected": -0.15832646191120148, "logps/chosen": -2.8970754146575928, "logps/rejected": -3.9910411834716797, "loss": 0.4743, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8970754146575928, "rewards/margins": 1.0939652919769287, "rewards/rejected": -3.9910411834716797, "sft_loss": 3.0792059898376465, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 19.01573807202738, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.26029425859451294, "logits/rejected": -0.2442263811826706, "logps/chosen": -3.0544180870056152, "logps/rejected": -4.162773132324219, "loss": 0.4682, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0544180870056152, "rewards/margins": 1.108355164527893, "rewards/rejected": -4.162773132324219, "sft_loss": 3.1890597343444824, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 24.025395376054902, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.3096083402633667, "logits/rejected": -0.1437438279390335, "logps/chosen": -3.1456522941589355, "logps/rejected": -4.235614776611328, "loss": 0.4625, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1456522941589355, "rewards/margins": 1.0899627208709717, "rewards/rejected": -4.235614776611328, "sft_loss": 3.3048198223114014, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 18.65213393676238, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.2922806739807129, "logits/rejected": -0.129634290933609, "logps/chosen": -3.0212314128875732, "logps/rejected": -4.285717487335205, "loss": 0.4529, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0212314128875732, "rewards/margins": 1.2644855976104736, "rewards/rejected": -4.285717487335205, "sft_loss": 3.2442543506622314, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 29.2513496192298, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.1702096164226532, "logits/rejected": -0.0935095027089119, "logps/chosen": -3.0341057777404785, "logps/rejected": -4.166206359863281, "loss": 0.4668, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0341057777404785, "rewards/margins": 1.13210129737854, "rewards/rejected": -4.166206359863281, "sft_loss": 3.3285961151123047, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 23.00704024821283, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.31321150064468384, "logits/rejected": -0.2151360958814621, "logps/chosen": -2.9937045574188232, "logps/rejected": -3.9885659217834473, "loss": 0.4813, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.9937045574188232, "rewards/margins": 0.9948616027832031, "rewards/rejected": -3.9885659217834473, "sft_loss": 3.190828561782837, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 17.89894309314384, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.34084606170654297, "logits/rejected": -0.1118582934141159, "logps/chosen": -3.0107243061065674, "logps/rejected": -4.235139846801758, "loss": 0.4364, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0107243061065674, "rewards/margins": 1.2244160175323486, "rewards/rejected": -4.235139846801758, "sft_loss": 3.230741500854492, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 22.70863676615316, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.3982570767402649, "logits/rejected": -0.34550541639328003, "logps/chosen": -2.9199929237365723, "logps/rejected": -4.091763019561768, "loss": 0.4763, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9199929237365723, "rewards/margins": 1.1717698574066162, "rewards/rejected": -4.091763019561768, "sft_loss": 3.0896589756011963, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 17.986620779251524, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.20317812263965607, "logits/rejected": -0.122073695063591, "logps/chosen": -2.9494340419769287, "logps/rejected": -4.093937397003174, "loss": 0.4748, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9494340419769287, "rewards/margins": 1.1445037126541138, "rewards/rejected": -4.093937397003174, "sft_loss": 3.065373420715332, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 15.153617752967738, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.43922024965286255, "logits/rejected": -0.2848798632621765, "logps/chosen": -2.9392600059509277, "logps/rejected": -3.8739826679229736, "loss": 0.4935, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.9392600059509277, "rewards/margins": 0.934722900390625, "rewards/rejected": -3.8739826679229736, "sft_loss": 3.1426877975463867, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 17.0434905367743, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.18671448528766632, "logits/rejected": -0.1985694319009781, "logps/chosen": -3.104698657989502, "logps/rejected": -3.9841995239257812, "loss": 0.5454, "rewards/accuracies": 0.75, "rewards/chosen": -3.104698657989502, "rewards/margins": 0.8795011639595032, "rewards/rejected": -3.9841995239257812, "sft_loss": 3.2671265602111816, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 15.5388913454973, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.364023357629776, "logits/rejected": -0.26746666431427, "logps/chosen": -2.8987088203430176, "logps/rejected": -3.918377637863159, "loss": 0.4505, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.8987088203430176, "rewards/margins": 1.0196691751480103, "rewards/rejected": -3.918377637863159, "sft_loss": 3.0810458660125732, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 15.16323308061328, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.36861300468444824, "logits/rejected": -0.22601374983787537, "logps/chosen": -2.9905242919921875, "logps/rejected": -4.286072254180908, "loss": 0.4379, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.9905242919921875, "rewards/margins": 1.2955482006072998, "rewards/rejected": -4.286072254180908, "sft_loss": 3.1793177127838135, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 17.123638355788778, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.32376617193222046, "logits/rejected": -0.15093651413917542, "logps/chosen": -2.9996771812438965, "logps/rejected": -4.0844011306762695, "loss": 0.4765, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.9996771812438965, "rewards/margins": 1.0847234725952148, "rewards/rejected": -4.0844011306762695, "sft_loss": 3.0887672901153564, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 13.627329026221213, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.38504481315612793, "logits/rejected": -0.15042896568775177, "logps/chosen": -2.893672227859497, "logps/rejected": -4.146300792694092, "loss": 0.4252, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.893672227859497, "rewards/margins": 1.2526286840438843, "rewards/rejected": -4.146300792694092, "sft_loss": 3.088972568511963, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 19.486732275129167, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.3545922338962555, "logits/rejected": -0.10276027023792267, "logps/chosen": -3.0243966579437256, "logps/rejected": -4.108520030975342, "loss": 0.4833, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0243966579437256, "rewards/margins": 1.0841232538223267, "rewards/rejected": -4.108520030975342, "sft_loss": 3.1572792530059814, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 16.72094357730139, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.35756057500839233, "logits/rejected": -0.2667251229286194, "logps/chosen": -3.001896619796753, "logps/rejected": -4.1554274559021, "loss": 0.4724, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.001896619796753, "rewards/margins": 1.1535308361053467, "rewards/rejected": -4.1554274559021, "sft_loss": 3.0512232780456543, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 15.281841973742925, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.2982345223426819, "logits/rejected": -0.19628144800662994, "logps/chosen": -3.0487236976623535, "logps/rejected": -4.0739336013793945, "loss": 0.5052, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0487236976623535, "rewards/margins": 1.0252102613449097, "rewards/rejected": -4.0739336013793945, "sft_loss": 3.2537829875946045, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 15.820741935754505, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.31810635328292847, "logits/rejected": -0.20068666338920593, "logps/chosen": -2.9168238639831543, "logps/rejected": -4.0605692863464355, "loss": 0.4363, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.9168238639831543, "rewards/margins": 1.1437454223632812, "rewards/rejected": -4.0605692863464355, "sft_loss": 3.097944974899292, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 20.775218753242996, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.3695822060108185, "logits/rejected": -0.21076634526252747, "logps/chosen": -3.0002455711364746, "logps/rejected": -4.147057056427002, "loss": 0.4466, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.0002455711364746, "rewards/margins": 1.1468111276626587, "rewards/rejected": -4.147057056427002, "sft_loss": 3.1657638549804688, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 16.731510328798766, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.35614705085754395, "logits/rejected": -0.1391567438840866, "logps/chosen": -3.073739528656006, "logps/rejected": -4.305612564086914, "loss": 0.4326, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.073739528656006, "rewards/margins": 1.2318732738494873, "rewards/rejected": -4.305612564086914, "sft_loss": 3.16626238822937, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 17.387501097476846, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.3530198633670807, "logits/rejected": -0.07886115461587906, "logps/chosen": -3.11734938621521, "logps/rejected": -4.312512397766113, "loss": 0.4423, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.11734938621521, "rewards/margins": 1.1951625347137451, "rewards/rejected": -4.312512397766113, "sft_loss": 3.1892035007476807, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 19.201864047625754, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.29020678997039795, "logits/rejected": -0.19367562234401703, "logps/chosen": -3.008631944656372, "logps/rejected": -4.115485191345215, "loss": 0.4694, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.008631944656372, "rewards/margins": 1.106853723526001, "rewards/rejected": -4.115485191345215, "sft_loss": 3.0630905628204346, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 14.832985219659655, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.3708581328392029, "logits/rejected": -0.17783014476299286, "logps/chosen": -3.0939762592315674, "logps/rejected": -4.41884708404541, "loss": 0.4437, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0939762592315674, "rewards/margins": 1.3248708248138428, "rewards/rejected": -4.41884708404541, "sft_loss": 3.2989234924316406, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 18.326357383114857, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.2677830159664154, "logits/rejected": -0.009764463640749454, "logps/chosen": -3.1972920894622803, "logps/rejected": -4.31788444519043, "loss": 0.4879, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.1972920894622803, "rewards/margins": 1.1205923557281494, "rewards/rejected": -4.31788444519043, "sft_loss": 3.2770209312438965, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 15.356981651869333, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.3357272744178772, "logits/rejected": -0.19809041917324066, "logps/chosen": -3.137024164199829, "logps/rejected": -4.19732666015625, "loss": 0.4936, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.137024164199829, "rewards/margins": 1.060302734375, "rewards/rejected": -4.19732666015625, "sft_loss": 3.298842668533325, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 20.46982568740184, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.3426167368888855, "logits/rejected": -0.22665449976921082, "logps/chosen": -3.0479929447174072, "logps/rejected": -4.279765605926514, "loss": 0.4209, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0479929447174072, "rewards/margins": 1.2317724227905273, "rewards/rejected": -4.279765605926514, "sft_loss": 3.211376667022705, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 29.92794653237818, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.29240962862968445, "logits/rejected": -0.2794325351715088, "logps/chosen": -3.042105197906494, "logps/rejected": -4.291446685791016, "loss": 0.4216, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.042105197906494, "rewards/margins": 1.249341607093811, "rewards/rejected": -4.291446685791016, "sft_loss": 3.2021212577819824, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 28.274575534360725, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.3965105712413788, "logits/rejected": -0.2133466899394989, "logps/chosen": -3.2141833305358887, "logps/rejected": -4.305365562438965, "loss": 0.4813, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.2141833305358887, "rewards/margins": 1.0911824703216553, "rewards/rejected": -4.305365562438965, "sft_loss": 3.2878317832946777, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 19.428228118524952, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.3674038350582123, "logits/rejected": -0.2661629319190979, "logps/chosen": -3.1580593585968018, "logps/rejected": -4.163644790649414, "loss": 0.4719, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.1580593585968018, "rewards/margins": 1.0055850744247437, "rewards/rejected": -4.163644790649414, "sft_loss": 3.2378013134002686, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 15.088312796626367, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.28699201345443726, "logits/rejected": -0.14260557293891907, "logps/chosen": -3.083094835281372, "logps/rejected": -4.206784248352051, "loss": 0.4303, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.083094835281372, "rewards/margins": 1.1236896514892578, "rewards/rejected": -4.206784248352051, "sft_loss": 3.1587040424346924, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 15.085807412977193, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.398585706949234, "logits/rejected": -0.26450929045677185, "logps/chosen": -3.036803722381592, "logps/rejected": -4.33641242980957, "loss": 0.435, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.036803722381592, "rewards/margins": 1.2996087074279785, "rewards/rejected": -4.33641242980957, "sft_loss": 3.2943191528320312, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 17.875756064699107, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.3602619469165802, "logits/rejected": -0.14143213629722595, "logps/chosen": -3.187171459197998, "logps/rejected": -4.539390563964844, "loss": 0.4034, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.187171459197998, "rewards/margins": 1.352218508720398, "rewards/rejected": -4.539390563964844, "sft_loss": 3.360614061355591, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 19.254669989616364, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.3277217745780945, "logits/rejected": -0.14917297661304474, "logps/chosen": -3.2284178733825684, "logps/rejected": -4.194758415222168, "loss": 0.5259, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.2284178733825684, "rewards/margins": 0.9663406610488892, "rewards/rejected": -4.194758415222168, "sft_loss": 3.447810411453247, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 15.662502888489064, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.34191417694091797, "logits/rejected": -0.16648179292678833, "logps/chosen": -3.153449535369873, "logps/rejected": -4.330137252807617, "loss": 0.4379, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.153449535369873, "rewards/margins": 1.1766880750656128, "rewards/rejected": -4.330137252807617, "sft_loss": 3.2555534839630127, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 28.49248750066988, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.31089869141578674, "logits/rejected": -0.131780743598938, "logps/chosen": -3.172286033630371, "logps/rejected": -4.270587921142578, "loss": 0.5245, "rewards/accuracies": 0.71875, "rewards/chosen": -3.172286033630371, "rewards/margins": 1.0983017683029175, "rewards/rejected": -4.270587921142578, "sft_loss": 3.335955858230591, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 19.18597001532169, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.33054202795028687, "logits/rejected": -0.16461774706840515, "logps/chosen": -3.1897456645965576, "logps/rejected": -4.276345252990723, "loss": 0.4839, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.1897456645965576, "rewards/margins": 1.0865994691848755, "rewards/rejected": -4.276345252990723, "sft_loss": 3.2617619037628174, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 15.951155250872317, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.304962694644928, "logits/rejected": -0.2025957554578781, "logps/chosen": -3.0752673149108887, "logps/rejected": -4.30559778213501, "loss": 0.4418, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0752673149108887, "rewards/margins": 1.230330228805542, "rewards/rejected": -4.30559778213501, "sft_loss": 3.1467199325561523, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 20.887886259722755, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.3699023127555847, "logits/rejected": -0.15047678351402283, "logps/chosen": -3.0767436027526855, "logps/rejected": -4.3717427253723145, "loss": 0.408, "rewards/accuracies": 0.84375, "rewards/chosen": -3.0767436027526855, "rewards/margins": 1.2949992418289185, "rewards/rejected": -4.3717427253723145, "sft_loss": 3.279715061187744, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 34.759432720467146, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.33491817116737366, "logits/rejected": -0.2518042325973511, "logps/chosen": -3.222126007080078, "logps/rejected": -4.264901161193848, "loss": 0.5152, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.222126007080078, "rewards/margins": 1.0427753925323486, "rewards/rejected": -4.264901161193848, "sft_loss": 3.2958178520202637, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 16.679103901957355, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.29535579681396484, "logits/rejected": -0.19904498755931854, "logps/chosen": -3.22019624710083, "logps/rejected": -4.493924140930176, "loss": 0.4517, "rewards/accuracies": 0.8125, "rewards/chosen": -3.22019624710083, "rewards/margins": 1.273728609085083, "rewards/rejected": -4.493924140930176, "sft_loss": 3.3317291736602783, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 19.01300702417004, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.420266330242157, "logits/rejected": -0.2640858590602875, "logps/chosen": -3.09688663482666, "logps/rejected": -4.210850715637207, "loss": 0.4362, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.09688663482666, "rewards/margins": 1.1139637231826782, "rewards/rejected": -4.210850715637207, "sft_loss": 3.282369613647461, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 18.772689720631274, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.41807642579078674, "logits/rejected": -0.16949285566806793, "logps/chosen": -3.1378002166748047, "logps/rejected": -4.288260459899902, "loss": 0.4554, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.1378002166748047, "rewards/margins": 1.1504604816436768, "rewards/rejected": -4.288260459899902, "sft_loss": 3.284740447998047, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 16.581837975657816, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.2532419264316559, "logits/rejected": -0.2221713811159134, "logps/chosen": -3.086059093475342, "logps/rejected": -4.135127067565918, "loss": 0.4878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.086059093475342, "rewards/margins": 1.0490682125091553, "rewards/rejected": -4.135127067565918, "sft_loss": 3.2261061668395996, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 20.023461445702395, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.2906354069709778, "logits/rejected": -0.15659931302070618, "logps/chosen": -2.9711108207702637, "logps/rejected": -4.29131555557251, "loss": 0.4048, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.9711108207702637, "rewards/margins": 1.320204496383667, "rewards/rejected": -4.29131555557251, "sft_loss": 3.096325635910034, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 21.821290850850822, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.41042837500572205, "logits/rejected": -0.16330023109912872, "logps/chosen": -3.022231340408325, "logps/rejected": -4.2526140213012695, "loss": 0.4597, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.022231340408325, "rewards/margins": 1.2303820848464966, "rewards/rejected": -4.2526140213012695, "sft_loss": 3.254199266433716, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 15.71800589805128, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.3562023341655731, "logits/rejected": -0.11175551265478134, "logps/chosen": -3.140465021133423, "logps/rejected": -4.293511867523193, "loss": 0.458, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.140465021133423, "rewards/margins": 1.1530468463897705, "rewards/rejected": -4.293511867523193, "sft_loss": 3.288815975189209, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 20.242450556583425, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.3815039396286011, "logits/rejected": -0.2283477783203125, "logps/chosen": -3.178576707839966, "logps/rejected": -4.190559387207031, "loss": 0.5102, "rewards/accuracies": 0.78125, "rewards/chosen": -3.178576707839966, "rewards/margins": 1.0119825601577759, "rewards/rejected": -4.190559387207031, "sft_loss": 3.2854411602020264, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 15.326371019800186, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.33895745873451233, "logits/rejected": -0.19376994669437408, "logps/chosen": -3.007366895675659, "logps/rejected": -4.102883338928223, "loss": 0.4496, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.007366895675659, "rewards/margins": 1.095516324043274, "rewards/rejected": -4.102883338928223, "sft_loss": 3.211820602416992, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 18.435973466867548, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.3354995846748352, "logits/rejected": -0.1504775583744049, "logps/chosen": -3.0240750312805176, "logps/rejected": -4.287436485290527, "loss": 0.448, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0240750312805176, "rewards/margins": 1.2633622884750366, "rewards/rejected": -4.287436485290527, "sft_loss": 3.16176438331604, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 15.865403458445352, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.35504865646362305, "logits/rejected": -0.20114044845104218, "logps/chosen": -3.141514778137207, "logps/rejected": -4.188848495483398, "loss": 0.491, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.141514778137207, "rewards/margins": 1.047333836555481, "rewards/rejected": -4.188848495483398, "sft_loss": 3.353114366531372, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.09221591055393219, "eval_logits/rejected": 0.2088293880224228, "eval_logps/chosen": -3.2269797325134277, "eval_logps/rejected": -4.211081504821777, "eval_loss": 0.5603845715522766, "eval_rewards/accuracies": 0.7203264236450195, "eval_rewards/chosen": -3.2269797325134277, "eval_rewards/margins": 0.9841019511222839, "eval_rewards/rejected": -4.211081504821777, "eval_runtime": 51.2188, "eval_samples_per_second": 26.26, "eval_sft_loss": 3.356903076171875, "eval_steps_per_second": 6.58, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 17.549190178042807, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.38595595955848694, "logits/rejected": -0.3314853608608246, "logps/chosen": -3.010838747024536, "logps/rejected": -4.1536455154418945, "loss": 0.4246, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.010838747024536, "rewards/margins": 1.1428062915802002, "rewards/rejected": -4.1536455154418945, "sft_loss": 3.1467928886413574, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 15.470684494854325, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.3064468502998352, "logits/rejected": -0.183636873960495, "logps/chosen": -3.236712694168091, "logps/rejected": -4.509716987609863, "loss": 0.44, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.236712694168091, "rewards/margins": 1.2730040550231934, "rewards/rejected": -4.509716987609863, "sft_loss": 3.392608642578125, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 16.278846121553524, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.31276029348373413, "logits/rejected": -0.12013645470142365, "logps/chosen": -3.12622332572937, "logps/rejected": -4.3513078689575195, "loss": 0.4574, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.12622332572937, "rewards/margins": 1.225084662437439, "rewards/rejected": -4.3513078689575195, "sft_loss": 3.1967005729675293, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 19.832974055308483, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.3182033598423004, "logits/rejected": -0.22087469696998596, "logps/chosen": -3.1732428073883057, "logps/rejected": -4.155479431152344, "loss": 0.4911, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1732428073883057, "rewards/margins": 0.982236385345459, "rewards/rejected": -4.155479431152344, "sft_loss": 3.2742087841033936, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 11.517550512791749, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.33021387457847595, "logits/rejected": -0.22628983855247498, "logps/chosen": -2.900446891784668, "logps/rejected": -4.092584133148193, "loss": 0.3754, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.900446891784668, "rewards/margins": 1.1921371221542358, "rewards/rejected": -4.092584133148193, "sft_loss": 3.1026883125305176, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 21.771288688400453, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.3661060333251953, "logits/rejected": -0.1493196189403534, "logps/chosen": -3.1229987144470215, "logps/rejected": -4.320777416229248, "loss": 0.4451, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.1229987144470215, "rewards/margins": 1.1977789402008057, "rewards/rejected": -4.320777416229248, "sft_loss": 3.2086873054504395, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 15.90923873125897, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.3496370017528534, "logits/rejected": -0.12278521060943604, "logps/chosen": -3.133633852005005, "logps/rejected": -4.354303359985352, "loss": 0.4414, "rewards/accuracies": 0.78125, "rewards/chosen": -3.133633852005005, "rewards/margins": 1.2206697463989258, "rewards/rejected": -4.354303359985352, "sft_loss": 3.3628973960876465, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 15.384651482718192, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.4240453839302063, "logits/rejected": -0.1185920238494873, "logps/chosen": -3.0241332054138184, "logps/rejected": -4.203446865081787, "loss": 0.4596, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0241332054138184, "rewards/margins": 1.1793134212493896, "rewards/rejected": -4.203446865081787, "sft_loss": 3.1807942390441895, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 23.54080916969624, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.36372414231300354, "logits/rejected": -0.12318293005228043, "logps/chosen": -3.158956527709961, "logps/rejected": -4.101058006286621, "loss": 0.5432, "rewards/accuracies": 0.71875, "rewards/chosen": -3.158956527709961, "rewards/margins": 0.942101001739502, "rewards/rejected": -4.101058006286621, "sft_loss": 3.2985637187957764, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 21.514102270030268, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.32725459337234497, "logits/rejected": -0.16769400238990784, "logps/chosen": -3.1743004322052, "logps/rejected": -4.455135345458984, "loss": 0.4501, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.1743004322052, "rewards/margins": 1.2808345556259155, "rewards/rejected": -4.455135345458984, "sft_loss": 3.3227474689483643, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 20.027767150897976, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.4107169508934021, "logits/rejected": -0.2752717137336731, "logps/chosen": -3.105297327041626, "logps/rejected": -4.271871089935303, "loss": 0.456, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.105297327041626, "rewards/margins": 1.1665735244750977, "rewards/rejected": -4.271871089935303, "sft_loss": 3.2842094898223877, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 14.956469716175903, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.38273295760154724, "logits/rejected": -0.15654902160167694, "logps/chosen": -3.0597941875457764, "logps/rejected": -4.213123321533203, "loss": 0.4623, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.0597941875457764, "rewards/margins": 1.1533290147781372, "rewards/rejected": -4.213123321533203, "sft_loss": 3.2771835327148438, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 18.673644430533805, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.3945019841194153, "logits/rejected": -0.17653414607048035, "logps/chosen": -3.2652783393859863, "logps/rejected": -4.370820045471191, "loss": 0.5077, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2652783393859863, "rewards/margins": 1.1055415868759155, "rewards/rejected": -4.370820045471191, "sft_loss": 3.3431732654571533, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 16.928687332642628, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.30592089891433716, "logits/rejected": -0.2027606964111328, "logps/chosen": -3.0578575134277344, "logps/rejected": -4.163485527038574, "loss": 0.502, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.0578575134277344, "rewards/margins": 1.1056289672851562, "rewards/rejected": -4.163485527038574, "sft_loss": 3.2745048999786377, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 22.804494188615138, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.36948710680007935, "logits/rejected": -0.14660361409187317, "logps/chosen": -3.161653757095337, "logps/rejected": -4.248857498168945, "loss": 0.4663, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.161653757095337, "rewards/margins": 1.0872037410736084, "rewards/rejected": -4.248857498168945, "sft_loss": 3.2454497814178467, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 18.940651879852197, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.39973369240760803, "logits/rejected": -0.24374902248382568, "logps/chosen": -3.0319912433624268, "logps/rejected": -4.062434673309326, "loss": 0.4592, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.0319912433624268, "rewards/margins": 1.0304433107376099, "rewards/rejected": -4.062434673309326, "sft_loss": 3.186066150665283, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 23.47984244284787, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.2687085270881653, "logits/rejected": -0.17982222139835358, "logps/chosen": -3.1518681049346924, "logps/rejected": -4.31063985824585, "loss": 0.4605, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1518681049346924, "rewards/margins": 1.1587715148925781, "rewards/rejected": -4.31063985824585, "sft_loss": 3.254848003387451, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 21.97239515670181, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.33787789940834045, "logits/rejected": -0.10413823276758194, "logps/chosen": -3.130115509033203, "logps/rejected": -4.395178318023682, "loss": 0.4466, "rewards/accuracies": 0.78125, "rewards/chosen": -3.130115509033203, "rewards/margins": 1.2650625705718994, "rewards/rejected": -4.395178318023682, "sft_loss": 3.320671796798706, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 16.423993441943615, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.45037755370140076, "logits/rejected": -0.2175770103931427, "logps/chosen": -3.2069408893585205, "logps/rejected": -4.270087242126465, "loss": 0.4828, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.2069408893585205, "rewards/margins": 1.0631463527679443, "rewards/rejected": -4.270087242126465, "sft_loss": 3.34574818611145, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 17.638490509172506, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.30428510904312134, "logits/rejected": -0.2593707740306854, "logps/chosen": -3.059542417526245, "logps/rejected": -4.189467906951904, "loss": 0.447, "rewards/accuracies": 0.8125, "rewards/chosen": -3.059542417526245, "rewards/margins": 1.1299254894256592, "rewards/rejected": -4.189467906951904, "sft_loss": 3.2454159259796143, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 19.793799506487687, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.3508601188659668, "logits/rejected": -0.122955322265625, "logps/chosen": -3.1088366508483887, "logps/rejected": -4.098649978637695, "loss": 0.496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1088366508483887, "rewards/margins": 0.9898133277893066, "rewards/rejected": -4.098649978637695, "sft_loss": 3.2179903984069824, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 16.759274363474006, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.372157484292984, "logits/rejected": -0.2090158760547638, "logps/chosen": -3.145791530609131, "logps/rejected": -4.174318790435791, "loss": 0.5217, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.145791530609131, "rewards/margins": 1.0285265445709229, "rewards/rejected": -4.174318790435791, "sft_loss": 3.2394118309020996, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 15.948049682773291, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.3354955315589905, "logits/rejected": -0.24566030502319336, "logps/chosen": -3.003711223602295, "logps/rejected": -4.0184173583984375, "loss": 0.5035, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.003711223602295, "rewards/margins": 1.0147063732147217, "rewards/rejected": -4.0184173583984375, "sft_loss": 3.1131107807159424, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 17.295297630257068, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.35742565989494324, "logits/rejected": -0.22281166911125183, "logps/chosen": -3.1032912731170654, "logps/rejected": -4.247357368469238, "loss": 0.4456, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1032912731170654, "rewards/margins": 1.1440660953521729, "rewards/rejected": -4.247357368469238, "sft_loss": 3.2081050872802734, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 23.203320817128144, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.39975064992904663, "logits/rejected": -0.18377183377742767, "logps/chosen": -3.1296451091766357, "logps/rejected": -4.430170059204102, "loss": 0.4169, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.1296451091766357, "rewards/margins": 1.3005244731903076, "rewards/rejected": -4.430170059204102, "sft_loss": 3.2475943565368652, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 19.048434918637117, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.36884161829948425, "logits/rejected": -0.25827229022979736, "logps/chosen": -3.074650526046753, "logps/rejected": -3.9793503284454346, "loss": 0.5143, "rewards/accuracies": 0.78125, "rewards/chosen": -3.074650526046753, "rewards/margins": 0.9046999216079712, "rewards/rejected": -3.9793503284454346, "sft_loss": 3.1442952156066895, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 20.349098775884393, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.2782624363899231, "logits/rejected": -0.17058196663856506, "logps/chosen": -2.9235804080963135, "logps/rejected": -4.07112979888916, "loss": 0.4807, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.9235804080963135, "rewards/margins": 1.1475495100021362, "rewards/rejected": -4.07112979888916, "sft_loss": 3.1867012977600098, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 17.163122523064544, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.30895841121673584, "logits/rejected": -0.17452199757099152, "logps/chosen": -2.8846898078918457, "logps/rejected": -3.919926166534424, "loss": 0.4898, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8846898078918457, "rewards/margins": 1.0352360010147095, "rewards/rejected": -3.919926166534424, "sft_loss": 3.070295810699463, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 18.635859605646804, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.27063530683517456, "logits/rejected": -0.15605516731739044, "logps/chosen": -2.980637311935425, "logps/rejected": -4.103749752044678, "loss": 0.4746, "rewards/accuracies": 0.78125, "rewards/chosen": -2.980637311935425, "rewards/margins": 1.123112440109253, "rewards/rejected": -4.103749752044678, "sft_loss": 3.122291088104248, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 19.401639550425234, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.3831943869590759, "logits/rejected": -0.2574603259563446, "logps/chosen": -2.92333984375, "logps/rejected": -4.0302629470825195, "loss": 0.4766, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.92333984375, "rewards/margins": 1.1069234609603882, "rewards/rejected": -4.0302629470825195, "sft_loss": 3.1017067432403564, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 16.231351109341652, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.3166596293449402, "logits/rejected": -0.20488278567790985, "logps/chosen": -3.1826725006103516, "logps/rejected": -4.1101884841918945, "loss": 0.5237, "rewards/accuracies": 0.75, "rewards/chosen": -3.1826725006103516, "rewards/margins": 0.9275161623954773, "rewards/rejected": -4.1101884841918945, "sft_loss": 3.377725601196289, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 22.175234926002215, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.37456607818603516, "logits/rejected": -0.14552044868469238, "logps/chosen": -3.1262221336364746, "logps/rejected": -4.358965873718262, "loss": 0.4665, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1262221336364746, "rewards/margins": 1.2327439785003662, "rewards/rejected": -4.358965873718262, "sft_loss": 3.2200629711151123, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 17.647430018000996, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.32481950521469116, "logits/rejected": -0.18392665684223175, "logps/chosen": -3.0514533519744873, "logps/rejected": -4.248579978942871, "loss": 0.4352, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0514533519744873, "rewards/margins": 1.1971266269683838, "rewards/rejected": -4.248579978942871, "sft_loss": 3.1023261547088623, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 19.93756168866044, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.2983691394329071, "logits/rejected": -0.122675821185112, "logps/chosen": -3.1254711151123047, "logps/rejected": -4.209573745727539, "loss": 0.5005, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1254711151123047, "rewards/margins": 1.0841023921966553, "rewards/rejected": -4.209573745727539, "sft_loss": 3.2371037006378174, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 19.845925220759483, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.2862624228000641, "logits/rejected": -0.19400539994239807, "logps/chosen": -2.9830963611602783, "logps/rejected": -4.230919361114502, "loss": 0.4169, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.9830963611602783, "rewards/margins": 1.2478225231170654, "rewards/rejected": -4.230919361114502, "sft_loss": 3.1368911266326904, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 21.594842070698846, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.2956594228744507, "logits/rejected": -0.22624805569648743, "logps/chosen": -2.9545702934265137, "logps/rejected": -3.8970417976379395, "loss": 0.5049, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.9545702934265137, "rewards/margins": 0.9424716830253601, "rewards/rejected": -3.8970417976379395, "sft_loss": 3.093780040740967, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 19.202639752444874, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.3196641802787781, "logits/rejected": -0.15099498629570007, "logps/chosen": -3.0403876304626465, "logps/rejected": -4.135394096374512, "loss": 0.4713, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.0403876304626465, "rewards/margins": 1.095007061958313, "rewards/rejected": -4.135394096374512, "sft_loss": 3.225785732269287, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 12.768231099980593, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.3886847496032715, "logits/rejected": -0.2385650873184204, "logps/chosen": -2.975139617919922, "logps/rejected": -3.9979240894317627, "loss": 0.4818, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.975139617919922, "rewards/margins": 1.0227844715118408, "rewards/rejected": -3.9979240894317627, "sft_loss": 3.1574254035949707, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 17.63037705851786, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.4010900855064392, "logits/rejected": -0.26809439063072205, "logps/chosen": -2.91060209274292, "logps/rejected": -4.1822333335876465, "loss": 0.4567, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.91060209274292, "rewards/margins": 1.2716315984725952, "rewards/rejected": -4.1822333335876465, "sft_loss": 3.0425546169281006, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 14.547973993460808, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.3640051782131195, "logits/rejected": -0.1315825879573822, "logps/chosen": -2.999293804168701, "logps/rejected": -4.230451583862305, "loss": 0.4298, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.999293804168701, "rewards/margins": 1.2311577796936035, "rewards/rejected": -4.230451583862305, "sft_loss": 3.197936534881592, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 16.722793874915272, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.3400874137878418, "logits/rejected": -0.16026431322097778, "logps/chosen": -3.140813112258911, "logps/rejected": -4.275707721710205, "loss": 0.4853, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.140813112258911, "rewards/margins": 1.1348944902420044, "rewards/rejected": -4.275707721710205, "sft_loss": 3.330498456954956, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 18.983221644965163, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.37416914105415344, "logits/rejected": -0.22600290179252625, "logps/chosen": -3.138129711151123, "logps/rejected": -4.2918477058410645, "loss": 0.477, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.138129711151123, "rewards/margins": 1.1537177562713623, "rewards/rejected": -4.2918477058410645, "sft_loss": 3.226738691329956, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 18.030058401465762, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.33753058314323425, "logits/rejected": -0.14018869400024414, "logps/chosen": -3.171159029006958, "logps/rejected": -4.425040245056152, "loss": 0.4658, "rewards/accuracies": 0.78125, "rewards/chosen": -3.171159029006958, "rewards/margins": 1.2538812160491943, "rewards/rejected": -4.425040245056152, "sft_loss": 3.3366711139678955, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 18.217023564165505, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.3064436912536621, "logits/rejected": -0.08399565517902374, "logps/chosen": -2.881296157836914, "logps/rejected": -4.056055545806885, "loss": 0.4144, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.881296157836914, "rewards/margins": 1.1747593879699707, "rewards/rejected": -4.056055545806885, "sft_loss": 3.1400952339172363, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 16.51985063792187, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.3931240439414978, "logits/rejected": -0.3596143126487732, "logps/chosen": -3.014737606048584, "logps/rejected": -4.183712959289551, "loss": 0.4529, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.014737606048584, "rewards/margins": 1.1689751148223877, "rewards/rejected": -4.183712959289551, "sft_loss": 3.1337687969207764, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 19.25543546277433, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.26421234011650085, "logits/rejected": -0.16148383915424347, "logps/chosen": -3.0065956115722656, "logps/rejected": -4.194592475891113, "loss": 0.4534, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0065956115722656, "rewards/margins": 1.187996745109558, "rewards/rejected": -4.194592475891113, "sft_loss": 3.2171425819396973, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 24.027527657533568, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.2581459581851959, "logits/rejected": -0.08003239333629608, "logps/chosen": -3.1716055870056152, "logps/rejected": -4.404228210449219, "loss": 0.4574, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.1716055870056152, "rewards/margins": 1.2326222658157349, "rewards/rejected": -4.404228210449219, "sft_loss": 3.2989680767059326, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 15.809736239137736, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.2888232171535492, "logits/rejected": -0.15002648532390594, "logps/chosen": -3.079983711242676, "logps/rejected": -4.162103652954102, "loss": 0.4622, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.079983711242676, "rewards/margins": 1.0821197032928467, "rewards/rejected": -4.162103652954102, "sft_loss": 3.223768949508667, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 19.957289974118574, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.32226279377937317, "logits/rejected": -0.15740683674812317, "logps/chosen": -3.0749404430389404, "logps/rejected": -4.272371768951416, "loss": 0.4498, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0749404430389404, "rewards/margins": 1.1974310874938965, "rewards/rejected": -4.272371768951416, "sft_loss": 3.260594606399536, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 20.147501113981313, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.37094011902809143, "logits/rejected": -0.11719591915607452, "logps/chosen": -2.9942538738250732, "logps/rejected": -4.199368476867676, "loss": 0.4432, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.9942538738250732, "rewards/margins": 1.2051149606704712, "rewards/rejected": -4.199368476867676, "sft_loss": 3.0453991889953613, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 19.390214950334087, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.36261844635009766, "logits/rejected": -0.18685957789421082, "logps/chosen": -3.118291139602661, "logps/rejected": -4.132403373718262, "loss": 0.4893, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.118291139602661, "rewards/margins": 1.0141125917434692, "rewards/rejected": -4.132403373718262, "sft_loss": 3.346827745437622, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 17.06096706635864, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.32899707555770874, "logits/rejected": -0.1959637850522995, "logps/chosen": -3.1111388206481934, "logps/rejected": -4.257960319519043, "loss": 0.4374, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.1111388206481934, "rewards/margins": 1.1468214988708496, "rewards/rejected": -4.257960319519043, "sft_loss": 3.194671154022217, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 22.251746068963072, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.3317912518978119, "logits/rejected": -0.23203852772712708, "logps/chosen": -3.047714948654175, "logps/rejected": -4.089310646057129, "loss": 0.4892, "rewards/accuracies": 0.75, "rewards/chosen": -3.047714948654175, "rewards/margins": 1.041595458984375, "rewards/rejected": -4.089310646057129, "sft_loss": 3.1647562980651855, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 24.021656232040822, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.3317021429538727, "logits/rejected": -0.1665099561214447, "logps/chosen": -2.9778199195861816, "logps/rejected": -3.9552600383758545, "loss": 0.4825, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9778199195861816, "rewards/margins": 0.977440357208252, "rewards/rejected": -3.9552600383758545, "sft_loss": 3.044766902923584, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 24.23329671944176, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.2726336121559143, "logits/rejected": -0.0905083566904068, "logps/chosen": -3.018521547317505, "logps/rejected": -4.061691761016846, "loss": 0.5167, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.018521547317505, "rewards/margins": 1.0431700944900513, "rewards/rejected": -4.061691761016846, "sft_loss": 3.164506435394287, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 21.406723040593615, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.2635464370250702, "logits/rejected": -0.1459456831216812, "logps/chosen": -2.977616786956787, "logps/rejected": -4.1141133308410645, "loss": 0.4754, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.977616786956787, "rewards/margins": 1.1364965438842773, "rewards/rejected": -4.1141133308410645, "sft_loss": 3.0608811378479004, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 12.667085384649567, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.36656802892684937, "logits/rejected": -0.13581299781799316, "logps/chosen": -2.9960341453552246, "logps/rejected": -4.152059555053711, "loss": 0.4398, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.9960341453552246, "rewards/margins": 1.1560252904891968, "rewards/rejected": -4.152059555053711, "sft_loss": 3.1322269439697266, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 17.283403366542846, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.27755430340766907, "logits/rejected": -0.20100736618041992, "logps/chosen": -2.942697048187256, "logps/rejected": -4.024935245513916, "loss": 0.475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.942697048187256, "rewards/margins": 1.0822378396987915, "rewards/rejected": -4.024935245513916, "sft_loss": 3.020854949951172, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 15.476244362183891, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.2853260338306427, "logits/rejected": -0.08334718644618988, "logps/chosen": -3.1610915660858154, "logps/rejected": -4.359834671020508, "loss": 0.4604, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1610915660858154, "rewards/margins": 1.1987429857254028, "rewards/rejected": -4.359834671020508, "sft_loss": 3.271552562713623, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 16.524220171719012, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.39955052733421326, "logits/rejected": -0.24060598015785217, "logps/chosen": -3.005897045135498, "logps/rejected": -4.021838188171387, "loss": 0.4888, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.005897045135498, "rewards/margins": 1.0159412622451782, "rewards/rejected": -4.021838188171387, "sft_loss": 3.2215378284454346, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 19.96309863344739, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.29617124795913696, "logits/rejected": -0.21583464741706848, "logps/chosen": -3.0837156772613525, "logps/rejected": -4.209137916564941, "loss": 0.4901, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.0837156772613525, "rewards/margins": 1.1254225969314575, "rewards/rejected": -4.209137916564941, "sft_loss": 3.1022210121154785, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 25.87501638585397, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.3336920440196991, "logits/rejected": -0.11970163881778717, "logps/chosen": -3.096277952194214, "logps/rejected": -4.252020835876465, "loss": 0.466, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.096277952194214, "rewards/margins": 1.1557424068450928, "rewards/rejected": -4.252020835876465, "sft_loss": 3.2355098724365234, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 16.68144881433365, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.283000648021698, "logits/rejected": -0.2432548999786377, "logps/chosen": -3.095372438430786, "logps/rejected": -4.4320454597473145, "loss": 0.3979, "rewards/accuracies": 0.84375, "rewards/chosen": -3.095372438430786, "rewards/margins": 1.3366725444793701, "rewards/rejected": -4.4320454597473145, "sft_loss": 3.2416470050811768, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 22.269589452874328, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.33037471771240234, "logits/rejected": -0.17230312526226044, "logps/chosen": -2.932950735092163, "logps/rejected": -4.207057952880859, "loss": 0.4142, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.932950735092163, "rewards/margins": 1.2741069793701172, "rewards/rejected": -4.207057952880859, "sft_loss": 3.052420139312744, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 15.984312304222266, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.36222535371780396, "logits/rejected": -0.2763141393661499, "logps/chosen": -3.0980286598205566, "logps/rejected": -4.238247871398926, "loss": 0.5043, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0980286598205566, "rewards/margins": 1.1402194499969482, "rewards/rejected": -4.238247871398926, "sft_loss": 3.2644906044006348, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 20.107738671957918, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.2804338335990906, "logits/rejected": -0.10021962970495224, "logps/chosen": -3.0184197425842285, "logps/rejected": -4.308283805847168, "loss": 0.4141, "rewards/accuracies": 0.84375, "rewards/chosen": -3.0184197425842285, "rewards/margins": 1.2898635864257812, "rewards/rejected": -4.308283805847168, "sft_loss": 3.2268364429473877, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 17.955224209629822, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.2626858055591583, "logits/rejected": -0.29193446040153503, "logps/chosen": -3.162912607192993, "logps/rejected": -4.278327465057373, "loss": 0.4312, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.162912607192993, "rewards/margins": 1.115415334701538, "rewards/rejected": -4.278327465057373, "sft_loss": 3.282226085662842, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 12.84821885820571, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.33620548248291016, "logits/rejected": -0.15401166677474976, "logps/chosen": -3.1779723167419434, "logps/rejected": -4.275982856750488, "loss": 0.4813, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1779723167419434, "rewards/margins": 1.0980106592178345, "rewards/rejected": -4.275982856750488, "sft_loss": 3.3701813220977783, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 19.088401413763112, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.33222153782844543, "logits/rejected": -0.2611008286476135, "logps/chosen": -3.208530902862549, "logps/rejected": -4.362675666809082, "loss": 0.4814, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.208530902862549, "rewards/margins": 1.1541452407836914, "rewards/rejected": -4.362675666809082, "sft_loss": 3.362258195877075, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 14.875425765942385, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.27498680353164673, "logits/rejected": -0.014174816198647022, "logps/chosen": -3.0799829959869385, "logps/rejected": -4.485774040222168, "loss": 0.383, "rewards/accuracies": 0.875, "rewards/chosen": -3.0799829959869385, "rewards/margins": 1.4057915210723877, "rewards/rejected": -4.485774040222168, "sft_loss": 3.205416202545166, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 16.35400953971993, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.3643206059932709, "logits/rejected": -0.162673220038414, "logps/chosen": -3.0855135917663574, "logps/rejected": -4.370043754577637, "loss": 0.4518, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0855135917663574, "rewards/margins": 1.2845300436019897, "rewards/rejected": -4.370043754577637, "sft_loss": 3.2811782360076904, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 16.782745797370215, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.3893781304359436, "logits/rejected": -0.27229851484298706, "logps/chosen": -3.0741641521453857, "logps/rejected": -4.406427383422852, "loss": 0.4043, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.0741641521453857, "rewards/margins": 1.3322635889053345, "rewards/rejected": -4.406427383422852, "sft_loss": 3.285371780395508, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 22.533404367403524, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.3591257631778717, "logits/rejected": -0.16151633858680725, "logps/chosen": -3.186220169067383, "logps/rejected": -4.249715328216553, "loss": 0.4871, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.186220169067383, "rewards/margins": 1.0634949207305908, "rewards/rejected": -4.249715328216553, "sft_loss": 3.3226311206817627, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 24.867433963411433, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.3205176889896393, "logits/rejected": -0.15790733695030212, "logps/chosen": -3.1841578483581543, "logps/rejected": -4.544305324554443, "loss": 0.4391, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1841578483581543, "rewards/margins": 1.3601475954055786, "rewards/rejected": -4.544305324554443, "sft_loss": 3.2552971839904785, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 22.050455771571038, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.2175244837999344, "logits/rejected": -0.11109142005443573, "logps/chosen": -3.1900973320007324, "logps/rejected": -4.4396071434021, "loss": 0.4804, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1900973320007324, "rewards/margins": 1.249509572982788, "rewards/rejected": -4.4396071434021, "sft_loss": 3.242172956466675, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 16.86639995641779, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.35965830087661743, "logits/rejected": -0.22785428166389465, "logps/chosen": -3.0863912105560303, "logps/rejected": -4.196893215179443, "loss": 0.4612, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0863912105560303, "rewards/margins": 1.1105024814605713, "rewards/rejected": -4.196893215179443, "sft_loss": 3.212651014328003, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 18.825314082783557, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.2895389199256897, "logits/rejected": -0.22839932143688202, "logps/chosen": -3.135324716567993, "logps/rejected": -4.043545722961426, "loss": 0.5092, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.135324716567993, "rewards/margins": 0.9082208871841431, "rewards/rejected": -4.043545722961426, "sft_loss": 3.223323106765747, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 17.842623697485102, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.37467458844184875, "logits/rejected": -0.2382585108280182, "logps/chosen": -3.1378002166748047, "logps/rejected": -4.3286638259887695, "loss": 0.4699, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.1378002166748047, "rewards/margins": 1.190863013267517, "rewards/rejected": -4.3286638259887695, "sft_loss": 3.2269675731658936, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 24.464303733487966, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.30891790986061096, "logits/rejected": -0.18581345677375793, "logps/chosen": -3.2107269763946533, "logps/rejected": -4.322188854217529, "loss": 0.4934, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.2107269763946533, "rewards/margins": 1.111462116241455, "rewards/rejected": -4.322188854217529, "sft_loss": 3.2918734550476074, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 16.845112498795427, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.3393566906452179, "logits/rejected": -0.21427495777606964, "logps/chosen": -3.025163173675537, "logps/rejected": -4.013575553894043, "loss": 0.4967, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.025163173675537, "rewards/margins": 0.9884128570556641, "rewards/rejected": -4.013575553894043, "sft_loss": 3.262495756149292, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.05386869236826897, "eval_logits/rejected": 0.16599443554878235, "eval_logps/chosen": -3.1626408100128174, "eval_logps/rejected": -4.141357421875, "eval_loss": 0.5589354038238525, "eval_rewards/accuracies": 0.7225519418716431, "eval_rewards/chosen": -3.1626408100128174, "eval_rewards/margins": 0.9787165522575378, "eval_rewards/rejected": -4.141357421875, "eval_runtime": 50.2599, "eval_samples_per_second": 26.761, "eval_sft_loss": 3.286099433898926, "eval_steps_per_second": 6.705, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 24.916256914603725, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.38694795966148376, "logits/rejected": -0.2582995295524597, "logps/chosen": -3.044128179550171, "logps/rejected": -4.149737358093262, "loss": 0.5057, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.044128179550171, "rewards/margins": 1.1056091785430908, "rewards/rejected": -4.149737358093262, "sft_loss": 3.150749444961548, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 16.625164558780753, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.33551377058029175, "logits/rejected": -0.21937128901481628, "logps/chosen": -3.0479540824890137, "logps/rejected": -4.1916184425354, "loss": 0.468, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0479540824890137, "rewards/margins": 1.1436642408370972, "rewards/rejected": -4.1916184425354, "sft_loss": 3.181412696838379, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 23.333341592536538, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.3330397605895996, "logits/rejected": -0.18397782742977142, "logps/chosen": -2.8639981746673584, "logps/rejected": -4.1112751960754395, "loss": 0.5087, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8639981746673584, "rewards/margins": 1.2472766637802124, "rewards/rejected": -4.1112751960754395, "sft_loss": 3.0787956714630127, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 19.920525986764915, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.29376813769340515, "logits/rejected": -0.1875263750553131, "logps/chosen": -3.086697578430176, "logps/rejected": -4.033606052398682, "loss": 0.4938, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.086697578430176, "rewards/margins": 0.9469084739685059, "rewards/rejected": -4.033606052398682, "sft_loss": 3.1368043422698975, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 23.23922038582777, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.29967522621154785, "logits/rejected": -0.133182555437088, "logps/chosen": -3.0171871185302734, "logps/rejected": -4.105466842651367, "loss": 0.4585, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.0171871185302734, "rewards/margins": 1.0882798433303833, "rewards/rejected": -4.105466842651367, "sft_loss": 3.0773675441741943, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 22.37041290053205, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.3489793539047241, "logits/rejected": -0.1727316677570343, "logps/chosen": -3.129185438156128, "logps/rejected": -4.195072174072266, "loss": 0.4729, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.129185438156128, "rewards/margins": 1.0658868551254272, "rewards/rejected": -4.195072174072266, "sft_loss": 3.2715210914611816, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 20.35813506460253, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.4507429599761963, "logits/rejected": -0.24051764607429504, "logps/chosen": -2.9227616786956787, "logps/rejected": -4.2196455001831055, "loss": 0.4019, "rewards/accuracies": 0.875, "rewards/chosen": -2.9227616786956787, "rewards/margins": 1.2968841791152954, "rewards/rejected": -4.2196455001831055, "sft_loss": 3.03584361076355, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 18.646249235430773, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.40485063195228577, "logits/rejected": -0.16617676615715027, "logps/chosen": -2.9163639545440674, "logps/rejected": -4.19936466217041, "loss": 0.4011, "rewards/accuracies": 0.84375, "rewards/chosen": -2.9163639545440674, "rewards/margins": 1.2830005884170532, "rewards/rejected": -4.19936466217041, "sft_loss": 3.042677402496338, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 19.18315587660348, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.32241642475128174, "logits/rejected": -0.261809766292572, "logps/chosen": -2.975820302963257, "logps/rejected": -4.039728164672852, "loss": 0.4794, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.975820302963257, "rewards/margins": 1.0639079809188843, "rewards/rejected": -4.039728164672852, "sft_loss": 3.182965040206909, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 19.24094940649905, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.4445788860321045, "logits/rejected": -0.2621915340423584, "logps/chosen": -3.1237831115722656, "logps/rejected": -4.347258567810059, "loss": 0.4573, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.1237831115722656, "rewards/margins": 1.223475694656372, "rewards/rejected": -4.347258567810059, "sft_loss": 3.208775758743286, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 23.126508233398816, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.2847757935523987, "logits/rejected": -0.11558979749679565, "logps/chosen": -2.8969483375549316, "logps/rejected": -4.117270469665527, "loss": 0.4668, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.8969483375549316, "rewards/margins": 1.2203221321105957, "rewards/rejected": -4.117270469665527, "sft_loss": 3.1041221618652344, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 20.547858879397218, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.3398367762565613, "logits/rejected": -0.22868943214416504, "logps/chosen": -2.8547940254211426, "logps/rejected": -4.000241279602051, "loss": 0.4489, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.8547940254211426, "rewards/margins": 1.1454476118087769, "rewards/rejected": -4.000241279602051, "sft_loss": 3.06701397895813, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 10.82015699618501, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.4190579950809479, "logits/rejected": -0.3059861660003662, "logps/chosen": -3.1115918159484863, "logps/rejected": -4.324707984924316, "loss": 0.4422, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.1115918159484863, "rewards/margins": 1.2131168842315674, "rewards/rejected": -4.324707984924316, "sft_loss": 3.2403512001037598, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 16.438235909967577, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.3123801350593567, "logits/rejected": -0.1777162104845047, "logps/chosen": -2.853092670440674, "logps/rejected": -4.257376670837402, "loss": 0.3776, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.853092670440674, "rewards/margins": 1.4042837619781494, "rewards/rejected": -4.257376670837402, "sft_loss": 2.9209725856781006, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 23.006845285193506, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.23909814655780792, "logits/rejected": -0.13520967960357666, "logps/chosen": -2.869157314300537, "logps/rejected": -4.090670585632324, "loss": 0.4834, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.869157314300537, "rewards/margins": 1.2215136289596558, "rewards/rejected": -4.090670585632324, "sft_loss": 3.0704166889190674, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 14.726162342640116, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.24981050193309784, "logits/rejected": -0.1145351380109787, "logps/chosen": -2.9208083152770996, "logps/rejected": -4.255640506744385, "loss": 0.4267, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.9208083152770996, "rewards/margins": 1.334831953048706, "rewards/rejected": -4.255640506744385, "sft_loss": 3.1174123287200928, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 17.993500744261805, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.25230225920677185, "logits/rejected": -0.1823224127292633, "logps/chosen": -3.0884265899658203, "logps/rejected": -4.351536750793457, "loss": 0.4498, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.0884265899658203, "rewards/margins": 1.2631103992462158, "rewards/rejected": -4.351536750793457, "sft_loss": 3.2716116905212402, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 19.86826170864046, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.3957950472831726, "logits/rejected": -0.19159351289272308, "logps/chosen": -3.080976963043213, "logps/rejected": -4.284677505493164, "loss": 0.4773, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.080976963043213, "rewards/margins": 1.2037004232406616, "rewards/rejected": -4.284677505493164, "sft_loss": 3.260425567626953, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 14.646080419525127, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.30659979581832886, "logits/rejected": -0.2037850320339203, "logps/chosen": -3.0508840084075928, "logps/rejected": -3.9952120780944824, "loss": 0.5283, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0508840084075928, "rewards/margins": 0.9443281292915344, "rewards/rejected": -3.9952120780944824, "sft_loss": 3.201610565185547, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 18.560996284442947, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.2744046747684479, "logits/rejected": -0.09588640928268433, "logps/chosen": -2.9869422912597656, "logps/rejected": -4.162718772888184, "loss": 0.4379, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9869422912597656, "rewards/margins": 1.175776481628418, "rewards/rejected": -4.162718772888184, "sft_loss": 3.1522040367126465, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 14.481153499398367, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.3010391592979431, "logits/rejected": -0.11915872246026993, "logps/chosen": -3.0480072498321533, "logps/rejected": -4.383750915527344, "loss": 0.4113, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.0480072498321533, "rewards/margins": 1.33574378490448, "rewards/rejected": -4.383750915527344, "sft_loss": 3.292620897293091, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 16.282657228318666, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.305327832698822, "logits/rejected": -0.3109681308269501, "logps/chosen": -2.971714496612549, "logps/rejected": -4.347641944885254, "loss": 0.4422, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.971714496612549, "rewards/margins": 1.3759269714355469, "rewards/rejected": -4.347641944885254, "sft_loss": 3.0994067192077637, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 23.68553480847594, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.3070877194404602, "logits/rejected": -0.1455794870853424, "logps/chosen": -2.9347236156463623, "logps/rejected": -4.083266258239746, "loss": 0.46, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.9347236156463623, "rewards/margins": 1.1485424041748047, "rewards/rejected": -4.083266258239746, "sft_loss": 3.092538356781006, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 17.0194885455986, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.3108268976211548, "logits/rejected": -0.17864060401916504, "logps/chosen": -3.0577645301818848, "logps/rejected": -4.320757865905762, "loss": 0.4578, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0577645301818848, "rewards/margins": 1.262993574142456, "rewards/rejected": -4.320757865905762, "sft_loss": 3.2131295204162598, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 16.110603675677545, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.43900686502456665, "logits/rejected": -0.16938167810440063, "logps/chosen": -3.0163490772247314, "logps/rejected": -4.240276336669922, "loss": 0.4472, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0163490772247314, "rewards/margins": 1.2239272594451904, "rewards/rejected": -4.240276336669922, "sft_loss": 3.218003749847412, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 14.772454995298762, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.31613582372665405, "logits/rejected": -0.097527876496315, "logps/chosen": -3.0685675144195557, "logps/rejected": -4.425512790679932, "loss": 0.4334, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0685675144195557, "rewards/margins": 1.356945514678955, "rewards/rejected": -4.425512790679932, "sft_loss": 3.140439510345459, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 26.294902275748264, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.36880728602409363, "logits/rejected": -0.15747341513633728, "logps/chosen": -2.98952579498291, "logps/rejected": -4.092957973480225, "loss": 0.49, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.98952579498291, "rewards/margins": 1.1034326553344727, "rewards/rejected": -4.092957973480225, "sft_loss": 3.1426939964294434, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 15.384947752941567, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.3689922094345093, "logits/rejected": -0.2464989870786667, "logps/chosen": -3.0003583431243896, "logps/rejected": -3.947209596633911, "loss": 0.485, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0003583431243896, "rewards/margins": 0.9468511343002319, "rewards/rejected": -3.947209596633911, "sft_loss": 3.1662871837615967, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 22.157128734922004, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.3239455819129944, "logits/rejected": -0.10675706714391708, "logps/chosen": -3.0349650382995605, "logps/rejected": -4.290961265563965, "loss": 0.4187, "rewards/accuracies": 0.84375, "rewards/chosen": -3.0349650382995605, "rewards/margins": 1.2559964656829834, "rewards/rejected": -4.290961265563965, "sft_loss": 3.1621806621551514, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 19.15505957419598, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.3120448589324951, "logits/rejected": -0.12682923674583435, "logps/chosen": -2.9931201934814453, "logps/rejected": -4.10862398147583, "loss": 0.5114, "rewards/accuracies": 0.75, "rewards/chosen": -2.9931201934814453, "rewards/margins": 1.1155036687850952, "rewards/rejected": -4.10862398147583, "sft_loss": 3.1184768676757812, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 22.40083435029277, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.3152123987674713, "logits/rejected": -0.2484721839427948, "logps/chosen": -3.118741273880005, "logps/rejected": -4.189919471740723, "loss": 0.461, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.118741273880005, "rewards/margins": 1.0711780786514282, "rewards/rejected": -4.189919471740723, "sft_loss": 3.274350643157959, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 15.22644592986102, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.27597761154174805, "logits/rejected": -0.14365874230861664, "logps/chosen": -2.9235987663269043, "logps/rejected": -4.190218925476074, "loss": 0.4232, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.9235987663269043, "rewards/margins": 1.26662015914917, "rewards/rejected": -4.190218925476074, "sft_loss": 3.094919204711914, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 20.777279697287344, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.3656620681285858, "logits/rejected": -0.2070375382900238, "logps/chosen": -2.9615466594696045, "logps/rejected": -4.212521553039551, "loss": 0.4379, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.9615466594696045, "rewards/margins": 1.2509740591049194, "rewards/rejected": -4.212521553039551, "sft_loss": 3.181602954864502, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 20.388056518083097, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.25572115182876587, "logits/rejected": -0.1687474101781845, "logps/chosen": -2.872952699661255, "logps/rejected": -4.130259990692139, "loss": 0.4301, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.872952699661255, "rewards/margins": 1.2573074102401733, "rewards/rejected": -4.130259990692139, "sft_loss": 2.927790641784668, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 16.961728362857738, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.29145973920822144, "logits/rejected": -0.22770313918590546, "logps/chosen": -3.081450939178467, "logps/rejected": -4.227843284606934, "loss": 0.443, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.081450939178467, "rewards/margins": 1.1463927030563354, "rewards/rejected": -4.227843284606934, "sft_loss": 3.1626648902893066, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 19.0372220245103, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.19781547784805298, "logits/rejected": -0.1551518440246582, "logps/chosen": -3.236917495727539, "logps/rejected": -4.306820869445801, "loss": 0.5071, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.236917495727539, "rewards/margins": 1.0699033737182617, "rewards/rejected": -4.306820869445801, "sft_loss": 3.259613513946533, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 18.562975696656537, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.299947053194046, "logits/rejected": -0.09621630609035492, "logps/chosen": -2.9437007904052734, "logps/rejected": -4.194373607635498, "loss": 0.4854, "rewards/accuracies": 0.75, "rewards/chosen": -2.9437007904052734, "rewards/margins": 1.2506728172302246, "rewards/rejected": -4.194373607635498, "sft_loss": 3.020115375518799, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 19.595036919820167, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.4113302230834961, "logits/rejected": -0.21265852451324463, "logps/chosen": -2.9879603385925293, "logps/rejected": -4.28658390045166, "loss": 0.4121, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.9879603385925293, "rewards/margins": 1.2986233234405518, "rewards/rejected": -4.28658390045166, "sft_loss": 3.139157772064209, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 16.51138964172436, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.3349289894104004, "logits/rejected": -0.3230084776878357, "logps/chosen": -3.0833542346954346, "logps/rejected": -4.051263809204102, "loss": 0.5243, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0833542346954346, "rewards/margins": 0.9679099917411804, "rewards/rejected": -4.051263809204102, "sft_loss": 3.2449798583984375, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 18.197818314108957, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.34814295172691345, "logits/rejected": -0.29311561584472656, "logps/chosen": -3.0884833335876465, "logps/rejected": -4.1714372634887695, "loss": 0.5178, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0884833335876465, "rewards/margins": 1.0829538106918335, "rewards/rejected": -4.1714372634887695, "sft_loss": 3.25970458984375, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 18.610757371590207, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.26829028129577637, "logits/rejected": -0.1963682621717453, "logps/chosen": -3.059337854385376, "logps/rejected": -4.076889991760254, "loss": 0.4642, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.059337854385376, "rewards/margins": 1.0175515413284302, "rewards/rejected": -4.076889991760254, "sft_loss": 3.198467969894409, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 23.57394142189509, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.2515251040458679, "logits/rejected": -0.15543124079704285, "logps/chosen": -3.1278204917907715, "logps/rejected": -4.222821235656738, "loss": 0.4596, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.1278204917907715, "rewards/margins": 1.0950000286102295, "rewards/rejected": -4.222821235656738, "sft_loss": 3.204451322555542, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 19.210952164670346, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.3350864052772522, "logits/rejected": -0.24469581246376038, "logps/chosen": -3.0523061752319336, "logps/rejected": -4.216456413269043, "loss": 0.4467, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0523061752319336, "rewards/margins": 1.1641499996185303, "rewards/rejected": -4.216456413269043, "sft_loss": 3.1259608268737793, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 22.822771825087354, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.3542567491531372, "logits/rejected": -0.09227250516414642, "logps/chosen": -2.9733572006225586, "logps/rejected": -4.103068828582764, "loss": 0.4809, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.9733572006225586, "rewards/margins": 1.1297115087509155, "rewards/rejected": -4.103068828582764, "sft_loss": 3.186063766479492, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 15.633412485679143, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.38056424260139465, "logits/rejected": -0.2081277072429657, "logps/chosen": -3.025116443634033, "logps/rejected": -4.112739086151123, "loss": 0.4807, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.025116443634033, "rewards/margins": 1.0876230001449585, "rewards/rejected": -4.112739086151123, "sft_loss": 3.1368777751922607, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 15.403511604630026, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.341784805059433, "logits/rejected": -0.2454969882965088, "logps/chosen": -2.987480640411377, "logps/rejected": -4.311043739318848, "loss": 0.4443, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.987480640411377, "rewards/margins": 1.323562502861023, "rewards/rejected": -4.311043739318848, "sft_loss": 3.0825743675231934, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 21.957693002820385, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.32171764969825745, "logits/rejected": -0.1897803694009781, "logps/chosen": -3.0886237621307373, "logps/rejected": -4.227081775665283, "loss": 0.4616, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0886237621307373, "rewards/margins": 1.138457179069519, "rewards/rejected": -4.227081775665283, "sft_loss": 3.2400691509246826, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 17.284715714096862, "learning_rate": 3.054553457834053e-08, "logits/chosen": -0.15596535801887512, "logits/rejected": -0.1801324039697647, "logps/chosen": -3.184924364089966, "logps/rejected": -4.2829718589782715, "loss": 0.4788, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.184924364089966, "rewards/margins": 1.0980473756790161, "rewards/rejected": -4.2829718589782715, "sft_loss": 3.25410795211792, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 21.944048134631043, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.2575821578502655, "logits/rejected": -0.25107091665267944, "logps/chosen": -3.027339220046997, "logps/rejected": -4.128751277923584, "loss": 0.4693, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.027339220046997, "rewards/margins": 1.1014121770858765, "rewards/rejected": -4.128751277923584, "sft_loss": 3.14856219291687, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 18.66301339111262, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.30721646547317505, "logits/rejected": -0.22108140587806702, "logps/chosen": -2.860776424407959, "logps/rejected": -3.9469971656799316, "loss": 0.4347, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.860776424407959, "rewards/margins": 1.0862209796905518, "rewards/rejected": -3.9469971656799316, "sft_loss": 3.0134289264678955, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 17.39476431005673, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.4259369969367981, "logits/rejected": -0.3169228434562683, "logps/chosen": -2.816646099090576, "logps/rejected": -4.042110443115234, "loss": 0.419, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.816646099090576, "rewards/margins": 1.2254643440246582, "rewards/rejected": -4.042110443115234, "sft_loss": 2.9703478813171387, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 19.832856387364917, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.2929435074329376, "logits/rejected": -0.20484165847301483, "logps/chosen": -2.9489831924438477, "logps/rejected": -4.000323295593262, "loss": 0.4958, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.9489831924438477, "rewards/margins": 1.0513403415679932, "rewards/rejected": -4.000323295593262, "sft_loss": 3.115665912628174, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 18.512432157472645, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.3188219666481018, "logits/rejected": -0.04534931108355522, "logps/chosen": -2.9126038551330566, "logps/rejected": -4.1106038093566895, "loss": 0.4466, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.9126038551330566, "rewards/margins": 1.197999358177185, "rewards/rejected": -4.1106038093566895, "sft_loss": 3.0568900108337402, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 21.857296108358923, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.21708163619041443, "logits/rejected": -0.054964639246463776, "logps/chosen": -2.9592323303222656, "logps/rejected": -4.031314849853516, "loss": 0.4542, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.9592323303222656, "rewards/margins": 1.0720826387405396, "rewards/rejected": -4.031314849853516, "sft_loss": 3.165611505508423, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 25.98677017324065, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.2887173295021057, "logits/rejected": -0.20963990688323975, "logps/chosen": -2.968780040740967, "logps/rejected": -3.817471981048584, "loss": 0.5555, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.968780040740967, "rewards/margins": 0.8486918210983276, "rewards/rejected": -3.817471981048584, "sft_loss": 3.15004563331604, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 14.823323188091512, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.35453784465789795, "logits/rejected": -0.17695707082748413, "logps/chosen": -3.0053393840789795, "logps/rejected": -4.123621940612793, "loss": 0.4883, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.0053393840789795, "rewards/margins": 1.1182823181152344, "rewards/rejected": -4.123621940612793, "sft_loss": 3.142940044403076, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 35.681208849165785, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.3615763783454895, "logits/rejected": -0.22551564872264862, "logps/chosen": -2.9433486461639404, "logps/rejected": -4.234328746795654, "loss": 0.4195, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.9433486461639404, "rewards/margins": 1.2909801006317139, "rewards/rejected": -4.234328746795654, "sft_loss": 3.0968470573425293, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 19.324635659444645, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.4254387319087982, "logits/rejected": -0.21585650742053986, "logps/chosen": -3.1022112369537354, "logps/rejected": -4.332705020904541, "loss": 0.448, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.1022112369537354, "rewards/margins": 1.2304937839508057, "rewards/rejected": -4.332705020904541, "sft_loss": 3.239518404006958, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 15.705163473856352, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.4600273072719574, "logits/rejected": -0.2306920289993286, "logps/chosen": -2.96919322013855, "logps/rejected": -4.197896480560303, "loss": 0.4428, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.96919322013855, "rewards/margins": 1.2287031412124634, "rewards/rejected": -4.197896480560303, "sft_loss": 3.110826015472412, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 12.42786764570172, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.24805191159248352, "logits/rejected": -0.09726305305957794, "logps/chosen": -3.012068748474121, "logps/rejected": -4.305748462677002, "loss": 0.4509, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.012068748474121, "rewards/margins": 1.2936795949935913, "rewards/rejected": -4.305748462677002, "sft_loss": 3.1654953956604004, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 26.94058532427146, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.29765814542770386, "logits/rejected": -0.11189641803503036, "logps/chosen": -3.1366753578186035, "logps/rejected": -4.3091325759887695, "loss": 0.466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1366753578186035, "rewards/margins": 1.1724575757980347, "rewards/rejected": -4.3091325759887695, "sft_loss": 3.252049684524536, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 18.48452893303915, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.32929176092147827, "logits/rejected": -0.19954873621463776, "logps/chosen": -2.9225192070007324, "logps/rejected": -4.042097568511963, "loss": 0.4681, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.9225192070007324, "rewards/margins": 1.1195781230926514, "rewards/rejected": -4.042097568511963, "sft_loss": 3.047149658203125, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 15.784808698281562, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.38457226753234863, "logits/rejected": -0.14432287216186523, "logps/chosen": -3.0213623046875, "logps/rejected": -4.299278736114502, "loss": 0.4068, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.0213623046875, "rewards/margins": 1.2779161930084229, "rewards/rejected": -4.299278736114502, "sft_loss": 3.104046583175659, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 22.853353508016887, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.33879074454307556, "logits/rejected": -0.2555561065673828, "logps/chosen": -3.044795513153076, "logps/rejected": -4.244548320770264, "loss": 0.4223, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.044795513153076, "rewards/margins": 1.1997532844543457, "rewards/rejected": -4.244548320770264, "sft_loss": 3.1476967334747314, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 27.26187470654718, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.4439014792442322, "logits/rejected": -0.19702909886837006, "logps/chosen": -2.9273324012756348, "logps/rejected": -4.080560207366943, "loss": 0.4568, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9273324012756348, "rewards/margins": 1.1532284021377563, "rewards/rejected": -4.080560207366943, "sft_loss": 3.1291260719299316, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 19.92107499422407, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.45958924293518066, "logits/rejected": -0.2524833083152771, "logps/chosen": -3.048386812210083, "logps/rejected": -4.28609561920166, "loss": 0.4498, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.048386812210083, "rewards/margins": 1.2377086877822876, "rewards/rejected": -4.28609561920166, "sft_loss": 3.2454352378845215, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 26.367710927243852, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.2267228662967682, "logits/rejected": -0.15011325478553772, "logps/chosen": -3.0078647136688232, "logps/rejected": -4.023341178894043, "loss": 0.4932, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0078647136688232, "rewards/margins": 1.0154768228530884, "rewards/rejected": -4.023341178894043, "sft_loss": 3.212428331375122, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 22.474552444052417, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.28271785378456116, "logits/rejected": -0.10942456871271133, "logps/chosen": -3.120392322540283, "logps/rejected": -4.208460807800293, "loss": 0.4647, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.120392322540283, "rewards/margins": 1.0880682468414307, "rewards/rejected": -4.208460807800293, "sft_loss": 3.176684856414795, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 27.186846650364313, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.28622376918792725, "logits/rejected": -0.1410348266363144, "logps/chosen": -3.016723871231079, "logps/rejected": -4.079378128051758, "loss": 0.4955, "rewards/accuracies": 0.75, "rewards/chosen": -3.016723871231079, "rewards/margins": 1.0626541376113892, "rewards/rejected": -4.079378128051758, "sft_loss": 3.183474063873291, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 16.057516905741448, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.35685616731643677, "logits/rejected": -0.2156098634004593, "logps/chosen": -2.992096424102783, "logps/rejected": -4.160267353057861, "loss": 0.4594, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.992096424102783, "rewards/margins": 1.16817045211792, "rewards/rejected": -4.160267353057861, "sft_loss": 3.1283581256866455, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 17.13120591922187, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.3916035592556, "logits/rejected": -0.28982120752334595, "logps/chosen": -2.9988656044006348, "logps/rejected": -4.073007106781006, "loss": 0.4696, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9988656044006348, "rewards/margins": 1.0741417407989502, "rewards/rejected": -4.073007106781006, "sft_loss": 3.182443380355835, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 14.507481787012965, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.31555086374282837, "logits/rejected": -0.2631721794605255, "logps/chosen": -2.906529188156128, "logps/rejected": -4.1582770347595215, "loss": 0.4346, "rewards/accuracies": 0.8125, "rewards/chosen": -2.906529188156128, "rewards/margins": 1.2517473697662354, "rewards/rejected": -4.1582770347595215, "sft_loss": 3.072540760040283, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 20.585576738264063, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.3451662063598633, "logits/rejected": -0.1332363784313202, "logps/chosen": -2.919412136077881, "logps/rejected": -4.0757293701171875, "loss": 0.4401, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.919412136077881, "rewards/margins": 1.1563172340393066, "rewards/rejected": -4.0757293701171875, "sft_loss": 3.012556552886963, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 16.93919875228968, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.3259938955307007, "logits/rejected": -0.17566534876823425, "logps/chosen": -3.0338306427001953, "logps/rejected": -4.260069370269775, "loss": 0.4234, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0338306427001953, "rewards/margins": 1.2262380123138428, "rewards/rejected": -4.260069370269775, "sft_loss": 3.2258095741271973, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 17.058168897094994, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.2931608557701111, "logits/rejected": -0.18977099657058716, "logps/chosen": -3.1517324447631836, "logps/rejected": -4.432768821716309, "loss": 0.3983, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.1517324447631836, "rewards/margins": 1.281036615371704, "rewards/rejected": -4.432768821716309, "sft_loss": 3.2231268882751465, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 23.569625986902015, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.248325914144516, "logits/rejected": -0.17798493802547455, "logps/chosen": -3.194131374359131, "logps/rejected": -4.191938877105713, "loss": 0.5005, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.194131374359131, "rewards/margins": 0.9978069067001343, "rewards/rejected": -4.191938877105713, "sft_loss": 3.4025511741638184, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 17.11771675584726, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.2721349895000458, "logits/rejected": -0.11362195014953613, "logps/chosen": -2.9763708114624023, "logps/rejected": -4.256707191467285, "loss": 0.4714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9763708114624023, "rewards/margins": 1.2803370952606201, "rewards/rejected": -4.256707191467285, "sft_loss": 3.202526807785034, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 21.237414221087953, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.3889920711517334, "logits/rejected": -0.19775991141796112, "logps/chosen": -2.9420952796936035, "logps/rejected": -4.098598957061768, "loss": 0.4557, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.9420952796936035, "rewards/margins": 1.1565032005310059, "rewards/rejected": -4.098598957061768, "sft_loss": 3.1482629776000977, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 21.637278412651288, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.2731543183326721, "logits/rejected": -0.10965131223201752, "logps/chosen": -3.038461446762085, "logps/rejected": -4.216440677642822, "loss": 0.4455, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.038461446762085, "rewards/margins": 1.177978754043579, "rewards/rejected": -4.216440677642822, "sft_loss": 3.1527132987976074, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 24.901410013852114, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.40109768509864807, "logits/rejected": -0.21919843554496765, "logps/chosen": -2.9902687072753906, "logps/rejected": -4.239418029785156, "loss": 0.439, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9902687072753906, "rewards/margins": 1.2491494417190552, "rewards/rejected": -4.239418029785156, "sft_loss": 3.18410062789917, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.03516725078225136, "eval_logits/rejected": 0.14618034660816193, "eval_logps/chosen": -3.1772398948669434, "eval_logps/rejected": -4.1641154289245605, "eval_loss": 0.5584015250205994, "eval_rewards/accuracies": 0.721068263053894, "eval_rewards/chosen": -3.1772398948669434, "eval_rewards/margins": 0.9868756532669067, "eval_rewards/rejected": -4.1641154289245605, "eval_runtime": 51.25, "eval_samples_per_second": 26.244, "eval_sft_loss": 3.3039963245391846, "eval_steps_per_second": 6.576, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 11.349911011374502, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.28439879417419434, "logits/rejected": -0.2091202735900879, "logps/chosen": -2.9650275707244873, "logps/rejected": -4.147830963134766, "loss": 0.4494, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.9650275707244873, "rewards/margins": 1.1828030347824097, "rewards/rejected": -4.147830963134766, "sft_loss": 3.0423130989074707, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 15.93567970311688, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.3161846101284027, "logits/rejected": -0.19209876656532288, "logps/chosen": -3.036440372467041, "logps/rejected": -4.228138446807861, "loss": 0.4694, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.036440372467041, "rewards/margins": 1.1916977167129517, "rewards/rejected": -4.228138446807861, "sft_loss": 3.137333393096924, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 14.87570642794917, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.3232470154762268, "logits/rejected": -0.22589445114135742, "logps/chosen": -2.9099388122558594, "logps/rejected": -4.149899005889893, "loss": 0.4381, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.9099388122558594, "rewards/margins": 1.239959955215454, "rewards/rejected": -4.149899005889893, "sft_loss": 3.07240629196167, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 15.708089383825824, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.37510597705841064, "logits/rejected": -0.2620985805988312, "logps/chosen": -2.993945598602295, "logps/rejected": -4.423148155212402, "loss": 0.392, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.993945598602295, "rewards/margins": 1.4292032718658447, "rewards/rejected": -4.423148155212402, "sft_loss": 3.146930694580078, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 23.91438518388687, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.3168920874595642, "logits/rejected": -0.13376422226428986, "logps/chosen": -3.074483633041382, "logps/rejected": -4.210515975952148, "loss": 0.473, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.074483633041382, "rewards/margins": 1.136031985282898, "rewards/rejected": -4.210515975952148, "sft_loss": 3.172656297683716, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 13.810781848975232, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.4242118299007416, "logits/rejected": -0.250504732131958, "logps/chosen": -3.033071756362915, "logps/rejected": -4.1675004959106445, "loss": 0.4504, "rewards/accuracies": 0.8125, "rewards/chosen": -3.033071756362915, "rewards/margins": 1.1344287395477295, "rewards/rejected": -4.1675004959106445, "sft_loss": 3.1074013710021973, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 16.266341458942986, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.40003085136413574, "logits/rejected": -0.2384142428636551, "logps/chosen": -2.941970109939575, "logps/rejected": -4.156831741333008, "loss": 0.438, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.941970109939575, "rewards/margins": 1.2148616313934326, "rewards/rejected": -4.156831741333008, "sft_loss": 3.055846691131592, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 18.466590109043757, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.36745721101760864, "logits/rejected": -0.1435421258211136, "logps/chosen": -3.0547235012054443, "logps/rejected": -4.208713531494141, "loss": 0.4631, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.0547235012054443, "rewards/margins": 1.1539896726608276, "rewards/rejected": -4.208713531494141, "sft_loss": 3.1727612018585205, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 20.405521035426105, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.31539779901504517, "logits/rejected": -0.24084322154521942, "logps/chosen": -2.9838807582855225, "logps/rejected": -4.0755181312561035, "loss": 0.4832, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9838807582855225, "rewards/margins": 1.0916370153427124, "rewards/rejected": -4.0755181312561035, "sft_loss": 3.1585214138031006, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 29.4394596025172, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.40207844972610474, "logits/rejected": -0.2369350641965866, "logps/chosen": -3.1063499450683594, "logps/rejected": -4.314638614654541, "loss": 0.458, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1063499450683594, "rewards/margins": 1.2082884311676025, "rewards/rejected": -4.314638614654541, "sft_loss": 3.2730610370635986, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 19.81157748000706, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.3515699505805969, "logits/rejected": -0.11190332472324371, "logps/chosen": -3.1851155757904053, "logps/rejected": -4.409258842468262, "loss": 0.4359, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.1851155757904053, "rewards/margins": 1.224143385887146, "rewards/rejected": -4.409258842468262, "sft_loss": 3.227384090423584, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 21.971960987506325, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.31455013155937195, "logits/rejected": -0.23573537170886993, "logps/chosen": -3.047395706176758, "logps/rejected": -4.163751602172852, "loss": 0.4894, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.047395706176758, "rewards/margins": 1.1163556575775146, "rewards/rejected": -4.163751602172852, "sft_loss": 3.3204185962677, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 16.779204925233334, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.44790953397750854, "logits/rejected": -0.18838870525360107, "logps/chosen": -3.199179172515869, "logps/rejected": -4.315297603607178, "loss": 0.4731, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.199179172515869, "rewards/margins": 1.1161186695098877, "rewards/rejected": -4.315297603607178, "sft_loss": 3.3300118446350098, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 22.629600760447133, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.2885778844356537, "logits/rejected": -0.12899121642112732, "logps/chosen": -3.0951497554779053, "logps/rejected": -4.30334997177124, "loss": 0.4359, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.0951497554779053, "rewards/margins": 1.208200216293335, "rewards/rejected": -4.30334997177124, "sft_loss": 3.252854585647583, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 14.960965202827843, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.33523792028427124, "logits/rejected": -0.17725275456905365, "logps/chosen": -3.1233747005462646, "logps/rejected": -4.302353858947754, "loss": 0.4212, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.1233747005462646, "rewards/margins": 1.1789793968200684, "rewards/rejected": -4.302353858947754, "sft_loss": 3.1863956451416016, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 15.585561296051177, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.3423822224140167, "logits/rejected": -0.12648749351501465, "logps/chosen": -3.2453339099884033, "logps/rejected": -4.551573753356934, "loss": 0.4263, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.2453339099884033, "rewards/margins": 1.306240200996399, "rewards/rejected": -4.551573753356934, "sft_loss": 3.2849297523498535, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 24.343265168221777, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.3864888548851013, "logits/rejected": -0.25290876626968384, "logps/chosen": -3.046827793121338, "logps/rejected": -4.351868629455566, "loss": 0.4402, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.046827793121338, "rewards/margins": 1.3050403594970703, "rewards/rejected": -4.351868629455566, "sft_loss": 3.154933452606201, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 16.948247904423653, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.32873716950416565, "logits/rejected": -0.15873248875141144, "logps/chosen": -3.001735210418701, "logps/rejected": -4.202784538269043, "loss": 0.4276, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.001735210418701, "rewards/margins": 1.2010494470596313, "rewards/rejected": -4.202784538269043, "sft_loss": 3.209874391555786, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 15.834554738414026, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.3988923132419586, "logits/rejected": -0.18734273314476013, "logps/chosen": -3.2281811237335205, "logps/rejected": -4.491852760314941, "loss": 0.4471, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.2281811237335205, "rewards/margins": 1.2636712789535522, "rewards/rejected": -4.491852760314941, "sft_loss": 3.325453281402588, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 20.48925827707505, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.3241375982761383, "logits/rejected": -0.24348489940166473, "logps/chosen": -3.0451343059539795, "logps/rejected": -4.30004358291626, "loss": 0.4306, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.0451343059539795, "rewards/margins": 1.2549090385437012, "rewards/rejected": -4.30004358291626, "sft_loss": 3.1367976665496826, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 31.491521093892707, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.31725817918777466, "logits/rejected": -0.17649342119693756, "logps/chosen": -3.1203866004943848, "logps/rejected": -4.242855072021484, "loss": 0.4812, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1203866004943848, "rewards/margins": 1.1224688291549683, "rewards/rejected": -4.242855072021484, "sft_loss": 3.316706895828247, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 17.25842303345134, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.4709432125091553, "logits/rejected": -0.22131678462028503, "logps/chosen": -3.094914674758911, "logps/rejected": -4.457337856292725, "loss": 0.4612, "rewards/accuracies": 0.8125, "rewards/chosen": -3.094914674758911, "rewards/margins": 1.362423062324524, "rewards/rejected": -4.457337856292725, "sft_loss": 3.228952407836914, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 35.861043164824565, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.2745920419692993, "logits/rejected": -0.07384338974952698, "logps/chosen": -3.1135764122009277, "logps/rejected": -4.157094478607178, "loss": 0.4824, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1135764122009277, "rewards/margins": 1.0435179471969604, "rewards/rejected": -4.157094478607178, "sft_loss": 3.250995635986328, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 21.065820661965578, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.2862524092197418, "logits/rejected": -0.20901036262512207, "logps/chosen": -3.0392966270446777, "logps/rejected": -4.194875240325928, "loss": 0.4305, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.0392966270446777, "rewards/margins": 1.155578374862671, "rewards/rejected": -4.194875240325928, "sft_loss": 3.0931644439697266, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 14.551933906690897, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.3247146010398865, "logits/rejected": -0.1293286234140396, "logps/chosen": -3.0812928676605225, "logps/rejected": -4.426455497741699, "loss": 0.3984, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.0812928676605225, "rewards/margins": 1.345163106918335, "rewards/rejected": -4.426455497741699, "sft_loss": 3.193646192550659, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 21.83556838472186, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.2696150839328766, "logits/rejected": -0.16448882222175598, "logps/chosen": -2.9790823459625244, "logps/rejected": -4.206761360168457, "loss": 0.4807, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9790823459625244, "rewards/margins": 1.227678894996643, "rewards/rejected": -4.206761360168457, "sft_loss": 3.2123515605926514, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 14.238067056693254, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.23434197902679443, "logits/rejected": -0.2795167863368988, "logps/chosen": -2.9896132946014404, "logps/rejected": -3.9661147594451904, "loss": 0.5082, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.9896132946014404, "rewards/margins": 0.9765016436576843, "rewards/rejected": -3.9661147594451904, "sft_loss": 3.145164728164673, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 15.629585462898145, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.29780083894729614, "logits/rejected": -0.22754064202308655, "logps/chosen": -2.9592719078063965, "logps/rejected": -4.178595542907715, "loss": 0.453, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9592719078063965, "rewards/margins": 1.219322919845581, "rewards/rejected": -4.178595542907715, "sft_loss": 3.040536642074585, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 18.825198907230106, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.43763312697410583, "logits/rejected": -0.22980007529258728, "logps/chosen": -3.007265090942383, "logps/rejected": -4.1483659744262695, "loss": 0.4454, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.007265090942383, "rewards/margins": 1.1411011219024658, "rewards/rejected": -4.1483659744262695, "sft_loss": 3.1913371086120605, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 24.09754601456622, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.25056177377700806, "logits/rejected": -0.24701841175556183, "logps/chosen": -3.0743985176086426, "logps/rejected": -4.054999351501465, "loss": 0.5204, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0743985176086426, "rewards/margins": 0.9806006550788879, "rewards/rejected": -4.054999351501465, "sft_loss": 3.1449050903320312, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 16.841622362463834, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.327300488948822, "logits/rejected": -0.24840061366558075, "logps/chosen": -3.0672824382781982, "logps/rejected": -4.048326015472412, "loss": 0.4871, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.0672824382781982, "rewards/margins": 0.981043815612793, "rewards/rejected": -4.048326015472412, "sft_loss": 3.1784448623657227, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 24.06564336912586, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.3746258020401001, "logits/rejected": -0.16087034344673157, "logps/chosen": -3.024641752243042, "logps/rejected": -4.097342014312744, "loss": 0.4835, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.024641752243042, "rewards/margins": 1.0727002620697021, "rewards/rejected": -4.097342014312744, "sft_loss": 3.2068710327148438, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 13.811767973628212, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.39718228578567505, "logits/rejected": -0.23315775394439697, "logps/chosen": -3.001275062561035, "logps/rejected": -4.263126373291016, "loss": 0.4283, "rewards/accuracies": 0.84375, "rewards/chosen": -3.001275062561035, "rewards/margins": 1.2618507146835327, "rewards/rejected": -4.263126373291016, "sft_loss": 3.128613233566284, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 19.206923805132742, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.24512071907520294, "logits/rejected": -0.19996261596679688, "logps/chosen": -3.0810763835906982, "logps/rejected": -4.056224346160889, "loss": 0.5042, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0810763835906982, "rewards/margins": 0.97514808177948, "rewards/rejected": -4.056224346160889, "sft_loss": 3.3016300201416016, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 18.151418865502585, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.32794076204299927, "logits/rejected": -0.21797947585582733, "logps/chosen": -2.933988094329834, "logps/rejected": -4.380335807800293, "loss": 0.4164, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.933988094329834, "rewards/margins": 1.446347951889038, "rewards/rejected": -4.380335807800293, "sft_loss": 3.140775203704834, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 19.789281082524955, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.39932963252067566, "logits/rejected": -0.13446304202079773, "logps/chosen": -3.127798557281494, "logps/rejected": -4.366634368896484, "loss": 0.4286, "rewards/accuracies": 0.84375, "rewards/chosen": -3.127798557281494, "rewards/margins": 1.2388359308242798, "rewards/rejected": -4.366634368896484, "sft_loss": 3.287930965423584, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 17.89650539798015, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.29105740785598755, "logits/rejected": -0.18979479372501373, "logps/chosen": -2.984229803085327, "logps/rejected": -4.134560585021973, "loss": 0.4395, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.984229803085327, "rewards/margins": 1.1503304243087769, "rewards/rejected": -4.134560585021973, "sft_loss": 3.2013649940490723, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 16.136346966785975, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.3459378182888031, "logits/rejected": -0.17318777740001678, "logps/chosen": -3.1791253089904785, "logps/rejected": -4.12210750579834, "loss": 0.5108, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.1791253089904785, "rewards/margins": 0.9429818987846375, "rewards/rejected": -4.12210750579834, "sft_loss": 3.3328216075897217, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 15.568184572108814, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.4857475161552429, "logits/rejected": -0.24215015769004822, "logps/chosen": -3.0089166164398193, "logps/rejected": -4.201539516448975, "loss": 0.433, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.0089166164398193, "rewards/margins": 1.1926231384277344, "rewards/rejected": -4.201539516448975, "sft_loss": 3.1859726905822754, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 19.49373403612521, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.3905830979347229, "logits/rejected": -0.1188025027513504, "logps/chosen": -2.917149543762207, "logps/rejected": -4.4046220779418945, "loss": 0.3984, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.917149543762207, "rewards/margins": 1.4874722957611084, "rewards/rejected": -4.4046220779418945, "sft_loss": 2.99169921875, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 18.319127277349217, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.27997198700904846, "logits/rejected": -0.21191298961639404, "logps/chosen": -3.1477558612823486, "logps/rejected": -4.176412582397461, "loss": 0.5048, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.1477558612823486, "rewards/margins": 1.0286567211151123, "rewards/rejected": -4.176412582397461, "sft_loss": 3.182608127593994, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 13.206043980986074, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.339926540851593, "logits/rejected": -0.19760119915008545, "logps/chosen": -3.116013765335083, "logps/rejected": -4.474963665008545, "loss": 0.4585, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.116013765335083, "rewards/margins": 1.3589502573013306, "rewards/rejected": -4.474963665008545, "sft_loss": 3.2682881355285645, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 12.092949101875329, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.35985273122787476, "logits/rejected": -0.1344916820526123, "logps/chosen": -3.164681911468506, "logps/rejected": -4.202638626098633, "loss": 0.4885, "rewards/accuracies": 0.78125, "rewards/chosen": -3.164681911468506, "rewards/margins": 1.0379563570022583, "rewards/rejected": -4.202638626098633, "sft_loss": 3.3962695598602295, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 15.77643103729951, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.31049636006355286, "logits/rejected": -0.22904932498931885, "logps/chosen": -3.153508424758911, "logps/rejected": -4.185422420501709, "loss": 0.5176, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.153508424758911, "rewards/margins": 1.0319141149520874, "rewards/rejected": -4.185422420501709, "sft_loss": 3.297576427459717, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 19.787753694119417, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.30364230275154114, "logits/rejected": -0.15516886115074158, "logps/chosen": -3.094975709915161, "logps/rejected": -4.400447368621826, "loss": 0.4194, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.094975709915161, "rewards/margins": 1.3054720163345337, "rewards/rejected": -4.400447368621826, "sft_loss": 3.280155658721924, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 15.444519909963496, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.35556572675704956, "logits/rejected": -0.14741787314414978, "logps/chosen": -3.0733776092529297, "logps/rejected": -4.3367719650268555, "loss": 0.4164, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.0733776092529297, "rewards/margins": 1.2633943557739258, "rewards/rejected": -4.3367719650268555, "sft_loss": 3.206371307373047, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 18.64094817357154, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.4322203993797302, "logits/rejected": -0.3308923840522766, "logps/chosen": -2.9811339378356934, "logps/rejected": -4.230893135070801, "loss": 0.4331, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9811339378356934, "rewards/margins": 1.2497590780258179, "rewards/rejected": -4.230893135070801, "sft_loss": 3.179506540298462, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 13.419565857750332, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.3933342397212982, "logits/rejected": -0.23518791794776917, "logps/chosen": -3.0282037258148193, "logps/rejected": -4.129472255706787, "loss": 0.4647, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0282037258148193, "rewards/margins": 1.1012687683105469, "rewards/rejected": -4.129472255706787, "sft_loss": 3.158400774002075, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 20.2744413481314, "learning_rate": 2.450761014337888e-09, "logits/chosen": -0.19685646891593933, "logits/rejected": -0.13014453649520874, "logps/chosen": -3.0012104511260986, "logps/rejected": -4.314382076263428, "loss": 0.4723, "rewards/accuracies": 0.75, "rewards/chosen": -3.0012104511260986, "rewards/margins": 1.31317138671875, "rewards/rejected": -4.314382076263428, "sft_loss": 3.1396377086639404, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 20.60754015834533, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.2115621566772461, "logits/rejected": -0.11249543726444244, "logps/chosen": -3.1051881313323975, "logps/rejected": -4.277671813964844, "loss": 0.4395, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.1051881313323975, "rewards/margins": 1.1724836826324463, "rewards/rejected": -4.277671813964844, "sft_loss": 3.2242259979248047, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 17.604786632420204, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.390419602394104, "logits/rejected": -0.1984190046787262, "logps/chosen": -3.1073858737945557, "logps/rejected": -4.138300895690918, "loss": 0.4884, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.1073858737945557, "rewards/margins": 1.030915379524231, "rewards/rejected": -4.138300895690918, "sft_loss": 3.204761028289795, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 13.87575549735312, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.34086284041404724, "logits/rejected": -0.1693355292081833, "logps/chosen": -2.8941614627838135, "logps/rejected": -4.1378021240234375, "loss": 0.4606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.8941614627838135, "rewards/margins": 1.2436411380767822, "rewards/rejected": -4.1378021240234375, "sft_loss": 3.085679531097412, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 12.381814139095438, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.4181889593601227, "logits/rejected": -0.18107159435749054, "logps/chosen": -3.1676297187805176, "logps/rejected": -4.195403099060059, "loss": 0.4951, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.1676297187805176, "rewards/margins": 1.0277737379074097, "rewards/rejected": -4.195403099060059, "sft_loss": 3.2065887451171875, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 29.507089109713135, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.33318883180618286, "logits/rejected": -0.24138236045837402, "logps/chosen": -3.0527524948120117, "logps/rejected": -4.077324867248535, "loss": 0.4885, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0527524948120117, "rewards/margins": 1.0245723724365234, "rewards/rejected": -4.077324867248535, "sft_loss": 3.267174482345581, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 15.988901477420473, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.2998291850090027, "logits/rejected": -0.33957356214523315, "logps/chosen": -2.969876766204834, "logps/rejected": -3.991947889328003, "loss": 0.484, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.969876766204834, "rewards/margins": 1.022071361541748, "rewards/rejected": -3.991947889328003, "sft_loss": 3.1517035961151123, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 18.14887590672031, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.33400124311447144, "logits/rejected": -0.20043723285198212, "logps/chosen": -3.100147247314453, "logps/rejected": -4.258397102355957, "loss": 0.4754, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.100147247314453, "rewards/margins": 1.1582508087158203, "rewards/rejected": -4.258397102355957, "sft_loss": 3.2473278045654297, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 20.02783237572497, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.26498645544052124, "logits/rejected": -0.06162431836128235, "logps/chosen": -2.9117674827575684, "logps/rejected": -3.9550652503967285, "loss": 0.4418, "rewards/accuracies": 0.84375, "rewards/chosen": -2.9117674827575684, "rewards/margins": 1.0432971715927124, "rewards/rejected": -3.9550652503967285, "sft_loss": 3.029630661010742, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 19.7209031712253, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.3522023856639862, "logits/rejected": -0.2869417071342468, "logps/chosen": -2.9644052982330322, "logps/rejected": -4.142910957336426, "loss": 0.4551, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9644052982330322, "rewards/margins": 1.1785061359405518, "rewards/rejected": -4.142910957336426, "sft_loss": 3.2182540893554688, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 12.521407433641665, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.41579556465148926, "logits/rejected": -0.19478026032447815, "logps/chosen": -2.9432249069213867, "logps/rejected": -4.327273845672607, "loss": 0.4058, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.9432249069213867, "rewards/margins": 1.3840488195419312, "rewards/rejected": -4.327273845672607, "sft_loss": 3.180227756500244, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 13.05754787349225, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.265876829624176, "logits/rejected": -0.14506739377975464, "logps/chosen": -3.11197566986084, "logps/rejected": -4.149270534515381, "loss": 0.4817, "rewards/accuracies": 0.8125, "rewards/chosen": -3.11197566986084, "rewards/margins": 1.037294864654541, "rewards/rejected": -4.149270534515381, "sft_loss": 3.2263476848602295, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 19.91544793700982, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.2848760783672333, "logits/rejected": -0.2564757466316223, "logps/chosen": -3.073355197906494, "logps/rejected": -4.214011192321777, "loss": 0.4578, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.073355197906494, "rewards/margins": 1.1406558752059937, "rewards/rejected": -4.214011192321777, "sft_loss": 3.2507076263427734, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 17.813566649956503, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.373394250869751, "logits/rejected": -0.20408591628074646, "logps/chosen": -3.0406758785247803, "logps/rejected": -4.433874130249023, "loss": 0.434, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.0406758785247803, "rewards/margins": 1.3931986093521118, "rewards/rejected": -4.433874130249023, "sft_loss": 3.1046528816223145, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 16.244572749023327, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.41688647866249084, "logits/rejected": -0.2552393078804016, "logps/chosen": -3.0431668758392334, "logps/rejected": -4.226834297180176, "loss": 0.4655, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0431668758392334, "rewards/margins": 1.1836671829223633, "rewards/rejected": -4.226834297180176, "sft_loss": 3.256648302078247, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 19.60797999208122, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.30990904569625854, "logits/rejected": -0.222394198179245, "logps/chosen": -3.017165422439575, "logps/rejected": -4.103949069976807, "loss": 0.4548, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.017165422439575, "rewards/margins": 1.0867832899093628, "rewards/rejected": -4.103949069976807, "sft_loss": 3.1367456912994385, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 19.10578857508277, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.3262700140476227, "logits/rejected": -0.16396215558052063, "logps/chosen": -3.2386765480041504, "logps/rejected": -4.5179853439331055, "loss": 0.4615, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.2386765480041504, "rewards/margins": 1.2793089151382446, "rewards/rejected": -4.5179853439331055, "sft_loss": 3.428264617919922, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 14.770948828555499, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.4027184545993805, "logits/rejected": -0.1612546145915985, "logps/chosen": -3.1544299125671387, "logps/rejected": -4.1260833740234375, "loss": 0.4836, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1544299125671387, "rewards/margins": 0.9716532826423645, "rewards/rejected": -4.1260833740234375, "sft_loss": 3.242033004760742, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 23.03402766448594, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.32584530115127563, "logits/rejected": -0.25883787870407104, "logps/chosen": -3.097665309906006, "logps/rejected": -4.14642858505249, "loss": 0.5031, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.097665309906006, "rewards/margins": 1.0487632751464844, "rewards/rejected": -4.14642858505249, "sft_loss": 3.284144878387451, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 19.128617418074587, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.40291205048561096, "logits/rejected": -0.25008195638656616, "logps/chosen": -3.05241060256958, "logps/rejected": -4.164105415344238, "loss": 0.4697, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.05241060256958, "rewards/margins": 1.1116950511932373, "rewards/rejected": -4.164105415344238, "sft_loss": 3.1327614784240723, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 25.594131415702417, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.3280094861984253, "logits/rejected": -0.20888443291187286, "logps/chosen": -2.939770221710205, "logps/rejected": -3.989638566970825, "loss": 0.4704, "rewards/accuracies": 0.8125, "rewards/chosen": -2.939770221710205, "rewards/margins": 1.0498679876327515, "rewards/rejected": -3.989638566970825, "sft_loss": 3.02432918548584, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 15.41544308032071, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.24691219627857208, "logits/rejected": -0.17742550373077393, "logps/chosen": -3.015709400177002, "logps/rejected": -4.0467987060546875, "loss": 0.4573, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.015709400177002, "rewards/margins": 1.031089425086975, "rewards/rejected": -4.0467987060546875, "sft_loss": 3.1204111576080322, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 22.34414377412042, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.3485460579395294, "logits/rejected": -0.14062300324440002, "logps/chosen": -3.0802266597747803, "logps/rejected": -4.3738532066345215, "loss": 0.4465, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0802266597747803, "rewards/margins": 1.2936267852783203, "rewards/rejected": -4.3738532066345215, "sft_loss": 3.1362340450286865, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 20.00966499088373, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.26539528369903564, "logits/rejected": -0.22001326084136963, "logps/chosen": -3.059741497039795, "logps/rejected": -4.021082878112793, "loss": 0.5115, "rewards/accuracies": 0.78125, "rewards/chosen": -3.059741497039795, "rewards/margins": 0.9613416790962219, "rewards/rejected": -4.021082878112793, "sft_loss": 3.258448362350464, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 17.624578233565497, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.35784557461738586, "logits/rejected": -0.22992336750030518, "logps/chosen": -3.0968403816223145, "logps/rejected": -4.2120184898376465, "loss": 0.4963, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0968403816223145, "rewards/margins": 1.115178108215332, "rewards/rejected": -4.2120184898376465, "sft_loss": 3.2440342903137207, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 16.380030236450217, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.3142494559288025, "logits/rejected": -0.15964466333389282, "logps/chosen": -2.9746203422546387, "logps/rejected": -4.431513786315918, "loss": 0.414, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.9746203422546387, "rewards/margins": 1.4568936824798584, "rewards/rejected": -4.431513786315918, "sft_loss": 3.1340255737304688, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 19.14052944165982, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.32244330644607544, "logits/rejected": -0.13985905051231384, "logps/chosen": -2.8702309131622314, "logps/rejected": -4.0701775550842285, "loss": 0.4721, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8702309131622314, "rewards/margins": 1.199946641921997, "rewards/rejected": -4.0701775550842285, "sft_loss": 3.0334982872009277, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 18.04040313922582, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.4082750380039215, "logits/rejected": -0.11360353231430054, "logps/chosen": -3.010322093963623, "logps/rejected": -4.2205352783203125, "loss": 0.4456, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.010322093963623, "rewards/margins": 1.2102131843566895, "rewards/rejected": -4.2205352783203125, "sft_loss": 3.1632823944091797, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 19.264446580620298, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.37494826316833496, "logits/rejected": -0.17825865745544434, "logps/chosen": -3.0055809020996094, "logps/rejected": -4.270408630371094, "loss": 0.4199, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0055809020996094, "rewards/margins": 1.2648277282714844, "rewards/rejected": -4.270408630371094, "sft_loss": 3.263542890548706, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 17.075054798421235, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.2850029468536377, "logits/rejected": -0.2014038860797882, "logps/chosen": -3.143592357635498, "logps/rejected": -4.232028961181641, "loss": 0.4806, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.143592357635498, "rewards/margins": 1.088436484336853, "rewards/rejected": -4.232028961181641, "sft_loss": 3.220813751220703, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 28.69235281888499, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.3260534405708313, "logits/rejected": -0.1166486144065857, "logps/chosen": -3.000702142715454, "logps/rejected": -4.186171531677246, "loss": 0.4439, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.000702142715454, "rewards/margins": 1.185469388961792, "rewards/rejected": -4.186171531677246, "sft_loss": 3.1785988807678223, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 17.986136529078, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.331061452627182, "logits/rejected": -0.1841285526752472, "logps/chosen": -3.105658769607544, "logps/rejected": -4.51476526260376, "loss": 0.4704, "rewards/accuracies": 0.78125, "rewards/chosen": -3.105658769607544, "rewards/margins": 1.4091064929962158, "rewards/rejected": -4.51476526260376, "sft_loss": 3.228262424468994, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.05207514017820358, "eval_logits/rejected": 0.16449151933193207, "eval_logps/chosen": -3.185509204864502, "eval_logps/rejected": -4.173870086669922, "eval_loss": 0.558981716632843, "eval_rewards/accuracies": 0.7225519418716431, "eval_rewards/chosen": -3.185509204864502, "eval_rewards/margins": 0.988361120223999, "eval_rewards/rejected": -4.173870086669922, "eval_runtime": 51.1788, "eval_samples_per_second": 26.28, "eval_sft_loss": 3.316328763961792, "eval_steps_per_second": 6.585, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 0.5562219639746825, "train_runtime": 39292.0664, "train_samples_per_second": 4.565, "train_steps_per_second": 0.143 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }