diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18200 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999297541394882, + "eval_steps": 400, + "global_step": 5604, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002676032781401572, + "grad_norm": 5.960413904919439, + "learning_rate": 8.9126559714795e-09, + "logits/chosen": -0.06863687932491302, + "logits/rejected": 0.14140453934669495, + "logps/chosen": -1.7160797119140625, + "logps/rejected": -1.8894574642181396, + "loss": 0.7285, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.7160797119140625, + "rewards/margins": 0.1733776032924652, + "rewards/rejected": -1.8894574642181396, + "sft_loss": 1.4685349464416504, + "step": 5 + }, + { + "epoch": 0.005352065562803144, + "grad_norm": 10.283120910113055, + "learning_rate": 1.7825311942959e-08, + "logits/chosen": -0.007261426188051701, + "logits/rejected": 0.11421868950128555, + "logps/chosen": -1.8034416437149048, + "logps/rejected": -1.8454005718231201, + "loss": 0.8198, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.8034416437149048, + "rewards/margins": 0.041958972811698914, + "rewards/rejected": -1.8454005718231201, + "sft_loss": 1.5082828998565674, + "step": 10 + }, + { + "epoch": 0.008028098344204716, + "grad_norm": 10.95594283910855, + "learning_rate": 2.67379679144385e-08, + "logits/chosen": -0.03530733659863472, + "logits/rejected": 0.0647643432021141, + "logps/chosen": -1.6338050365447998, + "logps/rejected": -1.7646713256835938, + "loss": 0.8026, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6338050365447998, + "rewards/margins": 0.13086609542369843, + "rewards/rejected": -1.7646713256835938, + "sft_loss": 1.5001604557037354, + "step": 15 + }, + { + "epoch": 0.010704131125606288, + "grad_norm": 6.14709723690152, + "learning_rate": 3.5650623885918e-08, + "logits/chosen": -0.046555109322071075, + "logits/rejected": 0.0392180010676384, + "logps/chosen": -1.7248073816299438, + "logps/rejected": -1.8061683177947998, + "loss": 0.8257, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.7248073816299438, + "rewards/margins": 0.08136085420846939, + "rewards/rejected": -1.8061683177947998, + "sft_loss": 1.5003635883331299, + "step": 20 + }, + { + "epoch": 0.013380163907007862, + "grad_norm": 16.53309947885159, + "learning_rate": 4.45632798573975e-08, + "logits/chosen": -0.06515147536993027, + "logits/rejected": 0.021384865045547485, + "logps/chosen": -1.8690541982650757, + "logps/rejected": -1.7782766819000244, + "loss": 0.9645, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -1.8690541982650757, + "rewards/margins": -0.09077732264995575, + "rewards/rejected": -1.7782766819000244, + "sft_loss": 1.5456018447875977, + "step": 25 + }, + { + "epoch": 0.016056196688409432, + "grad_norm": 9.241174984603704, + "learning_rate": 5.3475935828877e-08, + "logits/chosen": -0.08728420734405518, + "logits/rejected": 0.006704470608383417, + "logps/chosen": -1.9081052541732788, + "logps/rejected": -1.8311151266098022, + "loss": 0.8797, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.9081052541732788, + "rewards/margins": -0.07699020951986313, + "rewards/rejected": -1.8311151266098022, + "sft_loss": 1.6464630365371704, + "step": 30 + }, + { + "epoch": 0.018732229469811006, + "grad_norm": 10.141825796126035, + "learning_rate": 6.23885918003565e-08, + "logits/chosen": -0.06290142983198166, + "logits/rejected": 0.09779242426156998, + "logps/chosen": -1.84554922580719, + "logps/rejected": -1.9966566562652588, + "loss": 0.8619, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.84554922580719, + "rewards/margins": 0.15110749006271362, + "rewards/rejected": -1.9966566562652588, + "sft_loss": 1.5615030527114868, + "step": 35 + }, + { + "epoch": 0.021408262251212576, + "grad_norm": 9.60539728118551, + "learning_rate": 7.1301247771836e-08, + "logits/chosen": 0.018712041899561882, + "logits/rejected": 0.19298934936523438, + "logps/chosen": -1.8813413381576538, + "logps/rejected": -1.7433903217315674, + "loss": 0.907, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -1.8813413381576538, + "rewards/margins": -0.1379513442516327, + "rewards/rejected": -1.7433903217315674, + "sft_loss": 1.51904296875, + "step": 40 + }, + { + "epoch": 0.02408429503261415, + "grad_norm": 15.279796660776759, + "learning_rate": 8.021390374331551e-08, + "logits/chosen": 0.021166115999221802, + "logits/rejected": 0.2210729420185089, + "logps/chosen": -1.83597731590271, + "logps/rejected": -1.8712654113769531, + "loss": 0.8692, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.83597731590271, + "rewards/margins": 0.03528793901205063, + "rewards/rejected": -1.8712654113769531, + "sft_loss": 1.5357192754745483, + "step": 45 + }, + { + "epoch": 0.026760327814015723, + "grad_norm": 12.369890038233116, + "learning_rate": 8.9126559714795e-08, + "logits/chosen": -0.04084717482328415, + "logits/rejected": 0.11164959520101547, + "logps/chosen": -1.8961776494979858, + "logps/rejected": -1.7771739959716797, + "loss": 0.9233, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.8961776494979858, + "rewards/margins": -0.11900367587804794, + "rewards/rejected": -1.7771739959716797, + "sft_loss": 1.5827200412750244, + "step": 50 + }, + { + "epoch": 0.029436360595417294, + "grad_norm": 8.172530142387867, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -0.11221468448638916, + "logits/rejected": 0.11364294588565826, + "logps/chosen": -1.830999732017517, + "logps/rejected": -1.8655872344970703, + "loss": 0.8864, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.830999732017517, + "rewards/margins": 0.034587521106004715, + "rewards/rejected": -1.8655872344970703, + "sft_loss": 1.582080602645874, + "step": 55 + }, + { + "epoch": 0.032112393376818864, + "grad_norm": 7.708516139332287, + "learning_rate": 1.06951871657754e-07, + "logits/chosen": -0.0882994756102562, + "logits/rejected": 0.10581319034099579, + "logps/chosen": -1.786087989807129, + "logps/rejected": -1.8903782367706299, + "loss": 0.7979, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.786087989807129, + "rewards/margins": 0.10429023206233978, + "rewards/rejected": -1.8903782367706299, + "sft_loss": 1.542878270149231, + "step": 60 + }, + { + "epoch": 0.03478842615822044, + "grad_norm": 6.784833173876897, + "learning_rate": 1.158645276292335e-07, + "logits/chosen": -0.021900106221437454, + "logits/rejected": 0.12525932490825653, + "logps/chosen": -1.6341216564178467, + "logps/rejected": -1.7630643844604492, + "loss": 0.7555, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.6341216564178467, + "rewards/margins": 0.128942608833313, + "rewards/rejected": -1.7630643844604492, + "sft_loss": 1.473035454750061, + "step": 65 + }, + { + "epoch": 0.03746445893962201, + "grad_norm": 12.066547435002901, + "learning_rate": 1.24777183600713e-07, + "logits/chosen": -0.06697101891040802, + "logits/rejected": 0.08831731230020523, + "logps/chosen": -1.7613502740859985, + "logps/rejected": -1.8077118396759033, + "loss": 0.8785, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -1.7613502740859985, + "rewards/margins": 0.04636149853467941, + "rewards/rejected": -1.8077118396759033, + "sft_loss": 1.6272966861724854, + "step": 70 + }, + { + "epoch": 0.04014049172102358, + "grad_norm": 13.435717052234807, + "learning_rate": 1.3368983957219251e-07, + "logits/chosen": -0.058324266225099564, + "logits/rejected": 0.12552091479301453, + "logps/chosen": -1.7716388702392578, + "logps/rejected": -2.032058000564575, + "loss": 0.7572, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.7716388702392578, + "rewards/margins": 0.26041918992996216, + "rewards/rejected": -2.032058000564575, + "sft_loss": 1.5637356042861938, + "step": 75 + }, + { + "epoch": 0.04281652450242515, + "grad_norm": 9.171672146207309, + "learning_rate": 1.42602495543672e-07, + "logits/chosen": -0.010003057308495045, + "logits/rejected": 0.09709867835044861, + "logps/chosen": -1.7084449529647827, + "logps/rejected": -1.7419836521148682, + "loss": 0.8303, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.7084449529647827, + "rewards/margins": 0.03353862091898918, + "rewards/rejected": -1.7419836521148682, + "sft_loss": 1.5209019184112549, + "step": 80 + }, + { + "epoch": 0.04549255728382673, + "grad_norm": 6.253780556003053, + "learning_rate": 1.5151515151515152e-07, + "logits/chosen": -0.12957318127155304, + "logits/rejected": 0.1263594627380371, + "logps/chosen": -1.7734349966049194, + "logps/rejected": -1.947437047958374, + "loss": 0.7969, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.7734349966049194, + "rewards/margins": 0.17400197684764862, + "rewards/rejected": -1.947437047958374, + "sft_loss": 1.4895031452178955, + "step": 85 + }, + { + "epoch": 0.0481685900652283, + "grad_norm": 15.574243557399985, + "learning_rate": 1.6042780748663102e-07, + "logits/chosen": 0.10436828434467316, + "logits/rejected": 0.06782497465610504, + "logps/chosen": -1.7253191471099854, + "logps/rejected": -1.758958101272583, + "loss": 0.8626, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.7253191471099854, + "rewards/margins": 0.03363896161317825, + "rewards/rejected": -1.758958101272583, + "sft_loss": 1.4510555267333984, + "step": 90 + }, + { + "epoch": 0.05084462284662987, + "grad_norm": 6.8301329645871744, + "learning_rate": 1.693404634581105e-07, + "logits/chosen": -0.07905842363834381, + "logits/rejected": 0.0742034763097763, + "logps/chosen": -1.7728517055511475, + "logps/rejected": -1.8891079425811768, + "loss": 0.8211, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.7728517055511475, + "rewards/margins": 0.1162562221288681, + "rewards/rejected": -1.8891079425811768, + "sft_loss": 1.513932228088379, + "step": 95 + }, + { + "epoch": 0.05352065562803145, + "grad_norm": 5.552900149636991, + "learning_rate": 1.7825311942959e-07, + "logits/chosen": -0.038597553968429565, + "logits/rejected": 0.027492264285683632, + "logps/chosen": -1.6718080043792725, + "logps/rejected": -1.7776873111724854, + "loss": 0.7783, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.6718080043792725, + "rewards/margins": 0.1058792918920517, + "rewards/rejected": -1.7776873111724854, + "sft_loss": 1.4830631017684937, + "step": 100 + }, + { + "epoch": 0.05619668840943302, + "grad_norm": 10.33948626839002, + "learning_rate": 1.8716577540106952e-07, + "logits/chosen": 0.03208022564649582, + "logits/rejected": 0.06198335438966751, + "logps/chosen": -1.6234149932861328, + "logps/rejected": -1.7916975021362305, + "loss": 0.75, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.6234149932861328, + "rewards/margins": 0.16828235983848572, + "rewards/rejected": -1.7916975021362305, + "sft_loss": 1.4286878108978271, + "step": 105 + }, + { + "epoch": 0.05887272119083459, + "grad_norm": 6.843753554089203, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": -0.016793223097920418, + "logits/rejected": 0.08014251291751862, + "logps/chosen": -1.6443202495574951, + "logps/rejected": -1.7000795602798462, + "loss": 0.8239, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.6443202495574951, + "rewards/margins": 0.05575937032699585, + "rewards/rejected": -1.7000795602798462, + "sft_loss": 1.4516596794128418, + "step": 110 + }, + { + "epoch": 0.06154875397223616, + "grad_norm": 11.566384576174354, + "learning_rate": 2.049910873440285e-07, + "logits/chosen": 0.03288597613573074, + "logits/rejected": 0.24429932236671448, + "logps/chosen": -1.6173032522201538, + "logps/rejected": -1.8868554830551147, + "loss": 0.7148, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6173032522201538, + "rewards/margins": 0.2695521414279938, + "rewards/rejected": -1.8868554830551147, + "sft_loss": 1.541107177734375, + "step": 115 + }, + { + "epoch": 0.06422478675363773, + "grad_norm": 6.913730001102249, + "learning_rate": 2.13903743315508e-07, + "logits/chosen": -0.09573063254356384, + "logits/rejected": 0.07752221822738647, + "logps/chosen": -1.6756315231323242, + "logps/rejected": -1.7910346984863281, + "loss": 0.7763, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6756315231323242, + "rewards/margins": 0.11540316045284271, + "rewards/rejected": -1.7910346984863281, + "sft_loss": 1.5277470350265503, + "step": 120 + }, + { + "epoch": 0.0669008195350393, + "grad_norm": 6.1554867852846415, + "learning_rate": 2.2281639928698751e-07, + "logits/chosen": -0.08294974267482758, + "logits/rejected": 0.050920598208904266, + "logps/chosen": -1.6054375171661377, + "logps/rejected": -1.5668278932571411, + "loss": 0.8437, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.6054375171661377, + "rewards/margins": -0.03860952705144882, + "rewards/rejected": -1.5668278932571411, + "sft_loss": 1.4991239309310913, + "step": 125 + }, + { + "epoch": 0.06957685231644088, + "grad_norm": 9.989171102195591, + "learning_rate": 2.31729055258467e-07, + "logits/chosen": 0.03443840518593788, + "logits/rejected": 0.1720113605260849, + "logps/chosen": -1.6433916091918945, + "logps/rejected": -1.7639652490615845, + "loss": 0.7318, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.6433916091918945, + "rewards/margins": 0.12057371437549591, + "rewards/rejected": -1.7639652490615845, + "sft_loss": 1.5579571723937988, + "step": 130 + }, + { + "epoch": 0.07225288509784245, + "grad_norm": 16.717284162597252, + "learning_rate": 2.406417112299465e-07, + "logits/chosen": -0.05925334617495537, + "logits/rejected": 0.0586424246430397, + "logps/chosen": -1.6929420232772827, + "logps/rejected": -1.7211458683013916, + "loss": 0.8289, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6929420232772827, + "rewards/margins": 0.028203705325722694, + "rewards/rejected": -1.7211458683013916, + "sft_loss": 1.4951034784317017, + "step": 135 + }, + { + "epoch": 0.07492891787924402, + "grad_norm": 10.788978294247835, + "learning_rate": 2.49554367201426e-07, + "logits/chosen": -0.04839114844799042, + "logits/rejected": 0.11875492334365845, + "logps/chosen": -1.6551433801651, + "logps/rejected": -1.7841171026229858, + "loss": 0.7597, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6551433801651, + "rewards/margins": 0.12897393107414246, + "rewards/rejected": -1.7841171026229858, + "sft_loss": 1.5421087741851807, + "step": 140 + }, + { + "epoch": 0.0776049506606456, + "grad_norm": 10.497061215269742, + "learning_rate": 2.5846702317290554e-07, + "logits/chosen": -0.02363504469394684, + "logits/rejected": 0.13335788249969482, + "logps/chosen": -1.5652744770050049, + "logps/rejected": -1.6764402389526367, + "loss": 0.7541, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.5652744770050049, + "rewards/margins": 0.1111656203866005, + "rewards/rejected": -1.6764402389526367, + "sft_loss": 1.4905664920806885, + "step": 145 + }, + { + "epoch": 0.08028098344204716, + "grad_norm": 11.824220911593985, + "learning_rate": 2.6737967914438503e-07, + "logits/chosen": -0.06564657390117645, + "logits/rejected": 0.09819936007261276, + "logps/chosen": -1.5161938667297363, + "logps/rejected": -1.5157290697097778, + "loss": 0.8118, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5161938667297363, + "rewards/margins": -0.00046480895252898335, + "rewards/rejected": -1.5157290697097778, + "sft_loss": 1.3504148721694946, + "step": 150 + }, + { + "epoch": 0.08295701622344874, + "grad_norm": 9.777524019352478, + "learning_rate": 2.762923351158645e-07, + "logits/chosen": -0.052200376987457275, + "logits/rejected": 0.000967084604781121, + "logps/chosen": -1.5276451110839844, + "logps/rejected": -1.6252696514129639, + "loss": 0.7564, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5276451110839844, + "rewards/margins": 0.09762442111968994, + "rewards/rejected": -1.6252696514129639, + "sft_loss": 1.4340415000915527, + "step": 155 + }, + { + "epoch": 0.0856330490048503, + "grad_norm": 10.105912104578321, + "learning_rate": 2.85204991087344e-07, + "logits/chosen": -0.15139448642730713, + "logits/rejected": -0.006771622691303492, + "logps/chosen": -1.638725996017456, + "logps/rejected": -1.6153501272201538, + "loss": 0.8533, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.638725996017456, + "rewards/margins": -0.02337591163814068, + "rewards/rejected": -1.6153501272201538, + "sft_loss": 1.4840831756591797, + "step": 160 + }, + { + "epoch": 0.08830908178625188, + "grad_norm": 9.03891423231266, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -0.08140228688716888, + "logits/rejected": 0.08782564848661423, + "logps/chosen": -1.4777276515960693, + "logps/rejected": -1.598278284072876, + "loss": 0.7631, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4777276515960693, + "rewards/margins": 0.12055070698261261, + "rewards/rejected": -1.598278284072876, + "sft_loss": 1.3730732202529907, + "step": 165 + }, + { + "epoch": 0.09098511456765346, + "grad_norm": 17.13980495820537, + "learning_rate": 3.0303030303030305e-07, + "logits/chosen": -0.08505090326070786, + "logits/rejected": -0.03041454777121544, + "logps/chosen": -1.5955283641815186, + "logps/rejected": -1.6537210941314697, + "loss": 0.7996, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.5955283641815186, + "rewards/margins": 0.058192551136016846, + "rewards/rejected": -1.6537210941314697, + "sft_loss": 1.4852439165115356, + "step": 170 + }, + { + "epoch": 0.09366114734905502, + "grad_norm": 9.642144891608261, + "learning_rate": 3.1194295900178254e-07, + "logits/chosen": 0.038266055285930634, + "logits/rejected": 0.0400555320084095, + "logps/chosen": -1.4493739604949951, + "logps/rejected": -1.5499675273895264, + "loss": 0.7664, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4493739604949951, + "rewards/margins": 0.10059352964162827, + "rewards/rejected": -1.5499675273895264, + "sft_loss": 1.4198567867279053, + "step": 175 + }, + { + "epoch": 0.0963371801304566, + "grad_norm": 9.1649894632413, + "learning_rate": 3.2085561497326203e-07, + "logits/chosen": -0.012896222993731499, + "logits/rejected": -0.012625837698578835, + "logps/chosen": -1.4367154836654663, + "logps/rejected": -1.6282215118408203, + "loss": 0.745, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4367154836654663, + "rewards/margins": 0.1915060132741928, + "rewards/rejected": -1.6282215118408203, + "sft_loss": 1.4140430688858032, + "step": 180 + }, + { + "epoch": 0.09901321291185818, + "grad_norm": 8.447001689686841, + "learning_rate": 3.297682709447415e-07, + "logits/chosen": -0.17267711460590363, + "logits/rejected": -0.08615531027317047, + "logps/chosen": -1.3968846797943115, + "logps/rejected": -1.459826946258545, + "loss": 0.799, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.3968846797943115, + "rewards/margins": 0.06294231116771698, + "rewards/rejected": -1.459826946258545, + "sft_loss": 1.3795416355133057, + "step": 185 + }, + { + "epoch": 0.10168924569325974, + "grad_norm": 10.92962736637001, + "learning_rate": 3.38680926916221e-07, + "logits/chosen": -0.09284885972738266, + "logits/rejected": 0.022700410336256027, + "logps/chosen": -1.3386284112930298, + "logps/rejected": -1.4749207496643066, + "loss": 0.7308, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3386284112930298, + "rewards/margins": 0.13629236817359924, + "rewards/rejected": -1.4749207496643066, + "sft_loss": 1.3290035724639893, + "step": 190 + }, + { + "epoch": 0.10436527847466132, + "grad_norm": 6.43306218127895, + "learning_rate": 3.475935828877005e-07, + "logits/chosen": -0.00904055405408144, + "logits/rejected": 0.14529991149902344, + "logps/chosen": -1.2881543636322021, + "logps/rejected": -1.4579533338546753, + "loss": 0.7028, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2881543636322021, + "rewards/margins": 0.1697990745306015, + "rewards/rejected": -1.4579533338546753, + "sft_loss": 1.3130781650543213, + "step": 195 + }, + { + "epoch": 0.1070413112560629, + "grad_norm": 16.45987301331402, + "learning_rate": 3.5650623885918e-07, + "logits/chosen": -0.11670553684234619, + "logits/rejected": 0.017344793304800987, + "logps/chosen": -1.4145252704620361, + "logps/rejected": -1.4540348052978516, + "loss": 0.7736, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4145252704620361, + "rewards/margins": 0.03950957953929901, + "rewards/rejected": -1.4540348052978516, + "sft_loss": 1.4150238037109375, + "step": 200 + }, + { + "epoch": 0.10971734403746446, + "grad_norm": 14.278219923361767, + "learning_rate": 3.654188948306595e-07, + "logits/chosen": -0.09433043003082275, + "logits/rejected": 0.04523424059152603, + "logps/chosen": -1.3298991918563843, + "logps/rejected": -1.4006279706954956, + "loss": 0.76, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3298991918563843, + "rewards/margins": 0.07072871923446655, + "rewards/rejected": -1.4006279706954956, + "sft_loss": 1.3061649799346924, + "step": 205 + }, + { + "epoch": 0.11239337681886603, + "grad_norm": 10.534106245791639, + "learning_rate": 3.7433155080213904e-07, + "logits/chosen": -0.17358310520648956, + "logits/rejected": 0.012704399414360523, + "logps/chosen": -1.4094555377960205, + "logps/rejected": -1.5506460666656494, + "loss": 0.7436, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4094555377960205, + "rewards/margins": 0.1411905735731125, + "rewards/rejected": -1.5506460666656494, + "sft_loss": 1.3683921098709106, + "step": 210 + }, + { + "epoch": 0.1150694096002676, + "grad_norm": 9.099909343280414, + "learning_rate": 3.8324420677361853e-07, + "logits/chosen": -0.19610531628131866, + "logits/rejected": 0.04747116565704346, + "logps/chosen": -1.4265471696853638, + "logps/rejected": -1.502264142036438, + "loss": 0.7316, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.4265471696853638, + "rewards/margins": 0.07571707665920258, + "rewards/rejected": -1.502264142036438, + "sft_loss": 1.3935734033584595, + "step": 215 + }, + { + "epoch": 0.11774544238166917, + "grad_norm": 16.994371531459464, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": 0.024409957230091095, + "logits/rejected": 0.12192012369632721, + "logps/chosen": -1.3803361654281616, + "logps/rejected": -1.5626693964004517, + "loss": 0.7115, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3803361654281616, + "rewards/margins": 0.1823331117630005, + "rewards/rejected": -1.5626693964004517, + "sft_loss": 1.3772460222244263, + "step": 220 + }, + { + "epoch": 0.12042147516307075, + "grad_norm": 6.560055258862408, + "learning_rate": 4.010695187165775e-07, + "logits/chosen": -0.11784350872039795, + "logits/rejected": 0.04851361736655235, + "logps/chosen": -1.3842450380325317, + "logps/rejected": -1.5287258625030518, + "loss": 0.706, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3842450380325317, + "rewards/margins": 0.14448070526123047, + "rewards/rejected": -1.5287258625030518, + "sft_loss": 1.3530828952789307, + "step": 225 + }, + { + "epoch": 0.12309750794447231, + "grad_norm": 6.752363954250565, + "learning_rate": 4.09982174688057e-07, + "logits/chosen": -0.03928910568356514, + "logits/rejected": 0.036488309502601624, + "logps/chosen": -1.4256141185760498, + "logps/rejected": -1.5976355075836182, + "loss": 0.7152, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4256141185760498, + "rewards/margins": 0.17202138900756836, + "rewards/rejected": -1.5976355075836182, + "sft_loss": 1.3297736644744873, + "step": 230 + }, + { + "epoch": 0.1257735407258739, + "grad_norm": 13.272456740531851, + "learning_rate": 4.188948306595365e-07, + "logits/chosen": -0.01150902546942234, + "logits/rejected": 0.12112773954868317, + "logps/chosen": -1.3811085224151611, + "logps/rejected": -1.5527503490447998, + "loss": 0.6883, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3811085224151611, + "rewards/margins": 0.17164193093776703, + "rewards/rejected": -1.5527503490447998, + "sft_loss": 1.3338648080825806, + "step": 235 + }, + { + "epoch": 0.12844957350727546, + "grad_norm": 5.611535575434795, + "learning_rate": 4.27807486631016e-07, + "logits/chosen": -0.04383648559451103, + "logits/rejected": 0.07974930107593536, + "logps/chosen": -1.3846567869186401, + "logps/rejected": -1.582404613494873, + "loss": 0.7098, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3846567869186401, + "rewards/margins": 0.19774766266345978, + "rewards/rejected": -1.582404613494873, + "sft_loss": 1.3970749378204346, + "step": 240 + }, + { + "epoch": 0.13112560628867703, + "grad_norm": 8.21392188565044, + "learning_rate": 4.3672014260249554e-07, + "logits/chosen": -0.00035706907510757446, + "logits/rejected": 0.11412318050861359, + "logps/chosen": -1.4996440410614014, + "logps/rejected": -1.5583584308624268, + "loss": 0.7687, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4996440410614014, + "rewards/margins": 0.058714479207992554, + "rewards/rejected": -1.5583584308624268, + "sft_loss": 1.4782047271728516, + "step": 245 + }, + { + "epoch": 0.1338016390700786, + "grad_norm": 16.18873353125527, + "learning_rate": 4.4563279857397503e-07, + "logits/chosen": -0.07753191888332367, + "logits/rejected": 0.0833100825548172, + "logps/chosen": -1.424626111984253, + "logps/rejected": -1.492189884185791, + "loss": 0.7836, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.424626111984253, + "rewards/margins": 0.06756364554166794, + "rewards/rejected": -1.492189884185791, + "sft_loss": 1.3677934408187866, + "step": 250 + }, + { + "epoch": 0.1364776718514802, + "grad_norm": 10.420223868825996, + "learning_rate": 4.545454545454545e-07, + "logits/chosen": -0.06023973226547241, + "logits/rejected": 0.07743427902460098, + "logps/chosen": -1.359100341796875, + "logps/rejected": -1.4964516162872314, + "loss": 0.7151, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.359100341796875, + "rewards/margins": 0.1373511254787445, + "rewards/rejected": -1.4964516162872314, + "sft_loss": 1.2914539575576782, + "step": 255 + }, + { + "epoch": 0.13915370463288176, + "grad_norm": 7.087682456530805, + "learning_rate": 4.63458110516934e-07, + "logits/chosen": -0.2527022659778595, + "logits/rejected": -0.1475326120853424, + "logps/chosen": -1.4745581150054932, + "logps/rejected": -1.6276257038116455, + "loss": 0.6963, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4745581150054932, + "rewards/margins": 0.15306781232357025, + "rewards/rejected": -1.6276257038116455, + "sft_loss": 1.4316717386245728, + "step": 260 + }, + { + "epoch": 0.1418297374142833, + "grad_norm": 11.224112493004597, + "learning_rate": 4.723707664884135e-07, + "logits/chosen": -0.11539869010448456, + "logits/rejected": -0.03174302354454994, + "logps/chosen": -1.469599962234497, + "logps/rejected": -1.6331703662872314, + "loss": 0.7424, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.469599962234497, + "rewards/margins": 0.16357028484344482, + "rewards/rejected": -1.6331703662872314, + "sft_loss": 1.4724925756454468, + "step": 265 + }, + { + "epoch": 0.1445057701956849, + "grad_norm": 5.68586018826389, + "learning_rate": 4.81283422459893e-07, + "logits/chosen": -0.10837896913290024, + "logits/rejected": 0.01979227364063263, + "logps/chosen": -1.3989070653915405, + "logps/rejected": -1.5062487125396729, + "loss": 0.7324, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3989070653915405, + "rewards/margins": 0.10734158754348755, + "rewards/rejected": -1.5062487125396729, + "sft_loss": 1.3805054426193237, + "step": 270 + }, + { + "epoch": 0.14718180297708647, + "grad_norm": 9.169435724630166, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -0.03658795356750488, + "logits/rejected": 0.060106754302978516, + "logps/chosen": -1.3430955410003662, + "logps/rejected": -1.5379518270492554, + "loss": 0.7119, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3430955410003662, + "rewards/margins": 0.19485625624656677, + "rewards/rejected": -1.5379518270492554, + "sft_loss": 1.3056552410125732, + "step": 275 + }, + { + "epoch": 0.14985783575848804, + "grad_norm": 11.642343753910186, + "learning_rate": 4.99108734402852e-07, + "logits/chosen": -0.10649528354406357, + "logits/rejected": 0.050676681101322174, + "logps/chosen": -1.4193177223205566, + "logps/rejected": -1.532428503036499, + "loss": 0.744, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4193177223205566, + "rewards/margins": 0.11311081796884537, + "rewards/rejected": -1.532428503036499, + "sft_loss": 1.3757294416427612, + "step": 280 + }, + { + "epoch": 0.15253386853988962, + "grad_norm": 9.477273557866901, + "learning_rate": 5.080213903743315e-07, + "logits/chosen": -0.08700501173734665, + "logits/rejected": 0.05256549268960953, + "logps/chosen": -1.4146549701690674, + "logps/rejected": -1.521597146987915, + "loss": 0.7508, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4146549701690674, + "rewards/margins": 0.10694216191768646, + "rewards/rejected": -1.521597146987915, + "sft_loss": 1.4378408193588257, + "step": 285 + }, + { + "epoch": 0.1552099013212912, + "grad_norm": 7.858316458995477, + "learning_rate": 5.169340463458111e-07, + "logits/chosen": -0.1313084065914154, + "logits/rejected": 0.16333989799022675, + "logps/chosen": -1.4311192035675049, + "logps/rejected": -1.5807135105133057, + "loss": 0.7008, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4311192035675049, + "rewards/margins": 0.14959420263767242, + "rewards/rejected": -1.5807135105133057, + "sft_loss": 1.408268928527832, + "step": 290 + }, + { + "epoch": 0.15788593410269275, + "grad_norm": 10.722054726747858, + "learning_rate": 5.258467023172905e-07, + "logits/chosen": -0.07545675337314606, + "logits/rejected": -0.0185849629342556, + "logps/chosen": -1.356143832206726, + "logps/rejected": -1.4903559684753418, + "loss": 0.7084, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.356143832206726, + "rewards/margins": 0.13421215116977692, + "rewards/rejected": -1.4903559684753418, + "sft_loss": 1.3286858797073364, + "step": 295 + }, + { + "epoch": 0.16056196688409433, + "grad_norm": 8.396936759589622, + "learning_rate": 5.347593582887701e-07, + "logits/chosen": -0.07630442082881927, + "logits/rejected": 0.09307421743869781, + "logps/chosen": -1.3904763460159302, + "logps/rejected": -1.4916353225708008, + "loss": 0.7406, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3904763460159302, + "rewards/margins": 0.10115914046764374, + "rewards/rejected": -1.4916353225708008, + "sft_loss": 1.4179452657699585, + "step": 300 + }, + { + "epoch": 0.1632379996654959, + "grad_norm": 7.5461255699415615, + "learning_rate": 5.436720142602496e-07, + "logits/chosen": -0.021782396361231804, + "logits/rejected": 0.0503949336707592, + "logps/chosen": -1.5006964206695557, + "logps/rejected": -1.5041887760162354, + "loss": 0.8034, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.5006964206695557, + "rewards/margins": 0.0034922778140753508, + "rewards/rejected": -1.5041887760162354, + "sft_loss": 1.470004916191101, + "step": 305 + }, + { + "epoch": 0.16591403244689748, + "grad_norm": 10.379361247174433, + "learning_rate": 5.52584670231729e-07, + "logits/chosen": -0.20494525134563446, + "logits/rejected": -0.11462219059467316, + "logps/chosen": -1.4621378183364868, + "logps/rejected": -1.5664069652557373, + "loss": 0.7687, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.4621378183364868, + "rewards/margins": 0.1042691320180893, + "rewards/rejected": -1.5664069652557373, + "sft_loss": 1.4343583583831787, + "step": 310 + }, + { + "epoch": 0.16859006522829906, + "grad_norm": 12.206956445803208, + "learning_rate": 5.614973262032086e-07, + "logits/chosen": -0.0364091582596302, + "logits/rejected": 0.12094012647867203, + "logps/chosen": -1.4632132053375244, + "logps/rejected": -1.6362049579620361, + "loss": 0.7321, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4632132053375244, + "rewards/margins": 0.17299169301986694, + "rewards/rejected": -1.6362049579620361, + "sft_loss": 1.4501442909240723, + "step": 315 + }, + { + "epoch": 0.1712660980097006, + "grad_norm": 7.8218244456780175, + "learning_rate": 5.70409982174688e-07, + "logits/chosen": -0.06194036453962326, + "logits/rejected": 0.0724940076470375, + "logps/chosen": -1.4121302366256714, + "logps/rejected": -1.473226547241211, + "loss": 0.7475, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4121302366256714, + "rewards/margins": 0.06109621003270149, + "rewards/rejected": -1.473226547241211, + "sft_loss": 1.4019798040390015, + "step": 320 + }, + { + "epoch": 0.17394213079110218, + "grad_norm": 10.966456850479648, + "learning_rate": 5.793226381461676e-07, + "logits/chosen": -0.13376030325889587, + "logits/rejected": -0.014714968390762806, + "logps/chosen": -1.4268195629119873, + "logps/rejected": -1.7038514614105225, + "loss": 0.6965, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4268195629119873, + "rewards/margins": 0.2770320475101471, + "rewards/rejected": -1.7038514614105225, + "sft_loss": 1.4688560962677002, + "step": 325 + }, + { + "epoch": 0.17661816357250376, + "grad_norm": 13.122909386225336, + "learning_rate": 5.88235294117647e-07, + "logits/chosen": -0.014597897417843342, + "logits/rejected": 0.13579130172729492, + "logps/chosen": -1.4340598583221436, + "logps/rejected": -1.6640983819961548, + "loss": 0.6872, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4340598583221436, + "rewards/margins": 0.23003847897052765, + "rewards/rejected": -1.6640983819961548, + "sft_loss": 1.4217156171798706, + "step": 330 + }, + { + "epoch": 0.17929419635390534, + "grad_norm": 14.708678526323988, + "learning_rate": 5.971479500891266e-07, + "logits/chosen": 0.042291101068258286, + "logits/rejected": 0.14999577403068542, + "logps/chosen": -1.467124581336975, + "logps/rejected": -1.5221842527389526, + "loss": 0.7524, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.467124581336975, + "rewards/margins": 0.05505945533514023, + "rewards/rejected": -1.5221842527389526, + "sft_loss": 1.4110476970672607, + "step": 335 + }, + { + "epoch": 0.18197022913530692, + "grad_norm": 13.006814045736004, + "learning_rate": 6.060606060606061e-07, + "logits/chosen": -0.030298244208097458, + "logits/rejected": 0.11877082288265228, + "logps/chosen": -1.5452629327774048, + "logps/rejected": -1.641233205795288, + "loss": 0.776, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.5452629327774048, + "rewards/margins": 0.09597032517194748, + "rewards/rejected": -1.641233205795288, + "sft_loss": 1.4715862274169922, + "step": 340 + }, + { + "epoch": 0.1846462619167085, + "grad_norm": 14.681577501181096, + "learning_rate": 6.149732620320855e-07, + "logits/chosen": 0.03941451385617256, + "logits/rejected": 0.06996399164199829, + "logps/chosen": -1.4516140222549438, + "logps/rejected": -1.6157200336456299, + "loss": 0.7219, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.4516140222549438, + "rewards/margins": 0.16410626471042633, + "rewards/rejected": -1.6157200336456299, + "sft_loss": 1.43868088722229, + "step": 345 + }, + { + "epoch": 0.18732229469811004, + "grad_norm": 11.93474643394763, + "learning_rate": 6.238859180035651e-07, + "logits/chosen": 0.0037063672207295895, + "logits/rejected": 0.09875715523958206, + "logps/chosen": -1.3943617343902588, + "logps/rejected": -1.522147536277771, + "loss": 0.7419, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3943617343902588, + "rewards/margins": 0.12778589129447937, + "rewards/rejected": -1.522147536277771, + "sft_loss": 1.4115302562713623, + "step": 350 + }, + { + "epoch": 0.18999832747951162, + "grad_norm": 8.821079935796888, + "learning_rate": 6.327985739750445e-07, + "logits/chosen": -0.10563834756612778, + "logits/rejected": 0.11631409823894501, + "logps/chosen": -1.481509804725647, + "logps/rejected": -1.5383491516113281, + "loss": 0.7743, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.481509804725647, + "rewards/margins": 0.05683939531445503, + "rewards/rejected": -1.5383491516113281, + "sft_loss": 1.471835732460022, + "step": 355 + }, + { + "epoch": 0.1926743602609132, + "grad_norm": 9.718394718090156, + "learning_rate": 6.417112299465241e-07, + "logits/chosen": -0.07945707440376282, + "logits/rejected": 0.0006481289747171104, + "logps/chosen": -1.433421015739441, + "logps/rejected": -1.5861304998397827, + "loss": 0.7337, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.433421015739441, + "rewards/margins": 0.1527094841003418, + "rewards/rejected": -1.5861304998397827, + "sft_loss": 1.376868486404419, + "step": 360 + }, + { + "epoch": 0.19535039304231477, + "grad_norm": 15.56970534009352, + "learning_rate": 6.506238859180035e-07, + "logits/chosen": -0.010357332415878773, + "logits/rejected": 0.07142248749732971, + "logps/chosen": -1.4000120162963867, + "logps/rejected": -1.4971468448638916, + "loss": 0.7601, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4000120162963867, + "rewards/margins": 0.09713465720415115, + "rewards/rejected": -1.4971468448638916, + "sft_loss": 1.339832067489624, + "step": 365 + }, + { + "epoch": 0.19802642582371635, + "grad_norm": 11.018058444333253, + "learning_rate": 6.59536541889483e-07, + "logits/chosen": -0.04047941416501999, + "logits/rejected": 0.05441926792263985, + "logps/chosen": -1.3855236768722534, + "logps/rejected": -1.4579277038574219, + "loss": 0.7709, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3855236768722534, + "rewards/margins": 0.07240404933691025, + "rewards/rejected": -1.4579277038574219, + "sft_loss": 1.3380168676376343, + "step": 370 + }, + { + "epoch": 0.2007024586051179, + "grad_norm": 13.123972784133054, + "learning_rate": 6.684491978609626e-07, + "logits/chosen": -0.08183223009109497, + "logits/rejected": 0.07531268894672394, + "logps/chosen": -1.3839911222457886, + "logps/rejected": -1.5580480098724365, + "loss": 0.7248, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3839911222457886, + "rewards/margins": 0.17405681312084198, + "rewards/rejected": -1.5580480098724365, + "sft_loss": 1.3942081928253174, + "step": 375 + }, + { + "epoch": 0.20337849138651948, + "grad_norm": 6.492217241230478, + "learning_rate": 6.77361853832442e-07, + "logits/chosen": -0.06026136875152588, + "logits/rejected": 0.022083550691604614, + "logps/chosen": -1.417571783065796, + "logps/rejected": -1.6119505167007446, + "loss": 0.6757, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.417571783065796, + "rewards/margins": 0.19437862932682037, + "rewards/rejected": -1.6119505167007446, + "sft_loss": 1.3851673603057861, + "step": 380 + }, + { + "epoch": 0.20605452416792105, + "grad_norm": 5.285375790935912, + "learning_rate": 6.862745098039216e-07, + "logits/chosen": -0.004461781587451696, + "logits/rejected": 0.0782126933336258, + "logps/chosen": -1.5011136531829834, + "logps/rejected": -1.5273463726043701, + "loss": 0.7812, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.5011136531829834, + "rewards/margins": 0.026232635602355003, + "rewards/rejected": -1.5273463726043701, + "sft_loss": 1.4751291275024414, + "step": 385 + }, + { + "epoch": 0.20873055694932263, + "grad_norm": 12.608344518189268, + "learning_rate": 6.95187165775401e-07, + "logits/chosen": 0.048131298273801804, + "logits/rejected": 0.21444062888622284, + "logps/chosen": -1.5212723016738892, + "logps/rejected": -1.623795747756958, + "loss": 0.7575, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5212723016738892, + "rewards/margins": 0.10252338647842407, + "rewards/rejected": -1.623795747756958, + "sft_loss": 1.47605299949646, + "step": 390 + }, + { + "epoch": 0.2114065897307242, + "grad_norm": 7.482378689086901, + "learning_rate": 7.040998217468806e-07, + "logits/chosen": -0.0485985204577446, + "logits/rejected": 0.11298723518848419, + "logps/chosen": -1.476596474647522, + "logps/rejected": -1.5395801067352295, + "loss": 0.7418, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.476596474647522, + "rewards/margins": 0.06298379600048065, + "rewards/rejected": -1.5395801067352295, + "sft_loss": 1.4238179922103882, + "step": 395 + }, + { + "epoch": 0.2140826225121258, + "grad_norm": 8.087243389889734, + "learning_rate": 7.1301247771836e-07, + "logits/chosen": 0.06979383528232574, + "logits/rejected": 0.1674462854862213, + "logps/chosen": -1.4954628944396973, + "logps/rejected": -1.612022042274475, + "loss": 0.7198, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.4954628944396973, + "rewards/margins": 0.11655900627374649, + "rewards/rejected": -1.612022042274475, + "sft_loss": 1.4028011560440063, + "step": 400 + }, + { + "epoch": 0.2140826225121258, + "eval_logits/chosen": 0.2917894423007965, + "eval_logits/rejected": 0.3831002116203308, + "eval_logps/chosen": -1.5116333961486816, + "eval_logps/rejected": -1.6721185445785522, + "eval_loss": 0.7212684750556946, + "eval_rewards/accuracies": 0.5556379556655884, + "eval_rewards/chosen": -1.5116333961486816, + "eval_rewards/margins": 0.16048528254032135, + "eval_rewards/rejected": -1.6721185445785522, + "eval_runtime": 49.8544, + "eval_samples_per_second": 26.979, + "eval_sft_loss": 1.442051887512207, + "eval_steps_per_second": 6.76, + "step": 400 + }, + { + "epoch": 0.21675865529352734, + "grad_norm": 7.673976548115488, + "learning_rate": 7.219251336898395e-07, + "logits/chosen": 0.0018800109392032027, + "logits/rejected": 0.09711170196533203, + "logps/chosen": -1.5108472108840942, + "logps/rejected": -1.6167656183242798, + "loss": 0.7613, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.5108472108840942, + "rewards/margins": 0.10591830313205719, + "rewards/rejected": -1.6167656183242798, + "sft_loss": 1.4360049962997437, + "step": 405 + }, + { + "epoch": 0.2194346880749289, + "grad_norm": 16.658005859941053, + "learning_rate": 7.30837789661319e-07, + "logits/chosen": 0.012961970642209053, + "logits/rejected": 0.14409925043582916, + "logps/chosen": -1.442063570022583, + "logps/rejected": -1.5798479318618774, + "loss": 0.7233, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.442063570022583, + "rewards/margins": 0.13778413832187653, + "rewards/rejected": -1.5798479318618774, + "sft_loss": 1.4210898876190186, + "step": 410 + }, + { + "epoch": 0.2221107208563305, + "grad_norm": 7.1966242834238425, + "learning_rate": 7.397504456327985e-07, + "logits/chosen": -0.020494680851697922, + "logits/rejected": 0.019982147961854935, + "logps/chosen": -1.4275459051132202, + "logps/rejected": -1.6003360748291016, + "loss": 0.7133, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4275459051132202, + "rewards/margins": 0.172790065407753, + "rewards/rejected": -1.6003360748291016, + "sft_loss": 1.3976166248321533, + "step": 415 + }, + { + "epoch": 0.22478675363773207, + "grad_norm": 8.634312077569724, + "learning_rate": 7.486631016042781e-07, + "logits/chosen": -0.03358606994152069, + "logits/rejected": 0.1547229290008545, + "logps/chosen": -1.3682522773742676, + "logps/rejected": -1.496917963027954, + "loss": 0.74, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3682522773742676, + "rewards/margins": 0.12866561114788055, + "rewards/rejected": -1.496917963027954, + "sft_loss": 1.3897976875305176, + "step": 420 + }, + { + "epoch": 0.22746278641913364, + "grad_norm": 9.656064150679228, + "learning_rate": 7.575757575757575e-07, + "logits/chosen": -0.07592150568962097, + "logits/rejected": 0.11702374368906021, + "logps/chosen": -1.4173481464385986, + "logps/rejected": -1.6226460933685303, + "loss": 0.6801, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4173481464385986, + "rewards/margins": 0.2052977979183197, + "rewards/rejected": -1.6226460933685303, + "sft_loss": 1.4624885320663452, + "step": 425 + }, + { + "epoch": 0.2301388192005352, + "grad_norm": 11.2083299615921, + "learning_rate": 7.664884135472371e-07, + "logits/chosen": -0.10108651965856552, + "logits/rejected": 0.09405249357223511, + "logps/chosen": -1.4314601421356201, + "logps/rejected": -1.637756586074829, + "loss": 0.6848, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4314601421356201, + "rewards/margins": 0.20629656314849854, + "rewards/rejected": -1.637756586074829, + "sft_loss": 1.4554636478424072, + "step": 430 + }, + { + "epoch": 0.23281485198193677, + "grad_norm": 9.669481078109088, + "learning_rate": 7.754010695187165e-07, + "logits/chosen": -0.019327210262417793, + "logits/rejected": 0.07081412523984909, + "logps/chosen": -1.322218656539917, + "logps/rejected": -1.4697790145874023, + "loss": 0.695, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.322218656539917, + "rewards/margins": 0.1475602388381958, + "rewards/rejected": -1.4697790145874023, + "sft_loss": 1.3607923984527588, + "step": 435 + }, + { + "epoch": 0.23549088476333835, + "grad_norm": 10.428163109914133, + "learning_rate": 7.84313725490196e-07, + "logits/chosen": -0.021354446187615395, + "logits/rejected": 0.06936828047037125, + "logps/chosen": -1.3904298543930054, + "logps/rejected": -1.5355784893035889, + "loss": 0.7124, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3904298543930054, + "rewards/margins": 0.14514875411987305, + "rewards/rejected": -1.5355784893035889, + "sft_loss": 1.3935145139694214, + "step": 440 + }, + { + "epoch": 0.23816691754473993, + "grad_norm": 11.07544413713983, + "learning_rate": 7.932263814616755e-07, + "logits/chosen": -0.07900739461183548, + "logits/rejected": 0.03180045634508133, + "logps/chosen": -1.4274473190307617, + "logps/rejected": -1.6190907955169678, + "loss": 0.7381, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4274473190307617, + "rewards/margins": 0.19164356589317322, + "rewards/rejected": -1.6190907955169678, + "sft_loss": 1.435367465019226, + "step": 445 + }, + { + "epoch": 0.2408429503261415, + "grad_norm": 12.53024445306653, + "learning_rate": 8.02139037433155e-07, + "logits/chosen": -0.020625609904527664, + "logits/rejected": 0.10038242489099503, + "logps/chosen": -1.449824571609497, + "logps/rejected": -1.6178346872329712, + "loss": 0.6905, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.449824571609497, + "rewards/margins": 0.16801026463508606, + "rewards/rejected": -1.6178346872329712, + "sft_loss": 1.3960511684417725, + "step": 450 + }, + { + "epoch": 0.24351898310754308, + "grad_norm": 16.569495110491154, + "learning_rate": 8.110516934046346e-07, + "logits/chosen": 0.010531505569815636, + "logits/rejected": 0.10226383060216904, + "logps/chosen": -1.4113497734069824, + "logps/rejected": -1.6499805450439453, + "loss": 0.6805, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4113497734069824, + "rewards/margins": 0.23863062262535095, + "rewards/rejected": -1.6499805450439453, + "sft_loss": 1.3804184198379517, + "step": 455 + }, + { + "epoch": 0.24619501588894463, + "grad_norm": 10.178410801248045, + "learning_rate": 8.19964349376114e-07, + "logits/chosen": -0.14100396633148193, + "logits/rejected": -0.019715065136551857, + "logps/chosen": -1.5282491445541382, + "logps/rejected": -1.6294291019439697, + "loss": 0.7417, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5282491445541382, + "rewards/margins": 0.10117986053228378, + "rewards/rejected": -1.6294291019439697, + "sft_loss": 1.5199025869369507, + "step": 460 + }, + { + "epoch": 0.2488710486703462, + "grad_norm": 13.275624616560746, + "learning_rate": 8.288770053475936e-07, + "logits/chosen": 0.1582765281200409, + "logits/rejected": 0.180677130818367, + "logps/chosen": -1.5120041370391846, + "logps/rejected": -1.699751853942871, + "loss": 0.7239, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.5120041370391846, + "rewards/margins": 0.18774764239788055, + "rewards/rejected": -1.699751853942871, + "sft_loss": 1.4534732103347778, + "step": 465 + }, + { + "epoch": 0.2515470814517478, + "grad_norm": 9.492688236948386, + "learning_rate": 8.37789661319073e-07, + "logits/chosen": 0.16971933841705322, + "logits/rejected": 0.12275818735361099, + "logps/chosen": -1.4381402730941772, + "logps/rejected": -1.641122579574585, + "loss": 0.7046, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4381402730941772, + "rewards/margins": 0.2029825747013092, + "rewards/rejected": -1.641122579574585, + "sft_loss": 1.4118239879608154, + "step": 470 + }, + { + "epoch": 0.25422311423314936, + "grad_norm": 8.759624212285418, + "learning_rate": 8.467023172905525e-07, + "logits/chosen": -0.034986358135938644, + "logits/rejected": 0.11416208744049072, + "logps/chosen": -1.4491407871246338, + "logps/rejected": -1.778569221496582, + "loss": 0.6601, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4491407871246338, + "rewards/margins": 0.329428493976593, + "rewards/rejected": -1.778569221496582, + "sft_loss": 1.4499163627624512, + "step": 475 + }, + { + "epoch": 0.2568991470145509, + "grad_norm": 17.90670295613648, + "learning_rate": 8.55614973262032e-07, + "logits/chosen": -0.00017474293417762965, + "logits/rejected": 0.2095792293548584, + "logps/chosen": -1.4264090061187744, + "logps/rejected": -1.5594055652618408, + "loss": 0.7328, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4264090061187744, + "rewards/margins": 0.13299642503261566, + "rewards/rejected": -1.5594055652618408, + "sft_loss": 1.4186687469482422, + "step": 480 + }, + { + "epoch": 0.2595751797959525, + "grad_norm": 18.812647759724136, + "learning_rate": 8.645276292335115e-07, + "logits/chosen": 0.038185965269804, + "logits/rejected": 0.08179084956645966, + "logps/chosen": -1.5820934772491455, + "logps/rejected": -1.686597466468811, + "loss": 0.755, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5820934772491455, + "rewards/margins": 0.10450379550457001, + "rewards/rejected": -1.686597466468811, + "sft_loss": 1.5076513290405273, + "step": 485 + }, + { + "epoch": 0.26225121257735406, + "grad_norm": 8.799166305860778, + "learning_rate": 8.734402852049911e-07, + "logits/chosen": 0.045380812138319016, + "logits/rejected": 0.1188635379076004, + "logps/chosen": -1.5420925617218018, + "logps/rejected": -1.652745008468628, + "loss": 0.7556, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.5420925617218018, + "rewards/margins": 0.11065268516540527, + "rewards/rejected": -1.652745008468628, + "sft_loss": 1.461670994758606, + "step": 490 + }, + { + "epoch": 0.26492724535875567, + "grad_norm": 14.31453780443302, + "learning_rate": 8.823529411764705e-07, + "logits/chosen": -0.010799932293593884, + "logits/rejected": 0.011995360255241394, + "logps/chosen": -1.5552622079849243, + "logps/rejected": -1.6749614477157593, + "loss": 0.7415, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5552622079849243, + "rewards/margins": 0.11969934403896332, + "rewards/rejected": -1.6749614477157593, + "sft_loss": 1.533546805381775, + "step": 495 + }, + { + "epoch": 0.2676032781401572, + "grad_norm": 9.790672917266049, + "learning_rate": 8.912655971479501e-07, + "logits/chosen": -0.021091187372803688, + "logits/rejected": 0.08316578716039658, + "logps/chosen": -1.4617230892181396, + "logps/rejected": -1.651269555091858, + "loss": 0.7205, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.4617230892181396, + "rewards/margins": 0.189546599984169, + "rewards/rejected": -1.651269555091858, + "sft_loss": 1.4185668230056763, + "step": 500 + }, + { + "epoch": 0.27027931092155877, + "grad_norm": 9.993005038856948, + "learning_rate": 9.001782531194295e-07, + "logits/chosen": -0.0358644500374794, + "logits/rejected": 0.11196329444646835, + "logps/chosen": -1.5400495529174805, + "logps/rejected": -1.6119331121444702, + "loss": 0.7458, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.5400495529174805, + "rewards/margins": 0.0718836560845375, + "rewards/rejected": -1.6119331121444702, + "sft_loss": 1.4943927526474, + "step": 505 + }, + { + "epoch": 0.2729553437029604, + "grad_norm": 7.301408856819212, + "learning_rate": 9.09090909090909e-07, + "logits/chosen": 0.12172931432723999, + "logits/rejected": 0.186055988073349, + "logps/chosen": -1.4746077060699463, + "logps/rejected": -1.6851272583007812, + "loss": 0.6833, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4746077060699463, + "rewards/margins": 0.21051974594593048, + "rewards/rejected": -1.6851272583007812, + "sft_loss": 1.4081268310546875, + "step": 510 + }, + { + "epoch": 0.2756313764843619, + "grad_norm": 8.16894501894189, + "learning_rate": 9.180035650623885e-07, + "logits/chosen": 0.08651556074619293, + "logits/rejected": 0.18817943334579468, + "logps/chosen": -1.3984525203704834, + "logps/rejected": -1.5859407186508179, + "loss": 0.6888, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3984525203704834, + "rewards/margins": 0.18748828768730164, + "rewards/rejected": -1.5859407186508179, + "sft_loss": 1.3933144807815552, + "step": 515 + }, + { + "epoch": 0.27830740926576353, + "grad_norm": 6.611398368137036, + "learning_rate": 9.26916221033868e-07, + "logits/chosen": -0.05394362285733223, + "logits/rejected": 0.08871600031852722, + "logps/chosen": -1.4527640342712402, + "logps/rejected": -1.6061521768569946, + "loss": 0.725, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4527640342712402, + "rewards/margins": 0.15338827669620514, + "rewards/rejected": -1.6061521768569946, + "sft_loss": 1.5176591873168945, + "step": 520 + }, + { + "epoch": 0.2809834420471651, + "grad_norm": 16.311152981957626, + "learning_rate": 9.358288770053476e-07, + "logits/chosen": 0.14393463730812073, + "logits/rejected": 0.221922367811203, + "logps/chosen": -1.4483121633529663, + "logps/rejected": -1.6912574768066406, + "loss": 0.7012, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4483121633529663, + "rewards/margins": 0.24294531345367432, + "rewards/rejected": -1.6912574768066406, + "sft_loss": 1.4978748559951782, + "step": 525 + }, + { + "epoch": 0.2836594748285666, + "grad_norm": 7.814584788227321, + "learning_rate": 9.44741532976827e-07, + "logits/chosen": 0.11198891699314117, + "logits/rejected": 0.2009311467409134, + "logps/chosen": -1.403903603553772, + "logps/rejected": -1.5798251628875732, + "loss": 0.7275, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.403903603553772, + "rewards/margins": 0.17592160403728485, + "rewards/rejected": -1.5798251628875732, + "sft_loss": 1.3508328199386597, + "step": 530 + }, + { + "epoch": 0.28633550760996823, + "grad_norm": 8.901087935063803, + "learning_rate": 9.536541889483066e-07, + "logits/chosen": -0.05171307176351547, + "logits/rejected": 0.2174970805644989, + "logps/chosen": -1.4211546182632446, + "logps/rejected": -1.5635936260223389, + "loss": 0.6988, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4211546182632446, + "rewards/margins": 0.1424390822649002, + "rewards/rejected": -1.5635936260223389, + "sft_loss": 1.3733739852905273, + "step": 535 + }, + { + "epoch": 0.2890115403913698, + "grad_norm": 6.850510093446501, + "learning_rate": 9.62566844919786e-07, + "logits/chosen": 0.06330250203609467, + "logits/rejected": 0.1406676471233368, + "logps/chosen": -1.565857172012329, + "logps/rejected": -1.7019507884979248, + "loss": 0.7262, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.565857172012329, + "rewards/margins": 0.1360936015844345, + "rewards/rejected": -1.7019507884979248, + "sft_loss": 1.572394609451294, + "step": 540 + }, + { + "epoch": 0.2916875731727714, + "grad_norm": 7.960687627957905, + "learning_rate": 9.714795008912655e-07, + "logits/chosen": -0.04025112837553024, + "logits/rejected": 0.16977599263191223, + "logps/chosen": -1.4802181720733643, + "logps/rejected": -1.6751117706298828, + "loss": 0.6705, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4802181720733643, + "rewards/margins": 0.19489355385303497, + "rewards/rejected": -1.6751117706298828, + "sft_loss": 1.4630837440490723, + "step": 545 + }, + { + "epoch": 0.29436360595417294, + "grad_norm": 8.11698133990825, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": 0.12934860587120056, + "logits/rejected": 0.20263417065143585, + "logps/chosen": -1.5041999816894531, + "logps/rejected": -1.6858787536621094, + "loss": 0.6857, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5041999816894531, + "rewards/margins": 0.18167902529239655, + "rewards/rejected": -1.6858787536621094, + "sft_loss": 1.4602586030960083, + "step": 550 + }, + { + "epoch": 0.2970396387355745, + "grad_norm": 14.065711458465282, + "learning_rate": 9.893048128342244e-07, + "logits/chosen": -0.009817545302212238, + "logits/rejected": 0.11904706805944443, + "logps/chosen": -1.6011021137237549, + "logps/rejected": -1.7125682830810547, + "loss": 0.7452, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.6011021137237549, + "rewards/margins": 0.11146605014801025, + "rewards/rejected": -1.7125682830810547, + "sft_loss": 1.5662167072296143, + "step": 555 + }, + { + "epoch": 0.2997156715169761, + "grad_norm": 11.494237494395797, + "learning_rate": 9.98217468805704e-07, + "logits/chosen": 0.10378079116344452, + "logits/rejected": 0.12546811997890472, + "logps/chosen": -1.4556339979171753, + "logps/rejected": -1.6442264318466187, + "loss": 0.6829, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4556339979171753, + "rewards/margins": 0.1885923445224762, + "rewards/rejected": -1.6442264318466187, + "sft_loss": 1.5452286005020142, + "step": 560 + }, + { + "epoch": 0.30239170429837764, + "grad_norm": 7.026135520570362, + "learning_rate": 9.999984476788462e-07, + "logits/chosen": 0.07938935607671738, + "logits/rejected": 0.14415811002254486, + "logps/chosen": -1.5560821294784546, + "logps/rejected": -1.753104567527771, + "loss": 0.6979, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5560821294784546, + "rewards/margins": 0.19702255725860596, + "rewards/rejected": -1.753104567527771, + "sft_loss": 1.553784966468811, + "step": 565 + }, + { + "epoch": 0.30506773707977924, + "grad_norm": 13.085706709126768, + "learning_rate": 9.999921413906797e-07, + "logits/chosen": -0.0006553709390573204, + "logits/rejected": 0.23008818924427032, + "logps/chosen": -1.5384807586669922, + "logps/rejected": -1.6943981647491455, + "loss": 0.718, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.5384807586669922, + "rewards/margins": 0.15591737627983093, + "rewards/rejected": -1.6943981647491455, + "sft_loss": 1.565394401550293, + "step": 570 + }, + { + "epoch": 0.3077437698611808, + "grad_norm": 7.562416716843187, + "learning_rate": 9.999809841765644e-07, + "logits/chosen": 0.01612495258450508, + "logits/rejected": 0.07761206477880478, + "logps/chosen": -1.4772237539291382, + "logps/rejected": -1.6728931665420532, + "loss": 0.7012, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4772237539291382, + "rewards/margins": 0.19566944241523743, + "rewards/rejected": -1.6728931665420532, + "sft_loss": 1.4823495149612427, + "step": 575 + }, + { + "epoch": 0.3104198026425824, + "grad_norm": 7.27759841113723, + "learning_rate": 9.999649761447477e-07, + "logits/chosen": 0.02421320043504238, + "logits/rejected": 0.18904590606689453, + "logps/chosen": -1.4877642393112183, + "logps/rejected": -1.7540662288665771, + "loss": 0.6545, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4877642393112183, + "rewards/margins": 0.2663021683692932, + "rewards/rejected": -1.7540662288665771, + "sft_loss": 1.4846785068511963, + "step": 580 + }, + { + "epoch": 0.31309583542398395, + "grad_norm": 8.545572857334468, + "learning_rate": 9.999441174505398e-07, + "logits/chosen": -0.03330926224589348, + "logits/rejected": 0.07859676331281662, + "logps/chosen": -1.694360375404358, + "logps/rejected": -1.8174225091934204, + "loss": 0.7547, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.694360375404358, + "rewards/margins": 0.12306202948093414, + "rewards/rejected": -1.8174225091934204, + "sft_loss": 1.6499773263931274, + "step": 585 + }, + { + "epoch": 0.3157718682053855, + "grad_norm": 20.068562800951632, + "learning_rate": 9.999184082963116e-07, + "logits/chosen": 0.02297041192650795, + "logits/rejected": 0.15485970675945282, + "logps/chosen": -1.6264880895614624, + "logps/rejected": -1.7158950567245483, + "loss": 0.737, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.6264880895614624, + "rewards/margins": 0.08940695226192474, + "rewards/rejected": -1.7158950567245483, + "sft_loss": 1.6207870244979858, + "step": 590 + }, + { + "epoch": 0.3184479009867871, + "grad_norm": 11.548169014320758, + "learning_rate": 9.998878489314937e-07, + "logits/chosen": 0.08564107120037079, + "logits/rejected": 0.22062385082244873, + "logps/chosen": -1.5454473495483398, + "logps/rejected": -1.7348639965057373, + "loss": 0.6962, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.5454473495483398, + "rewards/margins": 0.18941658735275269, + "rewards/rejected": -1.7348639965057373, + "sft_loss": 1.540062665939331, + "step": 595 + }, + { + "epoch": 0.32112393376818865, + "grad_norm": 7.903169561620104, + "learning_rate": 9.99852439652573e-07, + "logits/chosen": 0.03412085771560669, + "logits/rejected": 0.19069956243038177, + "logps/chosen": -1.5859081745147705, + "logps/rejected": -1.7322183847427368, + "loss": 0.7088, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.5859081745147705, + "rewards/margins": 0.1463100016117096, + "rewards/rejected": -1.7322183847427368, + "sft_loss": 1.5760464668273926, + "step": 600 + }, + { + "epoch": 0.32379996654959026, + "grad_norm": 12.78791682535348, + "learning_rate": 9.998121808030904e-07, + "logits/chosen": -0.016653526574373245, + "logits/rejected": 0.07038216292858124, + "logps/chosen": -1.7270981073379517, + "logps/rejected": -1.93124520778656, + "loss": 0.6967, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.7270981073379517, + "rewards/margins": 0.20414705574512482, + "rewards/rejected": -1.93124520778656, + "sft_loss": 1.6932952404022217, + "step": 605 + }, + { + "epoch": 0.3264759993309918, + "grad_norm": 19.968400745148678, + "learning_rate": 9.997670727736379e-07, + "logits/chosen": 0.10672630369663239, + "logits/rejected": 0.27668988704681396, + "logps/chosen": -1.6665118932724, + "logps/rejected": -1.8740952014923096, + "loss": 0.6986, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6665118932724, + "rewards/margins": 0.20758314430713654, + "rewards/rejected": -1.8740952014923096, + "sft_loss": 1.6268879175186157, + "step": 610 + }, + { + "epoch": 0.32915203211239336, + "grad_norm": 6.570304438637934, + "learning_rate": 9.99717116001853e-07, + "logits/chosen": 0.028351670131087303, + "logits/rejected": 0.14044256508350372, + "logps/chosen": -1.6373220682144165, + "logps/rejected": -1.9211671352386475, + "loss": 0.6661, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.6373220682144165, + "rewards/margins": 0.2838451862335205, + "rewards/rejected": -1.9211671352386475, + "sft_loss": 1.6178901195526123, + "step": 615 + }, + { + "epoch": 0.33182806489379496, + "grad_norm": 8.128605670361818, + "learning_rate": 9.996623109724173e-07, + "logits/chosen": 0.1174500435590744, + "logits/rejected": 0.18964903056621552, + "logps/chosen": -1.7414772510528564, + "logps/rejected": -1.9283758401870728, + "loss": 0.7044, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.7414772510528564, + "rewards/margins": 0.1868988275527954, + "rewards/rejected": -1.9283758401870728, + "sft_loss": 1.7075878381729126, + "step": 620 + }, + { + "epoch": 0.3345040976751965, + "grad_norm": 11.036972242567145, + "learning_rate": 9.996026582170488e-07, + "logits/chosen": 0.1186809092760086, + "logits/rejected": 0.2449244260787964, + "logps/chosen": -1.6226027011871338, + "logps/rejected": -1.9033540487289429, + "loss": 0.6386, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6226027011871338, + "rewards/margins": 0.28075140714645386, + "rewards/rejected": -1.9033540487289429, + "sft_loss": 1.6231905221939087, + "step": 625 + }, + { + "epoch": 0.3371801304565981, + "grad_norm": 10.804970717169716, + "learning_rate": 9.995381583144996e-07, + "logits/chosen": 0.04840857535600662, + "logits/rejected": 0.16239185631275177, + "logps/chosen": -1.6699421405792236, + "logps/rejected": -1.9459302425384521, + "loss": 0.6439, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6699421405792236, + "rewards/margins": 0.27598804235458374, + "rewards/rejected": -1.9459302425384521, + "sft_loss": 1.6243393421173096, + "step": 630 + }, + { + "epoch": 0.33985616323799966, + "grad_norm": 7.257398978817098, + "learning_rate": 9.994688118905471e-07, + "logits/chosen": 0.04751784726977348, + "logits/rejected": 0.2962645888328552, + "logps/chosen": -1.761138677597046, + "logps/rejected": -1.9825347661972046, + "loss": 0.6997, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.761138677597046, + "rewards/margins": 0.22139613330364227, + "rewards/rejected": -1.9825347661972046, + "sft_loss": 1.7507600784301758, + "step": 635 + }, + { + "epoch": 0.3425321960194012, + "grad_norm": 18.5328736159243, + "learning_rate": 9.993946196179912e-07, + "logits/chosen": -0.047339897602796555, + "logits/rejected": 0.159702330827713, + "logps/chosen": -1.7461141347885132, + "logps/rejected": -1.9744231700897217, + "loss": 0.7026, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.7461141347885132, + "rewards/margins": 0.22830912470817566, + "rewards/rejected": -1.9744231700897217, + "sft_loss": 1.7676417827606201, + "step": 640 + }, + { + "epoch": 0.3452082288008028, + "grad_norm": 9.49519419022593, + "learning_rate": 9.993155822166455e-07, + "logits/chosen": -0.03701364994049072, + "logits/rejected": 0.047867387533187866, + "logps/chosen": -1.6813396215438843, + "logps/rejected": -1.9668573141098022, + "loss": 0.6555, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6813396215438843, + "rewards/margins": 0.28551778197288513, + "rewards/rejected": -1.9668573141098022, + "sft_loss": 1.6320127248764038, + "step": 645 + }, + { + "epoch": 0.34788426158220437, + "grad_norm": 14.110541145250972, + "learning_rate": 9.992317004533313e-07, + "logits/chosen": 0.023934122174978256, + "logits/rejected": 0.16555899381637573, + "logps/chosen": -1.8226124048233032, + "logps/rejected": -2.125878095626831, + "loss": 0.6608, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.8226124048233032, + "rewards/margins": 0.3032657504081726, + "rewards/rejected": -2.125878095626831, + "sft_loss": 1.8263565301895142, + "step": 650 + }, + { + "epoch": 0.350560294363606, + "grad_norm": 14.501783200578824, + "learning_rate": 9.991429751418696e-07, + "logits/chosen": 0.0873773917555809, + "logits/rejected": 0.10016919672489166, + "logps/chosen": -1.7935062646865845, + "logps/rejected": -2.083219289779663, + "loss": 0.6994, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.7935062646865845, + "rewards/margins": 0.2897128462791443, + "rewards/rejected": -2.083219289779663, + "sft_loss": 1.787320852279663, + "step": 655 + }, + { + "epoch": 0.3532363271450075, + "grad_norm": 11.901703209101552, + "learning_rate": 9.99049407143074e-07, + "logits/chosen": 0.058928169310092926, + "logits/rejected": 0.18308427929878235, + "logps/chosen": -1.7760273218154907, + "logps/rejected": -1.9454076290130615, + "loss": 0.704, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.7760273218154907, + "rewards/margins": 0.16938039660453796, + "rewards/rejected": -1.9454076290130615, + "sft_loss": 1.7750753164291382, + "step": 660 + }, + { + "epoch": 0.35591235992640907, + "grad_norm": 7.257840074108619, + "learning_rate": 9.989509973647416e-07, + "logits/chosen": 0.04965372011065483, + "logits/rejected": 0.18989132344722748, + "logps/chosen": -1.7178142070770264, + "logps/rejected": -1.983058214187622, + "loss": 0.6622, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7178142070770264, + "rewards/margins": 0.26524391770362854, + "rewards/rejected": -1.983058214187622, + "sft_loss": 1.7471628189086914, + "step": 665 + }, + { + "epoch": 0.3585883927078107, + "grad_norm": 9.43646865006057, + "learning_rate": 9.988477467616445e-07, + "logits/chosen": 0.02612454816699028, + "logits/rejected": 0.22704274952411652, + "logps/chosen": -1.760839819908142, + "logps/rejected": -1.984262228012085, + "loss": 0.6536, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.760839819908142, + "rewards/margins": 0.22342228889465332, + "rewards/rejected": -1.984262228012085, + "sft_loss": 1.8557103872299194, + "step": 670 + }, + { + "epoch": 0.3612644254892122, + "grad_norm": 12.75217129491488, + "learning_rate": 9.987396563355205e-07, + "logits/chosen": 0.03415621817111969, + "logits/rejected": 0.11213034391403198, + "logps/chosen": -1.7600212097167969, + "logps/rejected": -2.105306625366211, + "loss": 0.6401, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.7600212097167969, + "rewards/margins": 0.3452851176261902, + "rewards/rejected": -2.105306625366211, + "sft_loss": 1.8122920989990234, + "step": 675 + }, + { + "epoch": 0.36394045827061383, + "grad_norm": 14.136886280968675, + "learning_rate": 9.986267271350631e-07, + "logits/chosen": 0.12487022578716278, + "logits/rejected": 0.2956189513206482, + "logps/chosen": -1.863404631614685, + "logps/rejected": -2.0956015586853027, + "loss": 0.7424, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.863404631614685, + "rewards/margins": 0.23219680786132812, + "rewards/rejected": -2.0956015586853027, + "sft_loss": 1.8058325052261353, + "step": 680 + }, + { + "epoch": 0.3666164910520154, + "grad_norm": 23.573937335629136, + "learning_rate": 9.985089602559123e-07, + "logits/chosen": 0.08845379203557968, + "logits/rejected": 0.24961254000663757, + "logps/chosen": -1.875475525856018, + "logps/rejected": -2.1358890533447266, + "loss": 0.6855, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.875475525856018, + "rewards/margins": 0.2604133188724518, + "rewards/rejected": -2.1358890533447266, + "sft_loss": 1.8293803930282593, + "step": 685 + }, + { + "epoch": 0.369292523833417, + "grad_norm": 10.229793160551953, + "learning_rate": 9.983863568406428e-07, + "logits/chosen": 0.10676054656505585, + "logits/rejected": 0.14324404299259186, + "logps/chosen": -1.8982778787612915, + "logps/rejected": -2.15161395072937, + "loss": 0.6836, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.8982778787612915, + "rewards/margins": 0.25333622097969055, + "rewards/rejected": -2.15161395072937, + "sft_loss": 1.9185289144515991, + "step": 690 + }, + { + "epoch": 0.37196855661481854, + "grad_norm": 8.294142865496967, + "learning_rate": 9.982589180787532e-07, + "logits/chosen": 0.06274578720331192, + "logits/rejected": 0.1509443074464798, + "logps/chosen": -1.7770349979400635, + "logps/rejected": -2.099757194519043, + "loss": 0.6382, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.7770349979400635, + "rewards/margins": 0.3227222263813019, + "rewards/rejected": -2.099757194519043, + "sft_loss": 1.8025732040405273, + "step": 695 + }, + { + "epoch": 0.3746445893962201, + "grad_norm": 12.13623780002283, + "learning_rate": 9.981266452066553e-07, + "logits/chosen": -0.0386006124317646, + "logits/rejected": 0.1029144749045372, + "logps/chosen": -1.9709434509277344, + "logps/rejected": -2.2089548110961914, + "loss": 0.6667, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9709434509277344, + "rewards/margins": 0.2380111962556839, + "rewards/rejected": -2.2089548110961914, + "sft_loss": 1.9142709970474243, + "step": 700 + }, + { + "epoch": 0.3773206221776217, + "grad_norm": 10.247038800111389, + "learning_rate": 9.979895395076608e-07, + "logits/chosen": -0.0016391247045248747, + "logits/rejected": 0.16924506425857544, + "logps/chosen": -1.9363353252410889, + "logps/rejected": -2.3193938732147217, + "loss": 0.6286, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.9363353252410889, + "rewards/margins": 0.38305872678756714, + "rewards/rejected": -2.3193938732147217, + "sft_loss": 1.9278072118759155, + "step": 705 + }, + { + "epoch": 0.37999665495902324, + "grad_norm": 10.824747447247148, + "learning_rate": 9.9784760231197e-07, + "logits/chosen": 0.10746796429157257, + "logits/rejected": 0.20378378033638, + "logps/chosen": -1.9353691339492798, + "logps/rejected": -2.2340164184570312, + "loss": 0.6387, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.9353691339492798, + "rewards/margins": 0.2986472249031067, + "rewards/rejected": -2.2340164184570312, + "sft_loss": 1.8967138528823853, + "step": 710 + }, + { + "epoch": 0.38267268774042484, + "grad_norm": 11.75047292485389, + "learning_rate": 9.97700834996658e-07, + "logits/chosen": 0.030619556084275246, + "logits/rejected": 0.19418159127235413, + "logps/chosen": -2.069227933883667, + "logps/rejected": -2.339308261871338, + "loss": 0.6662, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.069227933883667, + "rewards/margins": 0.27007991075515747, + "rewards/rejected": -2.339308261871338, + "sft_loss": 1.9816768169403076, + "step": 715 + }, + { + "epoch": 0.3853487205218264, + "grad_norm": 17.695396071152995, + "learning_rate": 9.97549238985662e-07, + "logits/chosen": 0.09588191658258438, + "logits/rejected": 0.28006869554519653, + "logps/chosen": -2.1513028144836426, + "logps/rejected": -2.4341063499450684, + "loss": 0.697, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.1513028144836426, + "rewards/margins": 0.2828032970428467, + "rewards/rejected": -2.4341063499450684, + "sft_loss": 2.1287317276000977, + "step": 720 + }, + { + "epoch": 0.38802475330322794, + "grad_norm": 10.947508153385135, + "learning_rate": 9.973928157497674e-07, + "logits/chosen": -0.03072550520300865, + "logits/rejected": 0.10771825164556503, + "logps/chosen": -1.934647798538208, + "logps/rejected": -2.3204731941223145, + "loss": 0.6085, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.934647798538208, + "rewards/margins": 0.3858256936073303, + "rewards/rejected": -2.3204731941223145, + "sft_loss": 1.9882686138153076, + "step": 725 + }, + { + "epoch": 0.39070078608462955, + "grad_norm": 19.265460613648305, + "learning_rate": 9.972315668065927e-07, + "logits/chosen": -0.05715986341238022, + "logits/rejected": 0.0929824709892273, + "logps/chosen": -2.092118978500366, + "logps/rejected": -2.363680124282837, + "loss": 0.6823, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.092118978500366, + "rewards/margins": 0.2715609073638916, + "rewards/rejected": -2.363680124282837, + "sft_loss": 2.1064469814300537, + "step": 730 + }, + { + "epoch": 0.3933768188660311, + "grad_norm": 10.638788847537189, + "learning_rate": 9.97065493720576e-07, + "logits/chosen": -0.022910332307219505, + "logits/rejected": 0.07507751882076263, + "logps/chosen": -2.084163188934326, + "logps/rejected": -2.3188323974609375, + "loss": 0.6646, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.084163188934326, + "rewards/margins": 0.23466899991035461, + "rewards/rejected": -2.3188323974609375, + "sft_loss": 2.1011803150177, + "step": 735 + }, + { + "epoch": 0.3960528516474327, + "grad_norm": 15.490968768923501, + "learning_rate": 9.968945981029594e-07, + "logits/chosen": -0.026397373527288437, + "logits/rejected": 0.13944368064403534, + "logps/chosen": -2.146524667739868, + "logps/rejected": -2.450518846511841, + "loss": 0.6567, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.146524667739868, + "rewards/margins": 0.3039940893650055, + "rewards/rejected": -2.450518846511841, + "sft_loss": 2.1449663639068604, + "step": 740 + }, + { + "epoch": 0.39872888442883425, + "grad_norm": 6.928646909317261, + "learning_rate": 9.967188816117726e-07, + "logits/chosen": 0.09138406813144684, + "logits/rejected": 0.16776150465011597, + "logps/chosen": -2.144055128097534, + "logps/rejected": -2.5063459873199463, + "loss": 0.6832, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.144055128097534, + "rewards/margins": 0.3622905910015106, + "rewards/rejected": -2.5063459873199463, + "sft_loss": 2.130777359008789, + "step": 745 + }, + { + "epoch": 0.4014049172102358, + "grad_norm": 17.01523123010893, + "learning_rate": 9.965383459518179e-07, + "logits/chosen": 0.044910602271556854, + "logits/rejected": 0.21089403331279755, + "logps/chosen": -2.10158109664917, + "logps/rejected": -2.466341972351074, + "loss": 0.6458, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.10158109664917, + "rewards/margins": 0.3647606372833252, + "rewards/rejected": -2.466341972351074, + "sft_loss": 2.08829927444458, + "step": 750 + }, + { + "epoch": 0.4040809499916374, + "grad_norm": 9.971064318414882, + "learning_rate": 9.963529928746533e-07, + "logits/chosen": 0.0854736715555191, + "logits/rejected": 0.22234101593494415, + "logps/chosen": -2.100032091140747, + "logps/rejected": -2.390237331390381, + "loss": 0.6847, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.100032091140747, + "rewards/margins": 0.29020509123802185, + "rewards/rejected": -2.390237331390381, + "sft_loss": 2.0968873500823975, + "step": 755 + }, + { + "epoch": 0.40675698277303896, + "grad_norm": 6.942490028615991, + "learning_rate": 9.961628241785746e-07, + "logits/chosen": -0.019798316061496735, + "logits/rejected": 0.06015967205166817, + "logps/chosen": -2.1238718032836914, + "logps/rejected": -2.4312360286712646, + "loss": 0.6619, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.1238718032836914, + "rewards/margins": 0.30736416578292847, + "rewards/rejected": -2.4312360286712646, + "sft_loss": 2.1534500122070312, + "step": 760 + }, + { + "epoch": 0.40943301555444056, + "grad_norm": 8.678598883925492, + "learning_rate": 9.959678417085998e-07, + "logits/chosen": 0.03256041929125786, + "logits/rejected": 0.12763187289237976, + "logps/chosen": -2.0395474433898926, + "logps/rejected": -2.3078253269195557, + "loss": 0.6768, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0395474433898926, + "rewards/margins": 0.26827770471572876, + "rewards/rejected": -2.3078253269195557, + "sft_loss": 2.047736883163452, + "step": 765 + }, + { + "epoch": 0.4121090483358421, + "grad_norm": 10.086072034634013, + "learning_rate": 9.957680473564493e-07, + "logits/chosen": 0.1441255360841751, + "logits/rejected": 0.2772209346294403, + "logps/chosen": -2.025604724884033, + "logps/rejected": -2.478121519088745, + "loss": 0.6098, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.025604724884033, + "rewards/margins": 0.45251646637916565, + "rewards/rejected": -2.478121519088745, + "sft_loss": 2.0043413639068604, + "step": 770 + }, + { + "epoch": 0.41478508111724366, + "grad_norm": 6.9563669209630135, + "learning_rate": 9.95563443060529e-07, + "logits/chosen": -0.027880221605300903, + "logits/rejected": 0.15025661885738373, + "logps/chosen": -2.039132833480835, + "logps/rejected": -2.3555057048797607, + "loss": 0.6788, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.039132833480835, + "rewards/margins": 0.3163727819919586, + "rewards/rejected": -2.3555057048797607, + "sft_loss": 1.9834359884262085, + "step": 775 + }, + { + "epoch": 0.41746111389864526, + "grad_norm": 10.1422790063844, + "learning_rate": 9.95354030805911e-07, + "logits/chosen": -0.08938129246234894, + "logits/rejected": 0.06203896924853325, + "logps/chosen": -2.033430576324463, + "logps/rejected": -2.3355135917663574, + "loss": 0.6575, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.033430576324463, + "rewards/margins": 0.302082896232605, + "rewards/rejected": -2.3355135917663574, + "sft_loss": 2.0773210525512695, + "step": 780 + }, + { + "epoch": 0.4201371466800468, + "grad_norm": 9.246731434824245, + "learning_rate": 9.951398126243133e-07, + "logits/chosen": 0.05133030563592911, + "logits/rejected": 0.18070444464683533, + "logps/chosen": -1.9821866750717163, + "logps/rejected": -2.3778305053710938, + "loss": 0.6272, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.9821866750717163, + "rewards/margins": 0.39564332365989685, + "rewards/rejected": -2.3778305053710938, + "sft_loss": 1.9979397058486938, + "step": 785 + }, + { + "epoch": 0.4228131794614484, + "grad_norm": 10.058741885647887, + "learning_rate": 9.94920790594082e-07, + "logits/chosen": -0.013221639208495617, + "logits/rejected": 0.12137987464666367, + "logps/chosen": -1.9625238180160522, + "logps/rejected": -2.3376636505126953, + "loss": 0.6178, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9625238180160522, + "rewards/margins": 0.3751398026943207, + "rewards/rejected": -2.3376636505126953, + "sft_loss": 1.949650526046753, + "step": 790 + }, + { + "epoch": 0.42548921224284997, + "grad_norm": 11.369308771538593, + "learning_rate": 9.946969668401696e-07, + "logits/chosen": -0.017232773825526237, + "logits/rejected": 0.17944982647895813, + "logps/chosen": -2.0125339031219482, + "logps/rejected": -2.472736358642578, + "loss": 0.6233, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.0125339031219482, + "rewards/margins": 0.46020251512527466, + "rewards/rejected": -2.472736358642578, + "sft_loss": 2.0411736965179443, + "step": 795 + }, + { + "epoch": 0.4281652450242516, + "grad_norm": 8.691476566345305, + "learning_rate": 9.944683435341155e-07, + "logits/chosen": 0.0012146488297730684, + "logits/rejected": 0.08047197759151459, + "logps/chosen": -2.0661959648132324, + "logps/rejected": -2.431159496307373, + "loss": 0.6256, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.0661959648132324, + "rewards/margins": 0.3649637699127197, + "rewards/rejected": -2.431159496307373, + "sft_loss": 2.0064785480499268, + "step": 800 + }, + { + "epoch": 0.4281652450242516, + "eval_logits/chosen": 0.3408060073852539, + "eval_logits/rejected": 0.43282878398895264, + "eval_logps/chosen": -2.1063621044158936, + "eval_logps/rejected": -2.51962947845459, + "eval_loss": 0.6210696697235107, + "eval_rewards/accuracies": 0.6654302477836609, + "eval_rewards/chosen": -2.1063621044158936, + "eval_rewards/margins": 0.41326722502708435, + "eval_rewards/rejected": -2.51962947845459, + "eval_runtime": 51.7853, + "eval_samples_per_second": 25.973, + "eval_sft_loss": 2.064507246017456, + "eval_steps_per_second": 6.508, + "step": 800 + }, + { + "epoch": 0.4308412778056531, + "grad_norm": 11.272499184163516, + "learning_rate": 9.942349228940236e-07, + "logits/chosen": -0.028055086731910706, + "logits/rejected": 0.14415912330150604, + "logps/chosen": -2.0782713890075684, + "logps/rejected": -2.608614444732666, + "loss": 0.5881, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.0782713890075684, + "rewards/margins": 0.5303429961204529, + "rewards/rejected": -2.608614444732666, + "sft_loss": 2.0765321254730225, + "step": 805 + }, + { + "epoch": 0.43351731058705467, + "grad_norm": 10.196181196736118, + "learning_rate": 9.939967071845424e-07, + "logits/chosen": 0.07339145243167877, + "logits/rejected": 0.1465933471918106, + "logps/chosen": -2.146256923675537, + "logps/rejected": -2.4748787879943848, + "loss": 0.6468, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.146256923675537, + "rewards/margins": 0.32862168550491333, + "rewards/rejected": -2.4748787879943848, + "sft_loss": 2.156471014022827, + "step": 810 + }, + { + "epoch": 0.4361933433684563, + "grad_norm": 12.834641747300186, + "learning_rate": 9.937536987168413e-07, + "logits/chosen": 0.09053969383239746, + "logits/rejected": 0.22762493789196014, + "logps/chosen": -2.0983657836914062, + "logps/rejected": -2.6281564235687256, + "loss": 0.6087, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.0983657836914062, + "rewards/margins": 0.5297908782958984, + "rewards/rejected": -2.6281564235687256, + "sft_loss": 2.1565418243408203, + "step": 815 + }, + { + "epoch": 0.4388693761498578, + "grad_norm": 10.407568591424436, + "learning_rate": 9.935058998485896e-07, + "logits/chosen": 0.10789525508880615, + "logits/rejected": 0.15350130200386047, + "logps/chosen": -2.1824188232421875, + "logps/rejected": -2.57214617729187, + "loss": 0.6614, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.1824188232421875, + "rewards/margins": 0.3897269070148468, + "rewards/rejected": -2.57214617729187, + "sft_loss": 2.1535208225250244, + "step": 820 + }, + { + "epoch": 0.44154540893125943, + "grad_norm": 16.32583749722319, + "learning_rate": 9.932533129839333e-07, + "logits/chosen": 0.04362216964364052, + "logits/rejected": 0.16212865710258484, + "logps/chosen": -2.0230681896209717, + "logps/rejected": -2.4962611198425293, + "loss": 0.6143, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0230681896209717, + "rewards/margins": 0.47319287061691284, + "rewards/rejected": -2.4962611198425293, + "sft_loss": 2.1321260929107666, + "step": 825 + }, + { + "epoch": 0.444221441712661, + "grad_norm": 9.679449553639905, + "learning_rate": 9.929959405734711e-07, + "logits/chosen": 0.10882987082004547, + "logits/rejected": 0.27502506971359253, + "logps/chosen": -2.0700125694274902, + "logps/rejected": -2.392230987548828, + "loss": 0.653, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.0700125694274902, + "rewards/margins": 0.3222183287143707, + "rewards/rejected": -2.392230987548828, + "sft_loss": 2.0672836303710938, + "step": 830 + }, + { + "epoch": 0.44689747449406253, + "grad_norm": 11.783414410888401, + "learning_rate": 9.927337851142314e-07, + "logits/chosen": 0.06562662124633789, + "logits/rejected": 0.19133488833904266, + "logps/chosen": -2.0099849700927734, + "logps/rejected": -2.356341600418091, + "loss": 0.6516, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0099849700927734, + "rewards/margins": 0.34635668992996216, + "rewards/rejected": -2.356341600418091, + "sft_loss": 2.1112921237945557, + "step": 835 + }, + { + "epoch": 0.44957350727546413, + "grad_norm": 11.977442879693111, + "learning_rate": 9.924668491496474e-07, + "logits/chosen": 0.049711473286151886, + "logits/rejected": 0.21187114715576172, + "logps/chosen": -2.1671743392944336, + "logps/rejected": -2.5274157524108887, + "loss": 0.6755, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.1671743392944336, + "rewards/margins": 0.36024102568626404, + "rewards/rejected": -2.5274157524108887, + "sft_loss": 2.248695135116577, + "step": 840 + }, + { + "epoch": 0.4522495400568657, + "grad_norm": 6.468856534604243, + "learning_rate": 9.92195135269533e-07, + "logits/chosen": 0.1306043565273285, + "logits/rejected": 0.19352035224437714, + "logps/chosen": -2.1376121044158936, + "logps/rejected": -2.382310628890991, + "loss": 0.7009, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.1376121044158936, + "rewards/margins": 0.24469847977161407, + "rewards/rejected": -2.382310628890991, + "sft_loss": 2.2619433403015137, + "step": 845 + }, + { + "epoch": 0.4549255728382673, + "grad_norm": 9.605540692516087, + "learning_rate": 9.919186461100574e-07, + "logits/chosen": 0.0685216411948204, + "logits/rejected": 0.1462893933057785, + "logps/chosen": -2.1614136695861816, + "logps/rejected": -2.4543886184692383, + "loss": 0.6515, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1614136695861816, + "rewards/margins": 0.2929750084877014, + "rewards/rejected": -2.4543886184692383, + "sft_loss": 2.1964187622070312, + "step": 850 + }, + { + "epoch": 0.45760160561966884, + "grad_norm": 13.607347229557735, + "learning_rate": 9.9163738435372e-07, + "logits/chosen": 0.03243374079465866, + "logits/rejected": 0.18373778462409973, + "logps/chosen": -2.1962177753448486, + "logps/rejected": -2.599600315093994, + "loss": 0.6816, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.1962177753448486, + "rewards/margins": 0.40338245034217834, + "rewards/rejected": -2.599600315093994, + "sft_loss": 2.2184200286865234, + "step": 855 + }, + { + "epoch": 0.4602776384010704, + "grad_norm": 7.112280570191756, + "learning_rate": 9.913513527293234e-07, + "logits/chosen": -0.006338512059301138, + "logits/rejected": 0.16377900540828705, + "logps/chosen": -2.3264520168304443, + "logps/rejected": -2.855248212814331, + "loss": 0.5984, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.3264520168304443, + "rewards/margins": 0.5287963151931763, + "rewards/rejected": -2.855248212814331, + "sft_loss": 2.329258918762207, + "step": 860 + }, + { + "epoch": 0.462953671182472, + "grad_norm": 18.411463377434387, + "learning_rate": 9.910605540119474e-07, + "logits/chosen": 0.06724239140748978, + "logits/rejected": 0.16363653540611267, + "logps/chosen": -2.276106357574463, + "logps/rejected": -2.728752374649048, + "loss": 0.6566, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.276106357574463, + "rewards/margins": 0.45264577865600586, + "rewards/rejected": -2.728752374649048, + "sft_loss": 2.194448947906494, + "step": 865 + }, + { + "epoch": 0.46562970396387354, + "grad_norm": 8.602773177915976, + "learning_rate": 9.907649910229227e-07, + "logits/chosen": -0.05895795673131943, + "logits/rejected": 0.20239920914173126, + "logps/chosen": -2.257357120513916, + "logps/rejected": -2.714703321456909, + "loss": 0.6051, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.257357120513916, + "rewards/margins": 0.4573463499546051, + "rewards/rejected": -2.714703321456909, + "sft_loss": 2.27178692817688, + "step": 870 + }, + { + "epoch": 0.46830573674527515, + "grad_norm": 11.413058495257092, + "learning_rate": 9.90464666629803e-07, + "logits/chosen": 0.0611516535282135, + "logits/rejected": 0.14355552196502686, + "logps/chosen": -2.314882278442383, + "logps/rejected": -2.6507880687713623, + "loss": 0.7265, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.314882278442383, + "rewards/margins": 0.3359060287475586, + "rewards/rejected": -2.6507880687713623, + "sft_loss": 2.268735647201538, + "step": 875 + }, + { + "epoch": 0.4709817695266767, + "grad_norm": 8.322484118302752, + "learning_rate": 9.901595837463363e-07, + "logits/chosen": 0.07006438076496124, + "logits/rejected": 0.2505702078342438, + "logps/chosen": -2.313701868057251, + "logps/rejected": -2.7320985794067383, + "loss": 0.6242, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.313701868057251, + "rewards/margins": 0.41839662194252014, + "rewards/rejected": -2.7320985794067383, + "sft_loss": 2.212843179702759, + "step": 880 + }, + { + "epoch": 0.47365780230807825, + "grad_norm": 11.110994213765201, + "learning_rate": 9.898497453324384e-07, + "logits/chosen": -0.003151841461658478, + "logits/rejected": 0.0851263627409935, + "logps/chosen": -2.2502312660217285, + "logps/rejected": -2.7037360668182373, + "loss": 0.5925, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.2502312660217285, + "rewards/margins": 0.45350486040115356, + "rewards/rejected": -2.7037360668182373, + "sft_loss": 2.3255438804626465, + "step": 885 + }, + { + "epoch": 0.47633383508947985, + "grad_norm": 8.042578958055598, + "learning_rate": 9.895351543941628e-07, + "logits/chosen": -0.07694842666387558, + "logits/rejected": 0.054141778498888016, + "logps/chosen": -2.2050156593322754, + "logps/rejected": -2.605039119720459, + "loss": 0.628, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.2050156593322754, + "rewards/margins": 0.40002351999282837, + "rewards/rejected": -2.605039119720459, + "sft_loss": 2.29542875289917, + "step": 890 + }, + { + "epoch": 0.4790098678708814, + "grad_norm": 10.772931639154226, + "learning_rate": 9.892158139836724e-07, + "logits/chosen": 0.12407276779413223, + "logits/rejected": 0.22976458072662354, + "logps/chosen": -2.0931012630462646, + "logps/rejected": -2.411670446395874, + "loss": 0.6527, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0931012630462646, + "rewards/margins": 0.3185690641403198, + "rewards/rejected": -2.411670446395874, + "sft_loss": 2.15762996673584, + "step": 895 + }, + { + "epoch": 0.481685900652283, + "grad_norm": 11.709109730569184, + "learning_rate": 9.88891727199209e-07, + "logits/chosen": -0.031045908108353615, + "logits/rejected": 0.046828627586364746, + "logps/chosen": -2.079315662384033, + "logps/rejected": -2.49480938911438, + "loss": 0.6455, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.079315662384033, + "rewards/margins": 0.41549381613731384, + "rewards/rejected": -2.49480938911438, + "sft_loss": 2.1446011066436768, + "step": 900 + }, + { + "epoch": 0.48436193343368455, + "grad_norm": 11.282429312529446, + "learning_rate": 9.885628971850641e-07, + "logits/chosen": 0.06443838775157928, + "logits/rejected": 0.2664431929588318, + "logps/chosen": -2.190359354019165, + "logps/rejected": -2.6365315914154053, + "loss": 0.6507, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.190359354019165, + "rewards/margins": 0.44617241621017456, + "rewards/rejected": -2.6365315914154053, + "sft_loss": 2.263021230697632, + "step": 905 + }, + { + "epoch": 0.48703796621508616, + "grad_norm": 6.609977962565628, + "learning_rate": 9.882293271315481e-07, + "logits/chosen": 0.037000637501478195, + "logits/rejected": 0.13551054894924164, + "logps/chosen": -2.2155261039733887, + "logps/rejected": -2.5495128631591797, + "loss": 0.6795, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.2155261039733887, + "rewards/margins": 0.3339867889881134, + "rewards/rejected": -2.5495128631591797, + "sft_loss": 2.206447124481201, + "step": 910 + }, + { + "epoch": 0.4897139989964877, + "grad_norm": 8.953583728302595, + "learning_rate": 9.878910202749589e-07, + "logits/chosen": 0.007858863100409508, + "logits/rejected": 0.20538286864757538, + "logps/chosen": -2.1532692909240723, + "logps/rejected": -2.579277515411377, + "loss": 0.617, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1532692909240723, + "rewards/margins": 0.42600807547569275, + "rewards/rejected": -2.579277515411377, + "sft_loss": 2.1827797889709473, + "step": 915 + }, + { + "epoch": 0.49239003177788926, + "grad_norm": 11.136986263164165, + "learning_rate": 9.875479798975512e-07, + "logits/chosen": 0.1128419041633606, + "logits/rejected": 0.2760482430458069, + "logps/chosen": -2.1042284965515137, + "logps/rejected": -2.5994343757629395, + "loss": 0.6181, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1042284965515137, + "rewards/margins": 0.49520596861839294, + "rewards/rejected": -2.5994343757629395, + "sft_loss": 2.201141595840454, + "step": 920 + }, + { + "epoch": 0.49506606455929086, + "grad_norm": 11.552133892516244, + "learning_rate": 9.87200209327504e-07, + "logits/chosen": -0.020469993352890015, + "logits/rejected": 0.1490965336561203, + "logps/chosen": -2.3277976512908936, + "logps/rejected": -2.7004261016845703, + "loss": 0.6247, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.3277976512908936, + "rewards/margins": 0.372628390789032, + "rewards/rejected": -2.7004261016845703, + "sft_loss": 2.2944016456604004, + "step": 925 + }, + { + "epoch": 0.4977420973406924, + "grad_norm": 11.712552792289904, + "learning_rate": 9.868477119388894e-07, + "logits/chosen": -0.053301334381103516, + "logits/rejected": 0.060594916343688965, + "logps/chosen": -2.2250044345855713, + "logps/rejected": -2.7405011653900146, + "loss": 0.6411, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.2250044345855713, + "rewards/margins": 0.5154968500137329, + "rewards/rejected": -2.7405011653900146, + "sft_loss": 2.2513134479522705, + "step": 930 + }, + { + "epoch": 0.500418130122094, + "grad_norm": 9.738315063652102, + "learning_rate": 9.864904911516383e-07, + "logits/chosen": 0.024926379323005676, + "logits/rejected": 0.06959456950426102, + "logps/chosen": -2.3082454204559326, + "logps/rejected": -2.7023775577545166, + "loss": 0.641, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.3082454204559326, + "rewards/margins": 0.39413195848464966, + "rewards/rejected": -2.7023775577545166, + "sft_loss": 2.3833935260772705, + "step": 935 + }, + { + "epoch": 0.5030941629034956, + "grad_norm": 10.449726981814146, + "learning_rate": 9.861285504315084e-07, + "logits/chosen": -0.009898416697978973, + "logits/rejected": 0.10434658825397491, + "logps/chosen": -2.2227025032043457, + "logps/rejected": -2.627720594406128, + "loss": 0.6159, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.2227025032043457, + "rewards/margins": 0.4050180912017822, + "rewards/rejected": -2.627720594406128, + "sft_loss": 2.2642316818237305, + "step": 940 + }, + { + "epoch": 0.5057701956848971, + "grad_norm": 10.750391532240872, + "learning_rate": 9.857618932900502e-07, + "logits/chosen": -0.05544097349047661, + "logits/rejected": 0.08029208332300186, + "logps/chosen": -2.1924877166748047, + "logps/rejected": -2.6801917552948, + "loss": 0.5819, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1924877166748047, + "rewards/margins": 0.487703800201416, + "rewards/rejected": -2.6801917552948, + "sft_loss": 2.233238697052002, + "step": 945 + }, + { + "epoch": 0.5084462284662987, + "grad_norm": 8.575901452586978, + "learning_rate": 9.853905232845727e-07, + "logits/chosen": -0.05543569475412369, + "logits/rejected": 0.11257767677307129, + "logps/chosen": -2.3032419681549072, + "logps/rejected": -2.6663010120391846, + "loss": 0.659, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3032419681549072, + "rewards/margins": 0.3630586564540863, + "rewards/rejected": -2.6663010120391846, + "sft_loss": 2.2930896282196045, + "step": 950 + }, + { + "epoch": 0.5111222612477003, + "grad_norm": 8.858314895755024, + "learning_rate": 9.850144440181095e-07, + "logits/chosen": -0.0004590839089360088, + "logits/rejected": 0.21903236210346222, + "logps/chosen": -2.3906993865966797, + "logps/rejected": -2.7964766025543213, + "loss": 0.6359, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.3906993865966797, + "rewards/margins": 0.4057773947715759, + "rewards/rejected": -2.7964766025543213, + "sft_loss": 2.451749801635742, + "step": 955 + }, + { + "epoch": 0.5137982940291018, + "grad_norm": 8.898233512349927, + "learning_rate": 9.846336591393832e-07, + "logits/chosen": -0.04713328927755356, + "logits/rejected": 0.09548879414796829, + "logps/chosen": -2.3923017978668213, + "logps/rejected": -2.7829506397247314, + "loss": 0.6556, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.3923017978668213, + "rewards/margins": 0.3906486928462982, + "rewards/rejected": -2.7829506397247314, + "sft_loss": 2.457034111022949, + "step": 960 + }, + { + "epoch": 0.5164743268105034, + "grad_norm": 10.236929503532881, + "learning_rate": 9.842481723427704e-07, + "logits/chosen": 0.045480988919734955, + "logits/rejected": 0.05475884675979614, + "logps/chosen": -2.4740447998046875, + "logps/rejected": -2.9208970069885254, + "loss": 0.6572, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.4740447998046875, + "rewards/margins": 0.44685202836990356, + "rewards/rejected": -2.9208970069885254, + "sft_loss": 2.5296459197998047, + "step": 965 + }, + { + "epoch": 0.519150359591905, + "grad_norm": 10.991585515412387, + "learning_rate": 9.838579873682658e-07, + "logits/chosen": 0.05277659744024277, + "logits/rejected": 0.06578487157821655, + "logps/chosen": -2.321049451828003, + "logps/rejected": -2.6783082485198975, + "loss": 0.6606, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.321049451828003, + "rewards/margins": 0.35725873708724976, + "rewards/rejected": -2.6783082485198975, + "sft_loss": 2.3697457313537598, + "step": 970 + }, + { + "epoch": 0.5218263923733065, + "grad_norm": 9.505797728949158, + "learning_rate": 9.834631080014457e-07, + "logits/chosen": -0.07590442150831223, + "logits/rejected": 0.11572281271219254, + "logps/chosen": -2.364682197570801, + "logps/rejected": -2.8034095764160156, + "loss": 0.5979, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.364682197570801, + "rewards/margins": 0.4387272894382477, + "rewards/rejected": -2.8034095764160156, + "sft_loss": 2.4401233196258545, + "step": 975 + }, + { + "epoch": 0.5245024251547081, + "grad_norm": 18.395820893073818, + "learning_rate": 9.830635380734312e-07, + "logits/chosen": -0.08162043243646622, + "logits/rejected": 0.10748519748449326, + "logps/chosen": -2.491405963897705, + "logps/rejected": -2.876349925994873, + "loss": 0.6539, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.491405963897705, + "rewards/margins": 0.3849434554576874, + "rewards/rejected": -2.876349925994873, + "sft_loss": 2.516740560531616, + "step": 980 + }, + { + "epoch": 0.5271784579361097, + "grad_norm": 10.764029069829844, + "learning_rate": 9.826592814608517e-07, + "logits/chosen": 0.005502620246261358, + "logits/rejected": 0.20565679669380188, + "logps/chosen": -2.4020817279815674, + "logps/rejected": -2.7886009216308594, + "loss": 0.6282, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.4020817279815674, + "rewards/margins": 0.38651904463768005, + "rewards/rejected": -2.7886009216308594, + "sft_loss": 2.452831745147705, + "step": 985 + }, + { + "epoch": 0.5298544907175113, + "grad_norm": 9.926095190413394, + "learning_rate": 9.822503420858067e-07, + "logits/chosen": 0.08108798414468765, + "logits/rejected": 0.10665123164653778, + "logps/chosen": -2.2424654960632324, + "logps/rejected": -2.74579119682312, + "loss": 0.6075, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2424654960632324, + "rewards/margins": 0.5033257603645325, + "rewards/rejected": -2.74579119682312, + "sft_loss": 2.4026360511779785, + "step": 990 + }, + { + "epoch": 0.5325305234989128, + "grad_norm": 11.831281235348646, + "learning_rate": 9.818367239158277e-07, + "logits/chosen": 0.10297316312789917, + "logits/rejected": 0.18497176468372345, + "logps/chosen": -2.3841609954833984, + "logps/rejected": -2.7636427879333496, + "loss": 0.6799, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.3841609954833984, + "rewards/margins": 0.37948182225227356, + "rewards/rejected": -2.7636427879333496, + "sft_loss": 2.5231597423553467, + "step": 995 + }, + { + "epoch": 0.5352065562803144, + "grad_norm": 12.148857237997744, + "learning_rate": 9.8141843096384e-07, + "logits/chosen": 0.0838220939040184, + "logits/rejected": 0.2115117609500885, + "logps/chosen": -2.5137131214141846, + "logps/rejected": -3.0223727226257324, + "loss": 0.5939, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.5137131214141846, + "rewards/margins": 0.5086593627929688, + "rewards/rejected": -3.0223727226257324, + "sft_loss": 2.5393052101135254, + "step": 1000 + }, + { + "epoch": 0.537882589061716, + "grad_norm": 14.629770964322283, + "learning_rate": 9.809954672881237e-07, + "logits/chosen": 0.06634237617254257, + "logits/rejected": 0.23861713707447052, + "logps/chosen": -2.6201236248016357, + "logps/rejected": -3.0741326808929443, + "loss": 0.6586, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.6201236248016357, + "rewards/margins": 0.45400896668434143, + "rewards/rejected": -3.0741326808929443, + "sft_loss": 2.6818976402282715, + "step": 1005 + }, + { + "epoch": 0.5405586218431175, + "grad_norm": 10.03296199140755, + "learning_rate": 9.80567836992274e-07, + "logits/chosen": 0.030392784625291824, + "logits/rejected": 0.22696132957935333, + "logps/chosen": -2.3675222396850586, + "logps/rejected": -2.956252336502075, + "loss": 0.5959, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3675222396850586, + "rewards/margins": 0.5887301564216614, + "rewards/rejected": -2.956252336502075, + "sft_loss": 2.461897850036621, + "step": 1010 + }, + { + "epoch": 0.5432346546245191, + "grad_norm": 10.073673041792262, + "learning_rate": 9.801355442251625e-07, + "logits/chosen": 0.027452487498521805, + "logits/rejected": 0.20025476813316345, + "logps/chosen": -2.364055633544922, + "logps/rejected": -2.828199625015259, + "loss": 0.6378, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.364055633544922, + "rewards/margins": 0.464143842458725, + "rewards/rejected": -2.828199625015259, + "sft_loss": 2.470456600189209, + "step": 1015 + }, + { + "epoch": 0.5459106874059207, + "grad_norm": 12.618955481917041, + "learning_rate": 9.796985931808949e-07, + "logits/chosen": 0.029651781544089317, + "logits/rejected": 0.18079546093940735, + "logps/chosen": -2.3804421424865723, + "logps/rejected": -2.89245867729187, + "loss": 0.5858, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.3804421424865723, + "rewards/margins": 0.5120163559913635, + "rewards/rejected": -2.89245867729187, + "sft_loss": 2.506648063659668, + "step": 1020 + }, + { + "epoch": 0.5485867201873222, + "grad_norm": 13.157924185810748, + "learning_rate": 9.792569880987724e-07, + "logits/chosen": -0.012971627525985241, + "logits/rejected": 0.11660315841436386, + "logps/chosen": -2.403261184692383, + "logps/rejected": -2.9521636962890625, + "loss": 0.6047, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.403261184692383, + "rewards/margins": 0.5489023923873901, + "rewards/rejected": -2.9521636962890625, + "sft_loss": 2.4948441982269287, + "step": 1025 + }, + { + "epoch": 0.5512627529687238, + "grad_norm": 19.127254297505065, + "learning_rate": 9.788107332632493e-07, + "logits/chosen": 0.0366203635931015, + "logits/rejected": 0.12366179376840591, + "logps/chosen": -2.4076895713806152, + "logps/rejected": -2.8217995166778564, + "loss": 0.6607, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.4076895713806152, + "rewards/margins": 0.41411009430885315, + "rewards/rejected": -2.8217995166778564, + "sft_loss": 2.4844412803649902, + "step": 1030 + }, + { + "epoch": 0.5539387857501255, + "grad_norm": 9.835627653069569, + "learning_rate": 9.783598330038924e-07, + "logits/chosen": -0.008364463225007057, + "logits/rejected": 0.10912313312292099, + "logps/chosen": -2.4262290000915527, + "logps/rejected": -2.7828643321990967, + "loss": 0.6512, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.4262290000915527, + "rewards/margins": 0.35663533210754395, + "rewards/rejected": -2.7828643321990967, + "sft_loss": 2.4651858806610107, + "step": 1035 + }, + { + "epoch": 0.5566148185315271, + "grad_norm": 14.97168100471102, + "learning_rate": 9.779042916953376e-07, + "logits/chosen": 0.019193686544895172, + "logits/rejected": 0.18119914829730988, + "logps/chosen": -2.1704416275024414, + "logps/rejected": -2.723491668701172, + "loss": 0.5976, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1704416275024414, + "rewards/margins": 0.55305016040802, + "rewards/rejected": -2.723491668701172, + "sft_loss": 2.256765604019165, + "step": 1040 + }, + { + "epoch": 0.5592908513129285, + "grad_norm": 8.608115838074811, + "learning_rate": 9.774441137572487e-07, + "logits/chosen": -0.05023932456970215, + "logits/rejected": 0.09959036856889725, + "logps/chosen": -2.277273416519165, + "logps/rejected": -2.8040313720703125, + "loss": 0.5946, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.277273416519165, + "rewards/margins": 0.526758074760437, + "rewards/rejected": -2.8040313720703125, + "sft_loss": 2.397162914276123, + "step": 1045 + }, + { + "epoch": 0.5619668840943302, + "grad_norm": 18.060015330254146, + "learning_rate": 9.76979303654274e-07, + "logits/chosen": -0.07415231317281723, + "logits/rejected": 0.0306253582239151, + "logps/chosen": -2.4391984939575195, + "logps/rejected": -2.969973087310791, + "loss": 0.5954, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.4391984939575195, + "rewards/margins": 0.5307744145393372, + "rewards/rejected": -2.969973087310791, + "sft_loss": 2.513673782348633, + "step": 1050 + }, + { + "epoch": 0.5646429168757318, + "grad_norm": 14.88256555882873, + "learning_rate": 9.765098658960035e-07, + "logits/chosen": -0.0031950809061527252, + "logits/rejected": 0.07587514817714691, + "logps/chosen": -2.466426372528076, + "logps/rejected": -2.9754433631896973, + "loss": 0.5991, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.466426372528076, + "rewards/margins": 0.5090171694755554, + "rewards/rejected": -2.9754433631896973, + "sft_loss": 2.4860310554504395, + "step": 1055 + }, + { + "epoch": 0.5673189496571333, + "grad_norm": 14.577243346146625, + "learning_rate": 9.76035805036924e-07, + "logits/chosen": 0.044427402317523956, + "logits/rejected": 0.2288268506526947, + "logps/chosen": -2.6163885593414307, + "logps/rejected": -3.047532320022583, + "loss": 0.6489, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.6163885593414307, + "rewards/margins": 0.4311438202857971, + "rewards/rejected": -3.047532320022583, + "sft_loss": 2.5802102088928223, + "step": 1060 + }, + { + "epoch": 0.5699949824385349, + "grad_norm": 13.460320440360055, + "learning_rate": 9.755571256763764e-07, + "logits/chosen": 0.037521276623010635, + "logits/rejected": 0.17139963805675507, + "logps/chosen": -2.5272560119628906, + "logps/rejected": -3.0960605144500732, + "loss": 0.587, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5272560119628906, + "rewards/margins": 0.5688048005104065, + "rewards/rejected": -3.0960605144500732, + "sft_loss": 2.6325743198394775, + "step": 1065 + }, + { + "epoch": 0.5726710152199365, + "grad_norm": 8.556134708359306, + "learning_rate": 9.750738324585097e-07, + "logits/chosen": -0.06615705788135529, + "logits/rejected": 0.18242689967155457, + "logps/chosen": -2.5637755393981934, + "logps/rejected": -3.100074052810669, + "loss": 0.5931, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.5637755393981934, + "rewards/margins": 0.5362985134124756, + "rewards/rejected": -3.100074052810669, + "sft_loss": 2.6259491443634033, + "step": 1070 + }, + { + "epoch": 0.5753470480013381, + "grad_norm": 7.643138342066282, + "learning_rate": 9.74585930072237e-07, + "logits/chosen": 0.010040968656539917, + "logits/rejected": 0.141854926943779, + "logps/chosen": -2.3528027534484863, + "logps/rejected": -2.9125170707702637, + "loss": 0.6114, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.3528027534484863, + "rewards/margins": 0.5597147345542908, + "rewards/rejected": -2.9125170707702637, + "sft_loss": 2.4413468837738037, + "step": 1075 + }, + { + "epoch": 0.5780230807827396, + "grad_norm": 9.667325714568168, + "learning_rate": 9.740934232511892e-07, + "logits/chosen": -0.0742809846997261, + "logits/rejected": 0.04069014638662338, + "logps/chosen": -2.461108684539795, + "logps/rejected": -2.9574544429779053, + "loss": 0.6113, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.461108684539795, + "rewards/margins": 0.4963456690311432, + "rewards/rejected": -2.9574544429779053, + "sft_loss": 2.6110215187072754, + "step": 1080 + }, + { + "epoch": 0.5806991135641412, + "grad_norm": 12.00641733717943, + "learning_rate": 9.735963167736698e-07, + "logits/chosen": -0.002614037599414587, + "logits/rejected": 0.16859155893325806, + "logps/chosen": -2.38588285446167, + "logps/rejected": -2.7110936641693115, + "loss": 0.6804, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.38588285446167, + "rewards/margins": 0.3252108693122864, + "rewards/rejected": -2.7110936641693115, + "sft_loss": 2.4398577213287354, + "step": 1085 + }, + { + "epoch": 0.5833751463455428, + "grad_norm": 8.887866413388897, + "learning_rate": 9.730946154626078e-07, + "logits/chosen": 0.017975768074393272, + "logits/rejected": 0.126893550157547, + "logps/chosen": -2.442229986190796, + "logps/rejected": -2.8189597129821777, + "loss": 0.6606, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.442229986190796, + "rewards/margins": 0.3767297863960266, + "rewards/rejected": -2.8189597129821777, + "sft_loss": 2.5147817134857178, + "step": 1090 + }, + { + "epoch": 0.5860511791269443, + "grad_norm": 13.634216782277669, + "learning_rate": 9.725883241855117e-07, + "logits/chosen": -0.11448929458856583, + "logits/rejected": 0.04175081476569176, + "logps/chosen": -2.4436726570129395, + "logps/rejected": -2.931497097015381, + "loss": 0.6069, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4436726570129395, + "rewards/margins": 0.48782461881637573, + "rewards/rejected": -2.931497097015381, + "sft_loss": 2.564622402191162, + "step": 1095 + }, + { + "epoch": 0.5887272119083459, + "grad_norm": 14.502468297807804, + "learning_rate": 9.720774478544218e-07, + "logits/chosen": 0.013986131176352501, + "logits/rejected": 0.1314467340707779, + "logps/chosen": -2.3802475929260254, + "logps/rejected": -2.9986507892608643, + "loss": 0.5917, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.3802475929260254, + "rewards/margins": 0.6184031963348389, + "rewards/rejected": -2.9986507892608643, + "sft_loss": 2.479921817779541, + "step": 1100 + }, + { + "epoch": 0.5914032446897475, + "grad_norm": 10.560583498259847, + "learning_rate": 9.715619914258624e-07, + "logits/chosen": -0.07381193339824677, + "logits/rejected": 0.025053244084119797, + "logps/chosen": -2.515423536300659, + "logps/rejected": -2.880089521408081, + "loss": 0.6627, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.515423536300659, + "rewards/margins": 0.36466631293296814, + "rewards/rejected": -2.880089521408081, + "sft_loss": 2.5252633094787598, + "step": 1105 + }, + { + "epoch": 0.594079277471149, + "grad_norm": 16.477565978507176, + "learning_rate": 9.710419599007937e-07, + "logits/chosen": -0.04238230735063553, + "logits/rejected": 0.09166856110095978, + "logps/chosen": -2.4855003356933594, + "logps/rejected": -2.8636887073516846, + "loss": 0.639, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.4855003356933594, + "rewards/margins": 0.37818849086761475, + "rewards/rejected": -2.8636887073516846, + "sft_loss": 2.6017448902130127, + "step": 1110 + }, + { + "epoch": 0.5967553102525506, + "grad_norm": 22.795719086000116, + "learning_rate": 9.705173583245643e-07, + "logits/chosen": 0.009466012939810753, + "logits/rejected": 0.1582622528076172, + "logps/chosen": -2.2711384296417236, + "logps/rejected": -2.729496479034424, + "loss": 0.6323, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.2711384296417236, + "rewards/margins": 0.45835790038108826, + "rewards/rejected": -2.729496479034424, + "sft_loss": 2.3448898792266846, + "step": 1115 + }, + { + "epoch": 0.5994313430339522, + "grad_norm": 8.150525288694316, + "learning_rate": 9.699881917868609e-07, + "logits/chosen": -0.14355149865150452, + "logits/rejected": -0.022237544879317284, + "logps/chosen": -2.287654399871826, + "logps/rejected": -2.7592105865478516, + "loss": 0.608, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.287654399871826, + "rewards/margins": 0.47155603766441345, + "rewards/rejected": -2.7592105865478516, + "sft_loss": 2.4397618770599365, + "step": 1120 + }, + { + "epoch": 0.6021073758153538, + "grad_norm": 10.603609318641688, + "learning_rate": 9.694544654216594e-07, + "logits/chosen": -0.13433615863323212, + "logits/rejected": 0.06298176944255829, + "logps/chosen": -2.3329834938049316, + "logps/rejected": -2.8423380851745605, + "loss": 0.5815, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3329834938049316, + "rewards/margins": 0.5093545317649841, + "rewards/rejected": -2.8423380851745605, + "sft_loss": 2.368196487426758, + "step": 1125 + }, + { + "epoch": 0.6047834085967553, + "grad_norm": 14.037296058485492, + "learning_rate": 9.689161844071755e-07, + "logits/chosen": 0.04950593039393425, + "logits/rejected": 0.12211354821920395, + "logps/chosen": -2.235487461090088, + "logps/rejected": -2.6551833152770996, + "loss": 0.6361, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.235487461090088, + "rewards/margins": 0.4196963906288147, + "rewards/rejected": -2.6551833152770996, + "sft_loss": 2.243699789047241, + "step": 1130 + }, + { + "epoch": 0.6074594413781569, + "grad_norm": 10.96100978751047, + "learning_rate": 9.683733539658138e-07, + "logits/chosen": -0.04126668721437454, + "logits/rejected": 0.13179424405097961, + "logps/chosen": -2.3691515922546387, + "logps/rejected": -2.8386237621307373, + "loss": 0.6283, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.3691515922546387, + "rewards/margins": 0.46947216987609863, + "rewards/rejected": -2.8386237621307373, + "sft_loss": 2.3317360877990723, + "step": 1135 + }, + { + "epoch": 0.6101354741595585, + "grad_norm": 10.295095221983487, + "learning_rate": 9.678259793641178e-07, + "logits/chosen": -0.03282228857278824, + "logits/rejected": 0.013936568982899189, + "logps/chosen": -2.3662123680114746, + "logps/rejected": -2.6578352451324463, + "loss": 0.6672, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3662123680114746, + "rewards/margins": 0.2916230261325836, + "rewards/rejected": -2.6578352451324463, + "sft_loss": 2.444215774536133, + "step": 1140 + }, + { + "epoch": 0.61281150694096, + "grad_norm": 9.482162758527256, + "learning_rate": 9.672740659127183e-07, + "logits/chosen": -0.16914108395576477, + "logits/rejected": -0.031090175732970238, + "logps/chosen": -2.3121986389160156, + "logps/rejected": -2.846667528152466, + "loss": 0.6255, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.3121986389160156, + "rewards/margins": 0.534468948841095, + "rewards/rejected": -2.846667528152466, + "sft_loss": 2.4279391765594482, + "step": 1145 + }, + { + "epoch": 0.6154875397223616, + "grad_norm": 10.838291866151629, + "learning_rate": 9.667176189662818e-07, + "logits/chosen": -0.13112396001815796, + "logits/rejected": 0.006520587019622326, + "logps/chosen": -2.263624429702759, + "logps/rejected": -2.783017635345459, + "loss": 0.5878, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.263624429702759, + "rewards/margins": 0.519393265247345, + "rewards/rejected": -2.783017635345459, + "sft_loss": 2.3268685340881348, + "step": 1150 + }, + { + "epoch": 0.6181635725037632, + "grad_norm": 8.980724497171297, + "learning_rate": 9.661566439234592e-07, + "logits/chosen": -0.043813712894916534, + "logits/rejected": 0.06382577121257782, + "logps/chosen": -2.2976884841918945, + "logps/rejected": -2.7014777660369873, + "loss": 0.6231, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.2976884841918945, + "rewards/margins": 0.4037889838218689, + "rewards/rejected": -2.7014777660369873, + "sft_loss": 2.4225406646728516, + "step": 1155 + }, + { + "epoch": 0.6208396052851648, + "grad_norm": 11.245760940970241, + "learning_rate": 9.655911462268327e-07, + "logits/chosen": 0.007918231189250946, + "logits/rejected": 0.1136241927742958, + "logps/chosen": -2.3954429626464844, + "logps/rejected": -2.982637882232666, + "loss": 0.5482, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.3954429626464844, + "rewards/margins": 0.5871953368186951, + "rewards/rejected": -2.982637882232666, + "sft_loss": 2.5971601009368896, + "step": 1160 + }, + { + "epoch": 0.6235156380665663, + "grad_norm": 9.536133362700639, + "learning_rate": 9.650211313628636e-07, + "logits/chosen": -0.06791021674871445, + "logits/rejected": 0.029560793191194534, + "logps/chosen": -2.4827258586883545, + "logps/rejected": -2.875556230545044, + "loss": 0.6381, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4827258586883545, + "rewards/margins": 0.39283058047294617, + "rewards/rejected": -2.875556230545044, + "sft_loss": 2.6223301887512207, + "step": 1165 + }, + { + "epoch": 0.6261916708479679, + "grad_norm": 11.297407838882163, + "learning_rate": 9.644466048618386e-07, + "logits/chosen": -0.04049893841147423, + "logits/rejected": 0.11827345192432404, + "logps/chosen": -2.772382974624634, + "logps/rejected": -3.2351608276367188, + "loss": 0.6649, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.772382974624634, + "rewards/margins": 0.4627775251865387, + "rewards/rejected": -3.2351608276367188, + "sft_loss": 2.763463258743286, + "step": 1170 + }, + { + "epoch": 0.6288677036293695, + "grad_norm": 7.303873199808049, + "learning_rate": 9.63867572297816e-07, + "logits/chosen": -0.042949218302965164, + "logits/rejected": 0.16490450501441956, + "logps/chosen": -2.4589686393737793, + "logps/rejected": -2.966370105743408, + "loss": 0.6061, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.4589686393737793, + "rewards/margins": 0.5074009895324707, + "rewards/rejected": -2.966370105743408, + "sft_loss": 2.608790159225464, + "step": 1175 + }, + { + "epoch": 0.631543736410771, + "grad_norm": 10.367996592364596, + "learning_rate": 9.632840392885727e-07, + "logits/chosen": -0.06765934824943542, + "logits/rejected": 0.09714097529649734, + "logps/chosen": -2.715125560760498, + "logps/rejected": -3.250112533569336, + "loss": 0.6225, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.715125560760498, + "rewards/margins": 0.5349869728088379, + "rewards/rejected": -3.250112533569336, + "sft_loss": 2.7810139656066895, + "step": 1180 + }, + { + "epoch": 0.6342197691921726, + "grad_norm": 9.58430608083217, + "learning_rate": 9.626960114955483e-07, + "logits/chosen": 0.001974116312339902, + "logits/rejected": 0.15458719432353973, + "logps/chosen": -2.6295740604400635, + "logps/rejected": -3.2241318225860596, + "loss": 0.5955, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.6295740604400635, + "rewards/margins": 0.5945574045181274, + "rewards/rejected": -3.2241318225860596, + "sft_loss": 2.6677796840667725, + "step": 1185 + }, + { + "epoch": 0.6368958019735742, + "grad_norm": 11.885649791284852, + "learning_rate": 9.621034946237909e-07, + "logits/chosen": -0.07231882214546204, + "logits/rejected": 0.08323998749256134, + "logps/chosen": -2.6960177421569824, + "logps/rejected": -3.336521625518799, + "loss": 0.5726, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.6960177421569824, + "rewards/margins": 0.6405037641525269, + "rewards/rejected": -3.336521625518799, + "sft_loss": 2.8467204570770264, + "step": 1190 + }, + { + "epoch": 0.6395718347549757, + "grad_norm": 9.78909696850363, + "learning_rate": 9.615064944219021e-07, + "logits/chosen": 0.022339412942528725, + "logits/rejected": 0.15590617060661316, + "logps/chosen": -2.4645371437072754, + "logps/rejected": -3.0680432319641113, + "loss": 0.5755, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.4645371437072754, + "rewards/margins": 0.6035064458847046, + "rewards/rejected": -3.0680432319641113, + "sft_loss": 2.643650531768799, + "step": 1195 + }, + { + "epoch": 0.6422478675363773, + "grad_norm": 13.395794212531905, + "learning_rate": 9.609050166819803e-07, + "logits/chosen": -0.046702008694410324, + "logits/rejected": 0.029883652925491333, + "logps/chosen": -2.6166794300079346, + "logps/rejected": -3.108447790145874, + "loss": 0.6247, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6166794300079346, + "rewards/margins": 0.4917687475681305, + "rewards/rejected": -3.108447790145874, + "sft_loss": 2.6548516750335693, + "step": 1200 + }, + { + "epoch": 0.6422478675363773, + "eval_logits/chosen": 0.30381494760513306, + "eval_logits/rejected": 0.40509033203125, + "eval_logps/chosen": -2.514362335205078, + "eval_logps/rejected": -3.0950636863708496, + "eval_loss": 0.5880183577537537, + "eval_rewards/accuracies": 0.6965875625610352, + "eval_rewards/chosen": -2.514362335205078, + "eval_rewards/margins": 0.5807018876075745, + "eval_rewards/rejected": -3.0950636863708496, + "eval_runtime": 52.2125, + "eval_samples_per_second": 25.76, + "eval_sft_loss": 2.6367032527923584, + "eval_steps_per_second": 6.454, + "step": 1200 + }, + { + "epoch": 0.6449239003177789, + "grad_norm": 14.596912231575132, + "learning_rate": 9.602990672395653e-07, + "logits/chosen": -0.14092347025871277, + "logits/rejected": 0.05596120283007622, + "logps/chosen": -2.4769959449768066, + "logps/rejected": -3.0365443229675293, + "loss": 0.5774, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4769959449768066, + "rewards/margins": 0.5595483183860779, + "rewards/rejected": -3.0365443229675293, + "sft_loss": 2.5679807662963867, + "step": 1205 + }, + { + "epoch": 0.6475999330991805, + "grad_norm": 10.905534215006766, + "learning_rate": 9.59688651973581e-07, + "logits/chosen": -0.05486919730901718, + "logits/rejected": 0.15231744945049286, + "logps/chosen": -2.4324145317077637, + "logps/rejected": -2.9520673751831055, + "loss": 0.5815, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.4324145317077637, + "rewards/margins": 0.5196529626846313, + "rewards/rejected": -2.9520673751831055, + "sft_loss": 2.52256441116333, + "step": 1210 + }, + { + "epoch": 0.650275965880582, + "grad_norm": 9.633793347973821, + "learning_rate": 9.590737768062792e-07, + "logits/chosen": -0.11220196634531021, + "logits/rejected": 0.012372380122542381, + "logps/chosen": -2.554220676422119, + "logps/rejected": -2.970442295074463, + "loss": 0.6393, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.554220676422119, + "rewards/margins": 0.4162214696407318, + "rewards/rejected": -2.970442295074463, + "sft_loss": 2.600206136703491, + "step": 1215 + }, + { + "epoch": 0.6529519986619836, + "grad_norm": 10.474965982793881, + "learning_rate": 9.584544477031816e-07, + "logits/chosen": 0.08466102927923203, + "logits/rejected": 0.2119368016719818, + "logps/chosen": -2.3087267875671387, + "logps/rejected": -2.7697083950042725, + "loss": 0.6234, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.3087267875671387, + "rewards/margins": 0.46098190546035767, + "rewards/rejected": -2.7697083950042725, + "sft_loss": 2.337554931640625, + "step": 1220 + }, + { + "epoch": 0.6556280314433852, + "grad_norm": 10.567533632382021, + "learning_rate": 9.578306706730215e-07, + "logits/chosen": -0.14341183006763458, + "logits/rejected": 0.08627209067344666, + "logps/chosen": -2.5253093242645264, + "logps/rejected": -2.9244608879089355, + "loss": 0.6651, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.5253093242645264, + "rewards/margins": 0.3991513252258301, + "rewards/rejected": -2.9244608879089355, + "sft_loss": 2.554520845413208, + "step": 1225 + }, + { + "epoch": 0.6583040642247867, + "grad_norm": 12.36533370644545, + "learning_rate": 9.572024517676865e-07, + "logits/chosen": -0.026950160041451454, + "logits/rejected": 0.0841381847858429, + "logps/chosen": -2.5097880363464355, + "logps/rejected": -2.9253194332122803, + "loss": 0.645, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.5097880363464355, + "rewards/margins": 0.4155314564704895, + "rewards/rejected": -2.9253194332122803, + "sft_loss": 2.5107533931732178, + "step": 1230 + }, + { + "epoch": 0.6609800970061883, + "grad_norm": 11.395902605646452, + "learning_rate": 9.565697970821593e-07, + "logits/chosen": -0.04188716784119606, + "logits/rejected": 0.09305311739444733, + "logps/chosen": -2.451453685760498, + "logps/rejected": -2.845390796661377, + "loss": 0.6245, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.451453685760498, + "rewards/margins": 0.393937349319458, + "rewards/rejected": -2.845390796661377, + "sft_loss": 2.5635507106781006, + "step": 1235 + }, + { + "epoch": 0.6636561297875899, + "grad_norm": 10.423991842768576, + "learning_rate": 9.559327127544585e-07, + "logits/chosen": -0.1326659470796585, + "logits/rejected": 0.010229384526610374, + "logps/chosen": -2.410856008529663, + "logps/rejected": -2.8557980060577393, + "loss": 0.6034, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.410856008529663, + "rewards/margins": 0.4449416995048523, + "rewards/rejected": -2.8557980060577393, + "sft_loss": 2.5360636711120605, + "step": 1240 + }, + { + "epoch": 0.6663321625689914, + "grad_norm": 12.024052772828735, + "learning_rate": 9.552912049655789e-07, + "logits/chosen": -0.04977841302752495, + "logits/rejected": 0.14385564625263214, + "logps/chosen": -2.3730082511901855, + "logps/rejected": -2.846153974533081, + "loss": 0.6114, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.3730082511901855, + "rewards/margins": 0.4731457829475403, + "rewards/rejected": -2.846153974533081, + "sft_loss": 2.4019570350646973, + "step": 1245 + }, + { + "epoch": 0.669008195350393, + "grad_norm": 15.743615619080508, + "learning_rate": 9.546452799394315e-07, + "logits/chosen": -0.00495730247348547, + "logits/rejected": 0.2099606990814209, + "logps/chosen": -2.516322612762451, + "logps/rejected": -2.905848979949951, + "loss": 0.6711, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.516322612762451, + "rewards/margins": 0.38952645659446716, + "rewards/rejected": -2.905848979949951, + "sft_loss": 2.5147805213928223, + "step": 1250 + }, + { + "epoch": 0.6716842281317946, + "grad_norm": 10.553637253359975, + "learning_rate": 9.539949439427846e-07, + "logits/chosen": -0.07317081093788147, + "logits/rejected": 0.051881879568099976, + "logps/chosen": -2.4117274284362793, + "logps/rejected": -2.902822494506836, + "loss": 0.6066, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.4117274284362793, + "rewards/margins": 0.4910949766635895, + "rewards/rejected": -2.902822494506836, + "sft_loss": 2.575561046600342, + "step": 1255 + }, + { + "epoch": 0.6743602609131962, + "grad_norm": 9.959680874105779, + "learning_rate": 9.533402032852002e-07, + "logits/chosen": -0.061844177544116974, + "logits/rejected": 0.0837896317243576, + "logps/chosen": -2.44221568107605, + "logps/rejected": -3.0768089294433594, + "loss": 0.5839, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.44221568107605, + "rewards/margins": 0.63459312915802, + "rewards/rejected": -3.0768089294433594, + "sft_loss": 2.600975751876831, + "step": 1260 + }, + { + "epoch": 0.6770362936945977, + "grad_norm": 13.05513480259246, + "learning_rate": 9.526810643189754e-07, + "logits/chosen": -0.03136907145380974, + "logits/rejected": 0.12455201148986816, + "logps/chosen": -2.47062349319458, + "logps/rejected": -3.0895168781280518, + "loss": 0.5719, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.47062349319458, + "rewards/margins": 0.6188936829566956, + "rewards/rejected": -3.0895168781280518, + "sft_loss": 2.5600526332855225, + "step": 1265 + }, + { + "epoch": 0.6797123264759993, + "grad_norm": 11.902034042794147, + "learning_rate": 9.52017533439079e-07, + "logits/chosen": -0.10893243551254272, + "logits/rejected": -0.019856825470924377, + "logps/chosen": -2.5645415782928467, + "logps/rejected": -3.071223735809326, + "loss": 0.6129, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.5645415782928467, + "rewards/margins": 0.5066820979118347, + "rewards/rejected": -3.071223735809326, + "sft_loss": 2.6555449962615967, + "step": 1270 + }, + { + "epoch": 0.6823883592574009, + "grad_norm": 9.803927227171691, + "learning_rate": 9.513496170830909e-07, + "logits/chosen": -0.07021275162696838, + "logits/rejected": 0.04914160445332527, + "logps/chosen": -2.699671745300293, + "logps/rejected": -3.1752398014068604, + "loss": 0.6551, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.699671745300293, + "rewards/margins": 0.4755678176879883, + "rewards/rejected": -3.1752398014068604, + "sft_loss": 2.689988136291504, + "step": 1275 + }, + { + "epoch": 0.6850643920388024, + "grad_norm": 13.390660658299637, + "learning_rate": 9.506773217311382e-07, + "logits/chosen": -0.047956183552742004, + "logits/rejected": 0.12472472339868546, + "logps/chosen": -2.5990281105041504, + "logps/rejected": -3.0923752784729004, + "loss": 0.6194, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5990281105041504, + "rewards/margins": 0.49334701895713806, + "rewards/rejected": -3.0923752784729004, + "sft_loss": 2.686919689178467, + "step": 1280 + }, + { + "epoch": 0.687740424820204, + "grad_norm": 12.692293278068258, + "learning_rate": 9.500006539058334e-07, + "logits/chosen": -0.026659566909074783, + "logits/rejected": 0.12806643545627594, + "logps/chosen": -2.487506866455078, + "logps/rejected": -2.900245428085327, + "loss": 0.6237, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.487506866455078, + "rewards/margins": 0.4127384126186371, + "rewards/rejected": -2.900245428085327, + "sft_loss": 2.5144777297973633, + "step": 1285 + }, + { + "epoch": 0.6904164576016056, + "grad_norm": 12.475581061613873, + "learning_rate": 9.493196201722109e-07, + "logits/chosen": -0.15530245006084442, + "logits/rejected": 0.007811339106410742, + "logps/chosen": -2.631837844848633, + "logps/rejected": -2.9907565116882324, + "loss": 0.671, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.631837844848633, + "rewards/margins": 0.3589186668395996, + "rewards/rejected": -2.9907565116882324, + "sft_loss": 2.694824457168579, + "step": 1290 + }, + { + "epoch": 0.6930924903830072, + "grad_norm": 9.920761501515237, + "learning_rate": 9.486342271376628e-07, + "logits/chosen": -0.03656279668211937, + "logits/rejected": -0.03115103766322136, + "logps/chosen": -2.5637030601501465, + "logps/rejected": -3.136502742767334, + "loss": 0.5941, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.5637030601501465, + "rewards/margins": 0.5728000402450562, + "rewards/rejected": -3.136502742767334, + "sft_loss": 2.6105880737304688, + "step": 1295 + }, + { + "epoch": 0.6957685231644087, + "grad_norm": 13.549074636681373, + "learning_rate": 9.479444814518755e-07, + "logits/chosen": -0.07891203463077545, + "logits/rejected": 0.19809429347515106, + "logps/chosen": -2.4845290184020996, + "logps/rejected": -3.1138668060302734, + "loss": 0.5857, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.4845290184020996, + "rewards/margins": 0.6293376684188843, + "rewards/rejected": -3.1138668060302734, + "sft_loss": 2.5858771800994873, + "step": 1300 + }, + { + "epoch": 0.6984445559458103, + "grad_norm": 8.42170392456347, + "learning_rate": 9.472503898067645e-07, + "logits/chosen": 0.04925096780061722, + "logits/rejected": 0.09650204330682755, + "logps/chosen": -2.5102059841156006, + "logps/rejected": -2.9356038570404053, + "loss": 0.652, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.5102059841156006, + "rewards/margins": 0.42539793252944946, + "rewards/rejected": -2.9356038570404053, + "sft_loss": 2.518486499786377, + "step": 1305 + }, + { + "epoch": 0.701120588727212, + "grad_norm": 10.651506064370555, + "learning_rate": 9.465519589364099e-07, + "logits/chosen": 0.046065930277109146, + "logits/rejected": 0.1338791847229004, + "logps/chosen": -2.371086597442627, + "logps/rejected": -2.912052631378174, + "loss": 0.6024, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.371086597442627, + "rewards/margins": 0.5409659147262573, + "rewards/rejected": -2.912052631378174, + "sft_loss": 2.4932312965393066, + "step": 1310 + }, + { + "epoch": 0.7037966215086134, + "grad_norm": 11.904745337777952, + "learning_rate": 9.458491956169914e-07, + "logits/chosen": -0.046427834779024124, + "logits/rejected": 0.1345222443342209, + "logps/chosen": -2.43359375, + "logps/rejected": -2.9828577041625977, + "loss": 0.5972, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.43359375, + "rewards/margins": 0.5492635369300842, + "rewards/rejected": -2.9828577041625977, + "sft_loss": 2.488131284713745, + "step": 1315 + }, + { + "epoch": 0.706472654290015, + "grad_norm": 9.711103384168968, + "learning_rate": 9.451421066667215e-07, + "logits/chosen": -0.14108577370643616, + "logits/rejected": 0.06483618170022964, + "logps/chosen": -2.425170660018921, + "logps/rejected": -2.924701690673828, + "loss": 0.5948, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.425170660018921, + "rewards/margins": 0.49953120946884155, + "rewards/rejected": -2.924701690673828, + "sft_loss": 2.5391621589660645, + "step": 1320 + }, + { + "epoch": 0.7091486870714167, + "grad_norm": 14.583305860389924, + "learning_rate": 9.444306989457805e-07, + "logits/chosen": -0.006739693693816662, + "logits/rejected": 0.11771132797002792, + "logps/chosen": -2.378840923309326, + "logps/rejected": -2.857478618621826, + "loss": 0.6699, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.378840923309326, + "rewards/margins": 0.4786376357078552, + "rewards/rejected": -2.857478618621826, + "sft_loss": 2.3840348720550537, + "step": 1325 + }, + { + "epoch": 0.7118247198528181, + "grad_norm": 12.03553674525137, + "learning_rate": 9.437149793562489e-07, + "logits/chosen": -0.028889214619994164, + "logits/rejected": 0.0867237076163292, + "logps/chosen": -2.4030401706695557, + "logps/rejected": -2.889843463897705, + "loss": 0.6259, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.4030401706695557, + "rewards/margins": 0.4868030548095703, + "rewards/rejected": -2.889843463897705, + "sft_loss": 2.4857749938964844, + "step": 1330 + }, + { + "epoch": 0.7145007526342197, + "grad_norm": 12.184739129523157, + "learning_rate": 9.429949548420417e-07, + "logits/chosen": -0.04891614243388176, + "logits/rejected": 0.042119212448596954, + "logps/chosen": -2.378790855407715, + "logps/rejected": -2.828828811645508, + "loss": 0.6266, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.378790855407715, + "rewards/margins": 0.45003795623779297, + "rewards/rejected": -2.828828811645508, + "sft_loss": 2.418074131011963, + "step": 1335 + }, + { + "epoch": 0.7171767854156214, + "grad_norm": 16.356956717006213, + "learning_rate": 9.422706323888396e-07, + "logits/chosen": -0.05136380344629288, + "logits/rejected": -0.009587121196091175, + "logps/chosen": -2.2361412048339844, + "logps/rejected": -2.6039490699768066, + "loss": 0.6459, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2361412048339844, + "rewards/margins": 0.3678080141544342, + "rewards/rejected": -2.6039490699768066, + "sft_loss": 2.2793796062469482, + "step": 1340 + }, + { + "epoch": 0.719852818197023, + "grad_norm": 9.192707974512697, + "learning_rate": 9.415420190240225e-07, + "logits/chosen": 0.0014787286054342985, + "logits/rejected": 0.20284879207611084, + "logps/chosen": -2.2834632396698, + "logps/rejected": -2.858455181121826, + "loss": 0.5436, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2834632396698, + "rewards/margins": 0.5749918222427368, + "rewards/rejected": -2.858455181121826, + "sft_loss": 2.40155029296875, + "step": 1345 + }, + { + "epoch": 0.7225288509784245, + "grad_norm": 13.138651117833307, + "learning_rate": 9.408091218166002e-07, + "logits/chosen": 0.007256925106048584, + "logits/rejected": 0.08275376260280609, + "logps/chosen": -2.3986923694610596, + "logps/rejected": -2.7163703441619873, + "loss": 0.6698, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.3986923694610596, + "rewards/margins": 0.31767791509628296, + "rewards/rejected": -2.7163703441619873, + "sft_loss": 2.5431981086730957, + "step": 1350 + }, + { + "epoch": 0.7252048837598261, + "grad_norm": 11.667087496374497, + "learning_rate": 9.400719478771449e-07, + "logits/chosen": -0.056357402354478836, + "logits/rejected": 0.24279463291168213, + "logps/chosen": -2.555568218231201, + "logps/rejected": -3.031388759613037, + "loss": 0.6144, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.555568218231201, + "rewards/margins": 0.4758206307888031, + "rewards/rejected": -3.031388759613037, + "sft_loss": 2.605769634246826, + "step": 1355 + }, + { + "epoch": 0.7278809165412277, + "grad_norm": 12.788470711956991, + "learning_rate": 9.393305043577209e-07, + "logits/chosen": -0.15414592623710632, + "logits/rejected": -0.024874219670891762, + "logps/chosen": -2.698474168777466, + "logps/rejected": -3.311936616897583, + "loss": 0.5811, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.698474168777466, + "rewards/margins": 0.6134623289108276, + "rewards/rejected": -3.311936616897583, + "sft_loss": 2.8778061866760254, + "step": 1360 + }, + { + "epoch": 0.7305569493226292, + "grad_norm": 7.833984274422432, + "learning_rate": 9.38584798451817e-07, + "logits/chosen": -0.07727707922458649, + "logits/rejected": 0.06534862518310547, + "logps/chosen": -2.554556369781494, + "logps/rejected": -3.1005702018737793, + "loss": 0.5939, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.554556369781494, + "rewards/margins": 0.5460137128829956, + "rewards/rejected": -3.1005702018737793, + "sft_loss": 2.639235258102417, + "step": 1365 + }, + { + "epoch": 0.7332329821040308, + "grad_norm": 36.94322657241421, + "learning_rate": 9.37834837394275e-07, + "logits/chosen": -0.043488435447216034, + "logits/rejected": 0.08955095708370209, + "logps/chosen": -2.6442129611968994, + "logps/rejected": -3.3849635124206543, + "loss": 0.579, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.6442129611968994, + "rewards/margins": 0.7407506108283997, + "rewards/rejected": -3.3849635124206543, + "sft_loss": 2.7124366760253906, + "step": 1370 + }, + { + "epoch": 0.7359090148854324, + "grad_norm": 8.09388086134637, + "learning_rate": 9.370806284612203e-07, + "logits/chosen": -0.08221219480037689, + "logits/rejected": 0.07584551721811295, + "logps/chosen": -2.562751293182373, + "logps/rejected": -3.230823516845703, + "loss": 0.5687, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.562751293182373, + "rewards/margins": 0.6680727005004883, + "rewards/rejected": -3.230823516845703, + "sft_loss": 2.666548490524292, + "step": 1375 + }, + { + "epoch": 0.738585047666834, + "grad_norm": 10.802977372135773, + "learning_rate": 9.363221789699912e-07, + "logits/chosen": -0.12206602096557617, + "logits/rejected": 0.014964615926146507, + "logps/chosen": -2.536492109298706, + "logps/rejected": -2.920431613922119, + "loss": 0.6884, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.536492109298706, + "rewards/margins": 0.38393938541412354, + "rewards/rejected": -2.920431613922119, + "sft_loss": 2.5786898136138916, + "step": 1380 + }, + { + "epoch": 0.7412610804482355, + "grad_norm": 18.137912494220632, + "learning_rate": 9.355594962790682e-07, + "logits/chosen": -0.11852151155471802, + "logits/rejected": 0.0182892344892025, + "logps/chosen": -2.264341354370117, + "logps/rejected": -2.783043384552002, + "loss": 0.6116, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.264341354370117, + "rewards/margins": 0.5187021493911743, + "rewards/rejected": -2.783043384552002, + "sft_loss": 2.3643131256103516, + "step": 1385 + }, + { + "epoch": 0.7439371132296371, + "grad_norm": 10.143001252686217, + "learning_rate": 9.34792587788002e-07, + "logits/chosen": -0.010053953155875206, + "logits/rejected": 0.12595690786838531, + "logps/chosen": -2.3592190742492676, + "logps/rejected": -2.8065428733825684, + "loss": 0.6213, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.3592190742492676, + "rewards/margins": 0.4473237991333008, + "rewards/rejected": -2.8065428733825684, + "sft_loss": 2.4362292289733887, + "step": 1390 + }, + { + "epoch": 0.7466131460110387, + "grad_norm": 11.33999682092262, + "learning_rate": 9.34021460937342e-07, + "logits/chosen": 0.00567228440195322, + "logits/rejected": 0.099876768887043, + "logps/chosen": -2.412571668624878, + "logps/rejected": -2.805781602859497, + "loss": 0.6335, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.412571668624878, + "rewards/margins": 0.39320993423461914, + "rewards/rejected": -2.805781602859497, + "sft_loss": 2.475083589553833, + "step": 1395 + }, + { + "epoch": 0.7492891787924402, + "grad_norm": 9.03668816129448, + "learning_rate": 9.332461232085646e-07, + "logits/chosen": -0.21119725704193115, + "logits/rejected": -0.0663188025355339, + "logps/chosen": -2.5495543479919434, + "logps/rejected": -2.9691262245178223, + "loss": 0.6221, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.5495543479919434, + "rewards/margins": 0.41957202553749084, + "rewards/rejected": -2.9691262245178223, + "sft_loss": 2.6195390224456787, + "step": 1400 + }, + { + "epoch": 0.7519652115738418, + "grad_norm": 10.611564828740077, + "learning_rate": 9.324665821239998e-07, + "logits/chosen": -0.09098132699728012, + "logits/rejected": 0.10673109441995621, + "logps/chosen": -2.3393394947052, + "logps/rejected": -2.9454898834228516, + "loss": 0.6236, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.3393394947052, + "rewards/margins": 0.6061506271362305, + "rewards/rejected": -2.9454898834228516, + "sft_loss": 2.4549148082733154, + "step": 1405 + }, + { + "epoch": 0.7546412443552434, + "grad_norm": 14.889925201224557, + "learning_rate": 9.316828452467583e-07, + "logits/chosen": -0.09605233371257782, + "logits/rejected": 0.09647075086832047, + "logps/chosen": -2.418492555618286, + "logps/rejected": -2.98441743850708, + "loss": 0.5642, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.418492555618286, + "rewards/margins": 0.5659249424934387, + "rewards/rejected": -2.98441743850708, + "sft_loss": 2.5511069297790527, + "step": 1410 + }, + { + "epoch": 0.7573172771366449, + "grad_norm": 16.924932065414907, + "learning_rate": 9.30894920180659e-07, + "logits/chosen": -0.012868774123489857, + "logits/rejected": 0.13494983315467834, + "logps/chosen": -2.3770487308502197, + "logps/rejected": -2.7323498725891113, + "loss": 0.6441, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.3770487308502197, + "rewards/margins": 0.3553008735179901, + "rewards/rejected": -2.7323498725891113, + "sft_loss": 2.36963152885437, + "step": 1415 + }, + { + "epoch": 0.7599933099180465, + "grad_norm": 8.08049445992655, + "learning_rate": 9.301028145701543e-07, + "logits/chosen": -0.00014423727407120168, + "logits/rejected": 0.15110808610916138, + "logps/chosen": -2.4371447563171387, + "logps/rejected": -3.1043925285339355, + "loss": 0.5964, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.4371447563171387, + "rewards/margins": 0.6672475934028625, + "rewards/rejected": -3.1043925285339355, + "sft_loss": 2.590604066848755, + "step": 1420 + }, + { + "epoch": 0.7626693426994481, + "grad_norm": 8.365953003889633, + "learning_rate": 9.293065361002563e-07, + "logits/chosen": -0.0053975535556674, + "logits/rejected": 0.10485513508319855, + "logps/chosen": -2.463087558746338, + "logps/rejected": -3.069497585296631, + "loss": 0.5976, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.463087558746338, + "rewards/margins": 0.606410026550293, + "rewards/rejected": -3.069497585296631, + "sft_loss": 2.5250258445739746, + "step": 1425 + }, + { + "epoch": 0.7653453754808497, + "grad_norm": 14.122983727549176, + "learning_rate": 9.285060924964622e-07, + "logits/chosen": -0.1040046364068985, + "logits/rejected": 0.043417539447546005, + "logps/chosen": -2.6348214149475098, + "logps/rejected": -3.121133804321289, + "loss": 0.6123, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.6348214149475098, + "rewards/margins": 0.48631221055984497, + "rewards/rejected": -3.121133804321289, + "sft_loss": 2.6830124855041504, + "step": 1430 + }, + { + "epoch": 0.7680214082622512, + "grad_norm": 12.061809049765845, + "learning_rate": 9.277014915246792e-07, + "logits/chosen": 0.031954310834407806, + "logits/rejected": 0.09408613294363022, + "logps/chosen": -2.5333523750305176, + "logps/rejected": -3.221738338470459, + "loss": 0.5818, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5333523750305176, + "rewards/margins": 0.6883862614631653, + "rewards/rejected": -3.221738338470459, + "sft_loss": 2.635631561279297, + "step": 1435 + }, + { + "epoch": 0.7706974410436528, + "grad_norm": 8.854422848321477, + "learning_rate": 9.268927409911498e-07, + "logits/chosen": -0.06054989621043205, + "logits/rejected": 0.0525406114757061, + "logps/chosen": -2.517436981201172, + "logps/rejected": -2.995318651199341, + "loss": 0.6267, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.517436981201172, + "rewards/margins": 0.4778814911842346, + "rewards/rejected": -2.995318651199341, + "sft_loss": 2.6538000106811523, + "step": 1440 + }, + { + "epoch": 0.7733734738250544, + "grad_norm": 14.109188635463664, + "learning_rate": 9.260798487423749e-07, + "logits/chosen": -0.12987467646598816, + "logits/rejected": 0.0863143727183342, + "logps/chosen": -2.5347824096679688, + "logps/rejected": -3.0305581092834473, + "loss": 0.6116, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.5347824096679688, + "rewards/margins": 0.49577564001083374, + "rewards/rejected": -3.0305581092834473, + "sft_loss": 2.6745293140411377, + "step": 1445 + }, + { + "epoch": 0.7760495066064559, + "grad_norm": 17.81440390033541, + "learning_rate": 9.252628226650389e-07, + "logits/chosen": 0.003881660057231784, + "logits/rejected": 0.10796427726745605, + "logps/chosen": -2.5523457527160645, + "logps/rejected": -3.016080856323242, + "loss": 0.6686, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.5523457527160645, + "rewards/margins": 0.4637354016304016, + "rewards/rejected": -3.016080856323242, + "sft_loss": 2.617882013320923, + "step": 1450 + }, + { + "epoch": 0.7787255393878575, + "grad_norm": 11.42605708100968, + "learning_rate": 9.244416706859321e-07, + "logits/chosen": -0.06866137683391571, + "logits/rejected": 0.09967400878667831, + "logps/chosen": -2.4362640380859375, + "logps/rejected": -2.9746646881103516, + "loss": 0.6049, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.4362640380859375, + "rewards/margins": 0.5384005308151245, + "rewards/rejected": -2.9746646881103516, + "sft_loss": 2.526332139968872, + "step": 1455 + }, + { + "epoch": 0.7814015721692591, + "grad_norm": 8.45931725262656, + "learning_rate": 9.23616400771875e-07, + "logits/chosen": -0.04088922217488289, + "logits/rejected": 0.14248767495155334, + "logps/chosen": -2.4679696559906006, + "logps/rejected": -3.048508644104004, + "loss": 0.5875, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.4679696559906006, + "rewards/margins": 0.5805387496948242, + "rewards/rejected": -3.048508644104004, + "sft_loss": 2.4981863498687744, + "step": 1460 + }, + { + "epoch": 0.7840776049506607, + "grad_norm": 8.693422332707609, + "learning_rate": 9.227870209296395e-07, + "logits/chosen": -0.0389975979924202, + "logits/rejected": 0.09865973144769669, + "logps/chosen": -2.6156318187713623, + "logps/rejected": -3.040283679962158, + "loss": 0.638, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.6156318187713623, + "rewards/margins": 0.4246513843536377, + "rewards/rejected": -3.040283679962158, + "sft_loss": 2.6994106769561768, + "step": 1465 + }, + { + "epoch": 0.7867536377320622, + "grad_norm": 9.618870707688409, + "learning_rate": 9.219535392058728e-07, + "logits/chosen": -0.11955811083316803, + "logits/rejected": -0.08894553035497665, + "logps/chosen": -2.564248561859131, + "logps/rejected": -3.060483455657959, + "loss": 0.6321, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.564248561859131, + "rewards/margins": 0.4962351322174072, + "rewards/rejected": -3.060483455657959, + "sft_loss": 2.657533645629883, + "step": 1470 + }, + { + "epoch": 0.7894296705134638, + "grad_norm": 10.696480953576044, + "learning_rate": 9.211159636870181e-07, + "logits/chosen": -0.12024404108524323, + "logits/rejected": 0.06615696847438812, + "logps/chosen": -2.527562379837036, + "logps/rejected": -3.061629295349121, + "loss": 0.6019, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.527562379837036, + "rewards/margins": 0.5340667963027954, + "rewards/rejected": -3.061629295349121, + "sft_loss": 2.5972416400909424, + "step": 1475 + }, + { + "epoch": 0.7921057032948654, + "grad_norm": 12.054364447689002, + "learning_rate": 9.202743024992367e-07, + "logits/chosen": -0.005958074238151312, + "logits/rejected": 0.10832609236240387, + "logps/chosen": -2.411243200302124, + "logps/rejected": -3.0436832904815674, + "loss": 0.595, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.411243200302124, + "rewards/margins": 0.6324400901794434, + "rewards/rejected": -3.0436832904815674, + "sft_loss": 2.4762954711914062, + "step": 1480 + }, + { + "epoch": 0.7947817360762669, + "grad_norm": 13.297039351708401, + "learning_rate": 9.194285638083293e-07, + "logits/chosen": -0.003141486318781972, + "logits/rejected": 0.16741812229156494, + "logps/chosen": -2.579014539718628, + "logps/rejected": -3.246927261352539, + "loss": 0.5531, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.579014539718628, + "rewards/margins": 0.6679128408432007, + "rewards/rejected": -3.246927261352539, + "sft_loss": 2.678422212600708, + "step": 1485 + }, + { + "epoch": 0.7974577688576685, + "grad_norm": 13.486736198358573, + "learning_rate": 9.185787558196562e-07, + "logits/chosen": -0.0727333202958107, + "logits/rejected": 0.05342050641775131, + "logps/chosen": -2.5079057216644287, + "logps/rejected": -3.046525478363037, + "loss": 0.6101, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.5079057216644287, + "rewards/margins": 0.5386193990707397, + "rewards/rejected": -3.046525478363037, + "sft_loss": 2.6102094650268555, + "step": 1490 + }, + { + "epoch": 0.8001338016390701, + "grad_norm": 12.378423003736247, + "learning_rate": 9.177248867780583e-07, + "logits/chosen": -0.03273031860589981, + "logits/rejected": 0.07781729847192764, + "logps/chosen": -2.753040075302124, + "logps/rejected": -3.1269867420196533, + "loss": 0.6583, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.753040075302124, + "rewards/margins": 0.3739466071128845, + "rewards/rejected": -3.1269867420196533, + "sft_loss": 2.9218058586120605, + "step": 1495 + }, + { + "epoch": 0.8028098344204716, + "grad_norm": 11.683946013575603, + "learning_rate": 9.168669649677769e-07, + "logits/chosen": -0.08013884723186493, + "logits/rejected": 0.034627676010131836, + "logps/chosen": -2.6559319496154785, + "logps/rejected": -3.1538009643554688, + "loss": 0.6609, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.6559319496154785, + "rewards/margins": 0.497869074344635, + "rewards/rejected": -3.1538009643554688, + "sft_loss": 2.8204636573791504, + "step": 1500 + }, + { + "epoch": 0.8054858672018732, + "grad_norm": 11.430235771854319, + "learning_rate": 9.16004998712373e-07, + "logits/chosen": 0.003335425164550543, + "logits/rejected": 0.08415015041828156, + "logps/chosen": -2.660719394683838, + "logps/rejected": -3.2465882301330566, + "loss": 0.5838, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.660719394683838, + "rewards/margins": 0.5858690738677979, + "rewards/rejected": -3.2465882301330566, + "sft_loss": 2.7394461631774902, + "step": 1505 + }, + { + "epoch": 0.8081618999832748, + "grad_norm": 7.0444059398549035, + "learning_rate": 9.151389963746472e-07, + "logits/chosen": -0.05530339479446411, + "logits/rejected": 0.2480275183916092, + "logps/chosen": -2.5760626792907715, + "logps/rejected": -3.228938341140747, + "loss": 0.5571, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.5760626792907715, + "rewards/margins": 0.6528751850128174, + "rewards/rejected": -3.228938341140747, + "sft_loss": 2.6400935649871826, + "step": 1510 + }, + { + "epoch": 0.8108379327646764, + "grad_norm": 10.242804568872272, + "learning_rate": 9.142689663565577e-07, + "logits/chosen": 0.029218971729278564, + "logits/rejected": 0.10311850160360336, + "logps/chosen": -2.518911123275757, + "logps/rejected": -3.0885183811187744, + "loss": 0.5873, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.518911123275757, + "rewards/margins": 0.5696069598197937, + "rewards/rejected": -3.0885183811187744, + "sft_loss": 2.62086820602417, + "step": 1515 + }, + { + "epoch": 0.8135139655460779, + "grad_norm": 10.426215268521785, + "learning_rate": 9.133949170991397e-07, + "logits/chosen": -0.008902695961296558, + "logits/rejected": 0.09115861356258392, + "logps/chosen": -2.458341121673584, + "logps/rejected": -3.0484235286712646, + "loss": 0.5696, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.458341121673584, + "rewards/margins": 0.5900823473930359, + "rewards/rejected": -3.0484235286712646, + "sft_loss": 2.6848063468933105, + "step": 1520 + }, + { + "epoch": 0.8161899983274795, + "grad_norm": 9.322478402993056, + "learning_rate": 9.125168570824231e-07, + "logits/chosen": -0.036038726568222046, + "logits/rejected": 0.16355329751968384, + "logps/chosen": -2.571803092956543, + "logps/rejected": -3.060861587524414, + "loss": 0.6206, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.571803092956543, + "rewards/margins": 0.4890584945678711, + "rewards/rejected": -3.060861587524414, + "sft_loss": 2.6588542461395264, + "step": 1525 + }, + { + "epoch": 0.8188660311088811, + "grad_norm": 16.133013311367087, + "learning_rate": 9.116347948253496e-07, + "logits/chosen": -0.059060920029878616, + "logits/rejected": 0.08883248269557953, + "logps/chosen": -2.6606833934783936, + "logps/rejected": -3.106626272201538, + "loss": 0.6284, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.6606833934783936, + "rewards/margins": 0.4459429383277893, + "rewards/rejected": -3.106626272201538, + "sft_loss": 2.746152400970459, + "step": 1530 + }, + { + "epoch": 0.8215420638902826, + "grad_norm": 10.74009173882518, + "learning_rate": 9.107487388856916e-07, + "logits/chosen": -0.07637111842632294, + "logits/rejected": 0.12358202785253525, + "logps/chosen": -2.522994041442871, + "logps/rejected": -3.0614607334136963, + "loss": 0.5745, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.522994041442871, + "rewards/margins": 0.5384668707847595, + "rewards/rejected": -3.0614607334136963, + "sft_loss": 2.630826234817505, + "step": 1535 + }, + { + "epoch": 0.8242180966716842, + "grad_norm": 12.589286926809567, + "learning_rate": 9.098586978599673e-07, + "logits/chosen": -0.007654297165572643, + "logits/rejected": 0.16238531470298767, + "logps/chosen": -2.5522336959838867, + "logps/rejected": -3.288139820098877, + "loss": 0.5539, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.5522336959838867, + "rewards/margins": 0.7359061241149902, + "rewards/rejected": -3.288139820098877, + "sft_loss": 2.6166367530822754, + "step": 1540 + }, + { + "epoch": 0.8268941294530858, + "grad_norm": 10.204031227197607, + "learning_rate": 9.089646803833588e-07, + "logits/chosen": 0.0032329752575606108, + "logits/rejected": 0.16776946187019348, + "logps/chosen": -2.5196006298065186, + "logps/rejected": -3.0311813354492188, + "loss": 0.6056, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5196006298065186, + "rewards/margins": 0.5115808248519897, + "rewards/rejected": -3.0311813354492188, + "sft_loss": 2.6822352409362793, + "step": 1545 + }, + { + "epoch": 0.8295701622344873, + "grad_norm": 10.445154609189464, + "learning_rate": 9.080666951296276e-07, + "logits/chosen": -0.15377630293369293, + "logits/rejected": 0.12954029440879822, + "logps/chosen": -2.508655548095703, + "logps/rejected": -3.2540974617004395, + "loss": 0.5356, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.508655548095703, + "rewards/margins": 0.7454419136047363, + "rewards/rejected": -3.2540974617004395, + "sft_loss": 2.5944552421569824, + "step": 1550 + }, + { + "epoch": 0.8322461950158889, + "grad_norm": 11.455076652369373, + "learning_rate": 9.071647508110305e-07, + "logits/chosen": -0.12354922294616699, + "logits/rejected": 0.16556768119335175, + "logps/chosen": -2.5879435539245605, + "logps/rejected": -3.3230979442596436, + "loss": 0.5937, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.5879435539245605, + "rewards/margins": 0.7351543307304382, + "rewards/rejected": -3.3230979442596436, + "sft_loss": 2.6574831008911133, + "step": 1555 + }, + { + "epoch": 0.8349222277972905, + "grad_norm": 11.105711383512512, + "learning_rate": 9.062588561782354e-07, + "logits/chosen": 0.008125312626361847, + "logits/rejected": 0.09454827010631561, + "logps/chosen": -2.7672481536865234, + "logps/rejected": -3.295419216156006, + "loss": 0.6376, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.7672481536865234, + "rewards/margins": 0.5281708836555481, + "rewards/rejected": -3.295419216156006, + "sft_loss": 2.950342893600464, + "step": 1560 + }, + { + "epoch": 0.8375982605786921, + "grad_norm": 9.102603916421376, + "learning_rate": 9.053490200202358e-07, + "logits/chosen": -0.00024118572764564306, + "logits/rejected": 0.0962609052658081, + "logps/chosen": -2.770569086074829, + "logps/rejected": -3.313864231109619, + "loss": 0.6154, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.770569086074829, + "rewards/margins": 0.5432949066162109, + "rewards/rejected": -3.313864231109619, + "sft_loss": 2.8849356174468994, + "step": 1565 + }, + { + "epoch": 0.8402742933600936, + "grad_norm": 18.46276019459034, + "learning_rate": 9.044352511642661e-07, + "logits/chosen": 0.005351717583835125, + "logits/rejected": 0.044967781752347946, + "logps/chosen": -2.6325314044952393, + "logps/rejected": -3.045335292816162, + "loss": 0.6741, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.6325314044952393, + "rewards/margins": 0.4128040671348572, + "rewards/rejected": -3.045335292816162, + "sft_loss": 2.838153123855591, + "step": 1570 + }, + { + "epoch": 0.8429503261414952, + "grad_norm": 9.744510912842921, + "learning_rate": 9.03517558475716e-07, + "logits/chosen": -0.0044286372140049934, + "logits/rejected": 0.10475552082061768, + "logps/chosen": -2.3398423194885254, + "logps/rejected": -2.780120372772217, + "loss": 0.6058, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.3398423194885254, + "rewards/margins": 0.4402780532836914, + "rewards/rejected": -2.780120372772217, + "sft_loss": 2.416719436645508, + "step": 1575 + }, + { + "epoch": 0.8456263589228968, + "grad_norm": 10.136785199712552, + "learning_rate": 9.025959508580436e-07, + "logits/chosen": 0.029368087649345398, + "logits/rejected": 0.2844516932964325, + "logps/chosen": -2.4363322257995605, + "logps/rejected": -2.983987808227539, + "loss": 0.5794, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4363322257995605, + "rewards/margins": 0.5476558208465576, + "rewards/rejected": -2.983987808227539, + "sft_loss": 2.530247688293457, + "step": 1580 + }, + { + "epoch": 0.8483023917042983, + "grad_norm": 8.083867650728607, + "learning_rate": 9.016704372526905e-07, + "logits/chosen": -0.01698513887822628, + "logits/rejected": 0.15768086910247803, + "logps/chosen": -2.3244481086730957, + "logps/rejected": -2.885270118713379, + "loss": 0.5732, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.3244481086730957, + "rewards/margins": 0.5608222484588623, + "rewards/rejected": -2.885270118713379, + "sft_loss": 2.433962345123291, + "step": 1585 + }, + { + "epoch": 0.8509784244856999, + "grad_norm": 19.128381491713746, + "learning_rate": 9.007410266389934e-07, + "logits/chosen": -0.09792274236679077, + "logits/rejected": -0.009612299501895905, + "logps/chosen": -2.360170364379883, + "logps/rejected": -2.749636650085449, + "loss": 0.6318, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.360170364379883, + "rewards/margins": 0.38946622610092163, + "rewards/rejected": -2.749636650085449, + "sft_loss": 2.4644882678985596, + "step": 1590 + }, + { + "epoch": 0.8536544572671015, + "grad_norm": 11.983132406771713, + "learning_rate": 8.998077280340981e-07, + "logits/chosen": -0.010349246673285961, + "logits/rejected": 0.06283728778362274, + "logps/chosen": -2.491295099258423, + "logps/rejected": -2.836306095123291, + "loss": 0.6653, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.491295099258423, + "rewards/margins": 0.34501126408576965, + "rewards/rejected": -2.836306095123291, + "sft_loss": 2.5202624797821045, + "step": 1595 + }, + { + "epoch": 0.8563304900485031, + "grad_norm": 8.077069174188852, + "learning_rate": 8.988705504928722e-07, + "logits/chosen": -0.1035315990447998, + "logits/rejected": 0.1056133285164833, + "logps/chosen": -2.4259142875671387, + "logps/rejected": -3.097588300704956, + "loss": 0.5355, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.4259142875671387, + "rewards/margins": 0.6716742515563965, + "rewards/rejected": -3.097588300704956, + "sft_loss": 2.545226573944092, + "step": 1600 + }, + { + "epoch": 0.8563304900485031, + "eval_logits/chosen": 0.3133206367492676, + "eval_logits/rejected": 0.41915422677993774, + "eval_logps/chosen": -2.4305131435394287, + "eval_logps/rejected": -2.997382879257202, + "eval_loss": 0.575062096118927, + "eval_rewards/accuracies": 0.7062314748764038, + "eval_rewards/chosen": -2.4305131435394287, + "eval_rewards/margins": 0.5668694972991943, + "eval_rewards/rejected": -2.997382879257202, + "eval_runtime": 52.9695, + "eval_samples_per_second": 25.392, + "eval_sft_loss": 2.5634634494781494, + "eval_steps_per_second": 6.362, + "step": 1600 + }, + { + "epoch": 0.8590065228299046, + "grad_norm": 10.914004888548867, + "learning_rate": 8.979295031078157e-07, + "logits/chosen": -0.0882820338010788, + "logits/rejected": 0.163892924785614, + "logps/chosen": -2.531283140182495, + "logps/rejected": -3.127635955810547, + "loss": 0.5666, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.531283140182495, + "rewards/margins": 0.5963530540466309, + "rewards/rejected": -3.127635955810547, + "sft_loss": 2.6074588298797607, + "step": 1605 + }, + { + "epoch": 0.8616825556113062, + "grad_norm": 10.16876236997209, + "learning_rate": 8.969845950089751e-07, + "logits/chosen": -0.1145872101187706, + "logits/rejected": 0.07419227063655853, + "logps/chosen": -2.5505528450012207, + "logps/rejected": -3.200885057449341, + "loss": 0.5697, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.5505528450012207, + "rewards/margins": 0.6503321528434753, + "rewards/rejected": -3.200885057449341, + "sft_loss": 2.7054452896118164, + "step": 1610 + }, + { + "epoch": 0.8643585883927078, + "grad_norm": 14.031412502648942, + "learning_rate": 8.960358353638526e-07, + "logits/chosen": -0.04301288723945618, + "logits/rejected": 0.08894392102956772, + "logps/chosen": -2.5927734375, + "logps/rejected": -3.142228841781616, + "loss": 0.6424, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.5927734375, + "rewards/margins": 0.5494555234909058, + "rewards/rejected": -3.142228841781616, + "sft_loss": 2.692375898361206, + "step": 1615 + }, + { + "epoch": 0.8670346211741093, + "grad_norm": 10.380932361427314, + "learning_rate": 8.950832333773184e-07, + "logits/chosen": -0.026785671710968018, + "logits/rejected": 0.14500652253627777, + "logps/chosen": -2.513824224472046, + "logps/rejected": -3.0432636737823486, + "loss": 0.6333, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.513824224472046, + "rewards/margins": 0.5294393301010132, + "rewards/rejected": -3.0432636737823486, + "sft_loss": 2.635474443435669, + "step": 1620 + }, + { + "epoch": 0.869710653955511, + "grad_norm": 11.788160329897083, + "learning_rate": 8.941267982915213e-07, + "logits/chosen": 0.06771846115589142, + "logits/rejected": 0.12368135154247284, + "logps/chosen": -2.5657715797424316, + "logps/rejected": -2.8500313758850098, + "loss": 0.7095, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.5657715797424316, + "rewards/margins": 0.284260094165802, + "rewards/rejected": -2.8500313758850098, + "sft_loss": 2.569617509841919, + "step": 1625 + }, + { + "epoch": 0.8723866867369126, + "grad_norm": 7.220376061788953, + "learning_rate": 8.931665393857983e-07, + "logits/chosen": 0.00953853689134121, + "logits/rejected": 0.16754086315631866, + "logps/chosen": -2.31351375579834, + "logps/rejected": -2.8156228065490723, + "loss": 0.5925, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.31351375579834, + "rewards/margins": 0.502109169960022, + "rewards/rejected": -2.8156228065490723, + "sft_loss": 2.38759708404541, + "step": 1630 + }, + { + "epoch": 0.875062719518314, + "grad_norm": 8.497784358932309, + "learning_rate": 8.922024659765861e-07, + "logits/chosen": -0.08759839832782745, + "logits/rejected": 0.05009545758366585, + "logps/chosen": -2.1406750679016113, + "logps/rejected": -2.7028450965881348, + "loss": 0.5683, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1406750679016113, + "rewards/margins": 0.5621696710586548, + "rewards/rejected": -2.7028450965881348, + "sft_loss": 2.215157985687256, + "step": 1635 + }, + { + "epoch": 0.8777387522997157, + "grad_norm": 10.362610570766398, + "learning_rate": 8.912345874173288e-07, + "logits/chosen": -0.040607232600450516, + "logits/rejected": 0.09515784680843353, + "logps/chosen": -2.210148334503174, + "logps/rejected": -2.790614604949951, + "loss": 0.5847, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.210148334503174, + "rewards/margins": 0.5804664492607117, + "rewards/rejected": -2.790614604949951, + "sft_loss": 2.3017220497131348, + "step": 1640 + }, + { + "epoch": 0.8804147850811173, + "grad_norm": 10.46273728676422, + "learning_rate": 8.902629130983885e-07, + "logits/chosen": -0.016619618982076645, + "logits/rejected": 0.034366074949502945, + "logps/chosen": -2.285428524017334, + "logps/rejected": -2.6628036499023438, + "loss": 0.6234, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.285428524017334, + "rewards/margins": 0.37737521529197693, + "rewards/rejected": -2.6628036499023438, + "sft_loss": 2.3670566082000732, + "step": 1645 + }, + { + "epoch": 0.8830908178625189, + "grad_norm": 11.988584939935281, + "learning_rate": 8.892874524469537e-07, + "logits/chosen": 0.07678203284740448, + "logits/rejected": 0.13864430785179138, + "logps/chosen": -2.2349565029144287, + "logps/rejected": -2.7476844787597656, + "loss": 0.5722, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2349565029144287, + "rewards/margins": 0.5127274990081787, + "rewards/rejected": -2.7476844787597656, + "sft_loss": 2.2399978637695312, + "step": 1650 + }, + { + "epoch": 0.8857668506439204, + "grad_norm": 10.684458877547073, + "learning_rate": 8.883082149269478e-07, + "logits/chosen": -0.03853264078497887, + "logits/rejected": 0.08583366125822067, + "logps/chosen": -2.3231568336486816, + "logps/rejected": -2.8199455738067627, + "loss": 0.5862, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3231568336486816, + "rewards/margins": 0.4967884123325348, + "rewards/rejected": -2.8199455738067627, + "sft_loss": 2.355069875717163, + "step": 1655 + }, + { + "epoch": 0.888442883425322, + "grad_norm": 10.614315640564005, + "learning_rate": 8.873252100389377e-07, + "logits/chosen": 0.009507464244961739, + "logits/rejected": 0.06045049428939819, + "logps/chosen": -2.2326788902282715, + "logps/rejected": -2.739642381668091, + "loss": 0.5685, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.2326788902282715, + "rewards/margins": 0.5069635510444641, + "rewards/rejected": -2.739642381668091, + "sft_loss": 2.2458808422088623, + "step": 1660 + }, + { + "epoch": 0.8911189162067236, + "grad_norm": 12.65809689170843, + "learning_rate": 8.863384473200411e-07, + "logits/chosen": 0.013400441035628319, + "logits/rejected": 0.08506964892148972, + "logps/chosen": -2.543550968170166, + "logps/rejected": -2.9200243949890137, + "loss": 0.6529, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.543550968170166, + "rewards/margins": 0.37647324800491333, + "rewards/rejected": -2.9200243949890137, + "sft_loss": 2.5819945335388184, + "step": 1665 + }, + { + "epoch": 0.8937949489881251, + "grad_norm": 10.470244663605284, + "learning_rate": 8.853479363438342e-07, + "logits/chosen": 0.05795196816325188, + "logits/rejected": 0.2349679172039032, + "logps/chosen": -2.51816987991333, + "logps/rejected": -2.9122061729431152, + "loss": 0.6769, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.51816987991333, + "rewards/margins": 0.39403635263442993, + "rewards/rejected": -2.9122061729431152, + "sft_loss": 2.468186140060425, + "step": 1670 + }, + { + "epoch": 0.8964709817695267, + "grad_norm": 9.382088774982067, + "learning_rate": 8.843536867202588e-07, + "logits/chosen": 0.02824602648615837, + "logits/rejected": 0.254594087600708, + "logps/chosen": -2.4892547130584717, + "logps/rejected": -3.094567060470581, + "loss": 0.5948, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.4892547130584717, + "rewards/margins": 0.6053122878074646, + "rewards/rejected": -3.094567060470581, + "sft_loss": 2.575155735015869, + "step": 1675 + }, + { + "epoch": 0.8991470145509283, + "grad_norm": 12.346178841407921, + "learning_rate": 8.833557080955292e-07, + "logits/chosen": -0.06833375990390778, + "logits/rejected": 0.06057899445295334, + "logps/chosen": -2.488093614578247, + "logps/rejected": -2.9171090126037598, + "loss": 0.6347, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.488093614578247, + "rewards/margins": 0.42901507019996643, + "rewards/rejected": -2.9171090126037598, + "sft_loss": 2.5657894611358643, + "step": 1680 + }, + { + "epoch": 0.9018230473323299, + "grad_norm": 13.056931547739362, + "learning_rate": 8.823540101520381e-07, + "logits/chosen": 0.0010797411669045687, + "logits/rejected": 0.24253420531749725, + "logps/chosen": -2.361572742462158, + "logps/rejected": -2.9080519676208496, + "loss": 0.6233, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.361572742462158, + "rewards/margins": 0.5464791059494019, + "rewards/rejected": -2.9080519676208496, + "sft_loss": 2.4400172233581543, + "step": 1685 + }, + { + "epoch": 0.9044990801137314, + "grad_norm": 9.03111975442011, + "learning_rate": 8.813486026082637e-07, + "logits/chosen": -0.0008448451990261674, + "logits/rejected": 0.20967717468738556, + "logps/chosen": -2.329864025115967, + "logps/rejected": -2.955321788787842, + "loss": 0.5453, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.329864025115967, + "rewards/margins": 0.6254577040672302, + "rewards/rejected": -2.955321788787842, + "sft_loss": 2.4391579627990723, + "step": 1690 + }, + { + "epoch": 0.907175112895133, + "grad_norm": 17.373789805328247, + "learning_rate": 8.803394952186742e-07, + "logits/chosen": -0.1382381170988083, + "logits/rejected": 0.0370989628136158, + "logps/chosen": -2.4268882274627686, + "logps/rejected": -2.9267404079437256, + "loss": 0.601, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.4268882274627686, + "rewards/margins": 0.49985241889953613, + "rewards/rejected": -2.9267404079437256, + "sft_loss": 2.5433390140533447, + "step": 1695 + }, + { + "epoch": 0.9098511456765346, + "grad_norm": 12.136056902371177, + "learning_rate": 8.793266977736342e-07, + "logits/chosen": 0.04478234052658081, + "logits/rejected": -0.008689765818417072, + "logps/chosen": -2.5137197971343994, + "logps/rejected": -2.812690019607544, + "loss": 0.6569, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.5137197971343994, + "rewards/margins": 0.2989702820777893, + "rewards/rejected": -2.812690019607544, + "sft_loss": 2.6227807998657227, + "step": 1700 + }, + { + "epoch": 0.9125271784579361, + "grad_norm": 19.782307301050942, + "learning_rate": 8.783102200993085e-07, + "logits/chosen": 0.04610518366098404, + "logits/rejected": 0.20527955889701843, + "logps/chosen": -2.4842915534973145, + "logps/rejected": -3.0323004722595215, + "loss": 0.587, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.4842915534973145, + "rewards/margins": 0.5480088591575623, + "rewards/rejected": -3.0323004722595215, + "sft_loss": 2.596231698989868, + "step": 1705 + }, + { + "epoch": 0.9152032112393377, + "grad_norm": 9.505496059640969, + "learning_rate": 8.772900720575683e-07, + "logits/chosen": -0.013093151152133942, + "logits/rejected": 0.09648257493972778, + "logps/chosen": -2.617058038711548, + "logps/rejected": -3.034942150115967, + "loss": 0.63, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.617058038711548, + "rewards/margins": 0.4178840219974518, + "rewards/rejected": -3.034942150115967, + "sft_loss": 2.7770838737487793, + "step": 1710 + }, + { + "epoch": 0.9178792440207393, + "grad_norm": 11.789714973897295, + "learning_rate": 8.762662635458944e-07, + "logits/chosen": -0.015839237719774246, + "logits/rejected": 0.19138559699058533, + "logps/chosen": -2.698256015777588, + "logps/rejected": -3.1933929920196533, + "loss": 0.6454, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.698256015777588, + "rewards/margins": 0.49513691663742065, + "rewards/rejected": -3.1933929920196533, + "sft_loss": 2.781946897506714, + "step": 1715 + }, + { + "epoch": 0.9205552768021408, + "grad_norm": 11.412043051469436, + "learning_rate": 8.752388044972811e-07, + "logits/chosen": -0.0009754031780175865, + "logits/rejected": 0.08380867540836334, + "logps/chosen": -2.5303540229797363, + "logps/rejected": -3.1968514919281006, + "loss": 0.5591, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.5303540229797363, + "rewards/margins": 0.6664971113204956, + "rewards/rejected": -3.1968514919281006, + "sft_loss": 2.7090067863464355, + "step": 1720 + }, + { + "epoch": 0.9232313095835424, + "grad_norm": 8.995570834686255, + "learning_rate": 8.74207704880141e-07, + "logits/chosen": 0.018614167347550392, + "logits/rejected": 0.14034906029701233, + "logps/chosen": -2.7271647453308105, + "logps/rejected": -3.4992728233337402, + "loss": 0.5288, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7271647453308105, + "rewards/margins": 0.7721078991889954, + "rewards/rejected": -3.4992728233337402, + "sft_loss": 2.928816318511963, + "step": 1725 + }, + { + "epoch": 0.925907342364944, + "grad_norm": 12.144945793536012, + "learning_rate": 8.731729746982068e-07, + "logits/chosen": 0.05527348071336746, + "logits/rejected": 0.13575479388237, + "logps/chosen": -2.6572043895721436, + "logps/rejected": -3.217125654220581, + "loss": 0.5988, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6572043895721436, + "rewards/margins": 0.559921145439148, + "rewards/rejected": -3.217125654220581, + "sft_loss": 2.908268451690674, + "step": 1730 + }, + { + "epoch": 0.9285833751463456, + "grad_norm": 12.236971878685171, + "learning_rate": 8.721346239904355e-07, + "logits/chosen": -0.11078289896249771, + "logits/rejected": 0.10976402461528778, + "logps/chosen": -2.8110814094543457, + "logps/rejected": -3.594407558441162, + "loss": 0.5748, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.8110814094543457, + "rewards/margins": 0.7833271026611328, + "rewards/rejected": -3.594407558441162, + "sft_loss": 2.929760456085205, + "step": 1735 + }, + { + "epoch": 0.9312594079277471, + "grad_norm": 11.502499419546336, + "learning_rate": 8.710926628309101e-07, + "logits/chosen": -0.05445173382759094, + "logits/rejected": 0.1365424394607544, + "logps/chosen": -2.9095003604888916, + "logps/rejected": -3.4668147563934326, + "loss": 0.5874, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.9095003604888916, + "rewards/margins": 0.5573142170906067, + "rewards/rejected": -3.4668147563934326, + "sft_loss": 3.011441946029663, + "step": 1740 + }, + { + "epoch": 0.9339354407091487, + "grad_norm": 8.443109951812232, + "learning_rate": 8.700471013287424e-07, + "logits/chosen": 0.06129883602261543, + "logits/rejected": 0.09885939955711365, + "logps/chosen": -2.6593096256256104, + "logps/rejected": -3.227020263671875, + "loss": 0.5862, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.6593096256256104, + "rewards/margins": 0.5677107572555542, + "rewards/rejected": -3.227020263671875, + "sft_loss": 2.8268625736236572, + "step": 1745 + }, + { + "epoch": 0.9366114734905503, + "grad_norm": 16.493045146068436, + "learning_rate": 8.689979496279746e-07, + "logits/chosen": -0.0019575455226004124, + "logits/rejected": 0.06592272222042084, + "logps/chosen": -2.8948216438293457, + "logps/rejected": -3.321528911590576, + "loss": 0.6763, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.8948216438293457, + "rewards/margins": 0.4267074167728424, + "rewards/rejected": -3.321528911590576, + "sft_loss": 3.0294787883758545, + "step": 1750 + }, + { + "epoch": 0.9392875062719518, + "grad_norm": 11.25773668875448, + "learning_rate": 8.679452179074811e-07, + "logits/chosen": -0.04980425536632538, + "logits/rejected": 0.07690002769231796, + "logps/chosen": -2.7664177417755127, + "logps/rejected": -3.3948757648468018, + "loss": 0.5577, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7664177417755127, + "rewards/margins": 0.62845778465271, + "rewards/rejected": -3.3948757648468018, + "sft_loss": 2.962599277496338, + "step": 1755 + }, + { + "epoch": 0.9419635390533534, + "grad_norm": 13.445498840947511, + "learning_rate": 8.668889163808698e-07, + "logits/chosen": -0.024612322449684143, + "logits/rejected": 0.12784752249717712, + "logps/chosen": -2.734321117401123, + "logps/rejected": -3.302389621734619, + "loss": 0.5772, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.734321117401123, + "rewards/margins": 0.5680681467056274, + "rewards/rejected": -3.302389621734619, + "sft_loss": 2.892714262008667, + "step": 1760 + }, + { + "epoch": 0.944639571834755, + "grad_norm": 13.079442066176235, + "learning_rate": 8.658290552963827e-07, + "logits/chosen": 0.022269438952207565, + "logits/rejected": 0.05613626167178154, + "logps/chosen": -2.763192653656006, + "logps/rejected": -3.321763515472412, + "loss": 0.6194, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.763192653656006, + "rewards/margins": 0.5585710406303406, + "rewards/rejected": -3.321763515472412, + "sft_loss": 2.930387496948242, + "step": 1765 + }, + { + "epoch": 0.9473156046161565, + "grad_norm": 10.737616747574133, + "learning_rate": 8.647656449367966e-07, + "logits/chosen": 0.007398630026727915, + "logits/rejected": 0.1687568575143814, + "logps/chosen": -2.7443089485168457, + "logps/rejected": -3.232015609741211, + "loss": 0.626, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.7443089485168457, + "rewards/margins": 0.4877067506313324, + "rewards/rejected": -3.232015609741211, + "sft_loss": 2.9475789070129395, + "step": 1770 + }, + { + "epoch": 0.9499916373975581, + "grad_norm": 11.013113629479797, + "learning_rate": 8.636986956193235e-07, + "logits/chosen": -0.0652175024151802, + "logits/rejected": 0.053606174886226654, + "logps/chosen": -2.545154333114624, + "logps/rejected": -3.148621082305908, + "loss": 0.5846, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.545154333114624, + "rewards/margins": 0.6034666895866394, + "rewards/rejected": -3.148621082305908, + "sft_loss": 2.7204573154449463, + "step": 1775 + }, + { + "epoch": 0.9526676701789597, + "grad_norm": 10.651167349747164, + "learning_rate": 8.626282176955104e-07, + "logits/chosen": -0.03264901041984558, + "logits/rejected": 0.10116423666477203, + "logps/chosen": -2.4588916301727295, + "logps/rejected": -3.0085623264312744, + "loss": 0.5825, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.4588916301727295, + "rewards/margins": 0.5496702790260315, + "rewards/rejected": -3.0085623264312744, + "sft_loss": 2.547887086868286, + "step": 1780 + }, + { + "epoch": 0.9553437029603613, + "grad_norm": 15.623065206500607, + "learning_rate": 8.615542215511389e-07, + "logits/chosen": 0.04355254024267197, + "logits/rejected": 0.1262408196926117, + "logps/chosen": -2.4711506366729736, + "logps/rejected": -2.809096336364746, + "loss": 0.6557, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.4711506366729736, + "rewards/margins": 0.33794572949409485, + "rewards/rejected": -2.809096336364746, + "sft_loss": 2.529564619064331, + "step": 1785 + }, + { + "epoch": 0.9580197357417628, + "grad_norm": 13.881284427213853, + "learning_rate": 8.604767176061241e-07, + "logits/chosen": 0.08312052488327026, + "logits/rejected": 0.16090969741344452, + "logps/chosen": -2.548485279083252, + "logps/rejected": -2.9776816368103027, + "loss": 0.6146, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.548485279083252, + "rewards/margins": 0.42919617891311646, + "rewards/rejected": -2.9776816368103027, + "sft_loss": 2.6493735313415527, + "step": 1790 + }, + { + "epoch": 0.9606957685231644, + "grad_norm": 8.169881197230874, + "learning_rate": 8.593957163144141e-07, + "logits/chosen": -0.057188816368579865, + "logits/rejected": 0.09091904014348984, + "logps/chosen": -2.39776349067688, + "logps/rejected": -2.983966588973999, + "loss": 0.5808, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.39776349067688, + "rewards/margins": 0.5862034559249878, + "rewards/rejected": -2.983966588973999, + "sft_loss": 2.574922800064087, + "step": 1795 + }, + { + "epoch": 0.963371801304566, + "grad_norm": 9.207422884537966, + "learning_rate": 8.58311228163888e-07, + "logits/chosen": -0.02254396490752697, + "logits/rejected": 0.06835971772670746, + "logps/chosen": -2.5146327018737793, + "logps/rejected": -2.956911563873291, + "loss": 0.6064, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.5146327018737793, + "rewards/margins": 0.44227901101112366, + "rewards/rejected": -2.956911563873291, + "sft_loss": 2.6073718070983887, + "step": 1800 + }, + { + "epoch": 0.9660478340859675, + "grad_norm": 11.647852185686641, + "learning_rate": 8.57223263676255e-07, + "logits/chosen": -0.1361158937215805, + "logits/rejected": 0.005249606911092997, + "logps/chosen": -2.355909824371338, + "logps/rejected": -3.086822509765625, + "loss": 0.5193, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.355909824371338, + "rewards/margins": 0.7309123277664185, + "rewards/rejected": -3.086822509765625, + "sft_loss": 2.463071346282959, + "step": 1805 + }, + { + "epoch": 0.9687238668673691, + "grad_norm": 10.142898164937732, + "learning_rate": 8.561318334069511e-07, + "logits/chosen": -0.011810372583568096, + "logits/rejected": 0.1351340115070343, + "logps/chosen": -2.4389145374298096, + "logps/rejected": -2.9896650314331055, + "loss": 0.5704, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.4389145374298096, + "rewards/margins": 0.5507505536079407, + "rewards/rejected": -2.9896650314331055, + "sft_loss": 2.5153541564941406, + "step": 1810 + }, + { + "epoch": 0.9713998996487707, + "grad_norm": 9.915497698492498, + "learning_rate": 8.550369479450375e-07, + "logits/chosen": -0.061279840767383575, + "logits/rejected": 0.09167703241109848, + "logps/chosen": -2.593907117843628, + "logps/rejected": -3.216207504272461, + "loss": 0.5646, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.593907117843628, + "rewards/margins": 0.6223001480102539, + "rewards/rejected": -3.216207504272461, + "sft_loss": 2.7145023345947266, + "step": 1815 + }, + { + "epoch": 0.9740759324301723, + "grad_norm": 11.75845777405208, + "learning_rate": 8.539386179130977e-07, + "logits/chosen": -0.03900023549795151, + "logits/rejected": 0.016491200774908066, + "logps/chosen": -2.6452884674072266, + "logps/rejected": -3.208444118499756, + "loss": 0.5861, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.6452884674072266, + "rewards/margins": 0.5631558299064636, + "rewards/rejected": -3.208444118499756, + "sft_loss": 2.685883045196533, + "step": 1820 + }, + { + "epoch": 0.9767519652115738, + "grad_norm": 10.256406392333993, + "learning_rate": 8.528368539671347e-07, + "logits/chosen": -0.1221277266740799, + "logits/rejected": 0.047670863568782806, + "logps/chosen": -2.4664437770843506, + "logps/rejected": -3.2738850116729736, + "loss": 0.541, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4664437770843506, + "rewards/margins": 0.807441234588623, + "rewards/rejected": -3.2738850116729736, + "sft_loss": 2.6216683387756348, + "step": 1825 + }, + { + "epoch": 0.9794279979929754, + "grad_norm": 11.766825056658277, + "learning_rate": 8.51731666796467e-07, + "logits/chosen": 0.008026264607906342, + "logits/rejected": 0.0632171481847763, + "logps/chosen": -2.8379218578338623, + "logps/rejected": -3.3935370445251465, + "loss": 0.6042, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.8379218578338623, + "rewards/margins": 0.5556154251098633, + "rewards/rejected": -3.3935370445251465, + "sft_loss": 2.949826955795288, + "step": 1830 + }, + { + "epoch": 0.982104030774377, + "grad_norm": 12.179530743385463, + "learning_rate": 8.506230671236254e-07, + "logits/chosen": -0.1061452180147171, + "logits/rejected": -0.022467706352472305, + "logps/chosen": -2.8701460361480713, + "logps/rejected": -3.3405520915985107, + "loss": 0.6117, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.8701460361480713, + "rewards/margins": 0.47040629386901855, + "rewards/rejected": -3.3405520915985107, + "sft_loss": 2.990164279937744, + "step": 1835 + }, + { + "epoch": 0.9847800635557785, + "grad_norm": 10.285677662244042, + "learning_rate": 8.495110657042488e-07, + "logits/chosen": -0.057100921869277954, + "logits/rejected": 0.10320702940225601, + "logps/chosen": -3.0298407077789307, + "logps/rejected": -3.7158799171447754, + "loss": 0.5614, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.0298407077789307, + "rewards/margins": 0.6860392093658447, + "rewards/rejected": -3.7158799171447754, + "sft_loss": 3.214426040649414, + "step": 1840 + }, + { + "epoch": 0.9874560963371801, + "grad_norm": 18.13952521344405, + "learning_rate": 8.483956733269799e-07, + "logits/chosen": -0.08308350294828415, + "logits/rejected": 0.027438625693321228, + "logps/chosen": -3.064002513885498, + "logps/rejected": -3.6987290382385254, + "loss": 0.6068, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.064002513885498, + "rewards/margins": 0.6347264051437378, + "rewards/rejected": -3.6987290382385254, + "sft_loss": 3.298839569091797, + "step": 1845 + }, + { + "epoch": 0.9901321291185817, + "grad_norm": 15.704432394955504, + "learning_rate": 8.472769008133602e-07, + "logits/chosen": -0.21905367076396942, + "logits/rejected": -0.06774080544710159, + "logps/chosen": -3.2466511726379395, + "logps/rejected": -3.815436840057373, + "loss": 0.6149, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.2466511726379395, + "rewards/margins": 0.5687858462333679, + "rewards/rejected": -3.815436840057373, + "sft_loss": 3.38019061088562, + "step": 1850 + }, + { + "epoch": 0.9928081618999832, + "grad_norm": 13.349749187918693, + "learning_rate": 8.461547590177259e-07, + "logits/chosen": -0.11080940067768097, + "logits/rejected": 0.0334743857383728, + "logps/chosen": -3.068286418914795, + "logps/rejected": -3.7758116722106934, + "loss": 0.6029, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.068286418914795, + "rewards/margins": 0.7075250744819641, + "rewards/rejected": -3.7758116722106934, + "sft_loss": 3.3414759635925293, + "step": 1855 + }, + { + "epoch": 0.9954841946813848, + "grad_norm": 16.693407798704268, + "learning_rate": 8.450292588271014e-07, + "logits/chosen": -0.0996282771229744, + "logits/rejected": 0.02317485585808754, + "logps/chosen": -3.228316068649292, + "logps/rejected": -3.8254122734069824, + "loss": 0.6024, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.228316068649292, + "rewards/margins": 0.5970960259437561, + "rewards/rejected": -3.8254122734069824, + "sft_loss": 3.3384742736816406, + "step": 1860 + }, + { + "epoch": 0.9981602274627864, + "grad_norm": 12.2068079158362, + "learning_rate": 8.439004111610945e-07, + "logits/chosen": -0.09745468199253082, + "logits/rejected": -0.00797833502292633, + "logps/chosen": -2.809922695159912, + "logps/rejected": -3.3745949268341064, + "loss": 0.6085, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.809922695159912, + "rewards/margins": 0.5646719932556152, + "rewards/rejected": -3.3745949268341064, + "sft_loss": 2.949306011199951, + "step": 1865 + }, + { + "epoch": 1.000836260244188, + "grad_norm": 10.777854658083896, + "learning_rate": 8.427682269717901e-07, + "logits/chosen": -0.13089512288570404, + "logits/rejected": 0.013808004558086395, + "logps/chosen": -2.7855277061462402, + "logps/rejected": -3.438579559326172, + "loss": 0.5581, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7855277061462402, + "rewards/margins": 0.6530521512031555, + "rewards/rejected": -3.438579559326172, + "sft_loss": 2.905142307281494, + "step": 1870 + }, + { + "epoch": 1.0035122930255895, + "grad_norm": 14.023255853880016, + "learning_rate": 8.416327172436446e-07, + "logits/chosen": -0.16114047169685364, + "logits/rejected": -0.011397371999919415, + "logps/chosen": -2.7088358402252197, + "logps/rejected": -3.1518595218658447, + "loss": 0.6186, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.7088358402252197, + "rewards/margins": 0.44302353262901306, + "rewards/rejected": -3.1518595218658447, + "sft_loss": 2.7553300857543945, + "step": 1875 + }, + { + "epoch": 1.0061883258069912, + "grad_norm": 10.75467613467904, + "learning_rate": 8.404938929933778e-07, + "logits/chosen": -0.03138697147369385, + "logits/rejected": 0.13168206810951233, + "logps/chosen": -2.6109001636505127, + "logps/rejected": -3.409217119216919, + "loss": 0.5165, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.6109001636505127, + "rewards/margins": 0.7983168959617615, + "rewards/rejected": -3.409217119216919, + "sft_loss": 2.7761216163635254, + "step": 1880 + }, + { + "epoch": 1.0088643585883927, + "grad_norm": 9.570772308359619, + "learning_rate": 8.39351765269868e-07, + "logits/chosen": -0.10052132606506348, + "logits/rejected": -0.013578740879893303, + "logps/chosen": -2.5562167167663574, + "logps/rejected": -3.0964174270629883, + "loss": 0.6024, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.5562167167663574, + "rewards/margins": 0.54020094871521, + "rewards/rejected": -3.0964174270629883, + "sft_loss": 2.6829943656921387, + "step": 1885 + }, + { + "epoch": 1.0115403913697942, + "grad_norm": 13.743912767629043, + "learning_rate": 8.382063451540431e-07, + "logits/chosen": -0.10133364051580429, + "logits/rejected": 0.12248275429010391, + "logps/chosen": -2.7118899822235107, + "logps/rejected": -3.332120418548584, + "loss": 0.567, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.7118899822235107, + "rewards/margins": 0.620230495929718, + "rewards/rejected": -3.332120418548584, + "sft_loss": 2.9557461738586426, + "step": 1890 + }, + { + "epoch": 1.014216424151196, + "grad_norm": 10.624028115849335, + "learning_rate": 8.370576437587742e-07, + "logits/chosen": -0.05728140473365784, + "logits/rejected": -0.016837697476148605, + "logps/chosen": -2.6872398853302, + "logps/rejected": -3.2197844982147217, + "loss": 0.5828, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.6872398853302, + "rewards/margins": 0.5325449109077454, + "rewards/rejected": -3.2197844982147217, + "sft_loss": 2.794914722442627, + "step": 1895 + }, + { + "epoch": 1.0168924569325974, + "grad_norm": 10.188213475469688, + "learning_rate": 8.359056722287674e-07, + "logits/chosen": -0.1692381203174591, + "logits/rejected": 0.10236310958862305, + "logps/chosen": -2.693068265914917, + "logps/rejected": -3.3337535858154297, + "loss": 0.5717, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.693068265914917, + "rewards/margins": 0.6406850814819336, + "rewards/rejected": -3.3337535858154297, + "sft_loss": 2.888385772705078, + "step": 1900 + }, + { + "epoch": 1.019568489713999, + "grad_norm": 9.126408253755638, + "learning_rate": 8.347504417404553e-07, + "logits/chosen": -0.07820995151996613, + "logits/rejected": 0.06870969384908676, + "logps/chosen": -2.7257328033447266, + "logps/rejected": -3.2857375144958496, + "loss": 0.6121, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.7257328033447266, + "rewards/margins": 0.5600049495697021, + "rewards/rejected": -3.2857375144958496, + "sft_loss": 2.827327251434326, + "step": 1905 + }, + { + "epoch": 1.0222445224954007, + "grad_norm": 8.491548692705383, + "learning_rate": 8.335919635018893e-07, + "logits/chosen": -0.17100057005882263, + "logits/rejected": -0.023897219449281693, + "logps/chosen": -2.6321306228637695, + "logps/rejected": -3.1332454681396484, + "loss": 0.5968, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6321306228637695, + "rewards/margins": 0.5011148452758789, + "rewards/rejected": -3.1332454681396484, + "sft_loss": 2.7944517135620117, + "step": 1910 + }, + { + "epoch": 1.0249205552768021, + "grad_norm": 9.103701034722597, + "learning_rate": 8.324302487526303e-07, + "logits/chosen": -0.10997509956359863, + "logits/rejected": -0.007670065853744745, + "logps/chosen": -2.7183170318603516, + "logps/rejected": -3.2243950366973877, + "loss": 0.5872, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.7183170318603516, + "rewards/margins": 0.5060782432556152, + "rewards/rejected": -3.2243950366973877, + "sft_loss": 2.8439135551452637, + "step": 1915 + }, + { + "epoch": 1.0275965880582036, + "grad_norm": 8.442057525715944, + "learning_rate": 8.312653087636398e-07, + "logits/chosen": -0.13612933456897736, + "logits/rejected": -0.045088279992341995, + "logps/chosen": -2.5174169540405273, + "logps/rejected": -3.1568408012390137, + "loss": 0.565, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.5174169540405273, + "rewards/margins": 0.6394241452217102, + "rewards/rejected": -3.1568408012390137, + "sft_loss": 2.7010550498962402, + "step": 1920 + }, + { + "epoch": 1.0302726208396054, + "grad_norm": 12.833982485757991, + "learning_rate": 8.300971548371711e-07, + "logits/chosen": -0.25432294607162476, + "logits/rejected": -0.04225042089819908, + "logps/chosen": -2.7238693237304688, + "logps/rejected": -3.3072547912597656, + "loss": 0.5698, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.7238693237304688, + "rewards/margins": 0.5833858251571655, + "rewards/rejected": -3.3072547912597656, + "sft_loss": 2.861285924911499, + "step": 1925 + }, + { + "epoch": 1.0329486536210069, + "grad_norm": 13.724749293706079, + "learning_rate": 8.289257983066582e-07, + "logits/chosen": -0.17298512160778046, + "logits/rejected": -0.03237203508615494, + "logps/chosen": -2.628120183944702, + "logps/rejected": -3.2568199634552, + "loss": 0.5586, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.628120183944702, + "rewards/margins": 0.6286996603012085, + "rewards/rejected": -3.2568199634552, + "sft_loss": 2.8256094455718994, + "step": 1930 + }, + { + "epoch": 1.0356246864024083, + "grad_norm": 14.119179582690768, + "learning_rate": 8.277512505366077e-07, + "logits/chosen": -0.2202630490064621, + "logits/rejected": -0.010647903196513653, + "logps/chosen": -2.709089994430542, + "logps/rejected": -3.362189531326294, + "loss": 0.5749, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.709089994430542, + "rewards/margins": 0.6530997157096863, + "rewards/rejected": -3.362189531326294, + "sft_loss": 2.794367790222168, + "step": 1935 + }, + { + "epoch": 1.03830071918381, + "grad_norm": 10.603355210152403, + "learning_rate": 8.265735229224868e-07, + "logits/chosen": -0.13367509841918945, + "logits/rejected": -0.0059948088601231575, + "logps/chosen": -2.6444592475891113, + "logps/rejected": -3.3636856079101562, + "loss": 0.5557, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.6444592475891113, + "rewards/margins": 0.7192264795303345, + "rewards/rejected": -3.3636856079101562, + "sft_loss": 2.6509063243865967, + "step": 1940 + }, + { + "epoch": 1.0409767519652116, + "grad_norm": 7.671753470196667, + "learning_rate": 8.253926268906144e-07, + "logits/chosen": -0.22960379719734192, + "logits/rejected": -0.06299453228712082, + "logps/chosen": -2.6862707138061523, + "logps/rejected": -3.4968085289001465, + "loss": 0.5132, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.6862707138061523, + "rewards/margins": 0.810538113117218, + "rewards/rejected": -3.4968085289001465, + "sft_loss": 2.782322406768799, + "step": 1945 + }, + { + "epoch": 1.043652784746613, + "grad_norm": 10.997777173510148, + "learning_rate": 8.242085738980487e-07, + "logits/chosen": -0.13848312199115753, + "logits/rejected": 0.09042768180370331, + "logps/chosen": -2.704963207244873, + "logps/rejected": -3.3764922618865967, + "loss": 0.5578, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.704963207244873, + "rewards/margins": 0.6715291738510132, + "rewards/rejected": -3.3764922618865967, + "sft_loss": 2.77587628364563, + "step": 1950 + }, + { + "epoch": 1.0463288175280148, + "grad_norm": 11.821301015944938, + "learning_rate": 8.230213754324772e-07, + "logits/chosen": -0.1645531952381134, + "logits/rejected": -0.09662993997335434, + "logps/chosen": -2.6595356464385986, + "logps/rejected": -3.2250595092773438, + "loss": 0.567, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.6595356464385986, + "rewards/margins": 0.5655234456062317, + "rewards/rejected": -3.2250595092773438, + "sft_loss": 2.7365801334381104, + "step": 1955 + }, + { + "epoch": 1.0490048503094163, + "grad_norm": 11.270848247245398, + "learning_rate": 8.218310430121045e-07, + "logits/chosen": -0.1551237255334854, + "logits/rejected": -0.11444219201803207, + "logps/chosen": -2.605210542678833, + "logps/rejected": -3.154036045074463, + "loss": 0.5935, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.605210542678833, + "rewards/margins": 0.5488253235816956, + "rewards/rejected": -3.154036045074463, + "sft_loss": 2.6858043670654297, + "step": 1960 + }, + { + "epoch": 1.051680883090818, + "grad_norm": 10.422792273873622, + "learning_rate": 8.20637588185541e-07, + "logits/chosen": -0.09268782287836075, + "logits/rejected": -0.012595447711646557, + "logps/chosen": -2.611571788787842, + "logps/rejected": -3.4692039489746094, + "loss": 0.5112, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.611571788787842, + "rewards/margins": 0.8576324582099915, + "rewards/rejected": -3.4692039489746094, + "sft_loss": 2.760436534881592, + "step": 1965 + }, + { + "epoch": 1.0543569158722195, + "grad_norm": 9.90903026753714, + "learning_rate": 8.194410225316906e-07, + "logits/chosen": -0.16010849177837372, + "logits/rejected": 0.008882254362106323, + "logps/chosen": -2.4900741577148438, + "logps/rejected": -3.0593371391296387, + "loss": 0.5737, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.4900741577148438, + "rewards/margins": 0.569263219833374, + "rewards/rejected": -3.0593371391296387, + "sft_loss": 2.582530975341797, + "step": 1970 + }, + { + "epoch": 1.057032948653621, + "grad_norm": 11.780133581006648, + "learning_rate": 8.182413576596385e-07, + "logits/chosen": -0.017460066825151443, + "logits/rejected": 0.07679884135723114, + "logps/chosen": -2.45796275138855, + "logps/rejected": -3.05059814453125, + "loss": 0.5745, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.45796275138855, + "rewards/margins": 0.5926356911659241, + "rewards/rejected": -3.05059814453125, + "sft_loss": 2.583042860031128, + "step": 1975 + }, + { + "epoch": 1.0597089814350227, + "grad_norm": 11.184196786986732, + "learning_rate": 8.170386052085389e-07, + "logits/chosen": -0.021548813208937645, + "logits/rejected": 0.09033869951963425, + "logps/chosen": -2.6186671257019043, + "logps/rejected": -3.222067356109619, + "loss": 0.6004, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.6186671257019043, + "rewards/margins": 0.6034001111984253, + "rewards/rejected": -3.222067356109619, + "sft_loss": 2.7017316818237305, + "step": 1980 + }, + { + "epoch": 1.0623850142164242, + "grad_norm": 11.45281124333676, + "learning_rate": 8.158327768475008e-07, + "logits/chosen": -0.0954420417547226, + "logits/rejected": 0.07028938829898834, + "logps/chosen": -2.580390453338623, + "logps/rejected": -3.0125155448913574, + "loss": 0.6408, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.580390453338623, + "rewards/margins": 0.4321257174015045, + "rewards/rejected": -3.0125155448913574, + "sft_loss": 2.6577117443084717, + "step": 1985 + }, + { + "epoch": 1.0650610469978257, + "grad_norm": 12.735874658213135, + "learning_rate": 8.146238842754767e-07, + "logits/chosen": -0.13700740039348602, + "logits/rejected": -0.03511708974838257, + "logps/chosen": -2.569897174835205, + "logps/rejected": -3.0620884895324707, + "loss": 0.5878, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.569897174835205, + "rewards/margins": 0.4921916127204895, + "rewards/rejected": -3.0620884895324707, + "sft_loss": 2.6316020488739014, + "step": 1990 + }, + { + "epoch": 1.0677370797792274, + "grad_norm": 12.016988520778332, + "learning_rate": 8.134119392211476e-07, + "logits/chosen": -0.04812765121459961, + "logits/rejected": 0.11438647657632828, + "logps/chosen": -2.4896674156188965, + "logps/rejected": -3.2544426918029785, + "loss": 0.5309, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.4896674156188965, + "rewards/margins": 0.7647750973701477, + "rewards/rejected": -3.2544426918029785, + "sft_loss": 2.6088204383850098, + "step": 1995 + }, + { + "epoch": 1.0704131125606289, + "grad_norm": 15.774458155051652, + "learning_rate": 8.121969534428094e-07, + "logits/chosen": -0.1408371925354004, + "logits/rejected": 0.027349501848220825, + "logps/chosen": -2.6003634929656982, + "logps/rejected": -3.204463243484497, + "loss": 0.6075, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6003634929656982, + "rewards/margins": 0.6040997505187988, + "rewards/rejected": -3.204463243484497, + "sft_loss": 2.717322826385498, + "step": 2000 + }, + { + "epoch": 1.0704131125606289, + "eval_logits/chosen": 0.24547554552555084, + "eval_logits/rejected": 0.3535785973072052, + "eval_logps/chosen": -2.534740924835205, + "eval_logps/rejected": -3.195641040802002, + "eval_loss": 0.5675076842308044, + "eval_rewards/accuracies": 0.716617226600647, + "eval_rewards/chosen": -2.534740924835205, + "eval_rewards/margins": 0.6609002351760864, + "eval_rewards/rejected": -3.195641040802002, + "eval_runtime": 49.7579, + "eval_samples_per_second": 27.031, + "eval_sft_loss": 2.677027702331543, + "eval_steps_per_second": 6.773, + "step": 2000 + }, + { + "epoch": 1.0730891453420304, + "grad_norm": 12.550971047870846, + "learning_rate": 8.109789387282599e-07, + "logits/chosen": -0.09266819059848785, + "logits/rejected": -0.02057764120399952, + "logps/chosen": -2.560471773147583, + "logps/rejected": -3.081770658493042, + "loss": 0.6251, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.560471773147583, + "rewards/margins": 0.5212991833686829, + "rewards/rejected": -3.081770658493042, + "sft_loss": 2.682894706726074, + "step": 2005 + }, + { + "epoch": 1.075765178123432, + "grad_norm": 14.01923033463733, + "learning_rate": 8.097579068946827e-07, + "logits/chosen": -0.05897900462150574, + "logits/rejected": 0.06726238876581192, + "logps/chosen": -2.394794464111328, + "logps/rejected": -2.990241289138794, + "loss": 0.5569, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.394794464111328, + "rewards/margins": 0.5954467058181763, + "rewards/rejected": -2.990241289138794, + "sft_loss": 2.538649559020996, + "step": 2010 + }, + { + "epoch": 1.0784412109048336, + "grad_norm": 10.839356405411152, + "learning_rate": 8.085338697885344e-07, + "logits/chosen": -0.1080300584435463, + "logits/rejected": 0.028784072026610374, + "logps/chosen": -2.469409465789795, + "logps/rejected": -3.049816608428955, + "loss": 0.5759, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.469409465789795, + "rewards/margins": 0.5804071426391602, + "rewards/rejected": -3.049816608428955, + "sft_loss": 2.5252418518066406, + "step": 2015 + }, + { + "epoch": 1.081117243686235, + "grad_norm": 13.273536157240635, + "learning_rate": 8.073068392854282e-07, + "logits/chosen": -0.2106810063123703, + "logits/rejected": 0.010429046116769314, + "logps/chosen": -2.5872609615325928, + "logps/rejected": -3.283881664276123, + "loss": 0.5248, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5872609615325928, + "rewards/margins": 0.6966210007667542, + "rewards/rejected": -3.283881664276123, + "sft_loss": 2.6289219856262207, + "step": 2020 + }, + { + "epoch": 1.0837932764676368, + "grad_norm": 10.380937321819658, + "learning_rate": 8.060768272900193e-07, + "logits/chosen": -0.04295419156551361, + "logits/rejected": 0.11656694114208221, + "logps/chosen": -2.4731650352478027, + "logps/rejected": -3.125133991241455, + "loss": 0.5726, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.4731650352478027, + "rewards/margins": 0.6519689559936523, + "rewards/rejected": -3.125133991241455, + "sft_loss": 2.6329140663146973, + "step": 2025 + }, + { + "epoch": 1.0864693092490383, + "grad_norm": 8.969741241293566, + "learning_rate": 8.0484384573589e-07, + "logits/chosen": -0.13495591282844543, + "logits/rejected": -0.09867937862873077, + "logps/chosen": -2.4684181213378906, + "logps/rejected": -3.0391838550567627, + "loss": 0.57, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.4684181213378906, + "rewards/margins": 0.5707659721374512, + "rewards/rejected": -3.0391838550567627, + "sft_loss": 2.571349620819092, + "step": 2030 + }, + { + "epoch": 1.0891453420304398, + "grad_norm": 15.580026823869845, + "learning_rate": 8.03607906585432e-07, + "logits/chosen": -0.15621896088123322, + "logits/rejected": 0.01897679828107357, + "logps/chosen": -2.595720052719116, + "logps/rejected": -3.1915364265441895, + "loss": 0.5966, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.595720052719116, + "rewards/margins": 0.5958161950111389, + "rewards/rejected": -3.1915364265441895, + "sft_loss": 2.727245330810547, + "step": 2035 + }, + { + "epoch": 1.0918213748118415, + "grad_norm": 22.842739177982196, + "learning_rate": 8.023690218297329e-07, + "logits/chosen": -0.22418169677257538, + "logits/rejected": -0.1372956484556198, + "logps/chosen": -2.629754066467285, + "logps/rejected": -3.2748329639434814, + "loss": 0.58, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.629754066467285, + "rewards/margins": 0.645078718662262, + "rewards/rejected": -3.2748329639434814, + "sft_loss": 2.68192982673645, + "step": 2040 + }, + { + "epoch": 1.094497407593243, + "grad_norm": 13.12177568543038, + "learning_rate": 8.01127203488458e-07, + "logits/chosen": -0.08619613200426102, + "logits/rejected": -0.030371153727173805, + "logps/chosen": -2.731705665588379, + "logps/rejected": -3.35599946975708, + "loss": 0.5613, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.731705665588379, + "rewards/margins": 0.6242938041687012, + "rewards/rejected": -3.35599946975708, + "sft_loss": 2.846529960632324, + "step": 2045 + }, + { + "epoch": 1.0971734403746445, + "grad_norm": 13.03628061104775, + "learning_rate": 7.998824636097339e-07, + "logits/chosen": -0.17338579893112183, + "logits/rejected": -0.01972215436398983, + "logps/chosen": -2.701442003250122, + "logps/rejected": -3.3588452339172363, + "loss": 0.5609, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.701442003250122, + "rewards/margins": 0.6574033498764038, + "rewards/rejected": -3.3588452339172363, + "sft_loss": 2.878345012664795, + "step": 2050 + }, + { + "epoch": 1.0998494731560462, + "grad_norm": 13.795442492625112, + "learning_rate": 7.986348142700328e-07, + "logits/chosen": -0.12506094574928284, + "logits/rejected": 0.049610551446676254, + "logps/chosen": -2.8180489540100098, + "logps/rejected": -3.6988799571990967, + "loss": 0.5293, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8180489540100098, + "rewards/margins": 0.8808309435844421, + "rewards/rejected": -3.6988799571990967, + "sft_loss": 3.0852441787719727, + "step": 2055 + }, + { + "epoch": 1.1025255059374477, + "grad_norm": 18.801175453987884, + "learning_rate": 7.973842675740539e-07, + "logits/chosen": -0.09460072964429855, + "logits/rejected": -0.019903432577848434, + "logps/chosen": -2.8260114192962646, + "logps/rejected": -3.6575844287872314, + "loss": 0.5206, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.8260114192962646, + "rewards/margins": 0.8315728306770325, + "rewards/rejected": -3.6575844287872314, + "sft_loss": 3.039412498474121, + "step": 2060 + }, + { + "epoch": 1.1052015387188494, + "grad_norm": 15.622175643204827, + "learning_rate": 7.961308356546066e-07, + "logits/chosen": -0.13542449474334717, + "logits/rejected": 0.031426429748535156, + "logps/chosen": -2.97768235206604, + "logps/rejected": -3.8668739795684814, + "loss": 0.5144, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.97768235206604, + "rewards/margins": 0.8891918063163757, + "rewards/rejected": -3.8668739795684814, + "sft_loss": 3.1043858528137207, + "step": 2065 + }, + { + "epoch": 1.107877571500251, + "grad_norm": 19.316503635391115, + "learning_rate": 7.948745306724931e-07, + "logits/chosen": -0.10220174491405487, + "logits/rejected": 0.06846420466899872, + "logps/chosen": -2.8224167823791504, + "logps/rejected": -3.7612388134002686, + "loss": 0.4854, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.8224167823791504, + "rewards/margins": 0.9388219714164734, + "rewards/rejected": -3.7612388134002686, + "sft_loss": 2.9477078914642334, + "step": 2070 + }, + { + "epoch": 1.1105536042816524, + "grad_norm": 18.5568200414671, + "learning_rate": 7.936153648163897e-07, + "logits/chosen": -0.1605706512928009, + "logits/rejected": -0.02944830060005188, + "logps/chosen": -3.0751566886901855, + "logps/rejected": -3.7981693744659424, + "loss": 0.5777, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.0751566886901855, + "rewards/margins": 0.7230121493339539, + "rewards/rejected": -3.7981693744659424, + "sft_loss": 3.372133731842041, + "step": 2075 + }, + { + "epoch": 1.1132296370630541, + "grad_norm": 13.343185976912125, + "learning_rate": 7.92353350302729e-07, + "logits/chosen": -0.21381273865699768, + "logits/rejected": -0.019857224076986313, + "logps/chosen": -2.8909995555877686, + "logps/rejected": -3.6995689868927, + "loss": 0.5401, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8909995555877686, + "rewards/margins": 0.8085689544677734, + "rewards/rejected": -3.6995689868927, + "sft_loss": 3.1301767826080322, + "step": 2080 + }, + { + "epoch": 1.1159056698444556, + "grad_norm": 17.805283689818808, + "learning_rate": 7.910884993755816e-07, + "logits/chosen": -0.14839546382427216, + "logits/rejected": -0.04645346850156784, + "logps/chosen": -2.9273176193237305, + "logps/rejected": -3.6850764751434326, + "loss": 0.5572, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.9273176193237305, + "rewards/margins": 0.7577590942382812, + "rewards/rejected": -3.6850764751434326, + "sft_loss": 3.088479518890381, + "step": 2085 + }, + { + "epoch": 1.118581702625857, + "grad_norm": 14.030737674244467, + "learning_rate": 7.898208243065367e-07, + "logits/chosen": -0.20233619213104248, + "logits/rejected": -0.18629398941993713, + "logps/chosen": -2.8093066215515137, + "logps/rejected": -3.3424696922302246, + "loss": 0.6199, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.8093066215515137, + "rewards/margins": 0.5331630706787109, + "rewards/rejected": -3.3424696922302246, + "sft_loss": 3.0653176307678223, + "step": 2090 + }, + { + "epoch": 1.1212577354072588, + "grad_norm": 14.668997301854489, + "learning_rate": 7.88550337394583e-07, + "logits/chosen": -0.221414715051651, + "logits/rejected": -0.0698011964559555, + "logps/chosen": -2.9920241832733154, + "logps/rejected": -3.6130146980285645, + "loss": 0.5959, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9920241832733154, + "rewards/margins": 0.6209903359413147, + "rewards/rejected": -3.6130146980285645, + "sft_loss": 3.0735440254211426, + "step": 2095 + }, + { + "epoch": 1.1239337681886603, + "grad_norm": 15.491495900754604, + "learning_rate": 7.872770509659905e-07, + "logits/chosen": -0.10512229055166245, + "logits/rejected": -0.03650935739278793, + "logps/chosen": -2.888662338256836, + "logps/rejected": -3.501347064971924, + "loss": 0.5887, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.888662338256836, + "rewards/margins": 0.612684428691864, + "rewards/rejected": -3.501347064971924, + "sft_loss": 2.9852821826934814, + "step": 2100 + }, + { + "epoch": 1.1266098009700618, + "grad_norm": 14.576856389525108, + "learning_rate": 7.860009773741896e-07, + "logits/chosen": -0.08463722467422485, + "logits/rejected": 0.07376393675804138, + "logps/chosen": -2.693789482116699, + "logps/rejected": -3.459043025970459, + "loss": 0.5221, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.693789482116699, + "rewards/margins": 0.7652538418769836, + "rewards/rejected": -3.459043025970459, + "sft_loss": 2.7604660987854004, + "step": 2105 + }, + { + "epoch": 1.1292858337514635, + "grad_norm": 16.062435436979722, + "learning_rate": 7.84722128999652e-07, + "logits/chosen": -0.13985982537269592, + "logits/rejected": 0.019844168797135353, + "logps/chosen": -2.6246771812438965, + "logps/rejected": -3.4403557777404785, + "loss": 0.5577, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.6246771812438965, + "rewards/margins": 0.815678596496582, + "rewards/rejected": -3.4403557777404785, + "sft_loss": 2.7694716453552246, + "step": 2110 + }, + { + "epoch": 1.131961866532865, + "grad_norm": 13.179507477842277, + "learning_rate": 7.834405182497699e-07, + "logits/chosen": -0.01281226146966219, + "logits/rejected": 0.046844713389873505, + "logps/chosen": -2.728203535079956, + "logps/rejected": -3.4431490898132324, + "loss": 0.5681, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.728203535079956, + "rewards/margins": 0.7149455547332764, + "rewards/rejected": -3.4431490898132324, + "sft_loss": 2.8578782081604004, + "step": 2115 + }, + { + "epoch": 1.1346378993142665, + "grad_norm": 12.75446394519099, + "learning_rate": 7.821561575587368e-07, + "logits/chosen": -0.14359095692634583, + "logits/rejected": -0.0907754972577095, + "logps/chosen": -2.6554677486419678, + "logps/rejected": -3.2389018535614014, + "loss": 0.5536, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6554677486419678, + "rewards/margins": 0.5834343433380127, + "rewards/rejected": -3.2389018535614014, + "sft_loss": 2.838890790939331, + "step": 2120 + }, + { + "epoch": 1.1373139320956682, + "grad_norm": 10.722427530408353, + "learning_rate": 7.808690593874254e-07, + "logits/chosen": -0.13219062983989716, + "logits/rejected": -0.03505239635705948, + "logps/chosen": -2.730032444000244, + "logps/rejected": -3.409017562866211, + "loss": 0.5558, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.730032444000244, + "rewards/margins": 0.6789852380752563, + "rewards/rejected": -3.409017562866211, + "sft_loss": 2.899423360824585, + "step": 2125 + }, + { + "epoch": 1.1399899648770697, + "grad_norm": 13.255145077665803, + "learning_rate": 7.79579236223268e-07, + "logits/chosen": -0.09360867738723755, + "logits/rejected": 0.18282029032707214, + "logps/chosen": -2.653068780899048, + "logps/rejected": -3.4807612895965576, + "loss": 0.526, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.653068780899048, + "rewards/margins": 0.8276923894882202, + "rewards/rejected": -3.4807612895965576, + "sft_loss": 2.8217830657958984, + "step": 2130 + }, + { + "epoch": 1.1426659976584714, + "grad_norm": 13.204016973156914, + "learning_rate": 7.782867005801346e-07, + "logits/chosen": -0.10313459485769272, + "logits/rejected": 0.11331169307231903, + "logps/chosen": -2.686976909637451, + "logps/rejected": -3.457796096801758, + "loss": 0.5432, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.686976909637451, + "rewards/margins": 0.7708194255828857, + "rewards/rejected": -3.457796096801758, + "sft_loss": 2.7527642250061035, + "step": 2135 + }, + { + "epoch": 1.145342030439873, + "grad_norm": 17.918813310487703, + "learning_rate": 7.769914649982117e-07, + "logits/chosen": -0.09085582196712494, + "logits/rejected": 0.07258772850036621, + "logps/chosen": -2.6320650577545166, + "logps/rejected": -3.3432857990264893, + "loss": 0.5614, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.6320650577545166, + "rewards/margins": 0.7112206220626831, + "rewards/rejected": -3.3432857990264893, + "sft_loss": 2.7670674324035645, + "step": 2140 + }, + { + "epoch": 1.1480180632212744, + "grad_norm": 13.3017856458366, + "learning_rate": 7.756935420438803e-07, + "logits/chosen": -0.1044057160615921, + "logits/rejected": 0.009650531224906445, + "logps/chosen": -2.556666135787964, + "logps/rejected": -3.480056047439575, + "loss": 0.5091, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.556666135787964, + "rewards/margins": 0.923389732837677, + "rewards/rejected": -3.480056047439575, + "sft_loss": 2.6876208782196045, + "step": 2145 + }, + { + "epoch": 1.1506940960026761, + "grad_norm": 9.98772600061051, + "learning_rate": 7.743929443095951e-07, + "logits/chosen": -0.11959944665431976, + "logits/rejected": -0.039528582245111465, + "logps/chosen": -2.725451946258545, + "logps/rejected": -3.507183790206909, + "loss": 0.5099, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.725451946258545, + "rewards/margins": 0.7817317843437195, + "rewards/rejected": -3.507183790206909, + "sft_loss": 2.7946159839630127, + "step": 2150 + }, + { + "epoch": 1.1533701287840776, + "grad_norm": 13.38597052407973, + "learning_rate": 7.730896844137609e-07, + "logits/chosen": -0.08520710468292236, + "logits/rejected": 0.016001040115952492, + "logps/chosen": -2.9222166538238525, + "logps/rejected": -3.529139280319214, + "loss": 0.6033, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.9222166538238525, + "rewards/margins": 0.6069226861000061, + "rewards/rejected": -3.529139280319214, + "sft_loss": 3.0721733570098877, + "step": 2155 + }, + { + "epoch": 1.1560461615654791, + "grad_norm": 16.46895036226173, + "learning_rate": 7.717837750006106e-07, + "logits/chosen": -0.1309281587600708, + "logits/rejected": -0.0354643315076828, + "logps/chosen": -2.7004354000091553, + "logps/rejected": -3.518594741821289, + "loss": 0.5459, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.7004354000091553, + "rewards/margins": 0.8181589841842651, + "rewards/rejected": -3.518594741821289, + "sft_loss": 2.789832592010498, + "step": 2160 + }, + { + "epoch": 1.1587221943468808, + "grad_norm": 12.847039245754097, + "learning_rate": 7.704752287400832e-07, + "logits/chosen": -0.08045268058776855, + "logits/rejected": 0.1180802583694458, + "logps/chosen": -2.740691661834717, + "logps/rejected": -3.5636367797851562, + "loss": 0.551, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.740691661834717, + "rewards/margins": 0.8229446411132812, + "rewards/rejected": -3.5636367797851562, + "sft_loss": 2.8766980171203613, + "step": 2165 + }, + { + "epoch": 1.1613982271282823, + "grad_norm": 9.856853536296434, + "learning_rate": 7.691640583277004e-07, + "logits/chosen": -0.07948831468820572, + "logits/rejected": 0.099387027323246, + "logps/chosen": -2.6131155490875244, + "logps/rejected": -3.3903956413269043, + "loss": 0.5483, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6131155490875244, + "rewards/margins": 0.7772801518440247, + "rewards/rejected": -3.3903956413269043, + "sft_loss": 2.744523286819458, + "step": 2170 + }, + { + "epoch": 1.1640742599096838, + "grad_norm": 10.170757207302868, + "learning_rate": 7.678502764844433e-07, + "logits/chosen": -0.1351918876171112, + "logits/rejected": 0.06504303961992264, + "logps/chosen": -2.585547685623169, + "logps/rejected": -3.219287157058716, + "loss": 0.5553, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.585547685623169, + "rewards/margins": 0.6337396502494812, + "rewards/rejected": -3.219287157058716, + "sft_loss": 2.7263667583465576, + "step": 2175 + }, + { + "epoch": 1.1667502926910855, + "grad_norm": 11.671673346294954, + "learning_rate": 7.665338959566288e-07, + "logits/chosen": -0.12973728775978088, + "logits/rejected": -0.02659899927675724, + "logps/chosen": -2.515537738800049, + "logps/rejected": -3.344698667526245, + "loss": 0.5147, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.515537738800049, + "rewards/margins": 0.8291610479354858, + "rewards/rejected": -3.344698667526245, + "sft_loss": 2.676945924758911, + "step": 2180 + }, + { + "epoch": 1.169426325472487, + "grad_norm": 13.71221290860054, + "learning_rate": 7.652149295157868e-07, + "logits/chosen": -0.047227486968040466, + "logits/rejected": 0.1276738941669464, + "logps/chosen": -2.6473500728607178, + "logps/rejected": -3.255404233932495, + "loss": 0.5629, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6473500728607178, + "rewards/margins": 0.6080541610717773, + "rewards/rejected": -3.255404233932495, + "sft_loss": 2.734590530395508, + "step": 2185 + }, + { + "epoch": 1.1721023582538885, + "grad_norm": 12.438608083959897, + "learning_rate": 7.638933899585354e-07, + "logits/chosen": 0.027154380455613136, + "logits/rejected": 0.06761490553617477, + "logps/chosen": -2.582916259765625, + "logps/rejected": -3.297471284866333, + "loss": 0.5451, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.582916259765625, + "rewards/margins": 0.7145551443099976, + "rewards/rejected": -3.297471284866333, + "sft_loss": 2.8727736473083496, + "step": 2190 + }, + { + "epoch": 1.1747783910352902, + "grad_norm": 11.917197558048063, + "learning_rate": 7.625692901064573e-07, + "logits/chosen": -0.04968901723623276, + "logits/rejected": 0.06473871320486069, + "logps/chosen": -2.7895896434783936, + "logps/rejected": -3.569030284881592, + "loss": 0.5934, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.7895896434783936, + "rewards/margins": 0.7794402837753296, + "rewards/rejected": -3.569030284881592, + "sft_loss": 3.034745693206787, + "step": 2195 + }, + { + "epoch": 1.1774544238166917, + "grad_norm": 11.495106920955104, + "learning_rate": 7.61242642805975e-07, + "logits/chosen": -0.10112420469522476, + "logits/rejected": -0.10614802688360214, + "logps/chosen": -2.657148599624634, + "logps/rejected": -3.3440566062927246, + "loss": 0.5671, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.657148599624634, + "rewards/margins": 0.6869081258773804, + "rewards/rejected": -3.3440566062927246, + "sft_loss": 2.8682680130004883, + "step": 2200 + }, + { + "epoch": 1.1801304565980932, + "grad_norm": 12.574700982249729, + "learning_rate": 7.599134609282266e-07, + "logits/chosen": -0.16951291263103485, + "logits/rejected": 0.036975160241127014, + "logps/chosen": -2.642416000366211, + "logps/rejected": -3.3090901374816895, + "loss": 0.5598, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.642416000366211, + "rewards/margins": 0.6666739583015442, + "rewards/rejected": -3.3090901374816895, + "sft_loss": 2.740629196166992, + "step": 2205 + }, + { + "epoch": 1.182806489379495, + "grad_norm": 13.447116701349463, + "learning_rate": 7.585817573689402e-07, + "logits/chosen": -0.2006872147321701, + "logits/rejected": -0.06577114015817642, + "logps/chosen": -2.430807113647461, + "logps/rejected": -3.3179214000701904, + "loss": 0.4972, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.430807113647461, + "rewards/margins": 0.8871143460273743, + "rewards/rejected": -3.3179214000701904, + "sft_loss": 2.5946366786956787, + "step": 2210 + }, + { + "epoch": 1.1854825221608964, + "grad_norm": 13.341152863137065, + "learning_rate": 7.572475450483098e-07, + "logits/chosen": -0.15490292012691498, + "logits/rejected": -0.05388979986310005, + "logps/chosen": -2.692946195602417, + "logps/rejected": -3.3634724617004395, + "loss": 0.5757, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.692946195602417, + "rewards/margins": 0.6705261468887329, + "rewards/rejected": -3.3634724617004395, + "sft_loss": 2.7373454570770264, + "step": 2215 + }, + { + "epoch": 1.188158554942298, + "grad_norm": 13.048397341515892, + "learning_rate": 7.559108369108689e-07, + "logits/chosen": -0.1885337084531784, + "logits/rejected": -0.05167509242892265, + "logps/chosen": -2.5184414386749268, + "logps/rejected": -3.193661689758301, + "loss": 0.5845, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.5184414386749268, + "rewards/margins": 0.6752203702926636, + "rewards/rejected": -3.193661689758301, + "sft_loss": 2.647007703781128, + "step": 2220 + }, + { + "epoch": 1.1908345877236997, + "grad_norm": 11.605630875359319, + "learning_rate": 7.54571645925366e-07, + "logits/chosen": -0.2051846981048584, + "logits/rejected": 0.06742997467517853, + "logps/chosen": -2.474386215209961, + "logps/rejected": -3.319561004638672, + "loss": 0.5138, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.474386215209961, + "rewards/margins": 0.8451749682426453, + "rewards/rejected": -3.319561004638672, + "sft_loss": 2.6089258193969727, + "step": 2225 + }, + { + "epoch": 1.1935106205051011, + "grad_norm": 16.90516707069004, + "learning_rate": 7.532299850846378e-07, + "logits/chosen": -0.17644917964935303, + "logits/rejected": 0.006234446074813604, + "logps/chosen": -2.490910053253174, + "logps/rejected": -3.3691134452819824, + "loss": 0.5263, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.490910053253174, + "rewards/margins": 0.8782032132148743, + "rewards/rejected": -3.3691134452819824, + "sft_loss": 2.5951409339904785, + "step": 2230 + }, + { + "epoch": 1.1961866532865026, + "grad_norm": 24.034250993795567, + "learning_rate": 7.518858674054838e-07, + "logits/chosen": -0.184208482503891, + "logits/rejected": 0.034039318561553955, + "logps/chosen": -2.527522563934326, + "logps/rejected": -3.3448543548583984, + "loss": 0.531, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.527522563934326, + "rewards/margins": 0.8173316717147827, + "rewards/rejected": -3.3448543548583984, + "sft_loss": 2.6108803749084473, + "step": 2235 + }, + { + "epoch": 1.1988626860679044, + "grad_norm": 11.839183488169162, + "learning_rate": 7.505393059285394e-07, + "logits/chosen": -0.15044936537742615, + "logits/rejected": 0.0458507277071476, + "logps/chosen": -2.6617484092712402, + "logps/rejected": -3.3631751537323, + "loss": 0.5553, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.6617484092712402, + "rewards/margins": 0.7014267444610596, + "rewards/rejected": -3.3631751537323, + "sft_loss": 2.855072021484375, + "step": 2240 + }, + { + "epoch": 1.2015387188493059, + "grad_norm": 14.701365832973247, + "learning_rate": 7.491903137181501e-07, + "logits/chosen": -0.09620432555675507, + "logits/rejected": -0.04263802617788315, + "logps/chosen": -2.5127742290496826, + "logps/rejected": -3.233936309814453, + "loss": 0.5351, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5127742290496826, + "rewards/margins": 0.7211618423461914, + "rewards/rejected": -3.233936309814453, + "sft_loss": 2.6880221366882324, + "step": 2245 + }, + { + "epoch": 1.2042147516307076, + "grad_norm": 13.0410833738944, + "learning_rate": 7.478389038622441e-07, + "logits/chosen": -0.036066021770238876, + "logits/rejected": 0.009444182738661766, + "logps/chosen": -2.5961806774139404, + "logps/rejected": -3.322415590286255, + "loss": 0.5558, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5961806774139404, + "rewards/margins": 0.7262347936630249, + "rewards/rejected": -3.322415590286255, + "sft_loss": 2.7093794345855713, + "step": 2250 + }, + { + "epoch": 1.206890784412109, + "grad_norm": 16.972296740618447, + "learning_rate": 7.46485089472206e-07, + "logits/chosen": -0.15286365151405334, + "logits/rejected": -0.0631868839263916, + "logps/chosen": -2.5808475017547607, + "logps/rejected": -3.2137513160705566, + "loss": 0.5988, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.5808475017547607, + "rewards/margins": 0.6329033374786377, + "rewards/rejected": -3.2137513160705566, + "sft_loss": 2.66843581199646, + "step": 2255 + }, + { + "epoch": 1.2095668171935106, + "grad_norm": 12.446230685664725, + "learning_rate": 7.451288836827487e-07, + "logits/chosen": -0.09740933030843735, + "logits/rejected": -0.09939023107290268, + "logps/chosen": -2.4693121910095215, + "logps/rejected": -3.026426315307617, + "loss": 0.5838, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4693121910095215, + "rewards/margins": 0.5571140646934509, + "rewards/rejected": -3.026426315307617, + "sft_loss": 2.6136717796325684, + "step": 2260 + }, + { + "epoch": 1.2122428499749123, + "grad_norm": 11.761230687897601, + "learning_rate": 7.437702996517869e-07, + "logits/chosen": -0.13074907660484314, + "logits/rejected": -0.030215833336114883, + "logps/chosen": -2.479123115539551, + "logps/rejected": -3.2345752716064453, + "loss": 0.53, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.479123115539551, + "rewards/margins": 0.7554522752761841, + "rewards/rejected": -3.2345752716064453, + "sft_loss": 2.625739574432373, + "step": 2265 + }, + { + "epoch": 1.2149188827563138, + "grad_norm": 16.20048329992292, + "learning_rate": 7.424093505603087e-07, + "logits/chosen": -0.23178374767303467, + "logits/rejected": -0.036921434104442596, + "logps/chosen": -2.546027421951294, + "logps/rejected": -3.364816665649414, + "loss": 0.5121, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.546027421951294, + "rewards/margins": 0.8187891244888306, + "rewards/rejected": -3.364816665649414, + "sft_loss": 2.596057891845703, + "step": 2270 + }, + { + "epoch": 1.2175949155377153, + "grad_norm": 13.237538300780992, + "learning_rate": 7.410460496122482e-07, + "logits/chosen": -0.15380941331386566, + "logits/rejected": 0.012906426563858986, + "logps/chosen": -2.4451916217803955, + "logps/rejected": -3.311616897583008, + "loss": 0.497, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.4451916217803955, + "rewards/margins": 0.8664249181747437, + "rewards/rejected": -3.311616897583008, + "sft_loss": 2.575024127960205, + "step": 2275 + }, + { + "epoch": 1.220270948319117, + "grad_norm": 13.956827265019383, + "learning_rate": 7.396804100343572e-07, + "logits/chosen": -0.22626054286956787, + "logits/rejected": -0.012976361438632011, + "logps/chosen": -2.4243850708007812, + "logps/rejected": -3.1488213539123535, + "loss": 0.5332, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4243850708007812, + "rewards/margins": 0.7244361639022827, + "rewards/rejected": -3.1488213539123535, + "sft_loss": 2.582690477371216, + "step": 2280 + }, + { + "epoch": 1.2229469811005185, + "grad_norm": 10.327241906099365, + "learning_rate": 7.383124450760768e-07, + "logits/chosen": -0.19221287965774536, + "logits/rejected": 0.026269104331731796, + "logps/chosen": -2.713867425918579, + "logps/rejected": -3.5505664348602295, + "loss": 0.525, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.713867425918579, + "rewards/margins": 0.8366985321044922, + "rewards/rejected": -3.5505664348602295, + "sft_loss": 2.8076956272125244, + "step": 2285 + }, + { + "epoch": 1.22562301388192, + "grad_norm": 16.810249159493672, + "learning_rate": 7.369421680094091e-07, + "logits/chosen": -0.2720238268375397, + "logits/rejected": -0.0906338170170784, + "logps/chosen": -2.614173650741577, + "logps/rejected": -3.446650266647339, + "loss": 0.5747, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.614173650741577, + "rewards/margins": 0.8324767351150513, + "rewards/rejected": -3.446650266647339, + "sft_loss": 2.7610104084014893, + "step": 2290 + }, + { + "epoch": 1.2282990466633217, + "grad_norm": 13.34223976398338, + "learning_rate": 7.355695921287881e-07, + "logits/chosen": -0.2368827760219574, + "logits/rejected": -0.1460256278514862, + "logps/chosen": -2.7980031967163086, + "logps/rejected": -3.4800288677215576, + "loss": 0.61, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.7980031967163086, + "rewards/margins": 0.6820257306098938, + "rewards/rejected": -3.4800288677215576, + "sft_loss": 2.993485689163208, + "step": 2295 + }, + { + "epoch": 1.2309750794447232, + "grad_norm": 18.378836446546607, + "learning_rate": 7.341947307509513e-07, + "logits/chosen": -0.1851898729801178, + "logits/rejected": -0.03527377173304558, + "logps/chosen": -2.6871867179870605, + "logps/rejected": -3.432643175125122, + "loss": 0.5874, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.6871867179870605, + "rewards/margins": 0.7454566359519958, + "rewards/rejected": -3.432643175125122, + "sft_loss": 2.8437156677246094, + "step": 2300 + }, + { + "epoch": 1.233651112226125, + "grad_norm": 15.379182647175359, + "learning_rate": 7.328175972148094e-07, + "logits/chosen": -0.19557121396064758, + "logits/rejected": -0.06361202895641327, + "logps/chosen": -2.899049758911133, + "logps/rejected": -3.6863160133361816, + "loss": 0.5622, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.899049758911133, + "rewards/margins": 0.7872657775878906, + "rewards/rejected": -3.6863160133361816, + "sft_loss": 2.9786953926086426, + "step": 2305 + }, + { + "epoch": 1.2363271450075264, + "grad_norm": 19.226688805785205, + "learning_rate": 7.314382048813185e-07, + "logits/chosen": -0.15122616291046143, + "logits/rejected": 0.1265804022550583, + "logps/chosen": -2.5949044227600098, + "logps/rejected": -3.4804539680480957, + "loss": 0.5106, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.5949044227600098, + "rewards/margins": 0.8855496644973755, + "rewards/rejected": -3.4804539680480957, + "sft_loss": 2.70125412940979, + "step": 2310 + }, + { + "epoch": 1.2390031777889279, + "grad_norm": 12.066918686519287, + "learning_rate": 7.300565671333486e-07, + "logits/chosen": -0.22271844744682312, + "logits/rejected": 0.0023609772324562073, + "logps/chosen": -2.7400269508361816, + "logps/rejected": -3.5635292530059814, + "loss": 0.5176, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7400269508361816, + "rewards/margins": 0.8235026597976685, + "rewards/rejected": -3.5635292530059814, + "sft_loss": 2.8706085681915283, + "step": 2315 + }, + { + "epoch": 1.2416792105703296, + "grad_norm": 10.177395026790624, + "learning_rate": 7.286726973755554e-07, + "logits/chosen": -0.08435920625925064, + "logits/rejected": -0.04384620115160942, + "logps/chosen": -2.7082793712615967, + "logps/rejected": -3.4856247901916504, + "loss": 0.525, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7082793712615967, + "rewards/margins": 0.7773455381393433, + "rewards/rejected": -3.4856247901916504, + "sft_loss": 2.8122072219848633, + "step": 2320 + }, + { + "epoch": 1.244355243351731, + "grad_norm": 11.947349382256055, + "learning_rate": 7.272866090342493e-07, + "logits/chosen": -0.044284939765930176, + "logits/rejected": 0.03796308487653732, + "logps/chosen": -2.7693889141082764, + "logps/rejected": -3.5747649669647217, + "loss": 0.511, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7693889141082764, + "rewards/margins": 0.8053762316703796, + "rewards/rejected": -3.5747649669647217, + "sft_loss": 2.8007047176361084, + "step": 2325 + }, + { + "epoch": 1.2470312761331326, + "grad_norm": 15.120074288295065, + "learning_rate": 7.258983155572656e-07, + "logits/chosen": -0.20141425728797913, + "logits/rejected": -0.08690561354160309, + "logps/chosen": -2.728034496307373, + "logps/rejected": -3.516979932785034, + "loss": 0.56, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.728034496307373, + "rewards/margins": 0.7889455556869507, + "rewards/rejected": -3.516979932785034, + "sft_loss": 2.9466300010681152, + "step": 2330 + }, + { + "epoch": 1.2497073089145343, + "grad_norm": 12.277672817485561, + "learning_rate": 7.245078304138335e-07, + "logits/chosen": -0.12988251447677612, + "logits/rejected": -0.03518949821591377, + "logps/chosen": -2.8811938762664795, + "logps/rejected": -3.6656360626220703, + "loss": 0.542, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8811938762664795, + "rewards/margins": 0.7844420075416565, + "rewards/rejected": -3.6656360626220703, + "sft_loss": 2.9895501136779785, + "step": 2335 + }, + { + "epoch": 1.2523833416959358, + "grad_norm": 11.712712073445397, + "learning_rate": 7.231151670944462e-07, + "logits/chosen": -0.28072601556777954, + "logits/rejected": -0.0645415335893631, + "logps/chosen": -2.8859238624572754, + "logps/rejected": -3.632223606109619, + "loss": 0.5675, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.8859238624572754, + "rewards/margins": 0.7462996244430542, + "rewards/rejected": -3.632223606109619, + "sft_loss": 2.982220411300659, + "step": 2340 + }, + { + "epoch": 1.2550593744773373, + "grad_norm": 12.946250592719066, + "learning_rate": 7.217203391107291e-07, + "logits/chosen": -0.2070569097995758, + "logits/rejected": -0.017365068197250366, + "logps/chosen": -2.7785403728485107, + "logps/rejected": -3.644423246383667, + "loss": 0.5313, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.7785403728485107, + "rewards/margins": 0.8658832311630249, + "rewards/rejected": -3.644423246383667, + "sft_loss": 2.9232311248779297, + "step": 2345 + }, + { + "epoch": 1.257735407258739, + "grad_norm": 11.736503117054808, + "learning_rate": 7.203233599953096e-07, + "logits/chosen": -0.1762668788433075, + "logits/rejected": 0.003911969251930714, + "logps/chosen": -2.836501359939575, + "logps/rejected": -3.646477460861206, + "loss": 0.5416, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.836501359939575, + "rewards/margins": 0.8099767565727234, + "rewards/rejected": -3.646477460861206, + "sft_loss": 2.9586517810821533, + "step": 2350 + }, + { + "epoch": 1.2604114400401405, + "grad_norm": 15.007043656895517, + "learning_rate": 7.189242433016852e-07, + "logits/chosen": -0.17761953175067902, + "logits/rejected": -0.031355392187833786, + "logps/chosen": -2.6332767009735107, + "logps/rejected": -3.617527723312378, + "loss": 0.5132, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.6332767009735107, + "rewards/margins": 0.9842513799667358, + "rewards/rejected": -3.617527723312378, + "sft_loss": 2.784289836883545, + "step": 2355 + }, + { + "epoch": 1.263087472821542, + "grad_norm": 16.31340823948697, + "learning_rate": 7.17523002604092e-07, + "logits/chosen": -0.20015859603881836, + "logits/rejected": -0.02334914542734623, + "logps/chosen": -2.89608097076416, + "logps/rejected": -3.7705636024475098, + "loss": 0.5374, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.89608097076416, + "rewards/margins": 0.8744827508926392, + "rewards/rejected": -3.7705636024475098, + "sft_loss": 3.092453956604004, + "step": 2360 + }, + { + "epoch": 1.2657635056029437, + "grad_norm": 14.44861023226222, + "learning_rate": 7.161196514973734e-07, + "logits/chosen": -0.19069427251815796, + "logits/rejected": -0.0180917177349329, + "logps/chosen": -2.741619110107422, + "logps/rejected": -3.618983745574951, + "loss": 0.5324, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.741619110107422, + "rewards/margins": 0.877364993095398, + "rewards/rejected": -3.618983745574951, + "sft_loss": 2.8957886695861816, + "step": 2365 + }, + { + "epoch": 1.2684395383843452, + "grad_norm": 16.48183837907669, + "learning_rate": 7.147142035968483e-07, + "logits/chosen": -0.13300219178199768, + "logits/rejected": 0.04468105733394623, + "logps/chosen": -2.9447832107543945, + "logps/rejected": -3.7198548316955566, + "loss": 0.5551, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.9447832107543945, + "rewards/margins": 0.7750714421272278, + "rewards/rejected": -3.7198548316955566, + "sft_loss": 3.105386257171631, + "step": 2370 + }, + { + "epoch": 1.2711155711657467, + "grad_norm": 14.999334086996232, + "learning_rate": 7.133066725381781e-07, + "logits/chosen": -0.29770466685295105, + "logits/rejected": -0.10078287124633789, + "logps/chosen": -2.7202506065368652, + "logps/rejected": -3.578933000564575, + "loss": 0.5286, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7202506065368652, + "rewards/margins": 0.8586821556091309, + "rewards/rejected": -3.578933000564575, + "sft_loss": 2.8595337867736816, + "step": 2375 + }, + { + "epoch": 1.2737916039471484, + "grad_norm": 14.321699959576042, + "learning_rate": 7.118970719772354e-07, + "logits/chosen": -0.21056774258613586, + "logits/rejected": -0.02034085802733898, + "logps/chosen": -2.89121675491333, + "logps/rejected": -3.8506247997283936, + "loss": 0.5274, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.89121675491333, + "rewards/margins": 0.9594081044197083, + "rewards/rejected": -3.8506247997283936, + "sft_loss": 3.043997287750244, + "step": 2380 + }, + { + "epoch": 1.27646763672855, + "grad_norm": 13.887602835497312, + "learning_rate": 7.104854155899711e-07, + "logits/chosen": -0.13619615137577057, + "logits/rejected": 0.0005922317504882812, + "logps/chosen": -2.908087968826294, + "logps/rejected": -3.741673707962036, + "loss": 0.5438, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.908087968826294, + "rewards/margins": 0.8335857391357422, + "rewards/rejected": -3.741673707962036, + "sft_loss": 3.002686023712158, + "step": 2385 + }, + { + "epoch": 1.2791436695099514, + "grad_norm": 16.49687038495756, + "learning_rate": 7.090717170722817e-07, + "logits/chosen": -0.12130200862884521, + "logits/rejected": -0.04878392815589905, + "logps/chosen": -2.851966381072998, + "logps/rejected": -3.7922961711883545, + "loss": 0.4991, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.851966381072998, + "rewards/margins": 0.9403297305107117, + "rewards/rejected": -3.7922961711883545, + "sft_loss": 2.9816746711730957, + "step": 2390 + }, + { + "epoch": 1.2818197022913531, + "grad_norm": 16.483275366685195, + "learning_rate": 7.076559901398762e-07, + "logits/chosen": -0.2887728214263916, + "logits/rejected": -0.1400589495897293, + "logps/chosen": -2.6632139682769775, + "logps/rejected": -3.386286973953247, + "loss": 0.5552, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.6632139682769775, + "rewards/margins": 0.7230730056762695, + "rewards/rejected": -3.386286973953247, + "sft_loss": 2.812216281890869, + "step": 2395 + }, + { + "epoch": 1.2844957350727546, + "grad_norm": 23.26258200474679, + "learning_rate": 7.062382485281436e-07, + "logits/chosen": -0.2058527171611786, + "logits/rejected": -0.06673066318035126, + "logps/chosen": -2.7234292030334473, + "logps/rejected": -3.306281328201294, + "loss": 0.5886, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.7234292030334473, + "rewards/margins": 0.5828520059585571, + "rewards/rejected": -3.306281328201294, + "sft_loss": 2.842212200164795, + "step": 2400 + }, + { + "epoch": 1.2844957350727546, + "eval_logits/chosen": 0.13509926199913025, + "eval_logits/rejected": 0.24084508419036865, + "eval_logps/chosen": -2.800790786743164, + "eval_logps/rejected": -3.598574161529541, + "eval_loss": 0.5600130558013916, + "eval_rewards/accuracies": 0.7292284965515137, + "eval_rewards/chosen": -2.800790786743164, + "eval_rewards/margins": 0.7977828979492188, + "eval_rewards/rejected": -3.598574161529541, + "eval_runtime": 49.8522, + "eval_samples_per_second": 26.98, + "eval_sft_loss": 2.9405603408813477, + "eval_steps_per_second": 6.76, + "step": 2400 + }, + { + "epoch": 1.287171767854156, + "grad_norm": 9.178747966280907, + "learning_rate": 7.048185059920193e-07, + "logits/chosen": -0.19156606495380402, + "logits/rejected": -0.04732166603207588, + "logps/chosen": -2.7235498428344727, + "logps/rejected": -3.664428234100342, + "loss": 0.5177, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7235498428344727, + "rewards/margins": 0.9408785700798035, + "rewards/rejected": -3.664428234100342, + "sft_loss": 2.8657774925231934, + "step": 2405 + }, + { + "epoch": 1.2898478006355578, + "grad_norm": 14.64997920224963, + "learning_rate": 7.033967763058516e-07, + "logits/chosen": -0.2830374240875244, + "logits/rejected": -0.08339240401983261, + "logps/chosen": -2.740020990371704, + "logps/rejected": -3.4012451171875, + "loss": 0.5545, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.740020990371704, + "rewards/margins": 0.6612240076065063, + "rewards/rejected": -3.4012451171875, + "sft_loss": 2.873657703399658, + "step": 2410 + }, + { + "epoch": 1.2925238334169593, + "grad_norm": 11.524358523068935, + "learning_rate": 7.019730732632681e-07, + "logits/chosen": -0.12432912737131119, + "logits/rejected": -0.029606735333800316, + "logps/chosen": -2.6872119903564453, + "logps/rejected": -3.5602688789367676, + "loss": 0.5376, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.6872119903564453, + "rewards/margins": 0.8730567097663879, + "rewards/rejected": -3.5602688789367676, + "sft_loss": 2.837043285369873, + "step": 2415 + }, + { + "epoch": 1.2951998661983608, + "grad_norm": 11.142509944407367, + "learning_rate": 7.005474106770418e-07, + "logits/chosen": -0.23667557537555695, + "logits/rejected": -0.10876540839672089, + "logps/chosen": -2.668792724609375, + "logps/rejected": -3.506338119506836, + "loss": 0.5018, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.668792724609375, + "rewards/margins": 0.8375449180603027, + "rewards/rejected": -3.506338119506836, + "sft_loss": 2.8608899116516113, + "step": 2420 + }, + { + "epoch": 1.2978758989797625, + "grad_norm": 10.733047141858474, + "learning_rate": 6.991198023789577e-07, + "logits/chosen": -0.17369894683361053, + "logits/rejected": -0.07994101941585541, + "logps/chosen": -2.5164730548858643, + "logps/rejected": -3.185642957687378, + "loss": 0.5352, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.5164730548858643, + "rewards/margins": 0.6691699624061584, + "rewards/rejected": -3.185642957687378, + "sft_loss": 2.750406503677368, + "step": 2425 + }, + { + "epoch": 1.300551931761164, + "grad_norm": 15.58574746404078, + "learning_rate": 6.976902622196776e-07, + "logits/chosen": -0.1767248660326004, + "logits/rejected": -0.10301689803600311, + "logps/chosen": -2.7023122310638428, + "logps/rejected": -3.4687092304229736, + "loss": 0.554, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7023122310638428, + "rewards/margins": 0.7663971185684204, + "rewards/rejected": -3.4687092304229736, + "sft_loss": 2.8399150371551514, + "step": 2430 + }, + { + "epoch": 1.3032279645425655, + "grad_norm": 11.42669165967883, + "learning_rate": 6.962588040686064e-07, + "logits/chosen": -0.15778064727783203, + "logits/rejected": -0.0029870569705963135, + "logps/chosen": -2.656895160675049, + "logps/rejected": -3.3034110069274902, + "loss": 0.5897, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.656895160675049, + "rewards/margins": 0.6465158462524414, + "rewards/rejected": -3.3034110069274902, + "sft_loss": 2.8100223541259766, + "step": 2435 + }, + { + "epoch": 1.3059039973239672, + "grad_norm": 14.465073990971277, + "learning_rate": 6.948254418137573e-07, + "logits/chosen": -0.21933957934379578, + "logits/rejected": -0.08447906374931335, + "logps/chosen": -2.6796069145202637, + "logps/rejected": -3.46742582321167, + "loss": 0.5622, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.6796069145202637, + "rewards/margins": 0.7878190875053406, + "rewards/rejected": -3.46742582321167, + "sft_loss": 2.7700302600860596, + "step": 2440 + }, + { + "epoch": 1.3085800301053687, + "grad_norm": 19.255952750159327, + "learning_rate": 6.933901893616174e-07, + "logits/chosen": -0.2369045466184616, + "logits/rejected": -0.08217814564704895, + "logps/chosen": -2.7740702629089355, + "logps/rejected": -3.428654909133911, + "loss": 0.5859, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.7740702629089355, + "rewards/margins": 0.654584527015686, + "rewards/rejected": -3.428654909133911, + "sft_loss": 2.906905174255371, + "step": 2445 + }, + { + "epoch": 1.3112560628867704, + "grad_norm": 16.646039882943093, + "learning_rate": 6.919530606370121e-07, + "logits/chosen": -0.16528551280498505, + "logits/rejected": -0.008354656398296356, + "logps/chosen": -2.638530969619751, + "logps/rejected": -3.4399943351745605, + "loss": 0.5305, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.638530969619751, + "rewards/margins": 0.8014636039733887, + "rewards/rejected": -3.4399943351745605, + "sft_loss": 2.756185293197632, + "step": 2450 + }, + { + "epoch": 1.313932095668172, + "grad_norm": 11.067511070480855, + "learning_rate": 6.905140695829706e-07, + "logits/chosen": -0.24355682730674744, + "logits/rejected": 0.05653177946805954, + "logps/chosen": -2.8080837726593018, + "logps/rejected": -3.5968105792999268, + "loss": 0.5213, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.8080837726593018, + "rewards/margins": 0.7887266874313354, + "rewards/rejected": -3.5968105792999268, + "sft_loss": 2.9028820991516113, + "step": 2455 + }, + { + "epoch": 1.3166081284495736, + "grad_norm": 18.68045619931817, + "learning_rate": 6.890732301605904e-07, + "logits/chosen": -0.16562731564044952, + "logits/rejected": -0.04927445575594902, + "logps/chosen": -2.7073352336883545, + "logps/rejected": -3.3803048133850098, + "loss": 0.5804, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7073352336883545, + "rewards/margins": 0.6729689836502075, + "rewards/rejected": -3.3803048133850098, + "sft_loss": 2.819136381149292, + "step": 2460 + }, + { + "epoch": 1.3192841612309751, + "grad_norm": 11.705893401506131, + "learning_rate": 6.876305563489021e-07, + "logits/chosen": -0.2004169523715973, + "logits/rejected": -0.08777768909931183, + "logps/chosen": -2.8763434886932373, + "logps/rejected": -3.8222098350524902, + "loss": 0.4966, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.8763434886932373, + "rewards/margins": 0.945866584777832, + "rewards/rejected": -3.8222098350524902, + "sft_loss": 2.9446804523468018, + "step": 2465 + }, + { + "epoch": 1.3219601940123766, + "grad_norm": 18.213731947297997, + "learning_rate": 6.861860621447331e-07, + "logits/chosen": -0.3117894232273102, + "logits/rejected": -0.1837528645992279, + "logps/chosen": -2.9130516052246094, + "logps/rejected": -3.5416507720947266, + "loss": 0.6002, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.9130516052246094, + "rewards/margins": 0.6285988092422485, + "rewards/rejected": -3.5416507720947266, + "sft_loss": 3.135199785232544, + "step": 2470 + }, + { + "epoch": 1.3246362267937783, + "grad_norm": 15.827545785560098, + "learning_rate": 6.847397615625725e-07, + "logits/chosen": -0.16902324557304382, + "logits/rejected": -0.10092641413211823, + "logps/chosen": -2.875767469406128, + "logps/rejected": -3.5667643547058105, + "loss": 0.558, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.875767469406128, + "rewards/margins": 0.6909972429275513, + "rewards/rejected": -3.5667643547058105, + "sft_loss": 3.042696237564087, + "step": 2475 + }, + { + "epoch": 1.3273122595751798, + "grad_norm": 10.677099643746306, + "learning_rate": 6.83291668634435e-07, + "logits/chosen": -0.30803683400154114, + "logits/rejected": -0.10283231735229492, + "logps/chosen": -2.822262763977051, + "logps/rejected": -3.7773125171661377, + "loss": 0.4987, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.822262763977051, + "rewards/margins": 0.9550496339797974, + "rewards/rejected": -3.7773125171661377, + "sft_loss": 3.107553005218506, + "step": 2480 + }, + { + "epoch": 1.3299882923565813, + "grad_norm": 13.727165695334431, + "learning_rate": 6.818417974097246e-07, + "logits/chosen": -0.09935744106769562, + "logits/rejected": 0.07015521824359894, + "logps/chosen": -2.6926915645599365, + "logps/rejected": -3.6834826469421387, + "loss": 0.5068, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6926915645599365, + "rewards/margins": 0.9907909631729126, + "rewards/rejected": -3.6834826469421387, + "sft_loss": 2.978348731994629, + "step": 2485 + }, + { + "epoch": 1.332664325137983, + "grad_norm": 12.6517428480068, + "learning_rate": 6.803901619550981e-07, + "logits/chosen": -0.22434866428375244, + "logits/rejected": -0.1461108922958374, + "logps/chosen": -2.745539665222168, + "logps/rejected": -3.575504779815674, + "loss": 0.5058, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.745539665222168, + "rewards/margins": 0.8299651145935059, + "rewards/rejected": -3.575504779815674, + "sft_loss": 2.93871808052063, + "step": 2490 + }, + { + "epoch": 1.3353403579193845, + "grad_norm": 14.644970152963, + "learning_rate": 6.789367763543292e-07, + "logits/chosen": -0.1396378129720688, + "logits/rejected": -0.10330124944448471, + "logps/chosen": -2.7004261016845703, + "logps/rejected": -3.378009796142578, + "loss": 0.5875, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7004261016845703, + "rewards/margins": 0.6775835752487183, + "rewards/rejected": -3.378009796142578, + "sft_loss": 2.8961141109466553, + "step": 2495 + }, + { + "epoch": 1.338016390700786, + "grad_norm": 12.36729496599677, + "learning_rate": 6.774816547081714e-07, + "logits/chosen": -0.14708910882472992, + "logits/rejected": 0.03477005660533905, + "logps/chosen": -2.6921839714050293, + "logps/rejected": -3.314579725265503, + "loss": 0.5643, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6921839714050293, + "rewards/margins": 0.6223957538604736, + "rewards/rejected": -3.314579725265503, + "sft_loss": 2.88639235496521, + "step": 2500 + }, + { + "epoch": 1.3406924234821878, + "grad_norm": 12.37532525890674, + "learning_rate": 6.760248111342211e-07, + "logits/chosen": -0.156544491648674, + "logits/rejected": 0.026922767981886864, + "logps/chosen": -2.521768808364868, + "logps/rejected": -3.3125717639923096, + "loss": 0.5289, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.521768808364868, + "rewards/margins": 0.7908032536506653, + "rewards/rejected": -3.3125717639923096, + "sft_loss": 2.6430845260620117, + "step": 2505 + }, + { + "epoch": 1.3433684562635893, + "grad_norm": 14.180483053003758, + "learning_rate": 6.745662597667813e-07, + "logits/chosen": -0.20356695353984833, + "logits/rejected": -0.047584448009729385, + "logps/chosen": -2.5231261253356934, + "logps/rejected": -3.316211700439453, + "loss": 0.5097, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5231261253356934, + "rewards/margins": 0.7930856347084045, + "rewards/rejected": -3.316211700439453, + "sft_loss": 2.685520648956299, + "step": 2510 + }, + { + "epoch": 1.3460444890449907, + "grad_norm": 17.337359619768694, + "learning_rate": 6.731060147567236e-07, + "logits/chosen": -0.10451420396566391, + "logits/rejected": 0.00366055965423584, + "logps/chosen": -2.52323842048645, + "logps/rejected": -3.3734488487243652, + "loss": 0.5063, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.52323842048645, + "rewards/margins": 0.8502107858657837, + "rewards/rejected": -3.3734488487243652, + "sft_loss": 2.73701810836792, + "step": 2515 + }, + { + "epoch": 1.3487205218263925, + "grad_norm": 14.455186650463212, + "learning_rate": 6.716440902713515e-07, + "logits/chosen": -0.22164861857891083, + "logits/rejected": -0.136245459318161, + "logps/chosen": -2.6677823066711426, + "logps/rejected": -3.357440233230591, + "loss": 0.5296, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.6677823066711426, + "rewards/margins": 0.6896578073501587, + "rewards/rejected": -3.357440233230591, + "sft_loss": 2.6678340435028076, + "step": 2520 + }, + { + "epoch": 1.351396554607794, + "grad_norm": 17.826150311848068, + "learning_rate": 6.701805004942627e-07, + "logits/chosen": -0.21637877821922302, + "logits/rejected": -0.12764832377433777, + "logps/chosen": -2.770840883255005, + "logps/rejected": -3.4959206581115723, + "loss": 0.5591, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.770840883255005, + "rewards/margins": 0.7250800132751465, + "rewards/rejected": -3.4959206581115723, + "sft_loss": 2.990640163421631, + "step": 2525 + }, + { + "epoch": 1.3540725873891954, + "grad_norm": 23.893730691241025, + "learning_rate": 6.687152596252119e-07, + "logits/chosen": -0.18899592757225037, + "logits/rejected": -0.11210862547159195, + "logps/chosen": -2.7489566802978516, + "logps/rejected": -3.3823254108428955, + "loss": 0.6187, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.7489566802978516, + "rewards/margins": 0.6333683133125305, + "rewards/rejected": -3.3823254108428955, + "sft_loss": 2.92044734954834, + "step": 2530 + }, + { + "epoch": 1.3567486201705972, + "grad_norm": 13.561641964424144, + "learning_rate": 6.672483818799722e-07, + "logits/chosen": -0.2757723331451416, + "logits/rejected": -0.11094491183757782, + "logps/chosen": -2.680462598800659, + "logps/rejected": -3.4301047325134277, + "loss": 0.5435, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.680462598800659, + "rewards/margins": 0.7496423721313477, + "rewards/rejected": -3.4301047325134277, + "sft_loss": 2.8209385871887207, + "step": 2535 + }, + { + "epoch": 1.3594246529519987, + "grad_norm": 15.414372943397188, + "learning_rate": 6.657798814901978e-07, + "logits/chosen": -0.20649929344654083, + "logits/rejected": -0.016391444951295853, + "logps/chosen": -2.79091215133667, + "logps/rejected": -3.4462196826934814, + "loss": 0.5771, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.79091215133667, + "rewards/margins": 0.6553074717521667, + "rewards/rejected": -3.4462196826934814, + "sft_loss": 2.9573159217834473, + "step": 2540 + }, + { + "epoch": 1.3621006857334002, + "grad_norm": 13.30450273991112, + "learning_rate": 6.643097727032863e-07, + "logits/chosen": -0.20889747142791748, + "logits/rejected": 0.004860124550759792, + "logps/chosen": -2.6875839233398438, + "logps/rejected": -3.52405047416687, + "loss": 0.5065, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6875839233398438, + "rewards/margins": 0.8364666700363159, + "rewards/rejected": -3.52405047416687, + "sft_loss": 2.816736936569214, + "step": 2545 + }, + { + "epoch": 1.3647767185148019, + "grad_norm": 13.620162752755839, + "learning_rate": 6.628380697822392e-07, + "logits/chosen": -0.1965445727109909, + "logits/rejected": -0.016790464520454407, + "logps/chosen": -2.808445453643799, + "logps/rejected": -3.509186267852783, + "loss": 0.5488, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.808445453643799, + "rewards/margins": 0.7007406949996948, + "rewards/rejected": -3.509186267852783, + "sft_loss": 2.945175886154175, + "step": 2550 + }, + { + "epoch": 1.3674527512962034, + "grad_norm": 15.913301539563097, + "learning_rate": 6.61364787005525e-07, + "logits/chosen": -0.14413128793239594, + "logits/rejected": -0.00657269824296236, + "logps/chosen": -2.6809163093566895, + "logps/rejected": -3.610265016555786, + "loss": 0.5396, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.6809163093566895, + "rewards/margins": 0.9293490648269653, + "rewards/rejected": -3.610265016555786, + "sft_loss": 2.8856360912323, + "step": 2555 + }, + { + "epoch": 1.3701287840776049, + "grad_norm": 18.466727374987098, + "learning_rate": 6.598899386669395e-07, + "logits/chosen": -0.1730526238679886, + "logits/rejected": -0.04170035570859909, + "logps/chosen": -2.8152554035186768, + "logps/rejected": -3.526615619659424, + "loss": 0.5729, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.8152554035186768, + "rewards/margins": 0.7113600969314575, + "rewards/rejected": -3.526615619659424, + "sft_loss": 2.9285664558410645, + "step": 2560 + }, + { + "epoch": 1.3728048168590066, + "grad_norm": 33.35905874085635, + "learning_rate": 6.584135390754679e-07, + "logits/chosen": -0.15972650051116943, + "logits/rejected": -0.0013029143447056413, + "logps/chosen": -2.736224412918091, + "logps/rejected": -3.581981658935547, + "loss": 0.5385, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.736224412918091, + "rewards/margins": 0.8457571864128113, + "rewards/rejected": -3.581981658935547, + "sft_loss": 2.8889448642730713, + "step": 2565 + }, + { + "epoch": 1.375480849640408, + "grad_norm": 10.512825087959222, + "learning_rate": 6.569356025551454e-07, + "logits/chosen": -0.133263498544693, + "logits/rejected": -0.04440528526902199, + "logps/chosen": -2.628513813018799, + "logps/rejected": -3.4317822456359863, + "loss": 0.5477, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.628513813018799, + "rewards/margins": 0.803268313407898, + "rewards/rejected": -3.4317822456359863, + "sft_loss": 2.7247188091278076, + "step": 2570 + }, + { + "epoch": 1.3781568824218096, + "grad_norm": 12.64341290727834, + "learning_rate": 6.554561434449186e-07, + "logits/chosen": -0.23653188347816467, + "logits/rejected": -0.06919825822114944, + "logps/chosen": -2.5829663276672363, + "logps/rejected": -3.34773325920105, + "loss": 0.5467, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.5829663276672363, + "rewards/margins": 0.7647669911384583, + "rewards/rejected": -3.34773325920105, + "sft_loss": 2.7210841178894043, + "step": 2575 + }, + { + "epoch": 1.3808329152032113, + "grad_norm": 17.65401632433239, + "learning_rate": 6.539751760985063e-07, + "logits/chosen": -0.17679783701896667, + "logits/rejected": -0.06925829499959946, + "logps/chosen": -2.688565492630005, + "logps/rejected": -3.24664568901062, + "loss": 0.5949, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.688565492630005, + "rewards/margins": 0.55808025598526, + "rewards/rejected": -3.24664568901062, + "sft_loss": 2.872016191482544, + "step": 2580 + }, + { + "epoch": 1.3835089479846128, + "grad_norm": 11.379800330719814, + "learning_rate": 6.524927148842602e-07, + "logits/chosen": -0.08219718188047409, + "logits/rejected": 0.09790127724409103, + "logps/chosen": -2.4927101135253906, + "logps/rejected": -3.3262767791748047, + "loss": 0.4998, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.4927101135253906, + "rewards/margins": 0.8335663676261902, + "rewards/rejected": -3.3262767791748047, + "sft_loss": 2.616259813308716, + "step": 2585 + }, + { + "epoch": 1.3861849807660143, + "grad_norm": 17.147163336849538, + "learning_rate": 6.510087741850254e-07, + "logits/chosen": -0.17891012132167816, + "logits/rejected": -0.036122821271419525, + "logps/chosen": -2.4723801612854004, + "logps/rejected": -3.185424327850342, + "loss": 0.5609, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.4723801612854004, + "rewards/margins": 0.7130442261695862, + "rewards/rejected": -3.185424327850342, + "sft_loss": 2.7231812477111816, + "step": 2590 + }, + { + "epoch": 1.388861013547416, + "grad_norm": 16.501352248032468, + "learning_rate": 6.495233683980012e-07, + "logits/chosen": -0.16941580176353455, + "logits/rejected": -0.11211607605218887, + "logps/chosen": -2.6303224563598633, + "logps/rejected": -3.272925853729248, + "loss": 0.5667, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.6303224563598633, + "rewards/margins": 0.6426035165786743, + "rewards/rejected": -3.272925853729248, + "sft_loss": 2.7140302658081055, + "step": 2595 + }, + { + "epoch": 1.3915370463288175, + "grad_norm": 17.074771053628144, + "learning_rate": 6.480365119346011e-07, + "logits/chosen": -0.07847137004137039, + "logits/rejected": 0.0759764313697815, + "logps/chosen": -2.61045503616333, + "logps/rejected": -3.3174538612365723, + "loss": 0.5387, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.61045503616333, + "rewards/margins": 0.7069988250732422, + "rewards/rejected": -3.3174538612365723, + "sft_loss": 2.783203125, + "step": 2600 + }, + { + "epoch": 1.394213079110219, + "grad_norm": 13.390652917894553, + "learning_rate": 6.465482192203129e-07, + "logits/chosen": -0.06304212659597397, + "logits/rejected": 0.017494995146989822, + "logps/chosen": -2.6455399990081787, + "logps/rejected": -3.3666281700134277, + "loss": 0.5324, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.6455399990081787, + "rewards/margins": 0.721088171005249, + "rewards/rejected": -3.3666281700134277, + "sft_loss": 2.9030449390411377, + "step": 2605 + }, + { + "epoch": 1.3968891118916207, + "grad_norm": 23.949889372898472, + "learning_rate": 6.45058504694559e-07, + "logits/chosen": -0.0719418078660965, + "logits/rejected": -0.009783153422176838, + "logps/chosen": -2.728297472000122, + "logps/rejected": -3.488001585006714, + "loss": 0.5536, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.728297472000122, + "rewards/margins": 0.7597039341926575, + "rewards/rejected": -3.488001585006714, + "sft_loss": 2.902074098587036, + "step": 2610 + }, + { + "epoch": 1.3995651446730222, + "grad_norm": 18.518272311030678, + "learning_rate": 6.435673828105564e-07, + "logits/chosen": -0.17964942753314972, + "logits/rejected": -0.025340866297483444, + "logps/chosen": -2.68434739112854, + "logps/rejected": -3.5610384941101074, + "loss": 0.5317, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.68434739112854, + "rewards/margins": 0.8766916394233704, + "rewards/rejected": -3.5610384941101074, + "sft_loss": 2.9283835887908936, + "step": 2615 + }, + { + "epoch": 1.402241177454424, + "grad_norm": 14.34158145926517, + "learning_rate": 6.420748680351763e-07, + "logits/chosen": -0.12653285264968872, + "logits/rejected": -0.15354570746421814, + "logps/chosen": -2.7683181762695312, + "logps/rejected": -3.3777289390563965, + "loss": 0.5983, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.7683181762695312, + "rewards/margins": 0.6094110012054443, + "rewards/rejected": -3.3777289390563965, + "sft_loss": 3.0164825916290283, + "step": 2620 + }, + { + "epoch": 1.4049172102358254, + "grad_norm": 20.30195335913346, + "learning_rate": 6.405809748488032e-07, + "logits/chosen": -0.18265673518180847, + "logits/rejected": -0.015449044294655323, + "logps/chosen": -2.7602028846740723, + "logps/rejected": -3.659242630004883, + "loss": 0.5537, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7602028846740723, + "rewards/margins": 0.8990398645401001, + "rewards/rejected": -3.659242630004883, + "sft_loss": 2.8773014545440674, + "step": 2625 + }, + { + "epoch": 1.4075932430172269, + "grad_norm": 13.652805417608606, + "learning_rate": 6.390857177451956e-07, + "logits/chosen": -0.27530530095100403, + "logits/rejected": -0.07264344394207001, + "logps/chosen": -2.845526933670044, + "logps/rejected": -3.6497280597686768, + "loss": 0.5391, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.845526933670044, + "rewards/margins": 0.8042010068893433, + "rewards/rejected": -3.6497280597686768, + "sft_loss": 2.9927849769592285, + "step": 2630 + }, + { + "epoch": 1.4102692757986286, + "grad_norm": 15.842694719104523, + "learning_rate": 6.375891112313445e-07, + "logits/chosen": -0.2530178427696228, + "logits/rejected": -0.13777250051498413, + "logps/chosen": -2.893038272857666, + "logps/rejected": -3.697848081588745, + "loss": 0.5376, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.893038272857666, + "rewards/margins": 0.8048097491264343, + "rewards/rejected": -3.697848081588745, + "sft_loss": 3.057915210723877, + "step": 2635 + }, + { + "epoch": 1.41294530858003, + "grad_norm": 14.544232181041682, + "learning_rate": 6.360911698273326e-07, + "logits/chosen": -0.19055934250354767, + "logits/rejected": -0.0527595654129982, + "logps/chosen": -2.9984183311462402, + "logps/rejected": -3.722759962081909, + "loss": 0.5649, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9984183311462402, + "rewards/margins": 0.7243413329124451, + "rewards/rejected": -3.722759962081909, + "sft_loss": 3.1255040168762207, + "step": 2640 + }, + { + "epoch": 1.4156213413614318, + "grad_norm": 16.115021801415836, + "learning_rate": 6.345919080661944e-07, + "logits/chosen": -0.18890123069286346, + "logits/rejected": -0.09562461078166962, + "logps/chosen": -2.700990676879883, + "logps/rejected": -3.611769199371338, + "loss": 0.5006, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.700990676879883, + "rewards/margins": 0.9107785224914551, + "rewards/rejected": -3.611769199371338, + "sft_loss": 2.8132376670837402, + "step": 2645 + }, + { + "epoch": 1.4182973741428333, + "grad_norm": 14.469293489368207, + "learning_rate": 6.330913404937737e-07, + "logits/chosen": -0.2293945848941803, + "logits/rejected": -0.07062532007694244, + "logps/chosen": -2.8479835987091064, + "logps/rejected": -3.846541166305542, + "loss": 0.5125, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8479835987091064, + "rewards/margins": 0.9985576868057251, + "rewards/rejected": -3.846541166305542, + "sft_loss": 2.973374605178833, + "step": 2650 + }, + { + "epoch": 1.4209734069242348, + "grad_norm": 17.29314917200767, + "learning_rate": 6.315894816685838e-07, + "logits/chosen": -0.18610408902168274, + "logits/rejected": -0.009005597792565823, + "logps/chosen": -2.818213939666748, + "logps/rejected": -3.674471378326416, + "loss": 0.4958, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.818213939666748, + "rewards/margins": 0.856257438659668, + "rewards/rejected": -3.674471378326416, + "sft_loss": 3.050729274749756, + "step": 2655 + }, + { + "epoch": 1.4236494397056365, + "grad_norm": 15.722883885277431, + "learning_rate": 6.300863461616657e-07, + "logits/chosen": -0.14516502618789673, + "logits/rejected": -0.044414542615413666, + "logps/chosen": -2.8328747749328613, + "logps/rejected": -3.519896984100342, + "loss": 0.6077, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8328747749328613, + "rewards/margins": 0.6870219111442566, + "rewards/rejected": -3.519896984100342, + "sft_loss": 3.001101493835449, + "step": 2660 + }, + { + "epoch": 1.426325472487038, + "grad_norm": 12.264431130751412, + "learning_rate": 6.285819485564465e-07, + "logits/chosen": -0.2737159729003906, + "logits/rejected": -0.10480846464633942, + "logps/chosen": -2.8148722648620605, + "logps/rejected": -3.628904342651367, + "loss": 0.5078, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8148722648620605, + "rewards/margins": 0.8140321969985962, + "rewards/rejected": -3.628904342651367, + "sft_loss": 2.98403000831604, + "step": 2665 + }, + { + "epoch": 1.4290015052684395, + "grad_norm": 16.23830385932747, + "learning_rate": 6.270763034485986e-07, + "logits/chosen": -0.11874481290578842, + "logits/rejected": 2.7514994144439697e-05, + "logps/chosen": -2.9035863876342773, + "logps/rejected": -3.6935629844665527, + "loss": 0.5183, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.9035863876342773, + "rewards/margins": 0.7899765968322754, + "rewards/rejected": -3.6935629844665527, + "sft_loss": 2.9855895042419434, + "step": 2670 + }, + { + "epoch": 1.4316775380498412, + "grad_norm": 20.915588392873623, + "learning_rate": 6.255694254458972e-07, + "logits/chosen": -0.17837285995483398, + "logits/rejected": -0.0020327470265328884, + "logps/chosen": -2.8230979442596436, + "logps/rejected": -3.5790398120880127, + "loss": 0.5848, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.8230979442596436, + "rewards/margins": 0.7559415102005005, + "rewards/rejected": -3.5790398120880127, + "sft_loss": 2.8579752445220947, + "step": 2675 + }, + { + "epoch": 1.4343535708312427, + "grad_norm": 21.16032431937469, + "learning_rate": 6.240613291680795e-07, + "logits/chosen": -0.21279004216194153, + "logits/rejected": -0.012899084016680717, + "logps/chosen": -2.6492888927459717, + "logps/rejected": -3.3431143760681152, + "loss": 0.5916, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.6492888927459717, + "rewards/margins": 0.693824827671051, + "rewards/rejected": -3.3431143760681152, + "sft_loss": 2.761671543121338, + "step": 2680 + }, + { + "epoch": 1.4370296036126442, + "grad_norm": 12.487601525148522, + "learning_rate": 6.225520292467021e-07, + "logits/chosen": -0.2572769522666931, + "logits/rejected": -0.003753144294023514, + "logps/chosen": -2.539226770401001, + "logps/rejected": -3.389569044113159, + "loss": 0.4963, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.539226770401001, + "rewards/margins": 0.8503425717353821, + "rewards/rejected": -3.389569044113159, + "sft_loss": 2.672743320465088, + "step": 2685 + }, + { + "epoch": 1.439705636394046, + "grad_norm": 22.048819759200136, + "learning_rate": 6.210415403249993e-07, + "logits/chosen": -0.3438642919063568, + "logits/rejected": -0.05838317796587944, + "logps/chosen": -2.6405534744262695, + "logps/rejected": -3.430321216583252, + "loss": 0.5653, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.6405534744262695, + "rewards/margins": 0.7897677421569824, + "rewards/rejected": -3.430321216583252, + "sft_loss": 2.7079286575317383, + "step": 2690 + }, + { + "epoch": 1.4423816691754474, + "grad_norm": 17.818430875235002, + "learning_rate": 6.195298770577415e-07, + "logits/chosen": -0.1408831775188446, + "logits/rejected": -0.12864689528942108, + "logps/chosen": -2.626361131668091, + "logps/rejected": -3.380760908126831, + "loss": 0.5562, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.626361131668091, + "rewards/margins": 0.7543995380401611, + "rewards/rejected": -3.380760908126831, + "sft_loss": 2.6994235515594482, + "step": 2695 + }, + { + "epoch": 1.445057701956849, + "grad_norm": 11.370843110523085, + "learning_rate": 6.180170541110923e-07, + "logits/chosen": -0.23384025692939758, + "logits/rejected": -0.03425057977437973, + "logps/chosen": -2.668400526046753, + "logps/rejected": -3.441761016845703, + "loss": 0.5463, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.668400526046753, + "rewards/margins": 0.7733603715896606, + "rewards/rejected": -3.441761016845703, + "sft_loss": 2.858882427215576, + "step": 2700 + }, + { + "epoch": 1.4477337347382506, + "grad_norm": 11.361594017102297, + "learning_rate": 6.165030861624663e-07, + "logits/chosen": -0.3020915389060974, + "logits/rejected": -0.04475082457065582, + "logps/chosen": -2.5850963592529297, + "logps/rejected": -3.5558242797851562, + "loss": 0.4789, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.5850963592529297, + "rewards/margins": 0.9707280993461609, + "rewards/rejected": -3.5558242797851562, + "sft_loss": 2.647453784942627, + "step": 2705 + }, + { + "epoch": 1.4504097675196521, + "grad_norm": 16.554445963777106, + "learning_rate": 6.149879879003876e-07, + "logits/chosen": -0.14629390835762024, + "logits/rejected": -0.12839782238006592, + "logps/chosen": -2.6853654384613037, + "logps/rejected": -3.4737114906311035, + "loss": 0.5241, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.6853654384613037, + "rewards/margins": 0.7883461117744446, + "rewards/rejected": -3.4737114906311035, + "sft_loss": 2.7871718406677246, + "step": 2710 + }, + { + "epoch": 1.4530858003010536, + "grad_norm": 10.852800163434761, + "learning_rate": 6.13471774024346e-07, + "logits/chosen": -0.3211224675178528, + "logits/rejected": -0.1885753720998764, + "logps/chosen": -2.6056792736053467, + "logps/rejected": -3.3587448596954346, + "loss": 0.5246, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.6056792736053467, + "rewards/margins": 0.753065288066864, + "rewards/rejected": -3.3587448596954346, + "sft_loss": 2.79439115524292, + "step": 2715 + }, + { + "epoch": 1.4557618330824553, + "grad_norm": 13.470218832823608, + "learning_rate": 6.119544592446551e-07, + "logits/chosen": -0.26647713780403137, + "logits/rejected": -0.12850052118301392, + "logps/chosen": -2.6721959114074707, + "logps/rejected": -3.3363571166992188, + "loss": 0.5514, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.6721959114074707, + "rewards/margins": 0.6641608476638794, + "rewards/rejected": -3.3363571166992188, + "sft_loss": 2.7512269020080566, + "step": 2720 + }, + { + "epoch": 1.4584378658638568, + "grad_norm": 16.435840602476095, + "learning_rate": 6.104360582823096e-07, + "logits/chosen": -0.21397753059864044, + "logits/rejected": -0.10246507823467255, + "logps/chosen": -2.6906704902648926, + "logps/rejected": -3.434344530105591, + "loss": 0.5419, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.6906704902648926, + "rewards/margins": 0.7436736226081848, + "rewards/rejected": -3.434344530105591, + "sft_loss": 2.8569579124450684, + "step": 2725 + }, + { + "epoch": 1.4611138986452583, + "grad_norm": 18.18937708660994, + "learning_rate": 6.089165858688423e-07, + "logits/chosen": -0.21393327414989471, + "logits/rejected": -0.010212997905910015, + "logps/chosen": -2.658050537109375, + "logps/rejected": -3.51568603515625, + "loss": 0.543, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.658050537109375, + "rewards/margins": 0.857635498046875, + "rewards/rejected": -3.51568603515625, + "sft_loss": 2.8359665870666504, + "step": 2730 + }, + { + "epoch": 1.46378993142666, + "grad_norm": 10.561874113057621, + "learning_rate": 6.073960567461811e-07, + "logits/chosen": -0.21657970547676086, + "logits/rejected": 0.0002588302013464272, + "logps/chosen": -2.510268211364746, + "logps/rejected": -3.424483060836792, + "loss": 0.481, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.510268211364746, + "rewards/margins": 0.9142149686813354, + "rewards/rejected": -3.424483060836792, + "sft_loss": 2.7040927410125732, + "step": 2735 + }, + { + "epoch": 1.4664659642080615, + "grad_norm": 12.939507861708492, + "learning_rate": 6.058744856665065e-07, + "logits/chosen": -0.24379563331604004, + "logits/rejected": -0.11864738166332245, + "logps/chosen": -2.693303346633911, + "logps/rejected": -3.5887484550476074, + "loss": 0.5047, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.693303346633911, + "rewards/margins": 0.8954454660415649, + "rewards/rejected": -3.5887484550476074, + "sft_loss": 2.855567693710327, + "step": 2740 + }, + { + "epoch": 1.469141996989463, + "grad_norm": 13.038429418395213, + "learning_rate": 6.043518873921074e-07, + "logits/chosen": -0.2585260272026062, + "logits/rejected": -0.08726723492145538, + "logps/chosen": -2.6370608806610107, + "logps/rejected": -3.403134822845459, + "loss": 0.5105, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.6370608806610107, + "rewards/margins": 0.766074001789093, + "rewards/rejected": -3.403134822845459, + "sft_loss": 2.7392070293426514, + "step": 2745 + }, + { + "epoch": 1.4718180297708647, + "grad_norm": 18.37818975642997, + "learning_rate": 6.028282766952393e-07, + "logits/chosen": -0.17349588871002197, + "logits/rejected": -0.04044419154524803, + "logps/chosen": -2.7869231700897217, + "logps/rejected": -3.616891860961914, + "loss": 0.5208, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7869231700897217, + "rewards/margins": 0.8299688100814819, + "rewards/rejected": -3.616891860961914, + "sft_loss": 2.87677001953125, + "step": 2750 + }, + { + "epoch": 1.4744940625522662, + "grad_norm": 23.30245997826689, + "learning_rate": 6.013036683579798e-07, + "logits/chosen": -0.14427100121974945, + "logits/rejected": -0.0082294512540102, + "logps/chosen": -2.7531418800354004, + "logps/rejected": -3.622286319732666, + "loss": 0.5112, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.7531418800354004, + "rewards/margins": 0.8691444396972656, + "rewards/rejected": -3.622286319732666, + "sft_loss": 2.935823440551758, + "step": 2755 + }, + { + "epoch": 1.4771700953336677, + "grad_norm": 14.765799932563043, + "learning_rate": 5.997780771720854e-07, + "logits/chosen": -0.2974587082862854, + "logits/rejected": -0.08597452938556671, + "logps/chosen": -2.8631033897399902, + "logps/rejected": -3.757943630218506, + "loss": 0.5084, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.8631033897399902, + "rewards/margins": 0.89484041929245, + "rewards/rejected": -3.757943630218506, + "sft_loss": 3.0459372997283936, + "step": 2760 + }, + { + "epoch": 1.4798461281150694, + "grad_norm": 18.81329740078329, + "learning_rate": 5.982515179388486e-07, + "logits/chosen": -0.1754823625087738, + "logits/rejected": -0.03205886483192444, + "logps/chosen": -2.784930944442749, + "logps/rejected": -3.547107696533203, + "loss": 0.5417, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.784930944442749, + "rewards/margins": 0.7621761560440063, + "rewards/rejected": -3.547107696533203, + "sft_loss": 3.011758804321289, + "step": 2765 + }, + { + "epoch": 1.482522160896471, + "grad_norm": 14.856372988678277, + "learning_rate": 5.967240054689541e-07, + "logits/chosen": -0.2784285545349121, + "logits/rejected": -0.19298048317432404, + "logps/chosen": -2.7671022415161133, + "logps/rejected": -3.4874374866485596, + "loss": 0.5639, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7671022415161133, + "rewards/margins": 0.7203353643417358, + "rewards/rejected": -3.4874374866485596, + "sft_loss": 2.97153902053833, + "step": 2770 + }, + { + "epoch": 1.4851981936778724, + "grad_norm": 16.34455536326885, + "learning_rate": 5.951955545823342e-07, + "logits/chosen": -0.22903160750865936, + "logits/rejected": -0.1430944949388504, + "logps/chosen": -2.869506359100342, + "logps/rejected": -3.7298552989959717, + "loss": 0.5389, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.869506359100342, + "rewards/margins": 0.8603488802909851, + "rewards/rejected": -3.7298552989959717, + "sft_loss": 3.0238983631134033, + "step": 2775 + }, + { + "epoch": 1.4878742264592741, + "grad_norm": 12.018862895462558, + "learning_rate": 5.936661801080263e-07, + "logits/chosen": -0.1803596168756485, + "logits/rejected": -0.058324117213487625, + "logps/chosen": -2.9235939979553223, + "logps/rejected": -3.5747509002685547, + "loss": 0.6078, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.9235939979553223, + "rewards/margins": 0.6511572599411011, + "rewards/rejected": -3.5747509002685547, + "sft_loss": 2.9814629554748535, + "step": 2780 + }, + { + "epoch": 1.4905502592406756, + "grad_norm": 12.871878981446248, + "learning_rate": 5.92135896884028e-07, + "logits/chosen": -0.25617536902427673, + "logits/rejected": -0.0906013622879982, + "logps/chosen": -2.8694026470184326, + "logps/rejected": -3.690614700317383, + "loss": 0.5259, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8694026470184326, + "rewards/margins": 0.8212119936943054, + "rewards/rejected": -3.690614700317383, + "sft_loss": 2.922090530395508, + "step": 2785 + }, + { + "epoch": 1.4932262920220774, + "grad_norm": 17.058939346776455, + "learning_rate": 5.906047197571541e-07, + "logits/chosen": -0.2006787359714508, + "logits/rejected": -0.21372473239898682, + "logps/chosen": -2.7825400829315186, + "logps/rejected": -3.514707565307617, + "loss": 0.5655, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7825400829315186, + "rewards/margins": 0.7321674823760986, + "rewards/rejected": -3.514707565307617, + "sft_loss": 3.0377678871154785, + "step": 2790 + }, + { + "epoch": 1.4959023248034788, + "grad_norm": 12.339324602189375, + "learning_rate": 5.890726635828919e-07, + "logits/chosen": -0.11219914257526398, + "logits/rejected": -0.10450281947851181, + "logps/chosen": -2.5796525478363037, + "logps/rejected": -3.4286201000213623, + "loss": 0.5323, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5796525478363037, + "rewards/margins": 0.8489675521850586, + "rewards/rejected": -3.4286201000213623, + "sft_loss": 2.6898679733276367, + "step": 2795 + }, + { + "epoch": 1.4985783575848803, + "grad_norm": 17.408938715499623, + "learning_rate": 5.875397432252569e-07, + "logits/chosen": -0.2657962143421173, + "logits/rejected": -0.18797865509986877, + "logps/chosen": -2.6418039798736572, + "logps/rejected": -3.3556416034698486, + "loss": 0.5549, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.6418039798736572, + "rewards/margins": 0.713837742805481, + "rewards/rejected": -3.3556416034698486, + "sft_loss": 2.7874643802642822, + "step": 2800 + }, + { + "epoch": 1.4985783575848803, + "eval_logits/chosen": 0.1467505246400833, + "eval_logits/rejected": 0.2546423673629761, + "eval_logps/chosen": -2.7228782176971436, + "eval_logps/rejected": -3.506152868270874, + "eval_loss": 0.5573322176933289, + "eval_rewards/accuracies": 0.7247774600982666, + "eval_rewards/chosen": -2.7228782176971436, + "eval_rewards/margins": 0.7832746505737305, + "eval_rewards/rejected": -3.506152868270874, + "eval_runtime": 49.9154, + "eval_samples_per_second": 26.946, + "eval_sft_loss": 2.8691623210906982, + "eval_steps_per_second": 6.751, + "step": 2800 + }, + { + "epoch": 1.5012543903662818, + "grad_norm": 10.399875082771421, + "learning_rate": 5.860059735566491e-07, + "logits/chosen": -0.34869104623794556, + "logits/rejected": -0.167296901345253, + "logps/chosen": -2.527095079421997, + "logps/rejected": -3.321505069732666, + "loss": 0.5255, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.527095079421997, + "rewards/margins": 0.7944096326828003, + "rewards/rejected": -3.321505069732666, + "sft_loss": 2.6784870624542236, + "step": 2805 + }, + { + "epoch": 1.5039304231476835, + "grad_norm": 16.63514429865691, + "learning_rate": 5.844713694577087e-07, + "logits/chosen": -0.22122304141521454, + "logits/rejected": -0.1313764750957489, + "logps/chosen": -2.628300189971924, + "logps/rejected": -3.411324977874756, + "loss": 0.538, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.628300189971924, + "rewards/margins": 0.7830251455307007, + "rewards/rejected": -3.411324977874756, + "sft_loss": 2.7826220989227295, + "step": 2810 + }, + { + "epoch": 1.5066064559290853, + "grad_norm": 10.66724424739637, + "learning_rate": 5.829359458171714e-07, + "logits/chosen": -0.16654136776924133, + "logits/rejected": -0.042657412588596344, + "logps/chosen": -2.652475357055664, + "logps/rejected": -3.6194043159484863, + "loss": 0.4705, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.652475357055664, + "rewards/margins": 0.9669289588928223, + "rewards/rejected": -3.6194043159484863, + "sft_loss": 2.7372677326202393, + "step": 2815 + }, + { + "epoch": 1.5092824887104868, + "grad_norm": 14.064916607818864, + "learning_rate": 5.81399717531724e-07, + "logits/chosen": -0.24533767998218536, + "logits/rejected": -0.014905953779816628, + "logps/chosen": -2.8403804302215576, + "logps/rejected": -3.5930869579315186, + "loss": 0.5703, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.8403804302215576, + "rewards/margins": 0.7527070045471191, + "rewards/rejected": -3.5930869579315186, + "sft_loss": 2.991934061050415, + "step": 2820 + }, + { + "epoch": 1.5119585214918883, + "grad_norm": 14.350919762086672, + "learning_rate": 5.798626995058602e-07, + "logits/chosen": -0.30641651153564453, + "logits/rejected": -0.09210020303726196, + "logps/chosen": -2.866778612136841, + "logps/rejected": -3.7245547771453857, + "loss": 0.5238, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.866778612136841, + "rewards/margins": 0.8577759861946106, + "rewards/rejected": -3.7245547771453857, + "sft_loss": 2.979400634765625, + "step": 2825 + }, + { + "epoch": 1.51463455427329, + "grad_norm": 13.547482917019332, + "learning_rate": 5.783249066517354e-07, + "logits/chosen": -0.24482688307762146, + "logits/rejected": -0.10179238021373749, + "logps/chosen": -2.7300779819488525, + "logps/rejected": -3.591665744781494, + "loss": 0.5218, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7300779819488525, + "rewards/margins": 0.8615878820419312, + "rewards/rejected": -3.591665744781494, + "sft_loss": 2.8641223907470703, + "step": 2830 + }, + { + "epoch": 1.5173105870546915, + "grad_norm": 29.997519760061838, + "learning_rate": 5.767863538890228e-07, + "logits/chosen": -0.25270360708236694, + "logits/rejected": -0.08831027895212173, + "logps/chosen": -2.880031108856201, + "logps/rejected": -3.8612148761749268, + "loss": 0.5023, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.880031108856201, + "rewards/margins": 0.9811837077140808, + "rewards/rejected": -3.8612148761749268, + "sft_loss": 3.011920213699341, + "step": 2835 + }, + { + "epoch": 1.519986619836093, + "grad_norm": 18.971073746288894, + "learning_rate": 5.75247056144768e-07, + "logits/chosen": -0.2417343109846115, + "logits/rejected": -0.15943384170532227, + "logps/chosen": -2.898433208465576, + "logps/rejected": -3.618417263031006, + "loss": 0.5858, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.898433208465576, + "rewards/margins": 0.7199840545654297, + "rewards/rejected": -3.618417263031006, + "sft_loss": 3.053682804107666, + "step": 2840 + }, + { + "epoch": 1.5226626526174947, + "grad_norm": 15.931686737504796, + "learning_rate": 5.737070283532444e-07, + "logits/chosen": -0.19866490364074707, + "logits/rejected": -0.10472752153873444, + "logps/chosen": -2.9091525077819824, + "logps/rejected": -3.641115665435791, + "loss": 0.6296, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.9091525077819824, + "rewards/margins": 0.7319625616073608, + "rewards/rejected": -3.641115665435791, + "sft_loss": 2.9120960235595703, + "step": 2845 + }, + { + "epoch": 1.5253386853988962, + "grad_norm": 16.461024108369852, + "learning_rate": 5.721662854558084e-07, + "logits/chosen": -0.25154000520706177, + "logits/rejected": -0.14929892122745514, + "logps/chosen": -2.7859530448913574, + "logps/rejected": -3.70951509475708, + "loss": 0.4924, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7859530448913574, + "rewards/margins": 0.9235623478889465, + "rewards/rejected": -3.70951509475708, + "sft_loss": 2.9241738319396973, + "step": 2850 + }, + { + "epoch": 1.5280147181802977, + "grad_norm": 12.408242747971943, + "learning_rate": 5.706248424007545e-07, + "logits/chosen": -0.2801755368709564, + "logits/rejected": -0.0861397385597229, + "logps/chosen": -2.93034029006958, + "logps/rejected": -3.7381014823913574, + "loss": 0.546, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.93034029006958, + "rewards/margins": 0.8077613711357117, + "rewards/rejected": -3.7381014823913574, + "sft_loss": 3.0639588832855225, + "step": 2855 + }, + { + "epoch": 1.5306907509616994, + "grad_norm": 15.501732706021073, + "learning_rate": 5.690827141431699e-07, + "logits/chosen": -0.324562132358551, + "logits/rejected": -0.11474557965993881, + "logps/chosen": -2.8127694129943848, + "logps/rejected": -3.550741195678711, + "loss": 0.533, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.8127694129943848, + "rewards/margins": 0.7379716634750366, + "rewards/rejected": -3.550741195678711, + "sft_loss": 2.8940277099609375, + "step": 2860 + }, + { + "epoch": 1.5333667837431009, + "grad_norm": 19.540289281178666, + "learning_rate": 5.675399156447897e-07, + "logits/chosen": -0.35332703590393066, + "logits/rejected": -0.21106410026550293, + "logps/chosen": -2.8680214881896973, + "logps/rejected": -3.487658739089966, + "loss": 0.5897, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.8680214881896973, + "rewards/margins": 0.6196374893188477, + "rewards/rejected": -3.487658739089966, + "sft_loss": 3.0192437171936035, + "step": 2865 + }, + { + "epoch": 1.5360428165245024, + "grad_norm": 17.32744962552554, + "learning_rate": 5.659964618738515e-07, + "logits/chosen": -0.2802363634109497, + "logits/rejected": -0.15609270334243774, + "logps/chosen": -2.7928478717803955, + "logps/rejected": -3.4934096336364746, + "loss": 0.5619, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.7928478717803955, + "rewards/margins": 0.7005618810653687, + "rewards/rejected": -3.4934096336364746, + "sft_loss": 2.8554327487945557, + "step": 2870 + }, + { + "epoch": 1.538718849305904, + "grad_norm": 16.893818398379278, + "learning_rate": 5.644523678049509e-07, + "logits/chosen": -0.26265794038772583, + "logits/rejected": -0.13234956562519073, + "logps/chosen": -2.7641029357910156, + "logps/rejected": -3.4810073375701904, + "loss": 0.544, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7641029357910156, + "rewards/margins": 0.7169044017791748, + "rewards/rejected": -3.4810073375701904, + "sft_loss": 2.81727933883667, + "step": 2875 + }, + { + "epoch": 1.5413948820873056, + "grad_norm": 15.252444812376583, + "learning_rate": 5.629076484188952e-07, + "logits/chosen": -0.1614963561296463, + "logits/rejected": -0.04674742743372917, + "logps/chosen": -2.6011624336242676, + "logps/rejected": -3.4162681102752686, + "loss": 0.5102, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6011624336242676, + "rewards/margins": 0.8151056170463562, + "rewards/rejected": -3.4162681102752686, + "sft_loss": 2.713378429412842, + "step": 2880 + }, + { + "epoch": 1.544070914868707, + "grad_norm": 14.330731261415055, + "learning_rate": 5.613623187025587e-07, + "logits/chosen": -0.2792847752571106, + "logits/rejected": -0.14449983835220337, + "logps/chosen": -2.732344150543213, + "logps/rejected": -3.5566773414611816, + "loss": 0.5276, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.732344150543213, + "rewards/margins": 0.8243331909179688, + "rewards/rejected": -3.5566773414611816, + "sft_loss": 2.828441619873047, + "step": 2885 + }, + { + "epoch": 1.5467469476501088, + "grad_norm": 12.995898173961194, + "learning_rate": 5.598163936487369e-07, + "logits/chosen": -0.30418476462364197, + "logits/rejected": -0.10869389772415161, + "logps/chosen": -2.7487494945526123, + "logps/rejected": -3.7656631469726562, + "loss": 0.4808, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7487494945526123, + "rewards/margins": 1.0169135332107544, + "rewards/rejected": -3.7656631469726562, + "sft_loss": 2.8211586475372314, + "step": 2890 + }, + { + "epoch": 1.5494229804315103, + "grad_norm": 14.851715763505505, + "learning_rate": 5.582698882560017e-07, + "logits/chosen": -0.2987568974494934, + "logits/rejected": -0.13216647505760193, + "logps/chosen": -2.708491802215576, + "logps/rejected": -3.4900200366973877, + "loss": 0.5571, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.708491802215576, + "rewards/margins": 0.7815281748771667, + "rewards/rejected": -3.4900200366973877, + "sft_loss": 2.7990097999572754, + "step": 2895 + }, + { + "epoch": 1.5520990132129118, + "grad_norm": 12.612914431649259, + "learning_rate": 5.567228175285549e-07, + "logits/chosen": -0.20559749007225037, + "logits/rejected": -0.09073267877101898, + "logps/chosen": -2.8077054023742676, + "logps/rejected": -3.738248825073242, + "loss": 0.4909, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.8077054023742676, + "rewards/margins": 0.9305435419082642, + "rewards/rejected": -3.738248825073242, + "sft_loss": 2.9258337020874023, + "step": 2900 + }, + { + "epoch": 1.5547750459943135, + "grad_norm": 16.242664587469477, + "learning_rate": 5.551751964760838e-07, + "logits/chosen": -0.15259698033332825, + "logits/rejected": -0.13122805953025818, + "logps/chosen": -2.800950765609741, + "logps/rejected": -3.6922545433044434, + "loss": 0.5171, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.800950765609741, + "rewards/margins": 0.8913037180900574, + "rewards/rejected": -3.6922545433044434, + "sft_loss": 2.9571404457092285, + "step": 2905 + }, + { + "epoch": 1.557451078775715, + "grad_norm": 18.94326857674209, + "learning_rate": 5.536270401136145e-07, + "logits/chosen": -0.2627522051334381, + "logits/rejected": -0.14064963161945343, + "logps/chosen": -2.9124884605407715, + "logps/rejected": -3.6973648071289062, + "loss": 0.5426, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9124884605407715, + "rewards/margins": 0.7848763465881348, + "rewards/rejected": -3.6973648071289062, + "sft_loss": 3.141721725463867, + "step": 2910 + }, + { + "epoch": 1.5601271115571165, + "grad_norm": 24.740523928241057, + "learning_rate": 5.520783634613667e-07, + "logits/chosen": -0.21578022837638855, + "logits/rejected": -0.0047931610606610775, + "logps/chosen": -2.9035096168518066, + "logps/rejected": -3.8408610820770264, + "loss": 0.5311, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9035096168518066, + "rewards/margins": 0.937351405620575, + "rewards/rejected": -3.8408610820770264, + "sft_loss": 3.172224521636963, + "step": 2915 + }, + { + "epoch": 1.5628031443385182, + "grad_norm": 20.55201507662613, + "learning_rate": 5.505291815446082e-07, + "logits/chosen": -0.23507125675678253, + "logits/rejected": -0.11718054115772247, + "logps/chosen": -3.0174198150634766, + "logps/rejected": -3.8366706371307373, + "loss": 0.5798, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.0174198150634766, + "rewards/margins": 0.8192507028579712, + "rewards/rejected": -3.8366706371307373, + "sft_loss": 3.2097160816192627, + "step": 2920 + }, + { + "epoch": 1.5654791771199197, + "grad_norm": 17.222357571169933, + "learning_rate": 5.489795093935089e-07, + "logits/chosen": -0.2141776978969574, + "logits/rejected": -0.10248160362243652, + "logps/chosen": -2.8130953311920166, + "logps/rejected": -3.6256022453308105, + "loss": 0.5655, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8130953311920166, + "rewards/margins": 0.812507152557373, + "rewards/rejected": -3.6256022453308105, + "sft_loss": 2.977510452270508, + "step": 2925 + }, + { + "epoch": 1.5681552099013212, + "grad_norm": 13.799444985142202, + "learning_rate": 5.474293620429946e-07, + "logits/chosen": -0.3303857743740082, + "logits/rejected": -0.1394929587841034, + "logps/chosen": -2.7024035453796387, + "logps/rejected": -3.8372929096221924, + "loss": 0.4763, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7024035453796387, + "rewards/margins": 1.1348894834518433, + "rewards/rejected": -3.8372929096221924, + "sft_loss": 2.956207275390625, + "step": 2930 + }, + { + "epoch": 1.570831242682723, + "grad_norm": 15.263193791635503, + "learning_rate": 5.458787545326018e-07, + "logits/chosen": -0.28155213594436646, + "logits/rejected": -0.156253844499588, + "logps/chosen": -2.8950753211975098, + "logps/rejected": -3.7396531105041504, + "loss": 0.5191, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.8950753211975098, + "rewards/margins": 0.8445774912834167, + "rewards/rejected": -3.7396531105041504, + "sft_loss": 3.0152199268341064, + "step": 2935 + }, + { + "epoch": 1.5735072754641244, + "grad_norm": 14.100321215775855, + "learning_rate": 5.443277019063311e-07, + "logits/chosen": -0.2842947542667389, + "logits/rejected": -0.10026898235082626, + "logps/chosen": -2.8834152221679688, + "logps/rejected": -3.8942084312438965, + "loss": 0.5247, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8834152221679688, + "rewards/margins": 1.0107934474945068, + "rewards/rejected": -3.8942084312438965, + "sft_loss": 3.0497002601623535, + "step": 2940 + }, + { + "epoch": 1.5761833082455259, + "grad_norm": 19.66127515967987, + "learning_rate": 5.427762192125023e-07, + "logits/chosen": -0.23068693280220032, + "logits/rejected": -0.0951416864991188, + "logps/chosen": -2.798116445541382, + "logps/rejected": -3.5930447578430176, + "loss": 0.5406, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.798116445541382, + "rewards/margins": 0.7949279546737671, + "rewards/rejected": -3.5930447578430176, + "sft_loss": 2.894116163253784, + "step": 2945 + }, + { + "epoch": 1.5788593410269276, + "grad_norm": 17.20298797419941, + "learning_rate": 5.41224321503607e-07, + "logits/chosen": -0.23539146780967712, + "logits/rejected": 0.025868237018585205, + "logps/chosen": -2.7686853408813477, + "logps/rejected": -3.7298712730407715, + "loss": 0.4766, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.7686853408813477, + "rewards/margins": 0.9611856341362, + "rewards/rejected": -3.7298712730407715, + "sft_loss": 2.9039459228515625, + "step": 2950 + }, + { + "epoch": 1.5815353738083293, + "grad_norm": 18.947778675822796, + "learning_rate": 5.396720238361637e-07, + "logits/chosen": -0.18216149508953094, + "logits/rejected": -0.05739528685808182, + "logps/chosen": -2.887604236602783, + "logps/rejected": -3.6991076469421387, + "loss": 0.551, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.887604236602783, + "rewards/margins": 0.811503529548645, + "rewards/rejected": -3.6991076469421387, + "sft_loss": 3.119185209274292, + "step": 2955 + }, + { + "epoch": 1.5842114065897306, + "grad_norm": 11.464756525255813, + "learning_rate": 5.381193412705711e-07, + "logits/chosen": -0.3169929087162018, + "logits/rejected": -0.16190847754478455, + "logps/chosen": -2.839843273162842, + "logps/rejected": -3.680530548095703, + "loss": 0.507, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.839843273162842, + "rewards/margins": 0.8406869769096375, + "rewards/rejected": -3.680530548095703, + "sft_loss": 2.9576756954193115, + "step": 2960 + }, + { + "epoch": 1.5868874393711323, + "grad_norm": 12.331848177245327, + "learning_rate": 5.365662888709622e-07, + "logits/chosen": -0.2760445177555084, + "logits/rejected": -0.1369338035583496, + "logps/chosen": -2.9550209045410156, + "logps/rejected": -3.916705369949341, + "loss": 0.5183, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.9550209045410156, + "rewards/margins": 0.9616841077804565, + "rewards/rejected": -3.916705369949341, + "sft_loss": 3.1083853244781494, + "step": 2965 + }, + { + "epoch": 1.589563472152534, + "grad_norm": 24.02246912843936, + "learning_rate": 5.350128817050585e-07, + "logits/chosen": -0.2904479503631592, + "logits/rejected": -0.08660642802715302, + "logps/chosen": -2.9979000091552734, + "logps/rejected": -3.875239849090576, + "loss": 0.5522, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9979000091552734, + "rewards/margins": 0.8773400187492371, + "rewards/rejected": -3.875239849090576, + "sft_loss": 3.12526798248291, + "step": 2970 + }, + { + "epoch": 1.5922395049339353, + "grad_norm": 18.30655578201634, + "learning_rate": 5.334591348440229e-07, + "logits/chosen": -0.21777451038360596, + "logits/rejected": -0.04189714044332504, + "logps/chosen": -2.8661983013153076, + "logps/rejected": -3.6684353351593018, + "loss": 0.5423, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.8661983013153076, + "rewards/margins": 0.8022370338439941, + "rewards/rejected": -3.6684353351593018, + "sft_loss": 3.0100347995758057, + "step": 2975 + }, + { + "epoch": 1.594915537715337, + "grad_norm": 14.26079978751246, + "learning_rate": 5.319050633623141e-07, + "logits/chosen": -0.288684606552124, + "logits/rejected": -0.09366115927696228, + "logps/chosen": -2.9219727516174316, + "logps/rejected": -3.6889071464538574, + "loss": 0.522, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9219727516174316, + "rewards/margins": 0.7669345140457153, + "rewards/rejected": -3.6889071464538574, + "sft_loss": 3.0620250701904297, + "step": 2980 + }, + { + "epoch": 1.5975915704967387, + "grad_norm": 18.763679318314267, + "learning_rate": 5.303506823375409e-07, + "logits/chosen": -0.26433873176574707, + "logits/rejected": -0.02369770035147667, + "logps/chosen": -2.94124174118042, + "logps/rejected": -3.9786980152130127, + "loss": 0.5133, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.94124174118042, + "rewards/margins": 1.0374559164047241, + "rewards/rejected": -3.9786980152130127, + "sft_loss": 3.0656442642211914, + "step": 2985 + }, + { + "epoch": 1.60026760327814, + "grad_norm": 15.811139693762856, + "learning_rate": 5.287960068503143e-07, + "logits/chosen": -0.2913285493850708, + "logits/rejected": -0.0641009658575058, + "logps/chosen": -2.8629794120788574, + "logps/rejected": -3.8095791339874268, + "loss": 0.5101, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8629794120788574, + "rewards/margins": 0.9465991854667664, + "rewards/rejected": -3.8095791339874268, + "sft_loss": 3.0161020755767822, + "step": 2990 + }, + { + "epoch": 1.6029436360595417, + "grad_norm": 18.022532012750336, + "learning_rate": 5.272410519841032e-07, + "logits/chosen": -0.2145463228225708, + "logits/rejected": -0.06065446883440018, + "logps/chosen": -3.002933979034424, + "logps/rejected": -4.084083557128906, + "loss": 0.4918, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.002933979034424, + "rewards/margins": 1.081149697303772, + "rewards/rejected": -4.084083557128906, + "sft_loss": 3.2060210704803467, + "step": 2995 + }, + { + "epoch": 1.6056196688409434, + "grad_norm": 11.476591812970014, + "learning_rate": 5.256858328250861e-07, + "logits/chosen": -0.27649787068367004, + "logits/rejected": -0.06862474977970123, + "logps/chosen": -2.9410276412963867, + "logps/rejected": -3.6895077228546143, + "loss": 0.577, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.9410276412963867, + "rewards/margins": 0.748479962348938, + "rewards/rejected": -3.6895077228546143, + "sft_loss": 3.015018939971924, + "step": 3000 + }, + { + "epoch": 1.608295701622345, + "grad_norm": 28.570194060864278, + "learning_rate": 5.241303644620063e-07, + "logits/chosen": -0.3490707278251648, + "logits/rejected": -0.15127266943454742, + "logps/chosen": -2.9497642517089844, + "logps/rejected": -3.570955276489258, + "loss": 0.6157, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.9497642517089844, + "rewards/margins": 0.6211908459663391, + "rewards/rejected": -3.570955276489258, + "sft_loss": 3.063214063644409, + "step": 3005 + }, + { + "epoch": 1.6109717344037464, + "grad_norm": 22.720628768475557, + "learning_rate": 5.225746619860248e-07, + "logits/chosen": -0.3165649473667145, + "logits/rejected": -0.17648476362228394, + "logps/chosen": -2.8999311923980713, + "logps/rejected": -3.5884041786193848, + "loss": 0.6102, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.8999311923980713, + "rewards/margins": 0.6884732246398926, + "rewards/rejected": -3.5884041786193848, + "sft_loss": 2.998579740524292, + "step": 3010 + }, + { + "epoch": 1.6136477671851481, + "grad_norm": 18.748980969575, + "learning_rate": 5.210187404905735e-07, + "logits/chosen": -0.07036960870027542, + "logits/rejected": 0.014557006768882275, + "logps/chosen": -2.9264798164367676, + "logps/rejected": -3.6826775074005127, + "loss": 0.5487, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9264798164367676, + "rewards/margins": 0.7561972737312317, + "rewards/rejected": -3.6826775074005127, + "sft_loss": 3.034297227859497, + "step": 3015 + }, + { + "epoch": 1.6163237999665496, + "grad_norm": 13.749498822383595, + "learning_rate": 5.194626150712098e-07, + "logits/chosen": -0.2745167315006256, + "logits/rejected": -0.13641813397407532, + "logps/chosen": -2.8453528881073, + "logps/rejected": -3.5740790367126465, + "loss": 0.5414, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.8453528881073, + "rewards/margins": 0.7287262678146362, + "rewards/rejected": -3.5740790367126465, + "sft_loss": 3.005955219268799, + "step": 3020 + }, + { + "epoch": 1.6189998327479511, + "grad_norm": 14.094554426517202, + "learning_rate": 5.179063008254695e-07, + "logits/chosen": -0.25325021147727966, + "logits/rejected": -0.06744090467691422, + "logps/chosen": -2.703946590423584, + "logps/rejected": -3.4337966442108154, + "loss": 0.5563, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.703946590423584, + "rewards/margins": 0.7298499345779419, + "rewards/rejected": -3.4337966442108154, + "sft_loss": 2.8957529067993164, + "step": 3025 + }, + { + "epoch": 1.6216758655293528, + "grad_norm": 14.384649491858822, + "learning_rate": 5.163498128527199e-07, + "logits/chosen": -0.20750777423381805, + "logits/rejected": -0.047119565308094025, + "logps/chosen": -2.895918369293213, + "logps/rejected": -3.666656494140625, + "loss": 0.5495, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.895918369293213, + "rewards/margins": 0.7707374691963196, + "rewards/rejected": -3.666656494140625, + "sft_loss": 3.003587007522583, + "step": 3030 + }, + { + "epoch": 1.6243518983107543, + "grad_norm": 15.088461428091204, + "learning_rate": 5.147931662540144e-07, + "logits/chosen": -0.14871835708618164, + "logits/rejected": 0.01115778274834156, + "logps/chosen": -2.7989089488983154, + "logps/rejected": -3.4743430614471436, + "loss": 0.5458, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7989089488983154, + "rewards/margins": 0.6754340529441833, + "rewards/rejected": -3.4743430614471436, + "sft_loss": 2.9447684288024902, + "step": 3035 + }, + { + "epoch": 1.6270279310921558, + "grad_norm": 12.180618978442498, + "learning_rate": 5.132363761319449e-07, + "logits/chosen": -0.23028519749641418, + "logits/rejected": -0.1455889195203781, + "logps/chosen": -2.748809337615967, + "logps/rejected": -3.695582628250122, + "loss": 0.5166, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.748809337615967, + "rewards/margins": 0.9467732310295105, + "rewards/rejected": -3.695582628250122, + "sft_loss": 2.9013164043426514, + "step": 3040 + }, + { + "epoch": 1.6297039638735575, + "grad_norm": 26.879565279570176, + "learning_rate": 5.116794575904962e-07, + "logits/chosen": -0.22017955780029297, + "logits/rejected": -0.1012934222817421, + "logps/chosen": -2.6123456954956055, + "logps/rejected": -3.3041110038757324, + "loss": 0.5678, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6123456954956055, + "rewards/margins": 0.6917653679847717, + "rewards/rejected": -3.3041110038757324, + "sft_loss": 2.7556638717651367, + "step": 3045 + }, + { + "epoch": 1.632379996654959, + "grad_norm": 11.349719271048372, + "learning_rate": 5.101224257348987e-07, + "logits/chosen": -0.22333073616027832, + "logits/rejected": -0.06680073589086533, + "logps/chosen": -2.8216347694396973, + "logps/rejected": -3.7416367530822754, + "loss": 0.4819, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.8216347694396973, + "rewards/margins": 0.9200018644332886, + "rewards/rejected": -3.7416367530822754, + "sft_loss": 3.018942356109619, + "step": 3050 + }, + { + "epoch": 1.6350560294363605, + "grad_norm": 13.77330629661692, + "learning_rate": 5.085652956714823e-07, + "logits/chosen": -0.285283625125885, + "logits/rejected": -0.11413697898387909, + "logps/chosen": -2.9083285331726074, + "logps/rejected": -3.6721458435058594, + "loss": 0.5478, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.9083285331726074, + "rewards/margins": 0.763817548751831, + "rewards/rejected": -3.6721458435058594, + "sft_loss": 3.0378081798553467, + "step": 3055 + }, + { + "epoch": 1.6377320622177622, + "grad_norm": 13.26939237550019, + "learning_rate": 5.070080825075298e-07, + "logits/chosen": -0.2616371214389801, + "logits/rejected": -0.05170721560716629, + "logps/chosen": -2.853921413421631, + "logps/rejected": -3.6635985374450684, + "loss": 0.5813, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.853921413421631, + "rewards/margins": 0.809677243232727, + "rewards/rejected": -3.6635985374450684, + "sft_loss": 3.0633363723754883, + "step": 3060 + }, + { + "epoch": 1.6404080949991637, + "grad_norm": 14.822875458029475, + "learning_rate": 5.0545080135113e-07, + "logits/chosen": -0.19637763500213623, + "logits/rejected": -0.10091104358434677, + "logps/chosen": -2.7999989986419678, + "logps/rejected": -3.6137301921844482, + "loss": 0.5582, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7999989986419678, + "rewards/margins": 0.8137310743331909, + "rewards/rejected": -3.6137301921844482, + "sft_loss": 2.9661269187927246, + "step": 3065 + }, + { + "epoch": 1.6430841277805652, + "grad_norm": 19.196992236377532, + "learning_rate": 5.038934673110316e-07, + "logits/chosen": -0.2853774428367615, + "logits/rejected": -0.16362276673316956, + "logps/chosen": -2.8738396167755127, + "logps/rejected": -3.68664813041687, + "loss": 0.564, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8738396167755127, + "rewards/margins": 0.812808632850647, + "rewards/rejected": -3.68664813041687, + "sft_loss": 3.035071611404419, + "step": 3070 + }, + { + "epoch": 1.645760160561967, + "grad_norm": 13.120713832451536, + "learning_rate": 5.023360954964963e-07, + "logits/chosen": -0.30218741297721863, + "logits/rejected": -0.2097523957490921, + "logps/chosen": -2.6479439735412598, + "logps/rejected": -3.4495468139648438, + "loss": 0.4967, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6479439735412598, + "rewards/margins": 0.8016031980514526, + "rewards/rejected": -3.4495468139648438, + "sft_loss": 2.778782367706299, + "step": 3075 + }, + { + "epoch": 1.6484361933433684, + "grad_norm": 13.395548570121615, + "learning_rate": 5.007787010171524e-07, + "logits/chosen": -0.3696254789829254, + "logits/rejected": -0.17545874416828156, + "logps/chosen": -2.555445432662964, + "logps/rejected": -3.4399514198303223, + "loss": 0.4687, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.555445432662964, + "rewards/margins": 0.8845059275627136, + "rewards/rejected": -3.4399514198303223, + "sft_loss": 2.7573673725128174, + "step": 3080 + }, + { + "epoch": 1.65111222612477, + "grad_norm": 16.649977896866492, + "learning_rate": 4.992212989828477e-07, + "logits/chosen": -0.17495186626911163, + "logits/rejected": -0.14151157438755035, + "logps/chosen": -2.7216289043426514, + "logps/rejected": -3.3717918395996094, + "loss": 0.5736, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7216289043426514, + "rewards/margins": 0.6501628160476685, + "rewards/rejected": -3.3717918395996094, + "sft_loss": 2.8681225776672363, + "step": 3085 + }, + { + "epoch": 1.6537882589061716, + "grad_norm": 16.772112988286686, + "learning_rate": 4.976639045035036e-07, + "logits/chosen": -0.18401309847831726, + "logits/rejected": -0.10007508099079132, + "logps/chosen": -2.6721231937408447, + "logps/rejected": -3.324143648147583, + "loss": 0.6245, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6721231937408447, + "rewards/margins": 0.6520205140113831, + "rewards/rejected": -3.324143648147583, + "sft_loss": 2.845592737197876, + "step": 3090 + }, + { + "epoch": 1.6564642916875731, + "grad_norm": 15.422309003619537, + "learning_rate": 4.961065326889683e-07, + "logits/chosen": -0.2133098542690277, + "logits/rejected": -0.05000300332903862, + "logps/chosen": -2.772364377975464, + "logps/rejected": -3.53290057182312, + "loss": 0.5361, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.772364377975464, + "rewards/margins": 0.760536253452301, + "rewards/rejected": -3.53290057182312, + "sft_loss": 2.9012913703918457, + "step": 3095 + }, + { + "epoch": 1.6591403244689746, + "grad_norm": 15.57194234622625, + "learning_rate": 4.9454919864887e-07, + "logits/chosen": -0.3201986253261566, + "logits/rejected": -0.17920561134815216, + "logps/chosen": -2.6756997108459473, + "logps/rejected": -3.4864468574523926, + "loss": 0.5285, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.6756997108459473, + "rewards/margins": 0.8107470273971558, + "rewards/rejected": -3.4864468574523926, + "sft_loss": 2.9155473709106445, + "step": 3100 + }, + { + "epoch": 1.6618163572503764, + "grad_norm": 18.956343721755545, + "learning_rate": 4.929919174924701e-07, + "logits/chosen": -0.31728580594062805, + "logits/rejected": -0.12212977558374405, + "logps/chosen": -2.669448137283325, + "logps/rejected": -3.3638412952423096, + "loss": 0.5545, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.669448137283325, + "rewards/margins": 0.6943932771682739, + "rewards/rejected": -3.3638412952423096, + "sft_loss": 2.872934341430664, + "step": 3105 + }, + { + "epoch": 1.6644923900317778, + "grad_norm": 13.27250872198471, + "learning_rate": 4.914347043285177e-07, + "logits/chosen": -0.22764411568641663, + "logits/rejected": -0.0870494470000267, + "logps/chosen": -2.6941583156585693, + "logps/rejected": -3.426682233810425, + "loss": 0.5447, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6941583156585693, + "rewards/margins": 0.7325237989425659, + "rewards/rejected": -3.426682233810425, + "sft_loss": 2.724475383758545, + "step": 3110 + }, + { + "epoch": 1.6671684228131793, + "grad_norm": 12.557439037020663, + "learning_rate": 4.898775742651013e-07, + "logits/chosen": -0.1906013935804367, + "logits/rejected": -0.047843921929597855, + "logps/chosen": -2.722144365310669, + "logps/rejected": -3.6132140159606934, + "loss": 0.4835, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.722144365310669, + "rewards/margins": 0.891069769859314, + "rewards/rejected": -3.6132140159606934, + "sft_loss": 2.923128604888916, + "step": 3115 + }, + { + "epoch": 1.669844455594581, + "grad_norm": 11.984896003834054, + "learning_rate": 4.883205424095037e-07, + "logits/chosen": -0.32420215010643005, + "logits/rejected": -0.1440507471561432, + "logps/chosen": -2.875488042831421, + "logps/rejected": -3.7107577323913574, + "loss": 0.5353, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.875488042831421, + "rewards/margins": 0.8352691531181335, + "rewards/rejected": -3.7107577323913574, + "sft_loss": 2.9838318824768066, + "step": 3120 + }, + { + "epoch": 1.6725204883759828, + "grad_norm": 13.906447094523328, + "learning_rate": 4.86763623868055e-07, + "logits/chosen": -0.24320106208324432, + "logits/rejected": -0.10679192841053009, + "logps/chosen": -2.88425612449646, + "logps/rejected": -3.715994358062744, + "loss": 0.53, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.88425612449646, + "rewards/margins": 0.8317381739616394, + "rewards/rejected": -3.715994358062744, + "sft_loss": 2.966104030609131, + "step": 3125 + }, + { + "epoch": 1.675196521157384, + "grad_norm": 14.976367657165467, + "learning_rate": 4.852068337459856e-07, + "logits/chosen": -0.19528812170028687, + "logits/rejected": -0.009975330904126167, + "logps/chosen": -3.02905535697937, + "logps/rejected": -3.80255126953125, + "loss": 0.5337, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.02905535697937, + "rewards/margins": 0.7734959721565247, + "rewards/rejected": -3.80255126953125, + "sft_loss": 3.1802637577056885, + "step": 3130 + }, + { + "epoch": 1.6778725539387858, + "grad_norm": 15.548326598764058, + "learning_rate": 4.8365018714728e-07, + "logits/chosen": -0.17453578114509583, + "logits/rejected": -0.07390455901622772, + "logps/chosen": -3.150977611541748, + "logps/rejected": -3.8425517082214355, + "loss": 0.5693, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.150977611541748, + "rewards/margins": 0.6915736794471741, + "rewards/rejected": -3.8425517082214355, + "sft_loss": 3.230241060256958, + "step": 3135 + }, + { + "epoch": 1.6805485867201875, + "grad_norm": 16.533383070336015, + "learning_rate": 4.820936991745304e-07, + "logits/chosen": -0.4218342900276184, + "logits/rejected": -0.2783734202384949, + "logps/chosen": -2.9137845039367676, + "logps/rejected": -3.6819236278533936, + "loss": 0.527, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9137845039367676, + "rewards/margins": 0.7681390047073364, + "rewards/rejected": -3.6819236278533936, + "sft_loss": 3.0383102893829346, + "step": 3140 + }, + { + "epoch": 1.6832246195015887, + "grad_norm": 17.88192366513802, + "learning_rate": 4.8053738492879e-07, + "logits/chosen": -0.21219484508037567, + "logits/rejected": -0.06534367054700851, + "logps/chosen": -2.799802780151367, + "logps/rejected": -3.7559711933135986, + "loss": 0.519, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.799802780151367, + "rewards/margins": 0.9561680555343628, + "rewards/rejected": -3.7559711933135986, + "sft_loss": 2.8882555961608887, + "step": 3145 + }, + { + "epoch": 1.6859006522829905, + "grad_norm": 14.847677829791488, + "learning_rate": 4.789812595094265e-07, + "logits/chosen": -0.3468804955482483, + "logits/rejected": -0.21161310374736786, + "logps/chosen": -2.8697726726531982, + "logps/rejected": -3.809154987335205, + "loss": 0.4897, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.8697726726531982, + "rewards/margins": 0.9393820762634277, + "rewards/rejected": -3.809154987335205, + "sft_loss": 2.9497411251068115, + "step": 3150 + }, + { + "epoch": 1.6885766850643922, + "grad_norm": 12.793945840546476, + "learning_rate": 4.774253380139752e-07, + "logits/chosen": -0.3617003560066223, + "logits/rejected": -0.21521353721618652, + "logps/chosen": -2.7200443744659424, + "logps/rejected": -3.6564979553222656, + "loss": 0.49, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7200443744659424, + "rewards/margins": 0.9364538192749023, + "rewards/rejected": -3.6564979553222656, + "sft_loss": 2.89874005317688, + "step": 3155 + }, + { + "epoch": 1.6912527178457935, + "grad_norm": 17.62398573548974, + "learning_rate": 4.758696355379936e-07, + "logits/chosen": -0.1891530603170395, + "logits/rejected": -0.21747808158397675, + "logps/chosen": -2.798849582672119, + "logps/rejected": -3.6794886589050293, + "loss": 0.5158, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.798849582672119, + "rewards/margins": 0.880639374256134, + "rewards/rejected": -3.6794886589050293, + "sft_loss": 3.0444750785827637, + "step": 3160 + }, + { + "epoch": 1.6939287506271952, + "grad_norm": 14.920422420380982, + "learning_rate": 4.743141671749138e-07, + "logits/chosen": -0.3750852644443512, + "logits/rejected": -0.21744951605796814, + "logps/chosen": -2.9588398933410645, + "logps/rejected": -3.6091854572296143, + "loss": 0.6136, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.9588398933410645, + "rewards/margins": 0.6503456830978394, + "rewards/rejected": -3.6091854572296143, + "sft_loss": 3.1452479362487793, + "step": 3165 + }, + { + "epoch": 1.6966047834085969, + "grad_norm": 13.725127922916966, + "learning_rate": 4.727589480158968e-07, + "logits/chosen": -0.2826746106147766, + "logits/rejected": -0.17756345868110657, + "logps/chosen": -2.858684778213501, + "logps/rejected": -3.7989399433135986, + "loss": 0.496, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.858684778213501, + "rewards/margins": 0.9402546882629395, + "rewards/rejected": -3.7989399433135986, + "sft_loss": 2.9993367195129395, + "step": 3170 + }, + { + "epoch": 1.6992808161899984, + "grad_norm": 20.41563340319681, + "learning_rate": 4.712039931496855e-07, + "logits/chosen": -0.28491419553756714, + "logits/rejected": -0.11838585138320923, + "logps/chosen": -2.934544801712036, + "logps/rejected": -3.5151214599609375, + "loss": 0.6561, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.934544801712036, + "rewards/margins": 0.58057701587677, + "rewards/rejected": -3.5151214599609375, + "sft_loss": 3.12235426902771, + "step": 3175 + }, + { + "epoch": 1.7019568489713999, + "grad_norm": 15.819865212335717, + "learning_rate": 4.6964931766245905e-07, + "logits/chosen": -0.16424153745174408, + "logits/rejected": -0.09867256879806519, + "logps/chosen": -2.845426559448242, + "logps/rejected": -3.666630983352661, + "loss": 0.5349, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.845426559448242, + "rewards/margins": 0.8212043642997742, + "rewards/rejected": -3.666630983352661, + "sft_loss": 2.955371141433716, + "step": 3180 + }, + { + "epoch": 1.7046328817528016, + "grad_norm": 16.014390487503512, + "learning_rate": 4.6809493663768575e-07, + "logits/chosen": -0.23135367035865784, + "logits/rejected": -0.17494915425777435, + "logps/chosen": -2.815685272216797, + "logps/rejected": -3.2968387603759766, + "loss": 0.635, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.815685272216797, + "rewards/margins": 0.48115357756614685, + "rewards/rejected": -3.2968387603759766, + "sft_loss": 2.9847769737243652, + "step": 3185 + }, + { + "epoch": 1.707308914534203, + "grad_norm": 11.69622838348128, + "learning_rate": 4.6654086515597716e-07, + "logits/chosen": -0.32371675968170166, + "logits/rejected": -0.12809544801712036, + "logps/chosen": -2.8046722412109375, + "logps/rejected": -3.664748430252075, + "loss": 0.4901, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.8046722412109375, + "rewards/margins": 0.8600761294364929, + "rewards/rejected": -3.664748430252075, + "sft_loss": 2.9333043098449707, + "step": 3190 + }, + { + "epoch": 1.7099849473156046, + "grad_norm": 11.61131870098387, + "learning_rate": 4.6498711829494154e-07, + "logits/chosen": -0.29520028829574585, + "logits/rejected": -0.16227586567401886, + "logps/chosen": -2.6985669136047363, + "logps/rejected": -3.5508294105529785, + "loss": 0.5224, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.6985669136047363, + "rewards/margins": 0.8522623777389526, + "rewards/rejected": -3.5508294105529785, + "sft_loss": 2.784538745880127, + "step": 3195 + }, + { + "epoch": 1.7126609800970063, + "grad_norm": 14.994844084135467, + "learning_rate": 4.6343371112903777e-07, + "logits/chosen": -0.20806710422039032, + "logits/rejected": -0.04904582351446152, + "logps/chosen": -2.8403031826019287, + "logps/rejected": -3.572239637374878, + "loss": 0.5785, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.8403031826019287, + "rewards/margins": 0.7319362759590149, + "rewards/rejected": -3.572239637374878, + "sft_loss": 2.9280142784118652, + "step": 3200 + }, + { + "epoch": 1.7126609800970063, + "eval_logits/chosen": 0.15309284627437592, + "eval_logits/rejected": 0.2598608136177063, + "eval_logps/chosen": -2.7303080558776855, + "eval_logps/rejected": -3.5084829330444336, + "eval_loss": 0.5548537373542786, + "eval_rewards/accuracies": 0.7240356206893921, + "eval_rewards/chosen": -2.7303080558776855, + "eval_rewards/margins": 0.778174877166748, + "eval_rewards/rejected": -3.5084829330444336, + "eval_runtime": 50.0441, + "eval_samples_per_second": 26.876, + "eval_sft_loss": 2.8826727867126465, + "eval_steps_per_second": 6.734, + "step": 3200 + }, + { + "epoch": 1.7153370128784078, + "grad_norm": 11.644774075451304, + "learning_rate": 4.618806587294291e-07, + "logits/chosen": -0.32497185468673706, + "logits/rejected": -0.18863452970981598, + "logps/chosen": -2.6446356773376465, + "logps/rejected": -3.518303394317627, + "loss": 0.5075, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.6446356773376465, + "rewards/margins": 0.8736675381660461, + "rewards/rejected": -3.518303394317627, + "sft_loss": 2.7527430057525635, + "step": 3205 + }, + { + "epoch": 1.7180130456598093, + "grad_norm": 16.781271448266775, + "learning_rate": 4.603279761638365e-07, + "logits/chosen": -0.30310553312301636, + "logits/rejected": -0.1773833930492401, + "logps/chosen": -2.711203098297119, + "logps/rejected": -3.3796043395996094, + "loss": 0.5956, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.711203098297119, + "rewards/margins": 0.6684012413024902, + "rewards/rejected": -3.3796043395996094, + "sft_loss": 2.832228899002075, + "step": 3210 + }, + { + "epoch": 1.720689078441211, + "grad_norm": 16.686412798729457, + "learning_rate": 4.5877567849639315e-07, + "logits/chosen": -0.22252731025218964, + "logits/rejected": -0.1045379638671875, + "logps/chosen": -2.7253940105438232, + "logps/rejected": -3.514838457107544, + "loss": 0.5377, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.7253940105438232, + "rewards/margins": 0.7894442677497864, + "rewards/rejected": -3.514838457107544, + "sft_loss": 2.815645217895508, + "step": 3215 + }, + { + "epoch": 1.7233651112226125, + "grad_norm": 13.158363402934352, + "learning_rate": 4.572237807874979e-07, + "logits/chosen": -0.2946009039878845, + "logits/rejected": -0.022943900898098946, + "logps/chosen": -2.9355263710021973, + "logps/rejected": -3.752810001373291, + "loss": 0.5893, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9355263710021973, + "rewards/margins": 0.8172832727432251, + "rewards/rejected": -3.752810001373291, + "sft_loss": 2.941917896270752, + "step": 3220 + }, + { + "epoch": 1.726041144004014, + "grad_norm": 12.757088855427256, + "learning_rate": 4.5567229809366895e-07, + "logits/chosen": -0.24457335472106934, + "logits/rejected": -0.10586385428905487, + "logps/chosen": -2.697072744369507, + "logps/rejected": -3.480059862136841, + "loss": 0.5324, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.697072744369507, + "rewards/margins": 0.7829869985580444, + "rewards/rejected": -3.480059862136841, + "sft_loss": 2.8617265224456787, + "step": 3225 + }, + { + "epoch": 1.7287171767854157, + "grad_norm": 21.939144034765643, + "learning_rate": 4.541212454673984e-07, + "logits/chosen": -0.27455341815948486, + "logits/rejected": -0.10001333057880402, + "logps/chosen": -2.7869138717651367, + "logps/rejected": -3.8302619457244873, + "loss": 0.5037, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7869138717651367, + "rewards/margins": 1.0433475971221924, + "rewards/rejected": -3.8302619457244873, + "sft_loss": 2.9080328941345215, + "step": 3230 + }, + { + "epoch": 1.7313932095668172, + "grad_norm": 15.158390831944335, + "learning_rate": 4.525706379570055e-07, + "logits/chosen": -0.28310340642929077, + "logits/rejected": -0.1736556589603424, + "logps/chosen": -2.7446627616882324, + "logps/rejected": -3.565007448196411, + "loss": 0.5144, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7446627616882324, + "rewards/margins": 0.8203444480895996, + "rewards/rejected": -3.565007448196411, + "sft_loss": 2.8578567504882812, + "step": 3235 + }, + { + "epoch": 1.7340692423482187, + "grad_norm": 13.765901993484606, + "learning_rate": 4.510204906064911e-07, + "logits/chosen": -0.227499321103096, + "logits/rejected": -0.08456512540578842, + "logps/chosen": -2.7374513149261475, + "logps/rejected": -3.654040813446045, + "loss": 0.5154, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7374513149261475, + "rewards/margins": 0.9165895581245422, + "rewards/rejected": -3.654040813446045, + "sft_loss": 2.739924192428589, + "step": 3240 + }, + { + "epoch": 1.7367452751296204, + "grad_norm": 15.445753485493217, + "learning_rate": 4.4947081845539177e-07, + "logits/chosen": -0.36142498254776, + "logits/rejected": -0.21744556725025177, + "logps/chosen": -2.792912244796753, + "logps/rejected": -3.547822952270508, + "loss": 0.5606, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.792912244796753, + "rewards/margins": 0.7549106478691101, + "rewards/rejected": -3.547822952270508, + "sft_loss": 2.836681365966797, + "step": 3245 + }, + { + "epoch": 1.739421307911022, + "grad_norm": 13.51159695538134, + "learning_rate": 4.479216365386333e-07, + "logits/chosen": -0.18365325033664703, + "logits/rejected": 0.014891237020492554, + "logps/chosen": -2.7513034343719482, + "logps/rejected": -3.632906675338745, + "loss": 0.5287, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.7513034343719482, + "rewards/margins": 0.8816030621528625, + "rewards/rejected": -3.632906675338745, + "sft_loss": 2.799229621887207, + "step": 3250 + }, + { + "epoch": 1.7420973406924234, + "grad_norm": 12.850674062353518, + "learning_rate": 4.4637295988638555e-07, + "logits/chosen": -0.22592787444591522, + "logits/rejected": -0.12565144896507263, + "logps/chosen": -2.7363648414611816, + "logps/rejected": -3.4756031036376953, + "loss": 0.5525, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7363648414611816, + "rewards/margins": 0.7392383813858032, + "rewards/rejected": -3.4756031036376953, + "sft_loss": 2.8443732261657715, + "step": 3255 + }, + { + "epoch": 1.744773373473825, + "grad_norm": 17.764794839669747, + "learning_rate": 4.4482480352391623e-07, + "logits/chosen": -0.29207858443260193, + "logits/rejected": -0.1414732187986374, + "logps/chosen": -2.731041193008423, + "logps/rejected": -3.5729775428771973, + "loss": 0.5142, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.731041193008423, + "rewards/margins": 0.841936469078064, + "rewards/rejected": -3.5729775428771973, + "sft_loss": 2.848785400390625, + "step": 3260 + }, + { + "epoch": 1.7474494062552266, + "grad_norm": 20.729812971010393, + "learning_rate": 4.4327718247144507e-07, + "logits/chosen": -0.20951029658317566, + "logits/rejected": -0.0795585885643959, + "logps/chosen": -2.695565700531006, + "logps/rejected": -3.5492007732391357, + "loss": 0.5336, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.695565700531006, + "rewards/margins": 0.8536350131034851, + "rewards/rejected": -3.5492007732391357, + "sft_loss": 2.873015880584717, + "step": 3265 + }, + { + "epoch": 1.750125439036628, + "grad_norm": 20.6746787536389, + "learning_rate": 4.417301117439984e-07, + "logits/chosen": -0.26037219166755676, + "logits/rejected": -0.13233566284179688, + "logps/chosen": -2.7734415531158447, + "logps/rejected": -3.595205783843994, + "loss": 0.5397, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.7734415531158447, + "rewards/margins": 0.8217641711235046, + "rewards/rejected": -3.595205783843994, + "sft_loss": 2.8323311805725098, + "step": 3270 + }, + { + "epoch": 1.7528014718180298, + "grad_norm": 15.537877865607411, + "learning_rate": 4.401836063512631e-07, + "logits/chosen": -0.31725236773490906, + "logits/rejected": 0.016078215092420578, + "logps/chosen": -2.695134162902832, + "logps/rejected": -3.618396282196045, + "loss": 0.4915, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.695134162902832, + "rewards/margins": 0.9232619404792786, + "rewards/rejected": -3.618396282196045, + "sft_loss": 2.8468456268310547, + "step": 3275 + }, + { + "epoch": 1.7554775045994313, + "grad_norm": 20.8433494712244, + "learning_rate": 4.386376812974413e-07, + "logits/chosen": -0.26147979497909546, + "logits/rejected": -0.17856861650943756, + "logps/chosen": -2.5003762245178223, + "logps/rejected": -3.4275104999542236, + "loss": 0.5124, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.5003762245178223, + "rewards/margins": 0.9271339178085327, + "rewards/rejected": -3.4275104999542236, + "sft_loss": 2.689016342163086, + "step": 3280 + }, + { + "epoch": 1.7581535373808328, + "grad_norm": 15.440325896504861, + "learning_rate": 4.370923515811048e-07, + "logits/chosen": -0.28607049584388733, + "logits/rejected": -0.051154326647520065, + "logps/chosen": -2.7072086334228516, + "logps/rejected": -3.6027824878692627, + "loss": 0.505, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7072086334228516, + "rewards/margins": 0.8955739736557007, + "rewards/rejected": -3.6027824878692627, + "sft_loss": 2.805786609649658, + "step": 3285 + }, + { + "epoch": 1.7608295701622345, + "grad_norm": 17.340041941861976, + "learning_rate": 4.35547632195049e-07, + "logits/chosen": -0.2097930908203125, + "logits/rejected": -0.08719705045223236, + "logps/chosen": -2.6455299854278564, + "logps/rejected": -3.431236982345581, + "loss": 0.5224, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6455299854278564, + "rewards/margins": 0.7857068777084351, + "rewards/rejected": -3.431236982345581, + "sft_loss": 2.7738378047943115, + "step": 3290 + }, + { + "epoch": 1.763505602943636, + "grad_norm": 16.365244845189448, + "learning_rate": 4.340035381261484e-07, + "logits/chosen": -0.2564403712749481, + "logits/rejected": -0.14944425225257874, + "logps/chosen": -2.9101572036743164, + "logps/rejected": -3.7674922943115234, + "loss": 0.5606, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.9101572036743164, + "rewards/margins": 0.8573352098464966, + "rewards/rejected": -3.7674922943115234, + "sft_loss": 2.965453863143921, + "step": 3295 + }, + { + "epoch": 1.7661816357250375, + "grad_norm": 22.915254342116935, + "learning_rate": 4.324600843552104e-07, + "logits/chosen": -0.34263506531715393, + "logits/rejected": -0.18292446434497833, + "logps/chosen": -3.0182735919952393, + "logps/rejected": -3.888817310333252, + "loss": 0.5581, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.0182735919952393, + "rewards/margins": 0.8705435991287231, + "rewards/rejected": -3.888817310333252, + "sft_loss": 3.1601388454437256, + "step": 3300 + }, + { + "epoch": 1.7688576685064392, + "grad_norm": 19.993626570967955, + "learning_rate": 4.309172858568302e-07, + "logits/chosen": -0.352492094039917, + "logits/rejected": -0.15054023265838623, + "logps/chosen": -2.95141339302063, + "logps/rejected": -3.8508782386779785, + "loss": 0.5433, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.95141339302063, + "rewards/margins": 0.8994649052619934, + "rewards/rejected": -3.8508782386779785, + "sft_loss": 3.0732500553131104, + "step": 3305 + }, + { + "epoch": 1.771533701287841, + "grad_norm": 14.797733830293513, + "learning_rate": 4.293751575992455e-07, + "logits/chosen": -0.14886128902435303, + "logits/rejected": -0.09603078663349152, + "logps/chosen": -2.9899444580078125, + "logps/rejected": -3.8496603965759277, + "loss": 0.5117, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9899444580078125, + "rewards/margins": 0.859715461730957, + "rewards/rejected": -3.8496603965759277, + "sft_loss": 3.140385150909424, + "step": 3310 + }, + { + "epoch": 1.7742097340692422, + "grad_norm": 29.066924240582257, + "learning_rate": 4.278337145441916e-07, + "logits/chosen": -0.313045471906662, + "logits/rejected": -0.12574300169944763, + "logps/chosen": -2.9519901275634766, + "logps/rejected": -3.7856521606445312, + "loss": 0.549, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9519901275634766, + "rewards/margins": 0.8336623311042786, + "rewards/rejected": -3.7856521606445312, + "sft_loss": 3.0585060119628906, + "step": 3315 + }, + { + "epoch": 1.776885766850644, + "grad_norm": 14.007089002617244, + "learning_rate": 4.262929716467556e-07, + "logits/chosen": -0.28222742676734924, + "logits/rejected": -0.036201111972332, + "logps/chosen": -2.8928723335266113, + "logps/rejected": -3.8801913261413574, + "loss": 0.5264, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8928723335266113, + "rewards/margins": 0.9873189926147461, + "rewards/rejected": -3.8801913261413574, + "sft_loss": 2.9914212226867676, + "step": 3320 + }, + { + "epoch": 1.7795617996320456, + "grad_norm": 14.434540253014186, + "learning_rate": 4.247529438552321e-07, + "logits/chosen": -0.28749576210975647, + "logits/rejected": -0.10014557838439941, + "logps/chosen": -2.850964069366455, + "logps/rejected": -3.6977570056915283, + "loss": 0.5512, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.850964069366455, + "rewards/margins": 0.846792995929718, + "rewards/rejected": -3.6977570056915283, + "sft_loss": 3.068385362625122, + "step": 3325 + }, + { + "epoch": 1.782237832413447, + "grad_norm": 15.998222025322567, + "learning_rate": 4.232136461109773e-07, + "logits/chosen": -0.2363944947719574, + "logits/rejected": -0.111115001142025, + "logps/chosen": -2.7223422527313232, + "logps/rejected": -3.6873691082000732, + "loss": 0.4948, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.7223422527313232, + "rewards/margins": 0.9650264978408813, + "rewards/rejected": -3.6873691082000732, + "sft_loss": 2.9275765419006348, + "step": 3330 + }, + { + "epoch": 1.7849138651948486, + "grad_norm": 26.943749067338448, + "learning_rate": 4.216750933482646e-07, + "logits/chosen": -0.25617361068725586, + "logits/rejected": -0.06846214830875397, + "logps/chosen": -2.9510550498962402, + "logps/rejected": -3.6926627159118652, + "loss": 0.5636, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.9510550498962402, + "rewards/margins": 0.741607666015625, + "rewards/rejected": -3.6926627159118652, + "sft_loss": 2.9970152378082275, + "step": 3335 + }, + { + "epoch": 1.7875898979762503, + "grad_norm": 37.80260581105259, + "learning_rate": 4.2013730049413986e-07, + "logits/chosen": -0.22949638962745667, + "logits/rejected": -0.05085518956184387, + "logps/chosen": -2.757495641708374, + "logps/rejected": -3.7478134632110596, + "loss": 0.4968, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.757495641708374, + "rewards/margins": 0.9903179407119751, + "rewards/rejected": -3.7478134632110596, + "sft_loss": 2.917391300201416, + "step": 3340 + }, + { + "epoch": 1.7902659307576518, + "grad_norm": 16.307364392346315, + "learning_rate": 4.1860028246827594e-07, + "logits/chosen": -0.25462061166763306, + "logits/rejected": -0.04345916956663132, + "logps/chosen": -2.6675283908843994, + "logps/rejected": -3.524871826171875, + "loss": 0.508, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.6675283908843994, + "rewards/margins": 0.8573434948921204, + "rewards/rejected": -3.524871826171875, + "sft_loss": 2.846837282180786, + "step": 3345 + }, + { + "epoch": 1.7929419635390533, + "grad_norm": 14.953511487287509, + "learning_rate": 4.170640541828285e-07, + "logits/chosen": -0.35961371660232544, + "logits/rejected": -0.21522529423236847, + "logps/chosen": -2.8548665046691895, + "logps/rejected": -3.6475882530212402, + "loss": 0.5401, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8548665046691895, + "rewards/margins": 0.7927218079566956, + "rewards/rejected": -3.6475882530212402, + "sft_loss": 2.979081630706787, + "step": 3350 + }, + { + "epoch": 1.795617996320455, + "grad_norm": 21.090179405489167, + "learning_rate": 4.1552863054229116e-07, + "logits/chosen": -0.1550268530845642, + "logits/rejected": -0.09269969165325165, + "logps/chosen": -2.971285581588745, + "logps/rejected": -3.6374497413635254, + "loss": 0.6211, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.971285581588745, + "rewards/margins": 0.6661639213562012, + "rewards/rejected": -3.6374497413635254, + "sft_loss": 3.0060386657714844, + "step": 3355 + }, + { + "epoch": 1.7982940291018565, + "grad_norm": 17.823650021351096, + "learning_rate": 4.139940264433508e-07, + "logits/chosen": -0.2913093864917755, + "logits/rejected": -0.08032882213592529, + "logps/chosen": -2.660451650619507, + "logps/rejected": -3.5529582500457764, + "loss": 0.5203, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.660451650619507, + "rewards/margins": 0.8925067782402039, + "rewards/rejected": -3.5529582500457764, + "sft_loss": 2.7594053745269775, + "step": 3360 + }, + { + "epoch": 1.800970061883258, + "grad_norm": 14.755037676642225, + "learning_rate": 4.1246025677474303e-07, + "logits/chosen": -0.3176870346069336, + "logits/rejected": -0.1266889125108719, + "logps/chosen": -2.8807005882263184, + "logps/rejected": -3.641491413116455, + "loss": 0.5444, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.8807005882263184, + "rewards/margins": 0.7607907652854919, + "rewards/rejected": -3.641491413116455, + "sft_loss": 3.023757219314575, + "step": 3365 + }, + { + "epoch": 1.8036460946646597, + "grad_norm": 16.466658197863744, + "learning_rate": 4.10927336417108e-07, + "logits/chosen": -0.30670759081840515, + "logits/rejected": -0.13745181262493134, + "logps/chosen": -2.8676910400390625, + "logps/rejected": -3.5250790119171143, + "loss": 0.6141, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.8676910400390625, + "rewards/margins": 0.6573879718780518, + "rewards/rejected": -3.5250790119171143, + "sft_loss": 2.9657163619995117, + "step": 3370 + }, + { + "epoch": 1.8063221274460612, + "grad_norm": 22.069927121834432, + "learning_rate": 4.093952802428457e-07, + "logits/chosen": -0.1460348516702652, + "logits/rejected": -0.06519370526075363, + "logps/chosen": -2.9532673358917236, + "logps/rejected": -3.681931734085083, + "loss": 0.6144, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.9532673358917236, + "rewards/margins": 0.7286645770072937, + "rewards/rejected": -3.681931734085083, + "sft_loss": 3.024280071258545, + "step": 3375 + }, + { + "epoch": 1.8089981602274627, + "grad_norm": 12.642912115114475, + "learning_rate": 4.0786410311597184e-07, + "logits/chosen": -0.3509058356285095, + "logits/rejected": -0.15340352058410645, + "logps/chosen": -2.7628684043884277, + "logps/rejected": -3.520265579223633, + "loss": 0.558, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.7628684043884277, + "rewards/margins": 0.7573972940444946, + "rewards/rejected": -3.520265579223633, + "sft_loss": 2.7993674278259277, + "step": 3380 + }, + { + "epoch": 1.8116741930088645, + "grad_norm": 14.091249291129772, + "learning_rate": 4.063338198919737e-07, + "logits/chosen": -0.31110242009162903, + "logits/rejected": -0.27701109647750854, + "logps/chosen": -2.7618496417999268, + "logps/rejected": -3.413426637649536, + "loss": 0.5889, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7618496417999268, + "rewards/margins": 0.6515769362449646, + "rewards/rejected": -3.413426637649536, + "sft_loss": 2.8429646492004395, + "step": 3385 + }, + { + "epoch": 1.814350225790266, + "grad_norm": 20.278779594228684, + "learning_rate": 4.0480444541766575e-07, + "logits/chosen": -0.28902843594551086, + "logits/rejected": -0.1411515474319458, + "logps/chosen": -2.742339611053467, + "logps/rejected": -3.423797607421875, + "loss": 0.5976, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.742339611053467, + "rewards/margins": 0.6814578771591187, + "rewards/rejected": -3.423797607421875, + "sft_loss": 2.7904200553894043, + "step": 3390 + }, + { + "epoch": 1.8170262585716674, + "grad_norm": 15.678556690957445, + "learning_rate": 4.0327599453104606e-07, + "logits/chosen": -0.3427790701389313, + "logits/rejected": -0.18898403644561768, + "logps/chosen": -2.5821168422698975, + "logps/rejected": -3.509188413619995, + "loss": 0.4932, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5821168422698975, + "rewards/margins": 0.9270719289779663, + "rewards/rejected": -3.509188413619995, + "sft_loss": 2.699791431427002, + "step": 3395 + }, + { + "epoch": 1.8197022913530692, + "grad_norm": 16.2002282046552, + "learning_rate": 4.017484820611514e-07, + "logits/chosen": -0.2501813769340515, + "logits/rejected": -0.1078735813498497, + "logps/chosen": -2.6638801097869873, + "logps/rejected": -3.426955461502075, + "loss": 0.531, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.6638801097869873, + "rewards/margins": 0.7630751729011536, + "rewards/rejected": -3.426955461502075, + "sft_loss": 2.7353997230529785, + "step": 3400 + }, + { + "epoch": 1.8223783241344707, + "grad_norm": 13.96144715875297, + "learning_rate": 4.002219228279148e-07, + "logits/chosen": -0.27894407510757446, + "logits/rejected": -0.12392063438892365, + "logps/chosen": -2.638472318649292, + "logps/rejected": -3.3759467601776123, + "loss": 0.5214, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.638472318649292, + "rewards/margins": 0.7374745607376099, + "rewards/rejected": -3.3759467601776123, + "sft_loss": 2.7297282218933105, + "step": 3405 + }, + { + "epoch": 1.8250543569158721, + "grad_norm": 16.286571712916317, + "learning_rate": 3.9869633164202045e-07, + "logits/chosen": -0.27272436022758484, + "logits/rejected": -0.020682910457253456, + "logps/chosen": -2.7578983306884766, + "logps/rejected": -3.5018246173858643, + "loss": 0.5205, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7578983306884766, + "rewards/margins": 0.7439260482788086, + "rewards/rejected": -3.5018246173858643, + "sft_loss": 2.786332607269287, + "step": 3410 + }, + { + "epoch": 1.8277303896972739, + "grad_norm": 21.677066865913943, + "learning_rate": 3.9717172330476077e-07, + "logits/chosen": -0.2839580774307251, + "logits/rejected": -0.1497466266155243, + "logps/chosen": -2.744100570678711, + "logps/rejected": -3.567244052886963, + "loss": 0.5608, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.744100570678711, + "rewards/margins": 0.8231437802314758, + "rewards/rejected": -3.567244052886963, + "sft_loss": 2.881943702697754, + "step": 3415 + }, + { + "epoch": 1.8304064224786754, + "grad_norm": 21.504461546621123, + "learning_rate": 3.956481126078927e-07, + "logits/chosen": -0.18854017555713654, + "logits/rejected": -0.06854341924190521, + "logps/chosen": -2.828106641769409, + "logps/rejected": -3.6131012439727783, + "loss": 0.6063, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.828106641769409, + "rewards/margins": 0.7849944829940796, + "rewards/rejected": -3.6131012439727783, + "sft_loss": 2.945675849914551, + "step": 3420 + }, + { + "epoch": 1.8330824552600768, + "grad_norm": 13.004971249919047, + "learning_rate": 3.941255143334937e-07, + "logits/chosen": -0.2789008915424347, + "logits/rejected": -0.21969743072986603, + "logps/chosen": -2.7043042182922363, + "logps/rejected": -3.5020999908447266, + "loss": 0.5391, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7043042182922363, + "rewards/margins": 0.7977955937385559, + "rewards/rejected": -3.5020999908447266, + "sft_loss": 2.7692298889160156, + "step": 3425 + }, + { + "epoch": 1.8357584880414786, + "grad_norm": 17.180145525706955, + "learning_rate": 3.9260394325381895e-07, + "logits/chosen": -0.2672729790210724, + "logits/rejected": -0.12829595804214478, + "logps/chosen": -2.6477010250091553, + "logps/rejected": -3.6499581336975098, + "loss": 0.4954, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6477010250091553, + "rewards/margins": 1.0022567510604858, + "rewards/rejected": -3.6499581336975098, + "sft_loss": 2.7258620262145996, + "step": 3430 + }, + { + "epoch": 1.83843452082288, + "grad_norm": 17.2549891404319, + "learning_rate": 3.9108341413115784e-07, + "logits/chosen": -0.2865196168422699, + "logits/rejected": -0.15484482049942017, + "logps/chosen": -2.614703416824341, + "logps/rejected": -3.5150234699249268, + "loss": 0.4677, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.614703416824341, + "rewards/margins": 0.9003196954727173, + "rewards/rejected": -3.5150234699249268, + "sft_loss": 2.7431111335754395, + "step": 3435 + }, + { + "epoch": 1.8411105536042816, + "grad_norm": 18.690750195324476, + "learning_rate": 3.895639417176905e-07, + "logits/chosen": -0.29971033334732056, + "logits/rejected": -0.1828007996082306, + "logps/chosen": -2.6536471843719482, + "logps/rejected": -3.5673458576202393, + "loss": 0.5552, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.6536471843719482, + "rewards/margins": 0.9136987924575806, + "rewards/rejected": -3.5673458576202393, + "sft_loss": 2.7793192863464355, + "step": 3440 + }, + { + "epoch": 1.8437865863856833, + "grad_norm": 14.611042814476118, + "learning_rate": 3.8804554075534497e-07, + "logits/chosen": -0.3003006875514984, + "logits/rejected": -0.07512084394693375, + "logps/chosen": -2.7247841358184814, + "logps/rejected": -3.641571521759033, + "loss": 0.5117, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7247841358184814, + "rewards/margins": 0.9167870283126831, + "rewards/rejected": -3.641571521759033, + "sft_loss": 2.859342098236084, + "step": 3445 + }, + { + "epoch": 1.8464626191670848, + "grad_norm": 16.380895729120613, + "learning_rate": 3.8652822597565403e-07, + "logits/chosen": -0.3656696379184723, + "logits/rejected": -0.17512385547161102, + "logps/chosen": -2.7375683784484863, + "logps/rejected": -3.6500372886657715, + "loss": 0.5116, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7375683784484863, + "rewards/margins": 0.9124690294265747, + "rewards/rejected": -3.6500372886657715, + "sft_loss": 2.8651626110076904, + "step": 3450 + }, + { + "epoch": 1.8491386519484863, + "grad_norm": 15.685753189854438, + "learning_rate": 3.850120120996123e-07, + "logits/chosen": -0.2697731852531433, + "logits/rejected": -0.05836169049143791, + "logps/chosen": -2.9335014820098877, + "logps/rejected": -3.754599094390869, + "loss": 0.5682, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9335014820098877, + "rewards/margins": 0.821097731590271, + "rewards/rejected": -3.754599094390869, + "sft_loss": 3.0067005157470703, + "step": 3455 + }, + { + "epoch": 1.851814684729888, + "grad_norm": 18.491827651861172, + "learning_rate": 3.8349691383753356e-07, + "logits/chosen": -0.16491341590881348, + "logits/rejected": -0.03908557817339897, + "logps/chosen": -2.7487964630126953, + "logps/rejected": -3.60390043258667, + "loss": 0.5333, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7487964630126953, + "rewards/margins": 0.8551036715507507, + "rewards/rejected": -3.60390043258667, + "sft_loss": 2.7997887134552, + "step": 3460 + }, + { + "epoch": 1.8544907175112895, + "grad_norm": 13.212186209386868, + "learning_rate": 3.819829458889078e-07, + "logits/chosen": -0.2802397608757019, + "logits/rejected": -0.15976294875144958, + "logps/chosen": -2.7203943729400635, + "logps/rejected": -3.5208003520965576, + "loss": 0.5436, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7203943729400635, + "rewards/margins": 0.8004060983657837, + "rewards/rejected": -3.5208003520965576, + "sft_loss": 2.802574634552002, + "step": 3465 + }, + { + "epoch": 1.857166750292691, + "grad_norm": 15.71739129001188, + "learning_rate": 3.804701229422585e-07, + "logits/chosen": -0.2877870202064514, + "logits/rejected": -0.18142752349376678, + "logps/chosen": -2.883432388305664, + "logps/rejected": -3.7455458641052246, + "loss": 0.5411, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.883432388305664, + "rewards/margins": 0.862113356590271, + "rewards/rejected": -3.7455458641052246, + "sft_loss": 2.970731496810913, + "step": 3470 + }, + { + "epoch": 1.8598427830740927, + "grad_norm": 18.59449367259967, + "learning_rate": 3.789584596750007e-07, + "logits/chosen": -0.274854838848114, + "logits/rejected": -0.21188096702098846, + "logps/chosen": -2.752173662185669, + "logps/rejected": -3.550297975540161, + "loss": 0.5405, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.752173662185669, + "rewards/margins": 0.7981240749359131, + "rewards/rejected": -3.550297975540161, + "sft_loss": 2.8199985027313232, + "step": 3475 + }, + { + "epoch": 1.8625188158554944, + "grad_norm": 13.948609692358458, + "learning_rate": 3.77447970753298e-07, + "logits/chosen": -0.17315642535686493, + "logits/rejected": -0.1392008364200592, + "logps/chosen": -2.801203489303589, + "logps/rejected": -3.6079413890838623, + "loss": 0.5448, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.801203489303589, + "rewards/margins": 0.8067380785942078, + "rewards/rejected": -3.6079413890838623, + "sft_loss": 2.931431531906128, + "step": 3480 + }, + { + "epoch": 1.8651948486368957, + "grad_norm": 17.599399947889275, + "learning_rate": 3.7593867083192057e-07, + "logits/chosen": -0.23535147309303284, + "logits/rejected": -0.07883518189191818, + "logps/chosen": -2.6835312843322754, + "logps/rejected": -3.50940203666687, + "loss": 0.5464, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6835312843322754, + "rewards/margins": 0.8258708715438843, + "rewards/rejected": -3.50940203666687, + "sft_loss": 2.843803882598877, + "step": 3485 + }, + { + "epoch": 1.8678708814182974, + "grad_norm": 16.88396980667088, + "learning_rate": 3.7443057455410276e-07, + "logits/chosen": -0.20262908935546875, + "logits/rejected": -0.07145232707262039, + "logps/chosen": -2.6471047401428223, + "logps/rejected": -3.5645880699157715, + "loss": 0.479, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6471047401428223, + "rewards/margins": 0.9174835085868835, + "rewards/rejected": -3.5645880699157715, + "sft_loss": 2.833148717880249, + "step": 3490 + }, + { + "epoch": 1.870546914199699, + "grad_norm": 13.969009365364938, + "learning_rate": 3.7292369655140145e-07, + "logits/chosen": -0.33862629532814026, + "logits/rejected": -0.15098699927330017, + "logps/chosen": -2.782113552093506, + "logps/rejected": -3.5242133140563965, + "loss": 0.5138, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.782113552093506, + "rewards/margins": 0.7420998811721802, + "rewards/rejected": -3.5242133140563965, + "sft_loss": 2.9555227756500244, + "step": 3495 + }, + { + "epoch": 1.8732229469811004, + "grad_norm": 14.151770729569519, + "learning_rate": 3.714180514435534e-07, + "logits/chosen": -0.19998544454574585, + "logits/rejected": -0.02457023784518242, + "logps/chosen": -2.7103729248046875, + "logps/rejected": -3.6097068786621094, + "loss": 0.5117, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7103729248046875, + "rewards/margins": 0.8993337750434875, + "rewards/rejected": -3.6097068786621094, + "sft_loss": 2.882556200027466, + "step": 3500 + }, + { + "epoch": 1.875898979762502, + "grad_norm": 23.44049071726931, + "learning_rate": 3.6991365383833426e-07, + "logits/chosen": -0.30788227915763855, + "logits/rejected": -0.13961350917816162, + "logps/chosen": -2.7522499561309814, + "logps/rejected": -3.6674914360046387, + "loss": 0.5067, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7522499561309814, + "rewards/margins": 0.9152417182922363, + "rewards/rejected": -3.6674914360046387, + "sft_loss": 2.9316442012786865, + "step": 3505 + }, + { + "epoch": 1.8785750125439038, + "grad_norm": 20.91735152160501, + "learning_rate": 3.684105183314162e-07, + "logits/chosen": -0.2761691212654114, + "logits/rejected": -0.177979975938797, + "logps/chosen": -2.6528451442718506, + "logps/rejected": -3.4922308921813965, + "loss": 0.4983, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6528451442718506, + "rewards/margins": 0.839385986328125, + "rewards/rejected": -3.4922308921813965, + "sft_loss": 2.755087375640869, + "step": 3510 + }, + { + "epoch": 1.881251045325305, + "grad_norm": 21.367028365960795, + "learning_rate": 3.669086595062263e-07, + "logits/chosen": -0.2699565291404724, + "logits/rejected": -0.05399322509765625, + "logps/chosen": -2.869797468185425, + "logps/rejected": -3.72851300239563, + "loss": 0.5309, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.869797468185425, + "rewards/margins": 0.8587149381637573, + "rewards/rejected": -3.72851300239563, + "sft_loss": 2.976771116256714, + "step": 3515 + }, + { + "epoch": 1.8839270781067068, + "grad_norm": 13.942471281442186, + "learning_rate": 3.654080919338056e-07, + "logits/chosen": -0.3158034682273865, + "logits/rejected": -0.1378587782382965, + "logps/chosen": -2.7531518936157227, + "logps/rejected": -3.6037163734436035, + "loss": 0.5322, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.7531518936157227, + "rewards/margins": 0.8505643606185913, + "rewards/rejected": -3.6037163734436035, + "sft_loss": 2.949075222015381, + "step": 3520 + }, + { + "epoch": 1.8866031108881085, + "grad_norm": 17.656841624151504, + "learning_rate": 3.639088301726673e-07, + "logits/chosen": -0.24910321831703186, + "logits/rejected": -0.03514650836586952, + "logps/chosen": -2.827221155166626, + "logps/rejected": -3.6355996131896973, + "loss": 0.5644, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.827221155166626, + "rewards/margins": 0.8083783984184265, + "rewards/rejected": -3.6355996131896973, + "sft_loss": 2.9915034770965576, + "step": 3525 + }, + { + "epoch": 1.88927914366951, + "grad_norm": 16.270674214554624, + "learning_rate": 3.624108887686556e-07, + "logits/chosen": -0.2466723918914795, + "logits/rejected": -0.15336759388446808, + "logps/chosen": -2.871335744857788, + "logps/rejected": -3.689772367477417, + "loss": 0.5225, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.871335744857788, + "rewards/margins": 0.8184367418289185, + "rewards/rejected": -3.689772367477417, + "sft_loss": 3.0920445919036865, + "step": 3530 + }, + { + "epoch": 1.8919551764509115, + "grad_norm": 12.425537120157095, + "learning_rate": 3.6091428225480433e-07, + "logits/chosen": -0.33766406774520874, + "logits/rejected": -0.18579557538032532, + "logps/chosen": -2.852994680404663, + "logps/rejected": -3.759185791015625, + "loss": 0.5285, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.852994680404663, + "rewards/margins": 0.9061908721923828, + "rewards/rejected": -3.759185791015625, + "sft_loss": 3.0922865867614746, + "step": 3535 + }, + { + "epoch": 1.8946312092323132, + "grad_norm": 20.041944841505437, + "learning_rate": 3.5941902515119674e-07, + "logits/chosen": -0.31889665126800537, + "logits/rejected": -0.0808551162481308, + "logps/chosen": -2.8681604862213135, + "logps/rejected": -3.5937094688415527, + "loss": 0.5752, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.8681604862213135, + "rewards/margins": 0.7255493402481079, + "rewards/rejected": -3.5937094688415527, + "sft_loss": 3.0391573905944824, + "step": 3540 + }, + { + "epoch": 1.8973072420137147, + "grad_norm": 19.31081687769959, + "learning_rate": 3.5792513196482373e-07, + "logits/chosen": -0.4244014620780945, + "logits/rejected": -0.1237758994102478, + "logps/chosen": -2.7603707313537598, + "logps/rejected": -3.643112897872925, + "loss": 0.4808, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7603707313537598, + "rewards/margins": 0.8827424049377441, + "rewards/rejected": -3.643112897872925, + "sft_loss": 2.8689522743225098, + "step": 3545 + }, + { + "epoch": 1.8999832747951162, + "grad_norm": 19.63782385553952, + "learning_rate": 3.5643261718944346e-07, + "logits/chosen": -0.1956585794687271, + "logits/rejected": -0.09507829695940018, + "logps/chosen": -2.851369857788086, + "logps/rejected": -3.5696005821228027, + "loss": 0.5755, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.851369857788086, + "rewards/margins": 0.7182309627532959, + "rewards/rejected": -3.5696005821228027, + "sft_loss": 2.826671600341797, + "step": 3550 + }, + { + "epoch": 1.902659307576518, + "grad_norm": 13.299258594267988, + "learning_rate": 3.5494149530544087e-07, + "logits/chosen": -0.344322144985199, + "logits/rejected": -0.22166843712329865, + "logps/chosen": -2.7484452724456787, + "logps/rejected": -3.57319974899292, + "loss": 0.5759, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.7484452724456787, + "rewards/margins": 0.8247542381286621, + "rewards/rejected": -3.57319974899292, + "sft_loss": 2.8463919162750244, + "step": 3555 + }, + { + "epoch": 1.9053353403579194, + "grad_norm": 20.45364215505241, + "learning_rate": 3.534517807796871e-07, + "logits/chosen": -0.27780282497406006, + "logits/rejected": -0.15248972177505493, + "logps/chosen": -2.6875338554382324, + "logps/rejected": -3.4828343391418457, + "loss": 0.5208, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.6875338554382324, + "rewards/margins": 0.7953005433082581, + "rewards/rejected": -3.4828343391418457, + "sft_loss": 2.8021836280822754, + "step": 3560 + }, + { + "epoch": 1.908011373139321, + "grad_norm": 15.0201163079026, + "learning_rate": 3.519634880653988e-07, + "logits/chosen": -0.2638780474662781, + "logits/rejected": -0.15065474808216095, + "logps/chosen": -2.8345370292663574, + "logps/rejected": -3.8164570331573486, + "loss": 0.5003, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8345370292663574, + "rewards/margins": 0.9819199442863464, + "rewards/rejected": -3.8164570331573486, + "sft_loss": 2.997745990753174, + "step": 3565 + }, + { + "epoch": 1.9106874059207226, + "grad_norm": 13.740822823904988, + "learning_rate": 3.504766316019987e-07, + "logits/chosen": -0.32493412494659424, + "logits/rejected": -0.12234127521514893, + "logps/chosen": -2.6589608192443848, + "logps/rejected": -3.5269927978515625, + "loss": 0.4908, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.6589608192443848, + "rewards/margins": 0.8680317997932434, + "rewards/rejected": -3.5269927978515625, + "sft_loss": 2.7340474128723145, + "step": 3570 + }, + { + "epoch": 1.913363438702124, + "grad_norm": 12.774152902110295, + "learning_rate": 3.489912258149745e-07, + "logits/chosen": -0.21357354521751404, + "logits/rejected": -0.07995374500751495, + "logps/chosen": -2.7149415016174316, + "logps/rejected": -3.6193466186523438, + "loss": 0.5332, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.7149415016174316, + "rewards/margins": 0.9044052362442017, + "rewards/rejected": -3.6193466186523438, + "sft_loss": 2.7866055965423584, + "step": 3575 + }, + { + "epoch": 1.9160394714835256, + "grad_norm": 15.600413109989216, + "learning_rate": 3.475072851157397e-07, + "logits/chosen": -0.2804366648197174, + "logits/rejected": -0.20661187171936035, + "logps/chosen": -2.737346649169922, + "logps/rejected": -3.6407647132873535, + "loss": 0.497, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.737346649169922, + "rewards/margins": 0.9034177660942078, + "rewards/rejected": -3.6407647132873535, + "sft_loss": 2.8912928104400635, + "step": 3580 + }, + { + "epoch": 1.9187155042649273, + "grad_norm": 13.983136798774643, + "learning_rate": 3.460248239014936e-07, + "logits/chosen": -0.21731364727020264, + "logits/rejected": -0.1391337811946869, + "logps/chosen": -2.885457992553711, + "logps/rejected": -3.7557575702667236, + "loss": 0.5071, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.885457992553711, + "rewards/margins": 0.8702995181083679, + "rewards/rejected": -3.7557575702667236, + "sft_loss": 3.06406569480896, + "step": 3585 + }, + { + "epoch": 1.9213915370463288, + "grad_norm": 15.846021214008868, + "learning_rate": 3.4454385655508134e-07, + "logits/chosen": -0.2161417454481125, + "logits/rejected": -0.1433815360069275, + "logps/chosen": -2.861445426940918, + "logps/rejected": -3.537444591522217, + "loss": 0.5979, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.861445426940918, + "rewards/margins": 0.6759993433952332, + "rewards/rejected": -3.537444591522217, + "sft_loss": 2.9814562797546387, + "step": 3590 + }, + { + "epoch": 1.9240675698277303, + "grad_norm": 10.787969643568355, + "learning_rate": 3.4306439744485447e-07, + "logits/chosen": -0.3371451199054718, + "logits/rejected": -0.11801593005657196, + "logps/chosen": -2.8876984119415283, + "logps/rejected": -3.7116875648498535, + "loss": 0.5446, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.8876984119415283, + "rewards/margins": 0.82398921251297, + "rewards/rejected": -3.7116875648498535, + "sft_loss": 2.9036552906036377, + "step": 3595 + }, + { + "epoch": 1.926743602609132, + "grad_norm": 17.28603451468424, + "learning_rate": 3.415864609245322e-07, + "logits/chosen": -0.21108360588550568, + "logits/rejected": -0.030581191182136536, + "logps/chosen": -2.9036574363708496, + "logps/rejected": -3.7772459983825684, + "loss": 0.5649, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.9036574363708496, + "rewards/margins": 0.8735888600349426, + "rewards/rejected": -3.7772459983825684, + "sft_loss": 3.0899136066436768, + "step": 3600 + }, + { + "epoch": 1.926743602609132, + "eval_logits/chosen": 0.0982469990849495, + "eval_logits/rejected": 0.20617729425430298, + "eval_logps/chosen": -2.8066318035125732, + "eval_logps/rejected": -3.636258840560913, + "eval_loss": 0.5509018301963806, + "eval_rewards/accuracies": 0.7240356206893921, + "eval_rewards/chosen": -2.8066318035125732, + "eval_rewards/margins": 0.8296267986297607, + "eval_rewards/rejected": -3.636258840560913, + "eval_runtime": 50.0546, + "eval_samples_per_second": 26.871, + "eval_sft_loss": 2.9742238521575928, + "eval_steps_per_second": 6.733, + "step": 3600 + }, + { + "epoch": 1.9294196353905335, + "grad_norm": 15.491129944188577, + "learning_rate": 3.401100613330605e-07, + "logits/chosen": -0.27998265624046326, + "logits/rejected": -0.2484864443540573, + "logps/chosen": -2.683946132659912, + "logps/rejected": -3.4610118865966797, + "loss": 0.5354, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.683946132659912, + "rewards/margins": 0.7770653963088989, + "rewards/rejected": -3.4610118865966797, + "sft_loss": 2.8647847175598145, + "step": 3605 + }, + { + "epoch": 1.932095668171935, + "grad_norm": 16.588855650315963, + "learning_rate": 3.3863521299447514e-07, + "logits/chosen": -0.3101789355278015, + "logits/rejected": -0.1497582644224167, + "logps/chosen": -2.7263779640197754, + "logps/rejected": -3.524534225463867, + "loss": 0.51, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7263779640197754, + "rewards/margins": 0.7981564998626709, + "rewards/rejected": -3.524534225463867, + "sft_loss": 2.9038267135620117, + "step": 3610 + }, + { + "epoch": 1.9347717009533367, + "grad_norm": 16.739263575752542, + "learning_rate": 3.371619302177609e-07, + "logits/chosen": -0.21048936247825623, + "logits/rejected": -0.06801290065050125, + "logps/chosen": -2.8764920234680176, + "logps/rejected": -3.7195560932159424, + "loss": 0.5268, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.8764920234680176, + "rewards/margins": 0.84306401014328, + "rewards/rejected": -3.7195560932159424, + "sft_loss": 2.958024024963379, + "step": 3615 + }, + { + "epoch": 1.9374477337347382, + "grad_norm": 18.613139691100287, + "learning_rate": 3.3569022729671393e-07, + "logits/chosen": -0.2533569037914276, + "logits/rejected": -0.13912202417850494, + "logps/chosen": -2.958700656890869, + "logps/rejected": -3.6944549083709717, + "loss": 0.5598, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.958700656890869, + "rewards/margins": 0.7357538342475891, + "rewards/rejected": -3.6944549083709717, + "sft_loss": 3.131559371948242, + "step": 3620 + }, + { + "epoch": 1.9401237665161397, + "grad_norm": 17.01635297334278, + "learning_rate": 3.342201185098024e-07, + "logits/chosen": -0.15639057755470276, + "logits/rejected": -0.16117417812347412, + "logps/chosen": -2.762528896331787, + "logps/rejected": -3.521737575531006, + "loss": 0.5339, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.762528896331787, + "rewards/margins": 0.7592090368270874, + "rewards/rejected": -3.521737575531006, + "sft_loss": 2.86425518989563, + "step": 3625 + }, + { + "epoch": 1.9427997992975414, + "grad_norm": 18.155698170544465, + "learning_rate": 3.3275161812002807e-07, + "logits/chosen": -0.26868265867233276, + "logits/rejected": -0.22519557178020477, + "logps/chosen": -2.8326733112335205, + "logps/rejected": -3.687572479248047, + "loss": 0.5588, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8326733112335205, + "rewards/margins": 0.8548991084098816, + "rewards/rejected": -3.687572479248047, + "sft_loss": 3.030932664871216, + "step": 3630 + }, + { + "epoch": 1.945475832078943, + "grad_norm": 15.858042608953705, + "learning_rate": 3.312847403747883e-07, + "logits/chosen": -0.3185202479362488, + "logits/rejected": -0.1940157264471054, + "logps/chosen": -2.7259998321533203, + "logps/rejected": -3.6320412158966064, + "loss": 0.5033, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7259998321533203, + "rewards/margins": 0.9060415029525757, + "rewards/rejected": -3.6320412158966064, + "sft_loss": 2.879058361053467, + "step": 3635 + }, + { + "epoch": 1.9481518648603444, + "grad_norm": 15.658328201630576, + "learning_rate": 3.2981949950573733e-07, + "logits/chosen": -0.27361616492271423, + "logits/rejected": -0.17969655990600586, + "logps/chosen": -2.839315891265869, + "logps/rejected": -3.5681090354919434, + "loss": 0.5349, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.839315891265869, + "rewards/margins": 0.7287934422492981, + "rewards/rejected": -3.5681090354919434, + "sft_loss": 2.993678569793701, + "step": 3640 + }, + { + "epoch": 1.9508278976417461, + "grad_norm": 14.139512973464816, + "learning_rate": 3.283559097286486e-07, + "logits/chosen": -0.2828649878501892, + "logits/rejected": -0.14037677645683289, + "logps/chosen": -2.81725811958313, + "logps/rejected": -3.4038796424865723, + "loss": 0.5832, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.81725811958313, + "rewards/margins": 0.5866214036941528, + "rewards/rejected": -3.4038796424865723, + "sft_loss": 2.944801092147827, + "step": 3645 + }, + { + "epoch": 1.9535039304231478, + "grad_norm": 17.638167050426947, + "learning_rate": 3.268939852432765e-07, + "logits/chosen": -0.332253634929657, + "logits/rejected": -0.22307121753692627, + "logps/chosen": -2.9059665203094482, + "logps/rejected": -3.5740585327148438, + "loss": 0.5805, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.9059665203094482, + "rewards/margins": 0.6680923700332642, + "rewards/rejected": -3.5740585327148438, + "sft_loss": 3.0662026405334473, + "step": 3650 + }, + { + "epoch": 1.9561799632045491, + "grad_norm": 20.879019950964675, + "learning_rate": 3.254337402332187e-07, + "logits/chosen": -0.26796287298202515, + "logits/rejected": -0.145319402217865, + "logps/chosen": -2.836381673812866, + "logps/rejected": -3.6222052574157715, + "loss": 0.5546, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.836381673812866, + "rewards/margins": 0.7858238220214844, + "rewards/rejected": -3.6222052574157715, + "sft_loss": 2.9162559509277344, + "step": 3655 + }, + { + "epoch": 1.9588559959859508, + "grad_norm": 15.735016333049394, + "learning_rate": 3.239751888657788e-07, + "logits/chosen": -0.28387266397476196, + "logits/rejected": -0.12554284930229187, + "logps/chosen": -2.83955717086792, + "logps/rejected": -3.6048247814178467, + "loss": 0.5498, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.83955717086792, + "rewards/margins": 0.7652674913406372, + "rewards/rejected": -3.6048247814178467, + "sft_loss": 3.0139026641845703, + "step": 3660 + }, + { + "epoch": 1.9615320287673526, + "grad_norm": 14.179116175137064, + "learning_rate": 3.2251834529182856e-07, + "logits/chosen": -0.2856293320655823, + "logits/rejected": -0.17432229220867157, + "logps/chosen": -2.6420066356658936, + "logps/rejected": -3.5150704383850098, + "loss": 0.5291, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.6420066356658936, + "rewards/margins": 0.8730632066726685, + "rewards/rejected": -3.5150704383850098, + "sft_loss": 2.7040858268737793, + "step": 3665 + }, + { + "epoch": 1.9642080615487538, + "grad_norm": 13.655424318476642, + "learning_rate": 3.2106322364567075e-07, + "logits/chosen": -0.2945409119129181, + "logits/rejected": -0.15200811624526978, + "logps/chosen": -2.7398436069488525, + "logps/rejected": -3.718191623687744, + "loss": 0.4739, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7398436069488525, + "rewards/margins": 0.978347897529602, + "rewards/rejected": -3.718191623687744, + "sft_loss": 2.9533541202545166, + "step": 3670 + }, + { + "epoch": 1.9668840943301555, + "grad_norm": 14.895341760219607, + "learning_rate": 3.1960983804490183e-07, + "logits/chosen": -0.3262266516685486, + "logits/rejected": -0.16349753737449646, + "logps/chosen": -2.935656785964966, + "logps/rejected": -3.808778762817383, + "loss": 0.5753, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.935656785964966, + "rewards/margins": 0.8731220960617065, + "rewards/rejected": -3.808778762817383, + "sft_loss": 3.0750489234924316, + "step": 3675 + }, + { + "epoch": 1.9695601271115573, + "grad_norm": 12.97760841860067, + "learning_rate": 3.1815820259027537e-07, + "logits/chosen": -0.2657465636730194, + "logits/rejected": -0.12304908037185669, + "logps/chosen": -2.537490129470825, + "logps/rejected": -3.3631515502929688, + "loss": 0.4959, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.537490129470825, + "rewards/margins": 0.825661301612854, + "rewards/rejected": -3.3631515502929688, + "sft_loss": 2.72019362449646, + "step": 3680 + }, + { + "epoch": 1.9722361598929585, + "grad_norm": 19.859032127404333, + "learning_rate": 3.16708331365565e-07, + "logits/chosen": -0.3163455128669739, + "logits/rejected": -0.21008479595184326, + "logps/chosen": -2.7686800956726074, + "logps/rejected": -3.6506667137145996, + "loss": 0.5407, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7686800956726074, + "rewards/margins": 0.8819867968559265, + "rewards/rejected": -3.6506667137145996, + "sft_loss": 2.9922776222229004, + "step": 3685 + }, + { + "epoch": 1.9749121926743602, + "grad_norm": 13.46626137783446, + "learning_rate": 3.152602384374275e-07, + "logits/chosen": -0.27424660325050354, + "logits/rejected": -0.0691673532128334, + "logps/chosen": -2.855818271636963, + "logps/rejected": -3.652209758758545, + "loss": 0.5403, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.855818271636963, + "rewards/margins": 0.7963913083076477, + "rewards/rejected": -3.652209758758545, + "sft_loss": 2.939704418182373, + "step": 3690 + }, + { + "epoch": 1.977588225455762, + "grad_norm": 16.39196142560876, + "learning_rate": 3.1381393785526697e-07, + "logits/chosen": -0.26725202798843384, + "logits/rejected": -0.17296263575553894, + "logps/chosen": -2.8052046298980713, + "logps/rejected": -3.607107639312744, + "loss": 0.5424, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8052046298980713, + "rewards/margins": 0.8019029498100281, + "rewards/rejected": -3.607107639312744, + "sft_loss": 2.9598681926727295, + "step": 3695 + }, + { + "epoch": 1.9802642582371635, + "grad_norm": 14.226033481114372, + "learning_rate": 3.123694436510979e-07, + "logits/chosen": -0.22751012444496155, + "logits/rejected": -0.08938495814800262, + "logps/chosen": -2.6386122703552246, + "logps/rejected": -3.4772655963897705, + "loss": 0.5082, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6386122703552246, + "rewards/margins": 0.8386530876159668, + "rewards/rejected": -3.4772655963897705, + "sft_loss": 2.7765183448791504, + "step": 3700 + }, + { + "epoch": 1.982940291018565, + "grad_norm": 19.205617348945218, + "learning_rate": 3.1092676983940946e-07, + "logits/chosen": -0.25414150953292847, + "logits/rejected": -0.16614079475402832, + "logps/chosen": -2.6993870735168457, + "logps/rejected": -3.6588058471679688, + "loss": 0.4926, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.6993870735168457, + "rewards/margins": 0.9594192504882812, + "rewards/rejected": -3.6588058471679688, + "sft_loss": 2.820012092590332, + "step": 3705 + }, + { + "epoch": 1.9856163237999667, + "grad_norm": 14.684077770215215, + "learning_rate": 3.094859304170293e-07, + "logits/chosen": -0.12193117290735245, + "logits/rejected": -0.07238110154867172, + "logps/chosen": -2.7730677127838135, + "logps/rejected": -3.5438296794891357, + "loss": 0.5664, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7730677127838135, + "rewards/margins": 0.7707620859146118, + "rewards/rejected": -3.5438296794891357, + "sft_loss": 2.9567716121673584, + "step": 3710 + }, + { + "epoch": 1.9882923565813682, + "grad_norm": 16.579906174692645, + "learning_rate": 3.0804693936298795e-07, + "logits/chosen": -0.20543989539146423, + "logits/rejected": -0.14003416895866394, + "logps/chosen": -2.792309522628784, + "logps/rejected": -3.7953097820281982, + "loss": 0.4912, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.792309522628784, + "rewards/margins": 1.0030001401901245, + "rewards/rejected": -3.7953097820281982, + "sft_loss": 2.9646449089050293, + "step": 3715 + }, + { + "epoch": 1.9909683893627697, + "grad_norm": 16.384158900838727, + "learning_rate": 3.066098106383826e-07, + "logits/chosen": -0.25673890113830566, + "logits/rejected": -0.14675481617450714, + "logps/chosen": -2.748898983001709, + "logps/rejected": -3.506723403930664, + "loss": 0.5451, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.748898983001709, + "rewards/margins": 0.757824182510376, + "rewards/rejected": -3.506723403930664, + "sft_loss": 2.8113455772399902, + "step": 3720 + }, + { + "epoch": 1.9936444221441714, + "grad_norm": 15.376392346664371, + "learning_rate": 3.0517455818624263e-07, + "logits/chosen": -0.32976996898651123, + "logits/rejected": -0.22071564197540283, + "logps/chosen": -2.7523465156555176, + "logps/rejected": -3.6334640979766846, + "loss": 0.5007, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7523465156555176, + "rewards/margins": 0.8811177015304565, + "rewards/rejected": -3.6334640979766846, + "sft_loss": 2.9830379486083984, + "step": 3725 + }, + { + "epoch": 1.9963204549255729, + "grad_norm": 13.25572921129167, + "learning_rate": 3.037411959313936e-07, + "logits/chosen": -0.2685486674308777, + "logits/rejected": -0.11623702943325043, + "logps/chosen": -2.7400898933410645, + "logps/rejected": -3.5789923667907715, + "loss": 0.4804, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7400898933410645, + "rewards/margins": 0.838902473449707, + "rewards/rejected": -3.5789923667907715, + "sft_loss": 2.8983848094940186, + "step": 3730 + }, + { + "epoch": 1.9989964877069744, + "grad_norm": 16.15165716924802, + "learning_rate": 3.023097377803224e-07, + "logits/chosen": -0.198993980884552, + "logits/rejected": -0.10462522506713867, + "logps/chosen": -2.9255807399749756, + "logps/rejected": -3.707984447479248, + "loss": 0.5792, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9255807399749756, + "rewards/margins": 0.7824038863182068, + "rewards/rejected": -3.707984447479248, + "sft_loss": 3.0322368144989014, + "step": 3735 + }, + { + "epoch": 2.001672520488376, + "grad_norm": 16.0411973409057, + "learning_rate": 3.008801976210423e-07, + "logits/chosen": -0.1713380515575409, + "logits/rejected": -0.09565655887126923, + "logps/chosen": -3.05859637260437, + "logps/rejected": -3.847487688064575, + "loss": 0.5326, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.05859637260437, + "rewards/margins": 0.7888910174369812, + "rewards/rejected": -3.847487688064575, + "sft_loss": 3.13173246383667, + "step": 3740 + }, + { + "epoch": 2.0043485532697773, + "grad_norm": 14.16234994601713, + "learning_rate": 2.994525893229581e-07, + "logits/chosen": -0.21377214789390564, + "logits/rejected": -0.1134125143289566, + "logps/chosen": -2.901991844177246, + "logps/rejected": -3.988110065460205, + "loss": 0.4429, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.901991844177246, + "rewards/margins": 1.086118459701538, + "rewards/rejected": -3.988110065460205, + "sft_loss": 3.0303547382354736, + "step": 3745 + }, + { + "epoch": 2.007024586051179, + "grad_norm": 12.244453619004945, + "learning_rate": 2.98026926736732e-07, + "logits/chosen": -0.306434690952301, + "logits/rejected": -0.18747636675834656, + "logps/chosen": -2.716519832611084, + "logps/rejected": -3.7627689838409424, + "loss": 0.4652, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.716519832611084, + "rewards/margins": 1.0462491512298584, + "rewards/rejected": -3.7627689838409424, + "sft_loss": 2.938028335571289, + "step": 3750 + }, + { + "epoch": 2.0097006188325808, + "grad_norm": 15.423811405373074, + "learning_rate": 2.9660322369414846e-07, + "logits/chosen": -0.2862057685852051, + "logits/rejected": -0.11905969679355621, + "logps/chosen": -2.9096438884735107, + "logps/rejected": -3.999202251434326, + "loss": 0.4579, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.9096438884735107, + "rewards/margins": 1.0895582437515259, + "rewards/rejected": -3.999202251434326, + "sft_loss": 3.161621332168579, + "step": 3755 + }, + { + "epoch": 2.0123766516139825, + "grad_norm": 12.428807927419921, + "learning_rate": 2.9518149400798063e-07, + "logits/chosen": -0.33346086740493774, + "logits/rejected": -0.26415562629699707, + "logps/chosen": -2.9282045364379883, + "logps/rejected": -4.097114086151123, + "loss": 0.4487, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9282045364379883, + "rewards/margins": 1.1689093112945557, + "rewards/rejected": -4.097114086151123, + "sft_loss": 3.1366372108459473, + "step": 3760 + }, + { + "epoch": 2.0150526843953838, + "grad_norm": 22.392331618088818, + "learning_rate": 2.9376175147185633e-07, + "logits/chosen": -0.2757893204689026, + "logits/rejected": -0.04873190447688103, + "logps/chosen": -3.1032681465148926, + "logps/rejected": -4.14900016784668, + "loss": 0.5084, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.1032681465148926, + "rewards/margins": 1.0457319021224976, + "rewards/rejected": -4.14900016784668, + "sft_loss": 3.2254319190979004, + "step": 3765 + }, + { + "epoch": 2.0177287171767855, + "grad_norm": 23.304653348639043, + "learning_rate": 2.9234400986012376e-07, + "logits/chosen": -0.33224955201148987, + "logits/rejected": -0.1305333971977234, + "logps/chosen": -2.9278311729431152, + "logps/rejected": -4.174778938293457, + "loss": 0.4505, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9278311729431152, + "rewards/margins": 1.246948003768921, + "rewards/rejected": -4.174778938293457, + "sft_loss": 3.146299123764038, + "step": 3770 + }, + { + "epoch": 2.020404749958187, + "grad_norm": 19.897203015559125, + "learning_rate": 2.9092828292771817e-07, + "logits/chosen": -0.22761209309101105, + "logits/rejected": -0.16656561195850372, + "logps/chosen": -3.047513484954834, + "logps/rejected": -4.061963081359863, + "loss": 0.4903, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.047513484954834, + "rewards/margins": 1.0144492387771606, + "rewards/rejected": -4.061963081359863, + "sft_loss": 3.1401548385620117, + "step": 3775 + }, + { + "epoch": 2.0230807827395885, + "grad_norm": 13.776672506207479, + "learning_rate": 2.8951458441002875e-07, + "logits/chosen": -0.24800440669059753, + "logits/rejected": -0.20165736973285675, + "logps/chosen": -2.960462808609009, + "logps/rejected": -4.0827765464782715, + "loss": 0.4562, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.960462808609009, + "rewards/margins": 1.122314214706421, + "rewards/rejected": -4.0827765464782715, + "sft_loss": 3.118192195892334, + "step": 3780 + }, + { + "epoch": 2.02575681552099, + "grad_norm": 11.773085259364048, + "learning_rate": 2.881029280227643e-07, + "logits/chosen": -0.2977316081523895, + "logits/rejected": -0.11875119060277939, + "logps/chosen": -3.0365774631500244, + "logps/rejected": -4.105264663696289, + "loss": 0.4775, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0365774631500244, + "rewards/margins": 1.0686873197555542, + "rewards/rejected": -4.105264663696289, + "sft_loss": 3.1606414318084717, + "step": 3785 + }, + { + "epoch": 2.028432848302392, + "grad_norm": 12.527830416654856, + "learning_rate": 2.8669332746182177e-07, + "logits/chosen": -0.33617570996284485, + "logits/rejected": -0.1405753344297409, + "logps/chosen": -2.931879997253418, + "logps/rejected": -4.046960830688477, + "loss": 0.4693, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.931879997253418, + "rewards/margins": 1.1150810718536377, + "rewards/rejected": -4.046960830688477, + "sft_loss": 3.1172118186950684, + "step": 3790 + }, + { + "epoch": 2.031108881083793, + "grad_norm": 15.611435619917417, + "learning_rate": 2.8528579640315156e-07, + "logits/chosen": -0.22904033958911896, + "logits/rejected": -0.19225440919399261, + "logps/chosen": -2.8065621852874756, + "logps/rejected": -3.717280149459839, + "loss": 0.5059, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8065621852874756, + "rewards/margins": 0.9107178449630737, + "rewards/rejected": -3.717280149459839, + "sft_loss": 2.9993736743927, + "step": 3795 + }, + { + "epoch": 2.033784913865195, + "grad_norm": 25.12690448505756, + "learning_rate": 2.8388034850262646e-07, + "logits/chosen": -0.2554751932621002, + "logits/rejected": -0.09961952269077301, + "logps/chosen": -2.9748282432556152, + "logps/rejected": -3.9946041107177734, + "loss": 0.4708, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9748282432556152, + "rewards/margins": 1.0197762250900269, + "rewards/rejected": -3.9946041107177734, + "sft_loss": 3.2063193321228027, + "step": 3800 + }, + { + "epoch": 2.0364609466465966, + "grad_norm": 22.659939278716454, + "learning_rate": 2.824769973959079e-07, + "logits/chosen": -0.26864659786224365, + "logits/rejected": -0.11086989939212799, + "logps/chosen": -2.8870584964752197, + "logps/rejected": -3.908726930618286, + "loss": 0.4519, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8870584964752197, + "rewards/margins": 1.021668791770935, + "rewards/rejected": -3.908726930618286, + "sft_loss": 3.0277743339538574, + "step": 3805 + }, + { + "epoch": 2.039136979427998, + "grad_norm": 15.111167904101759, + "learning_rate": 2.81075756698315e-07, + "logits/chosen": -0.1721217930316925, + "logits/rejected": -0.06392903625965118, + "logps/chosen": -2.877350330352783, + "logps/rejected": -4.044107437133789, + "loss": 0.4091, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.877350330352783, + "rewards/margins": 1.1667568683624268, + "rewards/rejected": -4.044107437133789, + "sft_loss": 2.936011791229248, + "step": 3810 + }, + { + "epoch": 2.0418130122093996, + "grad_norm": 14.154684910410312, + "learning_rate": 2.7967664000469035e-07, + "logits/chosen": -0.3511294722557068, + "logits/rejected": -0.22026808559894562, + "logps/chosen": -2.9797768592834473, + "logps/rejected": -4.012446403503418, + "loss": 0.4576, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9797768592834473, + "rewards/margins": 1.0326696634292603, + "rewards/rejected": -4.012446403503418, + "sft_loss": 3.0182080268859863, + "step": 3815 + }, + { + "epoch": 2.0444890449908013, + "grad_norm": 16.20660707105562, + "learning_rate": 2.7827966088927095e-07, + "logits/chosen": -0.3694307208061218, + "logits/rejected": -0.10809771716594696, + "logps/chosen": -3.0436954498291016, + "logps/rejected": -4.178753852844238, + "loss": 0.4598, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.0436954498291016, + "rewards/margins": 1.1350584030151367, + "rewards/rejected": -4.178753852844238, + "sft_loss": 3.252185344696045, + "step": 3820 + }, + { + "epoch": 2.0471650777722026, + "grad_norm": 16.511892258173834, + "learning_rate": 2.768848329055538e-07, + "logits/chosen": -0.27332353591918945, + "logits/rejected": -0.19080711901187897, + "logps/chosen": -2.957089900970459, + "logps/rejected": -4.055009365081787, + "loss": 0.4476, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.957089900970459, + "rewards/margins": 1.0979197025299072, + "rewards/rejected": -4.055009365081787, + "sft_loss": 3.1537868976593018, + "step": 3825 + }, + { + "epoch": 2.0498411105536043, + "grad_norm": 19.73959161900953, + "learning_rate": 2.7549216958616657e-07, + "logits/chosen": -0.38870173692703247, + "logits/rejected": -0.2058717906475067, + "logps/chosen": -3.182220935821533, + "logps/rejected": -4.362703323364258, + "loss": 0.4582, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.182220935821533, + "rewards/margins": 1.180482029914856, + "rewards/rejected": -4.362703323364258, + "sft_loss": 3.3152670860290527, + "step": 3830 + }, + { + "epoch": 2.052517143335006, + "grad_norm": 15.048704309405935, + "learning_rate": 2.741016844427344e-07, + "logits/chosen": -0.28820163011550903, + "logits/rejected": -0.08687031269073486, + "logps/chosen": -3.045510768890381, + "logps/rejected": -4.212818145751953, + "loss": 0.4475, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.045510768890381, + "rewards/margins": 1.1673071384429932, + "rewards/rejected": -4.212818145751953, + "sft_loss": 3.289644718170166, + "step": 3835 + }, + { + "epoch": 2.0551931761164073, + "grad_norm": 14.31299688757163, + "learning_rate": 2.7271339096575073e-07, + "logits/chosen": -0.2321111410856247, + "logits/rejected": -0.06788916885852814, + "logps/chosen": -2.9039368629455566, + "logps/rejected": -4.054717063903809, + "loss": 0.4518, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.9039368629455566, + "rewards/margins": 1.1507799625396729, + "rewards/rejected": -4.054717063903809, + "sft_loss": 3.1044628620147705, + "step": 3840 + }, + { + "epoch": 2.057869208897809, + "grad_norm": 13.386343057348473, + "learning_rate": 2.713273026244446e-07, + "logits/chosen": -0.39123016595840454, + "logits/rejected": -0.12693606317043304, + "logps/chosen": -3.091785430908203, + "logps/rejected": -4.274016380310059, + "loss": 0.4142, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -3.091785430908203, + "rewards/margins": 1.1822311878204346, + "rewards/rejected": -4.274016380310059, + "sft_loss": 3.207846164703369, + "step": 3845 + }, + { + "epoch": 2.0605452416792107, + "grad_norm": 16.329522011469113, + "learning_rate": 2.6994343286665156e-07, + "logits/chosen": -0.32532352209091187, + "logits/rejected": -0.10557621717453003, + "logps/chosen": -3.1879098415374756, + "logps/rejected": -4.119997024536133, + "loss": 0.5105, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.1879098415374756, + "rewards/margins": 0.9320871233940125, + "rewards/rejected": -4.119997024536133, + "sft_loss": 3.367783784866333, + "step": 3850 + }, + { + "epoch": 2.063221274460612, + "grad_norm": 18.388836364712105, + "learning_rate": 2.6856179511868156e-07, + "logits/chosen": -0.27799874544143677, + "logits/rejected": -0.05749162286520004, + "logps/chosen": -3.1412248611450195, + "logps/rejected": -4.446188926696777, + "loss": 0.4546, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1412248611450195, + "rewards/margins": 1.304964542388916, + "rewards/rejected": -4.446188926696777, + "sft_loss": 3.2613327503204346, + "step": 3855 + }, + { + "epoch": 2.0658973072420137, + "grad_norm": 16.88347966335065, + "learning_rate": 2.6718240278519056e-07, + "logits/chosen": -0.23603376746177673, + "logits/rejected": -0.07529838383197784, + "logps/chosen": -3.112762928009033, + "logps/rejected": -4.320590019226074, + "loss": 0.4491, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.112762928009033, + "rewards/margins": 1.2078269720077515, + "rewards/rejected": -4.320590019226074, + "sft_loss": 3.2125473022460938, + "step": 3860 + }, + { + "epoch": 2.0685733400234154, + "grad_norm": 19.52573396013767, + "learning_rate": 2.6580526924904866e-07, + "logits/chosen": -0.40236061811447144, + "logits/rejected": -0.22329147160053253, + "logps/chosen": -3.084240436553955, + "logps/rejected": -4.158616065979004, + "loss": 0.4653, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.084240436553955, + "rewards/margins": 1.0743753910064697, + "rewards/rejected": -4.158616065979004, + "sft_loss": 3.236417293548584, + "step": 3865 + }, + { + "epoch": 2.0712493728048167, + "grad_norm": 16.765985019585877, + "learning_rate": 2.6443040787121186e-07, + "logits/chosen": -0.3749734163284302, + "logits/rejected": -0.2466563880443573, + "logps/chosen": -2.95463490486145, + "logps/rejected": -4.059712886810303, + "loss": 0.4625, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.95463490486145, + "rewards/margins": 1.1050784587860107, + "rewards/rejected": -4.059712886810303, + "sft_loss": 3.155550479888916, + "step": 3870 + }, + { + "epoch": 2.0739254055862184, + "grad_norm": 17.908687383871158, + "learning_rate": 2.6305783199059084e-07, + "logits/chosen": -0.31655049324035645, + "logits/rejected": -0.19131407141685486, + "logps/chosen": -3.113335609436035, + "logps/rejected": -4.204074859619141, + "loss": 0.477, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.113335609436035, + "rewards/margins": 1.0907394886016846, + "rewards/rejected": -4.204074859619141, + "sft_loss": 3.320544719696045, + "step": 3875 + }, + { + "epoch": 2.07660143836762, + "grad_norm": 18.39441900762111, + "learning_rate": 2.6168755492392324e-07, + "logits/chosen": -0.3523116707801819, + "logits/rejected": -0.15895763039588928, + "logps/chosen": -2.8050246238708496, + "logps/rejected": -3.9845402240753174, + "loss": 0.4351, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.8050246238708496, + "rewards/margins": 1.1795158386230469, + "rewards/rejected": -3.9845402240753174, + "sft_loss": 2.912895679473877, + "step": 3880 + }, + { + "epoch": 2.0792774711490214, + "grad_norm": 13.819702075148708, + "learning_rate": 2.6031958996564274e-07, + "logits/chosen": -0.3338176906108856, + "logits/rejected": -0.19842395186424255, + "logps/chosen": -2.7931671142578125, + "logps/rejected": -4.065673828125, + "loss": 0.418, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.7931671142578125, + "rewards/margins": 1.2725064754486084, + "rewards/rejected": -4.065673828125, + "sft_loss": 3.0094237327575684, + "step": 3885 + }, + { + "epoch": 2.081953503930423, + "grad_norm": 18.610614155493714, + "learning_rate": 2.589539503877518e-07, + "logits/chosen": -0.23861142992973328, + "logits/rejected": -0.12467072159051895, + "logps/chosen": -3.0036704540252686, + "logps/rejected": -4.0125226974487305, + "loss": 0.4999, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0036704540252686, + "rewards/margins": 1.0088523626327515, + "rewards/rejected": -4.0125226974487305, + "sft_loss": 3.170523166656494, + "step": 3890 + }, + { + "epoch": 2.084629536711825, + "grad_norm": 15.426117287104132, + "learning_rate": 2.5759064943969125e-07, + "logits/chosen": -0.3244778513908386, + "logits/rejected": -0.062110286206007004, + "logps/chosen": -3.009836196899414, + "logps/rejected": -4.056250095367432, + "loss": 0.4907, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.009836196899414, + "rewards/margins": 1.0464141368865967, + "rewards/rejected": -4.056250095367432, + "sft_loss": 3.0853304862976074, + "step": 3895 + }, + { + "epoch": 2.087305569493226, + "grad_norm": 14.699009607152908, + "learning_rate": 2.562297003482131e-07, + "logits/chosen": -0.2429141104221344, + "logits/rejected": -0.20059914886951447, + "logps/chosen": -3.0386226177215576, + "logps/rejected": -4.1500349044799805, + "loss": 0.4483, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.0386226177215576, + "rewards/margins": 1.1114122867584229, + "rewards/rejected": -4.1500349044799805, + "sft_loss": 3.1930127143859863, + "step": 3900 + }, + { + "epoch": 2.089981602274628, + "grad_norm": 15.620716890065738, + "learning_rate": 2.548711163172512e-07, + "logits/chosen": -0.24442140758037567, + "logits/rejected": -0.1320722997188568, + "logps/chosen": -3.105576276779175, + "logps/rejected": -4.1901750564575195, + "loss": 0.494, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.105576276779175, + "rewards/margins": 1.0845987796783447, + "rewards/rejected": -4.1901750564575195, + "sft_loss": 3.2068934440612793, + "step": 3905 + }, + { + "epoch": 2.0926576350560295, + "grad_norm": 16.207557805252748, + "learning_rate": 2.53514910527794e-07, + "logits/chosen": -0.27643728256225586, + "logits/rejected": -0.1240261048078537, + "logps/chosen": -2.892338275909424, + "logps/rejected": -3.966510772705078, + "loss": 0.4534, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.892338275909424, + "rewards/margins": 1.0741727352142334, + "rewards/rejected": -3.966510772705078, + "sft_loss": 3.089568614959717, + "step": 3910 + }, + { + "epoch": 2.095333667837431, + "grad_norm": 21.10890673694221, + "learning_rate": 2.5216109613775573e-07, + "logits/chosen": -0.3095929026603699, + "logits/rejected": -0.13495635986328125, + "logps/chosen": -3.2057957649230957, + "logps/rejected": -4.216448783874512, + "loss": 0.5132, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.2057957649230957, + "rewards/margins": 1.0106537342071533, + "rewards/rejected": -4.216448783874512, + "sft_loss": 3.349108934402466, + "step": 3915 + }, + { + "epoch": 2.0980097006188325, + "grad_norm": 15.675139661938122, + "learning_rate": 2.5080968628184993e-07, + "logits/chosen": -0.2906665503978729, + "logits/rejected": -0.11295589059591293, + "logps/chosen": -3.0297131538391113, + "logps/rejected": -4.311059951782227, + "loss": 0.4317, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.0297131538391113, + "rewards/margins": 1.2813465595245361, + "rewards/rejected": -4.311059951782227, + "sft_loss": 3.1101412773132324, + "step": 3920 + }, + { + "epoch": 2.1006857334002342, + "grad_norm": 14.984845149493891, + "learning_rate": 2.494606940714605e-07, + "logits/chosen": -0.2796470522880554, + "logits/rejected": -0.17265164852142334, + "logps/chosen": -2.9086081981658936, + "logps/rejected": -4.126116752624512, + "loss": 0.4417, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9086081981658936, + "rewards/margins": 1.2175090312957764, + "rewards/rejected": -4.126116752624512, + "sft_loss": 3.0992932319641113, + "step": 3925 + }, + { + "epoch": 2.103361766181636, + "grad_norm": 12.670306936276564, + "learning_rate": 2.4811413259451625e-07, + "logits/chosen": -0.37444189190864563, + "logits/rejected": -0.1644412726163864, + "logps/chosen": -3.0525717735290527, + "logps/rejected": -4.331545829772949, + "loss": 0.446, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.0525717735290527, + "rewards/margins": 1.2789738178253174, + "rewards/rejected": -4.331545829772949, + "sft_loss": 3.174462080001831, + "step": 3930 + }, + { + "epoch": 2.106037798963037, + "grad_norm": 13.338592047448122, + "learning_rate": 2.46770014915362e-07, + "logits/chosen": -0.284696489572525, + "logits/rejected": -0.1664656102657318, + "logps/chosen": -3.0490806102752686, + "logps/rejected": -4.257328987121582, + "loss": 0.4521, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.0490806102752686, + "rewards/margins": 1.208248496055603, + "rewards/rejected": -4.257328987121582, + "sft_loss": 3.1560282707214355, + "step": 3935 + }, + { + "epoch": 2.108713831744439, + "grad_norm": 22.165854005873395, + "learning_rate": 2.45428354074634e-07, + "logits/chosen": -0.2826927900314331, + "logits/rejected": -0.18620052933692932, + "logps/chosen": -3.007849931716919, + "logps/rejected": -4.2933125495910645, + "loss": 0.4446, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.007849931716919, + "rewards/margins": 1.285462737083435, + "rewards/rejected": -4.2933125495910645, + "sft_loss": 3.0982885360717773, + "step": 3940 + }, + { + "epoch": 2.1113898645258407, + "grad_norm": 18.244244282787054, + "learning_rate": 2.4408916308913105e-07, + "logits/chosen": -0.30995243787765503, + "logits/rejected": -0.10001242160797119, + "logps/chosen": -3.2020652294158936, + "logps/rejected": -4.1528000831604, + "loss": 0.5172, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.2020652294158936, + "rewards/margins": 0.9507347941398621, + "rewards/rejected": -4.1528000831604, + "sft_loss": 3.400285005569458, + "step": 3945 + }, + { + "epoch": 2.114065897307242, + "grad_norm": 25.00130870325128, + "learning_rate": 2.4275245495169025e-07, + "logits/chosen": -0.2304377257823944, + "logits/rejected": -0.05512354522943497, + "logps/chosen": -3.052947759628296, + "logps/rejected": -4.249530792236328, + "loss": 0.4534, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.052947759628296, + "rewards/margins": 1.1965830326080322, + "rewards/rejected": -4.249530792236328, + "sft_loss": 3.1837849617004395, + "step": 3950 + }, + { + "epoch": 2.1167419300886436, + "grad_norm": 17.54566506904545, + "learning_rate": 2.414182426310597e-07, + "logits/chosen": -0.341848224401474, + "logits/rejected": -0.2598082721233368, + "logps/chosen": -3.0490105152130127, + "logps/rejected": -4.387795448303223, + "loss": 0.4447, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0490105152130127, + "rewards/margins": 1.33878493309021, + "rewards/rejected": -4.387795448303223, + "sft_loss": 3.2051093578338623, + "step": 3955 + }, + { + "epoch": 2.1194179628700454, + "grad_norm": 13.39085503782966, + "learning_rate": 2.400865390717734e-07, + "logits/chosen": -0.2547725439071655, + "logits/rejected": -0.13449934124946594, + "logps/chosen": -3.0368006229400635, + "logps/rejected": -4.475960731506348, + "loss": 0.3986, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.0368006229400635, + "rewards/margins": 1.4391599893569946, + "rewards/rejected": -4.475960731506348, + "sft_loss": 3.2502083778381348, + "step": 3960 + }, + { + "epoch": 2.1220939956514466, + "grad_norm": 20.65963530181319, + "learning_rate": 2.3875735719402475e-07, + "logits/chosen": -0.293484628200531, + "logits/rejected": -0.12475170940160751, + "logps/chosen": -3.2072854042053223, + "logps/rejected": -4.5082550048828125, + "loss": 0.4329, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2072854042053223, + "rewards/margins": 1.3009698390960693, + "rewards/rejected": -4.5082550048828125, + "sft_loss": 3.502829074859619, + "step": 3965 + }, + { + "epoch": 2.1247700284328483, + "grad_norm": 16.427496657913935, + "learning_rate": 2.3743070989354258e-07, + "logits/chosen": -0.2562063932418823, + "logits/rejected": -0.1234719380736351, + "logps/chosen": -3.096013307571411, + "logps/rejected": -4.341220378875732, + "loss": 0.496, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.096013307571411, + "rewards/margins": 1.2452070713043213, + "rewards/rejected": -4.341220378875732, + "sft_loss": 3.4139180183410645, + "step": 3970 + }, + { + "epoch": 2.12744606121425, + "grad_norm": 17.20178783513042, + "learning_rate": 2.3610661004146454e-07, + "logits/chosen": -0.2581852078437805, + "logits/rejected": -0.09759654849767685, + "logps/chosen": -2.892961025238037, + "logps/rejected": -4.1011962890625, + "loss": 0.418, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.892961025238037, + "rewards/margins": 1.2082349061965942, + "rewards/rejected": -4.1011962890625, + "sft_loss": 3.0654709339141846, + "step": 3975 + }, + { + "epoch": 2.1301220939956513, + "grad_norm": 15.628020604945597, + "learning_rate": 2.3478507048421314e-07, + "logits/chosen": -0.3146493136882782, + "logits/rejected": -0.20411427319049835, + "logps/chosen": -2.870588779449463, + "logps/rejected": -4.039918899536133, + "loss": 0.4605, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.870588779449463, + "rewards/margins": 1.1693298816680908, + "rewards/rejected": -4.039918899536133, + "sft_loss": 3.2099366188049316, + "step": 3980 + }, + { + "epoch": 2.132798126777053, + "grad_norm": 23.53305266958996, + "learning_rate": 2.334661040433713e-07, + "logits/chosen": -0.3712599277496338, + "logits/rejected": -0.23454952239990234, + "logps/chosen": -2.9788217544555664, + "logps/rejected": -4.112313747406006, + "loss": 0.4674, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.9788217544555664, + "rewards/margins": 1.1334917545318604, + "rewards/rejected": -4.112313747406006, + "sft_loss": 3.238032579421997, + "step": 3985 + }, + { + "epoch": 2.1354741595584548, + "grad_norm": 16.200415464439846, + "learning_rate": 2.321497235155568e-07, + "logits/chosen": -0.38899844884872437, + "logits/rejected": -0.23052570223808289, + "logps/chosen": -2.9085559844970703, + "logps/rejected": -4.126476287841797, + "loss": 0.4187, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.9085559844970703, + "rewards/margins": 1.217919945716858, + "rewards/rejected": -4.126476287841797, + "sft_loss": 3.132606267929077, + "step": 3990 + }, + { + "epoch": 2.138150192339856, + "grad_norm": 22.72340929322911, + "learning_rate": 2.3083594167229965e-07, + "logits/chosen": -0.4118987023830414, + "logits/rejected": -0.12402470409870148, + "logps/chosen": -3.095353126525879, + "logps/rejected": -4.235506534576416, + "loss": 0.4774, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.095353126525879, + "rewards/margins": 1.140153169631958, + "rewards/rejected": -4.235506534576416, + "sft_loss": 3.2529025077819824, + "step": 3995 + }, + { + "epoch": 2.1408262251212578, + "grad_norm": 21.212523140842713, + "learning_rate": 2.295247712599167e-07, + "logits/chosen": -0.31405314803123474, + "logits/rejected": -0.21102198958396912, + "logps/chosen": -2.9583473205566406, + "logps/rejected": -4.192997932434082, + "loss": 0.4683, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9583473205566406, + "rewards/margins": 1.2346506118774414, + "rewards/rejected": -4.192997932434082, + "sft_loss": 3.1199488639831543, + "step": 4000 + }, + { + "epoch": 2.1408262251212578, + "eval_logits/chosen": 0.025688041001558304, + "eval_logits/rejected": 0.13502153754234314, + "eval_logps/chosen": -3.158782958984375, + "eval_logps/rejected": -4.109988212585449, + "eval_loss": 0.5601091980934143, + "eval_rewards/accuracies": 0.719584584236145, + "eval_rewards/chosen": -3.158782958984375, + "eval_rewards/margins": 0.9512055516242981, + "eval_rewards/rejected": -4.109988212585449, + "eval_runtime": 51.6293, + "eval_samples_per_second": 26.051, + "eval_sft_loss": 3.3500845432281494, + "eval_steps_per_second": 6.527, + "step": 4000 + }, + { + "epoch": 2.1435022579026595, + "grad_norm": 16.63182385862367, + "learning_rate": 2.2821622499938948e-07, + "logits/chosen": -0.2870718538761139, + "logits/rejected": -0.05029254034161568, + "logps/chosen": -3.2278294563293457, + "logps/rejected": -4.264133453369141, + "loss": 0.4955, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.2278294563293457, + "rewards/margins": 1.0363037586212158, + "rewards/rejected": -4.264133453369141, + "sft_loss": 3.312840223312378, + "step": 4005 + }, + { + "epoch": 2.1461782906840607, + "grad_norm": 16.68456462998088, + "learning_rate": 2.269103155862391e-07, + "logits/chosen": -0.3568572402000427, + "logits/rejected": -0.23159785568714142, + "logps/chosen": -2.974052906036377, + "logps/rejected": -3.96720814704895, + "loss": 0.4973, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.974052906036377, + "rewards/margins": 0.9931553602218628, + "rewards/rejected": -3.96720814704895, + "sft_loss": 3.1371753215789795, + "step": 4010 + }, + { + "epoch": 2.1488543234654625, + "grad_norm": 17.55953407548012, + "learning_rate": 2.2560705569040483e-07, + "logits/chosen": -0.32833558320999146, + "logits/rejected": -0.0644773468375206, + "logps/chosen": -3.0109498500823975, + "logps/rejected": -4.042330265045166, + "loss": 0.5032, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.0109498500823975, + "rewards/margins": 1.031380295753479, + "rewards/rejected": -4.042330265045166, + "sft_loss": 3.1866564750671387, + "step": 4015 + }, + { + "epoch": 2.151530356246864, + "grad_norm": 12.996125025597934, + "learning_rate": 2.2430645795611963e-07, + "logits/chosen": -0.4082667827606201, + "logits/rejected": -0.2362288236618042, + "logps/chosen": -3.1122148036956787, + "logps/rejected": -4.284976005554199, + "loss": 0.448, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1122148036956787, + "rewards/margins": 1.1727612018585205, + "rewards/rejected": -4.284976005554199, + "sft_loss": 3.294731616973877, + "step": 4020 + }, + { + "epoch": 2.1542063890282654, + "grad_norm": 21.344050643418907, + "learning_rate": 2.230085350017884e-07, + "logits/chosen": -0.311269611120224, + "logits/rejected": -0.1855960339307785, + "logps/chosen": -2.9530930519104004, + "logps/rejected": -3.987037181854248, + "loss": 0.4904, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.9530930519104004, + "rewards/margins": 1.0339438915252686, + "rewards/rejected": -3.987037181854248, + "sft_loss": 3.1352100372314453, + "step": 4025 + }, + { + "epoch": 2.156882421809667, + "grad_norm": 14.0250481619352, + "learning_rate": 2.2171329941986554e-07, + "logits/chosen": -0.36123383045196533, + "logits/rejected": -0.24479889869689941, + "logps/chosen": -2.9066200256347656, + "logps/rejected": -4.128986358642578, + "loss": 0.4004, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.9066200256347656, + "rewards/margins": 1.2223665714263916, + "rewards/rejected": -4.128986358642578, + "sft_loss": 3.0532615184783936, + "step": 4030 + }, + { + "epoch": 2.159558454591069, + "grad_norm": 21.136200769220174, + "learning_rate": 2.2042076377673202e-07, + "logits/chosen": -0.3215072453022003, + "logits/rejected": -0.2983446717262268, + "logps/chosen": -2.8843977451324463, + "logps/rejected": -3.8361854553222656, + "loss": 0.4983, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.8843977451324463, + "rewards/margins": 0.9517875909805298, + "rewards/rejected": -3.8361854553222656, + "sft_loss": 3.0769405364990234, + "step": 4035 + }, + { + "epoch": 2.16223448737247, + "grad_norm": 18.318502665383225, + "learning_rate": 2.1913094061257476e-07, + "logits/chosen": -0.31124424934387207, + "logits/rejected": -0.2860221564769745, + "logps/chosen": -2.903745412826538, + "logps/rejected": -4.019559383392334, + "loss": 0.4356, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.903745412826538, + "rewards/margins": 1.115814447402954, + "rewards/rejected": -4.019559383392334, + "sft_loss": 3.0267748832702637, + "step": 4040 + }, + { + "epoch": 2.164910520153872, + "grad_norm": 17.55774246027213, + "learning_rate": 2.178438424412633e-07, + "logits/chosen": -0.2943703532218933, + "logits/rejected": -0.14766912162303925, + "logps/chosen": -2.9912478923797607, + "logps/rejected": -4.015740394592285, + "loss": 0.4922, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9912478923797607, + "rewards/margins": 1.0244930982589722, + "rewards/rejected": -4.015740394592285, + "sft_loss": 3.1674442291259766, + "step": 4045 + }, + { + "epoch": 2.1675865529352736, + "grad_norm": 22.372909429762434, + "learning_rate": 2.165594817502302e-07, + "logits/chosen": -0.38501840829849243, + "logits/rejected": -0.2358318269252777, + "logps/chosen": -3.215186357498169, + "logps/rejected": -4.143369674682617, + "loss": 0.5279, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.215186357498169, + "rewards/margins": 0.9281827807426453, + "rewards/rejected": -4.143369674682617, + "sft_loss": 3.4197921752929688, + "step": 4050 + }, + { + "epoch": 2.170262585716675, + "grad_norm": 21.92536904942618, + "learning_rate": 2.1527787100034806e-07, + "logits/chosen": -0.25042837858200073, + "logits/rejected": -0.1560048609972, + "logps/chosen": -3.093291997909546, + "logps/rejected": -3.9603614807128906, + "loss": 0.5049, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.093291997909546, + "rewards/margins": 0.8670692443847656, + "rewards/rejected": -3.9603614807128906, + "sft_loss": 3.21748685836792, + "step": 4055 + }, + { + "epoch": 2.1729386184980766, + "grad_norm": 17.27114805230743, + "learning_rate": 2.1399902262581037e-07, + "logits/chosen": -0.23693613708019257, + "logits/rejected": -0.05825047567486763, + "logps/chosen": -3.1077284812927246, + "logps/rejected": -4.1357502937316895, + "loss": 0.5128, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.1077284812927246, + "rewards/margins": 1.0280214548110962, + "rewards/rejected": -4.1357502937316895, + "sft_loss": 3.373389482498169, + "step": 4060 + }, + { + "epoch": 2.1756146512794783, + "grad_norm": 17.350160855736583, + "learning_rate": 2.127229490340094e-07, + "logits/chosen": -0.35128170251846313, + "logits/rejected": -0.25272828340530396, + "logps/chosen": -3.0533649921417236, + "logps/rejected": -4.274240493774414, + "loss": 0.4319, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0533649921417236, + "rewards/margins": 1.22087562084198, + "rewards/rejected": -4.274240493774414, + "sft_loss": 3.2763259410858154, + "step": 4065 + }, + { + "epoch": 2.1782906840608796, + "grad_norm": 24.237254411981112, + "learning_rate": 2.1144966260541698e-07, + "logits/chosen": -0.2499866485595703, + "logits/rejected": -0.024306219071149826, + "logps/chosen": -3.124173164367676, + "logps/rejected": -4.354640960693359, + "loss": 0.4902, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.124173164367676, + "rewards/margins": 1.2304680347442627, + "rewards/rejected": -4.354640960693359, + "sft_loss": 3.3544716835021973, + "step": 4070 + }, + { + "epoch": 2.1809667168422813, + "grad_norm": 16.256757634082003, + "learning_rate": 2.1017917569346332e-07, + "logits/chosen": -0.31078004837036133, + "logits/rejected": -0.0933656319975853, + "logps/chosen": -3.0529751777648926, + "logps/rejected": -4.145366668701172, + "loss": 0.4455, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.0529751777648926, + "rewards/margins": 1.0923912525177002, + "rewards/rejected": -4.145366668701172, + "sft_loss": 3.150514841079712, + "step": 4075 + }, + { + "epoch": 2.183642749623683, + "grad_norm": 16.07676660011823, + "learning_rate": 2.0891150062441837e-07, + "logits/chosen": -0.35007601976394653, + "logits/rejected": -0.19697776436805725, + "logps/chosen": -3.1284284591674805, + "logps/rejected": -4.333003520965576, + "loss": 0.4519, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.1284284591674805, + "rewards/margins": 1.204574704170227, + "rewards/rejected": -4.333003520965576, + "sft_loss": 3.2495639324188232, + "step": 4080 + }, + { + "epoch": 2.1863187824050843, + "grad_norm": 14.043602735428511, + "learning_rate": 2.0764664969727086e-07, + "logits/chosen": -0.27686479687690735, + "logits/rejected": -0.19484971463680267, + "logps/chosen": -2.907302141189575, + "logps/rejected": -4.04062032699585, + "loss": 0.4381, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.907302141189575, + "rewards/margins": 1.1333180665969849, + "rewards/rejected": -4.04062032699585, + "sft_loss": 3.009895086288452, + "step": 4085 + }, + { + "epoch": 2.188994815186486, + "grad_norm": 16.392123117224134, + "learning_rate": 2.0638463518361033e-07, + "logits/chosen": -0.41479548811912537, + "logits/rejected": -0.18581287562847137, + "logps/chosen": -2.9701197147369385, + "logps/rejected": -4.102685928344727, + "loss": 0.4516, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.9701197147369385, + "rewards/margins": 1.1325660943984985, + "rewards/rejected": -4.102685928344727, + "sft_loss": 3.1302144527435303, + "step": 4090 + }, + { + "epoch": 2.1916708479678877, + "grad_norm": 23.254534911552202, + "learning_rate": 2.0512546932750702e-07, + "logits/chosen": -0.36941179633140564, + "logits/rejected": -0.25071144104003906, + "logps/chosen": -3.2064356803894043, + "logps/rejected": -4.208579063415527, + "loss": 0.4821, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.2064356803894043, + "rewards/margins": 1.002143383026123, + "rewards/rejected": -4.208579063415527, + "sft_loss": 3.360086441040039, + "step": 4095 + }, + { + "epoch": 2.194346880749289, + "grad_norm": 21.084331152299193, + "learning_rate": 2.0386916434539343e-07, + "logits/chosen": -0.2859252691268921, + "logits/rejected": -0.09964105486869812, + "logps/chosen": -2.9344687461853027, + "logps/rejected": -4.217326641082764, + "loss": 0.3965, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.9344687461853027, + "rewards/margins": 1.2828583717346191, + "rewards/rejected": -4.217326641082764, + "sft_loss": 3.188898801803589, + "step": 4100 + }, + { + "epoch": 2.1970229135306907, + "grad_norm": 16.571221681014507, + "learning_rate": 2.0261573242594627e-07, + "logits/chosen": -0.2801959812641144, + "logits/rejected": -0.0652405172586441, + "logps/chosen": -3.313950300216675, + "logps/rejected": -4.442086219787598, + "loss": 0.4633, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.313950300216675, + "rewards/margins": 1.1281362771987915, + "rewards/rejected": -4.442086219787598, + "sft_loss": 3.3639557361602783, + "step": 4105 + }, + { + "epoch": 2.1996989463120924, + "grad_norm": 26.337516275744846, + "learning_rate": 2.0136518572996724e-07, + "logits/chosen": -0.28500619530677795, + "logits/rejected": -0.04683919996023178, + "logps/chosen": -3.06573486328125, + "logps/rejected": -4.341249942779541, + "loss": 0.4386, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.06573486328125, + "rewards/margins": 1.2755151987075806, + "rewards/rejected": -4.341249942779541, + "sft_loss": 3.2408344745635986, + "step": 4110 + }, + { + "epoch": 2.202374979093494, + "grad_norm": 18.375904597568343, + "learning_rate": 2.0011753639026617e-07, + "logits/chosen": -0.26755040884017944, + "logits/rejected": -0.1529216468334198, + "logps/chosen": -3.136706590652466, + "logps/rejected": -4.274568557739258, + "loss": 0.4523, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.136706590652466, + "rewards/margins": 1.1378618478775024, + "rewards/rejected": -4.274568557739258, + "sft_loss": 3.320249557495117, + "step": 4115 + }, + { + "epoch": 2.2050510118748954, + "grad_norm": 21.94889236681124, + "learning_rate": 1.988727965115421e-07, + "logits/chosen": -0.29934266209602356, + "logits/rejected": -0.1676492989063263, + "logps/chosen": -3.0635173320770264, + "logps/rejected": -4.196096420288086, + "loss": 0.4547, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.0635173320770264, + "rewards/margins": 1.1325792074203491, + "rewards/rejected": -4.196096420288086, + "sft_loss": 3.3024845123291016, + "step": 4120 + }, + { + "epoch": 2.207727044656297, + "grad_norm": 14.760395037648353, + "learning_rate": 1.9763097817026713e-07, + "logits/chosen": -0.3397952616214752, + "logits/rejected": -0.10662896931171417, + "logps/chosen": -3.0589253902435303, + "logps/rejected": -4.455246925354004, + "loss": 0.4053, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.0589253902435303, + "rewards/margins": 1.396321177482605, + "rewards/rejected": -4.455246925354004, + "sft_loss": 3.2559409141540527, + "step": 4125 + }, + { + "epoch": 2.210403077437699, + "grad_norm": 16.322852975390248, + "learning_rate": 1.9639209341456796e-07, + "logits/chosen": -0.2383042573928833, + "logits/rejected": -0.1254793107509613, + "logps/chosen": -3.2248597145080566, + "logps/rejected": -4.4617018699646, + "loss": 0.4697, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2248597145080566, + "rewards/margins": 1.236842393875122, + "rewards/rejected": -4.4617018699646, + "sft_loss": 3.446223735809326, + "step": 4130 + }, + { + "epoch": 2.2130791102191, + "grad_norm": 14.37152358991461, + "learning_rate": 1.951561542641102e-07, + "logits/chosen": -0.2022382915019989, + "logits/rejected": -0.20282498002052307, + "logps/chosen": -3.195924758911133, + "logps/rejected": -4.3802595138549805, + "loss": 0.5141, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.195924758911133, + "rewards/margins": 1.1843347549438477, + "rewards/rejected": -4.3802595138549805, + "sft_loss": 3.360313892364502, + "step": 4135 + }, + { + "epoch": 2.215755143000502, + "grad_norm": 17.521945693795125, + "learning_rate": 1.939231727099806e-07, + "logits/chosen": -0.4308474659919739, + "logits/rejected": -0.34627005457878113, + "logps/chosen": -3.1027534008026123, + "logps/rejected": -4.180264472961426, + "loss": 0.4843, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.1027534008026123, + "rewards/margins": 1.077511191368103, + "rewards/rejected": -4.180264472961426, + "sft_loss": 3.237123966217041, + "step": 4140 + }, + { + "epoch": 2.2184311757819035, + "grad_norm": 16.896194855981122, + "learning_rate": 1.926931607145719e-07, + "logits/chosen": -0.18039865791797638, + "logits/rejected": -0.026836853474378586, + "logps/chosen": -3.3387451171875, + "logps/rejected": -4.3841986656188965, + "loss": 0.5017, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3387451171875, + "rewards/margins": 1.0454537868499756, + "rewards/rejected": -4.3841986656188965, + "sft_loss": 3.483057737350464, + "step": 4145 + }, + { + "epoch": 2.221107208563305, + "grad_norm": 16.441757226523716, + "learning_rate": 1.9146613021146564e-07, + "logits/chosen": -0.2857555150985718, + "logits/rejected": -0.15832646191120148, + "logps/chosen": -2.8970754146575928, + "logps/rejected": -3.9910411834716797, + "loss": 0.4743, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8970754146575928, + "rewards/margins": 1.0939652919769287, + "rewards/rejected": -3.9910411834716797, + "sft_loss": 3.0792059898376465, + "step": 4150 + }, + { + "epoch": 2.2237832413447065, + "grad_norm": 19.01573807202738, + "learning_rate": 1.9024209310531736e-07, + "logits/chosen": -0.26029425859451294, + "logits/rejected": -0.2442263811826706, + "logps/chosen": -3.0544180870056152, + "logps/rejected": -4.162773132324219, + "loss": 0.4682, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0544180870056152, + "rewards/margins": 1.108355164527893, + "rewards/rejected": -4.162773132324219, + "sft_loss": 3.1890597343444824, + "step": 4155 + }, + { + "epoch": 2.2264592741261082, + "grad_norm": 24.025395376054902, + "learning_rate": 1.890210612717401e-07, + "logits/chosen": -0.3096083402633667, + "logits/rejected": -0.1437438279390335, + "logps/chosen": -3.1456522941589355, + "logps/rejected": -4.235614776611328, + "loss": 0.4625, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1456522941589355, + "rewards/margins": 1.0899627208709717, + "rewards/rejected": -4.235614776611328, + "sft_loss": 3.3048198223114014, + "step": 4160 + }, + { + "epoch": 2.2291353069075095, + "grad_norm": 18.65213393676238, + "learning_rate": 1.8780304655719054e-07, + "logits/chosen": -0.2922806739807129, + "logits/rejected": -0.129634290933609, + "logps/chosen": -3.0212314128875732, + "logps/rejected": -4.285717487335205, + "loss": 0.4529, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.0212314128875732, + "rewards/margins": 1.2644855976104736, + "rewards/rejected": -4.285717487335205, + "sft_loss": 3.2442543506622314, + "step": 4165 + }, + { + "epoch": 2.231811339688911, + "grad_norm": 29.2513496192298, + "learning_rate": 1.865880607788523e-07, + "logits/chosen": -0.1702096164226532, + "logits/rejected": -0.0935095027089119, + "logps/chosen": -3.0341057777404785, + "logps/rejected": -4.166206359863281, + "loss": 0.4668, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.0341057777404785, + "rewards/margins": 1.13210129737854, + "rewards/rejected": -4.166206359863281, + "sft_loss": 3.3285961151123047, + "step": 4170 + }, + { + "epoch": 2.234487372470313, + "grad_norm": 23.00704024821283, + "learning_rate": 1.8537611572452316e-07, + "logits/chosen": -0.31321150064468384, + "logits/rejected": -0.2151360958814621, + "logps/chosen": -2.9937045574188232, + "logps/rejected": -3.9885659217834473, + "loss": 0.4813, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.9937045574188232, + "rewards/margins": 0.9948616027832031, + "rewards/rejected": -3.9885659217834473, + "sft_loss": 3.190828561782837, + "step": 4175 + }, + { + "epoch": 2.237163405251714, + "grad_norm": 17.89894309314384, + "learning_rate": 1.84167223152499e-07, + "logits/chosen": -0.34084606170654297, + "logits/rejected": -0.1118582934141159, + "logps/chosen": -3.0107243061065674, + "logps/rejected": -4.235139846801758, + "loss": 0.4364, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0107243061065674, + "rewards/margins": 1.2244160175323486, + "rewards/rejected": -4.235139846801758, + "sft_loss": 3.230741500854492, + "step": 4180 + }, + { + "epoch": 2.239839438033116, + "grad_norm": 22.70863676615316, + "learning_rate": 1.8296139479146112e-07, + "logits/chosen": -0.3982570767402649, + "logits/rejected": -0.34550541639328003, + "logps/chosen": -2.9199929237365723, + "logps/rejected": -4.091763019561768, + "loss": 0.4763, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9199929237365723, + "rewards/margins": 1.1717698574066162, + "rewards/rejected": -4.091763019561768, + "sft_loss": 3.0896589756011963, + "step": 4185 + }, + { + "epoch": 2.2425154708145176, + "grad_norm": 17.986620779251524, + "learning_rate": 1.8175864234036132e-07, + "logits/chosen": -0.20317812263965607, + "logits/rejected": -0.122073695063591, + "logps/chosen": -2.9494340419769287, + "logps/rejected": -4.093937397003174, + "loss": 0.4748, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9494340419769287, + "rewards/margins": 1.1445037126541138, + "rewards/rejected": -4.093937397003174, + "sft_loss": 3.065373420715332, + "step": 4190 + }, + { + "epoch": 2.245191503595919, + "grad_norm": 15.153617752967738, + "learning_rate": 1.805589774683094e-07, + "logits/chosen": -0.43922024965286255, + "logits/rejected": -0.2848798632621765, + "logps/chosen": -2.9392600059509277, + "logps/rejected": -3.8739826679229736, + "loss": 0.4935, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.9392600059509277, + "rewards/margins": 0.934722900390625, + "rewards/rejected": -3.8739826679229736, + "sft_loss": 3.1426877975463867, + "step": 4195 + }, + { + "epoch": 2.2478675363773206, + "grad_norm": 17.0434905367743, + "learning_rate": 1.79362411814459e-07, + "logits/chosen": -0.18671448528766632, + "logits/rejected": -0.1985694319009781, + "logps/chosen": -3.104698657989502, + "logps/rejected": -3.9841995239257812, + "loss": 0.5454, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.104698657989502, + "rewards/margins": 0.8795011639595032, + "rewards/rejected": -3.9841995239257812, + "sft_loss": 3.2671265602111816, + "step": 4200 + }, + { + "epoch": 2.2505435691587223, + "grad_norm": 15.5388913454973, + "learning_rate": 1.7816895698789552e-07, + "logits/chosen": -0.364023357629776, + "logits/rejected": -0.26746666431427, + "logps/chosen": -2.8987088203430176, + "logps/rejected": -3.918377637863159, + "loss": 0.4505, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.8987088203430176, + "rewards/margins": 1.0196691751480103, + "rewards/rejected": -3.918377637863159, + "sft_loss": 3.0810458660125732, + "step": 4205 + }, + { + "epoch": 2.2532196019401236, + "grad_norm": 15.16323308061328, + "learning_rate": 1.7697862456752271e-07, + "logits/chosen": -0.36861300468444824, + "logits/rejected": -0.22601374983787537, + "logps/chosen": -2.9905242919921875, + "logps/rejected": -4.286072254180908, + "loss": 0.4379, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.9905242919921875, + "rewards/margins": 1.2955482006072998, + "rewards/rejected": -4.286072254180908, + "sft_loss": 3.1793177127838135, + "step": 4210 + }, + { + "epoch": 2.2558956347215253, + "grad_norm": 17.123638355788778, + "learning_rate": 1.7579142610195124e-07, + "logits/chosen": -0.32376617193222046, + "logits/rejected": -0.15093651413917542, + "logps/chosen": -2.9996771812438965, + "logps/rejected": -4.0844011306762695, + "loss": 0.4765, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.9996771812438965, + "rewards/margins": 1.0847234725952148, + "rewards/rejected": -4.0844011306762695, + "sft_loss": 3.0887672901153564, + "step": 4215 + }, + { + "epoch": 2.258571667502927, + "grad_norm": 13.627329026221213, + "learning_rate": 1.7460737310938568e-07, + "logits/chosen": -0.38504481315612793, + "logits/rejected": -0.15042896568775177, + "logps/chosen": -2.893672227859497, + "logps/rejected": -4.146300792694092, + "loss": 0.4252, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.893672227859497, + "rewards/margins": 1.2526286840438843, + "rewards/rejected": -4.146300792694092, + "sft_loss": 3.088972568511963, + "step": 4220 + }, + { + "epoch": 2.2612477002843283, + "grad_norm": 19.486732275129167, + "learning_rate": 1.734264770775133e-07, + "logits/chosen": -0.3545922338962555, + "logits/rejected": -0.10276027023792267, + "logps/chosen": -3.0243966579437256, + "logps/rejected": -4.108520030975342, + "loss": 0.4833, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0243966579437256, + "rewards/margins": 1.0841232538223267, + "rewards/rejected": -4.108520030975342, + "sft_loss": 3.1572792530059814, + "step": 4225 + }, + { + "epoch": 2.26392373306573, + "grad_norm": 16.72094357730139, + "learning_rate": 1.7224874946339241e-07, + "logits/chosen": -0.35756057500839233, + "logits/rejected": -0.2667251229286194, + "logps/chosen": -3.001896619796753, + "logps/rejected": -4.1554274559021, + "loss": 0.4724, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.001896619796753, + "rewards/margins": 1.1535308361053467, + "rewards/rejected": -4.1554274559021, + "sft_loss": 3.0512232780456543, + "step": 4230 + }, + { + "epoch": 2.2665997658471317, + "grad_norm": 15.281841973742925, + "learning_rate": 1.7107420169334186e-07, + "logits/chosen": -0.2982345223426819, + "logits/rejected": -0.19628144800662994, + "logps/chosen": -3.0487236976623535, + "logps/rejected": -4.0739336013793945, + "loss": 0.5052, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0487236976623535, + "rewards/margins": 1.0252102613449097, + "rewards/rejected": -4.0739336013793945, + "sft_loss": 3.2537829875946045, + "step": 4235 + }, + { + "epoch": 2.269275798628533, + "grad_norm": 15.820741935754505, + "learning_rate": 1.6990284516282893e-07, + "logits/chosen": -0.31810635328292847, + "logits/rejected": -0.20068666338920593, + "logps/chosen": -2.9168238639831543, + "logps/rejected": -4.0605692863464355, + "loss": 0.4363, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.9168238639831543, + "rewards/margins": 1.1437454223632812, + "rewards/rejected": -4.0605692863464355, + "sft_loss": 3.097944974899292, + "step": 4240 + }, + { + "epoch": 2.2719518314099347, + "grad_norm": 20.775218753242996, + "learning_rate": 1.687346912363602e-07, + "logits/chosen": -0.3695822060108185, + "logits/rejected": -0.21076634526252747, + "logps/chosen": -3.0002455711364746, + "logps/rejected": -4.147057056427002, + "loss": 0.4466, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.0002455711364746, + "rewards/margins": 1.1468111276626587, + "rewards/rejected": -4.147057056427002, + "sft_loss": 3.1657638549804688, + "step": 4245 + }, + { + "epoch": 2.2746278641913364, + "grad_norm": 16.731510328798766, + "learning_rate": 1.675697512473697e-07, + "logits/chosen": -0.35614705085754395, + "logits/rejected": -0.1391567438840866, + "logps/chosen": -3.073739528656006, + "logps/rejected": -4.305612564086914, + "loss": 0.4326, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.073739528656006, + "rewards/margins": 1.2318732738494873, + "rewards/rejected": -4.305612564086914, + "sft_loss": 3.16626238822937, + "step": 4250 + }, + { + "epoch": 2.2773038969727377, + "grad_norm": 17.387501097476846, + "learning_rate": 1.6640803649811087e-07, + "logits/chosen": -0.3530198633670807, + "logits/rejected": -0.07886115461587906, + "logps/chosen": -3.11734938621521, + "logps/rejected": -4.312512397766113, + "loss": 0.4423, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.11734938621521, + "rewards/margins": 1.1951625347137451, + "rewards/rejected": -4.312512397766113, + "sft_loss": 3.1892035007476807, + "step": 4255 + }, + { + "epoch": 2.2799799297541394, + "grad_norm": 19.201864047625754, + "learning_rate": 1.6524955825954472e-07, + "logits/chosen": -0.29020678997039795, + "logits/rejected": -0.19367562234401703, + "logps/chosen": -3.008631944656372, + "logps/rejected": -4.115485191345215, + "loss": 0.4694, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.008631944656372, + "rewards/margins": 1.106853723526001, + "rewards/rejected": -4.115485191345215, + "sft_loss": 3.0630905628204346, + "step": 4260 + }, + { + "epoch": 2.282655962535541, + "grad_norm": 14.832985219659655, + "learning_rate": 1.6409432777123277e-07, + "logits/chosen": -0.3708581328392029, + "logits/rejected": -0.17783014476299286, + "logps/chosen": -3.0939762592315674, + "logps/rejected": -4.41884708404541, + "loss": 0.4437, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0939762592315674, + "rewards/margins": 1.3248708248138428, + "rewards/rejected": -4.41884708404541, + "sft_loss": 3.2989234924316406, + "step": 4265 + }, + { + "epoch": 2.285331995316943, + "grad_norm": 18.326357383114857, + "learning_rate": 1.6294235624122577e-07, + "logits/chosen": -0.2677830159664154, + "logits/rejected": -0.009764463640749454, + "logps/chosen": -3.1972920894622803, + "logps/rejected": -4.31788444519043, + "loss": 0.4879, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1972920894622803, + "rewards/margins": 1.1205923557281494, + "rewards/rejected": -4.31788444519043, + "sft_loss": 3.2770209312438965, + "step": 4270 + }, + { + "epoch": 2.288008028098344, + "grad_norm": 15.356981651869333, + "learning_rate": 1.6179365484595697e-07, + "logits/chosen": -0.3357272744178772, + "logits/rejected": -0.19809041917324066, + "logps/chosen": -3.137024164199829, + "logps/rejected": -4.19732666015625, + "loss": 0.4936, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.137024164199829, + "rewards/margins": 1.060302734375, + "rewards/rejected": -4.19732666015625, + "sft_loss": 3.298842668533325, + "step": 4275 + }, + { + "epoch": 2.290684060879746, + "grad_norm": 20.46982568740184, + "learning_rate": 1.60648234730132e-07, + "logits/chosen": -0.3426167368888855, + "logits/rejected": -0.22665449976921082, + "logps/chosen": -3.0479929447174072, + "logps/rejected": -4.279765605926514, + "loss": 0.4209, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0479929447174072, + "rewards/margins": 1.2317724227905273, + "rewards/rejected": -4.279765605926514, + "sft_loss": 3.211376667022705, + "step": 4280 + }, + { + "epoch": 2.293360093661147, + "grad_norm": 29.92794653237818, + "learning_rate": 1.595061070066222e-07, + "logits/chosen": -0.29240962862968445, + "logits/rejected": -0.2794325351715088, + "logps/chosen": -3.042105197906494, + "logps/rejected": -4.291446685791016, + "loss": 0.4216, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.042105197906494, + "rewards/margins": 1.249341607093811, + "rewards/rejected": -4.291446685791016, + "sft_loss": 3.2021212577819824, + "step": 4285 + }, + { + "epoch": 2.296036126442549, + "grad_norm": 28.274575534360725, + "learning_rate": 1.5836728275635542e-07, + "logits/chosen": -0.3965105712413788, + "logits/rejected": -0.2133466899394989, + "logps/chosen": -3.2141833305358887, + "logps/rejected": -4.305365562438965, + "loss": 0.4813, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.2141833305358887, + "rewards/margins": 1.0911824703216553, + "rewards/rejected": -4.305365562438965, + "sft_loss": 3.2878317832946777, + "step": 4290 + }, + { + "epoch": 2.2987121592239506, + "grad_norm": 19.428228118524952, + "learning_rate": 1.5723177302820984e-07, + "logits/chosen": -0.3674038350582123, + "logits/rejected": -0.2661629319190979, + "logps/chosen": -3.1580593585968018, + "logps/rejected": -4.163644790649414, + "loss": 0.4719, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.1580593585968018, + "rewards/margins": 1.0055850744247437, + "rewards/rejected": -4.163644790649414, + "sft_loss": 3.2378013134002686, + "step": 4295 + }, + { + "epoch": 2.3013881920053523, + "grad_norm": 15.088312796626367, + "learning_rate": 1.5609958883890544e-07, + "logits/chosen": -0.28699201345443726, + "logits/rejected": -0.14260557293891907, + "logps/chosen": -3.083094835281372, + "logps/rejected": -4.206784248352051, + "loss": 0.4303, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.083094835281372, + "rewards/margins": 1.1236896514892578, + "rewards/rejected": -4.206784248352051, + "sft_loss": 3.1587040424346924, + "step": 4300 + }, + { + "epoch": 2.3040642247867535, + "grad_norm": 15.085807412977193, + "learning_rate": 1.5497074117289865e-07, + "logits/chosen": -0.398585706949234, + "logits/rejected": -0.26450929045677185, + "logps/chosen": -3.036803722381592, + "logps/rejected": -4.33641242980957, + "loss": 0.435, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.036803722381592, + "rewards/margins": 1.2996087074279785, + "rewards/rejected": -4.33641242980957, + "sft_loss": 3.2943191528320312, + "step": 4305 + }, + { + "epoch": 2.3067402575681553, + "grad_norm": 17.875756064699107, + "learning_rate": 1.5384524098227402e-07, + "logits/chosen": -0.3602619469165802, + "logits/rejected": -0.14143213629722595, + "logps/chosen": -3.187171459197998, + "logps/rejected": -4.539390563964844, + "loss": 0.4034, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.187171459197998, + "rewards/margins": 1.352218508720398, + "rewards/rejected": -4.539390563964844, + "sft_loss": 3.360614061355591, + "step": 4310 + }, + { + "epoch": 2.3094162903495565, + "grad_norm": 19.254669989616364, + "learning_rate": 1.5272309918663974e-07, + "logits/chosen": -0.3277217745780945, + "logits/rejected": -0.14917297661304474, + "logps/chosen": -3.2284178733825684, + "logps/rejected": -4.194758415222168, + "loss": 0.5259, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.2284178733825684, + "rewards/margins": 0.9663406610488892, + "rewards/rejected": -4.194758415222168, + "sft_loss": 3.447810411453247, + "step": 4315 + }, + { + "epoch": 2.3120923231309582, + "grad_norm": 15.662502888489064, + "learning_rate": 1.516043266730201e-07, + "logits/chosen": -0.34191417694091797, + "logits/rejected": -0.16648179292678833, + "logps/chosen": -3.153449535369873, + "logps/rejected": -4.330137252807617, + "loss": 0.4379, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.153449535369873, + "rewards/margins": 1.1766880750656128, + "rewards/rejected": -4.330137252807617, + "sft_loss": 3.2555534839630127, + "step": 4320 + }, + { + "epoch": 2.31476835591236, + "grad_norm": 28.49248750066988, + "learning_rate": 1.504889342957512e-07, + "logits/chosen": -0.31089869141578674, + "logits/rejected": -0.131780743598938, + "logps/chosen": -3.172286033630371, + "logps/rejected": -4.270587921142578, + "loss": 0.5245, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.172286033630371, + "rewards/margins": 1.0983017683029175, + "rewards/rejected": -4.270587921142578, + "sft_loss": 3.335955858230591, + "step": 4325 + }, + { + "epoch": 2.3174443886937617, + "grad_norm": 19.18597001532169, + "learning_rate": 1.4937693287637453e-07, + "logits/chosen": -0.33054202795028687, + "logits/rejected": -0.16461774706840515, + "logps/chosen": -3.1897456645965576, + "logps/rejected": -4.276345252990723, + "loss": 0.4839, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.1897456645965576, + "rewards/margins": 1.0865994691848755, + "rewards/rejected": -4.276345252990723, + "sft_loss": 3.2617619037628174, + "step": 4330 + }, + { + "epoch": 2.320120421475163, + "grad_norm": 15.951155250872317, + "learning_rate": 1.4826833320353305e-07, + "logits/chosen": -0.304962694644928, + "logits/rejected": -0.2025957554578781, + "logps/chosen": -3.0752673149108887, + "logps/rejected": -4.30559778213501, + "loss": 0.4418, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0752673149108887, + "rewards/margins": 1.230330228805542, + "rewards/rejected": -4.30559778213501, + "sft_loss": 3.1467199325561523, + "step": 4335 + }, + { + "epoch": 2.3227964542565647, + "grad_norm": 20.887886259722755, + "learning_rate": 1.4716314603286528e-07, + "logits/chosen": -0.3699023127555847, + "logits/rejected": -0.15047678351402283, + "logps/chosen": -3.0767436027526855, + "logps/rejected": -4.3717427253723145, + "loss": 0.408, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.0767436027526855, + "rewards/margins": 1.2949992418289185, + "rewards/rejected": -4.3717427253723145, + "sft_loss": 3.279715061187744, + "step": 4340 + }, + { + "epoch": 2.3254724870379664, + "grad_norm": 34.759432720467146, + "learning_rate": 1.4606138208690233e-07, + "logits/chosen": -0.33491817116737366, + "logits/rejected": -0.2518042325973511, + "logps/chosen": -3.222126007080078, + "logps/rejected": -4.264901161193848, + "loss": 0.5152, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.222126007080078, + "rewards/margins": 1.0427753925323486, + "rewards/rejected": -4.264901161193848, + "sft_loss": 3.2958178520202637, + "step": 4345 + }, + { + "epoch": 2.3281485198193677, + "grad_norm": 16.679103901957355, + "learning_rate": 1.4496305205496251e-07, + "logits/chosen": -0.29535579681396484, + "logits/rejected": -0.19904498755931854, + "logps/chosen": -3.22019624710083, + "logps/rejected": -4.493924140930176, + "loss": 0.4517, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.22019624710083, + "rewards/margins": 1.273728609085083, + "rewards/rejected": -4.493924140930176, + "sft_loss": 3.3317291736602783, + "step": 4350 + }, + { + "epoch": 2.3308245526007694, + "grad_norm": 19.01300702417004, + "learning_rate": 1.4386816659304895e-07, + "logits/chosen": -0.420266330242157, + "logits/rejected": -0.2640858590602875, + "logps/chosen": -3.09688663482666, + "logps/rejected": -4.210850715637207, + "loss": 0.4362, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.09688663482666, + "rewards/margins": 1.1139637231826782, + "rewards/rejected": -4.210850715637207, + "sft_loss": 3.282369613647461, + "step": 4355 + }, + { + "epoch": 2.333500585382171, + "grad_norm": 18.772689720631274, + "learning_rate": 1.4277673632374492e-07, + "logits/chosen": -0.41807642579078674, + "logits/rejected": -0.16949285566806793, + "logps/chosen": -3.1378002166748047, + "logps/rejected": -4.288260459899902, + "loss": 0.4554, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.1378002166748047, + "rewards/margins": 1.1504604816436768, + "rewards/rejected": -4.288260459899902, + "sft_loss": 3.284740447998047, + "step": 4360 + }, + { + "epoch": 2.3361766181635724, + "grad_norm": 16.581837975657816, + "learning_rate": 1.416887718361119e-07, + "logits/chosen": -0.2532419264316559, + "logits/rejected": -0.2221713811159134, + "logps/chosen": -3.086059093475342, + "logps/rejected": -4.135127067565918, + "loss": 0.4878, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.086059093475342, + "rewards/margins": 1.0490682125091553, + "rewards/rejected": -4.135127067565918, + "sft_loss": 3.2261061668395996, + "step": 4365 + }, + { + "epoch": 2.338852650944974, + "grad_norm": 20.023461445702395, + "learning_rate": 1.406042836855859e-07, + "logits/chosen": -0.2906354069709778, + "logits/rejected": -0.15659931302070618, + "logps/chosen": -2.9711108207702637, + "logps/rejected": -4.29131555557251, + "loss": 0.4048, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.9711108207702637, + "rewards/margins": 1.320204496383667, + "rewards/rejected": -4.29131555557251, + "sft_loss": 3.096325635910034, + "step": 4370 + }, + { + "epoch": 2.341528683726376, + "grad_norm": 21.821290850850822, + "learning_rate": 1.3952328239387595e-07, + "logits/chosen": -0.41042837500572205, + "logits/rejected": -0.16330023109912872, + "logps/chosen": -3.022231340408325, + "logps/rejected": -4.2526140213012695, + "loss": 0.4597, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.022231340408325, + "rewards/margins": 1.2303820848464966, + "rewards/rejected": -4.2526140213012695, + "sft_loss": 3.254199266433716, + "step": 4375 + }, + { + "epoch": 2.344204716507777, + "grad_norm": 15.71800589805128, + "learning_rate": 1.3844577844886109e-07, + "logits/chosen": -0.3562023341655731, + "logits/rejected": -0.11175551265478134, + "logps/chosen": -3.140465021133423, + "logps/rejected": -4.293511867523193, + "loss": 0.458, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.140465021133423, + "rewards/margins": 1.1530468463897705, + "rewards/rejected": -4.293511867523193, + "sft_loss": 3.288815975189209, + "step": 4380 + }, + { + "epoch": 2.346880749289179, + "grad_norm": 20.242450556583425, + "learning_rate": 1.3737178230448955e-07, + "logits/chosen": -0.3815039396286011, + "logits/rejected": -0.2283477783203125, + "logps/chosen": -3.178576707839966, + "logps/rejected": -4.190559387207031, + "loss": 0.5102, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.178576707839966, + "rewards/margins": 1.0119825601577759, + "rewards/rejected": -4.190559387207031, + "sft_loss": 3.2854411602020264, + "step": 4385 + }, + { + "epoch": 2.3495567820705805, + "grad_norm": 15.326371019800186, + "learning_rate": 1.363013043806764e-07, + "logits/chosen": -0.33895745873451233, + "logits/rejected": -0.19376994669437408, + "logps/chosen": -3.007366895675659, + "logps/rejected": -4.102883338928223, + "loss": 0.4496, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.007366895675659, + "rewards/margins": 1.095516324043274, + "rewards/rejected": -4.102883338928223, + "sft_loss": 3.211820602416992, + "step": 4390 + }, + { + "epoch": 2.3522328148519818, + "grad_norm": 18.435973466867548, + "learning_rate": 1.352343550632034e-07, + "logits/chosen": -0.3354995846748352, + "logits/rejected": -0.1504775583744049, + "logps/chosen": -3.0240750312805176, + "logps/rejected": -4.287436485290527, + "loss": 0.448, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0240750312805176, + "rewards/margins": 1.2633622884750366, + "rewards/rejected": -4.287436485290527, + "sft_loss": 3.16176438331604, + "step": 4395 + }, + { + "epoch": 2.3549088476333835, + "grad_norm": 15.865403458445352, + "learning_rate": 1.3417094470361722e-07, + "logits/chosen": -0.35504865646362305, + "logits/rejected": -0.20114044845104218, + "logps/chosen": -3.141514778137207, + "logps/rejected": -4.188848495483398, + "loss": 0.491, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.141514778137207, + "rewards/margins": 1.047333836555481, + "rewards/rejected": -4.188848495483398, + "sft_loss": 3.353114366531372, + "step": 4400 + }, + { + "epoch": 2.3549088476333835, + "eval_logits/chosen": 0.09221591055393219, + "eval_logits/rejected": 0.2088293880224228, + "eval_logps/chosen": -3.2269797325134277, + "eval_logps/rejected": -4.211081504821777, + "eval_loss": 0.5603845715522766, + "eval_rewards/accuracies": 0.7203264236450195, + "eval_rewards/chosen": -3.2269797325134277, + "eval_rewards/margins": 0.9841019511222839, + "eval_rewards/rejected": -4.211081504821777, + "eval_runtime": 51.2188, + "eval_samples_per_second": 26.26, + "eval_sft_loss": 3.356903076171875, + "eval_steps_per_second": 6.58, + "step": 4400 + }, + { + "epoch": 2.357584880414785, + "grad_norm": 17.549190178042807, + "learning_rate": 1.3311108361913015e-07, + "logits/chosen": -0.38595595955848694, + "logits/rejected": -0.3314853608608246, + "logps/chosen": -3.010838747024536, + "logps/rejected": -4.1536455154418945, + "loss": 0.4246, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.010838747024536, + "rewards/margins": 1.1428062915802002, + "rewards/rejected": -4.1536455154418945, + "sft_loss": 3.1467928886413574, + "step": 4405 + }, + { + "epoch": 2.3602609131961865, + "grad_norm": 15.470684494854325, + "learning_rate": 1.3205478209251874e-07, + "logits/chosen": -0.3064468502998352, + "logits/rejected": -0.183636873960495, + "logps/chosen": -3.236712694168091, + "logps/rejected": -4.509716987609863, + "loss": 0.44, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.236712694168091, + "rewards/margins": 1.2730040550231934, + "rewards/rejected": -4.509716987609863, + "sft_loss": 3.392608642578125, + "step": 4410 + }, + { + "epoch": 2.362936945977588, + "grad_norm": 16.278846121553524, + "learning_rate": 1.310020503720254e-07, + "logits/chosen": -0.31276029348373413, + "logits/rejected": -0.12013645470142365, + "logps/chosen": -3.12622332572937, + "logps/rejected": -4.3513078689575195, + "loss": 0.4574, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.12622332572937, + "rewards/margins": 1.225084662437439, + "rewards/rejected": -4.3513078689575195, + "sft_loss": 3.1967005729675293, + "step": 4415 + }, + { + "epoch": 2.36561297875899, + "grad_norm": 19.832974055308483, + "learning_rate": 1.2995289867125752e-07, + "logits/chosen": -0.3182033598423004, + "logits/rejected": -0.22087469696998596, + "logps/chosen": -3.1732428073883057, + "logps/rejected": -4.155479431152344, + "loss": 0.4911, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1732428073883057, + "rewards/margins": 0.982236385345459, + "rewards/rejected": -4.155479431152344, + "sft_loss": 3.2742087841033936, + "step": 4420 + }, + { + "epoch": 2.368289011540391, + "grad_norm": 11.517550512791749, + "learning_rate": 1.2890733716908986e-07, + "logits/chosen": -0.33021387457847595, + "logits/rejected": -0.22628983855247498, + "logps/chosen": -2.900446891784668, + "logps/rejected": -4.092584133148193, + "loss": 0.3754, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.900446891784668, + "rewards/margins": 1.1921371221542358, + "rewards/rejected": -4.092584133148193, + "sft_loss": 3.1026883125305176, + "step": 4425 + }, + { + "epoch": 2.370965044321793, + "grad_norm": 21.771288688400453, + "learning_rate": 1.2786537600956454e-07, + "logits/chosen": -0.3661060333251953, + "logits/rejected": -0.1493196189403534, + "logps/chosen": -3.1229987144470215, + "logps/rejected": -4.320777416229248, + "loss": 0.4451, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.1229987144470215, + "rewards/margins": 1.1977789402008057, + "rewards/rejected": -4.320777416229248, + "sft_loss": 3.2086873054504395, + "step": 4430 + }, + { + "epoch": 2.3736410771031946, + "grad_norm": 15.90923873125897, + "learning_rate": 1.268270253017933e-07, + "logits/chosen": -0.3496370017528534, + "logits/rejected": -0.12278521060943604, + "logps/chosen": -3.133633852005005, + "logps/rejected": -4.354303359985352, + "loss": 0.4414, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.133633852005005, + "rewards/margins": 1.2206697463989258, + "rewards/rejected": -4.354303359985352, + "sft_loss": 3.3628973960876465, + "step": 4435 + }, + { + "epoch": 2.376317109884596, + "grad_norm": 15.384651482718192, + "learning_rate": 1.257922951198591e-07, + "logits/chosen": -0.4240453839302063, + "logits/rejected": -0.1185920238494873, + "logps/chosen": -3.0241332054138184, + "logps/rejected": -4.203446865081787, + "loss": 0.4596, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0241332054138184, + "rewards/margins": 1.1793134212493896, + "rewards/rejected": -4.203446865081787, + "sft_loss": 3.1807942390441895, + "step": 4440 + }, + { + "epoch": 2.3789931426659976, + "grad_norm": 23.54080916969624, + "learning_rate": 1.24761195502719e-07, + "logits/chosen": -0.36372414231300354, + "logits/rejected": -0.12318293005228043, + "logps/chosen": -3.158956527709961, + "logps/rejected": -4.101058006286621, + "loss": 0.5432, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.158956527709961, + "rewards/margins": 0.942101001739502, + "rewards/rejected": -4.101058006286621, + "sft_loss": 3.2985637187957764, + "step": 4445 + }, + { + "epoch": 2.3816691754473993, + "grad_norm": 21.514102270030268, + "learning_rate": 1.2373373645410573e-07, + "logits/chosen": -0.32725459337234497, + "logits/rejected": -0.16769400238990784, + "logps/chosen": -3.1743004322052, + "logps/rejected": -4.455135345458984, + "loss": 0.4501, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.1743004322052, + "rewards/margins": 1.2808345556259155, + "rewards/rejected": -4.455135345458984, + "sft_loss": 3.3227474689483643, + "step": 4450 + }, + { + "epoch": 2.384345208228801, + "grad_norm": 20.027767150897976, + "learning_rate": 1.2270992794243175e-07, + "logits/chosen": -0.4107169508934021, + "logits/rejected": -0.2752717137336731, + "logps/chosen": -3.105297327041626, + "logps/rejected": -4.271871089935303, + "loss": 0.456, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.105297327041626, + "rewards/margins": 1.1665735244750977, + "rewards/rejected": -4.271871089935303, + "sft_loss": 3.2842094898223877, + "step": 4455 + }, + { + "epoch": 2.3870212410102023, + "grad_norm": 14.956469716175903, + "learning_rate": 1.2168977990069147e-07, + "logits/chosen": -0.38273295760154724, + "logits/rejected": -0.15654902160167694, + "logps/chosen": -3.0597941875457764, + "logps/rejected": -4.213123321533203, + "loss": 0.4623, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.0597941875457764, + "rewards/margins": 1.1533290147781372, + "rewards/rejected": -4.213123321533203, + "sft_loss": 3.2771835327148438, + "step": 4460 + }, + { + "epoch": 2.389697273791604, + "grad_norm": 18.673644430533805, + "learning_rate": 1.206733022263659e-07, + "logits/chosen": -0.3945019841194153, + "logits/rejected": -0.17653414607048035, + "logps/chosen": -3.2652783393859863, + "logps/rejected": -4.370820045471191, + "loss": 0.5077, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2652783393859863, + "rewards/margins": 1.1055415868759155, + "rewards/rejected": -4.370820045471191, + "sft_loss": 3.3431732654571533, + "step": 4465 + }, + { + "epoch": 2.3923733065730053, + "grad_norm": 16.928687332642628, + "learning_rate": 1.1966050478132572e-07, + "logits/chosen": -0.30592089891433716, + "logits/rejected": -0.2027606964111328, + "logps/chosen": -3.0578575134277344, + "logps/rejected": -4.163485527038574, + "loss": 0.502, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.0578575134277344, + "rewards/margins": 1.1056289672851562, + "rewards/rejected": -4.163485527038574, + "sft_loss": 3.2745048999786377, + "step": 4470 + }, + { + "epoch": 2.395049339354407, + "grad_norm": 22.804494188615138, + "learning_rate": 1.1865139739173635e-07, + "logits/chosen": -0.36948710680007935, + "logits/rejected": -0.14660361409187317, + "logps/chosen": -3.161653757095337, + "logps/rejected": -4.248857498168945, + "loss": 0.4663, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.161653757095337, + "rewards/margins": 1.0872037410736084, + "rewards/rejected": -4.248857498168945, + "sft_loss": 3.2454497814178467, + "step": 4475 + }, + { + "epoch": 2.3977253721358087, + "grad_norm": 18.940651879852197, + "learning_rate": 1.1764598984796187e-07, + "logits/chosen": -0.39973369240760803, + "logits/rejected": -0.24374902248382568, + "logps/chosen": -3.0319912433624268, + "logps/rejected": -4.062434673309326, + "loss": 0.4592, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.0319912433624268, + "rewards/margins": 1.0304433107376099, + "rewards/rejected": -4.062434673309326, + "sft_loss": 3.186066150665283, + "step": 4480 + }, + { + "epoch": 2.4004014049172104, + "grad_norm": 23.47984244284787, + "learning_rate": 1.1664429190447095e-07, + "logits/chosen": -0.2687085270881653, + "logits/rejected": -0.17982222139835358, + "logps/chosen": -3.1518681049346924, + "logps/rejected": -4.31063985824585, + "loss": 0.4605, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1518681049346924, + "rewards/margins": 1.1587715148925781, + "rewards/rejected": -4.31063985824585, + "sft_loss": 3.254848003387451, + "step": 4485 + }, + { + "epoch": 2.4030774376986117, + "grad_norm": 21.97239515670181, + "learning_rate": 1.1564631327974122e-07, + "logits/chosen": -0.33787789940834045, + "logits/rejected": -0.10413823276758194, + "logps/chosen": -3.130115509033203, + "logps/rejected": -4.395178318023682, + "loss": 0.4466, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.130115509033203, + "rewards/margins": 1.2650625705718994, + "rewards/rejected": -4.395178318023682, + "sft_loss": 3.320671796798706, + "step": 4490 + }, + { + "epoch": 2.4057534704800134, + "grad_norm": 16.423993441943615, + "learning_rate": 1.1465206365616587e-07, + "logits/chosen": -0.45037755370140076, + "logits/rejected": -0.2175770103931427, + "logps/chosen": -3.2069408893585205, + "logps/rejected": -4.270087242126465, + "loss": 0.4828, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.2069408893585205, + "rewards/margins": 1.0631463527679443, + "rewards/rejected": -4.270087242126465, + "sft_loss": 3.34574818611145, + "step": 4495 + }, + { + "epoch": 2.408429503261415, + "grad_norm": 17.638490509172506, + "learning_rate": 1.1366155267995887e-07, + "logits/chosen": -0.30428510904312134, + "logits/rejected": -0.2593707740306854, + "logps/chosen": -3.059542417526245, + "logps/rejected": -4.189467906951904, + "loss": 0.447, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.059542417526245, + "rewards/margins": 1.1299254894256592, + "rewards/rejected": -4.189467906951904, + "sft_loss": 3.2454159259796143, + "step": 4500 + }, + { + "epoch": 2.4111055360428164, + "grad_norm": 19.793799506487687, + "learning_rate": 1.1267478996106228e-07, + "logits/chosen": -0.3508601188659668, + "logits/rejected": -0.122955322265625, + "logps/chosen": -3.1088366508483887, + "logps/rejected": -4.098649978637695, + "loss": 0.496, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1088366508483887, + "rewards/margins": 0.9898133277893066, + "rewards/rejected": -4.098649978637695, + "sft_loss": 3.2179903984069824, + "step": 4505 + }, + { + "epoch": 2.413781568824218, + "grad_norm": 16.759274363474006, + "learning_rate": 1.116917850730521e-07, + "logits/chosen": -0.372157484292984, + "logits/rejected": -0.2090158760547638, + "logps/chosen": -3.145791530609131, + "logps/rejected": -4.174318790435791, + "loss": 0.5217, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.145791530609131, + "rewards/margins": 1.0285265445709229, + "rewards/rejected": -4.174318790435791, + "sft_loss": 3.2394118309020996, + "step": 4510 + }, + { + "epoch": 2.41645760160562, + "grad_norm": 15.948049682773291, + "learning_rate": 1.1071254755304637e-07, + "logits/chosen": -0.3354955315589905, + "logits/rejected": -0.24566030502319336, + "logps/chosen": -3.003711223602295, + "logps/rejected": -4.0184173583984375, + "loss": 0.5035, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.003711223602295, + "rewards/margins": 1.0147063732147217, + "rewards/rejected": -4.0184173583984375, + "sft_loss": 3.1131107807159424, + "step": 4515 + }, + { + "epoch": 2.419133634387021, + "grad_norm": 17.295297630257068, + "learning_rate": 1.0973708690161143e-07, + "logits/chosen": -0.35742565989494324, + "logits/rejected": -0.22281166911125183, + "logps/chosen": -3.1032912731170654, + "logps/rejected": -4.247357368469238, + "loss": 0.4456, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1032912731170654, + "rewards/margins": 1.1440660953521729, + "rewards/rejected": -4.247357368469238, + "sft_loss": 3.2081050872802734, + "step": 4520 + }, + { + "epoch": 2.421809667168423, + "grad_norm": 23.203320817128144, + "learning_rate": 1.0876541258267119e-07, + "logits/chosen": -0.39975064992904663, + "logits/rejected": -0.18377183377742767, + "logps/chosen": -3.1296451091766357, + "logps/rejected": -4.430170059204102, + "loss": 0.4169, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.1296451091766357, + "rewards/margins": 1.3005244731903076, + "rewards/rejected": -4.430170059204102, + "sft_loss": 3.2475943565368652, + "step": 4525 + }, + { + "epoch": 2.4244856999498245, + "grad_norm": 19.048434918637117, + "learning_rate": 1.0779753402341379e-07, + "logits/chosen": -0.36884161829948425, + "logits/rejected": -0.25827229022979736, + "logps/chosen": -3.074650526046753, + "logps/rejected": -3.9793503284454346, + "loss": 0.5143, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.074650526046753, + "rewards/margins": 0.9046999216079712, + "rewards/rejected": -3.9793503284454346, + "sft_loss": 3.1442952156066895, + "step": 4530 + }, + { + "epoch": 2.427161732731226, + "grad_norm": 20.349098775884393, + "learning_rate": 1.0683346061420157e-07, + "logits/chosen": -0.2782624363899231, + "logits/rejected": -0.17058196663856506, + "logps/chosen": -2.9235804080963135, + "logps/rejected": -4.07112979888916, + "loss": 0.4807, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.9235804080963135, + "rewards/margins": 1.1475495100021362, + "rewards/rejected": -4.07112979888916, + "sft_loss": 3.1867012977600098, + "step": 4535 + }, + { + "epoch": 2.4298377655126275, + "grad_norm": 17.163122523064544, + "learning_rate": 1.0587320170847874e-07, + "logits/chosen": -0.30895841121673584, + "logits/rejected": -0.17452199757099152, + "logps/chosen": -2.8846898078918457, + "logps/rejected": -3.919926166534424, + "loss": 0.4898, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.8846898078918457, + "rewards/margins": 1.0352360010147095, + "rewards/rejected": -3.919926166534424, + "sft_loss": 3.070295810699463, + "step": 4540 + }, + { + "epoch": 2.4325137982940293, + "grad_norm": 18.635859605646804, + "learning_rate": 1.0491676662268156e-07, + "logits/chosen": -0.27063530683517456, + "logits/rejected": -0.15605516731739044, + "logps/chosen": -2.980637311935425, + "logps/rejected": -4.103749752044678, + "loss": 0.4746, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.980637311935425, + "rewards/margins": 1.123112440109253, + "rewards/rejected": -4.103749752044678, + "sft_loss": 3.122291088104248, + "step": 4545 + }, + { + "epoch": 2.4351898310754305, + "grad_norm": 19.401639550425234, + "learning_rate": 1.0396416463614732e-07, + "logits/chosen": -0.3831943869590759, + "logits/rejected": -0.2574603259563446, + "logps/chosen": -2.92333984375, + "logps/rejected": -4.0302629470825195, + "loss": 0.4766, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.92333984375, + "rewards/margins": 1.1069234609603882, + "rewards/rejected": -4.0302629470825195, + "sft_loss": 3.1017067432403564, + "step": 4550 + }, + { + "epoch": 2.4378658638568322, + "grad_norm": 16.231351109341652, + "learning_rate": 1.0301540499102479e-07, + "logits/chosen": -0.3166596293449402, + "logits/rejected": -0.20488278567790985, + "logps/chosen": -3.1826725006103516, + "logps/rejected": -4.1101884841918945, + "loss": 0.5237, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1826725006103516, + "rewards/margins": 0.9275161623954773, + "rewards/rejected": -4.1101884841918945, + "sft_loss": 3.377725601196289, + "step": 4555 + }, + { + "epoch": 2.440541896638234, + "grad_norm": 22.175234926002215, + "learning_rate": 1.0207049689218405e-07, + "logits/chosen": -0.37456607818603516, + "logits/rejected": -0.14552044868469238, + "logps/chosen": -3.1262221336364746, + "logps/rejected": -4.358965873718262, + "loss": 0.4665, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1262221336364746, + "rewards/margins": 1.2327439785003662, + "rewards/rejected": -4.358965873718262, + "sft_loss": 3.2200629711151123, + "step": 4560 + }, + { + "epoch": 2.4432179294196352, + "grad_norm": 17.647430018000996, + "learning_rate": 1.0112944950712782e-07, + "logits/chosen": -0.32481950521469116, + "logits/rejected": -0.18392665684223175, + "logps/chosen": -3.0514533519744873, + "logps/rejected": -4.248579978942871, + "loss": 0.4352, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0514533519744873, + "rewards/margins": 1.1971266269683838, + "rewards/rejected": -4.248579978942871, + "sft_loss": 3.1023261547088623, + "step": 4565 + }, + { + "epoch": 2.445893962201037, + "grad_norm": 19.93756168866044, + "learning_rate": 1.0019227196590174e-07, + "logits/chosen": -0.2983691394329071, + "logits/rejected": -0.122675821185112, + "logps/chosen": -3.1254711151123047, + "logps/rejected": -4.209573745727539, + "loss": 0.5005, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1254711151123047, + "rewards/margins": 1.0841023921966553, + "rewards/rejected": -4.209573745727539, + "sft_loss": 3.2371037006378174, + "step": 4570 + }, + { + "epoch": 2.4485699949824387, + "grad_norm": 19.845925220759483, + "learning_rate": 9.925897336100664e-08, + "logits/chosen": -0.2862624228000641, + "logits/rejected": -0.19400539994239807, + "logps/chosen": -2.9830963611602783, + "logps/rejected": -4.230919361114502, + "loss": 0.4169, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.9830963611602783, + "rewards/margins": 1.2478225231170654, + "rewards/rejected": -4.230919361114502, + "sft_loss": 3.1368911266326904, + "step": 4575 + }, + { + "epoch": 2.45124602776384, + "grad_norm": 21.594842070698846, + "learning_rate": 9.832956274730946e-08, + "logits/chosen": -0.2956594228744507, + "logits/rejected": -0.22624805569648743, + "logps/chosen": -2.9545702934265137, + "logps/rejected": -3.8970417976379395, + "loss": 0.5049, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9545702934265137, + "rewards/margins": 0.9424716830253601, + "rewards/rejected": -3.8970417976379395, + "sft_loss": 3.093780040740967, + "step": 4580 + }, + { + "epoch": 2.4539220605452416, + "grad_norm": 19.202639752444874, + "learning_rate": 9.740404914195633e-08, + "logits/chosen": -0.3196641802787781, + "logits/rejected": -0.15099498629570007, + "logps/chosen": -3.0403876304626465, + "logps/rejected": -4.135394096374512, + "loss": 0.4713, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.0403876304626465, + "rewards/margins": 1.095007061958313, + "rewards/rejected": -4.135394096374512, + "sft_loss": 3.225785732269287, + "step": 4585 + }, + { + "epoch": 2.4565980933266434, + "grad_norm": 12.768231099980593, + "learning_rate": 9.648244152428392e-08, + "logits/chosen": -0.3886847496032715, + "logits/rejected": -0.2385650873184204, + "logps/chosen": -2.975139617919922, + "logps/rejected": -3.9979240894317627, + "loss": 0.4818, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.975139617919922, + "rewards/margins": 1.0227844715118408, + "rewards/rejected": -3.9979240894317627, + "sft_loss": 3.1574254035949707, + "step": 4590 + }, + { + "epoch": 2.4592741261080446, + "grad_norm": 17.63037705851786, + "learning_rate": 9.556474883573379e-08, + "logits/chosen": -0.4010900855064392, + "logits/rejected": -0.26809439063072205, + "logps/chosen": -2.91060209274292, + "logps/rejected": -4.1822333335876465, + "loss": 0.4567, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.91060209274292, + "rewards/margins": 1.2716315984725952, + "rewards/rejected": -4.1822333335876465, + "sft_loss": 3.0425546169281006, + "step": 4595 + }, + { + "epoch": 2.4619501588894463, + "grad_norm": 14.547973993460808, + "learning_rate": 9.465097997976412e-08, + "logits/chosen": -0.3640051782131195, + "logits/rejected": -0.1315825879573822, + "logps/chosen": -2.999293804168701, + "logps/rejected": -4.230451583862305, + "loss": 0.4298, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.999293804168701, + "rewards/margins": 1.2311577796936035, + "rewards/rejected": -4.230451583862305, + "sft_loss": 3.197936534881592, + "step": 4600 + }, + { + "epoch": 2.464626191670848, + "grad_norm": 16.722793874915272, + "learning_rate": 9.374114382176457e-08, + "logits/chosen": -0.3400874137878418, + "logits/rejected": -0.16026431322097778, + "logps/chosen": -3.140813112258911, + "logps/rejected": -4.275707721710205, + "loss": 0.4853, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.140813112258911, + "rewards/margins": 1.1348944902420044, + "rewards/rejected": -4.275707721710205, + "sft_loss": 3.330498456954956, + "step": 4605 + }, + { + "epoch": 2.46730222445225, + "grad_norm": 18.983221644965163, + "learning_rate": 9.283524918896945e-08, + "logits/chosen": -0.37416914105415344, + "logits/rejected": -0.22600290179252625, + "logps/chosen": -3.138129711151123, + "logps/rejected": -4.2918477058410645, + "loss": 0.477, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.138129711151123, + "rewards/margins": 1.1537177562713623, + "rewards/rejected": -4.2918477058410645, + "sft_loss": 3.226738691329956, + "step": 4610 + }, + { + "epoch": 2.469978257233651, + "grad_norm": 18.030058401465762, + "learning_rate": 9.193330487037232e-08, + "logits/chosen": -0.33753058314323425, + "logits/rejected": -0.14018869400024414, + "logps/chosen": -3.171159029006958, + "logps/rejected": -4.425040245056152, + "loss": 0.4658, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.171159029006958, + "rewards/margins": 1.2538812160491943, + "rewards/rejected": -4.425040245056152, + "sft_loss": 3.3366711139678955, + "step": 4615 + }, + { + "epoch": 2.4726542900150528, + "grad_norm": 18.217023564165505, + "learning_rate": 9.103531961664118e-08, + "logits/chosen": -0.3064436912536621, + "logits/rejected": -0.08399565517902374, + "logps/chosen": -2.881296157836914, + "logps/rejected": -4.056055545806885, + "loss": 0.4144, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.881296157836914, + "rewards/margins": 1.1747593879699707, + "rewards/rejected": -4.056055545806885, + "sft_loss": 3.1400952339172363, + "step": 4620 + }, + { + "epoch": 2.475330322796454, + "grad_norm": 16.51985063792187, + "learning_rate": 9.014130214003269e-08, + "logits/chosen": -0.3931240439414978, + "logits/rejected": -0.3596143126487732, + "logps/chosen": -3.014737606048584, + "logps/rejected": -4.183712959289551, + "loss": 0.4529, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.014737606048584, + "rewards/margins": 1.1689751148223877, + "rewards/rejected": -4.183712959289551, + "sft_loss": 3.1337687969207764, + "step": 4625 + }, + { + "epoch": 2.4780063555778558, + "grad_norm": 19.25543546277433, + "learning_rate": 8.925126111430848e-08, + "logits/chosen": -0.26421234011650085, + "logits/rejected": -0.16148383915424347, + "logps/chosen": -3.0065956115722656, + "logps/rejected": -4.194592475891113, + "loss": 0.4534, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0065956115722656, + "rewards/margins": 1.187996745109558, + "rewards/rejected": -4.194592475891113, + "sft_loss": 3.2171425819396973, + "step": 4630 + }, + { + "epoch": 2.4806823883592575, + "grad_norm": 24.027527657533568, + "learning_rate": 8.83652051746504e-08, + "logits/chosen": -0.2581459581851959, + "logits/rejected": -0.08003239333629608, + "logps/chosen": -3.1716055870056152, + "logps/rejected": -4.404228210449219, + "loss": 0.4574, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.1716055870056152, + "rewards/margins": 1.2326222658157349, + "rewards/rejected": -4.404228210449219, + "sft_loss": 3.2989680767059326, + "step": 4635 + }, + { + "epoch": 2.483358421140659, + "grad_norm": 15.809736239137736, + "learning_rate": 8.748314291757696e-08, + "logits/chosen": -0.2888232171535492, + "logits/rejected": -0.15002648532390594, + "logps/chosen": -3.079983711242676, + "logps/rejected": -4.162103652954102, + "loss": 0.4622, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.079983711242676, + "rewards/margins": 1.0821197032928467, + "rewards/rejected": -4.162103652954102, + "sft_loss": 3.223768949508667, + "step": 4640 + }, + { + "epoch": 2.4860344539220605, + "grad_norm": 19.957289974118574, + "learning_rate": 8.660508290086032e-08, + "logits/chosen": -0.32226279377937317, + "logits/rejected": -0.15740683674812317, + "logps/chosen": -3.0749404430389404, + "logps/rejected": -4.272371768951416, + "loss": 0.4498, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.0749404430389404, + "rewards/margins": 1.1974310874938965, + "rewards/rejected": -4.272371768951416, + "sft_loss": 3.260594606399536, + "step": 4645 + }, + { + "epoch": 2.488710486703462, + "grad_norm": 20.147501113981313, + "learning_rate": 8.573103364344231e-08, + "logits/chosen": -0.37094011902809143, + "logits/rejected": -0.11719591915607452, + "logps/chosen": -2.9942538738250732, + "logps/rejected": -4.199368476867676, + "loss": 0.4432, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.9942538738250732, + "rewards/margins": 1.2051149606704712, + "rewards/rejected": -4.199368476867676, + "sft_loss": 3.0453991889953613, + "step": 4650 + }, + { + "epoch": 2.4913865194848634, + "grad_norm": 19.390214950334087, + "learning_rate": 8.486100362535292e-08, + "logits/chosen": -0.36261844635009766, + "logits/rejected": -0.18685957789421082, + "logps/chosen": -3.118291139602661, + "logps/rejected": -4.132403373718262, + "loss": 0.4893, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.118291139602661, + "rewards/margins": 1.0141125917434692, + "rewards/rejected": -4.132403373718262, + "sft_loss": 3.346827745437622, + "step": 4655 + }, + { + "epoch": 2.494062552266265, + "grad_norm": 17.06096706635864, + "learning_rate": 8.399500128762693e-08, + "logits/chosen": -0.32899707555770874, + "logits/rejected": -0.1959637850522995, + "logps/chosen": -3.1111388206481934, + "logps/rejected": -4.257960319519043, + "loss": 0.4374, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.1111388206481934, + "rewards/margins": 1.1468214988708496, + "rewards/rejected": -4.257960319519043, + "sft_loss": 3.194671154022217, + "step": 4660 + }, + { + "epoch": 2.496738585047667, + "grad_norm": 22.251746068963072, + "learning_rate": 8.313303503222313e-08, + "logits/chosen": -0.3317912518978119, + "logits/rejected": -0.23203852772712708, + "logps/chosen": -3.047714948654175, + "logps/rejected": -4.089310646057129, + "loss": 0.4892, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.047714948654175, + "rewards/margins": 1.041595458984375, + "rewards/rejected": -4.089310646057129, + "sft_loss": 3.1647562980651855, + "step": 4665 + }, + { + "epoch": 2.4994146178290686, + "grad_norm": 24.021656232040822, + "learning_rate": 8.227511322194164e-08, + "logits/chosen": -0.3317021429538727, + "logits/rejected": -0.1665099561214447, + "logps/chosen": -2.9778199195861816, + "logps/rejected": -3.9552600383758545, + "loss": 0.4825, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9778199195861816, + "rewards/margins": 0.977440357208252, + "rewards/rejected": -3.9552600383758545, + "sft_loss": 3.044766902923584, + "step": 4670 + }, + { + "epoch": 2.50209065061047, + "grad_norm": 24.23329671944176, + "learning_rate": 8.142124418034385e-08, + "logits/chosen": -0.2726336121559143, + "logits/rejected": -0.0905083566904068, + "logps/chosen": -3.018521547317505, + "logps/rejected": -4.061691761016846, + "loss": 0.5167, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.018521547317505, + "rewards/margins": 1.0431700944900513, + "rewards/rejected": -4.061691761016846, + "sft_loss": 3.164506435394287, + "step": 4675 + }, + { + "epoch": 2.5047666833918716, + "grad_norm": 21.406723040593615, + "learning_rate": 8.057143619167073e-08, + "logits/chosen": -0.2635464370250702, + "logits/rejected": -0.1459456831216812, + "logps/chosen": -2.977616786956787, + "logps/rejected": -4.1141133308410645, + "loss": 0.4754, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.977616786956787, + "rewards/margins": 1.1364965438842773, + "rewards/rejected": -4.1141133308410645, + "sft_loss": 3.0608811378479004, + "step": 4680 + }, + { + "epoch": 2.507442716173273, + "grad_norm": 12.667085384649567, + "learning_rate": 7.97256975007633e-08, + "logits/chosen": -0.36656802892684937, + "logits/rejected": -0.13581299781799316, + "logps/chosen": -2.9960341453552246, + "logps/rejected": -4.152059555053711, + "loss": 0.4398, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.9960341453552246, + "rewards/margins": 1.1560252904891968, + "rewards/rejected": -4.152059555053711, + "sft_loss": 3.1322269439697266, + "step": 4685 + }, + { + "epoch": 2.5101187489546746, + "grad_norm": 17.283403366542846, + "learning_rate": 7.888403631298186e-08, + "logits/chosen": -0.27755430340766907, + "logits/rejected": -0.20100736618041992, + "logps/chosen": -2.942697048187256, + "logps/rejected": -4.024935245513916, + "loss": 0.475, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.942697048187256, + "rewards/margins": 1.0822378396987915, + "rewards/rejected": -4.024935245513916, + "sft_loss": 3.020854949951172, + "step": 4690 + }, + { + "epoch": 2.5127947817360763, + "grad_norm": 15.476244362183891, + "learning_rate": 7.804646079412719e-08, + "logits/chosen": -0.2853260338306427, + "logits/rejected": -0.08334718644618988, + "logps/chosen": -3.1610915660858154, + "logps/rejected": -4.359834671020508, + "loss": 0.4604, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.1610915660858154, + "rewards/margins": 1.1987429857254028, + "rewards/rejected": -4.359834671020508, + "sft_loss": 3.271552562713623, + "step": 4695 + }, + { + "epoch": 2.515470814517478, + "grad_norm": 16.524220171719012, + "learning_rate": 7.72129790703604e-08, + "logits/chosen": -0.39955052733421326, + "logits/rejected": -0.24060598015785217, + "logps/chosen": -3.005897045135498, + "logps/rejected": -4.021838188171387, + "loss": 0.4888, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.005897045135498, + "rewards/margins": 1.0159412622451782, + "rewards/rejected": -4.021838188171387, + "sft_loss": 3.2215378284454346, + "step": 4700 + }, + { + "epoch": 2.5181468472988793, + "grad_norm": 19.96309863344739, + "learning_rate": 7.638359922812504e-08, + "logits/chosen": -0.29617124795913696, + "logits/rejected": -0.21583464741706848, + "logps/chosen": -3.0837156772613525, + "logps/rejected": -4.209137916564941, + "loss": 0.4901, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.0837156772613525, + "rewards/margins": 1.1254225969314575, + "rewards/rejected": -4.209137916564941, + "sft_loss": 3.1022210121154785, + "step": 4705 + }, + { + "epoch": 2.520822880080281, + "grad_norm": 25.87501638585397, + "learning_rate": 7.555832931406774e-08, + "logits/chosen": -0.3336920440196991, + "logits/rejected": -0.11970163881778717, + "logps/chosen": -3.096277952194214, + "logps/rejected": -4.252020835876465, + "loss": 0.466, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.096277952194214, + "rewards/margins": 1.1557424068450928, + "rewards/rejected": -4.252020835876465, + "sft_loss": 3.2355098724365234, + "step": 4710 + }, + { + "epoch": 2.5234989128616827, + "grad_norm": 16.68144881433365, + "learning_rate": 7.47371773349611e-08, + "logits/chosen": -0.283000648021698, + "logits/rejected": -0.2432548999786377, + "logps/chosen": -3.095372438430786, + "logps/rejected": -4.4320454597473145, + "loss": 0.3979, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.095372438430786, + "rewards/margins": 1.3366725444793701, + "rewards/rejected": -4.4320454597473145, + "sft_loss": 3.2416470050811768, + "step": 4715 + }, + { + "epoch": 2.526174945643084, + "grad_norm": 22.269589452874328, + "learning_rate": 7.392015125762496e-08, + "logits/chosen": -0.33037471771240234, + "logits/rejected": -0.17230312526226044, + "logps/chosen": -2.932950735092163, + "logps/rejected": -4.207057952880859, + "loss": 0.4142, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.932950735092163, + "rewards/margins": 1.2741069793701172, + "rewards/rejected": -4.207057952880859, + "sft_loss": 3.052420139312744, + "step": 4720 + }, + { + "epoch": 2.5288509784244857, + "grad_norm": 15.984312304222266, + "learning_rate": 7.310725900885018e-08, + "logits/chosen": -0.36222535371780396, + "logits/rejected": -0.2763141393661499, + "logps/chosen": -3.0980286598205566, + "logps/rejected": -4.238247871398926, + "loss": 0.5043, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0980286598205566, + "rewards/margins": 1.1402194499969482, + "rewards/rejected": -4.238247871398926, + "sft_loss": 3.2644906044006348, + "step": 4725 + }, + { + "epoch": 2.5315270112058874, + "grad_norm": 20.107738671957918, + "learning_rate": 7.229850847532076e-08, + "logits/chosen": -0.2804338335990906, + "logits/rejected": -0.10021962970495224, + "logps/chosen": -3.0184197425842285, + "logps/rejected": -4.308283805847168, + "loss": 0.4141, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.0184197425842285, + "rewards/margins": 1.2898635864257812, + "rewards/rejected": -4.308283805847168, + "sft_loss": 3.2268364429473877, + "step": 4730 + }, + { + "epoch": 2.5342030439872887, + "grad_norm": 17.955224209629822, + "learning_rate": 7.149390750353779e-08, + "logits/chosen": -0.2626858055591583, + "logits/rejected": -0.29193446040153503, + "logps/chosen": -3.162912607192993, + "logps/rejected": -4.278327465057373, + "loss": 0.4312, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.162912607192993, + "rewards/margins": 1.115415334701538, + "rewards/rejected": -4.278327465057373, + "sft_loss": 3.282226085662842, + "step": 4735 + }, + { + "epoch": 2.5368790767686904, + "grad_norm": 12.84821885820571, + "learning_rate": 7.069346389974374e-08, + "logits/chosen": -0.33620548248291016, + "logits/rejected": -0.15401166677474976, + "logps/chosen": -3.1779723167419434, + "logps/rejected": -4.275982856750488, + "loss": 0.4813, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1779723167419434, + "rewards/margins": 1.0980106592178345, + "rewards/rejected": -4.275982856750488, + "sft_loss": 3.3701813220977783, + "step": 4740 + }, + { + "epoch": 2.539555109550092, + "grad_norm": 19.088401413763112, + "learning_rate": 6.989718542984563e-08, + "logits/chosen": -0.33222153782844543, + "logits/rejected": -0.2611008286476135, + "logps/chosen": -3.208530902862549, + "logps/rejected": -4.362675666809082, + "loss": 0.4814, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.208530902862549, + "rewards/margins": 1.1541452407836914, + "rewards/rejected": -4.362675666809082, + "sft_loss": 3.362258195877075, + "step": 4745 + }, + { + "epoch": 2.5422311423314934, + "grad_norm": 14.875425765942385, + "learning_rate": 6.9105079819341e-08, + "logits/chosen": -0.27498680353164673, + "logits/rejected": -0.014174816198647022, + "logps/chosen": -3.0799829959869385, + "logps/rejected": -4.485774040222168, + "loss": 0.383, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0799829959869385, + "rewards/margins": 1.4057915210723877, + "rewards/rejected": -4.485774040222168, + "sft_loss": 3.205416202545166, + "step": 4750 + }, + { + "epoch": 2.544907175112895, + "grad_norm": 16.35400953971993, + "learning_rate": 6.831715475324163e-08, + "logits/chosen": -0.3643206059932709, + "logits/rejected": -0.162673220038414, + "logps/chosen": -3.0855135917663574, + "logps/rejected": -4.370043754577637, + "loss": 0.4518, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0855135917663574, + "rewards/margins": 1.2845300436019897, + "rewards/rejected": -4.370043754577637, + "sft_loss": 3.2811782360076904, + "step": 4755 + }, + { + "epoch": 2.547583207894297, + "grad_norm": 16.782745797370215, + "learning_rate": 6.753341787600026e-08, + "logits/chosen": -0.3893781304359436, + "logits/rejected": -0.27229851484298706, + "logps/chosen": -3.0741641521453857, + "logps/rejected": -4.406427383422852, + "loss": 0.4043, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -3.0741641521453857, + "rewards/margins": 1.3322635889053345, + "rewards/rejected": -4.406427383422852, + "sft_loss": 3.285371780395508, + "step": 4760 + }, + { + "epoch": 2.5502592406756985, + "grad_norm": 22.533404367403524, + "learning_rate": 6.67538767914353e-08, + "logits/chosen": -0.3591257631778717, + "logits/rejected": -0.16151633858680725, + "logps/chosen": -3.186220169067383, + "logps/rejected": -4.249715328216553, + "loss": 0.4871, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.186220169067383, + "rewards/margins": 1.0634949207305908, + "rewards/rejected": -4.249715328216553, + "sft_loss": 3.3226311206817627, + "step": 4765 + }, + { + "epoch": 2.5529352734571, + "grad_norm": 24.867433963411433, + "learning_rate": 6.597853906265793e-08, + "logits/chosen": -0.3205176889896393, + "logits/rejected": -0.15790733695030212, + "logps/chosen": -3.1841578483581543, + "logps/rejected": -4.544305324554443, + "loss": 0.4391, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1841578483581543, + "rewards/margins": 1.3601475954055786, + "rewards/rejected": -4.544305324554443, + "sft_loss": 3.2552971839904785, + "step": 4770 + }, + { + "epoch": 2.5556113062385015, + "grad_norm": 22.050455771571038, + "learning_rate": 6.5207412211998e-08, + "logits/chosen": -0.2175244837999344, + "logits/rejected": -0.11109142005443573, + "logps/chosen": -3.1900973320007324, + "logps/rejected": -4.4396071434021, + "loss": 0.4804, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1900973320007324, + "rewards/margins": 1.249509572982788, + "rewards/rejected": -4.4396071434021, + "sft_loss": 3.242172956466675, + "step": 4775 + }, + { + "epoch": 2.558287339019903, + "grad_norm": 16.86639995641779, + "learning_rate": 6.444050372093186e-08, + "logits/chosen": -0.35965830087661743, + "logits/rejected": -0.22785428166389465, + "logps/chosen": -3.0863912105560303, + "logps/rejected": -4.196893215179443, + "loss": 0.4612, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.0863912105560303, + "rewards/margins": 1.1105024814605713, + "rewards/rejected": -4.196893215179443, + "sft_loss": 3.212651014328003, + "step": 4780 + }, + { + "epoch": 2.5609633718013045, + "grad_norm": 18.825314082783557, + "learning_rate": 6.367782103000873e-08, + "logits/chosen": -0.2895389199256897, + "logits/rejected": -0.22839932143688202, + "logps/chosen": -3.135324716567993, + "logps/rejected": -4.043545722961426, + "loss": 0.5092, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.135324716567993, + "rewards/margins": 0.9082208871841431, + "rewards/rejected": -4.043545722961426, + "sft_loss": 3.223323106765747, + "step": 4785 + }, + { + "epoch": 2.5636394045827062, + "grad_norm": 17.842623697485102, + "learning_rate": 6.29193715387798e-08, + "logits/chosen": -0.37467458844184875, + "logits/rejected": -0.2382585108280182, + "logps/chosen": -3.1378002166748047, + "logps/rejected": -4.3286638259887695, + "loss": 0.4699, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.1378002166748047, + "rewards/margins": 1.190863013267517, + "rewards/rejected": -4.3286638259887695, + "sft_loss": 3.2269675731658936, + "step": 4790 + }, + { + "epoch": 2.566315437364108, + "grad_norm": 24.464303733487966, + "learning_rate": 6.216516260572502e-08, + "logits/chosen": -0.30891790986061096, + "logits/rejected": -0.18581345677375793, + "logps/chosen": -3.2107269763946533, + "logps/rejected": -4.322188854217529, + "loss": 0.4934, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.2107269763946533, + "rewards/margins": 1.111462116241455, + "rewards/rejected": -4.322188854217529, + "sft_loss": 3.2918734550476074, + "step": 4795 + }, + { + "epoch": 2.568991470145509, + "grad_norm": 16.845112498795427, + "learning_rate": 6.141520154818297e-08, + "logits/chosen": -0.3393566906452179, + "logits/rejected": -0.21427495777606964, + "logps/chosen": -3.025163173675537, + "logps/rejected": -4.013575553894043, + "loss": 0.4967, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.025163173675537, + "rewards/margins": 0.9884128570556641, + "rewards/rejected": -4.013575553894043, + "sft_loss": 3.262495756149292, + "step": 4800 + }, + { + "epoch": 2.568991470145509, + "eval_logits/chosen": 0.05386869236826897, + "eval_logits/rejected": 0.16599443554878235, + "eval_logps/chosen": -3.1626408100128174, + "eval_logps/rejected": -4.141357421875, + "eval_loss": 0.5589354038238525, + "eval_rewards/accuracies": 0.7225519418716431, + "eval_rewards/chosen": -3.1626408100128174, + "eval_rewards/margins": 0.9787165522575378, + "eval_rewards/rejected": -4.141357421875, + "eval_runtime": 50.2599, + "eval_samples_per_second": 26.761, + "eval_sft_loss": 3.286099433898926, + "eval_steps_per_second": 6.705, + "step": 4800 + }, + { + "epoch": 2.571667502926911, + "grad_norm": 24.916256914603725, + "learning_rate": 6.066949564227897e-08, + "logits/chosen": -0.38694795966148376, + "logits/rejected": -0.2582995295524597, + "logps/chosen": -3.044128179550171, + "logps/rejected": -4.149737358093262, + "loss": 0.5057, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.044128179550171, + "rewards/margins": 1.1056091785430908, + "rewards/rejected": -4.149737358093262, + "sft_loss": 3.150749444961548, + "step": 4805 + }, + { + "epoch": 2.574343535708312, + "grad_norm": 16.625164558780753, + "learning_rate": 5.992805212285523e-08, + "logits/chosen": -0.33551377058029175, + "logits/rejected": -0.21937128901481628, + "logps/chosen": -3.0479540824890137, + "logps/rejected": -4.1916184425354, + "loss": 0.468, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0479540824890137, + "rewards/margins": 1.1436642408370972, + "rewards/rejected": -4.1916184425354, + "sft_loss": 3.181412696838379, + "step": 4810 + }, + { + "epoch": 2.577019568489714, + "grad_norm": 23.333341592536538, + "learning_rate": 5.9190878183399684e-08, + "logits/chosen": -0.3330397605895996, + "logits/rejected": -0.18397782742977142, + "logps/chosen": -2.8639981746673584, + "logps/rejected": -4.1112751960754395, + "loss": 0.5087, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8639981746673584, + "rewards/margins": 1.2472766637802124, + "rewards/rejected": -4.1112751960754395, + "sft_loss": 3.0787956714630127, + "step": 4815 + }, + { + "epoch": 2.5796956012711156, + "grad_norm": 19.920525986764915, + "learning_rate": 5.845798097597748e-08, + "logits/chosen": -0.29376813769340515, + "logits/rejected": -0.1875263750553131, + "logps/chosen": -3.086697578430176, + "logps/rejected": -4.033606052398682, + "loss": 0.4938, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.086697578430176, + "rewards/margins": 0.9469084739685059, + "rewards/rejected": -4.033606052398682, + "sft_loss": 3.1368043422698975, + "step": 4820 + }, + { + "epoch": 2.5823716340525174, + "grad_norm": 23.23922038582777, + "learning_rate": 5.772936761116026e-08, + "logits/chosen": -0.29967522621154785, + "logits/rejected": -0.133182555437088, + "logps/chosen": -3.0171871185302734, + "logps/rejected": -4.105466842651367, + "loss": 0.4585, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.0171871185302734, + "rewards/margins": 1.0882798433303833, + "rewards/rejected": -4.105466842651367, + "sft_loss": 3.0773675441741943, + "step": 4825 + }, + { + "epoch": 2.5850476668339186, + "grad_norm": 22.37041290053205, + "learning_rate": 5.700504515795829e-08, + "logits/chosen": -0.3489793539047241, + "logits/rejected": -0.1727316677570343, + "logps/chosen": -3.129185438156128, + "logps/rejected": -4.195072174072266, + "loss": 0.4729, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.129185438156128, + "rewards/margins": 1.0658868551254272, + "rewards/rejected": -4.195072174072266, + "sft_loss": 3.2715210914611816, + "step": 4830 + }, + { + "epoch": 2.5877236996153203, + "grad_norm": 20.35813506460253, + "learning_rate": 5.628502064375101e-08, + "logits/chosen": -0.4507429599761963, + "logits/rejected": -0.24051764607429504, + "logps/chosen": -2.9227616786956787, + "logps/rejected": -4.2196455001831055, + "loss": 0.4019, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9227616786956787, + "rewards/margins": 1.2968841791152954, + "rewards/rejected": -4.2196455001831055, + "sft_loss": 3.03584361076355, + "step": 4835 + }, + { + "epoch": 2.5903997323967216, + "grad_norm": 18.646249235430773, + "learning_rate": 5.55693010542197e-08, + "logits/chosen": -0.40485063195228577, + "logits/rejected": -0.16617676615715027, + "logps/chosen": -2.9163639545440674, + "logps/rejected": -4.19936466217041, + "loss": 0.4011, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.9163639545440674, + "rewards/margins": 1.2830005884170532, + "rewards/rejected": -4.19936466217041, + "sft_loss": 3.042677402496338, + "step": 4840 + }, + { + "epoch": 2.5930757651781233, + "grad_norm": 19.18315587660348, + "learning_rate": 5.485789333327856e-08, + "logits/chosen": -0.32241642475128174, + "logits/rejected": -0.261809766292572, + "logps/chosen": -2.975820302963257, + "logps/rejected": -4.039728164672852, + "loss": 0.4794, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.975820302963257, + "rewards/margins": 1.0639079809188843, + "rewards/rejected": -4.039728164672852, + "sft_loss": 3.182965040206909, + "step": 4845 + }, + { + "epoch": 2.595751797959525, + "grad_norm": 19.24094940649905, + "learning_rate": 5.4150804383008675e-08, + "logits/chosen": -0.4445788860321045, + "logits/rejected": -0.2621915340423584, + "logps/chosen": -3.1237831115722656, + "logps/rejected": -4.347258567810059, + "loss": 0.4573, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.1237831115722656, + "rewards/margins": 1.223475694656372, + "rewards/rejected": -4.347258567810059, + "sft_loss": 3.208775758743286, + "step": 4850 + }, + { + "epoch": 2.5984278307409268, + "grad_norm": 23.126508233398816, + "learning_rate": 5.344804106359002e-08, + "logits/chosen": -0.2847757935523987, + "logits/rejected": -0.11558979749679565, + "logps/chosen": -2.8969483375549316, + "logps/rejected": -4.117270469665527, + "loss": 0.4668, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.8969483375549316, + "rewards/margins": 1.2203221321105957, + "rewards/rejected": -4.117270469665527, + "sft_loss": 3.1041221618652344, + "step": 4855 + }, + { + "epoch": 2.601103863522328, + "grad_norm": 20.547858879397218, + "learning_rate": 5.274961019323559e-08, + "logits/chosen": -0.3398367762565613, + "logits/rejected": -0.22868943214416504, + "logps/chosen": -2.8547940254211426, + "logps/rejected": -4.000241279602051, + "loss": 0.4489, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.8547940254211426, + "rewards/margins": 1.1454476118087769, + "rewards/rejected": -4.000241279602051, + "sft_loss": 3.06701397895813, + "step": 4860 + }, + { + "epoch": 2.6037798963037297, + "grad_norm": 10.82015699618501, + "learning_rate": 5.205551854812451e-08, + "logits/chosen": -0.4190579950809479, + "logits/rejected": -0.3059861660003662, + "logps/chosen": -3.1115918159484863, + "logps/rejected": -4.324707984924316, + "loss": 0.4422, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.1115918159484863, + "rewards/margins": 1.2131168842315674, + "rewards/rejected": -4.324707984924316, + "sft_loss": 3.2403512001037598, + "step": 4865 + }, + { + "epoch": 2.606455929085131, + "grad_norm": 16.438235909967577, + "learning_rate": 5.1365772862337177e-08, + "logits/chosen": -0.3123801350593567, + "logits/rejected": -0.1777162104845047, + "logps/chosen": -2.853092670440674, + "logps/rejected": -4.257376670837402, + "loss": 0.3776, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.853092670440674, + "rewards/margins": 1.4042837619781494, + "rewards/rejected": -4.257376670837402, + "sft_loss": 2.9209725856781006, + "step": 4870 + }, + { + "epoch": 2.6091319618665327, + "grad_norm": 23.006845285193506, + "learning_rate": 5.068037982778905e-08, + "logits/chosen": -0.23909814655780792, + "logits/rejected": -0.13520967960357666, + "logps/chosen": -2.869157314300537, + "logps/rejected": -4.090670585632324, + "loss": 0.4834, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.869157314300537, + "rewards/margins": 1.2215136289596558, + "rewards/rejected": -4.090670585632324, + "sft_loss": 3.0704166889190674, + "step": 4875 + }, + { + "epoch": 2.6118079946479344, + "grad_norm": 14.726162342640116, + "learning_rate": 4.999934609416656e-08, + "logits/chosen": -0.24981050193309784, + "logits/rejected": -0.1145351380109787, + "logps/chosen": -2.9208083152770996, + "logps/rejected": -4.255640506744385, + "loss": 0.4267, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.9208083152770996, + "rewards/margins": 1.334831953048706, + "rewards/rejected": -4.255640506744385, + "sft_loss": 3.1174123287200928, + "step": 4880 + }, + { + "epoch": 2.614484027429336, + "grad_norm": 17.993500744261805, + "learning_rate": 4.932267826886183e-08, + "logits/chosen": -0.25230225920677185, + "logits/rejected": -0.1823224127292633, + "logps/chosen": -3.0884265899658203, + "logps/rejected": -4.351536750793457, + "loss": 0.4498, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.0884265899658203, + "rewards/margins": 1.2631103992462158, + "rewards/rejected": -4.351536750793457, + "sft_loss": 3.2716116905212402, + "step": 4885 + }, + { + "epoch": 2.6171600602107374, + "grad_norm": 19.86826170864046, + "learning_rate": 4.8650382916909206e-08, + "logits/chosen": -0.3957950472831726, + "logits/rejected": -0.19159351289272308, + "logps/chosen": -3.080976963043213, + "logps/rejected": -4.284677505493164, + "loss": 0.4773, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.080976963043213, + "rewards/margins": 1.2037004232406616, + "rewards/rejected": -4.284677505493164, + "sft_loss": 3.260425567626953, + "step": 4890 + }, + { + "epoch": 2.619836092992139, + "grad_norm": 14.646080419525127, + "learning_rate": 4.7982466560920976e-08, + "logits/chosen": -0.30659979581832886, + "logits/rejected": -0.2037850320339203, + "logps/chosen": -3.0508840084075928, + "logps/rejected": -3.9952120780944824, + "loss": 0.5283, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0508840084075928, + "rewards/margins": 0.9443281292915344, + "rewards/rejected": -3.9952120780944824, + "sft_loss": 3.201610565185547, + "step": 4895 + }, + { + "epoch": 2.622512125773541, + "grad_norm": 18.560996284442947, + "learning_rate": 4.7318935681024685e-08, + "logits/chosen": -0.2744046747684479, + "logits/rejected": -0.09588640928268433, + "logps/chosen": -2.9869422912597656, + "logps/rejected": -4.162718772888184, + "loss": 0.4379, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9869422912597656, + "rewards/margins": 1.175776481628418, + "rewards/rejected": -4.162718772888184, + "sft_loss": 3.1522040367126465, + "step": 4900 + }, + { + "epoch": 2.625188158554942, + "grad_norm": 14.481153499398367, + "learning_rate": 4.6659796714799745e-08, + "logits/chosen": -0.3010391592979431, + "logits/rejected": -0.11915872246026993, + "logps/chosen": -3.0480072498321533, + "logps/rejected": -4.383750915527344, + "loss": 0.4113, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.0480072498321533, + "rewards/margins": 1.33574378490448, + "rewards/rejected": -4.383750915527344, + "sft_loss": 3.292620897293091, + "step": 4905 + }, + { + "epoch": 2.627864191336344, + "grad_norm": 16.282657228318666, + "learning_rate": 4.60050560572155e-08, + "logits/chosen": -0.305327832698822, + "logits/rejected": -0.3109681308269501, + "logps/chosen": -2.971714496612549, + "logps/rejected": -4.347641944885254, + "loss": 0.4422, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.971714496612549, + "rewards/margins": 1.3759269714355469, + "rewards/rejected": -4.347641944885254, + "sft_loss": 3.0994067192077637, + "step": 4910 + }, + { + "epoch": 2.6305402241177456, + "grad_norm": 23.68553480847594, + "learning_rate": 4.535472006056834e-08, + "logits/chosen": -0.3070877194404602, + "logits/rejected": -0.1455794870853424, + "logps/chosen": -2.9347236156463623, + "logps/rejected": -4.083266258239746, + "loss": 0.46, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.9347236156463623, + "rewards/margins": 1.1485424041748047, + "rewards/rejected": -4.083266258239746, + "sft_loss": 3.092538356781006, + "step": 4915 + }, + { + "epoch": 2.6332162568991473, + "grad_norm": 17.0194885455986, + "learning_rate": 4.470879503442132e-08, + "logits/chosen": -0.3108268976211548, + "logits/rejected": -0.17864060401916504, + "logps/chosen": -3.0577645301818848, + "logps/rejected": -4.320757865905762, + "loss": 0.4578, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.0577645301818848, + "rewards/margins": 1.262993574142456, + "rewards/rejected": -4.320757865905762, + "sft_loss": 3.2131295204162598, + "step": 4920 + }, + { + "epoch": 2.6358922896805486, + "grad_norm": 16.110603675677545, + "learning_rate": 4.406728724554154e-08, + "logits/chosen": -0.43900686502456665, + "logits/rejected": -0.16938167810440063, + "logps/chosen": -3.0163490772247314, + "logps/rejected": -4.240276336669922, + "loss": 0.4472, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0163490772247314, + "rewards/margins": 1.2239272594451904, + "rewards/rejected": -4.240276336669922, + "sft_loss": 3.218003749847412, + "step": 4925 + }, + { + "epoch": 2.6385683224619503, + "grad_norm": 14.772454995298762, + "learning_rate": 4.3430202917840664e-08, + "logits/chosen": -0.31613582372665405, + "logits/rejected": -0.097527876496315, + "logps/chosen": -3.0685675144195557, + "logps/rejected": -4.425512790679932, + "loss": 0.4334, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0685675144195557, + "rewards/margins": 1.356945514678955, + "rewards/rejected": -4.425512790679932, + "sft_loss": 3.140439510345459, + "step": 4930 + }, + { + "epoch": 2.6412443552433515, + "grad_norm": 26.294902275748264, + "learning_rate": 4.279754823231346e-08, + "logits/chosen": -0.36880728602409363, + "logits/rejected": -0.15747341513633728, + "logps/chosen": -2.98952579498291, + "logps/rejected": -4.092957973480225, + "loss": 0.49, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.98952579498291, + "rewards/margins": 1.1034326553344727, + "rewards/rejected": -4.092957973480225, + "sft_loss": 3.1426939964294434, + "step": 4935 + }, + { + "epoch": 2.6439203880247533, + "grad_norm": 15.384947752941567, + "learning_rate": 4.216932932697859e-08, + "logits/chosen": -0.3689922094345093, + "logits/rejected": -0.2464989870786667, + "logps/chosen": -3.0003583431243896, + "logps/rejected": -3.947209596633911, + "loss": 0.485, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.0003583431243896, + "rewards/margins": 0.9468511343002319, + "rewards/rejected": -3.947209596633911, + "sft_loss": 3.1662871837615967, + "step": 4940 + }, + { + "epoch": 2.646596420806155, + "grad_norm": 22.157128734922004, + "learning_rate": 4.154555229681844e-08, + "logits/chosen": -0.3239455819129944, + "logits/rejected": -0.10675706714391708, + "logps/chosen": -3.0349650382995605, + "logps/rejected": -4.290961265563965, + "loss": 0.4187, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.0349650382995605, + "rewards/margins": 1.2559964656829834, + "rewards/rejected": -4.290961265563965, + "sft_loss": 3.1621806621551514, + "step": 4945 + }, + { + "epoch": 2.6492724535875567, + "grad_norm": 19.15505957419598, + "learning_rate": 4.092622319372069e-08, + "logits/chosen": -0.3120448589324951, + "logits/rejected": -0.12682923674583435, + "logps/chosen": -2.9931201934814453, + "logps/rejected": -4.10862398147583, + "loss": 0.5114, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9931201934814453, + "rewards/margins": 1.1155036687850952, + "rewards/rejected": -4.10862398147583, + "sft_loss": 3.1184768676757812, + "step": 4950 + }, + { + "epoch": 2.651948486368958, + "grad_norm": 22.40083435029277, + "learning_rate": 4.031134802641889e-08, + "logits/chosen": -0.3152123987674713, + "logits/rejected": -0.2484721839427948, + "logps/chosen": -3.118741273880005, + "logps/rejected": -4.189919471740723, + "loss": 0.461, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.118741273880005, + "rewards/margins": 1.0711780786514282, + "rewards/rejected": -4.189919471740723, + "sft_loss": 3.274350643157959, + "step": 4955 + }, + { + "epoch": 2.6546245191503597, + "grad_norm": 15.22644592986102, + "learning_rate": 3.970093276043468e-08, + "logits/chosen": -0.27597761154174805, + "logits/rejected": -0.14365874230861664, + "logps/chosen": -2.9235987663269043, + "logps/rejected": -4.190218925476074, + "loss": 0.4232, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.9235987663269043, + "rewards/margins": 1.26662015914917, + "rewards/rejected": -4.190218925476074, + "sft_loss": 3.094919204711914, + "step": 4960 + }, + { + "epoch": 2.657300551931761, + "grad_norm": 20.777279697287344, + "learning_rate": 3.9094983318019584e-08, + "logits/chosen": -0.3656620681285858, + "logits/rejected": -0.2070375382900238, + "logps/chosen": -2.9615466594696045, + "logps/rejected": -4.212521553039551, + "loss": 0.4379, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.9615466594696045, + "rewards/margins": 1.2509740591049194, + "rewards/rejected": -4.212521553039551, + "sft_loss": 3.181602954864502, + "step": 4965 + }, + { + "epoch": 2.6599765847131627, + "grad_norm": 20.388056518083097, + "learning_rate": 3.849350557809789e-08, + "logits/chosen": -0.25572115182876587, + "logits/rejected": -0.1687474101781845, + "logps/chosen": -2.872952699661255, + "logps/rejected": -4.130259990692139, + "loss": 0.4301, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.872952699661255, + "rewards/margins": 1.2573074102401733, + "rewards/rejected": -4.130259990692139, + "sft_loss": 2.927790641784668, + "step": 4970 + }, + { + "epoch": 2.6626526174945644, + "grad_norm": 16.961728362857738, + "learning_rate": 3.789650537620903e-08, + "logits/chosen": -0.29145973920822144, + "logits/rejected": -0.22770313918590546, + "logps/chosen": -3.081450939178467, + "logps/rejected": -4.227843284606934, + "loss": 0.443, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.081450939178467, + "rewards/margins": 1.1463927030563354, + "rewards/rejected": -4.227843284606934, + "sft_loss": 3.1626648902893066, + "step": 4975 + }, + { + "epoch": 2.665328650275966, + "grad_norm": 19.0372220245103, + "learning_rate": 3.730398850445182e-08, + "logits/chosen": -0.19781547784805298, + "logits/rejected": -0.1551518440246582, + "logps/chosen": -3.236917495727539, + "logps/rejected": -4.306820869445801, + "loss": 0.5071, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.236917495727539, + "rewards/margins": 1.0699033737182617, + "rewards/rejected": -4.306820869445801, + "sft_loss": 3.259613513946533, + "step": 4980 + }, + { + "epoch": 2.6680046830573674, + "grad_norm": 18.562975696656537, + "learning_rate": 3.671596071142735e-08, + "logits/chosen": -0.299947053194046, + "logits/rejected": -0.09621630609035492, + "logps/chosen": -2.9437007904052734, + "logps/rejected": -4.194373607635498, + "loss": 0.4854, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9437007904052734, + "rewards/margins": 1.2506728172302246, + "rewards/rejected": -4.194373607635498, + "sft_loss": 3.020115375518799, + "step": 4985 + }, + { + "epoch": 2.670680715838769, + "grad_norm": 19.595036919820167, + "learning_rate": 3.6132427702183996e-08, + "logits/chosen": -0.4113302230834961, + "logits/rejected": -0.21265852451324463, + "logps/chosen": -2.9879603385925293, + "logps/rejected": -4.28658390045166, + "loss": 0.4121, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.9879603385925293, + "rewards/margins": 1.2986233234405518, + "rewards/rejected": -4.28658390045166, + "sft_loss": 3.139157772064209, + "step": 4990 + }, + { + "epoch": 2.6733567486201704, + "grad_norm": 16.51138964172436, + "learning_rate": 3.555339513816147e-08, + "logits/chosen": -0.3349289894104004, + "logits/rejected": -0.3230084776878357, + "logps/chosen": -3.0833542346954346, + "logps/rejected": -4.051263809204102, + "loss": 0.5243, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0833542346954346, + "rewards/margins": 0.9679099917411804, + "rewards/rejected": -4.051263809204102, + "sft_loss": 3.2449798583984375, + "step": 4995 + }, + { + "epoch": 2.676032781401572, + "grad_norm": 18.197818314108957, + "learning_rate": 3.497886863713639e-08, + "logits/chosen": -0.34814295172691345, + "logits/rejected": -0.29311561584472656, + "logps/chosen": -3.0884833335876465, + "logps/rejected": -4.1714372634887695, + "loss": 0.5178, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0884833335876465, + "rewards/margins": 1.0829538106918335, + "rewards/rejected": -4.1714372634887695, + "sft_loss": 3.25970458984375, + "step": 5000 + }, + { + "epoch": 2.678708814182974, + "grad_norm": 18.610757371590207, + "learning_rate": 3.440885377316721e-08, + "logits/chosen": -0.26829028129577637, + "logits/rejected": -0.1963682621717453, + "logps/chosen": -3.059337854385376, + "logps/rejected": -4.076889991760254, + "loss": 0.4642, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.059337854385376, + "rewards/margins": 1.0175515413284302, + "rewards/rejected": -4.076889991760254, + "sft_loss": 3.198467969894409, + "step": 5005 + }, + { + "epoch": 2.6813848469643755, + "grad_norm": 23.57394142189509, + "learning_rate": 3.384335607654082e-08, + "logits/chosen": -0.2515251040458679, + "logits/rejected": -0.15543124079704285, + "logps/chosen": -3.1278204917907715, + "logps/rejected": -4.222821235656738, + "loss": 0.4596, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.1278204917907715, + "rewards/margins": 1.0950000286102295, + "rewards/rejected": -4.222821235656738, + "sft_loss": 3.204451322555542, + "step": 5010 + }, + { + "epoch": 2.684060879745777, + "grad_norm": 19.210952164670346, + "learning_rate": 3.328238103371811e-08, + "logits/chosen": -0.3350864052772522, + "logits/rejected": -0.24469581246376038, + "logps/chosen": -3.0523061752319336, + "logps/rejected": -4.216456413269043, + "loss": 0.4467, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0523061752319336, + "rewards/margins": 1.1641499996185303, + "rewards/rejected": -4.216456413269043, + "sft_loss": 3.1259608268737793, + "step": 5015 + }, + { + "epoch": 2.6867369125271785, + "grad_norm": 22.822771825087354, + "learning_rate": 3.272593408728169e-08, + "logits/chosen": -0.3542567491531372, + "logits/rejected": -0.09227250516414642, + "logps/chosen": -2.9733572006225586, + "logps/rejected": -4.103068828582764, + "loss": 0.4809, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9733572006225586, + "rewards/margins": 1.1297115087509155, + "rewards/rejected": -4.103068828582764, + "sft_loss": 3.186063766479492, + "step": 5020 + }, + { + "epoch": 2.6894129453085798, + "grad_norm": 15.633412485679143, + "learning_rate": 3.217402063588204e-08, + "logits/chosen": -0.38056424260139465, + "logits/rejected": -0.2081277072429657, + "logps/chosen": -3.025116443634033, + "logps/rejected": -4.112739086151123, + "loss": 0.4807, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.025116443634033, + "rewards/margins": 1.0876230001449585, + "rewards/rejected": -4.112739086151123, + "sft_loss": 3.1368777751922607, + "step": 5025 + }, + { + "epoch": 2.6920889780899815, + "grad_norm": 15.403511604630026, + "learning_rate": 3.162664603418608e-08, + "logits/chosen": -0.341784805059433, + "logits/rejected": -0.2454969882965088, + "logps/chosen": -2.987480640411377, + "logps/rejected": -4.311043739318848, + "loss": 0.4443, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.987480640411377, + "rewards/margins": 1.323562502861023, + "rewards/rejected": -4.311043739318848, + "sft_loss": 3.0825743675231934, + "step": 5030 + }, + { + "epoch": 2.694765010871383, + "grad_norm": 21.957693002820385, + "learning_rate": 3.1083815592824416e-08, + "logits/chosen": -0.32171764969825745, + "logits/rejected": -0.1897803694009781, + "logps/chosen": -3.0886237621307373, + "logps/rejected": -4.227081775665283, + "loss": 0.4616, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0886237621307373, + "rewards/margins": 1.138457179069519, + "rewards/rejected": -4.227081775665283, + "sft_loss": 3.2400691509246826, + "step": 5035 + }, + { + "epoch": 2.697441043652785, + "grad_norm": 17.284715714096862, + "learning_rate": 3.054553457834053e-08, + "logits/chosen": -0.15596535801887512, + "logits/rejected": -0.1801324039697647, + "logps/chosen": -3.184924364089966, + "logps/rejected": -4.2829718589782715, + "loss": 0.4788, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.184924364089966, + "rewards/margins": 1.0980473756790161, + "rewards/rejected": -4.2829718589782715, + "sft_loss": 3.25410795211792, + "step": 5040 + }, + { + "epoch": 2.700117076434186, + "grad_norm": 21.944048134631043, + "learning_rate": 3.0011808213139036e-08, + "logits/chosen": -0.2575821578502655, + "logits/rejected": -0.25107091665267944, + "logps/chosen": -3.027339220046997, + "logps/rejected": -4.128751277923584, + "loss": 0.4693, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.027339220046997, + "rewards/margins": 1.1014121770858765, + "rewards/rejected": -4.128751277923584, + "sft_loss": 3.14856219291687, + "step": 5045 + }, + { + "epoch": 2.702793109215588, + "grad_norm": 18.66301339111262, + "learning_rate": 2.948264167543568e-08, + "logits/chosen": -0.30721646547317505, + "logits/rejected": -0.22108140587806702, + "logps/chosen": -2.860776424407959, + "logps/rejected": -3.9469971656799316, + "loss": 0.4347, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.860776424407959, + "rewards/margins": 1.0862209796905518, + "rewards/rejected": -3.9469971656799316, + "sft_loss": 3.0134289264678955, + "step": 5050 + }, + { + "epoch": 2.7054691419969896, + "grad_norm": 17.39476431005673, + "learning_rate": 2.8958040099206216e-08, + "logits/chosen": -0.4259369969367981, + "logits/rejected": -0.3169228434562683, + "logps/chosen": -2.816646099090576, + "logps/rejected": -4.042110443115234, + "loss": 0.419, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.816646099090576, + "rewards/margins": 1.2254643440246582, + "rewards/rejected": -4.042110443115234, + "sft_loss": 2.9703478813171387, + "step": 5055 + }, + { + "epoch": 2.708145174778391, + "grad_norm": 19.832856387364917, + "learning_rate": 2.843800857413775e-08, + "logits/chosen": -0.2929435074329376, + "logits/rejected": -0.20484165847301483, + "logps/chosen": -2.9489831924438477, + "logps/rejected": -4.000323295593262, + "loss": 0.4958, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9489831924438477, + "rewards/margins": 1.0513403415679932, + "rewards/rejected": -4.000323295593262, + "sft_loss": 3.115665912628174, + "step": 5060 + }, + { + "epoch": 2.7108212075597926, + "grad_norm": 18.512432157472645, + "learning_rate": 2.7922552145578203e-08, + "logits/chosen": -0.3188219666481018, + "logits/rejected": -0.04534931108355522, + "logps/chosen": -2.9126038551330566, + "logps/rejected": -4.1106038093566895, + "loss": 0.4466, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.9126038551330566, + "rewards/margins": 1.197999358177185, + "rewards/rejected": -4.1106038093566895, + "sft_loss": 3.0568900108337402, + "step": 5065 + }, + { + "epoch": 2.7134972403411943, + "grad_norm": 21.857296108358923, + "learning_rate": 2.7411675814488277e-08, + "logits/chosen": -0.21708163619041443, + "logits/rejected": -0.054964639246463776, + "logps/chosen": -2.9592323303222656, + "logps/rejected": -4.031314849853516, + "loss": 0.4542, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.9592323303222656, + "rewards/margins": 1.0720826387405396, + "rewards/rejected": -4.031314849853516, + "sft_loss": 3.165611505508423, + "step": 5070 + }, + { + "epoch": 2.7161732731225956, + "grad_norm": 25.98677017324065, + "learning_rate": 2.690538453739216e-08, + "logits/chosen": -0.2887173295021057, + "logits/rejected": -0.20963990688323975, + "logps/chosen": -2.968780040740967, + "logps/rejected": -3.817471981048584, + "loss": 0.5555, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.968780040740967, + "rewards/margins": 0.8486918210983276, + "rewards/rejected": -3.817471981048584, + "sft_loss": 3.15004563331604, + "step": 5075 + }, + { + "epoch": 2.7188493059039973, + "grad_norm": 14.823323188091512, + "learning_rate": 2.6403683226330298e-08, + "logits/chosen": -0.35453784465789795, + "logits/rejected": -0.17695707082748413, + "logps/chosen": -3.0053393840789795, + "logps/rejected": -4.123621940612793, + "loss": 0.4883, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.0053393840789795, + "rewards/margins": 1.1182823181152344, + "rewards/rejected": -4.123621940612793, + "sft_loss": 3.142940044403076, + "step": 5080 + }, + { + "epoch": 2.721525338685399, + "grad_norm": 35.681208849165785, + "learning_rate": 2.5906576748810804e-08, + "logits/chosen": -0.3615763783454895, + "logits/rejected": -0.22551564872264862, + "logps/chosen": -2.9433486461639404, + "logps/rejected": -4.234328746795654, + "loss": 0.4195, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.9433486461639404, + "rewards/margins": 1.2909801006317139, + "rewards/rejected": -4.234328746795654, + "sft_loss": 3.0968470573425293, + "step": 5085 + }, + { + "epoch": 2.7242013714668003, + "grad_norm": 19.324635659444645, + "learning_rate": 2.5414069927763016e-08, + "logits/chosen": -0.4254387319087982, + "logits/rejected": -0.21585650742053986, + "logps/chosen": -3.1022112369537354, + "logps/rejected": -4.332705020904541, + "loss": 0.448, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.1022112369537354, + "rewards/margins": 1.2304937839508057, + "rewards/rejected": -4.332705020904541, + "sft_loss": 3.239518404006958, + "step": 5090 + }, + { + "epoch": 2.726877404248202, + "grad_norm": 15.705163473856352, + "learning_rate": 2.4926167541490185e-08, + "logits/chosen": -0.4600273072719574, + "logits/rejected": -0.2306920289993286, + "logps/chosen": -2.96919322013855, + "logps/rejected": -4.197896480560303, + "loss": 0.4428, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.96919322013855, + "rewards/margins": 1.2287031412124634, + "rewards/rejected": -4.197896480560303, + "sft_loss": 3.110826015472412, + "step": 5095 + }, + { + "epoch": 2.7295534370296037, + "grad_norm": 12.42786764570172, + "learning_rate": 2.4442874323623574e-08, + "logits/chosen": -0.24805191159248352, + "logits/rejected": -0.09726305305957794, + "logps/chosen": -3.012068748474121, + "logps/rejected": -4.305748462677002, + "loss": 0.4509, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.012068748474121, + "rewards/margins": 1.2936795949935913, + "rewards/rejected": -4.305748462677002, + "sft_loss": 3.1654953956604004, + "step": 5100 + }, + { + "epoch": 2.7322294698110055, + "grad_norm": 26.94058532427146, + "learning_rate": 2.396419496307589e-08, + "logits/chosen": -0.29765814542770386, + "logits/rejected": -0.11189641803503036, + "logps/chosen": -3.1366753578186035, + "logps/rejected": -4.3091325759887695, + "loss": 0.466, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1366753578186035, + "rewards/margins": 1.1724575757980347, + "rewards/rejected": -4.3091325759887695, + "sft_loss": 3.252049684524536, + "step": 5105 + }, + { + "epoch": 2.7349055025924067, + "grad_norm": 18.48452893303915, + "learning_rate": 2.349013410399653e-08, + "logits/chosen": -0.32929176092147827, + "logits/rejected": -0.19954873621463776, + "logps/chosen": -2.9225192070007324, + "logps/rejected": -4.042097568511963, + "loss": 0.4681, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.9225192070007324, + "rewards/margins": 1.1195781230926514, + "rewards/rejected": -4.042097568511963, + "sft_loss": 3.047149658203125, + "step": 5110 + }, + { + "epoch": 2.7375815353738084, + "grad_norm": 15.784808698281562, + "learning_rate": 2.3020696345725954e-08, + "logits/chosen": -0.38457226753234863, + "logits/rejected": -0.14432287216186523, + "logps/chosen": -3.0213623046875, + "logps/rejected": -4.299278736114502, + "loss": 0.4068, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.0213623046875, + "rewards/margins": 1.2779161930084229, + "rewards/rejected": -4.299278736114502, + "sft_loss": 3.104046583175659, + "step": 5115 + }, + { + "epoch": 2.7402575681552097, + "grad_norm": 22.853353508016887, + "learning_rate": 2.2555886242751398e-08, + "logits/chosen": -0.33879074454307556, + "logits/rejected": -0.2555561065673828, + "logps/chosen": -3.044795513153076, + "logps/rejected": -4.244548320770264, + "loss": 0.4223, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.044795513153076, + "rewards/margins": 1.1997532844543457, + "rewards/rejected": -4.244548320770264, + "sft_loss": 3.1476967334747314, + "step": 5120 + }, + { + "epoch": 2.7429336009366114, + "grad_norm": 27.26187470654718, + "learning_rate": 2.2095708304662453e-08, + "logits/chosen": -0.4439014792442322, + "logits/rejected": -0.19702909886837006, + "logps/chosen": -2.9273324012756348, + "logps/rejected": -4.080560207366943, + "loss": 0.4568, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9273324012756348, + "rewards/margins": 1.1532284021377563, + "rewards/rejected": -4.080560207366943, + "sft_loss": 3.1291260719299316, + "step": 5125 + }, + { + "epoch": 2.745609633718013, + "grad_norm": 19.92107499422407, + "learning_rate": 2.16401669961076e-08, + "logits/chosen": -0.45958924293518066, + "logits/rejected": -0.2524833083152771, + "logps/chosen": -3.048386812210083, + "logps/rejected": -4.28609561920166, + "loss": 0.4498, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.048386812210083, + "rewards/margins": 1.2377086877822876, + "rewards/rejected": -4.28609561920166, + "sft_loss": 3.2454352378845215, + "step": 5130 + }, + { + "epoch": 2.748285666499415, + "grad_norm": 26.367710927243852, + "learning_rate": 2.1189266736750532e-08, + "logits/chosen": -0.2267228662967682, + "logits/rejected": -0.15011325478553772, + "logps/chosen": -3.0078647136688232, + "logps/rejected": -4.023341178894043, + "loss": 0.4932, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0078647136688232, + "rewards/margins": 1.0154768228530884, + "rewards/rejected": -4.023341178894043, + "sft_loss": 3.212428331375122, + "step": 5135 + }, + { + "epoch": 2.750961699280816, + "grad_norm": 22.474552444052417, + "learning_rate": 2.0743011901227623e-08, + "logits/chosen": -0.28271785378456116, + "logits/rejected": -0.10942456871271133, + "logps/chosen": -3.120392322540283, + "logps/rejected": -4.208460807800293, + "loss": 0.4647, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.120392322540283, + "rewards/margins": 1.0880682468414307, + "rewards/rejected": -4.208460807800293, + "sft_loss": 3.176684856414795, + "step": 5140 + }, + { + "epoch": 2.753637732062218, + "grad_norm": 27.186846650364313, + "learning_rate": 2.030140681910508e-08, + "logits/chosen": -0.28622376918792725, + "logits/rejected": -0.1410348266363144, + "logps/chosen": -3.016723871231079, + "logps/rejected": -4.079378128051758, + "loss": 0.4955, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.016723871231079, + "rewards/margins": 1.0626541376113892, + "rewards/rejected": -4.079378128051758, + "sft_loss": 3.183474063873291, + "step": 5145 + }, + { + "epoch": 2.756313764843619, + "grad_norm": 16.057516905741448, + "learning_rate": 1.986445577483753e-08, + "logits/chosen": -0.35685616731643677, + "logits/rejected": -0.2156098634004593, + "logps/chosen": -2.992096424102783, + "logps/rejected": -4.160267353057861, + "loss": 0.4594, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.992096424102783, + "rewards/margins": 1.16817045211792, + "rewards/rejected": -4.160267353057861, + "sft_loss": 3.1283581256866455, + "step": 5150 + }, + { + "epoch": 2.758989797625021, + "grad_norm": 17.13120591922187, + "learning_rate": 1.9432163007725765e-08, + "logits/chosen": -0.3916035592556, + "logits/rejected": -0.28982120752334595, + "logps/chosen": -2.9988656044006348, + "logps/rejected": -4.073007106781006, + "loss": 0.4696, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9988656044006348, + "rewards/margins": 1.0741417407989502, + "rewards/rejected": -4.073007106781006, + "sft_loss": 3.182443380355835, + "step": 5155 + }, + { + "epoch": 2.7616658304064226, + "grad_norm": 14.507481787012965, + "learning_rate": 1.9004532711876297e-08, + "logits/chosen": -0.31555086374282837, + "logits/rejected": -0.2631721794605255, + "logps/chosen": -2.906529188156128, + "logps/rejected": -4.1582770347595215, + "loss": 0.4346, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.906529188156128, + "rewards/margins": 1.2517473697662354, + "rewards/rejected": -4.1582770347595215, + "sft_loss": 3.072540760040283, + "step": 5160 + }, + { + "epoch": 2.7643418631878243, + "grad_norm": 20.585576738264063, + "learning_rate": 1.8581569036159928e-08, + "logits/chosen": -0.3451662063598633, + "logits/rejected": -0.1332363784313202, + "logps/chosen": -2.919412136077881, + "logps/rejected": -4.0757293701171875, + "loss": 0.4401, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.919412136077881, + "rewards/margins": 1.1563172340393066, + "rewards/rejected": -4.0757293701171875, + "sft_loss": 3.012556552886963, + "step": 5165 + }, + { + "epoch": 2.7670178959692255, + "grad_norm": 16.93919875228968, + "learning_rate": 1.8163276084172285e-08, + "logits/chosen": -0.3259938955307007, + "logits/rejected": -0.17566534876823425, + "logps/chosen": -3.0338306427001953, + "logps/rejected": -4.260069370269775, + "loss": 0.4234, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0338306427001953, + "rewards/margins": 1.2262380123138428, + "rewards/rejected": -4.260069370269775, + "sft_loss": 3.2258095741271973, + "step": 5170 + }, + { + "epoch": 2.7696939287506273, + "grad_norm": 17.058168897094994, + "learning_rate": 1.7749657914193194e-08, + "logits/chosen": -0.2931608557701111, + "logits/rejected": -0.18977099657058716, + "logps/chosen": -3.1517324447631836, + "logps/rejected": -4.432768821716309, + "loss": 0.3983, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.1517324447631836, + "rewards/margins": 1.281036615371704, + "rewards/rejected": -4.432768821716309, + "sft_loss": 3.2231268882751465, + "step": 5175 + }, + { + "epoch": 2.7723699615320285, + "grad_norm": 23.569625986902015, + "learning_rate": 1.7340718539148203e-08, + "logits/chosen": -0.248325914144516, + "logits/rejected": -0.17798493802547455, + "logps/chosen": -3.194131374359131, + "logps/rejected": -4.191938877105713, + "loss": 0.5005, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.194131374359131, + "rewards/margins": 0.9978069067001343, + "rewards/rejected": -4.191938877105713, + "sft_loss": 3.4025511741638184, + "step": 5180 + }, + { + "epoch": 2.7750459943134302, + "grad_norm": 17.11771675584726, + "learning_rate": 1.6936461926568724e-08, + "logits/chosen": -0.2721349895000458, + "logits/rejected": -0.11362195014953613, + "logps/chosen": -2.9763708114624023, + "logps/rejected": -4.256707191467285, + "loss": 0.4714, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9763708114624023, + "rewards/margins": 1.2803370952606201, + "rewards/rejected": -4.256707191467285, + "sft_loss": 3.202526807785034, + "step": 5185 + }, + { + "epoch": 2.777722027094832, + "grad_norm": 21.237414221087953, + "learning_rate": 1.6536891998554346e-08, + "logits/chosen": -0.3889920711517334, + "logits/rejected": -0.19775991141796112, + "logps/chosen": -2.9420952796936035, + "logps/rejected": -4.098598957061768, + "loss": 0.4557, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.9420952796936035, + "rewards/margins": 1.1565032005310059, + "rewards/rejected": -4.098598957061768, + "sft_loss": 3.1482629776000977, + "step": 5190 + }, + { + "epoch": 2.7803980598762337, + "grad_norm": 21.637278412651288, + "learning_rate": 1.6142012631734093e-08, + "logits/chosen": -0.2731543183326721, + "logits/rejected": -0.10965131223201752, + "logps/chosen": -3.038461446762085, + "logps/rejected": -4.216440677642822, + "loss": 0.4455, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.038461446762085, + "rewards/margins": 1.177978754043579, + "rewards/rejected": -4.216440677642822, + "sft_loss": 3.1527132987976074, + "step": 5195 + }, + { + "epoch": 2.783074092657635, + "grad_norm": 24.901410013852114, + "learning_rate": 1.575182765722949e-08, + "logits/chosen": -0.40109768509864807, + "logits/rejected": -0.21919843554496765, + "logps/chosen": -2.9902687072753906, + "logps/rejected": -4.239418029785156, + "loss": 0.439, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9902687072753906, + "rewards/margins": 1.2491494417190552, + "rewards/rejected": -4.239418029785156, + "sft_loss": 3.18410062789917, + "step": 5200 + }, + { + "epoch": 2.783074092657635, + "eval_logits/chosen": 0.03516725078225136, + "eval_logits/rejected": 0.14618034660816193, + "eval_logps/chosen": -3.1772398948669434, + "eval_logps/rejected": -4.1641154289245605, + "eval_loss": 0.5584015250205994, + "eval_rewards/accuracies": 0.721068263053894, + "eval_rewards/chosen": -3.1772398948669434, + "eval_rewards/margins": 0.9868756532669067, + "eval_rewards/rejected": -4.1641154289245605, + "eval_runtime": 51.25, + "eval_samples_per_second": 26.244, + "eval_sft_loss": 3.3039963245391846, + "eval_steps_per_second": 6.576, + "step": 5200 + }, + { + "epoch": 2.7857501254390367, + "grad_norm": 11.349911011374502, + "learning_rate": 1.536634086061672e-08, + "logits/chosen": -0.28439879417419434, + "logits/rejected": -0.2091202735900879, + "logps/chosen": -2.9650275707244873, + "logps/rejected": -4.147830963134766, + "loss": 0.4494, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.9650275707244873, + "rewards/margins": 1.1828030347824097, + "rewards/rejected": -4.147830963134766, + "sft_loss": 3.0423130989074707, + "step": 5205 + }, + { + "epoch": 2.788426158220438, + "grad_norm": 15.93567970311688, + "learning_rate": 1.4985555981890495e-08, + "logits/chosen": -0.3161846101284027, + "logits/rejected": -0.19209876656532288, + "logps/chosen": -3.036440372467041, + "logps/rejected": -4.228138446807861, + "loss": 0.4694, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.036440372467041, + "rewards/margins": 1.1916977167129517, + "rewards/rejected": -4.228138446807861, + "sft_loss": 3.137333393096924, + "step": 5210 + }, + { + "epoch": 2.7911021910018396, + "grad_norm": 14.87570642794917, + "learning_rate": 1.4609476715427226e-08, + "logits/chosen": -0.3232470154762268, + "logits/rejected": -0.22589445114135742, + "logps/chosen": -2.9099388122558594, + "logps/rejected": -4.149899005889893, + "loss": 0.4381, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.9099388122558594, + "rewards/margins": 1.239959955215454, + "rewards/rejected": -4.149899005889893, + "sft_loss": 3.07240629196167, + "step": 5215 + }, + { + "epoch": 2.7937782237832414, + "grad_norm": 15.708089383825824, + "learning_rate": 1.4238106709949792e-08, + "logits/chosen": -0.37510597705841064, + "logits/rejected": -0.2620985805988312, + "logps/chosen": -2.993945598602295, + "logps/rejected": -4.423148155212402, + "loss": 0.392, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.993945598602295, + "rewards/margins": 1.4292032718658447, + "rewards/rejected": -4.423148155212402, + "sft_loss": 3.146930694580078, + "step": 5220 + }, + { + "epoch": 2.796454256564643, + "grad_norm": 23.91438518388687, + "learning_rate": 1.3871449568491511e-08, + "logits/chosen": -0.3168920874595642, + "logits/rejected": -0.13376422226428986, + "logps/chosen": -3.074483633041382, + "logps/rejected": -4.210515975952148, + "loss": 0.473, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.074483633041382, + "rewards/margins": 1.136031985282898, + "rewards/rejected": -4.210515975952148, + "sft_loss": 3.172656297683716, + "step": 5225 + }, + { + "epoch": 2.7991302893460444, + "grad_norm": 13.810781848975232, + "learning_rate": 1.3509508848361606e-08, + "logits/chosen": -0.4242118299007416, + "logits/rejected": -0.250504732131958, + "logps/chosen": -3.033071756362915, + "logps/rejected": -4.1675004959106445, + "loss": 0.4504, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.033071756362915, + "rewards/margins": 1.1344287395477295, + "rewards/rejected": -4.1675004959106445, + "sft_loss": 3.1074013710021973, + "step": 5230 + }, + { + "epoch": 2.801806322127446, + "grad_norm": 16.266341458942986, + "learning_rate": 1.3152288061110517e-08, + "logits/chosen": -0.40003085136413574, + "logits/rejected": -0.2384142428636551, + "logps/chosen": -2.941970109939575, + "logps/rejected": -4.156831741333008, + "loss": 0.438, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.941970109939575, + "rewards/margins": 1.2148616313934326, + "rewards/rejected": -4.156831741333008, + "sft_loss": 3.055846691131592, + "step": 5235 + }, + { + "epoch": 2.804482354908848, + "grad_norm": 18.466590109043757, + "learning_rate": 1.2799790672495814e-08, + "logits/chosen": -0.36745721101760864, + "logits/rejected": -0.1435421258211136, + "logps/chosen": -3.0547235012054443, + "logps/rejected": -4.208713531494141, + "loss": 0.4631, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.0547235012054443, + "rewards/margins": 1.1539896726608276, + "rewards/rejected": -4.208713531494141, + "sft_loss": 3.1727612018585205, + "step": 5240 + }, + { + "epoch": 2.807158387690249, + "grad_norm": 20.405521035426105, + "learning_rate": 1.2452020102448835e-08, + "logits/chosen": -0.31539779901504517, + "logits/rejected": -0.24084322154521942, + "logps/chosen": -2.9838807582855225, + "logps/rejected": -4.0755181312561035, + "loss": 0.4832, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9838807582855225, + "rewards/margins": 1.0916370153427124, + "rewards/rejected": -4.0755181312561035, + "sft_loss": 3.1585214138031006, + "step": 5245 + }, + { + "epoch": 2.8098344204716508, + "grad_norm": 29.4394596025172, + "learning_rate": 1.2108979725041103e-08, + "logits/chosen": -0.40207844972610474, + "logits/rejected": -0.2369350641965866, + "logps/chosen": -3.1063499450683594, + "logps/rejected": -4.314638614654541, + "loss": 0.458, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1063499450683594, + "rewards/margins": 1.2082884311676025, + "rewards/rejected": -4.314638614654541, + "sft_loss": 3.2730610370635986, + "step": 5250 + }, + { + "epoch": 2.8125104532530525, + "grad_norm": 19.81157748000706, + "learning_rate": 1.1770672868451958e-08, + "logits/chosen": -0.3515699505805969, + "logits/rejected": -0.11190332472324371, + "logps/chosen": -3.1851155757904053, + "logps/rejected": -4.409258842468262, + "loss": 0.4359, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.1851155757904053, + "rewards/margins": 1.224143385887146, + "rewards/rejected": -4.409258842468262, + "sft_loss": 3.227384090423584, + "step": 5255 + }, + { + "epoch": 2.8151864860344538, + "grad_norm": 21.971960987506325, + "learning_rate": 1.1437102814935872e-08, + "logits/chosen": -0.31455013155937195, + "logits/rejected": -0.23573537170886993, + "logps/chosen": -3.047395706176758, + "logps/rejected": -4.163751602172852, + "loss": 0.4894, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.047395706176758, + "rewards/margins": 1.1163556575775146, + "rewards/rejected": -4.163751602172852, + "sft_loss": 3.3204185962677, + "step": 5260 + }, + { + "epoch": 2.8178625188158555, + "grad_norm": 16.779204925233334, + "learning_rate": 1.1108272800791018e-08, + "logits/chosen": -0.44790953397750854, + "logits/rejected": -0.18838870525360107, + "logps/chosen": -3.199179172515869, + "logps/rejected": -4.315297603607178, + "loss": 0.4731, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.199179172515869, + "rewards/margins": 1.1161186695098877, + "rewards/rejected": -4.315297603607178, + "sft_loss": 3.3300118446350098, + "step": 5265 + }, + { + "epoch": 2.820538551597257, + "grad_norm": 22.629600760447133, + "learning_rate": 1.078418601632769e-08, + "logits/chosen": -0.2885778844356537, + "logits/rejected": -0.12899121642112732, + "logps/chosen": -3.0951497554779053, + "logps/rejected": -4.30334997177124, + "loss": 0.4359, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.0951497554779053, + "rewards/margins": 1.208200216293335, + "rewards/rejected": -4.30334997177124, + "sft_loss": 3.252854585647583, + "step": 5270 + }, + { + "epoch": 2.8232145843786585, + "grad_norm": 14.960965202827843, + "learning_rate": 1.0464845605837159e-08, + "logits/chosen": -0.33523792028427124, + "logits/rejected": -0.17725275456905365, + "logps/chosen": -3.1233747005462646, + "logps/rejected": -4.302353858947754, + "loss": 0.4212, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.1233747005462646, + "rewards/margins": 1.1789793968200684, + "rewards/rejected": -4.302353858947754, + "sft_loss": 3.1863956451416016, + "step": 5275 + }, + { + "epoch": 2.82589061716006, + "grad_norm": 15.585561296051177, + "learning_rate": 1.0150254667561642e-08, + "logits/chosen": -0.3423822224140167, + "logits/rejected": -0.12648749351501465, + "logps/chosen": -3.2453339099884033, + "logps/rejected": -4.551573753356934, + "loss": 0.4263, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.2453339099884033, + "rewards/margins": 1.306240200996399, + "rewards/rejected": -4.551573753356934, + "sft_loss": 3.2849297523498535, + "step": 5280 + }, + { + "epoch": 2.828566649941462, + "grad_norm": 24.343265168221777, + "learning_rate": 9.840416253663719e-09, + "logits/chosen": -0.3864888548851013, + "logits/rejected": -0.25290876626968384, + "logps/chosen": -3.046827793121338, + "logps/rejected": -4.351868629455566, + "loss": 0.4402, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.046827793121338, + "rewards/margins": 1.3050403594970703, + "rewards/rejected": -4.351868629455566, + "sft_loss": 3.154933452606201, + "step": 5285 + }, + { + "epoch": 2.8312426827228636, + "grad_norm": 16.948247904423653, + "learning_rate": 9.535333370197074e-09, + "logits/chosen": -0.32873716950416565, + "logits/rejected": -0.15873248875141144, + "logps/chosen": -3.001735210418701, + "logps/rejected": -4.202784538269043, + "loss": 0.4276, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.001735210418701, + "rewards/margins": 1.2010494470596313, + "rewards/rejected": -4.202784538269043, + "sft_loss": 3.209874391555786, + "step": 5290 + }, + { + "epoch": 2.833918715504265, + "grad_norm": 15.834554738414026, + "learning_rate": 9.23500897707713e-09, + "logits/chosen": -0.3988923132419586, + "logits/rejected": -0.18734273314476013, + "logps/chosen": -3.2281811237335205, + "logps/rejected": -4.491852760314941, + "loss": 0.4471, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.2281811237335205, + "rewards/margins": 1.2636712789535522, + "rewards/rejected": -4.491852760314941, + "sft_loss": 3.325453281402588, + "step": 5295 + }, + { + "epoch": 2.8365947482856666, + "grad_norm": 20.48925827707505, + "learning_rate": 8.939445988052574e-09, + "logits/chosen": -0.3241375982761383, + "logits/rejected": -0.24348489940166473, + "logps/chosen": -3.0451343059539795, + "logps/rejected": -4.30004358291626, + "loss": 0.4306, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.0451343059539795, + "rewards/margins": 1.2549090385437012, + "rewards/rejected": -4.30004358291626, + "sft_loss": 3.1367976665496826, + "step": 5300 + }, + { + "epoch": 2.839270781067068, + "grad_norm": 31.491521093892707, + "learning_rate": 8.648647270676656e-09, + "logits/chosen": -0.31725817918777466, + "logits/rejected": -0.17649342119693756, + "logps/chosen": -3.1203866004943848, + "logps/rejected": -4.242855072021484, + "loss": 0.4812, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1203866004943848, + "rewards/margins": 1.1224688291549683, + "rewards/rejected": -4.242855072021484, + "sft_loss": 3.316706895828247, + "step": 5305 + }, + { + "epoch": 2.8419468138484696, + "grad_norm": 17.25842303345134, + "learning_rate": 8.362615646279991e-09, + "logits/chosen": -0.4709432125091553, + "logits/rejected": -0.22131678462028503, + "logps/chosen": -3.094914674758911, + "logps/rejected": -4.457337856292725, + "loss": 0.4612, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.094914674758911, + "rewards/margins": 1.362423062324524, + "rewards/rejected": -4.457337856292725, + "sft_loss": 3.228952407836914, + "step": 5310 + }, + { + "epoch": 2.8446228466298713, + "grad_norm": 35.861043164824565, + "learning_rate": 8.081353889942466e-09, + "logits/chosen": -0.2745920419692993, + "logits/rejected": -0.07384338974952698, + "logps/chosen": -3.1135764122009277, + "logps/rejected": -4.157094478607178, + "loss": 0.4824, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1135764122009277, + "rewards/margins": 1.0435179471969604, + "rewards/rejected": -4.157094478607178, + "sft_loss": 3.250995635986328, + "step": 5315 + }, + { + "epoch": 2.847298879411273, + "grad_norm": 21.065820661965578, + "learning_rate": 7.804864730467042e-09, + "logits/chosen": -0.2862524092197418, + "logits/rejected": -0.20901036262512207, + "logps/chosen": -3.0392966270446777, + "logps/rejected": -4.194875240325928, + "loss": 0.4305, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.0392966270446777, + "rewards/margins": 1.155578374862671, + "rewards/rejected": -4.194875240325928, + "sft_loss": 3.0931644439697266, + "step": 5320 + }, + { + "epoch": 2.8499749121926743, + "grad_norm": 14.551933906690897, + "learning_rate": 7.533150850352665e-09, + "logits/chosen": -0.3247146010398865, + "logits/rejected": -0.1293286234140396, + "logps/chosen": -3.0812928676605225, + "logps/rejected": -4.426455497741699, + "loss": 0.3984, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.0812928676605225, + "rewards/margins": 1.345163106918335, + "rewards/rejected": -4.426455497741699, + "sft_loss": 3.193646192550659, + "step": 5325 + }, + { + "epoch": 2.852650944974076, + "grad_norm": 21.83556838472186, + "learning_rate": 7.2662148857686175e-09, + "logits/chosen": -0.2696150839328766, + "logits/rejected": -0.16448882222175598, + "logps/chosen": -2.9790823459625244, + "logps/rejected": -4.206761360168457, + "loss": 0.4807, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9790823459625244, + "rewards/margins": 1.227678894996643, + "rewards/rejected": -4.206761360168457, + "sft_loss": 3.2123515605926514, + "step": 5330 + }, + { + "epoch": 2.8553269777554773, + "grad_norm": 14.238067056693254, + "learning_rate": 7.0040594265287635e-09, + "logits/chosen": -0.23434197902679443, + "logits/rejected": -0.2795167863368988, + "logps/chosen": -2.9896132946014404, + "logps/rejected": -3.9661147594451904, + "loss": 0.5082, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.9896132946014404, + "rewards/margins": 0.9765016436576843, + "rewards/rejected": -3.9661147594451904, + "sft_loss": 3.145164728164673, + "step": 5335 + }, + { + "epoch": 2.858003010536879, + "grad_norm": 15.629585462898145, + "learning_rate": 6.746687016066566e-09, + "logits/chosen": -0.29780083894729614, + "logits/rejected": -0.22754064202308655, + "logps/chosen": -2.9592719078063965, + "logps/rejected": -4.178595542907715, + "loss": 0.453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9592719078063965, + "rewards/margins": 1.219322919845581, + "rewards/rejected": -4.178595542907715, + "sft_loss": 3.040536642074585, + "step": 5340 + }, + { + "epoch": 2.8606790433182807, + "grad_norm": 18.825198907230106, + "learning_rate": 6.494100151410276e-09, + "logits/chosen": -0.43763312697410583, + "logits/rejected": -0.22980007529258728, + "logps/chosen": -3.007265090942383, + "logps/rejected": -4.1483659744262695, + "loss": 0.4454, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.007265090942383, + "rewards/margins": 1.1411011219024658, + "rewards/rejected": -4.1483659744262695, + "sft_loss": 3.1913371086120605, + "step": 5345 + }, + { + "epoch": 2.8633550760996824, + "grad_norm": 24.09754601456622, + "learning_rate": 6.246301283158728e-09, + "logits/chosen": -0.25056177377700806, + "logits/rejected": -0.24701841175556183, + "logps/chosen": -3.0743985176086426, + "logps/rejected": -4.054999351501465, + "loss": 0.5204, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0743985176086426, + "rewards/margins": 0.9806006550788879, + "rewards/rejected": -4.054999351501465, + "sft_loss": 3.1449050903320312, + "step": 5350 + }, + { + "epoch": 2.8660311088810837, + "grad_norm": 16.841622362463834, + "learning_rate": 6.0032928154576944e-09, + "logits/chosen": -0.327300488948822, + "logits/rejected": -0.24840061366558075, + "logps/chosen": -3.0672824382781982, + "logps/rejected": -4.048326015472412, + "loss": 0.4871, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.0672824382781982, + "rewards/margins": 0.981043815612793, + "rewards/rejected": -4.048326015472412, + "sft_loss": 3.1784448623657227, + "step": 5355 + }, + { + "epoch": 2.8687071416624854, + "grad_norm": 24.06564336912586, + "learning_rate": 5.76507710597629e-09, + "logits/chosen": -0.3746258020401001, + "logits/rejected": -0.16087034344673157, + "logps/chosen": -3.024641752243042, + "logps/rejected": -4.097342014312744, + "loss": 0.4835, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.024641752243042, + "rewards/margins": 1.0727002620697021, + "rewards/rejected": -4.097342014312744, + "sft_loss": 3.2068710327148438, + "step": 5360 + }, + { + "epoch": 2.8713831744438867, + "grad_norm": 13.811767973628212, + "learning_rate": 5.531656465884438e-09, + "logits/chosen": -0.39718228578567505, + "logits/rejected": -0.23315775394439697, + "logps/chosen": -3.001275062561035, + "logps/rejected": -4.263126373291016, + "loss": 0.4283, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.001275062561035, + "rewards/margins": 1.2618507146835327, + "rewards/rejected": -4.263126373291016, + "sft_loss": 3.128613233566284, + "step": 5365 + }, + { + "epoch": 2.8740592072252884, + "grad_norm": 19.206923805132742, + "learning_rate": 5.303033159830217e-09, + "logits/chosen": -0.24512071907520294, + "logits/rejected": -0.19996261596679688, + "logps/chosen": -3.0810763835906982, + "logps/rejected": -4.056224346160889, + "loss": 0.5042, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0810763835906982, + "rewards/margins": 0.97514808177948, + "rewards/rejected": -4.056224346160889, + "sft_loss": 3.3016300201416016, + "step": 5370 + }, + { + "epoch": 2.87673524000669, + "grad_norm": 18.151418865502585, + "learning_rate": 5.079209405917939e-09, + "logits/chosen": -0.32794076204299927, + "logits/rejected": -0.21797947585582733, + "logps/chosen": -2.933988094329834, + "logps/rejected": -4.380335807800293, + "loss": 0.4164, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.933988094329834, + "rewards/margins": 1.446347951889038, + "rewards/rejected": -4.380335807800293, + "sft_loss": 3.140775203704834, + "step": 5375 + }, + { + "epoch": 2.879411272788092, + "grad_norm": 19.789281082524955, + "learning_rate": 4.860187375686664e-09, + "logits/chosen": -0.39932963252067566, + "logits/rejected": -0.13446304202079773, + "logps/chosen": -3.127798557281494, + "logps/rejected": -4.366634368896484, + "loss": 0.4286, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.127798557281494, + "rewards/margins": 1.2388359308242798, + "rewards/rejected": -4.366634368896484, + "sft_loss": 3.287930965423584, + "step": 5380 + }, + { + "epoch": 2.882087305569493, + "grad_norm": 17.89650539798015, + "learning_rate": 4.64596919408905e-09, + "logits/chosen": -0.29105740785598755, + "logits/rejected": -0.18979479372501373, + "logps/chosen": -2.984229803085327, + "logps/rejected": -4.134560585021973, + "loss": 0.4395, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.984229803085327, + "rewards/margins": 1.1503304243087769, + "rewards/rejected": -4.134560585021973, + "sft_loss": 3.2013649940490723, + "step": 5385 + }, + { + "epoch": 2.884763338350895, + "grad_norm": 16.136346966785975, + "learning_rate": 4.436556939470814e-09, + "logits/chosen": -0.3459378182888031, + "logits/rejected": -0.17318777740001678, + "logps/chosen": -3.1791253089904785, + "logps/rejected": -4.12210750579834, + "loss": 0.5108, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.1791253089904785, + "rewards/margins": 0.9429818987846375, + "rewards/rejected": -4.12210750579834, + "sft_loss": 3.3328216075897217, + "step": 5390 + }, + { + "epoch": 2.887439371132296, + "grad_norm": 15.568184572108814, + "learning_rate": 4.23195264355064e-09, + "logits/chosen": -0.4857475161552429, + "logits/rejected": -0.24215015769004822, + "logps/chosen": -3.0089166164398193, + "logps/rejected": -4.201539516448975, + "loss": 0.433, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.0089166164398193, + "rewards/margins": 1.1926231384277344, + "rewards/rejected": -4.201539516448975, + "sft_loss": 3.1859726905822754, + "step": 5395 + }, + { + "epoch": 2.890115403913698, + "grad_norm": 19.49373403612521, + "learning_rate": 4.032158291400245e-09, + "logits/chosen": -0.3905830979347229, + "logits/rejected": -0.1188025027513504, + "logps/chosen": -2.917149543762207, + "logps/rejected": -4.4046220779418945, + "loss": 0.3984, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.917149543762207, + "rewards/margins": 1.4874722957611084, + "rewards/rejected": -4.4046220779418945, + "sft_loss": 2.99169921875, + "step": 5400 + }, + { + "epoch": 2.8927914366950995, + "grad_norm": 18.319127277349217, + "learning_rate": 3.837175821425398e-09, + "logits/chosen": -0.27997198700904846, + "logits/rejected": -0.21191298961639404, + "logps/chosen": -3.1477558612823486, + "logps/rejected": -4.176412582397461, + "loss": 0.5048, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.1477558612823486, + "rewards/margins": 1.0286567211151123, + "rewards/rejected": -4.176412582397461, + "sft_loss": 3.182608127593994, + "step": 5405 + }, + { + "epoch": 2.8954674694765012, + "grad_norm": 13.206043980986074, + "learning_rate": 3.6470071253467683e-09, + "logits/chosen": -0.339926540851593, + "logits/rejected": -0.19760119915008545, + "logps/chosen": -3.116013765335083, + "logps/rejected": -4.474963665008545, + "loss": 0.4585, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.116013765335083, + "rewards/margins": 1.3589502573013306, + "rewards/rejected": -4.474963665008545, + "sft_loss": 3.2682881355285645, + "step": 5410 + }, + { + "epoch": 2.8981435022579025, + "grad_norm": 12.092949101875329, + "learning_rate": 3.461654048181939e-09, + "logits/chosen": -0.35985273122787476, + "logits/rejected": -0.1344916820526123, + "logps/chosen": -3.164681911468506, + "logps/rejected": -4.202638626098633, + "loss": 0.4885, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.164681911468506, + "rewards/margins": 1.0379563570022583, + "rewards/rejected": -4.202638626098633, + "sft_loss": 3.3962695598602295, + "step": 5415 + }, + { + "epoch": 2.9008195350393042, + "grad_norm": 15.77643103729951, + "learning_rate": 3.281118388227255e-09, + "logits/chosen": -0.31049636006355286, + "logits/rejected": -0.22904932498931885, + "logps/chosen": -3.153508424758911, + "logps/rejected": -4.185422420501709, + "loss": 0.5176, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.153508424758911, + "rewards/margins": 1.0319141149520874, + "rewards/rejected": -4.185422420501709, + "sft_loss": 3.297576427459717, + "step": 5420 + }, + { + "epoch": 2.903495567820706, + "grad_norm": 19.787753694119417, + "learning_rate": 3.1054018970405048e-09, + "logits/chosen": -0.30364230275154114, + "logits/rejected": -0.15516886115074158, + "logps/chosen": -3.094975709915161, + "logps/rejected": -4.400447368621826, + "loss": 0.4194, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.094975709915161, + "rewards/margins": 1.3054720163345337, + "rewards/rejected": -4.400447368621826, + "sft_loss": 3.280155658721924, + "step": 5425 + }, + { + "epoch": 2.906171600602107, + "grad_norm": 15.444519909963496, + "learning_rate": 2.9345062794238207e-09, + "logits/chosen": -0.35556572675704956, + "logits/rejected": -0.14741787314414978, + "logps/chosen": -3.0733776092529297, + "logps/rejected": -4.3367719650268555, + "loss": 0.4164, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.0733776092529297, + "rewards/margins": 1.2633943557739258, + "rewards/rejected": -4.3367719650268555, + "sft_loss": 3.206371307373047, + "step": 5430 + }, + { + "epoch": 2.908847633383509, + "grad_norm": 18.64094817357154, + "learning_rate": 2.7684331934072492e-09, + "logits/chosen": -0.4322203993797302, + "logits/rejected": -0.3308923840522766, + "logps/chosen": -2.9811339378356934, + "logps/rejected": -4.230893135070801, + "loss": 0.4331, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9811339378356934, + "rewards/margins": 1.2497590780258179, + "rewards/rejected": -4.230893135070801, + "sft_loss": 3.179506540298462, + "step": 5435 + }, + { + "epoch": 2.9115236661649107, + "grad_norm": 13.419565857750332, + "learning_rate": 2.6071842502326526e-09, + "logits/chosen": -0.3933342397212982, + "logits/rejected": -0.23518791794776917, + "logps/chosen": -3.0282037258148193, + "logps/rejected": -4.129472255706787, + "loss": 0.4647, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0282037258148193, + "rewards/margins": 1.1012687683105469, + "rewards/rejected": -4.129472255706787, + "sft_loss": 3.158400774002075, + "step": 5440 + }, + { + "epoch": 2.9141996989463124, + "grad_norm": 20.2744413481314, + "learning_rate": 2.450761014337888e-09, + "logits/chosen": -0.19685646891593933, + "logits/rejected": -0.13014453649520874, + "logps/chosen": -3.0012104511260986, + "logps/rejected": -4.314382076263428, + "loss": 0.4723, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0012104511260986, + "rewards/margins": 1.31317138671875, + "rewards/rejected": -4.314382076263428, + "sft_loss": 3.1396377086639404, + "step": 5445 + }, + { + "epoch": 2.9168757317277136, + "grad_norm": 20.60754015834533, + "learning_rate": 2.299165003341985e-09, + "logits/chosen": -0.2115621566772461, + "logits/rejected": -0.11249543726444244, + "logps/chosen": -3.1051881313323975, + "logps/rejected": -4.277671813964844, + "loss": 0.4395, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.1051881313323975, + "rewards/margins": 1.1724836826324463, + "rewards/rejected": -4.277671813964844, + "sft_loss": 3.2242259979248047, + "step": 5450 + }, + { + "epoch": 2.9195517645091154, + "grad_norm": 17.604786632420204, + "learning_rate": 2.1523976880299945e-09, + "logits/chosen": -0.390419602394104, + "logits/rejected": -0.1984190046787262, + "logps/chosen": -3.1073858737945557, + "logps/rejected": -4.138300895690918, + "loss": 0.4884, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.1073858737945557, + "rewards/margins": 1.030915379524231, + "rewards/rejected": -4.138300895690918, + "sft_loss": 3.204761028289795, + "step": 5455 + }, + { + "epoch": 2.9222277972905166, + "grad_norm": 13.87575549735312, + "learning_rate": 2.010460492339161e-09, + "logits/chosen": -0.34086284041404724, + "logits/rejected": -0.1693355292081833, + "logps/chosen": -2.8941614627838135, + "logps/rejected": -4.1378021240234375, + "loss": 0.4606, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8941614627838135, + "rewards/margins": 1.2436411380767822, + "rewards/rejected": -4.1378021240234375, + "sft_loss": 3.085679531097412, + "step": 5460 + }, + { + "epoch": 2.9249038300719183, + "grad_norm": 12.381814139095438, + "learning_rate": 1.8733547933446614e-09, + "logits/chosen": -0.4181889593601227, + "logits/rejected": -0.18107159435749054, + "logps/chosen": -3.1676297187805176, + "logps/rejected": -4.195403099060059, + "loss": 0.4951, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.1676297187805176, + "rewards/margins": 1.0277737379074097, + "rewards/rejected": -4.195403099060059, + "sft_loss": 3.2065887451171875, + "step": 5465 + }, + { + "epoch": 2.92757986285332, + "grad_norm": 29.507089109713135, + "learning_rate": 1.7410819212467231e-09, + "logits/chosen": -0.33318883180618286, + "logits/rejected": -0.24138236045837402, + "logps/chosen": -3.0527524948120117, + "logps/rejected": -4.077324867248535, + "loss": 0.4885, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.0527524948120117, + "rewards/margins": 1.0245723724365234, + "rewards/rejected": -4.077324867248535, + "sft_loss": 3.267174482345581, + "step": 5470 + }, + { + "epoch": 2.9302558956347218, + "grad_norm": 15.988901477420473, + "learning_rate": 1.613643159357192e-09, + "logits/chosen": -0.2998291850090027, + "logits/rejected": -0.33957356214523315, + "logps/chosen": -2.969876766204834, + "logps/rejected": -3.991947889328003, + "loss": 0.484, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.969876766204834, + "rewards/margins": 1.022071361541748, + "rewards/rejected": -3.991947889328003, + "sft_loss": 3.1517035961151123, + "step": 5475 + }, + { + "epoch": 2.932931928416123, + "grad_norm": 18.14887590672031, + "learning_rate": 1.4910397440875967e-09, + "logits/chosen": -0.33400124311447144, + "logits/rejected": -0.20043723285198212, + "logps/chosen": -3.100147247314453, + "logps/rejected": -4.258397102355957, + "loss": 0.4754, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.100147247314453, + "rewards/margins": 1.1582508087158203, + "rewards/rejected": -4.258397102355957, + "sft_loss": 3.2473278045654297, + "step": 5480 + }, + { + "epoch": 2.9356079611975248, + "grad_norm": 20.02783237572497, + "learning_rate": 1.3732728649368253e-09, + "logits/chosen": -0.26498645544052124, + "logits/rejected": -0.06162431836128235, + "logps/chosen": -2.9117674827575684, + "logps/rejected": -3.9550652503967285, + "loss": 0.4418, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.9117674827575684, + "rewards/margins": 1.0432971715927124, + "rewards/rejected": -3.9550652503967285, + "sft_loss": 3.029630661010742, + "step": 5485 + }, + { + "epoch": 2.938283993978926, + "grad_norm": 19.7209031712253, + "learning_rate": 1.260343664479524e-09, + "logits/chosen": -0.3522023856639862, + "logits/rejected": -0.2869417071342468, + "logps/chosen": -2.9644052982330322, + "logps/rejected": -4.142910957336426, + "loss": 0.4551, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9644052982330322, + "rewards/margins": 1.1785061359405518, + "rewards/rejected": -4.142910957336426, + "sft_loss": 3.2182540893554688, + "step": 5490 + }, + { + "epoch": 2.9409600267603278, + "grad_norm": 12.521407433641665, + "learning_rate": 1.1522532383554384e-09, + "logits/chosen": -0.41579556465148926, + "logits/rejected": -0.19478026032447815, + "logps/chosen": -2.9432249069213867, + "logps/rejected": -4.327273845672607, + "loss": 0.4058, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.9432249069213867, + "rewards/margins": 1.3840488195419312, + "rewards/rejected": -4.327273845672607, + "sft_loss": 3.180227756500244, + "step": 5495 + }, + { + "epoch": 2.9436360595417295, + "grad_norm": 13.05754787349225, + "learning_rate": 1.049002635258256e-09, + "logits/chosen": -0.265876829624176, + "logits/rejected": -0.14506739377975464, + "logps/chosen": -3.11197566986084, + "logps/rejected": -4.149270534515381, + "loss": 0.4817, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.11197566986084, + "rewards/margins": 1.037294864654541, + "rewards/rejected": -4.149270534515381, + "sft_loss": 3.2263476848602295, + "step": 5500 + }, + { + "epoch": 2.946312092323131, + "grad_norm": 19.91544793700982, + "learning_rate": 9.505928569258358e-10, + "logits/chosen": -0.2848760783672333, + "logits/rejected": -0.2564757466316223, + "logps/chosen": -3.073355197906494, + "logps/rejected": -4.214011192321777, + "loss": 0.4578, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.073355197906494, + "rewards/margins": 1.1406558752059937, + "rewards/rejected": -4.214011192321777, + "sft_loss": 3.2507076263427734, + "step": 5505 + }, + { + "epoch": 2.9489881251045325, + "grad_norm": 17.813566649956503, + "learning_rate": 8.57024858130273e-10, + "logits/chosen": -0.373394250869751, + "logits/rejected": -0.20408591628074646, + "logps/chosen": -3.0406758785247803, + "logps/rejected": -4.433874130249023, + "loss": 0.434, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.0406758785247803, + "rewards/margins": 1.3931986093521118, + "rewards/rejected": -4.433874130249023, + "sft_loss": 3.1046528816223145, + "step": 5510 + }, + { + "epoch": 2.951664157885934, + "grad_norm": 16.244572749023327, + "learning_rate": 7.682995466686826e-10, + "logits/chosen": -0.41688647866249084, + "logits/rejected": -0.2552393078804016, + "logps/chosen": -3.0431668758392334, + "logps/rejected": -4.226834297180176, + "loss": 0.4655, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0431668758392334, + "rewards/margins": 1.1836671829223633, + "rewards/rejected": -4.226834297180176, + "sft_loss": 3.256648302078247, + "step": 5515 + }, + { + "epoch": 2.9543401906673354, + "grad_norm": 19.60797999208122, + "learning_rate": 6.844177833543741e-10, + "logits/chosen": -0.30990904569625854, + "logits/rejected": -0.222394198179245, + "logps/chosen": -3.017165422439575, + "logps/rejected": -4.103949069976807, + "loss": 0.4548, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.017165422439575, + "rewards/margins": 1.0867832899093628, + "rewards/rejected": -4.103949069976807, + "sft_loss": 3.1367456912994385, + "step": 5520 + }, + { + "epoch": 2.957016223448737, + "grad_norm": 19.10578857508277, + "learning_rate": 6.053803820087467e-10, + "logits/chosen": -0.3262700140476227, + "logits/rejected": -0.16396215558052063, + "logps/chosen": -3.2386765480041504, + "logps/rejected": -4.5179853439331055, + "loss": 0.4615, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.2386765480041504, + "rewards/margins": 1.2793089151382446, + "rewards/rejected": -4.5179853439331055, + "sft_loss": 3.428264617919922, + "step": 5525 + }, + { + "epoch": 2.959692256230139, + "grad_norm": 14.770948828555499, + "learning_rate": 5.311881094528514e-10, + "logits/chosen": -0.4027184545993805, + "logits/rejected": -0.1612546145915985, + "logps/chosen": -3.1544299125671387, + "logps/rejected": -4.1260833740234375, + "loss": 0.4836, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1544299125671387, + "rewards/margins": 0.9716532826423645, + "rewards/rejected": -4.1260833740234375, + "sft_loss": 3.242033004760742, + "step": 5530 + }, + { + "epoch": 2.9623682890115406, + "grad_norm": 23.03402766448594, + "learning_rate": 4.6184168550050806e-10, + "logits/chosen": -0.32584530115127563, + "logits/rejected": -0.25883787870407104, + "logps/chosen": -3.097665309906006, + "logps/rejected": -4.14642858505249, + "loss": 0.5031, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.097665309906006, + "rewards/margins": 1.0487632751464844, + "rewards/rejected": -4.14642858505249, + "sft_loss": 3.284144878387451, + "step": 5535 + }, + { + "epoch": 2.965044321792942, + "grad_norm": 19.128617418074587, + "learning_rate": 3.973417829510328e-10, + "logits/chosen": -0.40291205048561096, + "logits/rejected": -0.25008195638656616, + "logps/chosen": -3.05241060256958, + "logps/rejected": -4.164105415344238, + "loss": 0.4697, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.05241060256958, + "rewards/margins": 1.1116950511932373, + "rewards/rejected": -4.164105415344238, + "sft_loss": 3.1327614784240723, + "step": 5540 + }, + { + "epoch": 2.9677203545743436, + "grad_norm": 25.594131415702417, + "learning_rate": 3.3768902758274377e-10, + "logits/chosen": -0.3280094861984253, + "logits/rejected": -0.20888443291187286, + "logps/chosen": -2.939770221710205, + "logps/rejected": -3.989638566970825, + "loss": 0.4704, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.939770221710205, + "rewards/margins": 1.0498679876327515, + "rewards/rejected": -3.989638566970825, + "sft_loss": 3.02432918548584, + "step": 5545 + }, + { + "epoch": 2.970396387355745, + "grad_norm": 15.41544308032071, + "learning_rate": 2.8288399814691e-10, + "logits/chosen": -0.24691219627857208, + "logits/rejected": -0.17742550373077393, + "logps/chosen": -3.015709400177002, + "logps/rejected": -4.0467987060546875, + "loss": 0.4573, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.015709400177002, + "rewards/margins": 1.031089425086975, + "rewards/rejected": -4.0467987060546875, + "sft_loss": 3.1204111576080322, + "step": 5550 + }, + { + "epoch": 2.9730724201371466, + "grad_norm": 22.34414377412042, + "learning_rate": 2.3292722636220066e-10, + "logits/chosen": -0.3485460579395294, + "logits/rejected": -0.14062300324440002, + "logps/chosen": -3.0802266597747803, + "logps/rejected": -4.3738532066345215, + "loss": 0.4465, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0802266597747803, + "rewards/margins": 1.2936267852783203, + "rewards/rejected": -4.3738532066345215, + "sft_loss": 3.1362340450286865, + "step": 5555 + }, + { + "epoch": 2.9757484529185483, + "grad_norm": 20.00966499088373, + "learning_rate": 1.8781919690946668e-10, + "logits/chosen": -0.26539528369903564, + "logits/rejected": -0.22001326084136963, + "logps/chosen": -3.059741497039795, + "logps/rejected": -4.021082878112793, + "loss": 0.5115, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.059741497039795, + "rewards/margins": 0.9613416790962219, + "rewards/rejected": -4.021082878112793, + "sft_loss": 3.258448362350464, + "step": 5560 + }, + { + "epoch": 2.97842448569995, + "grad_norm": 17.624578233565497, + "learning_rate": 1.4756034742696711e-10, + "logits/chosen": -0.35784557461738586, + "logits/rejected": -0.22992336750030518, + "logps/chosen": -3.0968403816223145, + "logps/rejected": -4.2120184898376465, + "loss": 0.4963, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0968403816223145, + "rewards/margins": 1.115178108215332, + "rewards/rejected": -4.2120184898376465, + "sft_loss": 3.2440342903137207, + "step": 5565 + }, + { + "epoch": 2.9811005184813513, + "grad_norm": 16.380030236450217, + "learning_rate": 1.12151068506261e-10, + "logits/chosen": -0.3142494559288025, + "logits/rejected": -0.15964466333389282, + "logps/chosen": -2.9746203422546387, + "logps/rejected": -4.431513786315918, + "loss": 0.414, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.9746203422546387, + "rewards/margins": 1.4568936824798584, + "rewards/rejected": -4.431513786315918, + "sft_loss": 3.1340255737304688, + "step": 5570 + }, + { + "epoch": 2.983776551262753, + "grad_norm": 19.14052944165982, + "learning_rate": 8.159170368826629e-11, + "logits/chosen": -0.32244330644607544, + "logits/rejected": -0.13985905051231384, + "logps/chosen": -2.8702309131622314, + "logps/rejected": -4.0701775550842285, + "loss": 0.4721, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8702309131622314, + "rewards/margins": 1.199946641921997, + "rewards/rejected": -4.0701775550842285, + "sft_loss": 3.0334982872009277, + "step": 5575 + }, + { + "epoch": 2.9864525840441547, + "grad_norm": 18.04040313922582, + "learning_rate": 5.588254946015114e-11, + "logits/chosen": -0.4082750380039215, + "logits/rejected": -0.11360353231430054, + "logps/chosen": -3.010322093963623, + "logps/rejected": -4.2205352783203125, + "loss": 0.4456, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.010322093963623, + "rewards/margins": 1.2102131843566895, + "rewards/rejected": -4.2205352783203125, + "sft_loss": 3.1632823944091797, + "step": 5580 + }, + { + "epoch": 2.989128616825556, + "grad_norm": 19.264446580620298, + "learning_rate": 3.502385525216978e-11, + "logits/chosen": -0.37494826316833496, + "logits/rejected": -0.17825865745544434, + "logps/chosen": -3.0055809020996094, + "logps/rejected": -4.270408630371094, + "loss": 0.4199, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0055809020996094, + "rewards/margins": 1.2648277282714844, + "rewards/rejected": -4.270408630371094, + "sft_loss": 3.263542890548706, + "step": 5585 + }, + { + "epoch": 2.9918046496069577, + "grad_norm": 17.075054798421235, + "learning_rate": 1.901582343555308e-11, + "logits/chosen": -0.2850029468536377, + "logits/rejected": -0.2014038860797882, + "logps/chosen": -3.143592357635498, + "logps/rejected": -4.232028961181641, + "loss": 0.4806, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.143592357635498, + "rewards/margins": 1.088436484336853, + "rewards/rejected": -4.232028961181641, + "sft_loss": 3.220813751220703, + "step": 5590 + }, + { + "epoch": 2.9944806823883594, + "grad_norm": 28.69235281888499, + "learning_rate": 7.858609320232634e-12, + "logits/chosen": -0.3260534405708313, + "logits/rejected": -0.1166486144065857, + "logps/chosen": -3.000702142715454, + "logps/rejected": -4.186171531677246, + "loss": 0.4439, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.000702142715454, + "rewards/margins": 1.185469388961792, + "rewards/rejected": -4.186171531677246, + "sft_loss": 3.1785988807678223, + "step": 5595 + }, + { + "epoch": 2.9971567151697607, + "grad_norm": 17.986136529078, + "learning_rate": 1.5523211535639624e-12, + "logits/chosen": -0.331061452627182, + "logits/rejected": -0.1841285526752472, + "logps/chosen": -3.105658769607544, + "logps/rejected": -4.51476526260376, + "loss": 0.4704, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.105658769607544, + "rewards/margins": 1.4091064929962158, + "rewards/rejected": -4.51476526260376, + "sft_loss": 3.228262424468994, + "step": 5600 + }, + { + "epoch": 2.9971567151697607, + "eval_logits/chosen": 0.05207514017820358, + "eval_logits/rejected": 0.16449151933193207, + "eval_logps/chosen": -3.185509204864502, + "eval_logps/rejected": -4.173870086669922, + "eval_loss": 0.558981716632843, + "eval_rewards/accuracies": 0.7225519418716431, + "eval_rewards/chosen": -3.185509204864502, + "eval_rewards/margins": 0.988361120223999, + "eval_rewards/rejected": -4.173870086669922, + "eval_runtime": 51.1788, + "eval_samples_per_second": 26.28, + "eval_sft_loss": 3.316328763961792, + "eval_steps_per_second": 6.585, + "step": 5600 + }, + { + "epoch": 2.999297541394882, + "step": 5604, + "total_flos": 0.0, + "train_loss": 0.5562219639746825, + "train_runtime": 39292.0664, + "train_samples_per_second": 4.565, + "train_steps_per_second": 0.143 + } + ], + "logging_steps": 5, + "max_steps": 5604, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}