diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -27,18 +27,18 @@ }, { "epoch": 0.01, - "grad_norm": 0.6015625, + "grad_norm": 0.60546875, "learning_rate": 5.208333333333334e-07, - "logits/chosen": -2.2111542224884033, - "logits/rejected": -2.2718067169189453, - "logps/chosen": -57.56840133666992, - "logps/rejected": -65.20916748046875, + "logits/chosen": -2.2113068103790283, + "logits/rejected": -2.2719719409942627, + "logps/chosen": -57.57659149169922, + "logps/rejected": -65.19544219970703, "loss": 0.693, "pred_label": 0.0, - "rewards/accuracies": 0.2569444477558136, - "rewards/chosen": 0.0011389791034162045, - "rewards/margins": 0.0002508986508473754, - "rewards/rejected": 0.0008880805689841509, + "rewards/accuracies": 0.2152777761220932, + "rewards/chosen": 0.001057142741046846, + "rewards/margins": 3.17241829179693e-05, + "rewards/rejected": 0.001025418401695788, "step": 10, "use_label": 90.0 }, @@ -46,16 +46,16 @@ "epoch": 0.02, "grad_norm": 0.6796875, "learning_rate": 1.0416666666666667e-06, - "logits/chosen": -2.242893695831299, - "logits/rejected": -2.279961109161377, - "logps/chosen": -56.537681579589844, - "logps/rejected": -68.3794174194336, + "logits/chosen": -2.243159770965576, + "logits/rejected": -2.2802278995513916, + "logps/chosen": -56.544715881347656, + "logps/rejected": -68.35901641845703, "loss": 0.6924, "pred_label": 0.0, "rewards/accuracies": 0.22499999403953552, - "rewards/chosen": 0.006626849062740803, - "rewards/margins": 0.001654049614444375, - "rewards/rejected": 0.004972799215465784, + "rewards/chosen": 0.006556531880050898, + "rewards/margins": 0.001379690133035183, + "rewards/rejected": 0.005176841747015715, "step": 20, "use_label": 242.0 }, @@ -63,16 +63,16 @@ "epoch": 0.03, "grad_norm": 0.55078125, "learning_rate": 1.5625e-06, - "logits/chosen": -2.2637219429016113, - "logits/rejected": -2.2480521202087402, - "logps/chosen": -53.993507385253906, - "logps/rejected": -67.89700317382812, - "loss": 0.6919, + "logits/chosen": -2.2634024620056152, + "logits/rejected": -2.2475943565368652, + "logps/chosen": -53.98667526245117, + "logps/rejected": -67.89213562011719, + "loss": 0.692, "pred_label": 0.0, - "rewards/accuracies": 0.26875001192092896, - "rewards/chosen": 0.016421381384134293, - "rewards/margins": 0.002580237342044711, - "rewards/rejected": 0.013841142877936363, + "rewards/accuracies": 0.2750000059604645, + "rewards/chosen": 0.01648966409265995, + "rewards/margins": 0.002599921775981784, + "rewards/rejected": 0.013889740221202374, "step": 30, "use_label": 402.0 }, @@ -80,16 +80,16 @@ "epoch": 0.04, "grad_norm": 0.6328125, "learning_rate": 2.0833333333333334e-06, - "logits/chosen": -2.2831993103027344, - "logits/rejected": -2.2760486602783203, - "logps/chosen": -55.59602737426758, - "logps/rejected": -66.58573913574219, + "logits/chosen": -2.2825467586517334, + "logits/rejected": -2.2754693031311035, + "logps/chosen": -55.582061767578125, + "logps/rejected": -66.59407043457031, "loss": 0.6909, "pred_label": 0.0, - "rewards/accuracies": 0.20624999701976776, - "rewards/chosen": 0.018266689032316208, - "rewards/margins": 0.0004533957107923925, - "rewards/rejected": 0.017813291400671005, + "rewards/accuracies": 0.21250000596046448, + "rewards/chosen": 0.018406417220830917, + "rewards/margins": 0.0006764450808987021, + "rewards/rejected": 0.017729971557855606, "step": 40, "use_label": 562.0 }, @@ -97,16 +97,16 @@ "epoch": 0.05, "grad_norm": 0.6015625, "learning_rate": 2.604166666666667e-06, - "logits/chosen": -2.344376564025879, - "logits/rejected": -2.3342297077178955, - "logps/chosen": -69.12073516845703, - "logps/rejected": -84.67558288574219, + "logits/chosen": -2.3444912433624268, + "logits/rejected": -2.3341281414031982, + "logps/chosen": -69.13630676269531, + "logps/rejected": -84.64376831054688, "loss": 0.6889, "pred_label": 0.0, - "rewards/accuracies": 0.30000001192092896, - "rewards/chosen": 0.02673395536839962, - "rewards/margins": 0.00583356199786067, - "rewards/rejected": 0.020900394767522812, + "rewards/accuracies": 0.2874999940395355, + "rewards/chosen": 0.02657836303114891, + "rewards/margins": 0.005359734408557415, + "rewards/rejected": 0.021218623965978622, "step": 50, "use_label": 722.0 }, @@ -114,1701 +114,1701 @@ "epoch": 0.06, "grad_norm": 0.72265625, "learning_rate": 3.125e-06, - "logits/chosen": -2.3030121326446533, - "logits/rejected": -2.3094825744628906, - "logps/chosen": -82.04167175292969, - "logps/rejected": -90.7291488647461, - "loss": 0.6876, + "logits/chosen": -2.3026936054229736, + "logits/rejected": -2.309264659881592, + "logps/chosen": -82.00704193115234, + "logps/rejected": -90.7305908203125, + "loss": 0.6874, "pred_label": 0.0, - "rewards/accuracies": 0.3499999940395355, - "rewards/chosen": 0.036534082144498825, - "rewards/margins": 0.013860121369361877, - "rewards/rejected": 0.022673960775136948, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.03688042238354683, + "rewards/margins": 0.014220851473510265, + "rewards/rejected": 0.02265957184135914, "step": 60, "use_label": 882.0 }, { "epoch": 0.07, - "grad_norm": 0.7890625, + "grad_norm": 0.79296875, "learning_rate": 3.6458333333333333e-06, - "logits/chosen": -2.345569610595703, - "logits/rejected": -2.3263676166534424, - "logps/chosen": -77.1853256225586, - "logps/rejected": -77.63880920410156, - "loss": 0.685, + "logits/chosen": -2.344853401184082, + "logits/rejected": -2.3261306285858154, + "logps/chosen": -77.20336151123047, + "logps/rejected": -77.6347885131836, + "loss": 0.6851, "pred_label": 0.0, - "rewards/accuracies": 0.3062500059604645, - "rewards/chosen": 0.025494003668427467, - "rewards/margins": 0.016305232420563698, - "rewards/rejected": 0.009188770316541195, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": 0.02531364932656288, + "rewards/margins": 0.01608472317457199, + "rewards/rejected": 0.009228924289345741, "step": 70, "use_label": 1042.0 }, { "epoch": 0.08, - "grad_norm": 0.81640625, + "grad_norm": 0.80078125, "learning_rate": 4.166666666666667e-06, - "logits/chosen": -2.241882801055908, - "logits/rejected": -2.195146322250366, - "logps/chosen": -81.66094207763672, - "logps/rejected": -89.08940124511719, - "loss": 0.6805, - "pred_label": 0.0, + "logits/chosen": -2.241945743560791, + "logits/rejected": -2.195178985595703, + "logps/chosen": -81.6376953125, + "logps/rejected": -89.05104064941406, + "loss": 0.6814, + "pred_label": 0.9750000238418579, "rewards/accuracies": 0.32499998807907104, - "rewards/chosen": 0.003909807652235031, - "rewards/margins": 0.025169039145112038, - "rewards/rejected": -0.021259231492877007, + "rewards/chosen": 0.004142354242503643, + "rewards/margins": 0.025017932057380676, + "rewards/rejected": -0.02087557688355446, "step": 80, - "use_label": 1202.0 + "use_label": 1201.0250244140625 }, { "epoch": 0.09, - "grad_norm": 1.7734375, + "grad_norm": 1.578125, "learning_rate": 4.6875000000000004e-06, - "logits/chosen": -2.1871695518493652, - "logits/rejected": -2.2313501834869385, - "logps/chosen": -62.76741409301758, - "logps/rejected": -81.16191101074219, - "loss": 0.6747, - "pred_label": 0.0, + "logits/chosen": -2.1907405853271484, + "logits/rejected": -2.232959270477295, + "logps/chosen": -62.31688690185547, + "logps/rejected": -80.38573455810547, + "loss": 0.6812, + "pred_label": 3.0999999046325684, "rewards/accuracies": 0.33125001192092896, - "rewards/chosen": -0.016776535660028458, - "rewards/margins": 0.048332639038562775, - "rewards/rejected": -0.06510917842388153, + "rewards/chosen": -0.012271342799067497, + "rewards/margins": 0.04507603123784065, + "rewards/rejected": -0.0573473684489727, "step": 90, - "use_label": 1362.0 + "use_label": 1358.9000244140625 }, { "epoch": 0.1, - "grad_norm": 1.4375, + "grad_norm": 0.796875, "learning_rate": 4.9997324926814375e-06, - "logits/chosen": -2.1414177417755127, - "logits/rejected": -2.107236623764038, - "logps/chosen": -78.60578155517578, - "logps/rejected": -81.1384506225586, - "loss": 0.6685, - "pred_label": 0.0, - "rewards/accuracies": 0.32499998807907104, - "rewards/chosen": -0.04031088575720787, - "rewards/margins": 0.052690792828798294, - "rewards/rejected": -0.09300167858600616, + "logits/chosen": -2.132638454437256, + "logits/rejected": -2.0995519161224365, + "logps/chosen": -76.97563171386719, + "logps/rejected": -79.27615356445312, + "loss": 0.6818, + "pred_label": 7.150000095367432, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.02400936186313629, + "rewards/margins": 0.05036945268511772, + "rewards/rejected": -0.07437881827354431, "step": 100, - "use_label": 1522.0 + "use_label": 1514.8499755859375 }, { "epoch": 0.1, - "eval_logits/chosen": -2.109715223312378, - "eval_logits/rejected": -2.0796475410461426, - "eval_logps/chosen": -71.95718383789062, - "eval_logps/rejected": -84.7625961303711, - "eval_loss": 0.6684110760688782, - "eval_pred_label": 0.0, - "eval_rewards/accuracies": 0.335317462682724, - "eval_rewards/chosen": -0.030566338449716568, - "eval_rewards/margins": 0.06307896971702576, - "eval_rewards/rejected": -0.09364530444145203, - "eval_runtime": 247.4954, - "eval_samples_per_second": 8.081, - "eval_steps_per_second": 0.255, - "eval_use_label": 1856.0, + "eval_logits/chosen": -2.097480297088623, + "eval_logits/rejected": -2.0663790702819824, + "eval_logps/chosen": -69.46318054199219, + "eval_logps/rejected": -80.35824584960938, + "eval_loss": 0.6813791394233704, + "eval_pred_label": 22.539682388305664, + "eval_rewards/accuracies": 0.3392857015132904, + "eval_rewards/chosen": -0.005626226309686899, + "eval_rewards/margins": 0.04397555813193321, + "eval_rewards/rejected": -0.04960178583860397, + "eval_runtime": 245.3242, + "eval_samples_per_second": 8.152, + "eval_steps_per_second": 0.257, + "eval_use_label": 1833.4603271484375, "step": 100 }, { "epoch": 0.12, - "grad_norm": 1.71875, + "grad_norm": 1.1171875, "learning_rate": 4.996723692767927e-06, - "logits/chosen": -2.12998104095459, - "logits/rejected": -2.1109042167663574, - "logps/chosen": -68.2921142578125, - "logps/rejected": -84.99057006835938, - "loss": 0.6713, - "pred_label": 0.07500000298023224, - "rewards/accuracies": 0.3062500059604645, - "rewards/chosen": -0.06523006409406662, - "rewards/margins": 0.0570509135723114, - "rewards/rejected": -0.12228099256753922, + "logits/chosen": -2.114673137664795, + "logits/rejected": -2.094468355178833, + "logps/chosen": -63.9236946105957, + "logps/rejected": -79.44518280029297, + "loss": 0.6827, + "pred_label": 34.0, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.02154584601521492, + "rewards/margins": 0.04528125748038292, + "rewards/rejected": -0.06682710349559784, "step": 110, - "use_label": 2185.925048828125 + "use_label": 2152.0 }, { "epoch": 0.13, - "grad_norm": 1.046875, + "grad_norm": 1.0390625, "learning_rate": 4.9903757462135984e-06, - "logits/chosen": -2.3605504035949707, - "logits/rejected": -2.243201971054077, - "logps/chosen": -80.2857666015625, - "logps/rejected": -96.13867950439453, - "loss": 0.6667, - "pred_label": 1.875, - "rewards/accuracies": 0.3499999940395355, - "rewards/chosen": -0.062185365706682205, - "rewards/margins": 0.0815814733505249, - "rewards/rejected": -0.143766850233078, + "logits/chosen": -2.2926628589630127, + "logits/rejected": -2.177788257598877, + "logps/chosen": -83.48246002197266, + "logps/rejected": -97.60291290283203, + "loss": 0.683, + "pred_label": 44.67499923706055, + "rewards/accuracies": 0.3062500059604645, + "rewards/chosen": -0.0941522866487503, + "rewards/margins": 0.06425690650939941, + "rewards/rejected": -0.15840919315814972, "step": 120, - "use_label": 2344.125 + "use_label": 2301.324951171875 }, { "epoch": 0.14, - "grad_norm": 1.0078125, + "grad_norm": 0.546875, "learning_rate": 4.980697142834315e-06, - "logits/chosen": -2.146286725997925, - "logits/rejected": -2.1618175506591797, - "logps/chosen": -67.6681137084961, - "logps/rejected": -78.9002456665039, - "loss": 0.6675, - "pred_label": 2.075000047683716, + "logits/chosen": -2.0968613624572754, + "logits/rejected": -2.1124091148376465, + "logps/chosen": -66.370849609375, + "logps/rejected": -77.3319320678711, + "loss": 0.6845, + "pred_label": 57.57500076293945, "rewards/accuracies": 0.2750000059604645, - "rewards/chosen": -0.09194014966487885, - "rewards/margins": 0.04880703240633011, - "rewards/rejected": -0.14074717462062836, + "rewards/chosen": -0.07896758615970612, + "rewards/margins": 0.04609644412994385, + "rewards/rejected": -0.12506404519081116, "step": 130, - "use_label": 2503.925048828125 + "use_label": 2448.425048828125 }, { "epoch": 0.15, - "grad_norm": 2.28125, + "grad_norm": 0.78515625, "learning_rate": 4.967700826904229e-06, - "logits/chosen": -2.1254963874816895, - "logits/rejected": -2.160235643386841, - "logps/chosen": -74.917724609375, - "logps/rejected": -99.1263427734375, - "loss": 0.6684, - "pred_label": 5.050000190734863, - "rewards/accuracies": 0.28125, - "rewards/chosen": -0.15644724667072296, - "rewards/margins": 0.08642110973596573, - "rewards/rejected": -0.2428683489561081, + "logits/chosen": -2.1041221618652344, + "logits/rejected": -2.138929843902588, + "logps/chosen": -68.11909484863281, + "logps/rejected": -90.16340637207031, + "loss": 0.6868, + "pred_label": 73.75, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.08846104890108109, + "rewards/margins": 0.0647779330611229, + "rewards/rejected": -0.15323898196220398, "step": 140, - "use_label": 2660.949951171875 + "use_label": 2592.25 }, { "epoch": 0.16, - "grad_norm": 1.359375, + "grad_norm": 1.1015625, "learning_rate": 4.951404179843963e-06, - "logits/chosen": -2.1338083744049072, - "logits/rejected": -2.0785932540893555, - "logps/chosen": -57.580589294433594, - "logps/rejected": -64.5077133178711, - "loss": 0.6658, - "pred_label": 9.100000381469727, - "rewards/accuracies": 0.26875001192092896, - "rewards/chosen": -0.10085760056972504, - "rewards/margins": 0.09126537293195724, - "rewards/rejected": -0.19212298095226288, + "logits/chosen": -2.1765952110290527, + "logits/rejected": -2.125175714492798, + "logps/chosen": -54.37804412841797, + "logps/rejected": -58.982269287109375, + "loss": 0.6809, + "pred_label": 91.3499984741211, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.06883221119642258, + "rewards/margins": 0.06803621351718903, + "rewards/rejected": -0.136868417263031, "step": 150, - "use_label": 2816.89990234375 + "use_label": 2734.64990234375 }, { "epoch": 0.17, - "grad_norm": 2.15625, + "grad_norm": 1.03125, "learning_rate": 4.931828996974498e-06, - "logits/chosen": -2.1478817462921143, - "logits/rejected": -2.1238207817077637, - "logps/chosen": -102.6265640258789, - "logps/rejected": -118.9216537475586, - "loss": 0.667, - "pred_label": 15.399999618530273, - "rewards/accuracies": 0.3499999940395355, - "rewards/chosen": -0.21023361384868622, - "rewards/margins": 0.16089467704296112, - "rewards/rejected": -0.37112829089164734, + "logits/chosen": -2.2455694675445557, + "logits/rejected": -2.213240623474121, + "logps/chosen": -94.4081802368164, + "logps/rejected": -107.48802185058594, + "loss": 0.6857, + "pred_label": 115.55000305175781, + "rewards/accuracies": 0.35624998807907104, + "rewards/chosen": -0.12804970145225525, + "rewards/margins": 0.12874242663383484, + "rewards/rejected": -0.2567921280860901, "step": 160, - "use_label": 2970.60009765625 + "use_label": 2870.449951171875 }, { "epoch": 0.18, - "grad_norm": 2.140625, + "grad_norm": 1.1875, "learning_rate": 4.909001458367867e-06, - "logits/chosen": -1.9664795398712158, - "logits/rejected": -1.9388923645019531, - "logps/chosen": -81.67234802246094, - "logps/rejected": -97.5047836303711, - "loss": 0.6635, - "pred_label": 23.475000381469727, - "rewards/accuracies": 0.3375000059604645, - "rewards/chosen": -0.17715924978256226, - "rewards/margins": 0.13409331440925598, - "rewards/rejected": -0.31125253438949585, + "logits/chosen": -2.1201233863830566, + "logits/rejected": -2.0822367668151855, + "logps/chosen": -75.75311279296875, + "logps/rejected": -87.55944061279297, + "loss": 0.6869, + "pred_label": 141.85000610351562, + "rewards/accuracies": 0.33125001192092896, + "rewards/chosen": -0.1179669052362442, + "rewards/margins": 0.09383226186037064, + "rewards/rejected": -0.21179917454719543, "step": 170, - "use_label": 3122.52490234375 + "use_label": 3004.14990234375 }, { "epoch": 0.19, - "grad_norm": 2.96875, + "grad_norm": 1.4296875, "learning_rate": 4.882952093833628e-06, - "logits/chosen": -1.896836519241333, - "logits/rejected": -1.9397751092910767, - "logps/chosen": -82.50892639160156, - "logps/rejected": -105.1452407836914, - "loss": 0.6701, - "pred_label": 29.649999618530273, + "logits/chosen": -2.1013779640197754, + "logits/rejected": -2.121537685394287, + "logps/chosen": -70.6474838256836, + "logps/rejected": -89.79743957519531, + "loss": 0.685, + "pred_label": 161.3249969482422, "rewards/accuracies": 0.33125001192092896, - "rewards/chosen": -0.20006974041461945, - "rewards/margins": 0.11658792197704315, - "rewards/rejected": -0.3166576623916626, + "rewards/chosen": -0.08145526796579361, + "rewards/margins": 0.08172430098056793, + "rewards/rejected": -0.16317956149578094, "step": 180, - "use_label": 3276.35009765625 + "use_label": 3144.675048828125 }, { "epoch": 0.2, - "grad_norm": 1.5859375, + "grad_norm": 0.8515625, "learning_rate": 4.853715742087947e-06, - "logits/chosen": -1.8957335948944092, - "logits/rejected": -1.8187646865844727, - "logps/chosen": -101.19456481933594, - "logps/rejected": -109.06144714355469, - "loss": 0.6648, - "pred_label": 35.17499923706055, - "rewards/accuracies": 0.3375000059604645, - "rewards/chosen": -0.27311572432518005, - "rewards/margins": 0.1226036325097084, - "rewards/rejected": -0.39571934938430786, + "logits/chosen": -2.1533255577087402, + "logits/rejected": -2.104222297668457, + "logps/chosen": -87.3572998046875, + "logps/rejected": -91.95249938964844, + "loss": 0.6862, + "pred_label": 181.39999389648438, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.13474301993846893, + "rewards/margins": 0.08988693356513977, + "rewards/rejected": -0.2246299535036087, "step": 190, - "use_label": 3430.824951171875 + "use_label": 3284.60009765625 }, { "epoch": 0.21, - "grad_norm": 2.125, + "grad_norm": 0.96875, "learning_rate": 4.821331504159906e-06, - "logits/chosen": -1.8366466760635376, - "logits/rejected": -1.8133814334869385, - "logps/chosen": -118.866943359375, - "logps/rejected": -123.262451171875, - "loss": 0.676, - "pred_label": 43.42499923706055, - "rewards/accuracies": 0.35624998807907104, - "rewards/chosen": -0.38812780380249023, - "rewards/margins": 0.1128091812133789, - "rewards/rejected": -0.5009369850158691, + "logits/chosen": -2.137516736984253, + "logits/rejected": -2.13090443611145, + "logps/chosen": -94.10081481933594, + "logps/rejected": -95.15316009521484, + "loss": 0.6818, + "pred_label": 205.875, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -0.14046669006347656, + "rewards/margins": 0.07937734574079514, + "rewards/rejected": -0.2198440283536911, "step": 200, - "use_label": 3582.574951171875 + "use_label": 3420.125 }, { "epoch": 0.21, - "eval_logits/chosen": -1.7319464683532715, - "eval_logits/rejected": -1.6888620853424072, - "eval_logps/chosen": -106.19064331054688, - "eval_logps/rejected": -124.95625305175781, - "eval_loss": 0.6716896295547485, - "eval_pred_label": 61.55555725097656, - "eval_rewards/accuracies": 0.3214285671710968, - "eval_rewards/chosen": -0.372901052236557, - "eval_rewards/margins": 0.12268086522817612, - "eval_rewards/rejected": -0.49558189511299133, - "eval_runtime": 248.0123, - "eval_samples_per_second": 8.064, - "eval_steps_per_second": 0.254, - "eval_use_label": 3898.4443359375, + "eval_logits/chosen": -2.021465301513672, + "eval_logits/rejected": -1.9937611818313599, + "eval_logps/chosen": -82.4782485961914, + "eval_logps/rejected": -99.20675659179688, + "eval_loss": 0.6860649585723877, + "eval_pred_label": 258.79364013671875, + "eval_rewards/accuracies": 0.3373015820980072, + "eval_rewards/chosen": -0.13577698171138763, + "eval_rewards/margins": 0.10230996459722519, + "eval_rewards/rejected": -0.23808695375919342, + "eval_runtime": 245.9338, + "eval_samples_per_second": 8.132, + "eval_steps_per_second": 0.256, + "eval_use_label": 3701.206298828125, "step": 200 }, { "epoch": 0.22, - "grad_norm": 2.515625, + "grad_norm": 1.1484375, "learning_rate": 4.7858426910973435e-06, - "logits/chosen": -1.9356311559677124, - "logits/rejected": -1.9080215692520142, - "logps/chosen": -93.94760131835938, - "logps/rejected": -106.8377456665039, - "loss": 0.6743, - "pred_label": 81.25, - "rewards/accuracies": 0.32499998807907104, - "rewards/chosen": -0.259369432926178, - "rewards/margins": 0.13343419134616852, - "rewards/rejected": -0.3928036093711853, + "logits/chosen": -2.1574149131774902, + "logits/rejected": -2.1307334899902344, + "logps/chosen": -77.64894104003906, + "logps/rejected": -89.26710510253906, + "loss": 0.6828, + "pred_label": 313.32501220703125, + "rewards/accuracies": 0.3687500059604645, + "rewards/chosen": -0.09638272225856781, + "rewards/margins": 0.12071452289819717, + "rewards/rejected": -0.2170972377061844, "step": 210, - "use_label": 4208.75 + "use_label": 3976.675048828125 }, { "epoch": 0.23, - "grad_norm": 2.34375, + "grad_norm": 1.40625, "learning_rate": 4.747296766042161e-06, - "logits/chosen": -1.8437402248382568, - "logits/rejected": -1.810903549194336, - "logps/chosen": -100.77757263183594, - "logps/rejected": -112.38002014160156, - "loss": 0.6651, - "pred_label": 92.5999984741211, + "logits/chosen": -2.1187565326690674, + "logits/rejected": -2.102626323699951, + "logps/chosen": -90.67762756347656, + "logps/rejected": -96.60699462890625, + "loss": 0.6884, + "pred_label": 343.875, "rewards/accuracies": 0.35624998807907104, - "rewards/chosen": -0.24724093079566956, - "rewards/margins": 0.1804189234972, - "rewards/rejected": -0.427659809589386, + "rewards/chosen": -0.1462414264678955, + "rewards/margins": 0.12368818372488022, + "rewards/rejected": -0.2699296176433563, "step": 220, - "use_label": 4357.39990234375 + "use_label": 4106.125 }, { "epoch": 0.24, - "grad_norm": 1.828125, + "grad_norm": 1.1484375, "learning_rate": 4.705745280752586e-06, - "logits/chosen": -1.5612363815307617, - "logits/rejected": -1.494425654411316, - "logps/chosen": -108.3369369506836, - "logps/rejected": -121.25785064697266, - "loss": 0.6784, - "pred_label": 107.0, - "rewards/accuracies": 0.36250001192092896, - "rewards/chosen": -0.29597795009613037, - "rewards/margins": 0.18698883056640625, - "rewards/rejected": -0.482966810464859, + "logits/chosen": -2.1437509059906006, + "logits/rejected": -2.084073781967163, + "logps/chosen": -90.86326599121094, + "logps/rejected": -96.72235870361328, + "loss": 0.6875, + "pred_label": 378.6000061035156, + "rewards/accuracies": 0.35624998807907104, + "rewards/chosen": -0.12124122679233551, + "rewards/margins": 0.11637073755264282, + "rewards/rejected": -0.23761197924613953, "step": 230, - "use_label": 4503.0 + "use_label": 4231.39990234375 }, { "epoch": 0.25, - "grad_norm": 3.109375, + "grad_norm": 0.953125, "learning_rate": 4.661243806657256e-06, - "logits/chosen": -1.279926061630249, - "logits/rejected": -1.207486629486084, - "logps/chosen": -94.6622314453125, - "logps/rejected": -121.74755859375, - "loss": 0.68, - "pred_label": 124.05000305175781, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.30948689579963684, - "rewards/margins": 0.16862434148788452, - "rewards/rejected": -0.4781111776828766, + "logits/chosen": -2.1431565284729004, + "logits/rejected": -2.1365227699279785, + "logps/chosen": -71.16796875, + "logps/rejected": -91.01861572265625, + "loss": 0.6846, + "pred_label": 403.125, + "rewards/accuracies": 0.32499998807907104, + "rewards/chosen": -0.07454425096511841, + "rewards/margins": 0.09627760201692581, + "rewards/rejected": -0.17082183063030243, "step": 240, - "use_label": 4645.9501953125 + "use_label": 4366.875 }, { "epoch": 0.26, - "grad_norm": 1.8203125, + "grad_norm": 0.890625, "learning_rate": 4.613851860533367e-06, - "logits/chosen": -1.4483808279037476, - "logits/rejected": -1.535796880722046, - "logps/chosen": -88.96175384521484, - "logps/rejected": -106.26942443847656, - "loss": 0.678, - "pred_label": 140.64999389648438, - "rewards/accuracies": 0.3375000059604645, - "rewards/chosen": -0.2383408546447754, - "rewards/margins": 0.1766553670167923, - "rewards/rejected": -0.4149962067604065, + "logits/chosen": -2.1595332622528076, + "logits/rejected": -2.183953285217285, + "logps/chosen": -71.86934661865234, + "logps/rejected": -80.0597152709961, + "loss": 0.6844, + "pred_label": 422.25, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.06741674989461899, + "rewards/margins": 0.08548234403133392, + "rewards/rejected": -0.1528991013765335, "step": 250, - "use_label": 4789.35009765625 + "use_label": 4507.75 }, { "epoch": 0.27, - "grad_norm": 2.359375, + "grad_norm": 1.0390625, "learning_rate": 4.563632824908252e-06, - "logits/chosen": -1.5566436052322388, - "logits/rejected": -1.472214937210083, - "logps/chosen": -101.36164855957031, - "logps/rejected": -132.7355194091797, - "loss": 0.6768, - "pred_label": 156.85000610351562, - "rewards/accuracies": 0.32499998807907104, - "rewards/chosen": -0.404205858707428, - "rewards/margins": 0.1837155818939209, - "rewards/rejected": -0.5879215002059937, + "logits/chosen": -2.1189560890197754, + "logits/rejected": -2.071620464324951, + "logps/chosen": -77.1129150390625, + "logps/rejected": -101.45845031738281, + "loss": 0.6837, + "pred_label": 445.79998779296875, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.16171860694885254, + "rewards/margins": 0.11343212425708771, + "rewards/rejected": -0.27515071630477905, "step": 260, - "use_label": 4933.14990234375 + "use_label": 4644.2001953125 }, { "epoch": 0.28, - "grad_norm": 2.265625, + "grad_norm": 1.0703125, "learning_rate": 4.510653863290871e-06, - "logits/chosen": -1.5190045833587646, - "logits/rejected": -1.5413776636123657, - "logps/chosen": -123.2553482055664, - "logps/rejected": -132.5965118408203, - "loss": 0.6699, - "pred_label": 174.9499969482422, - "rewards/accuracies": 0.3187499940395355, - "rewards/chosen": -0.478261798620224, - "rewards/margins": 0.1528010368347168, - "rewards/rejected": -0.6310628056526184, + "logits/chosen": -2.1512458324432373, + "logits/rejected": -2.164412021636963, + "logps/chosen": -91.74055480957031, + "logps/rejected": -95.13731384277344, + "loss": 0.6883, + "pred_label": 470.04998779296875, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.16311386227607727, + "rewards/margins": 0.0933571308851242, + "rewards/rejected": -0.2564709782600403, "step": 270, - "use_label": 5075.0498046875 + "use_label": 4779.9501953125 }, { "epoch": 0.29, - "grad_norm": 1.390625, + "grad_norm": 0.8828125, "learning_rate": 4.454985830346574e-06, - "logits/chosen": -1.4161837100982666, - "logits/rejected": -1.461897850036621, - "logps/chosen": -97.59378051757812, - "logps/rejected": -113.92098236083984, - "loss": 0.6682, - "pred_label": 182.6750030517578, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.36361420154571533, - "rewards/margins": 0.11149580776691437, - "rewards/rejected": -0.4751099944114685, + "logits/chosen": -2.0734293460845947, + "logits/rejected": -2.1033730506896973, + "logps/chosen": -76.7903823852539, + "logps/rejected": -86.99803161621094, + "loss": 0.6858, + "pred_label": 494.9750061035156, + "rewards/accuracies": 0.29374998807907104, + "rewards/chosen": -0.15558014810085297, + "rewards/margins": 0.050300367176532745, + "rewards/rejected": -0.2058805227279663, "step": 280, - "use_label": 5227.3251953125 + "use_label": 4915.02490234375 }, { "epoch": 0.3, - "grad_norm": 2.0, + "grad_norm": 1.3125, "learning_rate": 4.396703177135262e-06, - "logits/chosen": -1.1572140455245972, - "logits/rejected": -1.1582170724868774, - "logps/chosen": -113.84346008300781, - "logps/rejected": -136.17958068847656, - "loss": 0.6783, - "pred_label": 200.5749969482422, - "rewards/accuracies": 0.3687500059604645, - "rewards/chosen": -0.3756815791130066, - "rewards/margins": 0.2912302017211914, - "rewards/rejected": -0.666911780834198, + "logits/chosen": -1.9870249032974243, + "logits/rejected": -1.956434965133667, + "logps/chosen": -89.98160552978516, + "logps/rejected": -99.75212097167969, + "loss": 0.6905, + "pred_label": 527.0499877929688, + "rewards/accuracies": 0.36250001192092896, + "rewards/chosen": -0.13706301152706146, + "rewards/margins": 0.16557420790195465, + "rewards/rejected": -0.3026372492313385, "step": 290, - "use_label": 5369.4248046875 + "use_label": 5042.9501953125 }, { "epoch": 0.31, - "grad_norm": 2.59375, + "grad_norm": 1.6015625, "learning_rate": 4.335883851539693e-06, - "logits/chosen": -0.7739458084106445, - "logits/rejected": -0.8222519159317017, - "logps/chosen": -96.963623046875, - "logps/rejected": -129.59027099609375, - "loss": 0.6728, - "pred_label": 223.0749969482422, - "rewards/accuracies": 0.3187499940395355, - "rewards/chosen": -0.43035492300987244, - "rewards/margins": 0.2433358132839203, - "rewards/rejected": -0.6736907958984375, + "logits/chosen": -1.9497883319854736, + "logits/rejected": -1.964604377746582, + "logps/chosen": -68.64933013916016, + "logps/rejected": -91.48945617675781, + "loss": 0.6848, + "pred_label": 561.8499755859375, + "rewards/accuracies": 0.33125001192092896, + "rewards/chosen": -0.14721202850341797, + "rewards/margins": 0.14547064900398254, + "rewards/rejected": -0.2926826477050781, "step": 300, - "use_label": 5506.9248046875 + "use_label": 5168.14990234375 }, { "epoch": 0.31, - "eval_logits/chosen": -0.7414401173591614, - "eval_logits/rejected": -0.6762140393257141, - "eval_logps/chosen": -116.0198745727539, - "eval_logps/rejected": -145.98529052734375, - "eval_loss": 0.6783695220947266, - "eval_pred_label": 270.96826171875, - "eval_rewards/accuracies": 0.3373015820980072, - "eval_rewards/chosen": -0.4711931049823761, - "eval_rewards/margins": 0.23467905819416046, - "eval_rewards/rejected": -0.7058721780776978, - "eval_runtime": 248.0617, - "eval_samples_per_second": 8.063, - "eval_steps_per_second": 0.254, - "eval_use_label": 5793.03173828125, + "eval_logits/chosen": -1.9156862497329712, + "eval_logits/rejected": -1.8827954530715942, + "eval_logps/chosen": -89.57630920410156, + "eval_logps/rejected": -109.2765884399414, + "eval_loss": 0.6877307295799255, + "eval_pred_label": 626.1270141601562, + "eval_rewards/accuracies": 0.341269850730896, + "eval_rewards/chosen": -0.20675767958164215, + "eval_rewards/margins": 0.13202756643295288, + "eval_rewards/rejected": -0.33878523111343384, + "eval_runtime": 246.2269, + "eval_samples_per_second": 8.123, + "eval_steps_per_second": 0.256, + "eval_use_label": 5437.873046875, "step": 300 }, { "epoch": 0.32, - "grad_norm": 2.21875, + "grad_norm": 1.5, "learning_rate": 4.2726091940171055e-06, - "logits/chosen": -1.1870858669281006, - "logits/rejected": -1.1604619026184082, - "logps/chosen": -93.13883972167969, - "logps/rejected": -115.36534118652344, - "loss": 0.6683, - "pred_label": 314.92498779296875, - "rewards/accuracies": 0.3062500059604645, - "rewards/chosen": -0.4414878487586975, - "rewards/margins": 0.11716248840093613, - "rewards/rejected": -0.5586503148078918, + "logits/chosen": -2.043640613555908, + "logits/rejected": -2.01674222946167, + "logps/chosen": -72.24534606933594, + "logps/rejected": -89.407470703125, + "loss": 0.6865, + "pred_label": 688.9500122070312, + "rewards/accuracies": 0.29374998807907104, + "rewards/chosen": -0.23255303502082825, + "rewards/margins": 0.06651856750249863, + "rewards/rejected": -0.29907160997390747, "step": 310, - "use_label": 6079.0751953125 + "use_label": 5705.0498046875 }, { "epoch": 0.33, - "grad_norm": 2.34375, + "grad_norm": 1.1796875, "learning_rate": 4.206963828813555e-06, - "logits/chosen": -0.7778801918029785, - "logits/rejected": -0.8481136560440063, - "logps/chosen": -115.88163757324219, - "logps/rejected": -152.60556030273438, - "loss": 0.6715, - "pred_label": 340.4750061035156, + "logits/chosen": -1.9597671031951904, + "logits/rejected": -1.9893718957901, + "logps/chosen": -94.37977600097656, + "logps/rejected": -118.25643157958984, + "loss": 0.6871, + "pred_label": 724.375, "rewards/accuracies": 0.36250001192092896, - "rewards/chosen": -0.41940560936927795, - "rewards/margins": 0.2641361653804779, - "rewards/rejected": -0.6835418343544006, + "rewards/chosen": -0.20438706874847412, + "rewards/margins": 0.13566336035728455, + "rewards/rejected": -0.34005045890808105, "step": 320, - "use_label": 6213.52490234375 + "use_label": 5829.625 }, { "epoch": 0.35, - "grad_norm": 2.8125, + "grad_norm": 0.95703125, "learning_rate": 4.139035550786495e-06, - "logits/chosen": -0.7590861320495605, - "logits/rejected": -0.6889998316764832, - "logps/chosen": -90.45745086669922, - "logps/rejected": -116.98609924316406, - "loss": 0.6718, - "pred_label": 364.04998779296875, - "rewards/accuracies": 0.3499999940395355, - "rewards/chosen": -0.269859254360199, - "rewards/margins": 0.2574610114097595, - "rewards/rejected": -0.5273202657699585, + "logits/chosen": -1.989506483078003, + "logits/rejected": -1.9580066204071045, + "logps/chosen": -73.50363159179688, + "logps/rejected": -87.75289154052734, + "loss": 0.683, + "pred_label": 754.4500122070312, + "rewards/accuracies": 0.36250001192092896, + "rewards/chosen": -0.1003209576010704, + "rewards/margins": 0.13466720283031464, + "rewards/rejected": -0.23498816788196564, "step": 330, - "use_label": 6349.9501953125 + "use_label": 5959.5498046875 }, { "epoch": 0.36, - "grad_norm": 2.703125, + "grad_norm": 1.0234375, "learning_rate": 4.068915207986931e-06, - "logits/chosen": -0.6209542155265808, - "logits/rejected": -0.47464966773986816, - "logps/chosen": -102.54121398925781, - "logps/rejected": -130.24276733398438, - "loss": 0.678, - "pred_label": 391.07501220703125, - "rewards/accuracies": 0.32499998807907104, - "rewards/chosen": -0.395341694355011, - "rewards/margins": 0.19944116473197937, - "rewards/rejected": -0.594782829284668, + "logits/chosen": -2.0428695678710938, + "logits/rejected": -2.016120195388794, + "logps/chosen": -74.91081237792969, + "logps/rejected": -93.89201354980469, + "loss": 0.6894, + "pred_label": 786.4749755859375, + "rewards/accuracies": 0.3375000059604645, + "rewards/chosen": -0.11903776973485947, + "rewards/margins": 0.11223740875720978, + "rewards/rejected": -0.23127520084381104, "step": 340, - "use_label": 6482.9248046875 + "use_label": 6087.52490234375 }, { "epoch": 0.37, - "grad_norm": 1.8203125, + "grad_norm": 0.984375, "learning_rate": 3.996696580158211e-06, - "logits/chosen": -0.7593547701835632, - "logits/rejected": -0.6881019473075867, - "logps/chosen": -92.08587646484375, - "logps/rejected": -110.56968688964844, - "loss": 0.6727, - "pred_label": 411.9750061035156, - "rewards/accuracies": 0.3187499940395355, - "rewards/chosen": -0.31440719962120056, - "rewards/margins": 0.14519965648651123, - "rewards/rejected": -0.4596068263053894, + "logits/chosen": -2.0441341400146484, + "logits/rejected": -2.0229620933532715, + "logps/chosen": -73.9575424194336, + "logps/rejected": -86.34129333496094, + "loss": 0.6869, + "pred_label": 817.5250244140625, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.133123978972435, + "rewards/margins": 0.08419892936944962, + "rewards/rejected": -0.2173229157924652, "step": 350, - "use_label": 6622.02490234375 + "use_label": 6216.47509765625 }, { "epoch": 0.38, - "grad_norm": 1.9609375, + "grad_norm": 1.546875, "learning_rate": 3.922476253313921e-06, - "logits/chosen": -0.996785044670105, - "logits/rejected": -0.9698454737663269, - "logps/chosen": -101.95857238769531, - "logps/rejected": -114.76066589355469, - "loss": 0.6783, - "pred_label": 427.95001220703125, - "rewards/accuracies": 0.36250001192092896, - "rewards/chosen": -0.32893818616867065, - "rewards/margins": 0.16835859417915344, - "rewards/rejected": -0.4972967505455017, + "logits/chosen": -2.0575146675109863, + "logits/rejected": -2.054591417312622, + "logps/chosen": -82.88232421875, + "logps/rejected": -90.05668640136719, + "loss": 0.6863, + "pred_label": 848.6500244140625, + "rewards/accuracies": 0.3375000059604645, + "rewards/chosen": -0.13817565143108368, + "rewards/margins": 0.11208128929138184, + "rewards/rejected": -0.2502569556236267, "step": 360, - "use_label": 6766.0498046875 + "use_label": 6345.35009765625 }, { "epoch": 0.39, - "grad_norm": 1.484375, + "grad_norm": 0.75, "learning_rate": 3.846353490562664e-06, - "logits/chosen": -1.0720884799957275, - "logits/rejected": -0.859793484210968, - "logps/chosen": -103.12544250488281, - "logps/rejected": -120.26700592041016, - "loss": 0.6626, - "pred_label": 443.875, - "rewards/accuracies": 0.3499999940395355, - "rewards/chosen": -0.29031243920326233, - "rewards/margins": 0.2187139242887497, - "rewards/rejected": -0.509026288986206, + "logits/chosen": -2.076312780380249, + "logits/rejected": -1.9995708465576172, + "logps/chosen": -85.83981323242188, + "logps/rejected": -95.1656723022461, + "loss": 0.6844, + "pred_label": 880.4249877929688, + "rewards/accuracies": 0.3687500059604645, + "rewards/chosen": -0.11745607852935791, + "rewards/margins": 0.14055705070495605, + "rewards/rejected": -0.2580130994319916, "step": 370, - "use_label": 6910.125 + "use_label": 6473.5751953125 }, { "epoch": 0.4, - "grad_norm": 2.671875, + "grad_norm": 0.96484375, "learning_rate": 3.768430099352445e-06, - "logits/chosen": -0.4451161324977875, - "logits/rejected": -0.32113510370254517, - "logps/chosen": -106.78487396240234, - "logps/rejected": -132.46365356445312, - "loss": 0.6759, - "pred_label": 463.2749938964844, - "rewards/accuracies": 0.34375, - "rewards/chosen": -0.4723123610019684, - "rewards/margins": 0.1635245531797409, - "rewards/rejected": -0.6358368992805481, + "logits/chosen": -2.0079166889190674, + "logits/rejected": -1.986297845840454, + "logps/chosen": -76.30638122558594, + "logps/rejected": -93.93800354003906, + "loss": 0.6924, + "pred_label": 912.5999755859375, + "rewards/accuracies": 0.3062500059604645, + "rewards/chosen": -0.1675274670124054, + "rewards/margins": 0.08305275440216064, + "rewards/rejected": -0.25058022141456604, "step": 380, - "use_label": 7050.72509765625 + "use_label": 6601.39990234375 }, { "epoch": 0.41, - "grad_norm": 2.875, + "grad_norm": 0.97265625, "learning_rate": 3.6888102953122307e-06, - "logits/chosen": -0.02978489175438881, - "logits/rejected": -0.03225391358137131, - "logps/chosen": -136.5101318359375, - "logps/rejected": -144.56173706054688, - "loss": 0.6827, - "pred_label": 495.82501220703125, - "rewards/accuracies": 0.32499998807907104, - "rewards/chosen": -0.516473650932312, - "rewards/margins": 0.25756725668907166, - "rewards/rejected": -0.774040937423706, + "logits/chosen": -1.9291635751724243, + "logits/rejected": -1.914608359336853, + "logps/chosen": -101.44157409667969, + "logps/rejected": -96.10136413574219, + "loss": 0.6878, + "pred_label": 952.8250122070312, + "rewards/accuracies": 0.3187499940395355, + "rewards/chosen": -0.1657881736755371, + "rewards/margins": 0.12364902347326279, + "rewards/rejected": -0.2894372344017029, "step": 390, - "use_label": 7178.1748046875 + "use_label": 6721.1748046875 }, { "epoch": 0.42, - "grad_norm": 2.515625, + "grad_norm": 1.296875, "learning_rate": 3.607600562872785e-06, - "logits/chosen": 0.09610392153263092, - "logits/rejected": 0.09092014282941818, - "logps/chosen": -117.39754486083984, - "logps/rejected": -152.1193084716797, - "loss": 0.6715, - "pred_label": 523.0250244140625, - "rewards/accuracies": 0.32499998807907104, - "rewards/chosen": -0.4636654853820801, - "rewards/margins": 0.2620038390159607, - "rewards/rejected": -0.725669264793396, + "logits/chosen": -1.8988447189331055, + "logits/rejected": -1.8926557302474976, + "logps/chosen": -87.97608947753906, + "logps/rejected": -108.15446472167969, + "loss": 0.6857, + "pred_label": 987.5999755859375, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.16945099830627441, + "rewards/margins": 0.11657001823186874, + "rewards/rejected": -0.28602102398872375, "step": 400, - "use_label": 7310.97509765625 + "use_label": 6846.39990234375 }, { "epoch": 0.42, - "eval_logits/chosen": 0.6419875621795654, - "eval_logits/rejected": 0.764842689037323, - "eval_logps/chosen": -113.52101135253906, - "eval_logps/rejected": -148.9146270751953, - "eval_loss": 0.6811794638633728, - "eval_pred_label": 572.6825561523438, - "eval_rewards/accuracies": 0.3551587164402008, - "eval_rewards/chosen": -0.44620463252067566, - "eval_rewards/margins": 0.2889607846736908, - "eval_rewards/rejected": -0.7351653575897217, - "eval_runtime": 247.9054, - "eval_samples_per_second": 8.068, - "eval_steps_per_second": 0.254, - "eval_use_label": 7595.3173828125, + "eval_logits/chosen": -1.4529144763946533, + "eval_logits/rejected": -1.4031411409378052, + "eval_logps/chosen": -86.92367553710938, + "eval_logps/rejected": -108.39134979248047, + "eval_loss": 0.6884719133377075, + "eval_pred_label": 1055.5555419921875, + "eval_rewards/accuracies": 0.3531745970249176, + "eval_rewards/chosen": -0.18023118376731873, + "eval_rewards/margins": 0.14970164000988007, + "eval_rewards/rejected": -0.32993283867836, + "eval_runtime": 246.35, + "eval_samples_per_second": 8.119, + "eval_steps_per_second": 0.256, + "eval_use_label": 7112.4443359375, "step": 400 }, { "epoch": 0.43, - "grad_norm": 2.671875, + "grad_norm": 1.28125, "learning_rate": 3.5249095128531863e-06, - "logits/chosen": 0.34008723497390747, - "logits/rejected": 0.07830000668764114, - "logps/chosen": -100.46337890625, - "logps/rejected": -124.54425048828125, - "loss": 0.6735, - "pred_label": 637.2999877929688, - "rewards/accuracies": 0.3812499940395355, - "rewards/chosen": -0.3595535457134247, - "rewards/margins": 0.30695658922195435, - "rewards/rejected": -0.6665101647377014, + "logits/chosen": -1.289879560470581, + "logits/rejected": -1.4085474014282227, + "logps/chosen": -85.75054168701172, + "logps/rejected": -96.24283599853516, + "loss": 0.6874, + "pred_label": 1135.699951171875, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -0.21242520213127136, + "rewards/margins": 0.17107079923152924, + "rewards/rejected": -0.3834960162639618, "step": 410, - "use_label": 7860.7001953125 + "use_label": 7362.2998046875 }, { "epoch": 0.44, - "grad_norm": 2.53125, + "grad_norm": 0.97265625, "learning_rate": 3.4408477372034743e-06, - "logits/chosen": 0.0810169205069542, - "logits/rejected": 0.1531490534543991, - "logps/chosen": -104.22071838378906, - "logps/rejected": -134.35255432128906, - "loss": 0.6774, - "pred_label": 659.75, - "rewards/accuracies": 0.32499998807907104, - "rewards/chosen": -0.4057747721672058, - "rewards/margins": 0.256902813911438, - "rewards/rejected": -0.6626775860786438, + "logits/chosen": -1.2336995601654053, + "logits/rejected": -1.1623611450195312, + "logps/chosen": -97.20266723632812, + "logps/rejected": -117.6893081665039, + "loss": 0.6882, + "pred_label": 1171.425048828125, + "rewards/accuracies": 0.3375000059604645, + "rewards/chosen": -0.3355943560600281, + "rewards/margins": 0.16045086085796356, + "rewards/rejected": -0.49604520201683044, "step": 420, - "use_label": 7998.25 + "use_label": 7486.5751953125 }, { "epoch": 0.45, - "grad_norm": 2.03125, + "grad_norm": 1.1484375, "learning_rate": 3.355527661097728e-06, - "logits/chosen": 0.1920831948518753, - "logits/rejected": 0.327668160200119, - "logps/chosen": -127.31324768066406, - "logps/rejected": -138.50869750976562, - "loss": 0.6761, - "pred_label": 684.7249755859375, - "rewards/accuracies": 0.30000001192092896, - "rewards/chosen": -0.5084472894668579, - "rewards/margins": 0.1930726319551468, - "rewards/rejected": -0.7015198469161987, + "logits/chosen": -1.3129976987838745, + "logits/rejected": -1.2275488376617432, + "logps/chosen": -106.88911437988281, + "logps/rejected": -112.3751449584961, + "loss": 0.6918, + "pred_label": 1207.9749755859375, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.3042059540748596, + "rewards/margins": 0.13597823679447174, + "rewards/rejected": -0.44018417596817017, "step": 430, - "use_label": 8133.27490234375 + "use_label": 7610.02490234375 }, { "epoch": 0.46, - "grad_norm": 3.46875, + "grad_norm": 1.5625, "learning_rate": 3.269063392575352e-06, - "logits/chosen": 0.2943039536476135, - "logits/rejected": 0.04520421102643013, - "logps/chosen": -116.11759185791016, - "logps/rejected": -138.48878479003906, - "loss": 0.6761, - "pred_label": 708.7999877929688, - "rewards/accuracies": 0.3187499940395355, - "rewards/chosen": -0.48671650886535645, - "rewards/margins": 0.26555269956588745, - "rewards/rejected": -0.7522691488265991, + "logits/chosen": -1.3159044981002808, + "logits/rejected": -1.413769006729126, + "logps/chosen": -90.12797546386719, + "logps/rejected": -101.85379028320312, + "loss": 0.6858, + "pred_label": 1242.5, + "rewards/accuracies": 0.33125001192092896, + "rewards/chosen": -0.22682049870491028, + "rewards/margins": 0.159098818898201, + "rewards/rejected": -0.3859192728996277, "step": 440, - "use_label": 8269.2001953125 + "use_label": 7735.5 }, { "epoch": 0.47, - "grad_norm": 2.046875, + "grad_norm": 1.375, "learning_rate": 3.181570569931697e-06, - "logits/chosen": 0.16918572783470154, - "logits/rejected": -0.033099401742219925, - "logps/chosen": -121.46165466308594, - "logps/rejected": -148.68612670898438, - "loss": 0.6878, - "pred_label": 732.25, - "rewards/accuracies": 0.34375, - "rewards/chosen": -0.4864117503166199, - "rewards/margins": 0.24023088812828064, - "rewards/rejected": -0.7266427278518677, + "logits/chosen": -1.4389588832855225, + "logits/rejected": -1.5265202522277832, + "logps/chosen": -96.37947845458984, + "logps/rejected": -113.1718521118164, + "loss": 0.6951, + "pred_label": 1281.3499755859375, + "rewards/accuracies": 0.3375000059604645, + "rewards/chosen": -0.2355901300907135, + "rewards/margins": 0.13590970635414124, + "rewards/rejected": -0.37149983644485474, "step": 450, - "use_label": 8405.75 + "use_label": 7856.64990234375 }, { "epoch": 0.48, - "grad_norm": 2.109375, + "grad_norm": 1.015625, "learning_rate": 3.09316620706208e-06, - "logits/chosen": 0.21930424869060516, - "logits/rejected": 0.31035083532333374, - "logps/chosen": -91.38786315917969, - "logps/rejected": -110.15885925292969, - "loss": 0.6718, - "pred_label": 753.5999755859375, - "rewards/accuracies": 0.3062500059604645, - "rewards/chosen": -0.34389492869377136, - "rewards/margins": 0.23962631821632385, - "rewards/rejected": -0.5835212469100952, + "logits/chosen": -1.2455997467041016, + "logits/rejected": -1.1902601718902588, + "logps/chosen": -72.07853698730469, + "logps/rejected": -84.86478424072266, + "loss": 0.6842, + "pred_label": 1311.824951171875, + "rewards/accuracies": 0.29374998807907104, + "rewards/chosen": -0.1508016437292099, + "rewards/margins": 0.1797787994146347, + "rewards/rejected": -0.330580472946167, "step": 460, - "use_label": 8544.400390625 + "use_label": 7986.1748046875 }, { "epoch": 0.49, - "grad_norm": 2.453125, + "grad_norm": 1.1015625, "learning_rate": 3.0039685369660785e-06, - "logits/chosen": 0.31231826543807983, - "logits/rejected": 0.4929059147834778, - "logps/chosen": -109.46476745605469, - "logps/rejected": -141.3244171142578, - "loss": 0.6786, - "pred_label": 777.9000244140625, - "rewards/accuracies": 0.35624998807907104, - "rewards/chosen": -0.4255266785621643, - "rewards/margins": 0.30392220616340637, - "rewards/rejected": -0.7294487953186035, + "logits/chosen": -1.175449252128601, + "logits/rejected": -1.0759943723678589, + "logps/chosen": -88.91249084472656, + "logps/rejected": -110.02799987792969, + "loss": 0.6873, + "pred_label": 1345.1500244140625, + "rewards/accuracies": 0.3687500059604645, + "rewards/chosen": -0.22000393271446228, + "rewards/margins": 0.1964809000492096, + "rewards/rejected": -0.4164848327636719, "step": 470, - "use_label": 8680.099609375 + "use_label": 8112.85009765625 }, { "epoch": 0.5, - "grad_norm": 1.6328125, + "grad_norm": 1.0859375, "learning_rate": 2.91409685362137e-06, - "logits/chosen": 0.5682842135429382, - "logits/rejected": 0.4352129399776459, - "logps/chosen": -135.01535034179688, - "logps/rejected": -157.18922424316406, - "loss": 0.6756, - "pred_label": 809.4000244140625, - "rewards/accuracies": 0.3187499940395355, - "rewards/chosen": -0.598731517791748, - "rewards/margins": 0.19433310627937317, - "rewards/rejected": -0.7930646538734436, + "logits/chosen": -1.0014227628707886, + "logits/rejected": -1.0880533456802368, + "logps/chosen": -99.41879272460938, + "logps/rejected": -120.02769470214844, + "loss": 0.6868, + "pred_label": 1391.25, + "rewards/accuracies": 0.35624998807907104, + "rewards/chosen": -0.24276605248451233, + "rewards/margins": 0.17868337035179138, + "rewards/rejected": -0.4214494228363037, "step": 480, - "use_label": 8808.599609375 + "use_label": 8226.75 }, { "epoch": 0.51, - "grad_norm": 1.8359375, + "grad_norm": 1.4375, "learning_rate": 2.8236713524386085e-06, - "logits/chosen": 0.13114799559116364, - "logits/rejected": 0.3516528606414795, - "logps/chosen": -117.50811767578125, - "logps/rejected": -130.34207153320312, - "loss": 0.6686, - "pred_label": 831.2999877929688, - "rewards/accuracies": 0.3062500059604645, - "rewards/chosen": -0.5088413953781128, - "rewards/margins": 0.19557976722717285, - "rewards/rejected": -0.7044212222099304, + "logits/chosen": -1.0729541778564453, + "logits/rejected": -0.9298813939094543, + "logps/chosen": -88.73147583007812, + "logps/rejected": -94.53245544433594, + "loss": 0.6921, + "pred_label": 1428.9000244140625, + "rewards/accuracies": 0.26875001192092896, + "rewards/chosen": -0.22107498347759247, + "rewards/margins": 0.12524999678134918, + "rewards/rejected": -0.34632498025894165, "step": 490, - "use_label": 8946.7001953125 + "use_label": 8349.099609375 }, { "epoch": 0.52, - "grad_norm": 1.9296875, + "grad_norm": 1.421875, "learning_rate": 2.7328129695107205e-06, - "logits/chosen": 0.332313597202301, - "logits/rejected": 0.04164884611964226, - "logps/chosen": -148.10968017578125, - "logps/rejected": -173.51394653320312, - "loss": 0.6744, - "pred_label": 849.9749755859375, - "rewards/accuracies": 0.42500001192092896, - "rewards/chosen": -0.7197140455245972, - "rewards/margins": 0.24881935119628906, - "rewards/rejected": -0.9685333967208862, + "logits/chosen": -0.8902079463005066, + "logits/rejected": -1.065393090248108, + "logps/chosen": -113.58573150634766, + "logps/rejected": -131.9083709716797, + "loss": 0.6894, + "pred_label": 1462.4000244140625, + "rewards/accuracies": 0.41874998807907104, + "rewards/chosen": -0.37447452545166016, + "rewards/margins": 0.17800332605838776, + "rewards/rejected": -0.5524778962135315, "step": 500, - "use_label": 9088.025390625 + "use_label": 8475.599609375 }, { "epoch": 0.52, - "eval_logits/chosen": 0.6149206757545471, - "eval_logits/rejected": 0.7128078937530518, - "eval_logps/chosen": -120.11334228515625, - "eval_logps/rejected": -151.15725708007812, - "eval_loss": 0.672174334526062, - "eval_pred_label": 893.8412475585938, - "eval_rewards/accuracies": 0.341269850730896, - "eval_rewards/chosen": -0.5121279954910278, - "eval_rewards/margins": 0.24546381831169128, - "eval_rewards/rejected": -0.7575918436050415, - "eval_runtime": 247.8447, - "eval_samples_per_second": 8.07, - "eval_steps_per_second": 0.254, - "eval_use_label": 9378.1591796875, + "eval_logits/chosen": -0.6888664960861206, + "eval_logits/rejected": -0.5997034311294556, + "eval_logps/chosen": -97.52025604248047, + "eval_logps/rejected": -120.9921646118164, + "eval_loss": 0.6891720294952393, + "eval_pred_label": 1530.5714111328125, + "eval_rewards/accuracies": 0.3551587164402008, + "eval_rewards/chosen": -0.28619715571403503, + "eval_rewards/margins": 0.1697438359260559, + "eval_rewards/rejected": -0.45594096183776855, + "eval_runtime": 246.2759, + "eval_samples_per_second": 8.121, + "eval_steps_per_second": 0.256, + "eval_use_label": 8741.4287109375, "step": 500 }, { "epoch": 0.53, - "grad_norm": 1.7578125, + "grad_norm": 1.0078125, "learning_rate": 2.641643219871597e-06, - "logits/chosen": 0.4815472662448883, - "logits/rejected": 0.2771294116973877, - "logps/chosen": -110.22953033447266, - "logps/rejected": -142.9767608642578, - "loss": 0.6765, - "pred_label": 940.7249755859375, + "logits/chosen": -0.7708507776260376, + "logits/rejected": -0.882653534412384, + "logps/chosen": -90.50456237792969, + "logps/rejected": -116.84162902832031, + "loss": 0.686, + "pred_label": 1610.5999755859375, "rewards/accuracies": 0.36250001192092896, - "rewards/chosen": -0.4598473608493805, - "rewards/margins": 0.2644655704498291, - "rewards/rejected": -0.724312961101532, + "rewards/chosen": -0.2625977396965027, + "rewards/margins": 0.20036396384239197, + "rewards/rejected": -0.4629616141319275, "step": 510, - "use_label": 9661.275390625 + "use_label": 8991.400390625 }, { "epoch": 0.54, - "grad_norm": 2.890625, + "grad_norm": 1.4765625, "learning_rate": 2.5502840349805074e-06, - "logits/chosen": 0.290465772151947, - "logits/rejected": 0.05848363786935806, - "logps/chosen": -115.6847915649414, - "logps/rejected": -137.01820373535156, - "loss": 0.6804, - "pred_label": 966.2000122070312, - "rewards/accuracies": 0.34375, - "rewards/chosen": -0.4329158365726471, - "rewards/margins": 0.21230947971343994, - "rewards/rejected": -0.6452253460884094, + "logits/chosen": -0.8800374865531921, + "logits/rejected": -1.038163185119629, + "logps/chosen": -100.99266052246094, + "logps/rejected": -116.75798034667969, + "loss": 0.6895, + "pred_label": 1653.0, + "rewards/accuracies": 0.36250001192092896, + "rewards/chosen": -0.2859944701194763, + "rewards/margins": 0.15662841498851776, + "rewards/rejected": -0.4426229000091553, "step": 520, - "use_label": 9795.7998046875 + "use_label": 9109.0 }, { "epoch": 0.55, - "grad_norm": 2.015625, + "grad_norm": 1.3671875, "learning_rate": 2.4588575996495797e-06, - "logits/chosen": 0.5015053153038025, - "logits/rejected": 0.544513463973999, - "logps/chosen": -124.15202331542969, - "logps/rejected": -145.21820068359375, - "loss": 0.6847, - "pred_label": 992.7999877929688, - "rewards/accuracies": 0.3375000059604645, - "rewards/chosen": -0.49871382117271423, - "rewards/margins": 0.27802106738090515, - "rewards/rejected": -0.7767347693443298, + "logits/chosen": -0.8304817080497742, + "logits/rejected": -0.7847825288772583, + "logps/chosen": -105.92545318603516, + "logps/rejected": -117.15931701660156, + "loss": 0.6895, + "pred_label": 1692.175048828125, + "rewards/accuracies": 0.36250001192092896, + "rewards/chosen": -0.316447913646698, + "rewards/margins": 0.17969803512096405, + "rewards/rejected": -0.49614596366882324, "step": 530, - "use_label": 9929.2001953125 + "use_label": 9229.8251953125 }, { "epoch": 0.57, - "grad_norm": 3.796875, + "grad_norm": 2.03125, "learning_rate": 2.367486188632446e-06, - "logits/chosen": 0.7326034903526306, - "logits/rejected": 0.5614863634109497, - "logps/chosen": -132.87228393554688, - "logps/rejected": -159.64352416992188, - "loss": 0.6731, - "pred_label": 1025.175048828125, - "rewards/accuracies": 0.3812499940395355, - "rewards/chosen": -0.5613355040550232, - "rewards/margins": 0.30218708515167236, - "rewards/rejected": -0.8635226488113403, + "logits/chosen": -0.67156982421875, + "logits/rejected": -0.8070074319839478, + "logps/chosen": -112.666748046875, + "logps/rejected": -131.92593383789062, + "loss": 0.6896, + "pred_label": 1734.375, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.35928016901016235, + "rewards/margins": 0.22706659138202667, + "rewards/rejected": -0.5863467454910278, "step": 540, - "use_label": 10056.8251953125 + "use_label": 9347.625 }, { "epoch": 0.58, - "grad_norm": 2.4375, + "grad_norm": 1.796875, "learning_rate": 2.276292003092593e-06, - "logits/chosen": 0.6115967631340027, - "logits/rejected": 0.6694309711456299, - "logps/chosen": -123.36216735839844, - "logps/rejected": -129.92201232910156, - "loss": 0.684, - "pred_label": 1044.2249755859375, - "rewards/accuracies": 0.28125, - "rewards/chosen": -0.553015947341919, - "rewards/margins": 0.14989802241325378, - "rewards/rejected": -0.7029139995574951, + "logits/chosen": -0.7944391369819641, + "logits/rejected": -0.7596977353096008, + "logps/chosen": -107.38740539550781, + "logps/rejected": -111.28292083740234, + "loss": 0.6887, + "pred_label": 1775.7249755859375, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.3932684063911438, + "rewards/margins": 0.12325477600097656, + "rewards/rejected": -0.5165232419967651, "step": 550, - "use_label": 10197.775390625 + "use_label": 9466.275390625 }, { "epoch": 0.59, - "grad_norm": 2.171875, + "grad_norm": 1.3515625, "learning_rate": 2.1853970071701415e-06, - "logits/chosen": 0.649623453617096, - "logits/rejected": 0.5992484092712402, - "logps/chosen": -117.4646987915039, - "logps/rejected": -130.9988555908203, - "loss": 0.6748, - "pred_label": 1060.5999755859375, - "rewards/accuracies": 0.29374998807907104, - "rewards/chosen": -0.47909289598464966, - "rewards/margins": 0.1609223484992981, - "rewards/rejected": -0.6400152444839478, + "logits/chosen": -0.7152852416038513, + "logits/rejected": -0.7174454927444458, + "logps/chosen": -104.6649398803711, + "logps/rejected": -117.61528015136719, + "loss": 0.6901, + "pred_label": 1814.375, + "rewards/accuracies": 0.32499998807907104, + "rewards/chosen": -0.3510952889919281, + "rewards/margins": 0.15508435666561127, + "rewards/rejected": -0.5061796307563782, "step": 560, - "use_label": 10341.400390625 + "use_label": 9587.625 }, { "epoch": 0.6, - "grad_norm": 2.5, + "grad_norm": 2.125, "learning_rate": 2.0949227648656194e-06, - "logits/chosen": 0.1570337414741516, - "logits/rejected": 0.2956157624721527, - "logps/chosen": -116.41545104980469, - "logps/rejected": -151.84051513671875, - "loss": 0.6765, - "pred_label": 1081.375, + "logits/chosen": -0.925454318523407, + "logits/rejected": -0.849765956401825, + "logps/chosen": -100.53346252441406, + "logps/rejected": -131.70309448242188, + "loss": 0.6872, + "pred_label": 1852.2249755859375, "rewards/accuracies": 0.375, - "rewards/chosen": -0.49815383553504944, - "rewards/margins": 0.2765403389930725, - "rewards/rejected": -0.7746941447257996, + "rewards/chosen": -0.3393338620662689, + "rewards/margins": 0.23398590087890625, + "rewards/rejected": -0.5733197927474976, "step": 570, - "use_label": 10480.625 + "use_label": 9709.775390625 }, { "epoch": 0.61, - "grad_norm": 2.09375, + "grad_norm": 1.15625, "learning_rate": 2.00499027745888e-06, - "logits/chosen": 0.3431427478790283, - "logits/rejected": 0.18610627949237823, - "logps/chosen": -128.4036865234375, - "logps/rejected": -150.36404418945312, - "loss": 0.677, - "pred_label": 1101.3499755859375, - "rewards/accuracies": 0.33125001192092896, - "rewards/chosen": -0.5349212288856506, - "rewards/margins": 0.18440793454647064, - "rewards/rejected": -0.7193291783332825, + "logits/chosen": -0.7680953145027161, + "logits/rejected": -0.8566532135009766, + "logps/chosen": -111.98583984375, + "logps/rejected": -131.1743927001953, + "loss": 0.6879, + "pred_label": 1893.7750244140625, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.37074294686317444, + "rewards/margins": 0.1566895693540573, + "rewards/rejected": -0.5274325013160706, "step": 580, - "use_label": 10620.650390625 + "use_label": 9828.224609375 }, { "epoch": 0.62, - "grad_norm": 2.21875, + "grad_norm": 1.1171875, "learning_rate": 1.915719821680624e-06, - "logits/chosen": 0.18862374126911163, - "logits/rejected": 0.19857950508594513, - "logps/chosen": -134.26577758789062, - "logps/rejected": -165.70481872558594, - "loss": 0.6624, - "pred_label": 1121.5250244140625, - "rewards/accuracies": 0.4124999940395355, - "rewards/chosen": -0.5457721948623657, - "rewards/margins": 0.30153924226760864, - "rewards/rejected": -0.8473113775253296, + "logits/chosen": -0.8080962300300598, + "logits/rejected": -0.7905328869819641, + "logps/chosen": -125.2184066772461, + "logps/rejected": -148.79432678222656, + "loss": 0.6891, + "pred_label": 1939.25, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.4552985727787018, + "rewards/margins": 0.22290782630443573, + "rewards/rejected": -0.6782063245773315, "step": 590, - "use_label": 10760.474609375 + "use_label": 9942.75 }, { "epoch": 0.63, - "grad_norm": 3.125, + "grad_norm": 1.9609375, "learning_rate": 1.8272307888529276e-06, - "logits/chosen": 0.5807100534439087, - "logits/rejected": 0.25763237476348877, - "logps/chosen": -133.36561584472656, - "logps/rejected": -180.56822204589844, - "loss": 0.6784, - "pred_label": 1148.5999755859375, - "rewards/accuracies": 0.41874998807907104, - "rewards/chosen": -0.5903924703598022, - "rewards/margins": 0.3091353178024292, - "rewards/rejected": -0.8995277285575867, + "logits/chosen": -0.5244548320770264, + "logits/rejected": -0.7590290904045105, + "logps/chosen": -122.6807632446289, + "logps/rejected": -162.36203002929688, + "loss": 0.6881, + "pred_label": 1992.0, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.48354387283325195, + "rewards/margins": 0.23392179608345032, + "rewards/rejected": -0.7174656391143799, "step": 600, - "use_label": 10893.400390625 + "use_label": 10050.0 }, { "epoch": 0.63, - "eval_logits/chosen": 0.8859585523605347, - "eval_logits/rejected": 0.9939000606536865, - "eval_logps/chosen": -119.97545623779297, - "eval_logps/rejected": -156.7531280517578, - "eval_loss": 0.6791760325431824, - "eval_pred_label": 1206.1904296875, - "eval_rewards/accuracies": 0.3511904776096344, - "eval_rewards/chosen": -0.5107490420341492, - "eval_rewards/margins": 0.30280154943466187, - "eval_rewards/rejected": -0.8135506510734558, - "eval_runtime": 247.9094, - "eval_samples_per_second": 8.067, + "eval_logits/chosen": -0.35794487595558167, + "eval_logits/rejected": -0.2547617554664612, + "eval_logps/chosen": -107.16178131103516, + "eval_logps/rejected": -135.9844512939453, + "eval_loss": 0.6918326616287231, + "eval_pred_label": 2082.3173828125, + "eval_rewards/accuracies": 0.3531745970249176, + "eval_rewards/chosen": -0.3826123774051666, + "eval_rewards/margins": 0.22325147688388824, + "eval_rewards/rejected": -0.6058638095855713, + "eval_runtime": 248.3104, + "eval_samples_per_second": 8.054, "eval_steps_per_second": 0.254, - "eval_use_label": 11169.8095703125, + "eval_use_label": 10293.6826171875, "step": 600 }, { "epoch": 0.64, - "grad_norm": 3.234375, + "grad_norm": 1.515625, "learning_rate": 1.739641525213929e-06, - "logits/chosen": 0.6684261560440063, - "logits/rejected": 0.5376627445220947, - "logps/chosen": -112.9523696899414, - "logps/rejected": -154.13601684570312, - "loss": 0.6711, - "pred_label": 1272.925048828125, - "rewards/accuracies": 0.3499999940395355, - "rewards/chosen": -0.5404548645019531, - "rewards/margins": 0.259638249874115, - "rewards/rejected": -0.8000930547714233, + "logits/chosen": -0.572044312953949, + "logits/rejected": -0.654716432094574, + "logps/chosen": -95.46563720703125, + "logps/rejected": -132.0639190673828, + "loss": 0.6926, + "pred_label": 2185.449951171875, + "rewards/accuracies": 0.33125001192092896, + "rewards/chosen": -0.3655874729156494, + "rewards/margins": 0.21378450095653534, + "rewards/rejected": -0.579371988773346, "step": 610, - "use_label": 11433.0751953125 + "use_label": 10520.5498046875 }, { "epoch": 0.65, - "grad_norm": 2.296875, + "grad_norm": 1.0859375, "learning_rate": 1.6530691736402317e-06, - "logits/chosen": 0.6429753303527832, - "logits/rejected": 0.5887765288352966, - "logps/chosen": -125.70455169677734, - "logps/rejected": -175.96624755859375, - "loss": 0.6734, - "pred_label": 1297.300048828125, - "rewards/accuracies": 0.35624998807907104, - "rewards/chosen": -0.6399649381637573, - "rewards/margins": 0.3187193274497986, - "rewards/rejected": -0.9586843252182007, + "logits/chosen": -0.7425838708877563, + "logits/rejected": -0.7612688541412354, + "logps/chosen": -98.45491790771484, + "logps/rejected": -139.22779846191406, + "loss": 0.6874, + "pred_label": 2228.10009765625, + "rewards/accuracies": 0.39375001192092896, + "rewards/chosen": -0.3674684762954712, + "rewards/margins": 0.22383132576942444, + "rewards/rejected": -0.591299831867218, "step": 620, - "use_label": 11568.7001953125 + "use_label": 10637.900390625 }, { "epoch": 0.66, - "grad_norm": 2.046875, + "grad_norm": 1.34375, "learning_rate": 1.5676295169786864e-06, - "logits/chosen": 0.8863061666488647, - "logits/rejected": 0.5724608302116394, - "logps/chosen": -149.8594970703125, - "logps/rejected": -178.8180694580078, - "loss": 0.6716, - "pred_label": 1321.8499755859375, - "rewards/accuracies": 0.3499999940395355, - "rewards/chosen": -0.7683452367782593, - "rewards/margins": 0.2745322585105896, - "rewards/rejected": -1.042877435684204, + "logits/chosen": -0.5626051425933838, + "logits/rejected": -0.7373117208480835, + "logps/chosen": -109.76419830322266, + "logps/rejected": -132.89573669433594, + "loss": 0.6861, + "pred_label": 2268.074951171875, + "rewards/accuracies": 0.35624998807907104, + "rewards/chosen": -0.3673921525478363, + "rewards/margins": 0.2162620723247528, + "rewards/rejected": -0.5836542844772339, "step": 630, - "use_label": 11704.150390625 + "use_label": 10757.9248046875 }, { "epoch": 0.67, - "grad_norm": 2.453125, + "grad_norm": 1.2578125, "learning_rate": 1.4834368231970922e-06, - "logits/chosen": 0.6763439178466797, - "logits/rejected": 0.8844535946846008, - "logps/chosen": -157.50503540039062, - "logps/rejected": -179.59378051757812, - "loss": 0.6738, - "pred_label": 1351.8499755859375, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.8581286668777466, - "rewards/margins": 0.28607478737831116, - "rewards/rejected": -1.1442034244537354, + "logits/chosen": -0.70842045545578, + "logits/rejected": -0.5356844663619995, + "logps/chosen": -115.94453430175781, + "logps/rejected": -132.53977966308594, + "loss": 0.6881, + "pred_label": 2312.199951171875, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -0.4425238072872162, + "rewards/margins": 0.23113970458507538, + "rewards/rejected": -0.6736636161804199, "step": 640, - "use_label": 11834.150390625 + "use_label": 10873.7998046875 }, { "epoch": 0.68, - "grad_norm": 2.921875, + "grad_norm": 1.5, "learning_rate": 1.4006036925609245e-06, - "logits/chosen": 0.5444064736366272, - "logits/rejected": 1.0297753810882568, - "logps/chosen": -149.0248565673828, - "logps/rejected": -186.53790283203125, - "loss": 0.6803, - "pred_label": 1381.949951171875, + "logits/chosen": -0.7530516386032104, + "logits/rejected": -0.39667490124702454, + "logps/chosen": -117.97354888916016, + "logps/rejected": -148.5204620361328, + "loss": 0.6907, + "pred_label": 2364.60009765625, "rewards/accuracies": 0.35624998807907104, - "rewards/chosen": -0.7583541870117188, - "rewards/margins": 0.32841619849205017, - "rewards/rejected": -1.0867704153060913, + "rewards/chosen": -0.4478411078453064, + "rewards/margins": 0.25875502824783325, + "rewards/rejected": -0.7065961956977844, "step": 650, - "use_label": 11964.0498046875 + "use_label": 10981.400390625 }, { "epoch": 0.69, - "grad_norm": 2.21875, + "grad_norm": 1.2109375, "learning_rate": 1.3192409070404582e-06, - "logits/chosen": 1.0406488180160522, - "logits/rejected": 0.8413463830947876, - "logps/chosen": -115.27984619140625, - "logps/rejected": -135.30836486816406, - "loss": 0.6782, - "pred_label": 1410.574951171875, - "rewards/accuracies": 0.30000001192092896, - "rewards/chosen": -0.5715335607528687, - "rewards/margins": 0.21567881107330322, - "rewards/rejected": -0.7872124910354614, + "logits/chosen": -0.4164413511753082, + "logits/rejected": -0.5387105345726013, + "logps/chosen": -93.08172607421875, + "logps/rejected": -106.9631576538086, + "loss": 0.6884, + "pred_label": 2410.39990234375, + "rewards/accuracies": 0.3062500059604645, + "rewards/chosen": -0.3495523929595947, + "rewards/margins": 0.1542079746723175, + "rewards/rejected": -0.5037603378295898, "step": 660, - "use_label": 12095.4248046875 + "use_label": 11095.599609375 }, { "epoch": 0.7, - "grad_norm": 2.28125, + "grad_norm": 1.515625, "learning_rate": 1.2394572821496953e-06, - "logits/chosen": 0.3095243275165558, - "logits/rejected": 0.21946246922016144, - "logps/chosen": -121.89112854003906, - "logps/rejected": -147.43978881835938, - "loss": 0.6854, - "pred_label": 1429.925048828125, - "rewards/accuracies": 0.3187499940395355, - "rewards/chosen": -0.5618449449539185, - "rewards/margins": 0.2343917340040207, - "rewards/rejected": -0.7962367534637451, + "logits/chosen": -0.9564473032951355, + "logits/rejected": -1.0122594833374023, + "logps/chosen": -100.20994567871094, + "logps/rejected": -121.32554626464844, + "loss": 0.6935, + "pred_label": 2446.14990234375, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.3450331687927246, + "rewards/margins": 0.19006122648715973, + "rewards/rejected": -0.5350943803787231, "step": 670, - "use_label": 12236.0751953125 + "use_label": 11219.849609375 }, { "epoch": 0.71, - "grad_norm": 1.9921875, + "grad_norm": 1.546875, "learning_rate": 1.1613595214152713e-06, - "logits/chosen": 0.8114501237869263, - "logits/rejected": 0.7095287442207336, - "logps/chosen": -152.98135375976562, - "logps/rejected": -175.5968475341797, - "loss": 0.6781, - "pred_label": 1451.175048828125, - "rewards/accuracies": 0.3687500059604645, - "rewards/chosen": -0.6692850589752197, - "rewards/margins": 0.2704187035560608, - "rewards/rejected": -0.9397038221359253, + "logits/chosen": -0.588452935218811, + "logits/rejected": -0.6323766708374023, + "logps/chosen": -125.20991516113281, + "logps/rejected": -139.94993591308594, + "loss": 0.6902, + "pred_label": 2485.10009765625, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -0.3915707468986511, + "rewards/margins": 0.19166378676891327, + "rewards/rejected": -0.5832345485687256, "step": 680, - "use_label": 12374.8251953125 + "use_label": 11340.900390625 }, { "epoch": 0.72, - "grad_norm": 2.59375, + "grad_norm": 1.578125, "learning_rate": 1.0850520736699362e-06, - "logits/chosen": 0.7043443322181702, - "logits/rejected": 0.5973688364028931, - "logps/chosen": -175.5553436279297, - "logps/rejected": -207.06423950195312, - "loss": 0.6749, - "pred_label": 1481.625, + "logits/chosen": -0.6506579518318176, + "logits/rejected": -0.7167869806289673, + "logps/chosen": -144.53038024902344, + "logps/rejected": -167.38192749023438, + "loss": 0.6898, + "pred_label": 2534.75, "rewards/accuracies": 0.39375001192092896, - "rewards/chosen": -0.7385014891624451, - "rewards/margins": 0.37226757407188416, - "rewards/rejected": -1.1107690334320068, + "rewards/chosen": -0.42825189232826233, + "rewards/margins": 0.28569427132606506, + "rewards/rejected": -0.7139460444450378, "step": 690, - "use_label": 12504.375 + "use_label": 11451.25 }, { "epoch": 0.73, - "grad_norm": 1.75, + "grad_norm": 1.59375, "learning_rate": 1.0106369933615043e-06, - "logits/chosen": 0.44649118185043335, - "logits/rejected": 0.6410871744155884, - "logps/chosen": -136.0747833251953, - "logps/rejected": -162.20260620117188, - "loss": 0.6783, - "pred_label": 1506.0999755859375, - "rewards/accuracies": 0.32499998807907104, - "rewards/chosen": -0.6972768902778625, - "rewards/margins": 0.23985597491264343, - "rewards/rejected": -0.9371329545974731, + "logits/chosen": -0.8556931614875793, + "logits/rejected": -0.6913198232650757, + "logps/chosen": -105.3968505859375, + "logps/rejected": -124.95710754394531, + "loss": 0.6913, + "pred_label": 2580.824951171875, + "rewards/accuracies": 0.3375000059604645, + "rewards/chosen": -0.39049768447875977, + "rewards/margins": 0.17418017983436584, + "rewards/rejected": -0.564677894115448, "step": 700, - "use_label": 12639.900390625 + "use_label": 11565.1748046875 }, { "epoch": 0.73, - "eval_logits/chosen": 1.192717432975769, - "eval_logits/rejected": 1.2994883060455322, - "eval_logps/chosen": -135.23951721191406, - "eval_logps/rejected": -171.3760528564453, - "eval_loss": 0.6756439805030823, - "eval_pred_label": 1558.5238037109375, + "eval_logits/chosen": -0.3469957709312439, + "eval_logits/rejected": -0.24619349837303162, + "eval_logps/chosen": -104.32471466064453, + "eval_logps/rejected": -133.26370239257812, + "eval_loss": 0.6898515224456787, + "eval_pred_label": 2673.52392578125, "eval_rewards/accuracies": 0.3670634925365448, - "eval_rewards/chosen": -0.6633896827697754, - "eval_rewards/margins": 0.29639023542404175, - "eval_rewards/rejected": -0.9597799181938171, - "eval_runtime": 247.9992, - "eval_samples_per_second": 8.065, + "eval_rewards/chosen": -0.35424166917800903, + "eval_rewards/margins": 0.22441466152668, + "eval_rewards/rejected": -0.5786563754081726, + "eval_runtime": 248.2749, + "eval_samples_per_second": 8.056, "eval_steps_per_second": 0.254, - "eval_use_label": 12921.4765625, + "eval_use_label": 11806.4765625, "step": 700 }, { "epoch": 0.74, - "grad_norm": 3.421875, + "grad_norm": 1.03125, "learning_rate": 9.382138040640714e-07, - "logits/chosen": 0.6493266820907593, - "logits/rejected": 0.6850475072860718, - "logps/chosen": -134.1044158935547, - "logps/rejected": -165.8421173095703, - "loss": 0.6766, - "pred_label": 1616.875, - "rewards/accuracies": 0.35624998807907104, - "rewards/chosen": -0.7103394269943237, - "rewards/margins": 0.2792840301990509, - "rewards/rejected": -0.9896234273910522, + "logits/chosen": -0.6519032716751099, + "logits/rejected": -0.637380063533783, + "logps/chosen": -102.23021697998047, + "logps/rejected": -127.60137939453125, + "loss": 0.6903, + "pred_label": 2771.699951171875, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": -0.3915974497795105, + "rewards/margins": 0.21561889350414276, + "rewards/rejected": -0.6072162985801697, "step": 710, - "use_label": 13193.125 + "use_label": 12038.2998046875 }, { "epoch": 0.75, - "grad_norm": 2.203125, + "grad_norm": 1.609375, "learning_rate": 8.678793653740633e-07, - "logits/chosen": 0.7393421530723572, - "logits/rejected": 0.6282132267951965, - "logps/chosen": -108.0786361694336, - "logps/rejected": -142.93173217773438, - "loss": 0.6886, - "pred_label": 1639.2750244140625, - "rewards/accuracies": 0.2874999940395355, - "rewards/chosen": -0.5120818614959717, - "rewards/margins": 0.260955274105072, - "rewards/rejected": -0.7730370759963989, + "logits/chosen": -0.6509895324707031, + "logits/rejected": -0.6935362815856934, + "logps/chosen": -87.30061340332031, + "logps/rejected": -114.2796630859375, + "loss": 0.6903, + "pred_label": 2811.47509765625, + "rewards/accuracies": 0.29374998807907104, + "rewards/chosen": -0.30430155992507935, + "rewards/margins": 0.18221500515937805, + "rewards/rejected": -0.486516535282135, "step": 720, - "use_label": 13330.724609375 + "use_label": 12158.525390625 }, { "epoch": 0.76, - "grad_norm": 2.984375, + "grad_norm": 2.21875, "learning_rate": 7.997277433690984e-07, - "logits/chosen": 0.7698175311088562, - "logits/rejected": 0.6512314677238464, - "logps/chosen": -120.15057373046875, - "logps/rejected": -147.58602905273438, - "loss": 0.6705, - "pred_label": 1667.324951171875, - "rewards/accuracies": 0.34375, - "rewards/chosen": -0.49796366691589355, - "rewards/margins": 0.33586567640304565, - "rewards/rejected": -0.8338292837142944, + "logits/chosen": -0.6035222411155701, + "logits/rejected": -0.65208500623703, + "logps/chosen": -100.17440032958984, + "logps/rejected": -119.87808990478516, + "loss": 0.6865, + "pred_label": 2850.0, + "rewards/accuracies": 0.35624998807907104, + "rewards/chosen": -0.2982019782066345, + "rewards/margins": 0.2585477828979492, + "rewards/rejected": -0.5567497611045837, "step": 730, - "use_label": 13462.6748046875 + "use_label": 12280.0 }, { "epoch": 0.77, - "grad_norm": 1.8984375, + "grad_norm": 0.80859375, "learning_rate": 7.338500848029603e-07, - "logits/chosen": 0.9725875854492188, - "logits/rejected": 0.894719123840332, - "logps/chosen": -113.24635314941406, - "logps/rejected": -142.79689025878906, - "loss": 0.6779, - "pred_label": 1692.25, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.5262097716331482, - "rewards/margins": 0.26758259534835815, - "rewards/rejected": -0.7937922477722168, + "logits/chosen": -0.4770827293395996, + "logits/rejected": -0.5081530213356018, + "logps/chosen": -94.86068725585938, + "logps/rejected": -116.67037200927734, + "loss": 0.6916, + "pred_label": 2886.125, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.34235304594039917, + "rewards/margins": 0.19017408788204193, + "rewards/rejected": -0.5325270891189575, "step": 740, - "use_label": 13597.75 + "use_label": 12403.875 }, { "epoch": 0.79, - "grad_norm": 2.0, + "grad_norm": 1.1015625, "learning_rate": 6.70334495204884e-07, - "logits/chosen": 0.8061111569404602, - "logits/rejected": 0.7181490659713745, - "logps/chosen": -142.5282745361328, - "logps/rejected": -173.8203887939453, - "loss": 0.6814, - "pred_label": 1716.25, + "logits/chosen": -0.5357509851455688, + "logits/rejected": -0.594279408454895, + "logps/chosen": -119.76139831542969, + "logps/rejected": -145.1709747314453, + "loss": 0.6905, + "pred_label": 2929.22509765625, "rewards/accuracies": 0.35624998807907104, - "rewards/chosen": -0.6499794125556946, - "rewards/margins": 0.24588195979595184, - "rewards/rejected": -0.8958613276481628, + "rewards/chosen": -0.4223107397556305, + "rewards/margins": 0.18705633282661438, + "rewards/rejected": -0.6093670725822449, "step": 750, - "use_label": 13733.75 + "use_label": 12520.775390625 }, { "epoch": 0.8, - "grad_norm": 2.203125, + "grad_norm": 1.1640625, "learning_rate": 6.092659210462232e-07, - "logits/chosen": 0.6536890268325806, - "logits/rejected": 0.6605783104896545, - "logps/chosen": -102.72889709472656, - "logps/rejected": -149.5058135986328, - "loss": 0.6733, - "pred_label": 1741.074951171875, - "rewards/accuracies": 0.3187499940395355, - "rewards/chosen": -0.48761066794395447, - "rewards/margins": 0.28703850507736206, - "rewards/rejected": -0.7746490836143494, + "logits/chosen": -0.6737512350082397, + "logits/rejected": -0.6523575186729431, + "logps/chosen": -86.640625, + "logps/rejected": -124.01812744140625, + "loss": 0.6899, + "pred_label": 2976.050048828125, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.32672789692878723, + "rewards/margins": 0.1930442750453949, + "rewards/rejected": -0.5197721719741821, "step": 760, - "use_label": 13868.9248046875 + "use_label": 12633.9501953125 }, { "epoch": 0.81, - "grad_norm": 2.375, + "grad_norm": 1.4375, "learning_rate": 5.507260361320738e-07, - "logits/chosen": 0.6806662082672119, - "logits/rejected": 0.5722958445549011, - "logps/chosen": -151.7313690185547, - "logps/rejected": -176.56260681152344, - "loss": 0.6719, - "pred_label": 1766.5250244140625, - "rewards/accuracies": 0.40625, - "rewards/chosen": -0.6818407773971558, - "rewards/margins": 0.3464636206626892, - "rewards/rejected": -1.0283044576644897, + "logits/chosen": -0.6238114833831787, + "logits/rejected": -0.6686199307441711, + "logps/chosen": -127.0525131225586, + "logps/rejected": -142.44747924804688, + "loss": 0.689, + "pred_label": 3021.85009765625, + "rewards/accuracies": 0.39375001192092896, + "rewards/chosen": -0.43505221605300903, + "rewards/margins": 0.25210094451904297, + "rewards/rejected": -0.687153160572052, "step": 770, - "use_label": 14003.474609375 + "use_label": 12748.150390625 }, { "epoch": 0.82, - "grad_norm": 3.03125, + "grad_norm": 1.7578125, "learning_rate": 4.947931323697983e-07, - "logits/chosen": 0.6961285471916199, - "logits/rejected": 0.45934200286865234, - "logps/chosen": -136.76901245117188, - "logps/rejected": -162.4578094482422, - "loss": 0.6822, - "pred_label": 1797.9000244140625, - "rewards/accuracies": 0.34375, - "rewards/chosen": -0.6396945714950562, - "rewards/margins": 0.2714308798313141, - "rewards/rejected": -0.9111254811286926, + "logits/chosen": -0.6369722485542297, + "logits/rejected": -0.7722553014755249, + "logps/chosen": -112.76126861572266, + "logps/rejected": -133.56796264648438, + "loss": 0.6915, + "pred_label": 3075.72509765625, + "rewards/accuracies": 0.36250001192092896, + "rewards/chosen": -0.3996170461177826, + "rewards/margins": 0.22261002659797668, + "rewards/rejected": -0.6222270727157593, "step": 780, - "use_label": 14132.099609375 + "use_label": 12854.275390625 }, { "epoch": 0.83, - "grad_norm": 1.8046875, + "grad_norm": 1.421875, "learning_rate": 4.4154201506053985e-07, - "logits/chosen": 0.824557900428772, - "logits/rejected": 0.8706857562065125, - "logps/chosen": -113.81227111816406, - "logps/rejected": -128.26260375976562, - "loss": 0.6771, - "pred_label": 1823.199951171875, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.4897800385951996, - "rewards/margins": 0.2714278995990753, - "rewards/rejected": -0.7612079381942749, + "logits/chosen": -0.5256940126419067, + "logits/rejected": -0.467402845621109, + "logps/chosen": -95.73258209228516, + "logps/rejected": -103.3360366821289, + "loss": 0.6917, + "pred_label": 3123.85009765625, + "rewards/accuracies": 0.32499998807907104, + "rewards/chosen": -0.30898317694664, + "rewards/margins": 0.2029590606689453, + "rewards/rejected": -0.5119422674179077, "step": 790, - "use_label": 14266.7998046875 + "use_label": 12966.150390625 }, { "epoch": 0.84, - "grad_norm": 2.453125, + "grad_norm": 1.359375, "learning_rate": 3.910439028537638e-07, - "logits/chosen": 0.6243492364883423, - "logits/rejected": 0.7354862689971924, - "logps/chosen": -111.455322265625, - "logps/rejected": -141.9140625, - "loss": 0.6776, - "pred_label": 1847.300048828125, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.5140770673751831, - "rewards/margins": 0.28408390283584595, - "rewards/rejected": -0.7981609106063843, + "logits/chosen": -0.6677756905555725, + "logits/rejected": -0.607046902179718, + "logps/chosen": -92.61612701416016, + "logps/rejected": -115.20296478271484, + "loss": 0.6893, + "pred_label": 3166.449951171875, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": -0.3256850242614746, + "rewards/margins": 0.20536477863788605, + "rewards/rejected": -0.5310498476028442, "step": 800, - "use_label": 14402.7001953125 + "use_label": 13083.5498046875 }, { "epoch": 0.84, - "eval_logits/chosen": 1.3586419820785522, - "eval_logits/rejected": 1.4788893461227417, - "eval_logps/chosen": -123.90097045898438, - "eval_logps/rejected": -161.67913818359375, - "eval_loss": 0.6800512671470642, - "eval_pred_label": 1900.4920654296875, - "eval_rewards/accuracies": 0.3531745970249176, - "eval_rewards/chosen": -0.5500041842460632, - "eval_rewards/margins": 0.312806636095047, - "eval_rewards/rejected": -0.8628108501434326, - "eval_runtime": 247.6869, - "eval_samples_per_second": 8.075, + "eval_logits/chosen": -0.23666124045848846, + "eval_logits/rejected": -0.1293245106935501, + "eval_logps/chosen": -103.33552551269531, + "eval_logps/rejected": -132.24159240722656, + "eval_loss": 0.6903889179229736, + "eval_pred_label": 3252.09521484375, + "eval_rewards/accuracies": 0.363095223903656, + "eval_rewards/chosen": -0.34434974193573, + "eval_rewards/margins": 0.22408555448055267, + "eval_rewards/rejected": -0.5684353113174438, + "eval_runtime": 248.2839, + "eval_samples_per_second": 8.055, "eval_steps_per_second": 0.254, - "eval_use_label": 14683.5078125, + "eval_use_label": 13331.904296875, "step": 800 }, { "epoch": 0.85, - "grad_norm": 3.0, + "grad_norm": 1.3828125, "learning_rate": 3.4336633249862084e-07, - "logits/chosen": 0.6561521291732788, - "logits/rejected": 0.6886910200119019, - "logps/chosen": -128.8015899658203, - "logps/rejected": -164.35073852539062, - "loss": 0.678, - "pred_label": 1961.699951171875, - "rewards/accuracies": 0.3499999940395355, - "rewards/chosen": -0.5894675254821777, - "rewards/margins": 0.26825448870658875, - "rewards/rejected": -0.8577221035957336, + "logits/chosen": -0.6630854606628418, + "logits/rejected": -0.6445407867431641, + "logps/chosen": -108.18148040771484, + "logps/rejected": -135.99142456054688, + "loss": 0.6901, + "pred_label": 3350.35009765625, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.3832666873931885, + "rewards/margins": 0.1908622682094574, + "rewards/rejected": -0.5741289258003235, "step": 810, - "use_label": 14952.2998046875 + "use_label": 13563.650390625 }, { "epoch": 0.86, - "grad_norm": 2.203125, + "grad_norm": 1.3359375, "learning_rate": 2.98573068519539e-07, - "logits/chosen": 0.7671118974685669, - "logits/rejected": 0.7413855791091919, - "logps/chosen": -113.36724853515625, - "logps/rejected": -125.75687408447266, - "loss": 0.6785, - "pred_label": 1987.5, - "rewards/accuracies": 0.2874999940395355, - "rewards/chosen": -0.533771276473999, - "rewards/margins": 0.18785560131072998, - "rewards/rejected": -0.721626877784729, + "logits/chosen": -0.6042599081993103, + "logits/rejected": -0.6371781826019287, + "logps/chosen": -94.31297302246094, + "logps/rejected": -101.22802734375, + "loss": 0.689, + "pred_label": 3393.47509765625, + "rewards/accuracies": 0.29374998807907104, + "rewards/chosen": -0.3432285487651825, + "rewards/margins": 0.13310988247394562, + "rewards/rejected": -0.4763384461402893, "step": 820, - "use_label": 15086.5 + "use_label": 13680.525390625 }, { "epoch": 0.87, - "grad_norm": 1.7734375, + "grad_norm": 1.484375, "learning_rate": 2.5672401793681854e-07, - "logits/chosen": 0.9424182176589966, - "logits/rejected": 1.0951740741729736, - "logps/chosen": -104.4168472290039, - "logps/rejected": -135.86708068847656, - "loss": 0.6757, - "pred_label": 2014.300048828125, - "rewards/accuracies": 0.3375000059604645, - "rewards/chosen": -0.4637536108493805, - "rewards/margins": 0.3284408450126648, - "rewards/rejected": -0.7921944260597229, + "logits/chosen": -0.5476540923118591, + "logits/rejected": -0.43125781416893005, + "logps/chosen": -86.91058349609375, + "logps/rejected": -110.5887222290039, + "loss": 0.6923, + "pred_label": 3435.074951171875, + "rewards/accuracies": 0.36250001192092896, + "rewards/chosen": -0.2886909246444702, + "rewards/margins": 0.25071993470191956, + "rewards/rejected": -0.5394108295440674, "step": 830, - "use_label": 15219.7001953125 + "use_label": 13798.9248046875 }, { "epoch": 0.88, - "grad_norm": 3.546875, + "grad_norm": 1.9296875, "learning_rate": 2.178751501463036e-07, - "logits/chosen": 0.880671501159668, - "logits/rejected": 0.7017362117767334, - "logps/chosen": -107.72086334228516, - "logps/rejected": -113.772216796875, - "loss": 0.6804, - "pred_label": 2034.0250244140625, - "rewards/accuracies": 0.23749999701976776, - "rewards/chosen": -0.48421382904052734, - "rewards/margins": 0.11718887090682983, - "rewards/rejected": -0.6014026999473572, + "logits/chosen": -0.5565081834793091, + "logits/rejected": -0.6612057685852051, + "logps/chosen": -89.98490142822266, + "logps/rejected": -93.48139953613281, + "loss": 0.6915, + "pred_label": 3471.35009765625, + "rewards/accuracies": 0.24375000596046448, + "rewards/chosen": -0.306854248046875, + "rewards/margins": 0.09164027869701385, + "rewards/rejected": -0.39849454164505005, "step": 840, - "use_label": 15359.974609375 + "use_label": 13922.650390625 }, { "epoch": 0.89, - "grad_norm": 2.1875, + "grad_norm": 1.359375, "learning_rate": 1.820784220652766e-07, - "logits/chosen": 0.7102145552635193, - "logits/rejected": 0.6271827816963196, - "logps/chosen": -143.84104919433594, - "logps/rejected": -181.9434814453125, - "loss": 0.661, - "pred_label": 2054.27490234375, - "rewards/accuracies": 0.4124999940395355, - "rewards/chosen": -0.5962380170822144, - "rewards/margins": 0.3919173777103424, - "rewards/rejected": -0.9881553649902344, + "logits/chosen": -0.6778563261032104, + "logits/rejected": -0.73534095287323, + "logps/chosen": -120.2663345336914, + "logps/rejected": -149.02294921875, + "loss": 0.6854, + "pred_label": 3509.0, + "rewards/accuracies": 0.41874998807907104, + "rewards/chosen": -0.36049091815948486, + "rewards/margins": 0.2984590530395508, + "rewards/rejected": -0.6589499711990356, "step": 850, - "use_label": 15499.724609375 + "use_label": 14045.0 }, { "epoch": 0.9, - "grad_norm": 2.03125, + "grad_norm": 1.796875, "learning_rate": 1.4938170864468636e-07, - "logits/chosen": 0.856406569480896, - "logits/rejected": 0.9847167730331421, - "logps/chosen": -134.33340454101562, - "logps/rejected": -162.52786254882812, - "loss": 0.6682, - "pred_label": 2086.074951171875, - "rewards/accuracies": 0.42500001192092896, - "rewards/chosen": -0.5313155651092529, - "rewards/margins": 0.33722516894340515, - "rewards/rejected": -0.8685407638549805, + "logits/chosen": -0.5929479002952576, + "logits/rejected": -0.48117414116859436, + "logps/chosen": -115.10990142822266, + "logps/rejected": -133.1912841796875, + "loss": 0.6892, + "pred_label": 3556.324951171875, + "rewards/accuracies": 0.3812499940395355, + "rewards/chosen": -0.33908045291900635, + "rewards/margins": 0.23609444499015808, + "rewards/rejected": -0.5751749277114868, "step": 860, - "use_label": 15627.9248046875 + "use_label": 14157.6748046875 }, { "epoch": 0.91, - "grad_norm": 2.453125, + "grad_norm": 1.7578125, "learning_rate": 1.1982873884064466e-07, - "logits/chosen": 0.7517425417900085, - "logits/rejected": 0.7345870137214661, - "logps/chosen": -139.02523803710938, - "logps/rejected": -179.33541870117188, - "loss": 0.681, - "pred_label": 2114.550048828125, - "rewards/accuracies": 0.38749998807907104, - "rewards/chosen": -0.5771310925483704, - "rewards/margins": 0.393027126789093, - "rewards/rejected": -0.9701582193374634, + "logits/chosen": -0.6633087992668152, + "logits/rejected": -0.6678288578987122, + "logps/chosen": -117.92154693603516, + "logps/rejected": -145.3701171875, + "loss": 0.6893, + "pred_label": 3603.75, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3660942316055298, + "rewards/margins": 0.2644110918045044, + "rewards/rejected": -0.6305053234100342, "step": 870, - "use_label": 15759.4501953125 + "use_label": 14270.25 }, { "epoch": 0.92, - "grad_norm": 1.3359375, + "grad_norm": 0.87890625, "learning_rate": 9.345903713082305e-08, - "logits/chosen": 0.8886432647705078, - "logits/rejected": 0.9018303751945496, - "logps/chosen": -118.17820739746094, - "logps/rejected": -171.14111328125, - "loss": 0.6788, - "pred_label": 2148.22509765625, - "rewards/accuracies": 0.4000000059604645, - "rewards/chosen": -0.5542714595794678, - "rewards/margins": 0.41032201051712036, - "rewards/rejected": -0.9645935297012329, + "logits/chosen": -0.5895944237709045, + "logits/rejected": -0.5510295629501343, + "logps/chosen": -96.94719696044922, + "logps/rejected": -141.16554260253906, + "loss": 0.6891, + "pred_label": 3651.0, + "rewards/accuracies": 0.4312500059604645, + "rewards/chosen": -0.3419613242149353, + "rewards/margins": 0.32287630438804626, + "rewards/rejected": -0.6648377180099487, "step": 880, - "use_label": 15885.775390625 + "use_label": 14383.0 }, { "epoch": 0.93, - "grad_norm": 3.03125, + "grad_norm": 1.6484375, "learning_rate": 7.030787065396866e-08, - "logits/chosen": 0.9458627700805664, - "logits/rejected": 0.860288143157959, - "logps/chosen": -113.8023452758789, - "logps/rejected": -145.21006774902344, - "loss": 0.68, - "pred_label": 2174.175048828125, - "rewards/accuracies": 0.29374998807907104, - "rewards/chosen": -0.4997434616088867, - "rewards/margins": 0.2181231528520584, - "rewards/rejected": -0.7178665399551392, + "logits/chosen": -0.5159703493118286, + "logits/rejected": -0.5519541501998901, + "logps/chosen": -96.9026107788086, + "logps/rejected": -120.7626724243164, + "loss": 0.693, + "pred_label": 3690.675048828125, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.3307461142539978, + "rewards/margins": 0.1426464170217514, + "rewards/rejected": -0.4733925461769104, "step": 890, - "use_label": 16019.8251953125 + "use_label": 14503.3251953125 }, { "epoch": 0.94, - "grad_norm": 2.265625, + "grad_norm": 1.9609375, "learning_rate": 5.0406202043228604e-08, - "logits/chosen": 1.2760592699050903, - "logits/rejected": 1.0304285287857056, - "logps/chosen": -123.82283020019531, - "logps/rejected": -180.8885955810547, - "loss": 0.6751, - "pred_label": 2197.10009765625, + "logits/chosen": -0.2721698582172394, + "logits/rejected": -0.407818466424942, + "logps/chosen": -104.2662582397461, + "logps/rejected": -149.70314025878906, + "loss": 0.689, + "pred_label": 3732.824951171875, "rewards/accuracies": 0.39375001192092896, - "rewards/chosen": -0.5441134572029114, - "rewards/margins": 0.37965571880340576, - "rewards/rejected": -0.9237691164016724, + "rewards/chosen": -0.3485477864742279, + "rewards/margins": 0.2633667290210724, + "rewards/rejected": -0.6119145154953003, "step": 900, - "use_label": 16156.900390625 + "use_label": 14621.1748046875 }, { "epoch": 0.94, - "eval_logits/chosen": 1.3693251609802246, - "eval_logits/rejected": 1.4904797077178955, - "eval_logps/chosen": -123.65629577636719, - "eval_logps/rejected": -161.58062744140625, - "eval_loss": 0.678970456123352, - "eval_pred_label": 2251.015869140625, - "eval_rewards/accuracies": 0.3571428656578064, - "eval_rewards/chosen": -0.547557532787323, - "eval_rewards/margins": 0.31426796317100525, - "eval_rewards/rejected": -0.8618254065513611, - "eval_runtime": 247.8741, - "eval_samples_per_second": 8.069, + "eval_logits/chosen": -0.2437347173690796, + "eval_logits/rejected": -0.13671822845935822, + "eval_logps/chosen": -103.0300521850586, + "eval_logps/rejected": -131.91110229492188, + "eval_loss": 0.6907457709312439, + "eval_pred_label": 3821.52392578125, + "eval_rewards/accuracies": 0.363095223903656, + "eval_rewards/chosen": -0.3412950336933136, + "eval_rewards/margins": 0.22383520007133484, + "eval_rewards/rejected": -0.5651301741600037, + "eval_runtime": 248.2504, + "eval_samples_per_second": 8.056, "eval_steps_per_second": 0.254, - "eval_use_label": 16436.984375, + "eval_use_label": 14866.4765625, "step": 900 }, { "epoch": 0.95, - "grad_norm": 1.8984375, + "grad_norm": 1.171875, "learning_rate": 3.378064801637687e-08, - "logits/chosen": 0.8874324560165405, - "logits/rejected": 0.9277682304382324, - "logps/chosen": -107.72428131103516, - "logps/rejected": -141.05999755859375, - "loss": 0.6748, - "pred_label": 2314.64990234375, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.47065839171409607, - "rewards/margins": 0.3038247525691986, - "rewards/rejected": -0.7744830846786499, + "logits/chosen": -0.5370496511459351, + "logits/rejected": -0.5028234720230103, + "logps/chosen": -89.67744445800781, + "logps/rejected": -113.96895599365234, + "loss": 0.6882, + "pred_label": 3916.52490234375, + "rewards/accuracies": 0.3187499940395355, + "rewards/chosen": -0.2901899218559265, + "rewards/margins": 0.2133828103542328, + "rewards/rejected": -0.5035727024078369, "step": 910, - "use_label": 16703.349609375 + "use_label": 15101.474609375 }, { "epoch": 0.96, - "grad_norm": 2.359375, + "grad_norm": 1.3125, "learning_rate": 2.0453443778310766e-08, - "logits/chosen": 1.0454901456832886, - "logits/rejected": 1.0777199268341064, - "logps/chosen": -97.26091003417969, - "logps/rejected": -147.07302856445312, - "loss": 0.6799, - "pred_label": 2345.5, - "rewards/accuracies": 0.3187499940395355, - "rewards/chosen": -0.4330506920814514, - "rewards/margins": 0.3231905996799469, - "rewards/rejected": -0.7562412023544312, + "logits/chosen": -0.43033066391944885, + "logits/rejected": -0.4173038899898529, + "logps/chosen": -80.09765625, + "logps/rejected": -120.93513488769531, + "loss": 0.6934, + "pred_label": 3958.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.26141807436943054, + "rewards/margins": 0.23344416916370392, + "rewards/rejected": -0.49486222863197327, "step": 920, - "use_label": 16832.5 + "use_label": 15220.0 }, { "epoch": 0.97, - "grad_norm": 2.96875, + "grad_norm": 2.109375, "learning_rate": 1.0442413283435759e-08, - "logits/chosen": 1.0506960153579712, - "logits/rejected": 0.9065178632736206, - "logps/chosen": -111.4305648803711, - "logps/rejected": -146.4462432861328, - "loss": 0.6772, - "pred_label": 2373.125, - "rewards/accuracies": 0.3375000059604645, - "rewards/chosen": -0.4827675223350525, - "rewards/margins": 0.2878049314022064, - "rewards/rejected": -0.7705724835395813, + "logits/chosen": -0.4513850212097168, + "logits/rejected": -0.5099025964736938, + "logps/chosen": -92.44239807128906, + "logps/rejected": -119.61177062988281, + "loss": 0.6878, + "pred_label": 3998.60009765625, + "rewards/accuracies": 0.33125001192092896, + "rewards/chosen": -0.29288578033447266, + "rewards/margins": 0.20934204757213593, + "rewards/rejected": -0.502227783203125, "step": 930, - "use_label": 16964.875 + "use_label": 15339.400390625 }, { "epoch": 0.98, - "grad_norm": 2.515625, + "grad_norm": 1.25, "learning_rate": 3.760945397705828e-09, - "logits/chosen": 1.234220027923584, - "logits/rejected": 0.9605228304862976, - "logps/chosen": -124.61787414550781, - "logps/rejected": -160.95223999023438, - "loss": 0.6706, - "pred_label": 2397.75, - "rewards/accuracies": 0.34375, - "rewards/chosen": -0.5566731691360474, - "rewards/margins": 0.27601632475852966, - "rewards/rejected": -0.8326894640922546, + "logits/chosen": -0.3625331521034241, + "logits/rejected": -0.5358187556266785, + "logps/chosen": -103.41780090332031, + "logps/rejected": -130.23828125, + "loss": 0.691, + "pred_label": 4038.60009765625, + "rewards/accuracies": 0.3187499940395355, + "rewards/chosen": -0.34467238187789917, + "rewards/margins": 0.18087737262248993, + "rewards/rejected": -0.5255497694015503, "step": 940, - "use_label": 17100.25 + "use_label": 15459.400390625 }, { "epoch": 0.99, - "grad_norm": 2.375, + "grad_norm": 1.59375, "learning_rate": 4.1797599220405605e-10, - "logits/chosen": 0.7550326585769653, - "logits/rejected": 0.6674235463142395, - "logps/chosen": -136.30941772460938, - "logps/rejected": -160.4894561767578, - "loss": 0.6729, - "pred_label": 2425.39990234375, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.5969915390014648, - "rewards/margins": 0.21693304181098938, - "rewards/rejected": -0.8139246106147766, + "logits/chosen": -0.674268901348114, + "logits/rejected": -0.7018919587135315, + "logps/chosen": -114.91938781738281, + "logps/rejected": -133.3175506591797, + "loss": 0.6895, + "pred_label": 4082.625, + "rewards/accuracies": 0.33125001192092896, + "rewards/chosen": -0.3830910325050354, + "rewards/margins": 0.1591145098209381, + "rewards/rejected": -0.5422054529190063, "step": 950, - "use_label": 17232.599609375 + "use_label": 15575.375 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, - "train_loss": 0.6760230718482851, - "train_runtime": 20063.9235, - "train_samples_per_second": 3.047, + "train_loss": 0.6880922077838039, + "train_runtime": 20023.3666, + "train_samples_per_second": 3.053, "train_steps_per_second": 0.048 } ],