{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 100, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.59375, "learning_rate": 5.208333333333333e-08, "logits/chosen": -2.1666858196258545, "logits/rejected": -2.182244300842285, "logps/chosen": -12.368609428405762, "logps/rejected": -24.687644958496094, "loss": 0.6931, "pred_label": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1, "use_label": 10.0 }, { "epoch": 0.01, "grad_norm": 0.60546875, "learning_rate": 5.208333333333334e-07, "logits/chosen": -2.2113068103790283, "logits/rejected": -2.2719719409942627, "logps/chosen": -57.57659149169922, "logps/rejected": -65.19544219970703, "loss": 0.693, "pred_label": 0.0, "rewards/accuracies": 0.2152777761220932, "rewards/chosen": 0.001057142741046846, "rewards/margins": 3.17241829179693e-05, "rewards/rejected": 0.001025418401695788, "step": 10, "use_label": 90.0 }, { "epoch": 0.02, "grad_norm": 0.6796875, "learning_rate": 1.0416666666666667e-06, "logits/chosen": -2.243159770965576, "logits/rejected": -2.2802278995513916, "logps/chosen": -56.544715881347656, "logps/rejected": -68.35901641845703, "loss": 0.6924, "pred_label": 0.0, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.006556531880050898, "rewards/margins": 0.001379690133035183, "rewards/rejected": 0.005176841747015715, "step": 20, "use_label": 242.0 }, { "epoch": 0.03, "grad_norm": 0.55078125, "learning_rate": 1.5625e-06, "logits/chosen": -2.2634024620056152, "logits/rejected": -2.2475943565368652, "logps/chosen": -53.98667526245117, "logps/rejected": -67.89213562011719, "loss": 0.692, "pred_label": 0.0, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.01648966409265995, "rewards/margins": 0.002599921775981784, "rewards/rejected": 0.013889740221202374, "step": 30, "use_label": 402.0 }, { "epoch": 0.04, "grad_norm": 0.6328125, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -2.2825467586517334, "logits/rejected": -2.2754693031311035, "logps/chosen": -55.582061767578125, "logps/rejected": -66.59407043457031, "loss": 0.6909, "pred_label": 0.0, "rewards/accuracies": 0.21250000596046448, "rewards/chosen": 0.018406417220830917, "rewards/margins": 0.0006764450808987021, "rewards/rejected": 0.017729971557855606, "step": 40, "use_label": 562.0 }, { "epoch": 0.05, "grad_norm": 0.6015625, "learning_rate": 2.604166666666667e-06, "logits/chosen": -2.3444912433624268, "logits/rejected": -2.3341281414031982, "logps/chosen": -69.13630676269531, "logps/rejected": -84.64376831054688, "loss": 0.6889, "pred_label": 0.0, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": 0.02657836303114891, "rewards/margins": 0.005359734408557415, "rewards/rejected": 0.021218623965978622, "step": 50, "use_label": 722.0 }, { "epoch": 0.06, "grad_norm": 0.72265625, "learning_rate": 3.125e-06, "logits/chosen": -2.3026936054229736, "logits/rejected": -2.309264659881592, "logps/chosen": -82.00704193115234, "logps/rejected": -90.7305908203125, "loss": 0.6874, "pred_label": 0.0, "rewards/accuracies": 0.34375, "rewards/chosen": 0.03688042238354683, "rewards/margins": 0.014220851473510265, "rewards/rejected": 0.02265957184135914, "step": 60, "use_label": 882.0 }, { "epoch": 0.07, "grad_norm": 0.79296875, "learning_rate": 3.6458333333333333e-06, "logits/chosen": -2.344853401184082, "logits/rejected": -2.3261306285858154, "logps/chosen": -77.20336151123047, "logps/rejected": -77.6347885131836, "loss": 0.6851, "pred_label": 0.0, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.02531364932656288, "rewards/margins": 0.01608472317457199, "rewards/rejected": 0.009228924289345741, "step": 70, "use_label": 1042.0 }, { "epoch": 0.08, "grad_norm": 0.80078125, "learning_rate": 4.166666666666667e-06, "logits/chosen": -2.241945743560791, "logits/rejected": -2.195178985595703, "logps/chosen": -81.6376953125, "logps/rejected": -89.05104064941406, "loss": 0.6814, "pred_label": 0.9750000238418579, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.004142354242503643, "rewards/margins": 0.025017932057380676, "rewards/rejected": -0.02087557688355446, "step": 80, "use_label": 1201.0250244140625 }, { "epoch": 0.09, "grad_norm": 1.578125, "learning_rate": 4.6875000000000004e-06, "logits/chosen": -2.1907405853271484, "logits/rejected": -2.232959270477295, "logps/chosen": -62.31688690185547, "logps/rejected": -80.38573455810547, "loss": 0.6812, "pred_label": 3.0999999046325684, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.012271342799067497, "rewards/margins": 0.04507603123784065, "rewards/rejected": -0.0573473684489727, "step": 90, "use_label": 1358.9000244140625 }, { "epoch": 0.1, "grad_norm": 0.796875, "learning_rate": 4.9997324926814375e-06, "logits/chosen": -2.132638454437256, "logits/rejected": -2.0995519161224365, "logps/chosen": -76.97563171386719, "logps/rejected": -79.27615356445312, "loss": 0.6818, "pred_label": 7.150000095367432, "rewards/accuracies": 0.3125, "rewards/chosen": -0.02400936186313629, "rewards/margins": 0.05036945268511772, "rewards/rejected": -0.07437881827354431, "step": 100, "use_label": 1514.8499755859375 }, { "epoch": 0.1, "eval_logits/chosen": -2.097480297088623, "eval_logits/rejected": -2.0663790702819824, "eval_logps/chosen": -69.46318054199219, "eval_logps/rejected": -80.35824584960938, "eval_loss": 0.6813791394233704, "eval_pred_label": 22.539682388305664, "eval_rewards/accuracies": 0.3392857015132904, "eval_rewards/chosen": -0.005626226309686899, "eval_rewards/margins": 0.04397555813193321, "eval_rewards/rejected": -0.04960178583860397, "eval_runtime": 245.3242, "eval_samples_per_second": 8.152, "eval_steps_per_second": 0.257, "eval_use_label": 1833.4603271484375, "step": 100 }, { "epoch": 0.12, "grad_norm": 1.1171875, "learning_rate": 4.996723692767927e-06, "logits/chosen": -2.114673137664795, "logits/rejected": -2.094468355178833, "logps/chosen": -63.9236946105957, "logps/rejected": -79.44518280029297, "loss": 0.6827, "pred_label": 34.0, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.02154584601521492, "rewards/margins": 0.04528125748038292, "rewards/rejected": -0.06682710349559784, "step": 110, "use_label": 2152.0 }, { "epoch": 0.13, "grad_norm": 1.0390625, "learning_rate": 4.9903757462135984e-06, "logits/chosen": -2.2926628589630127, "logits/rejected": -2.177788257598877, "logps/chosen": -83.48246002197266, "logps/rejected": -97.60291290283203, "loss": 0.683, "pred_label": 44.67499923706055, "rewards/accuracies": 0.3062500059604645, "rewards/chosen": -0.0941522866487503, "rewards/margins": 0.06425690650939941, "rewards/rejected": -0.15840919315814972, "step": 120, "use_label": 2301.324951171875 }, { "epoch": 0.14, "grad_norm": 0.546875, "learning_rate": 4.980697142834315e-06, "logits/chosen": -2.0968613624572754, "logits/rejected": -2.1124091148376465, "logps/chosen": -66.370849609375, "logps/rejected": -77.3319320678711, "loss": 0.6845, "pred_label": 57.57500076293945, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.07896758615970612, "rewards/margins": 0.04609644412994385, "rewards/rejected": -0.12506404519081116, "step": 130, "use_label": 2448.425048828125 }, { "epoch": 0.15, "grad_norm": 0.78515625, "learning_rate": 4.967700826904229e-06, "logits/chosen": -2.1041221618652344, "logits/rejected": -2.138929843902588, "logps/chosen": -68.11909484863281, "logps/rejected": -90.16340637207031, "loss": 0.6868, "pred_label": 73.75, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.08846104890108109, "rewards/margins": 0.0647779330611229, "rewards/rejected": -0.15323898196220398, "step": 140, "use_label": 2592.25 }, { "epoch": 0.16, "grad_norm": 1.1015625, "learning_rate": 4.951404179843963e-06, "logits/chosen": -2.1765952110290527, "logits/rejected": -2.125175714492798, "logps/chosen": -54.37804412841797, "logps/rejected": -58.982269287109375, "loss": 0.6809, "pred_label": 91.3499984741211, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.06883221119642258, "rewards/margins": 0.06803621351718903, "rewards/rejected": -0.136868417263031, "step": 150, "use_label": 2734.64990234375 }, { "epoch": 0.17, "grad_norm": 1.03125, "learning_rate": 4.931828996974498e-06, "logits/chosen": -2.2455694675445557, "logits/rejected": -2.213240623474121, "logps/chosen": -94.4081802368164, "logps/rejected": -107.48802185058594, "loss": 0.6857, "pred_label": 115.55000305175781, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.12804970145225525, "rewards/margins": 0.12874242663383484, "rewards/rejected": -0.2567921280860901, "step": 160, "use_label": 2870.449951171875 }, { "epoch": 0.18, "grad_norm": 1.1875, "learning_rate": 4.909001458367867e-06, "logits/chosen": -2.1201233863830566, "logits/rejected": -2.0822367668151855, "logps/chosen": -75.75311279296875, "logps/rejected": -87.55944061279297, "loss": 0.6869, "pred_label": 141.85000610351562, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.1179669052362442, "rewards/margins": 0.09383226186037064, "rewards/rejected": -0.21179917454719543, "step": 170, "use_label": 3004.14990234375 }, { "epoch": 0.19, "grad_norm": 1.4296875, "learning_rate": 4.882952093833628e-06, "logits/chosen": -2.1013779640197754, "logits/rejected": -2.121537685394287, "logps/chosen": -70.6474838256836, "logps/rejected": -89.79743957519531, "loss": 0.685, "pred_label": 161.3249969482422, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.08145526796579361, "rewards/margins": 0.08172430098056793, "rewards/rejected": -0.16317956149578094, "step": 180, "use_label": 3144.675048828125 }, { "epoch": 0.2, "grad_norm": 0.8515625, "learning_rate": 4.853715742087947e-06, "logits/chosen": -2.1533255577087402, "logits/rejected": -2.104222297668457, "logps/chosen": -87.3572998046875, "logps/rejected": -91.95249938964844, "loss": 0.6862, "pred_label": 181.39999389648438, "rewards/accuracies": 0.375, "rewards/chosen": -0.13474301993846893, "rewards/margins": 0.08988693356513977, "rewards/rejected": -0.2246299535036087, "step": 190, "use_label": 3284.60009765625 }, { "epoch": 0.21, "grad_norm": 0.96875, "learning_rate": 4.821331504159906e-06, "logits/chosen": -2.137516736984253, "logits/rejected": -2.13090443611145, "logps/chosen": -94.10081481933594, "logps/rejected": -95.15316009521484, "loss": 0.6818, "pred_label": 205.875, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.14046669006347656, "rewards/margins": 0.07937734574079514, "rewards/rejected": -0.2198440283536911, "step": 200, "use_label": 3420.125 }, { "epoch": 0.21, "eval_logits/chosen": -2.021465301513672, "eval_logits/rejected": -1.9937611818313599, "eval_logps/chosen": -82.4782485961914, "eval_logps/rejected": -99.20675659179688, "eval_loss": 0.6860649585723877, "eval_pred_label": 258.79364013671875, "eval_rewards/accuracies": 0.3373015820980072, "eval_rewards/chosen": -0.13577698171138763, "eval_rewards/margins": 0.10230996459722519, "eval_rewards/rejected": -0.23808695375919342, "eval_runtime": 245.9338, "eval_samples_per_second": 8.132, "eval_steps_per_second": 0.256, "eval_use_label": 3701.206298828125, "step": 200 }, { "epoch": 0.22, "grad_norm": 1.1484375, "learning_rate": 4.7858426910973435e-06, "logits/chosen": -2.1574149131774902, "logits/rejected": -2.1307334899902344, "logps/chosen": -77.64894104003906, "logps/rejected": -89.26710510253906, "loss": 0.6828, "pred_label": 313.32501220703125, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.09638272225856781, "rewards/margins": 0.12071452289819717, "rewards/rejected": -0.2170972377061844, "step": 210, "use_label": 3976.675048828125 }, { "epoch": 0.23, "grad_norm": 1.40625, "learning_rate": 4.747296766042161e-06, "logits/chosen": -2.1187565326690674, "logits/rejected": -2.102626323699951, "logps/chosen": -90.67762756347656, "logps/rejected": -96.60699462890625, "loss": 0.6884, "pred_label": 343.875, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.1462414264678955, "rewards/margins": 0.12368818372488022, "rewards/rejected": -0.2699296176433563, "step": 220, "use_label": 4106.125 }, { "epoch": 0.24, "grad_norm": 1.1484375, "learning_rate": 4.705745280752586e-06, "logits/chosen": -2.1437509059906006, "logits/rejected": -2.084073781967163, "logps/chosen": -90.86326599121094, "logps/rejected": -96.72235870361328, "loss": 0.6875, "pred_label": 378.6000061035156, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.12124122679233551, "rewards/margins": 0.11637073755264282, "rewards/rejected": -0.23761197924613953, "step": 230, "use_label": 4231.39990234375 }, { "epoch": 0.25, "grad_norm": 0.953125, "learning_rate": 4.661243806657256e-06, "logits/chosen": -2.1431565284729004, "logits/rejected": -2.1365227699279785, "logps/chosen": -71.16796875, "logps/rejected": -91.01861572265625, "loss": 0.6846, "pred_label": 403.125, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.07454425096511841, "rewards/margins": 0.09627760201692581, "rewards/rejected": -0.17082183063030243, "step": 240, "use_label": 4366.875 }, { "epoch": 0.26, "grad_norm": 0.890625, "learning_rate": 4.613851860533367e-06, "logits/chosen": -2.1595332622528076, "logits/rejected": -2.183953285217285, "logps/chosen": -71.86934661865234, "logps/rejected": -80.0597152709961, "loss": 0.6844, "pred_label": 422.25, "rewards/accuracies": 0.3125, "rewards/chosen": -0.06741674989461899, "rewards/margins": 0.08548234403133392, "rewards/rejected": -0.1528991013765335, "step": 250, "use_label": 4507.75 }, { "epoch": 0.27, "grad_norm": 1.0390625, "learning_rate": 4.563632824908252e-06, "logits/chosen": -2.1189560890197754, "logits/rejected": -2.071620464324951, "logps/chosen": -77.1129150390625, "logps/rejected": -101.45845031738281, "loss": 0.6837, "pred_label": 445.79998779296875, "rewards/accuracies": 0.3125, "rewards/chosen": -0.16171860694885254, "rewards/margins": 0.11343212425708771, "rewards/rejected": -0.27515071630477905, "step": 260, "use_label": 4644.2001953125 }, { "epoch": 0.28, "grad_norm": 1.0703125, "learning_rate": 4.510653863290871e-06, "logits/chosen": -2.1512458324432373, "logits/rejected": -2.164412021636963, "logps/chosen": -91.74055480957031, "logps/rejected": -95.13731384277344, "loss": 0.6883, "pred_label": 470.04998779296875, "rewards/accuracies": 0.3125, "rewards/chosen": -0.16311386227607727, "rewards/margins": 0.0933571308851242, "rewards/rejected": -0.2564709782600403, "step": 270, "use_label": 4779.9501953125 }, { "epoch": 0.29, "grad_norm": 0.8828125, "learning_rate": 4.454985830346574e-06, "logits/chosen": -2.0734293460845947, "logits/rejected": -2.1033730506896973, "logps/chosen": -76.7903823852539, "logps/rejected": -86.99803161621094, "loss": 0.6858, "pred_label": 494.9750061035156, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -0.15558014810085297, "rewards/margins": 0.050300367176532745, "rewards/rejected": -0.2058805227279663, "step": 280, "use_label": 4915.02490234375 }, { "epoch": 0.3, "grad_norm": 1.3125, "learning_rate": 4.396703177135262e-06, "logits/chosen": -1.9870249032974243, "logits/rejected": -1.956434965133667, "logps/chosen": -89.98160552978516, "logps/rejected": -99.75212097167969, "loss": 0.6905, "pred_label": 527.0499877929688, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.13706301152706146, "rewards/margins": 0.16557420790195465, "rewards/rejected": -0.3026372492313385, "step": 290, "use_label": 5042.9501953125 }, { "epoch": 0.31, "grad_norm": 1.6015625, "learning_rate": 4.335883851539693e-06, "logits/chosen": -1.9497883319854736, "logits/rejected": -1.964604377746582, "logps/chosen": -68.64933013916016, "logps/rejected": -91.48945617675781, "loss": 0.6848, "pred_label": 561.8499755859375, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.14721202850341797, "rewards/margins": 0.14547064900398254, "rewards/rejected": -0.2926826477050781, "step": 300, "use_label": 5168.14990234375 }, { "epoch": 0.31, "eval_logits/chosen": -1.9156862497329712, "eval_logits/rejected": -1.8827954530715942, "eval_logps/chosen": -89.57630920410156, "eval_logps/rejected": -109.2765884399414, "eval_loss": 0.6877307295799255, "eval_pred_label": 626.1270141601562, "eval_rewards/accuracies": 0.341269850730896, "eval_rewards/chosen": -0.20675767958164215, "eval_rewards/margins": 0.13202756643295288, "eval_rewards/rejected": -0.33878523111343384, "eval_runtime": 246.2269, "eval_samples_per_second": 8.123, "eval_steps_per_second": 0.256, "eval_use_label": 5437.873046875, "step": 300 }, { "epoch": 0.32, "grad_norm": 1.5, "learning_rate": 4.2726091940171055e-06, "logits/chosen": -2.043640613555908, "logits/rejected": -2.01674222946167, "logps/chosen": -72.24534606933594, "logps/rejected": -89.407470703125, "loss": 0.6865, "pred_label": 688.9500122070312, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -0.23255303502082825, "rewards/margins": 0.06651856750249863, "rewards/rejected": -0.29907160997390747, "step": 310, "use_label": 5705.0498046875 }, { "epoch": 0.33, "grad_norm": 1.1796875, "learning_rate": 4.206963828813555e-06, "logits/chosen": -1.9597671031951904, "logits/rejected": -1.9893718957901, "logps/chosen": -94.37977600097656, "logps/rejected": -118.25643157958984, "loss": 0.6871, "pred_label": 724.375, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.20438706874847412, "rewards/margins": 0.13566336035728455, "rewards/rejected": -0.34005045890808105, "step": 320, "use_label": 5829.625 }, { "epoch": 0.35, "grad_norm": 0.95703125, "learning_rate": 4.139035550786495e-06, "logits/chosen": -1.989506483078003, "logits/rejected": -1.9580066204071045, "logps/chosen": -73.50363159179688, "logps/rejected": -87.75289154052734, "loss": 0.683, "pred_label": 754.4500122070312, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.1003209576010704, "rewards/margins": 0.13466720283031464, "rewards/rejected": -0.23498816788196564, "step": 330, "use_label": 5959.5498046875 }, { "epoch": 0.36, "grad_norm": 1.0234375, "learning_rate": 4.068915207986931e-06, "logits/chosen": -2.0428695678710938, "logits/rejected": -2.016120195388794, "logps/chosen": -74.91081237792969, "logps/rejected": -93.89201354980469, "loss": 0.6894, "pred_label": 786.4749755859375, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.11903776973485947, "rewards/margins": 0.11223740875720978, "rewards/rejected": -0.23127520084381104, "step": 340, "use_label": 6087.52490234375 }, { "epoch": 0.37, "grad_norm": 0.984375, "learning_rate": 3.996696580158211e-06, "logits/chosen": -2.0441341400146484, "logits/rejected": -2.0229620933532715, "logps/chosen": -73.9575424194336, "logps/rejected": -86.34129333496094, "loss": 0.6869, "pred_label": 817.5250244140625, "rewards/accuracies": 0.3125, "rewards/chosen": -0.133123978972435, "rewards/margins": 0.08419892936944962, "rewards/rejected": -0.2173229157924652, "step": 350, "use_label": 6216.47509765625 }, { "epoch": 0.38, "grad_norm": 1.546875, "learning_rate": 3.922476253313921e-06, "logits/chosen": -2.0575146675109863, "logits/rejected": -2.054591417312622, "logps/chosen": -82.88232421875, "logps/rejected": -90.05668640136719, "loss": 0.6863, "pred_label": 848.6500244140625, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.13817565143108368, "rewards/margins": 0.11208128929138184, "rewards/rejected": -0.2502569556236267, "step": 360, "use_label": 6345.35009765625 }, { "epoch": 0.39, "grad_norm": 0.75, "learning_rate": 3.846353490562664e-06, "logits/chosen": -2.076312780380249, "logits/rejected": -1.9995708465576172, "logps/chosen": -85.83981323242188, "logps/rejected": -95.1656723022461, "loss": 0.6844, "pred_label": 880.4249877929688, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.11745607852935791, "rewards/margins": 0.14055705070495605, "rewards/rejected": -0.2580130994319916, "step": 370, "use_label": 6473.5751953125 }, { "epoch": 0.4, "grad_norm": 0.96484375, "learning_rate": 3.768430099352445e-06, "logits/chosen": -2.0079166889190674, "logits/rejected": -1.986297845840454, "logps/chosen": -76.30638122558594, "logps/rejected": -93.93800354003906, "loss": 0.6924, "pred_label": 912.5999755859375, "rewards/accuracies": 0.3062500059604645, "rewards/chosen": -0.1675274670124054, "rewards/margins": 0.08305275440216064, "rewards/rejected": -0.25058022141456604, "step": 380, "use_label": 6601.39990234375 }, { "epoch": 0.41, "grad_norm": 0.97265625, "learning_rate": 3.6888102953122307e-06, "logits/chosen": -1.9291635751724243, "logits/rejected": -1.914608359336853, "logps/chosen": -101.44157409667969, "logps/rejected": -96.10136413574219, "loss": 0.6878, "pred_label": 952.8250122070312, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": -0.1657881736755371, "rewards/margins": 0.12364902347326279, "rewards/rejected": -0.2894372344017029, "step": 390, "use_label": 6721.1748046875 }, { "epoch": 0.42, "grad_norm": 1.296875, "learning_rate": 3.607600562872785e-06, "logits/chosen": -1.8988447189331055, "logits/rejected": -1.8926557302474976, "logps/chosen": -87.97608947753906, "logps/rejected": -108.15446472167969, "loss": 0.6857, "pred_label": 987.5999755859375, "rewards/accuracies": 0.34375, "rewards/chosen": -0.16945099830627441, "rewards/margins": 0.11657001823186874, "rewards/rejected": -0.28602102398872375, "step": 400, "use_label": 6846.39990234375 }, { "epoch": 0.42, "eval_logits/chosen": -1.4529144763946533, "eval_logits/rejected": -1.4031411409378052, "eval_logps/chosen": -86.92367553710938, "eval_logps/rejected": -108.39134979248047, "eval_loss": 0.6884719133377075, "eval_pred_label": 1055.5555419921875, "eval_rewards/accuracies": 0.3531745970249176, "eval_rewards/chosen": -0.18023118376731873, "eval_rewards/margins": 0.14970164000988007, "eval_rewards/rejected": -0.32993283867836, "eval_runtime": 246.35, "eval_samples_per_second": 8.119, "eval_steps_per_second": 0.256, "eval_use_label": 7112.4443359375, "step": 400 }, { "epoch": 0.43, "grad_norm": 1.28125, "learning_rate": 3.5249095128531863e-06, "logits/chosen": -1.289879560470581, "logits/rejected": -1.4085474014282227, "logps/chosen": -85.75054168701172, "logps/rejected": -96.24283599853516, "loss": 0.6874, "pred_label": 1135.699951171875, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.21242520213127136, "rewards/margins": 0.17107079923152924, "rewards/rejected": -0.3834960162639618, "step": 410, "use_label": 7362.2998046875 }, { "epoch": 0.44, "grad_norm": 0.97265625, "learning_rate": 3.4408477372034743e-06, "logits/chosen": -1.2336995601654053, "logits/rejected": -1.1623611450195312, "logps/chosen": -97.20266723632812, "logps/rejected": -117.6893081665039, "loss": 0.6882, "pred_label": 1171.425048828125, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.3355943560600281, "rewards/margins": 0.16045086085796356, "rewards/rejected": -0.49604520201683044, "step": 420, "use_label": 7486.5751953125 }, { "epoch": 0.45, "grad_norm": 1.1484375, "learning_rate": 3.355527661097728e-06, "logits/chosen": -1.3129976987838745, "logits/rejected": -1.2275488376617432, "logps/chosen": -106.88911437988281, "logps/rejected": -112.3751449584961, "loss": 0.6918, "pred_label": 1207.9749755859375, "rewards/accuracies": 0.3125, "rewards/chosen": -0.3042059540748596, "rewards/margins": 0.13597823679447174, "rewards/rejected": -0.44018417596817017, "step": 430, "use_label": 7610.02490234375 }, { "epoch": 0.46, "grad_norm": 1.5625, "learning_rate": 3.269063392575352e-06, "logits/chosen": -1.3159044981002808, "logits/rejected": -1.413769006729126, "logps/chosen": -90.12797546386719, "logps/rejected": -101.85379028320312, "loss": 0.6858, "pred_label": 1242.5, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.22682049870491028, "rewards/margins": 0.159098818898201, "rewards/rejected": -0.3859192728996277, "step": 440, "use_label": 7735.5 }, { "epoch": 0.47, "grad_norm": 1.375, "learning_rate": 3.181570569931697e-06, "logits/chosen": -1.4389588832855225, "logits/rejected": -1.5265202522277832, "logps/chosen": -96.37947845458984, "logps/rejected": -113.1718521118164, "loss": 0.6951, "pred_label": 1281.3499755859375, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.2355901300907135, "rewards/margins": 0.13590970635414124, "rewards/rejected": -0.37149983644485474, "step": 450, "use_label": 7856.64990234375 }, { "epoch": 0.48, "grad_norm": 1.015625, "learning_rate": 3.09316620706208e-06, "logits/chosen": -1.2455997467041016, "logits/rejected": -1.1902601718902588, "logps/chosen": -72.07853698730469, "logps/rejected": -84.86478424072266, "loss": 0.6842, "pred_label": 1311.824951171875, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -0.1508016437292099, "rewards/margins": 0.1797787994146347, "rewards/rejected": -0.330580472946167, "step": 460, "use_label": 7986.1748046875 }, { "epoch": 0.49, "grad_norm": 1.1015625, "learning_rate": 3.0039685369660785e-06, "logits/chosen": -1.175449252128601, "logits/rejected": -1.0759943723678589, "logps/chosen": -88.91249084472656, "logps/rejected": -110.02799987792969, "loss": 0.6873, "pred_label": 1345.1500244140625, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.22000393271446228, "rewards/margins": 0.1964809000492096, "rewards/rejected": -0.4164848327636719, "step": 470, "use_label": 8112.85009765625 }, { "epoch": 0.5, "grad_norm": 1.0859375, "learning_rate": 2.91409685362137e-06, "logits/chosen": -1.0014227628707886, "logits/rejected": -1.0880533456802368, "logps/chosen": -99.41879272460938, "logps/rejected": -120.02769470214844, "loss": 0.6868, "pred_label": 1391.25, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.24276605248451233, "rewards/margins": 0.17868337035179138, "rewards/rejected": -0.4214494228363037, "step": 480, "use_label": 8226.75 }, { "epoch": 0.51, "grad_norm": 1.4375, "learning_rate": 2.8236713524386085e-06, "logits/chosen": -1.0729541778564453, "logits/rejected": -0.9298813939094543, "logps/chosen": -88.73147583007812, "logps/rejected": -94.53245544433594, "loss": 0.6921, "pred_label": 1428.9000244140625, "rewards/accuracies": 0.26875001192092896, "rewards/chosen": -0.22107498347759247, "rewards/margins": 0.12524999678134918, "rewards/rejected": -0.34632498025894165, "step": 490, "use_label": 8349.099609375 }, { "epoch": 0.52, "grad_norm": 1.421875, "learning_rate": 2.7328129695107205e-06, "logits/chosen": -0.8902079463005066, "logits/rejected": -1.065393090248108, "logps/chosen": -113.58573150634766, "logps/rejected": -131.9083709716797, "loss": 0.6894, "pred_label": 1462.4000244140625, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.37447452545166016, "rewards/margins": 0.17800332605838776, "rewards/rejected": -0.5524778962135315, "step": 500, "use_label": 8475.599609375 }, { "epoch": 0.52, "eval_logits/chosen": -0.6888664960861206, "eval_logits/rejected": -0.5997034311294556, "eval_logps/chosen": -97.52025604248047, "eval_logps/rejected": -120.9921646118164, "eval_loss": 0.6891720294952393, "eval_pred_label": 1530.5714111328125, "eval_rewards/accuracies": 0.3551587164402008, "eval_rewards/chosen": -0.28619715571403503, "eval_rewards/margins": 0.1697438359260559, "eval_rewards/rejected": -0.45594096183776855, "eval_runtime": 246.2759, "eval_samples_per_second": 8.121, "eval_steps_per_second": 0.256, "eval_use_label": 8741.4287109375, "step": 500 }, { "epoch": 0.53, "grad_norm": 1.0078125, "learning_rate": 2.641643219871597e-06, "logits/chosen": -0.7708507776260376, "logits/rejected": -0.882653534412384, "logps/chosen": -90.50456237792969, "logps/rejected": -116.84162902832031, "loss": 0.686, "pred_label": 1610.5999755859375, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.2625977396965027, "rewards/margins": 0.20036396384239197, "rewards/rejected": -0.4629616141319275, "step": 510, "use_label": 8991.400390625 }, { "epoch": 0.54, "grad_norm": 1.4765625, "learning_rate": 2.5502840349805074e-06, "logits/chosen": -0.8800374865531921, "logits/rejected": -1.038163185119629, "logps/chosen": -100.99266052246094, "logps/rejected": -116.75798034667969, "loss": 0.6895, "pred_label": 1653.0, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.2859944701194763, "rewards/margins": 0.15662841498851776, "rewards/rejected": -0.4426229000091553, "step": 520, "use_label": 9109.0 }, { "epoch": 0.55, "grad_norm": 1.3671875, "learning_rate": 2.4588575996495797e-06, "logits/chosen": -0.8304817080497742, "logits/rejected": -0.7847825288772583, "logps/chosen": -105.92545318603516, "logps/rejected": -117.15931701660156, "loss": 0.6895, "pred_label": 1692.175048828125, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.316447913646698, "rewards/margins": 0.17969803512096405, "rewards/rejected": -0.49614596366882324, "step": 530, "use_label": 9229.8251953125 }, { "epoch": 0.57, "grad_norm": 2.03125, "learning_rate": 2.367486188632446e-06, "logits/chosen": -0.67156982421875, "logits/rejected": -0.8070074319839478, "logps/chosen": -112.666748046875, "logps/rejected": -131.92593383789062, "loss": 0.6896, "pred_label": 1734.375, "rewards/accuracies": 0.375, "rewards/chosen": -0.35928016901016235, "rewards/margins": 0.22706659138202667, "rewards/rejected": -0.5863467454910278, "step": 540, "use_label": 9347.625 }, { "epoch": 0.58, "grad_norm": 1.796875, "learning_rate": 2.276292003092593e-06, "logits/chosen": -0.7944391369819641, "logits/rejected": -0.7596977353096008, "logps/chosen": -107.38740539550781, "logps/rejected": -111.28292083740234, "loss": 0.6887, "pred_label": 1775.7249755859375, "rewards/accuracies": 0.3125, "rewards/chosen": -0.3932684063911438, "rewards/margins": 0.12325477600097656, "rewards/rejected": -0.5165232419967651, "step": 550, "use_label": 9466.275390625 }, { "epoch": 0.59, "grad_norm": 1.3515625, "learning_rate": 2.1853970071701415e-06, "logits/chosen": -0.7152852416038513, "logits/rejected": -0.7174454927444458, "logps/chosen": -104.6649398803711, "logps/rejected": -117.61528015136719, "loss": 0.6901, "pred_label": 1814.375, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.3510952889919281, "rewards/margins": 0.15508435666561127, "rewards/rejected": -0.5061796307563782, "step": 560, "use_label": 9587.625 }, { "epoch": 0.6, "grad_norm": 2.125, "learning_rate": 2.0949227648656194e-06, "logits/chosen": -0.925454318523407, "logits/rejected": -0.849765956401825, "logps/chosen": -100.53346252441406, "logps/rejected": -131.70309448242188, "loss": 0.6872, "pred_label": 1852.2249755859375, "rewards/accuracies": 0.375, "rewards/chosen": -0.3393338620662689, "rewards/margins": 0.23398590087890625, "rewards/rejected": -0.5733197927474976, "step": 570, "use_label": 9709.775390625 }, { "epoch": 0.61, "grad_norm": 1.15625, "learning_rate": 2.00499027745888e-06, "logits/chosen": -0.7680953145027161, "logits/rejected": -0.8566532135009766, "logps/chosen": -111.98583984375, "logps/rejected": -131.1743927001953, "loss": 0.6879, "pred_label": 1893.7750244140625, "rewards/accuracies": 0.34375, "rewards/chosen": -0.37074294686317444, "rewards/margins": 0.1566895693540573, "rewards/rejected": -0.5274325013160706, "step": 580, "use_label": 9828.224609375 }, { "epoch": 0.62, "grad_norm": 1.1171875, "learning_rate": 1.915719821680624e-06, "logits/chosen": -0.8080962300300598, "logits/rejected": -0.7905328869819641, "logps/chosen": -125.2184066772461, "logps/rejected": -148.79432678222656, "loss": 0.6891, "pred_label": 1939.25, "rewards/accuracies": 0.40625, "rewards/chosen": -0.4552985727787018, "rewards/margins": 0.22290782630443573, "rewards/rejected": -0.6782063245773315, "step": 590, "use_label": 9942.75 }, { "epoch": 0.63, "grad_norm": 1.9609375, "learning_rate": 1.8272307888529276e-06, "logits/chosen": -0.5244548320770264, "logits/rejected": -0.7590290904045105, "logps/chosen": -122.6807632446289, "logps/rejected": -162.36203002929688, "loss": 0.6881, "pred_label": 1992.0, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.48354387283325195, "rewards/margins": 0.23392179608345032, "rewards/rejected": -0.7174656391143799, "step": 600, "use_label": 10050.0 }, { "epoch": 0.63, "eval_logits/chosen": -0.35794487595558167, "eval_logits/rejected": -0.2547617554664612, "eval_logps/chosen": -107.16178131103516, "eval_logps/rejected": -135.9844512939453, "eval_loss": 0.6918326616287231, "eval_pred_label": 2082.3173828125, "eval_rewards/accuracies": 0.3531745970249176, "eval_rewards/chosen": -0.3826123774051666, "eval_rewards/margins": 0.22325147688388824, "eval_rewards/rejected": -0.6058638095855713, "eval_runtime": 248.3104, "eval_samples_per_second": 8.054, "eval_steps_per_second": 0.254, "eval_use_label": 10293.6826171875, "step": 600 }, { "epoch": 0.64, "grad_norm": 1.515625, "learning_rate": 1.739641525213929e-06, "logits/chosen": -0.572044312953949, "logits/rejected": -0.654716432094574, "logps/chosen": -95.46563720703125, "logps/rejected": -132.0639190673828, "loss": 0.6926, "pred_label": 2185.449951171875, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.3655874729156494, "rewards/margins": 0.21378450095653534, "rewards/rejected": -0.579371988773346, "step": 610, "use_label": 10520.5498046875 }, { "epoch": 0.65, "grad_norm": 1.0859375, "learning_rate": 1.6530691736402317e-06, "logits/chosen": -0.7425838708877563, "logits/rejected": -0.7612688541412354, "logps/chosen": -98.45491790771484, "logps/rejected": -139.22779846191406, "loss": 0.6874, "pred_label": 2228.10009765625, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.3674684762954712, "rewards/margins": 0.22383132576942444, "rewards/rejected": -0.591299831867218, "step": 620, "use_label": 10637.900390625 }, { "epoch": 0.66, "grad_norm": 1.34375, "learning_rate": 1.5676295169786864e-06, "logits/chosen": -0.5626051425933838, "logits/rejected": -0.7373117208480835, "logps/chosen": -109.76419830322266, "logps/rejected": -132.89573669433594, "loss": 0.6861, "pred_label": 2268.074951171875, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.3673921525478363, "rewards/margins": 0.2162620723247528, "rewards/rejected": -0.5836542844772339, "step": 630, "use_label": 10757.9248046875 }, { "epoch": 0.67, "grad_norm": 1.2578125, "learning_rate": 1.4834368231970922e-06, "logits/chosen": -0.70842045545578, "logits/rejected": -0.5356844663619995, "logps/chosen": -115.94453430175781, "logps/rejected": -132.53977966308594, "loss": 0.6881, "pred_label": 2312.199951171875, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.4425238072872162, "rewards/margins": 0.23113970458507538, "rewards/rejected": -0.6736636161804199, "step": 640, "use_label": 10873.7998046875 }, { "epoch": 0.68, "grad_norm": 1.5, "learning_rate": 1.4006036925609245e-06, "logits/chosen": -0.7530516386032104, "logits/rejected": -0.39667490124702454, "logps/chosen": -117.97354888916016, "logps/rejected": -148.5204620361328, "loss": 0.6907, "pred_label": 2364.60009765625, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.4478411078453064, "rewards/margins": 0.25875502824783325, "rewards/rejected": -0.7065961956977844, "step": 650, "use_label": 10981.400390625 }, { "epoch": 0.69, "grad_norm": 1.2109375, "learning_rate": 1.3192409070404582e-06, "logits/chosen": -0.4164413511753082, "logits/rejected": -0.5387105345726013, "logps/chosen": -93.08172607421875, "logps/rejected": -106.9631576538086, "loss": 0.6884, "pred_label": 2410.39990234375, "rewards/accuracies": 0.3062500059604645, "rewards/chosen": -0.3495523929595947, "rewards/margins": 0.1542079746723175, "rewards/rejected": -0.5037603378295898, "step": 660, "use_label": 11095.599609375 }, { "epoch": 0.7, "grad_norm": 1.515625, "learning_rate": 1.2394572821496953e-06, "logits/chosen": -0.9564473032951355, "logits/rejected": -1.0122594833374023, "logps/chosen": -100.20994567871094, "logps/rejected": -121.32554626464844, "loss": 0.6935, "pred_label": 2446.14990234375, "rewards/accuracies": 0.34375, "rewards/chosen": -0.3450331687927246, "rewards/margins": 0.19006122648715973, "rewards/rejected": -0.5350943803787231, "step": 670, "use_label": 11219.849609375 }, { "epoch": 0.71, "grad_norm": 1.546875, "learning_rate": 1.1613595214152713e-06, "logits/chosen": -0.588452935218811, "logits/rejected": -0.6323766708374023, "logps/chosen": -125.20991516113281, "logps/rejected": -139.94993591308594, "loss": 0.6902, "pred_label": 2485.10009765625, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.3915707468986511, "rewards/margins": 0.19166378676891327, "rewards/rejected": -0.5832345485687256, "step": 680, "use_label": 11340.900390625 }, { "epoch": 0.72, "grad_norm": 1.578125, "learning_rate": 1.0850520736699362e-06, "logits/chosen": -0.6506579518318176, "logits/rejected": -0.7167869806289673, "logps/chosen": -144.53038024902344, "logps/rejected": -167.38192749023438, "loss": 0.6898, "pred_label": 2534.75, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.42825189232826233, "rewards/margins": 0.28569427132606506, "rewards/rejected": -0.7139460444450378, "step": 690, "use_label": 11451.25 }, { "epoch": 0.73, "grad_norm": 1.59375, "learning_rate": 1.0106369933615043e-06, "logits/chosen": -0.8556931614875793, "logits/rejected": -0.6913198232650757, "logps/chosen": -105.3968505859375, "logps/rejected": -124.95710754394531, "loss": 0.6913, "pred_label": 2580.824951171875, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.39049768447875977, "rewards/margins": 0.17418017983436584, "rewards/rejected": -0.564677894115448, "step": 700, "use_label": 11565.1748046875 }, { "epoch": 0.73, "eval_logits/chosen": -0.3469957709312439, "eval_logits/rejected": -0.24619349837303162, "eval_logps/chosen": -104.32471466064453, "eval_logps/rejected": -133.26370239257812, "eval_loss": 0.6898515224456787, "eval_pred_label": 2673.52392578125, "eval_rewards/accuracies": 0.3670634925365448, "eval_rewards/chosen": -0.35424166917800903, "eval_rewards/margins": 0.22441466152668, "eval_rewards/rejected": -0.5786563754081726, "eval_runtime": 248.2749, "eval_samples_per_second": 8.056, "eval_steps_per_second": 0.254, "eval_use_label": 11806.4765625, "step": 700 }, { "epoch": 0.74, "grad_norm": 1.03125, "learning_rate": 9.382138040640714e-07, "logits/chosen": -0.6519032716751099, "logits/rejected": -0.637380063533783, "logps/chosen": -102.23021697998047, "logps/rejected": -127.60137939453125, "loss": 0.6903, "pred_label": 2771.699951171875, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.3915974497795105, "rewards/margins": 0.21561889350414276, "rewards/rejected": -0.6072162985801697, "step": 710, "use_label": 12038.2998046875 }, { "epoch": 0.75, "grad_norm": 1.609375, "learning_rate": 8.678793653740633e-07, "logits/chosen": -0.6509895324707031, "logits/rejected": -0.6935362815856934, "logps/chosen": -87.30061340332031, "logps/rejected": -114.2796630859375, "loss": 0.6903, "pred_label": 2811.47509765625, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -0.30430155992507935, "rewards/margins": 0.18221500515937805, "rewards/rejected": -0.486516535282135, "step": 720, "use_label": 12158.525390625 }, { "epoch": 0.76, "grad_norm": 2.21875, "learning_rate": 7.997277433690984e-07, "logits/chosen": -0.6035222411155701, "logits/rejected": -0.65208500623703, "logps/chosen": -100.17440032958984, "logps/rejected": -119.87808990478516, "loss": 0.6865, "pred_label": 2850.0, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.2982019782066345, "rewards/margins": 0.2585477828979492, "rewards/rejected": -0.5567497611045837, "step": 730, "use_label": 12280.0 }, { "epoch": 0.77, "grad_norm": 0.80859375, "learning_rate": 7.338500848029603e-07, "logits/chosen": -0.4770827293395996, "logits/rejected": -0.5081530213356018, "logps/chosen": -94.86068725585938, "logps/rejected": -116.67037200927734, "loss": 0.6916, "pred_label": 2886.125, "rewards/accuracies": 0.28125, "rewards/chosen": -0.34235304594039917, "rewards/margins": 0.19017408788204193, "rewards/rejected": -0.5325270891189575, "step": 740, "use_label": 12403.875 }, { "epoch": 0.79, "grad_norm": 1.1015625, "learning_rate": 6.70334495204884e-07, "logits/chosen": -0.5357509851455688, "logits/rejected": -0.594279408454895, "logps/chosen": -119.76139831542969, "logps/rejected": -145.1709747314453, "loss": 0.6905, "pred_label": 2929.22509765625, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.4223107397556305, "rewards/margins": 0.18705633282661438, "rewards/rejected": -0.6093670725822449, "step": 750, "use_label": 12520.775390625 }, { "epoch": 0.8, "grad_norm": 1.1640625, "learning_rate": 6.092659210462232e-07, "logits/chosen": -0.6737512350082397, "logits/rejected": -0.6523575186729431, "logps/chosen": -86.640625, "logps/rejected": -124.01812744140625, "loss": 0.6899, "pred_label": 2976.050048828125, "rewards/accuracies": 0.3125, "rewards/chosen": -0.32672789692878723, "rewards/margins": 0.1930442750453949, "rewards/rejected": -0.5197721719741821, "step": 760, "use_label": 12633.9501953125 }, { "epoch": 0.81, "grad_norm": 1.4375, "learning_rate": 5.507260361320738e-07, "logits/chosen": -0.6238114833831787, "logits/rejected": -0.6686199307441711, "logps/chosen": -127.0525131225586, "logps/rejected": -142.44747924804688, "loss": 0.689, "pred_label": 3021.85009765625, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.43505221605300903, "rewards/margins": 0.25210094451904297, "rewards/rejected": -0.687153160572052, "step": 770, "use_label": 12748.150390625 }, { "epoch": 0.82, "grad_norm": 1.7578125, "learning_rate": 4.947931323697983e-07, "logits/chosen": -0.6369722485542297, "logits/rejected": -0.7722553014755249, "logps/chosen": -112.76126861572266, "logps/rejected": -133.56796264648438, "loss": 0.6915, "pred_label": 3075.72509765625, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.3996170461177826, "rewards/margins": 0.22261002659797668, "rewards/rejected": -0.6222270727157593, "step": 780, "use_label": 12854.275390625 }, { "epoch": 0.83, "grad_norm": 1.421875, "learning_rate": 4.4154201506053985e-07, "logits/chosen": -0.5256940126419067, "logits/rejected": -0.467402845621109, "logps/chosen": -95.73258209228516, "logps/rejected": -103.3360366821289, "loss": 0.6917, "pred_label": 3123.85009765625, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.30898317694664, "rewards/margins": 0.2029590606689453, "rewards/rejected": -0.5119422674179077, "step": 790, "use_label": 12966.150390625 }, { "epoch": 0.84, "grad_norm": 1.359375, "learning_rate": 3.910439028537638e-07, "logits/chosen": -0.6677756905555725, "logits/rejected": -0.607046902179718, "logps/chosen": -92.61612701416016, "logps/rejected": -115.20296478271484, "loss": 0.6893, "pred_label": 3166.449951171875, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.3256850242614746, "rewards/margins": 0.20536477863788605, "rewards/rejected": -0.5310498476028442, "step": 800, "use_label": 13083.5498046875 }, { "epoch": 0.84, "eval_logits/chosen": -0.23666124045848846, "eval_logits/rejected": -0.1293245106935501, "eval_logps/chosen": -103.33552551269531, "eval_logps/rejected": -132.24159240722656, "eval_loss": 0.6903889179229736, "eval_pred_label": 3252.09521484375, "eval_rewards/accuracies": 0.363095223903656, "eval_rewards/chosen": -0.34434974193573, "eval_rewards/margins": 0.22408555448055267, "eval_rewards/rejected": -0.5684353113174438, "eval_runtime": 248.2839, "eval_samples_per_second": 8.055, "eval_steps_per_second": 0.254, "eval_use_label": 13331.904296875, "step": 800 }, { "epoch": 0.85, "grad_norm": 1.3828125, "learning_rate": 3.4336633249862084e-07, "logits/chosen": -0.6630854606628418, "logits/rejected": -0.6445407867431641, "logps/chosen": -108.18148040771484, "logps/rejected": -135.99142456054688, "loss": 0.6901, "pred_label": 3350.35009765625, "rewards/accuracies": 0.34375, "rewards/chosen": -0.3832666873931885, "rewards/margins": 0.1908622682094574, "rewards/rejected": -0.5741289258003235, "step": 810, "use_label": 13563.650390625 }, { "epoch": 0.86, "grad_norm": 1.3359375, "learning_rate": 2.98573068519539e-07, "logits/chosen": -0.6042599081993103, "logits/rejected": -0.6371781826019287, "logps/chosen": -94.31297302246094, "logps/rejected": -101.22802734375, "loss": 0.689, "pred_label": 3393.47509765625, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -0.3432285487651825, "rewards/margins": 0.13310988247394562, "rewards/rejected": -0.4763384461402893, "step": 820, "use_label": 13680.525390625 }, { "epoch": 0.87, "grad_norm": 1.484375, "learning_rate": 2.5672401793681854e-07, "logits/chosen": -0.5476540923118591, "logits/rejected": -0.43125781416893005, "logps/chosen": -86.91058349609375, "logps/rejected": -110.5887222290039, "loss": 0.6923, "pred_label": 3435.074951171875, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.2886909246444702, "rewards/margins": 0.25071993470191956, "rewards/rejected": -0.5394108295440674, "step": 830, "use_label": 13798.9248046875 }, { "epoch": 0.88, "grad_norm": 1.9296875, "learning_rate": 2.178751501463036e-07, "logits/chosen": -0.5565081834793091, "logits/rejected": -0.6612057685852051, "logps/chosen": -89.98490142822266, "logps/rejected": -93.48139953613281, "loss": 0.6915, "pred_label": 3471.35009765625, "rewards/accuracies": 0.24375000596046448, "rewards/chosen": -0.306854248046875, "rewards/margins": 0.09164027869701385, "rewards/rejected": -0.39849454164505005, "step": 840, "use_label": 13922.650390625 }, { "epoch": 0.89, "grad_norm": 1.359375, "learning_rate": 1.820784220652766e-07, "logits/chosen": -0.6778563261032104, "logits/rejected": -0.73534095287323, "logps/chosen": -120.2663345336914, "logps/rejected": -149.02294921875, "loss": 0.6854, "pred_label": 3509.0, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.36049091815948486, "rewards/margins": 0.2984590530395508, "rewards/rejected": -0.6589499711990356, "step": 850, "use_label": 14045.0 }, { "epoch": 0.9, "grad_norm": 1.796875, "learning_rate": 1.4938170864468636e-07, "logits/chosen": -0.5929479002952576, "logits/rejected": -0.48117414116859436, "logps/chosen": -115.10990142822266, "logps/rejected": -133.1912841796875, "loss": 0.6892, "pred_label": 3556.324951171875, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.33908045291900635, "rewards/margins": 0.23609444499015808, "rewards/rejected": -0.5751749277114868, "step": 860, "use_label": 14157.6748046875 }, { "epoch": 0.91, "grad_norm": 1.7578125, "learning_rate": 1.1982873884064466e-07, "logits/chosen": -0.6633087992668152, "logits/rejected": -0.6678288578987122, "logps/chosen": -117.92154693603516, "logps/rejected": -145.3701171875, "loss": 0.6893, "pred_label": 3603.75, "rewards/accuracies": 0.375, "rewards/chosen": -0.3660942316055298, "rewards/margins": 0.2644110918045044, "rewards/rejected": -0.6305053234100342, "step": 870, "use_label": 14270.25 }, { "epoch": 0.92, "grad_norm": 0.87890625, "learning_rate": 9.345903713082305e-08, "logits/chosen": -0.5895944237709045, "logits/rejected": -0.5510295629501343, "logps/chosen": -96.94719696044922, "logps/rejected": -141.16554260253906, "loss": 0.6891, "pred_label": 3651.0, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.3419613242149353, "rewards/margins": 0.32287630438804626, "rewards/rejected": -0.6648377180099487, "step": 880, "use_label": 14383.0 }, { "epoch": 0.93, "grad_norm": 1.6484375, "learning_rate": 7.030787065396866e-08, "logits/chosen": -0.5159703493118286, "logits/rejected": -0.5519541501998901, "logps/chosen": -96.9026107788086, "logps/rejected": -120.7626724243164, "loss": 0.693, "pred_label": 3690.675048828125, "rewards/accuracies": 0.28125, "rewards/chosen": -0.3307461142539978, "rewards/margins": 0.1426464170217514, "rewards/rejected": -0.4733925461769104, "step": 890, "use_label": 14503.3251953125 }, { "epoch": 0.94, "grad_norm": 1.9609375, "learning_rate": 5.0406202043228604e-08, "logits/chosen": -0.2721698582172394, "logits/rejected": -0.407818466424942, "logps/chosen": -104.2662582397461, "logps/rejected": -149.70314025878906, "loss": 0.689, "pred_label": 3732.824951171875, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.3485477864742279, "rewards/margins": 0.2633667290210724, "rewards/rejected": -0.6119145154953003, "step": 900, "use_label": 14621.1748046875 }, { "epoch": 0.94, "eval_logits/chosen": -0.2437347173690796, "eval_logits/rejected": -0.13671822845935822, "eval_logps/chosen": -103.0300521850586, "eval_logps/rejected": -131.91110229492188, "eval_loss": 0.6907457709312439, "eval_pred_label": 3821.52392578125, "eval_rewards/accuracies": 0.363095223903656, "eval_rewards/chosen": -0.3412950336933136, "eval_rewards/margins": 0.22383520007133484, "eval_rewards/rejected": -0.5651301741600037, "eval_runtime": 248.2504, "eval_samples_per_second": 8.056, "eval_steps_per_second": 0.254, "eval_use_label": 14866.4765625, "step": 900 }, { "epoch": 0.95, "grad_norm": 1.171875, "learning_rate": 3.378064801637687e-08, "logits/chosen": -0.5370496511459351, "logits/rejected": -0.5028234720230103, "logps/chosen": -89.67744445800781, "logps/rejected": -113.96895599365234, "loss": 0.6882, "pred_label": 3916.52490234375, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": -0.2901899218559265, "rewards/margins": 0.2133828103542328, "rewards/rejected": -0.5035727024078369, "step": 910, "use_label": 15101.474609375 }, { "epoch": 0.96, "grad_norm": 1.3125, "learning_rate": 2.0453443778310766e-08, "logits/chosen": -0.43033066391944885, "logits/rejected": -0.4173038899898529, "logps/chosen": -80.09765625, "logps/rejected": -120.93513488769531, "loss": 0.6934, "pred_label": 3958.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.26141807436943054, "rewards/margins": 0.23344416916370392, "rewards/rejected": -0.49486222863197327, "step": 920, "use_label": 15220.0 }, { "epoch": 0.97, "grad_norm": 2.109375, "learning_rate": 1.0442413283435759e-08, "logits/chosen": -0.4513850212097168, "logits/rejected": -0.5099025964736938, "logps/chosen": -92.44239807128906, "logps/rejected": -119.61177062988281, "loss": 0.6878, "pred_label": 3998.60009765625, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.29288578033447266, "rewards/margins": 0.20934204757213593, "rewards/rejected": -0.502227783203125, "step": 930, "use_label": 15339.400390625 }, { "epoch": 0.98, "grad_norm": 1.25, "learning_rate": 3.760945397705828e-09, "logits/chosen": -0.3625331521034241, "logits/rejected": -0.5358187556266785, "logps/chosen": -103.41780090332031, "logps/rejected": -130.23828125, "loss": 0.691, "pred_label": 4038.60009765625, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": -0.34467238187789917, "rewards/margins": 0.18087737262248993, "rewards/rejected": -0.5255497694015503, "step": 940, "use_label": 15459.400390625 }, { "epoch": 0.99, "grad_norm": 1.59375, "learning_rate": 4.1797599220405605e-10, "logits/chosen": -0.674268901348114, "logits/rejected": -0.7018919587135315, "logps/chosen": -114.91938781738281, "logps/rejected": -133.3175506591797, "loss": 0.6895, "pred_label": 4082.625, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.3830910325050354, "rewards/margins": 0.1591145098209381, "rewards/rejected": -0.5422054529190063, "step": 950, "use_label": 15575.375 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, "train_loss": 0.6880922077838039, "train_runtime": 20023.3666, "train_samples_per_second": 3.053, "train_steps_per_second": 0.048 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }