{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9981298423724285, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021373230029388193, "grad_norm": 23814117.69119963, "learning_rate": 2.127659574468085e-08, "logits/chosen": -1.1381689310073853, "logits/rejected": -0.9913416504859924, "logps/chosen": -0.2839311361312866, "logps/rejected": -0.29555341601371765, "loss": 305.9593, "rewards/accuracies": 0.625, "rewards/chosen": -0.7098277807235718, "rewards/margins": 0.029055725783109665, "rewards/rejected": -0.7388835549354553, "step": 1 }, { "epoch": 0.010686615014694095, "grad_norm": 1974395.8030804002, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -0.9901005029678345, "logits/rejected": -0.9188694953918457, "logps/chosen": -0.26972177624702454, "logps/rejected": -0.2686304748058319, "loss": 266.3214, "rewards/accuracies": 0.546875, "rewards/chosen": -0.6743044853210449, "rewards/margins": -0.002728263381868601, "rewards/rejected": -0.6715761423110962, "step": 5 }, { "epoch": 0.02137323002938819, "grad_norm": 46220091.26670953, "learning_rate": 2.127659574468085e-07, "logits/chosen": -0.9833618998527527, "logits/rejected": -0.9393731951713562, "logps/chosen": -0.27256160974502563, "logps/rejected": -0.273215115070343, "loss": 185.9952, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.6814040541648865, "rewards/margins": 0.0016337722772732377, "rewards/rejected": -0.6830377578735352, "step": 10 }, { "epoch": 0.03205984504408229, "grad_norm": 474920.2122214309, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.963974118232727, "logits/rejected": -0.9196063876152039, "logps/chosen": -0.29573556780815125, "logps/rejected": -0.28305521607398987, "loss": 125.1317, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.7393389940261841, "rewards/margins": -0.0317009761929512, "rewards/rejected": -0.7076379656791687, "step": 15 }, { "epoch": 0.04274646005877638, "grad_norm": 4515133.468491916, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.976075291633606, "logits/rejected": -0.9759608507156372, "logps/chosen": -0.2616123557090759, "logps/rejected": -0.27002111077308655, "loss": 127.9034, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6540309190750122, "rewards/margins": 0.02102179452776909, "rewards/rejected": -0.6750527620315552, "step": 20 }, { "epoch": 0.053433075073470476, "grad_norm": 5165268.674113416, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.0451716184616089, "logits/rejected": -1.0216295719146729, "logps/chosen": -0.28275421261787415, "logps/rejected": -0.2863079905509949, "loss": 161.215, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.7068854570388794, "rewards/margins": 0.0088844895362854, "rewards/rejected": -0.7157700657844543, "step": 25 }, { "epoch": 0.06411969008816458, "grad_norm": 143372948.8120165, "learning_rate": 6.382978723404255e-07, "logits/chosen": -1.071578025817871, "logits/rejected": -0.9856084585189819, "logps/chosen": -0.2763022780418396, "logps/rejected": -0.2745462656021118, "loss": 388.8185, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.6907557249069214, "rewards/margins": -0.0043900711461901665, "rewards/rejected": -0.6863657236099243, "step": 30 }, { "epoch": 0.07480630510285867, "grad_norm": 1184929.3556330686, "learning_rate": 7.446808510638297e-07, "logits/chosen": -1.01273512840271, "logits/rejected": -0.9335028529167175, "logps/chosen": -0.27808648347854614, "logps/rejected": -0.29893654584884644, "loss": 115.7746, "rewards/accuracies": 0.5625, "rewards/chosen": -0.695216178894043, "rewards/margins": 0.05212521553039551, "rewards/rejected": -0.7473413348197937, "step": 35 }, { "epoch": 0.08549292011755276, "grad_norm": 1877904.3293607633, "learning_rate": 8.51063829787234e-07, "logits/chosen": -0.9277470707893372, "logits/rejected": -0.9166946411132812, "logps/chosen": -0.2787823975086212, "logps/rejected": -0.2824743986129761, "loss": 138.8757, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.6969559788703918, "rewards/margins": 0.009230067022144794, "rewards/rejected": -0.7061859965324402, "step": 40 }, { "epoch": 0.09617953513224686, "grad_norm": 2671724.397751134, "learning_rate": 9.574468085106384e-07, "logits/chosen": -0.9359474182128906, "logits/rejected": -0.8535245060920715, "logps/chosen": -0.33036336302757263, "logps/rejected": -0.33015647530555725, "loss": 104.7933, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.8259084820747375, "rewards/margins": -0.0005173005047254264, "rewards/rejected": -0.8253911733627319, "step": 45 }, { "epoch": 0.10686615014694095, "grad_norm": 79521388.74298675, "learning_rate": 9.998741174712533e-07, "logits/chosen": -0.9259702563285828, "logits/rejected": -0.9349774122238159, "logps/chosen": -0.2925248146057129, "logps/rejected": -0.3076633810997009, "loss": 175.2819, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.731312096118927, "rewards/margins": 0.0378464013338089, "rewards/rejected": -0.7691584825515747, "step": 50 }, { "epoch": 0.11755276516163506, "grad_norm": 327828.43192551495, "learning_rate": 9.991050648838675e-07, "logits/chosen": -0.9278720021247864, "logits/rejected": -0.8686744570732117, "logps/chosen": -0.2634710669517517, "logps/rejected": -0.27794915437698364, "loss": 2695.6354, "rewards/accuracies": 0.625, "rewards/chosen": -0.6586776971817017, "rewards/margins": 0.03619522601366043, "rewards/rejected": -0.6948728561401367, "step": 55 }, { "epoch": 0.12823938017632916, "grad_norm": 899122.206242127, "learning_rate": 9.97637968732563e-07, "logits/chosen": -0.9561047554016113, "logits/rejected": -0.9336016774177551, "logps/chosen": -0.2656118869781494, "logps/rejected": -0.28187674283981323, "loss": 105.9757, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6640297174453735, "rewards/margins": 0.0406620129942894, "rewards/rejected": -0.7046917080879211, "step": 60 }, { "epoch": 0.13892599519102325, "grad_norm": 377242.1086806779, "learning_rate": 9.954748808839674e-07, "logits/chosen": -0.956866443157196, "logits/rejected": -1.005385398864746, "logps/chosen": -0.2731708288192749, "logps/rejected": -0.26419904828071594, "loss": 108.4874, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.682927131652832, "rewards/margins": -0.02242954447865486, "rewards/rejected": -0.6604975461959839, "step": 65 }, { "epoch": 0.14961261020571734, "grad_norm": 42496.77007169402, "learning_rate": 9.926188266120295e-07, "logits/chosen": -0.9903133511543274, "logits/rejected": -0.9588413238525391, "logps/chosen": -0.305401474237442, "logps/rejected": -0.298237681388855, "loss": 405.7784, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.7635036706924438, "rewards/margins": -0.017909497022628784, "rewards/rejected": -0.7455942034721375, "step": 70 }, { "epoch": 0.16029922522041143, "grad_norm": 523140.8331781128, "learning_rate": 9.890738003669027e-07, "logits/chosen": -0.9634426236152649, "logits/rejected": -0.9494821429252625, "logps/chosen": -0.2741475999355316, "logps/rejected": -0.2895483672618866, "loss": 2624.7643, "rewards/accuracies": 0.5625, "rewards/chosen": -0.685369074344635, "rewards/margins": 0.038501907140016556, "rewards/rejected": -0.7238709926605225, "step": 75 }, { "epoch": 0.17098584023510552, "grad_norm": 37571862.32606961, "learning_rate": 9.848447601883433e-07, "logits/chosen": -1.0007914304733276, "logits/rejected": -0.9825354814529419, "logps/chosen": -0.28798869252204895, "logps/rejected": -0.28025856614112854, "loss": 131.06, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.7199716567993164, "rewards/margins": -0.019325237721204758, "rewards/rejected": -0.7006464600563049, "step": 80 }, { "epoch": 0.18167245524979964, "grad_norm": 8861430.225925114, "learning_rate": 9.799376207714444e-07, "logits/chosen": -0.9258670806884766, "logits/rejected": -0.8755356073379517, "logps/chosen": -0.2675308287143707, "logps/rejected": -0.28247857093811035, "loss": 151.0586, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.6688271760940552, "rewards/margins": 0.03736928477883339, "rewards/rejected": -0.7061963081359863, "step": 85 }, { "epoch": 0.19235907026449373, "grad_norm": 322999.5648039634, "learning_rate": 9.743592451943998e-07, "logits/chosen": -0.8945444226264954, "logits/rejected": -0.8326283693313599, "logps/chosen": -0.2888963222503662, "logps/rejected": -0.30566543340682983, "loss": 104.0617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7222408056259155, "rewards/margins": 0.04192278906702995, "rewards/rejected": -0.764163613319397, "step": 90 }, { "epoch": 0.20304568527918782, "grad_norm": 16202558.227455074, "learning_rate": 9.681174353198686e-07, "logits/chosen": -1.0018240213394165, "logits/rejected": -1.024642825126648, "logps/chosen": -0.2775546908378601, "logps/rejected": -0.31214436888694763, "loss": 185.9518, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6938868165016174, "rewards/margins": 0.08647419512271881, "rewards/rejected": -0.7803609371185303, "step": 95 }, { "epoch": 0.2137323002938819, "grad_norm": 881275.6537233666, "learning_rate": 9.612209208833646e-07, "logits/chosen": -1.0630947351455688, "logits/rejected": -1.0079935789108276, "logps/chosen": -0.28541457653045654, "logps/rejected": -0.27989768981933594, "loss": 377.6484, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.7135364413261414, "rewards/margins": -0.01379220187664032, "rewards/rejected": -0.6997443437576294, "step": 100 }, { "epoch": 0.224418915308576, "grad_norm": 1471606.598734741, "learning_rate": 9.536793472839324e-07, "logits/chosen": -0.9882336854934692, "logits/rejected": -0.9416030645370483, "logps/chosen": -0.2841026186943054, "logps/rejected": -0.30027633905410767, "loss": 205.1062, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7102565169334412, "rewards/margins": 0.0404343381524086, "rewards/rejected": -0.7506908178329468, "step": 105 }, { "epoch": 0.2351055303232701, "grad_norm": 313885.99955306284, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.9041908383369446, "logits/rejected": -0.854825496673584, "logps/chosen": -0.33143380284309387, "logps/rejected": -0.3396168053150177, "loss": 2658.6553, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.8285845518112183, "rewards/margins": 0.020457318052649498, "rewards/rejected": -0.8490419387817383, "step": 110 }, { "epoch": 0.2457921453379642, "grad_norm": 682808.4048805884, "learning_rate": 9.367041003085648e-07, "logits/chosen": -0.9211395978927612, "logits/rejected": -0.9187518358230591, "logps/chosen": -0.2765989303588867, "logps/rejected": -0.2790865898132324, "loss": 2651.5432, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6914974451065063, "rewards/margins": 0.006219107657670975, "rewards/rejected": -0.697716474533081, "step": 115 }, { "epoch": 0.2564787603526583, "grad_norm": 527643.1252709947, "learning_rate": 9.272941683504808e-07, "logits/chosen": -1.0119305849075317, "logits/rejected": -0.9751386642456055, "logps/chosen": -0.28785568475723267, "logps/rejected": -0.31191155314445496, "loss": 232.3885, "rewards/accuracies": 0.5, "rewards/chosen": -0.7196391820907593, "rewards/margins": 0.06013970449566841, "rewards/rejected": -0.7797788381576538, "step": 120 }, { "epoch": 0.2671653753673524, "grad_norm": 57160137.92819305, "learning_rate": 9.172866268606513e-07, "logits/chosen": -1.0446767807006836, "logits/rejected": -1.03428316116333, "logps/chosen": -0.3408173620700836, "logps/rejected": -0.33484262228012085, "loss": 171.7718, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8520433306694031, "rewards/margins": -0.014936879277229309, "rewards/rejected": -0.8371064066886902, "step": 125 }, { "epoch": 0.2778519903820465, "grad_norm": 354260.60461613233, "learning_rate": 9.066954722907638e-07, "logits/chosen": -1.0202778577804565, "logits/rejected": -1.040438175201416, "logps/chosen": -0.3069685399532318, "logps/rejected": -0.3384125232696533, "loss": 152.887, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.7674213647842407, "rewards/margins": 0.07861001789569855, "rewards/rejected": -0.8460313677787781, "step": 130 }, { "epoch": 0.2885386053967406, "grad_norm": 4161760.6479958617, "learning_rate": 8.955355173281707e-07, "logits/chosen": -0.9897885322570801, "logits/rejected": -0.9403419494628906, "logps/chosen": -0.3388122022151947, "logps/rejected": -0.29513686895370483, "loss": 127.9212, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8470304608345032, "rewards/margins": -0.10918829590082169, "rewards/rejected": -0.7378422021865845, "step": 135 }, { "epoch": 0.2992252204114347, "grad_norm": 1984829.9018358907, "learning_rate": 8.838223701790055e-07, "logits/chosen": -0.9738261103630066, "logits/rejected": -0.9614647030830383, "logps/chosen": -0.3720606565475464, "logps/rejected": -0.3473301827907562, "loss": 110.1865, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.930151641368866, "rewards/margins": -0.06182613968849182, "rewards/rejected": -0.8683255314826965, "step": 140 }, { "epoch": 0.30991183542612877, "grad_norm": 1513820.1920679864, "learning_rate": 8.71572412738697e-07, "logits/chosen": -1.0015193223953247, "logits/rejected": -0.9965044856071472, "logps/chosen": -0.3061389625072479, "logps/rejected": -0.3093434274196625, "loss": 3025.9289, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7653473615646362, "rewards/margins": 0.008011135272681713, "rewards/rejected": -0.7733586430549622, "step": 145 }, { "epoch": 0.32059845044082286, "grad_norm": 1970591.2689892622, "learning_rate": 8.588027776804058e-07, "logits/chosen": -0.9444905519485474, "logits/rejected": -0.9439139366149902, "logps/chosen": -0.3537100851535797, "logps/rejected": -0.35441845655441284, "loss": 221.1807, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8842751383781433, "rewards/margins": 0.0017710126703605056, "rewards/rejected": -0.8860462307929993, "step": 150 }, { "epoch": 0.33128506545551695, "grad_norm": 12808984.8595381, "learning_rate": 8.455313244934324e-07, "logits/chosen": -1.0120588541030884, "logits/rejected": -0.9681800603866577, "logps/chosen": -0.3124849796295166, "logps/rejected": -0.3563632667064667, "loss": 187.1941, "rewards/accuracies": 0.5, "rewards/chosen": -0.7812124490737915, "rewards/margins": 0.10969575494527817, "rewards/rejected": -0.8909081220626831, "step": 155 }, { "epoch": 0.34197168047021104, "grad_norm": 1183315.8265051153, "learning_rate": 8.317766145051057e-07, "logits/chosen": -1.0060454607009888, "logits/rejected": -0.9961159825325012, "logps/chosen": -0.2914329171180725, "logps/rejected": -0.3657309412956238, "loss": 120.0758, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7285822629928589, "rewards/margins": 0.1857450008392334, "rewards/rejected": -0.9143272638320923, "step": 160 }, { "epoch": 0.3526582954849052, "grad_norm": 2641730.3542562006, "learning_rate": 8.175578849210894e-07, "logits/chosen": -1.0412616729736328, "logits/rejected": -0.9751707315444946, "logps/chosen": -0.33477407693862915, "logps/rejected": -0.31863099336624146, "loss": 222.9428, "rewards/accuracies": 0.5, "rewards/chosen": -0.8369352221488953, "rewards/margins": -0.04035765677690506, "rewards/rejected": -0.7965775728225708, "step": 165 }, { "epoch": 0.36334491049959927, "grad_norm": 24200663.77841142, "learning_rate": 8.028950219204099e-07, "logits/chosen": -1.0198957920074463, "logits/rejected": -1.0187537670135498, "logps/chosen": -0.35278764367103577, "logps/rejected": -0.36079707741737366, "loss": 175.3017, "rewards/accuracies": 0.5, "rewards/chosen": -0.8819690942764282, "rewards/margins": 0.020023606717586517, "rewards/rejected": -0.901992678642273, "step": 170 }, { "epoch": 0.37403152551429336, "grad_norm": 423144.2753567647, "learning_rate": 7.878085328428368e-07, "logits/chosen": -1.0390782356262207, "logits/rejected": -1.0463870763778687, "logps/chosen": -0.3290930390357971, "logps/rejected": -0.33625391125679016, "loss": 172.6517, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8227324485778809, "rewards/margins": 0.01790226623415947, "rewards/rejected": -0.840634822845459, "step": 175 }, { "epoch": 0.38471814052898745, "grad_norm": 66473449.795217186, "learning_rate": 7.723195175075135e-07, "logits/chosen": -1.074244737625122, "logits/rejected": -1.0753021240234375, "logps/chosen": -0.33172592520713806, "logps/rejected": -0.34797996282577515, "loss": 173.7831, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8293148279190063, "rewards/margins": 0.04063502699136734, "rewards/rejected": -0.8699499368667603, "step": 180 }, { "epoch": 0.39540475554368154, "grad_norm": 2508265.2610814595, "learning_rate": 7.564496387029531e-07, "logits/chosen": -1.105753779411316, "logits/rejected": -1.0690752267837524, "logps/chosen": -0.33374324440956116, "logps/rejected": -0.3317410349845886, "loss": 130.1423, "rewards/accuracies": 0.5, "rewards/chosen": -0.8343580961227417, "rewards/margins": -0.005005507729947567, "rewards/rejected": -0.8293525576591492, "step": 185 }, { "epoch": 0.40609137055837563, "grad_norm": 1415188.1564555708, "learning_rate": 7.402210918896689e-07, "logits/chosen": -1.1109493970870972, "logits/rejected": -1.0873216390609741, "logps/chosen": -0.3215797245502472, "logps/rejected": -0.34585997462272644, "loss": 103.3292, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8039493560791016, "rewards/margins": 0.06070064380764961, "rewards/rejected": -0.8646499514579773, "step": 190 }, { "epoch": 0.4167779855730697, "grad_norm": 22405463.240061384, "learning_rate": 7.236565741578162e-07, "logits/chosen": -1.0102512836456299, "logits/rejected": -0.995439350605011, "logps/chosen": -0.34978950023651123, "logps/rejected": -0.3523608446121216, "loss": 144.2631, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.8744736909866333, "rewards/margins": 0.006428359542042017, "rewards/rejected": -0.880902111530304, "step": 195 }, { "epoch": 0.4274646005877638, "grad_norm": 364076.0574983151, "learning_rate": 7.067792524832603e-07, "logits/chosen": -1.0113328695297241, "logits/rejected": -1.0017603635787964, "logps/chosen": -0.3367648124694824, "logps/rejected": -0.334301233291626, "loss": 115.4949, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.8419120907783508, "rewards/margins": -0.006158898584544659, "rewards/rejected": -0.8357530832290649, "step": 200 }, { "epoch": 0.4381512156024579, "grad_norm": 1540771.9395518457, "learning_rate": 6.896127313264642e-07, "logits/chosen": -1.1025888919830322, "logits/rejected": -1.0531136989593506, "logps/chosen": -0.32284116744995117, "logps/rejected": -0.34286996722221375, "loss": 117.5218, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8071029782295227, "rewards/margins": 0.05007190629839897, "rewards/rejected": -0.8571747541427612, "step": 205 }, { "epoch": 0.448837830617152, "grad_norm": 197109620.3195932, "learning_rate": 6.721810196195174e-07, "logits/chosen": -1.1090078353881836, "logits/rejected": -1.0331655740737915, "logps/chosen": -0.2971234619617462, "logps/rejected": -0.3048322796821594, "loss": 199.9444, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.7428085207939148, "rewards/margins": 0.01927214488387108, "rewards/rejected": -0.7620807886123657, "step": 210 }, { "epoch": 0.45952444563184613, "grad_norm": 210941.10682300097, "learning_rate": 6.545084971874736e-07, "logits/chosen": -1.055289626121521, "logits/rejected": -0.9978870153427124, "logps/chosen": -0.31941694021224976, "logps/rejected": -0.3496856093406677, "loss": 117.6536, "rewards/accuracies": 0.5, "rewards/chosen": -0.7985422015190125, "rewards/margins": 0.07567177712917328, "rewards/rejected": -0.8742140531539917, "step": 215 }, { "epoch": 0.4702110606465402, "grad_norm": 17774561.440666944, "learning_rate": 6.3661988065096e-07, "logits/chosen": -1.0686638355255127, "logits/rejected": -1.0178725719451904, "logps/chosen": -0.33966127038002014, "logps/rejected": -0.3513311445713043, "loss": 109.0363, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8491531610488892, "rewards/margins": 0.029174691066145897, "rewards/rejected": -0.8783278465270996, "step": 220 }, { "epoch": 0.4808976756612343, "grad_norm": 1448517.5044393009, "learning_rate": 6.185401888577487e-07, "logits/chosen": -1.0401103496551514, "logits/rejected": -1.0147794485092163, "logps/chosen": -0.284532368183136, "logps/rejected": -0.2862989902496338, "loss": 111.407, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.7113308906555176, "rewards/margins": 0.004416565410792828, "rewards/rejected": -0.7157474756240845, "step": 225 }, { "epoch": 0.4915842906759284, "grad_norm": 27408086.19895333, "learning_rate": 6.002947078916364e-07, "logits/chosen": -1.1030638217926025, "logits/rejected": -0.9990617632865906, "logps/chosen": -0.33675864338874817, "logps/rejected": -0.3289005756378174, "loss": 2706.1715, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.841896653175354, "rewards/margins": -0.01964510791003704, "rewards/rejected": -0.8222514986991882, "step": 230 }, { "epoch": 0.5022709056906225, "grad_norm": 317843.4109202252, "learning_rate": 5.819089557075688e-07, "logits/chosen": -1.0890393257141113, "logits/rejected": -1.0440254211425781, "logps/chosen": -0.34919267892837524, "logps/rejected": -0.3513543903827667, "loss": 238.4326, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.8729816675186157, "rewards/margins": 0.005404374096542597, "rewards/rejected": -0.8783860206604004, "step": 235 }, { "epoch": 0.5129575207053166, "grad_norm": 85019.6413638533, "learning_rate": 5.634086464424742e-07, "logits/chosen": -1.1137946844100952, "logits/rejected": -1.076812505722046, "logps/chosen": -0.32847946882247925, "logps/rejected": -0.322710782289505, "loss": 114.2576, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.8211986422538757, "rewards/margins": -0.014421721920371056, "rewards/rejected": -0.8067768812179565, "step": 240 }, { "epoch": 0.5236441357200107, "grad_norm": 6299240.513263524, "learning_rate": 5.448196544517167e-07, "logits/chosen": -1.1339685916900635, "logits/rejected": -1.0941574573516846, "logps/chosen": -0.3057493269443512, "logps/rejected": -0.3173142373561859, "loss": 145.971, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.7643733024597168, "rewards/margins": 0.028912359848618507, "rewards/rejected": -0.7932857275009155, "step": 245 }, { "epoch": 0.5343307507347048, "grad_norm": 847488.7365746452, "learning_rate": 5.26167978121472e-07, "logits/chosen": -1.1527339220046997, "logits/rejected": -1.114386796951294, "logps/chosen": -0.3197785019874573, "logps/rejected": -0.3439500629901886, "loss": 2684.4785, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.7994462251663208, "rewards/margins": 0.060428936034440994, "rewards/rejected": -0.8598750829696655, "step": 250 }, { "epoch": 0.5450173657493989, "grad_norm": 231777.21031654373, "learning_rate": 5.074797035076318e-07, "logits/chosen": -1.0928058624267578, "logits/rejected": -1.079099416732788, "logps/chosen": -0.3362935483455658, "logps/rejected": -0.36446088552474976, "loss": 94.7642, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.8407338261604309, "rewards/margins": 0.0704183503985405, "rewards/rejected": -0.911152184009552, "step": 255 }, { "epoch": 0.555703980764093, "grad_norm": 147041.27143323392, "learning_rate": 4.887809678520975e-07, "logits/chosen": -1.0703433752059937, "logits/rejected": -1.0498476028442383, "logps/chosen": -0.29196763038635254, "logps/rejected": -0.3219326138496399, "loss": 84.8449, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7299190759658813, "rewards/margins": 0.07491237670183182, "rewards/rejected": -0.8048315048217773, "step": 260 }, { "epoch": 0.566390595778787, "grad_norm": 435566096526.86523, "learning_rate": 4.700979230274829e-07, "logits/chosen": -1.0955275297164917, "logits/rejected": -1.0940407514572144, "logps/chosen": -0.3326551914215088, "logps/rejected": -0.35088759660720825, "loss": 887.0658, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.831637978553772, "rewards/margins": 0.04558102414011955, "rewards/rejected": -0.877219021320343, "step": 265 }, { "epoch": 0.5770772107934812, "grad_norm": 3520938.773348992, "learning_rate": 4.514566989613559e-07, "logits/chosen": -1.086938500404358, "logits/rejected": -1.0884182453155518, "logps/chosen": -0.3024354875087738, "logps/rejected": -0.29867392778396606, "loss": 228.8071, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.7560887336730957, "rewards/margins": -0.009403971955180168, "rewards/rejected": -0.7466848492622375, "step": 270 }, { "epoch": 0.5877638258081752, "grad_norm": 62256.73172967315, "learning_rate": 4.328833670911724e-07, "logits/chosen": -0.9738815426826477, "logits/rejected": -0.9222286343574524, "logps/chosen": -0.2884615659713745, "logps/rejected": -0.3087163269519806, "loss": 1419.8171, "rewards/accuracies": 0.4375, "rewards/chosen": -0.721153974533081, "rewards/margins": 0.05063692852854729, "rewards/rejected": -0.7717908620834351, "step": 275 }, { "epoch": 0.5984504408228694, "grad_norm": 803294.007310266, "learning_rate": 4.144039039010124e-07, "logits/chosen": -1.1537045240402222, "logits/rejected": -1.0968632698059082, "logps/chosen": -0.33216923475265503, "logps/rejected": -0.2991081774234772, "loss": 301.7224, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.8304230570793152, "rewards/margins": -0.08265267312526703, "rewards/rejected": -0.7477704286575317, "step": 280 }, { "epoch": 0.6091370558375635, "grad_norm": 263927.3164272957, "learning_rate": 3.960441545911204e-07, "logits/chosen": -1.044985055923462, "logits/rejected": -1.0020415782928467, "logps/chosen": -0.3105274736881256, "logps/rejected": -0.3335118591785431, "loss": 104.0724, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.7763187289237976, "rewards/margins": 0.057460904121398926, "rewards/rejected": -0.8337796330451965, "step": 285 }, { "epoch": 0.6198236708522575, "grad_norm": 4071370.4437897406, "learning_rate": 3.778297969310529e-07, "logits/chosen": -1.071925163269043, "logits/rejected": -1.0407798290252686, "logps/chosen": -0.3102174699306488, "logps/rejected": -0.33250361680984497, "loss": 156.4641, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.7755436897277832, "rewards/margins": 0.055715300142765045, "rewards/rejected": -0.8312589526176453, "step": 290 }, { "epoch": 0.6305102858669517, "grad_norm": 965178.0635991972, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -1.1488522291183472, "logits/rejected": -1.1228643655776978, "logps/chosen": -0.3139174282550812, "logps/rejected": -0.32147642970085144, "loss": 134.6723, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.784793496131897, "rewards/margins": 0.01889752224087715, "rewards/rejected": -0.8036910891532898, "step": 295 }, { "epoch": 0.6411969008816457, "grad_norm": 20365408.82604466, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -1.0468319654464722, "logits/rejected": -1.0043448209762573, "logps/chosen": -0.3311859369277954, "logps/rejected": -0.3362448513507843, "loss": 120.3211, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8279649019241333, "rewards/margins": 0.012647300958633423, "rewards/rejected": -0.8406121134757996, "step": 300 }, { "epoch": 0.6518835158963399, "grad_norm": 68905.1696103493, "learning_rate": 3.243125879593286e-07, "logits/chosen": -1.1107118129730225, "logits/rejected": -1.0741993188858032, "logps/chosen": -0.3406161665916443, "logps/rejected": -0.32983919978141785, "loss": 118.2299, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.8515404462814331, "rewards/margins": -0.026942413300275803, "rewards/rejected": -0.8245980143547058, "step": 305 }, { "epoch": 0.6625701309110339, "grad_norm": 3378905.921798347, "learning_rate": 3.069319753571269e-07, "logits/chosen": -1.069124460220337, "logits/rejected": -1.110024094581604, "logps/chosen": -0.3401089012622833, "logps/rejected": -0.37533271312713623, "loss": 151.3316, "rewards/accuracies": 0.5, "rewards/chosen": -0.8502721786499023, "rewards/margins": 0.08805962651968002, "rewards/rejected": -0.9383317828178406, "step": 310 }, { "epoch": 0.673256745925728, "grad_norm": 27112706002.48435, "learning_rate": 2.898213858452173e-07, "logits/chosen": -1.2464616298675537, "logits/rejected": -1.1925503015518188, "logps/chosen": -0.34163057804107666, "logps/rejected": -0.34519410133361816, "loss": 951.2437, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.8540765047073364, "rewards/margins": 0.00890885479748249, "rewards/rejected": -0.8629853129386902, "step": 315 }, { "epoch": 0.6839433609404221, "grad_norm": 34759765.70829812, "learning_rate": 2.730047501302266e-07, "logits/chosen": -1.1456632614135742, "logits/rejected": -1.1187629699707031, "logps/chosen": -0.3103678226470947, "logps/rejected": -0.35759711265563965, "loss": 140.3946, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7759194374084473, "rewards/margins": 0.11807328462600708, "rewards/rejected": -0.8939927816390991, "step": 320 }, { "epoch": 0.6946299759551162, "grad_norm": 250533.47771917153, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -1.0254387855529785, "logits/rejected": -1.0484802722930908, "logps/chosen": -0.3333364427089691, "logps/rejected": -0.3406026363372803, "loss": 110.2831, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.8333410024642944, "rewards/margins": 0.01816548779606819, "rewards/rejected": -0.8515065312385559, "step": 325 }, { "epoch": 0.7053165909698104, "grad_norm": 112400196.05235167, "learning_rate": 2.403469744184154e-07, "logits/chosen": -1.15934157371521, "logits/rejected": -1.1072492599487305, "logps/chosen": -0.3412119746208191, "logps/rejected": -0.37822234630584717, "loss": 171.9244, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.8530298471450806, "rewards/margins": 0.09252593666315079, "rewards/rejected": -0.9455558061599731, "step": 330 }, { "epoch": 0.7160032059845044, "grad_norm": 9446757.557474248, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -1.1374088525772095, "logits/rejected": -1.1560612916946411, "logps/chosen": -0.3190115988254547, "logps/rejected": -0.34075072407722473, "loss": 93.1427, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7975289225578308, "rewards/margins": 0.05434788018465042, "rewards/rejected": -0.8518768548965454, "step": 335 }, { "epoch": 0.7266898209991985, "grad_norm": 16695168.934409656, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -1.0877724885940552, "logits/rejected": -1.0620936155319214, "logps/chosen": -0.30620652437210083, "logps/rejected": -0.33592310547828674, "loss": 120.7583, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7655162811279297, "rewards/margins": 0.07429146766662598, "rewards/rejected": -0.8398076891899109, "step": 340 }, { "epoch": 0.7373764360138926, "grad_norm": 4144624.6729695094, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -1.0486472845077515, "logits/rejected": -1.0094027519226074, "logps/chosen": -0.30639034509658813, "logps/rejected": -0.321176141500473, "loss": 164.1478, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.765975832939148, "rewards/margins": 0.03696460276842117, "rewards/rejected": -0.8029405474662781, "step": 345 }, { "epoch": 0.7480630510285867, "grad_norm": 23390472.719695035, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -1.1522271633148193, "logits/rejected": -1.1415410041809082, "logps/chosen": -0.34453052282333374, "logps/rejected": -0.3794510066509247, "loss": 555.5947, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8613262176513672, "rewards/margins": 0.08730128407478333, "rewards/rejected": -0.9486274719238281, "step": 350 }, { "epoch": 0.7587496660432808, "grad_norm": 1280169.965933302, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -1.1356937885284424, "logits/rejected": -1.0751426219940186, "logps/chosen": -0.3311988413333893, "logps/rejected": -0.3161237835884094, "loss": 82.217, "rewards/accuracies": 0.5, "rewards/chosen": -0.827997088432312, "rewards/margins": -0.037687692791223526, "rewards/rejected": -0.7903094291687012, "step": 355 }, { "epoch": 0.7694362810579749, "grad_norm": 1956574.339469866, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -1.1374176740646362, "logits/rejected": -1.164374589920044, "logps/chosen": -0.3486614227294922, "logps/rejected": -0.3802019953727722, "loss": 102.3601, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.8716535568237305, "rewards/margins": 0.07885149866342545, "rewards/rejected": -0.9505050778388977, "step": 360 }, { "epoch": 0.7801228960726689, "grad_norm": 84129.78530171402, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -1.137064814567566, "logits/rejected": -1.0832656621932983, "logps/chosen": -0.32139506936073303, "logps/rejected": -0.3225245177745819, "loss": 117.2444, "rewards/accuracies": 0.5, "rewards/chosen": -0.8034876585006714, "rewards/margins": 0.0028236303478479385, "rewards/rejected": -0.806311309337616, "step": 365 }, { "epoch": 0.7908095110873631, "grad_norm": 4136765.760326951, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -1.1603561639785767, "logits/rejected": -1.1292134523391724, "logps/chosen": -0.33979296684265137, "logps/rejected": -0.35157322883605957, "loss": 106.6815, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.8494824171066284, "rewards/margins": 0.029450654983520508, "rewards/rejected": -0.8789331316947937, "step": 370 }, { "epoch": 0.8014961261020572, "grad_norm": 14999761.486437708, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -1.1542747020721436, "logits/rejected": -1.172849416732788, "logps/chosen": -0.31457456946372986, "logps/rejected": -0.369545042514801, "loss": 120.5736, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7864364385604858, "rewards/margins": 0.1374262273311615, "rewards/rejected": -0.923862636089325, "step": 375 }, { "epoch": 0.8121827411167513, "grad_norm": 244556.56746403236, "learning_rate": 1.0218772555910954e-07, "logits/chosen": -1.2167600393295288, "logits/rejected": -1.1646716594696045, "logps/chosen": -0.37414881587028503, "logps/rejected": -0.3576270043849945, "loss": 113.0907, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9353721737861633, "rewards/margins": -0.041304655373096466, "rewards/rejected": -0.8940675854682922, "step": 380 }, { "epoch": 0.8228693561314454, "grad_norm": 1775127.4439502584, "learning_rate": 9.114030716778432e-08, "logits/chosen": -1.1325044631958008, "logits/rejected": -1.110126256942749, "logps/chosen": -0.3134748637676239, "logps/rejected": -0.35259318351745605, "loss": 114.6353, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.7836871147155762, "rewards/margins": 0.09779568761587143, "rewards/rejected": -0.8814828991889954, "step": 385 }, { "epoch": 0.8335559711461394, "grad_norm": 138946.22290507445, "learning_rate": 8.066471602728803e-08, "logits/chosen": -1.220655083656311, "logits/rejected": -1.1926963329315186, "logps/chosen": -0.33643871545791626, "logps/rejected": -0.3400726318359375, "loss": 205.2888, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.8410967588424683, "rewards/margins": 0.009084770455956459, "rewards/rejected": -0.8501815795898438, "step": 390 }, { "epoch": 0.8442425861608336, "grad_norm": 1993392.8238311838, "learning_rate": 7.077560319906694e-08, "logits/chosen": -1.0875790119171143, "logits/rejected": -1.0107152462005615, "logps/chosen": -0.3748469948768616, "logps/rejected": -0.380262553691864, "loss": 150.502, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.9371173977851868, "rewards/margins": 0.013538897037506104, "rewards/rejected": -0.950656533241272, "step": 395 }, { "epoch": 0.8549292011755276, "grad_norm": 42068510.12370043, "learning_rate": 6.148679950161672e-08, "logits/chosen": -1.1116609573364258, "logits/rejected": -1.0628454685211182, "logps/chosen": -0.36634019017219543, "logps/rejected": -0.35744190216064453, "loss": 196.6313, "rewards/accuracies": 0.5, "rewards/chosen": -0.9158504605293274, "rewards/margins": -0.022245775908231735, "rewards/rejected": -0.8936047554016113, "step": 400 }, { "epoch": 0.8549292011755276, "eval_logits/chosen": -1.337195634841919, "eval_logits/rejected": -1.2981722354888916, "eval_logps/chosen": -0.3401065170764923, "eval_logps/rejected": -0.35557428002357483, "eval_loss": 132.36317443847656, "eval_rewards/accuracies": 0.5040322542190552, "eval_rewards/chosen": -0.8502662181854248, "eval_rewards/margins": 0.038669489324092865, "eval_rewards/rejected": -0.8889357447624207, "eval_runtime": 72.0543, "eval_samples_per_second": 27.216, "eval_steps_per_second": 0.86, "step": 400 }, { "epoch": 0.8656158161902218, "grad_norm": 5594982.665070933, "learning_rate": 5.2811296166831666e-08, "logits/chosen": -1.1403666734695435, "logits/rejected": -1.0579333305358887, "logps/chosen": -0.34073713421821594, "logps/rejected": -0.33352339267730713, "loss": 122.7746, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8518427610397339, "rewards/margins": -0.01803441345691681, "rewards/rejected": -0.833808422088623, "step": 405 }, { "epoch": 0.8763024312049158, "grad_norm": 365339.7208405054, "learning_rate": 4.4761226670592066e-08, "logits/chosen": -1.0983816385269165, "logits/rejected": -1.0836502313613892, "logps/chosen": -0.33261579275131226, "logps/rejected": -0.3417048752307892, "loss": 111.2202, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.831539511680603, "rewards/margins": 0.022722622379660606, "rewards/rejected": -0.8542621731758118, "step": 410 }, { "epoch": 0.88698904621961, "grad_norm": 2850565.0664724754, "learning_rate": 3.734784976300165e-08, "logits/chosen": -1.0673637390136719, "logits/rejected": -1.040725827217102, "logps/chosen": -0.34171849489212036, "logps/rejected": -0.348112016916275, "loss": 191.3689, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.8542962074279785, "rewards/margins": 0.015983855351805687, "rewards/rejected": -0.8702800869941711, "step": 415 }, { "epoch": 0.897675661234304, "grad_norm": 507688.39572092163, "learning_rate": 3.058153372200695e-08, "logits/chosen": -1.1354072093963623, "logits/rejected": -1.1297013759613037, "logps/chosen": -0.3288155198097229, "logps/rejected": -0.3461647629737854, "loss": 112.2704, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8220387697219849, "rewards/margins": 0.04337311536073685, "rewards/rejected": -0.8654119372367859, "step": 420 }, { "epoch": 0.9083622762489981, "grad_norm": 7451525102.30645, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -1.0978658199310303, "logits/rejected": -1.0448986291885376, "logps/chosen": -0.3529340624809265, "logps/rejected": -0.35007423162460327, "loss": 3170.9543, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.8823351860046387, "rewards/margins": -0.007149559445679188, "rewards/rejected": -0.8751856088638306, "step": 425 }, { "epoch": 0.9190488912636923, "grad_norm": 10934704.996058388, "learning_rate": 1.9027019250647036e-08, "logits/chosen": -1.1120647192001343, "logits/rejected": -1.0629384517669678, "logps/chosen": -0.30386024713516235, "logps/rejected": -0.31913992762565613, "loss": 237.2099, "rewards/accuracies": 0.5625, "rewards/chosen": -0.759650707244873, "rewards/margins": 0.03819913789629936, "rewards/rejected": -0.7978497743606567, "step": 430 }, { "epoch": 0.9297355062783863, "grad_norm": 1059239098.6557789, "learning_rate": 1.4254980853566246e-08, "logits/chosen": -1.1208689212799072, "logits/rejected": -1.076522946357727, "logps/chosen": -0.3260021507740021, "logps/rejected": -0.32419848442077637, "loss": 252.7438, "rewards/accuracies": 0.5, "rewards/chosen": -0.8150054216384888, "rewards/margins": -0.004509164486080408, "rewards/rejected": -0.8104962110519409, "step": 435 }, { "epoch": 0.9404221212930804, "grad_norm": 3163894.8555042273, "learning_rate": 1.016230078838226e-08, "logits/chosen": -1.1477479934692383, "logits/rejected": -1.1257246732711792, "logps/chosen": -0.3194740116596222, "logps/rejected": -0.3639461398124695, "loss": 283.9071, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7986849546432495, "rewards/margins": 0.11118029057979584, "rewards/rejected": -0.9098652601242065, "step": 440 }, { "epoch": 0.9511087363077745, "grad_norm": 313415.0194399257, "learning_rate": 6.754703038239329e-09, "logits/chosen": -1.1323530673980713, "logits/rejected": -1.0702521800994873, "logps/chosen": -0.323483407497406, "logps/rejected": -0.3070305287837982, "loss": 142.406, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.8087084889411926, "rewards/margins": -0.041132211685180664, "rewards/rejected": -0.7675763368606567, "step": 445 }, { "epoch": 0.9617953513224686, "grad_norm": 312608.20391974325, "learning_rate": 4.036953436716895e-09, "logits/chosen": -1.0631930828094482, "logits/rejected": -1.0676857233047485, "logps/chosen": -0.34714624285697937, "logps/rejected": -0.3853607773780823, "loss": 98.3866, "rewards/accuracies": 0.5, "rewards/chosen": -0.8678655624389648, "rewards/margins": 0.09553632885217667, "rewards/rejected": -0.9634019136428833, "step": 450 }, { "epoch": 0.9724819663371627, "grad_norm": 98094.25868242758, "learning_rate": 2.0128530023804656e-09, "logits/chosen": -1.1627556085586548, "logits/rejected": -1.131043791770935, "logps/chosen": -0.34627044200897217, "logps/rejected": -0.32559770345687866, "loss": 136.3491, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.8656761050224304, "rewards/margins": -0.05168183892965317, "rewards/rejected": -0.813994288444519, "step": 455 }, { "epoch": 0.9831685813518568, "grad_norm": 25583015.430213835, "learning_rate": 6.852326227130833e-10, "logits/chosen": -1.1027119159698486, "logits/rejected": -1.1237401962280273, "logps/chosen": -0.33976924419403076, "logps/rejected": -0.3329155147075653, "loss": 116.0967, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8494230508804321, "rewards/margins": -0.01713428646326065, "rewards/rejected": -0.8322887420654297, "step": 460 }, { "epoch": 0.9938551963665508, "grad_norm": 351352.5694241463, "learning_rate": 5.594909486328348e-11, "logits/chosen": -1.212425947189331, "logits/rejected": -1.1084251403808594, "logps/chosen": -0.3214932084083557, "logps/rejected": -0.3376317620277405, "loss": 2736.6258, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.8037330508232117, "rewards/margins": 0.04034631699323654, "rewards/rejected": -0.844079315662384, "step": 465 }, { "epoch": 0.9981298423724285, "step": 467, "total_flos": 0.0, "train_loss": 444.38003229635433, "train_runtime": 7255.1322, "train_samples_per_second": 8.253, "train_steps_per_second": 0.064 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }