{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 9.523809523809524e-09, "logits/chosen": -1.2850065231323242, "logits/rejected": -0.5420928001403809, "logps/chosen": -849.5712890625, "logps/rejected": -659.8780517578125, "loss": 0.2593, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 9.523809523809525e-08, "logits/chosen": -1.6733099222183228, "logits/rejected": -1.1435589790344238, "logps/chosen": -407.6390075683594, "logps/rejected": -750.5330200195312, "loss": 0.1933, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.0002648972731549293, "rewards/margins": 0.0002683588827494532, "rewards/rejected": -3.461622554823407e-06, "step": 10 }, { "epoch": 0.0, "learning_rate": 1.904761904761905e-07, "logits/chosen": -1.5639336109161377, "logits/rejected": -1.1012383699417114, "logps/chosen": -465.546875, "logps/rejected": -766.64697265625, "loss": 0.2397, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.00022372114472091198, "rewards/margins": -2.5915365768014453e-05, "rewards/rejected": -0.00019780578440986574, "step": 20 }, { "epoch": 0.01, "learning_rate": 2.8571428571428575e-07, "logits/chosen": -1.6892967224121094, "logits/rejected": -1.0408269166946411, "logps/chosen": -473.25262451171875, "logps/rejected": -808.1934204101562, "loss": 0.2234, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0015700621297582984, "rewards/margins": 0.0015648994594812393, "rewards/rejected": 5.1628012442961335e-06, "step": 30 }, { "epoch": 0.01, "learning_rate": 3.80952380952381e-07, "logits/chosen": -1.7225767374038696, "logits/rejected": -1.1273142099380493, "logps/chosen": -457.7410583496094, "logps/rejected": -813.6912841796875, "loss": 0.2055, "rewards/accuracies": 0.75, "rewards/chosen": 0.002732831286266446, "rewards/margins": 0.005175677128136158, "rewards/rejected": -0.0024428460747003555, "step": 40 }, { "epoch": 0.01, "learning_rate": 4.7619047619047623e-07, "logits/chosen": -1.5604654550552368, "logits/rejected": -0.9060885310173035, "logps/chosen": -554.2913208007812, "logps/rejected": -849.5115356445312, "loss": 0.1971, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.00811075884848833, "rewards/margins": 0.014176970347762108, "rewards/rejected": -0.006066213361918926, "step": 50 }, { "epoch": 0.01, "learning_rate": 5.714285714285715e-07, "logits/chosen": -1.687990427017212, "logits/rejected": -1.2494174242019653, "logps/chosen": -386.49517822265625, "logps/rejected": -765.5401611328125, "loss": 0.2104, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.011224821209907532, "rewards/margins": 0.017030417919158936, "rewards/rejected": -0.005805597640573978, "step": 60 }, { "epoch": 0.01, "learning_rate": 6.666666666666667e-07, "logits/chosen": -1.2933578491210938, "logits/rejected": -1.1579430103302002, "logps/chosen": -321.789306640625, "logps/rejected": -766.0114135742188, "loss": 0.1858, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.009114318527281284, "rewards/margins": 0.020903298631310463, "rewards/rejected": -0.011788980104029179, "step": 70 }, { "epoch": 0.02, "learning_rate": 7.61904761904762e-07, "logits/chosen": -1.6945091485977173, "logits/rejected": -1.1569772958755493, "logps/chosen": -363.9807434082031, "logps/rejected": -795.9345703125, "loss": 0.1731, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.010179867967963219, "rewards/margins": 0.04508482292294502, "rewards/rejected": -0.034904953092336655, "step": 80 }, { "epoch": 0.02, "learning_rate": 8.571428571428572e-07, "logits/chosen": -1.481457233428955, "logits/rejected": -1.0560935735702515, "logps/chosen": -575.8737182617188, "logps/rejected": -885.8480224609375, "loss": 0.1728, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0052037788555026054, "rewards/margins": 0.05343114212155342, "rewards/rejected": -0.04822736978530884, "step": 90 }, { "epoch": 0.02, "learning_rate": 9.523809523809525e-07, "logits/chosen": -1.6370322704315186, "logits/rejected": -0.8078746795654297, "logps/chosen": -443.8648986816406, "logps/rejected": -836.8392333984375, "loss": 0.1715, "rewards/accuracies": 0.75, "rewards/chosen": 0.012666692025959492, "rewards/margins": 0.07541676610708237, "rewards/rejected": -0.06275007873773575, "step": 100 }, { "epoch": 0.02, "learning_rate": 1.0476190476190478e-06, "logits/chosen": -1.531712293624878, "logits/rejected": -1.1594440937042236, "logps/chosen": -445.06427001953125, "logps/rejected": -885.134765625, "loss": 0.1563, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0009547686204314232, "rewards/margins": 0.08301910012960434, "rewards/rejected": -0.08206433802843094, "step": 110 }, { "epoch": 0.02, "learning_rate": 1.142857142857143e-06, "logits/chosen": -1.5637714862823486, "logits/rejected": -1.008846640586853, "logps/chosen": -559.6546020507812, "logps/rejected": -1037.636962890625, "loss": 0.1188, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.025890862569212914, "rewards/margins": 0.11646576225757599, "rewards/rejected": -0.14235660433769226, "step": 120 }, { "epoch": 0.02, "learning_rate": 1.2380952380952382e-06, "logits/chosen": -1.6181094646453857, "logits/rejected": -0.9313281178474426, "logps/chosen": -543.5655517578125, "logps/rejected": -926.2550048828125, "loss": 0.1378, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.04493476450443268, "rewards/margins": 0.11141650378704071, "rewards/rejected": -0.1563512682914734, "step": 130 }, { "epoch": 0.03, "learning_rate": 1.3333333333333334e-06, "logits/chosen": -1.6120615005493164, "logits/rejected": -1.0635985136032104, "logps/chosen": -567.2598876953125, "logps/rejected": -1004.9212646484375, "loss": 0.0996, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09899488091468811, "rewards/margins": 0.13413195312023163, "rewards/rejected": -0.23312684893608093, "step": 140 }, { "epoch": 0.03, "learning_rate": 1.4285714285714286e-06, "logits/chosen": -1.6825717687606812, "logits/rejected": -1.081813097000122, "logps/chosen": -560.16943359375, "logps/rejected": -930.1064453125, "loss": 0.1257, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0978316217660904, "rewards/margins": 0.12988914549350739, "rewards/rejected": -0.22772076725959778, "step": 150 }, { "epoch": 0.03, "learning_rate": 1.523809523809524e-06, "logits/chosen": -1.777329683303833, "logits/rejected": -1.2507864236831665, "logps/chosen": -512.7439575195312, "logps/rejected": -984.9051513671875, "loss": 0.1239, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.05691201239824295, "rewards/margins": 0.1501971185207367, "rewards/rejected": -0.20710912346839905, "step": 160 }, { "epoch": 0.03, "learning_rate": 1.6190476190476193e-06, "logits/chosen": -1.6942923069000244, "logits/rejected": -1.1983150243759155, "logps/chosen": -547.2807006835938, "logps/rejected": -1033.548095703125, "loss": 0.0948, "rewards/accuracies": 0.75, "rewards/chosen": -0.09753384441137314, "rewards/margins": 0.16666623950004578, "rewards/rejected": -0.2642000913619995, "step": 170 }, { "epoch": 0.03, "learning_rate": 1.7142857142857145e-06, "logits/chosen": -1.689398169517517, "logits/rejected": -1.2449653148651123, "logps/chosen": -630.5906982421875, "logps/rejected": -1181.1785888671875, "loss": 0.0868, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1752820611000061, "rewards/margins": 0.17259114980697632, "rewards/rejected": -0.3478732109069824, "step": 180 }, { "epoch": 0.04, "learning_rate": 1.8095238095238097e-06, "logits/chosen": -1.5203440189361572, "logits/rejected": -0.9198516011238098, "logps/chosen": -666.8179931640625, "logps/rejected": -1204.99072265625, "loss": 0.1113, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.19398172199726105, "rewards/margins": 0.22316697239875793, "rewards/rejected": -0.4171486794948578, "step": 190 }, { "epoch": 0.04, "learning_rate": 1.904761904761905e-06, "logits/chosen": -1.682255744934082, "logits/rejected": -1.4462255239486694, "logps/chosen": -530.3055419921875, "logps/rejected": -1100.1158447265625, "loss": 0.0997, "rewards/accuracies": 0.75, "rewards/chosen": -0.13800662755966187, "rewards/margins": 0.17010769248008728, "rewards/rejected": -0.30811434984207153, "step": 200 }, { "epoch": 0.04, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -1.6906871795654297, "logits/rejected": -0.9419037103652954, "logps/chosen": -618.3939819335938, "logps/rejected": -1106.7994384765625, "loss": 0.0988, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13316693902015686, "rewards/margins": 0.18527300655841827, "rewards/rejected": -0.3184399902820587, "step": 210 }, { "epoch": 0.04, "learning_rate": 2.0952380952380955e-06, "logits/chosen": -1.7209393978118896, "logits/rejected": -0.9391362071037292, "logps/chosen": -616.0027465820312, "logps/rejected": -1283.7279052734375, "loss": 0.0825, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.16547217965126038, "rewards/margins": 0.23828014731407166, "rewards/rejected": -0.40375232696533203, "step": 220 }, { "epoch": 0.04, "learning_rate": 2.1904761904761908e-06, "logits/chosen": -1.472318410873413, "logits/rejected": -0.8261554837226868, "logps/chosen": -656.9349365234375, "logps/rejected": -1206.586669921875, "loss": 0.0923, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16559532284736633, "rewards/margins": 0.2132759392261505, "rewards/rejected": -0.37887123227119446, "step": 230 }, { "epoch": 0.05, "learning_rate": 2.285714285714286e-06, "logits/chosen": -1.5672019720077515, "logits/rejected": -1.0501900911331177, "logps/chosen": -499.3685607910156, "logps/rejected": -1164.9814453125, "loss": 0.0738, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08818192780017853, "rewards/margins": 0.24911494553089142, "rewards/rejected": -0.33729690313339233, "step": 240 }, { "epoch": 0.05, "learning_rate": 2.380952380952381e-06, "logits/chosen": -1.6451988220214844, "logits/rejected": -1.2338879108428955, "logps/chosen": -619.9409790039062, "logps/rejected": -1280.0615234375, "loss": 0.0972, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15977323055267334, "rewards/margins": 0.23027309775352478, "rewards/rejected": -0.3900463879108429, "step": 250 }, { "epoch": 0.05, "learning_rate": 2.4761904761904764e-06, "logits/chosen": -1.847777009010315, "logits/rejected": -0.9876956939697266, "logps/chosen": -674.3220825195312, "logps/rejected": -1227.2720947265625, "loss": 0.0709, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.18131622672080994, "rewards/margins": 0.23836851119995117, "rewards/rejected": -0.4196847975254059, "step": 260 }, { "epoch": 0.05, "learning_rate": 2.571428571428571e-06, "logits/chosen": -1.6134955883026123, "logits/rejected": -1.0790008306503296, "logps/chosen": -647.0198974609375, "logps/rejected": -1282.9854736328125, "loss": 0.0938, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.209293395280838, "rewards/margins": 0.25759097933769226, "rewards/rejected": -0.4668843150138855, "step": 270 }, { "epoch": 0.05, "learning_rate": 2.666666666666667e-06, "logits/chosen": -1.8159784078598022, "logits/rejected": -1.351468801498413, "logps/chosen": -561.7814331054688, "logps/rejected": -1274.318603515625, "loss": 0.067, "rewards/accuracies": 0.875, "rewards/chosen": -0.14992132782936096, "rewards/margins": 0.2866609990596771, "rewards/rejected": -0.43658238649368286, "step": 280 }, { "epoch": 0.06, "learning_rate": 2.7619047619047625e-06, "logits/chosen": -1.7803318500518799, "logits/rejected": -1.1618740558624268, "logps/chosen": -521.2305908203125, "logps/rejected": -1099.5322265625, "loss": 0.0655, "rewards/accuracies": 0.875, "rewards/chosen": -0.09962662309408188, "rewards/margins": 0.25189143419265747, "rewards/rejected": -0.35151809453964233, "step": 290 }, { "epoch": 0.06, "learning_rate": 2.8571428571428573e-06, "logits/chosen": -1.6528995037078857, "logits/rejected": -1.113143801689148, "logps/chosen": -613.2098999023438, "logps/rejected": -1197.0404052734375, "loss": 0.0966, "rewards/accuracies": 0.75, "rewards/chosen": -0.1718660593032837, "rewards/margins": 0.22291307151317596, "rewards/rejected": -0.39477914571762085, "step": 300 }, { "epoch": 0.06, "learning_rate": 2.9523809523809525e-06, "logits/chosen": -1.746258020401001, "logits/rejected": -1.273449182510376, "logps/chosen": -666.4026489257812, "logps/rejected": -1100.668212890625, "loss": 0.1579, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.18353891372680664, "rewards/margins": 0.17098166048526764, "rewards/rejected": -0.3545205295085907, "step": 310 }, { "epoch": 0.06, "learning_rate": 3.047619047619048e-06, "logits/chosen": -1.8009140491485596, "logits/rejected": -0.9418787956237793, "logps/chosen": -602.3572387695312, "logps/rejected": -1305.9537353515625, "loss": 0.059, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09984590113162994, "rewards/margins": 0.3039381206035614, "rewards/rejected": -0.40378403663635254, "step": 320 }, { "epoch": 0.06, "learning_rate": 3.142857142857143e-06, "logits/chosen": -1.5505433082580566, "logits/rejected": -1.3617745637893677, "logps/chosen": -469.3348083496094, "logps/rejected": -1237.8416748046875, "loss": 0.0744, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10851670801639557, "rewards/margins": 0.29302558302879333, "rewards/rejected": -0.4015422761440277, "step": 330 }, { "epoch": 0.06, "learning_rate": 3.2380952380952385e-06, "logits/chosen": -1.691335678100586, "logits/rejected": -1.183778166770935, "logps/chosen": -566.4237060546875, "logps/rejected": -1063.508544921875, "loss": 0.081, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13905121386051178, "rewards/margins": 0.18825221061706543, "rewards/rejected": -0.3273034691810608, "step": 340 }, { "epoch": 0.07, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -1.6669670343399048, "logits/rejected": -1.1208057403564453, "logps/chosen": -542.494384765625, "logps/rejected": -1139.1849365234375, "loss": 0.09, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13446304202079773, "rewards/margins": 0.23915879428386688, "rewards/rejected": -0.3736218512058258, "step": 350 }, { "epoch": 0.07, "learning_rate": 3.428571428571429e-06, "logits/chosen": -1.7811037302017212, "logits/rejected": -1.2279610633850098, "logps/chosen": -610.7708129882812, "logps/rejected": -1237.0147705078125, "loss": 0.0792, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15628622472286224, "rewards/margins": 0.25117939710617065, "rewards/rejected": -0.4074656069278717, "step": 360 }, { "epoch": 0.07, "learning_rate": 3.523809523809524e-06, "logits/chosen": -1.7308986186981201, "logits/rejected": -1.027928113937378, "logps/chosen": -556.2271728515625, "logps/rejected": -1258.7716064453125, "loss": 0.0658, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.10915093123912811, "rewards/margins": 0.3074415326118469, "rewards/rejected": -0.4165925085544586, "step": 370 }, { "epoch": 0.07, "learning_rate": 3.6190476190476194e-06, "logits/chosen": -1.7067712545394897, "logits/rejected": -1.2722686529159546, "logps/chosen": -495.68060302734375, "logps/rejected": -1135.3636474609375, "loss": 0.1078, "rewards/accuracies": 0.75, "rewards/chosen": -0.09428682923316956, "rewards/margins": 0.24129095673561096, "rewards/rejected": -0.33557775616645813, "step": 380 }, { "epoch": 0.07, "learning_rate": 3.7142857142857146e-06, "logits/chosen": -1.8631842136383057, "logits/rejected": -1.3056199550628662, "logps/chosen": -689.7511596679688, "logps/rejected": -1181.241455078125, "loss": 0.0914, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2153368443250656, "rewards/margins": 0.2260267436504364, "rewards/rejected": -0.4413636326789856, "step": 390 }, { "epoch": 0.08, "learning_rate": 3.80952380952381e-06, "logits/chosen": -1.782406210899353, "logits/rejected": -1.061374545097351, "logps/chosen": -587.0792236328125, "logps/rejected": -1274.716796875, "loss": 0.0899, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.16100212931632996, "rewards/margins": 0.29258546233177185, "rewards/rejected": -0.45358753204345703, "step": 400 }, { "epoch": 0.08, "learning_rate": 3.9047619047619055e-06, "logits/chosen": -1.5641822814941406, "logits/rejected": -1.193485975265503, "logps/chosen": -569.2401733398438, "logps/rejected": -1220.4403076171875, "loss": 0.0666, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.15877744555473328, "rewards/margins": 0.2378130853176117, "rewards/rejected": -0.39659056067466736, "step": 410 }, { "epoch": 0.08, "learning_rate": 4.000000000000001e-06, "logits/chosen": -1.828481674194336, "logits/rejected": -0.8795900344848633, "logps/chosen": -746.7064208984375, "logps/rejected": -1278.959716796875, "loss": 0.0972, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2285885065793991, "rewards/margins": 0.2384357899427414, "rewards/rejected": -0.4670243263244629, "step": 420 }, { "epoch": 0.08, "learning_rate": 4.095238095238096e-06, "logits/chosen": -1.5313599109649658, "logits/rejected": -0.8583769798278809, "logps/chosen": -704.6945190429688, "logps/rejected": -1280.5216064453125, "loss": 0.0709, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.18623504042625427, "rewards/margins": 0.2717072367668152, "rewards/rejected": -0.45794230699539185, "step": 430 }, { "epoch": 0.08, "learning_rate": 4.190476190476191e-06, "logits/chosen": -1.716683030128479, "logits/rejected": -1.3741168975830078, "logps/chosen": -499.2115173339844, "logps/rejected": -1127.7908935546875, "loss": 0.0951, "rewards/accuracies": 0.75, "rewards/chosen": -0.10336490720510483, "rewards/margins": 0.2679286599159241, "rewards/rejected": -0.3712936043739319, "step": 440 }, { "epoch": 0.09, "learning_rate": 4.2857142857142855e-06, "logits/chosen": -1.720665693283081, "logits/rejected": -1.3028696775436401, "logps/chosen": -550.9224243164062, "logps/rejected": -1267.247802734375, "loss": 0.0647, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11992333084344864, "rewards/margins": 0.24694469571113586, "rewards/rejected": -0.3668680191040039, "step": 450 }, { "epoch": 0.09, "learning_rate": 4.3809523809523815e-06, "logits/chosen": -1.8857879638671875, "logits/rejected": -1.194883108139038, "logps/chosen": -605.60009765625, "logps/rejected": -1175.531494140625, "loss": 0.0844, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.13926714658737183, "rewards/margins": 0.2119566947221756, "rewards/rejected": -0.3512238562107086, "step": 460 }, { "epoch": 0.09, "learning_rate": 4.476190476190477e-06, "logits/chosen": -1.8993419408798218, "logits/rejected": -1.109785795211792, "logps/chosen": -661.8026733398438, "logps/rejected": -1132.317138671875, "loss": 0.0918, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12035910040140152, "rewards/margins": 0.21994027495384216, "rewards/rejected": -0.3402993679046631, "step": 470 }, { "epoch": 0.09, "learning_rate": 4.571428571428572e-06, "logits/chosen": -1.7157402038574219, "logits/rejected": -1.2350364923477173, "logps/chosen": -580.3271484375, "logps/rejected": -1264.6790771484375, "loss": 0.0755, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12759478390216827, "rewards/margins": 0.2707314193248749, "rewards/rejected": -0.39832618832588196, "step": 480 }, { "epoch": 0.09, "learning_rate": 4.666666666666667e-06, "logits/chosen": -1.7638877630233765, "logits/rejected": -1.207639455795288, "logps/chosen": -577.3447265625, "logps/rejected": -1109.933349609375, "loss": 0.0964, "rewards/accuracies": 0.75, "rewards/chosen": -0.11431191861629486, "rewards/margins": 0.21803171932697296, "rewards/rejected": -0.3323436379432678, "step": 490 }, { "epoch": 0.1, "learning_rate": 4.761904761904762e-06, "logits/chosen": -1.8138492107391357, "logits/rejected": -1.0965001583099365, "logps/chosen": -514.5714111328125, "logps/rejected": -1028.302001953125, "loss": 0.0831, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09624107927083969, "rewards/margins": 0.2407519370317459, "rewards/rejected": -0.336993008852005, "step": 500 }, { "epoch": 0.1, "learning_rate": 4.857142857142858e-06, "logits/chosen": -1.8310391902923584, "logits/rejected": -1.2400341033935547, "logps/chosen": -746.3948974609375, "logps/rejected": -1365.250732421875, "loss": 0.1075, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3059869408607483, "rewards/margins": 0.22657544910907745, "rewards/rejected": -0.5325623750686646, "step": 510 }, { "epoch": 0.1, "learning_rate": 4.952380952380953e-06, "logits/chosen": -1.9007635116577148, "logits/rejected": -1.0593281984329224, "logps/chosen": -776.7916259765625, "logps/rejected": -1380.4420166015625, "loss": 0.0684, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.275787889957428, "rewards/margins": 0.271630197763443, "rewards/rejected": -0.5474181175231934, "step": 520 }, { "epoch": 0.1, "learning_rate": 4.999986185163754e-06, "logits/chosen": -1.52447509765625, "logits/rejected": -0.8935421705245972, "logps/chosen": -769.381103515625, "logps/rejected": -1351.2783203125, "loss": 0.0936, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2779494822025299, "rewards/margins": 0.24540336430072784, "rewards/rejected": -0.5233529210090637, "step": 530 }, { "epoch": 0.1, "learning_rate": 4.999875667389858e-06, "logits/chosen": -1.5643224716186523, "logits/rejected": -0.9529625773429871, "logps/chosen": -711.6848754882812, "logps/rejected": -1355.2978515625, "loss": 0.0916, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.19870035350322723, "rewards/margins": 0.30797114968299866, "rewards/rejected": -0.5066714882850647, "step": 540 }, { "epoch": 0.1, "learning_rate": 4.999654636727765e-06, "logits/chosen": -1.834673285484314, "logits/rejected": -0.9252250790596008, "logps/chosen": -665.4678955078125, "logps/rejected": -1259.138916015625, "loss": 0.0857, "rewards/accuracies": 0.875, "rewards/chosen": -0.13571450114250183, "rewards/margins": 0.24520739912986755, "rewards/rejected": -0.3809219300746918, "step": 550 }, { "epoch": 0.11, "learning_rate": 4.999323102948655e-06, "logits/chosen": -1.6249576807022095, "logits/rejected": -0.773546576499939, "logps/chosen": -658.3365478515625, "logps/rejected": -1433.79443359375, "loss": 0.0534, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.19249185919761658, "rewards/margins": 0.33588099479675293, "rewards/rejected": -0.5283728837966919, "step": 560 }, { "epoch": 0.11, "learning_rate": 4.998881080708759e-06, "logits/chosen": -1.996830701828003, "logits/rejected": -1.0194107294082642, "logps/chosen": -766.8067626953125, "logps/rejected": -1454.341552734375, "loss": 0.054, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.21933317184448242, "rewards/margins": 0.32533079385757446, "rewards/rejected": -0.5446639657020569, "step": 570 }, { "epoch": 0.11, "learning_rate": 4.998328589548711e-06, "logits/chosen": -1.7593481540679932, "logits/rejected": -1.2708923816680908, "logps/chosen": -606.2332763671875, "logps/rejected": -1212.849365234375, "loss": 0.0793, "rewards/accuracies": 0.875, "rewards/chosen": -0.14150679111480713, "rewards/margins": 0.25646698474884033, "rewards/rejected": -0.39797380566596985, "step": 580 }, { "epoch": 0.11, "learning_rate": 4.997665653892682e-06, "logits/chosen": -1.8112876415252686, "logits/rejected": -1.1319668292999268, "logps/chosen": -566.7156982421875, "logps/rejected": -1126.965087890625, "loss": 0.1102, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1547660082578659, "rewards/margins": 0.23159603774547577, "rewards/rejected": -0.3863620460033417, "step": 590 }, { "epoch": 0.11, "learning_rate": 4.996892303047306e-06, "logits/chosen": -1.696459174156189, "logits/rejected": -1.433434009552002, "logps/chosen": -578.3829345703125, "logps/rejected": -1214.95654296875, "loss": 0.0854, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14466938376426697, "rewards/margins": 0.26130378246307373, "rewards/rejected": -0.4059731364250183, "step": 600 }, { "epoch": 0.12, "learning_rate": 4.996008571200375e-06, "logits/chosen": -1.578669786453247, "logits/rejected": -1.2551274299621582, "logps/chosen": -541.39208984375, "logps/rejected": -1256.5968017578125, "loss": 0.1037, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15888190269470215, "rewards/margins": 0.25673192739486694, "rewards/rejected": -0.4156138300895691, "step": 610 }, { "epoch": 0.12, "learning_rate": 4.995014497419336e-06, "logits/chosen": -1.7449144124984741, "logits/rejected": -1.0009429454803467, "logps/chosen": -669.4453735351562, "logps/rejected": -1329.12744140625, "loss": 0.095, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.16107605397701263, "rewards/margins": 0.2727533280849457, "rewards/rejected": -0.4338293671607971, "step": 620 }, { "epoch": 0.12, "learning_rate": 4.993910125649561e-06, "logits/chosen": -1.494553804397583, "logits/rejected": -0.9566832780838013, "logps/chosen": -674.8986206054688, "logps/rejected": -1222.6563720703125, "loss": 0.1148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.19383887946605682, "rewards/margins": 0.21913234889507294, "rewards/rejected": -0.41297125816345215, "step": 630 }, { "epoch": 0.12, "learning_rate": 4.992695504712402e-06, "logits/chosen": -1.4821733236312866, "logits/rejected": -0.8176189661026001, "logps/chosen": -528.9064331054688, "logps/rejected": -1169.622314453125, "loss": 0.076, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1681780219078064, "rewards/margins": 0.26834970712661743, "rewards/rejected": -0.43652772903442383, "step": 640 }, { "epoch": 0.12, "learning_rate": 4.9913706883030385e-06, "logits/chosen": -1.5352599620819092, "logits/rejected": -1.108190894126892, "logps/chosen": -747.9698486328125, "logps/rejected": -1408.2620849609375, "loss": 0.0947, "rewards/accuracies": 0.875, "rewards/chosen": -0.24020352959632874, "rewards/margins": 0.2763820290565491, "rewards/rejected": -0.5165855288505554, "step": 650 }, { "epoch": 0.13, "learning_rate": 4.989935734988098e-06, "logits/chosen": -1.8327178955078125, "logits/rejected": -1.1306883096694946, "logps/chosen": -656.5528564453125, "logps/rejected": -1185.5146484375, "loss": 0.1075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2202688455581665, "rewards/margins": 0.25351718068122864, "rewards/rejected": -0.47378596663475037, "step": 660 }, { "epoch": 0.13, "learning_rate": 4.988390708203068e-06, "logits/chosen": -1.4716339111328125, "logits/rejected": -0.9073241353034973, "logps/chosen": -670.7069091796875, "logps/rejected": -1161.494384765625, "loss": 0.1167, "rewards/accuracies": 0.75, "rewards/chosen": -0.175540030002594, "rewards/margins": 0.22774294018745422, "rewards/rejected": -0.4032829701900482, "step": 670 }, { "epoch": 0.13, "learning_rate": 4.9867356762494955e-06, "logits/chosen": -1.644885778427124, "logits/rejected": -0.9986754655838013, "logps/chosen": -596.4325561523438, "logps/rejected": -1208.408447265625, "loss": 0.0909, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11105004698038101, "rewards/margins": 0.2835782468318939, "rewards/rejected": -0.39462828636169434, "step": 680 }, { "epoch": 0.13, "learning_rate": 4.984970712291963e-06, "logits/chosen": -1.9281013011932373, "logits/rejected": -1.0944923162460327, "logps/chosen": -665.7894287109375, "logps/rejected": -1237.457275390625, "loss": 0.0685, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1318742334842682, "rewards/margins": 0.2946074903011322, "rewards/rejected": -0.4264817237854004, "step": 690 }, { "epoch": 0.13, "learning_rate": 4.983095894354858e-06, "logits/chosen": -1.8215450048446655, "logits/rejected": -1.1661744117736816, "logps/chosen": -684.5850219726562, "logps/rejected": -1356.67919921875, "loss": 0.0862, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2371421754360199, "rewards/margins": 0.28387588262557983, "rewards/rejected": -0.5210181474685669, "step": 700 }, { "epoch": 0.14, "learning_rate": 4.981111305318918e-06, "logits/chosen": -1.8345987796783447, "logits/rejected": -0.9164802432060242, "logps/chosen": -689.55810546875, "logps/rejected": -1234.62890625, "loss": 0.0965, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2495756447315216, "rewards/margins": 0.27059414982795715, "rewards/rejected": -0.5201697945594788, "step": 710 }, { "epoch": 0.14, "learning_rate": 4.979017032917576e-06, "logits/chosen": -1.808998465538025, "logits/rejected": -1.2228367328643799, "logps/chosen": -628.9651489257812, "logps/rejected": -1082.283935546875, "loss": 0.1377, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.18333646655082703, "rewards/margins": 0.1819809228181839, "rewards/rejected": -0.3653174042701721, "step": 720 }, { "epoch": 0.14, "learning_rate": 4.97681316973307e-06, "logits/chosen": -1.6953952312469482, "logits/rejected": -1.284063458442688, "logps/chosen": -554.18896484375, "logps/rejected": -1180.788330078125, "loss": 0.0847, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09495721012353897, "rewards/margins": 0.2516638934612274, "rewards/rejected": -0.3466210961341858, "step": 730 }, { "epoch": 0.14, "learning_rate": 4.9744998131923625e-06, "logits/chosen": -1.8052667379379272, "logits/rejected": -1.09946870803833, "logps/chosen": -620.55517578125, "logps/rejected": -1198.6644287109375, "loss": 0.08, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09908594191074371, "rewards/margins": 0.27245354652404785, "rewards/rejected": -0.37153950333595276, "step": 740 }, { "epoch": 0.14, "learning_rate": 4.9720770655628216e-06, "logits/chosen": -1.8670793771743774, "logits/rejected": -1.3132587671279907, "logps/chosen": -594.9564208984375, "logps/rejected": -1287.123291015625, "loss": 0.0757, "rewards/accuracies": 0.875, "rewards/chosen": -0.14726175367832184, "rewards/margins": 0.29822516441345215, "rewards/rejected": -0.44548696279525757, "step": 750 }, { "epoch": 0.14, "learning_rate": 4.969545033947711e-06, "logits/chosen": -1.7041022777557373, "logits/rejected": -1.1802165508270264, "logps/chosen": -660.1702270507812, "logps/rejected": -1215.850830078125, "loss": 0.1037, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.22151121497154236, "rewards/margins": 0.22661535441875458, "rewards/rejected": -0.44812655448913574, "step": 760 }, { "epoch": 0.15, "learning_rate": 4.966903830281449e-06, "logits/chosen": -1.8590688705444336, "logits/rejected": -1.039477825164795, "logps/chosen": -683.9407348632812, "logps/rejected": -1311.1129150390625, "loss": 0.0793, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1789616197347641, "rewards/margins": 0.28517478704452515, "rewards/rejected": -0.46413642168045044, "step": 770 }, { "epoch": 0.15, "learning_rate": 4.964153571324658e-06, "logits/chosen": -1.400686502456665, "logits/rejected": -0.961429238319397, "logps/chosen": -567.7184448242188, "logps/rejected": -1213.0838623046875, "loss": 0.0663, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13110116124153137, "rewards/margins": 0.28832724690437317, "rewards/rejected": -0.41942834854125977, "step": 780 }, { "epoch": 0.15, "learning_rate": 4.96129437865901e-06, "logits/chosen": -1.7145271301269531, "logits/rejected": -1.0960595607757568, "logps/chosen": -660.9743041992188, "logps/rejected": -1443.8707275390625, "loss": 0.053, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.20914039015769958, "rewards/margins": 0.3439396321773529, "rewards/rejected": -0.5530800819396973, "step": 790 }, { "epoch": 0.15, "learning_rate": 4.958326378681849e-06, "logits/chosen": -1.4642733335494995, "logits/rejected": -0.7573977708816528, "logps/chosen": -715.9859619140625, "logps/rejected": -1360.409912109375, "loss": 0.0773, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2055422067642212, "rewards/margins": 0.2881919741630554, "rewards/rejected": -0.493734210729599, "step": 800 }, { "epoch": 0.15, "learning_rate": 4.955249702600598e-06, "logits/chosen": -1.6208610534667969, "logits/rejected": -0.9218361973762512, "logps/chosen": -623.2794799804688, "logps/rejected": -1308.27294921875, "loss": 0.0591, "rewards/accuracies": 0.875, "rewards/chosen": -0.1836785078048706, "rewards/margins": 0.28122177720069885, "rewards/rejected": -0.46490031480789185, "step": 810 }, { "epoch": 0.16, "learning_rate": 4.952064486426965e-06, "logits/chosen": -1.536716341972351, "logits/rejected": -1.1593210697174072, "logps/chosen": -590.9392700195312, "logps/rejected": -1318.531005859375, "loss": 0.0776, "rewards/accuracies": 0.875, "rewards/chosen": -0.1551203429698944, "rewards/margins": 0.28595298528671265, "rewards/rejected": -0.44107332825660706, "step": 820 }, { "epoch": 0.16, "learning_rate": 4.948770870970929e-06, "logits/chosen": -1.698622465133667, "logits/rejected": -1.0573958158493042, "logps/chosen": -677.7445068359375, "logps/rejected": -1307.689453125, "loss": 0.06, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.20002396404743195, "rewards/margins": 0.27892419695854187, "rewards/rejected": -0.47894811630249023, "step": 830 }, { "epoch": 0.16, "learning_rate": 4.9453690018345144e-06, "logits/chosen": -1.729888677597046, "logits/rejected": -1.1597099304199219, "logps/chosen": -571.38818359375, "logps/rejected": -1457.5614013671875, "loss": 0.052, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.15872427821159363, "rewards/margins": 0.3792124390602112, "rewards/rejected": -0.5379367470741272, "step": 840 }, { "epoch": 0.16, "learning_rate": 4.941859029405354e-06, "logits/chosen": -1.7805073261260986, "logits/rejected": -1.0309171676635742, "logps/chosen": -588.3792724609375, "logps/rejected": -1260.41357421875, "loss": 0.0693, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13170531392097473, "rewards/margins": 0.2891574501991272, "rewards/rejected": -0.42086273431777954, "step": 850 }, { "epoch": 0.16, "learning_rate": 4.938241108850039e-06, "logits/chosen": -1.6909992694854736, "logits/rejected": -1.0959560871124268, "logps/chosen": -590.9859619140625, "logps/rejected": -1218.2630615234375, "loss": 0.0891, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.11560378968715668, "rewards/margins": 0.2818681001663208, "rewards/rejected": -0.3974718451499939, "step": 860 }, { "epoch": 0.17, "learning_rate": 4.934515400107266e-06, "logits/chosen": -1.5700453519821167, "logits/rejected": -0.813576340675354, "logps/chosen": -640.3110961914062, "logps/rejected": -1193.339599609375, "loss": 0.0861, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11941035836935043, "rewards/margins": 0.25531017780303955, "rewards/rejected": -0.3747205436229706, "step": 870 }, { "epoch": 0.17, "learning_rate": 4.930682067880759e-06, "logits/chosen": -1.4913710355758667, "logits/rejected": -1.155098557472229, "logps/chosen": -568.5694580078125, "logps/rejected": -1217.7532958984375, "loss": 0.0765, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15131355822086334, "rewards/margins": 0.26503461599349976, "rewards/rejected": -0.4163481593132019, "step": 880 }, { "epoch": 0.17, "learning_rate": 4.926741281631991e-06, "logits/chosen": -1.4744211435317993, "logits/rejected": -0.8086342811584473, "logps/chosen": -660.7673950195312, "logps/rejected": -1496.2003173828125, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": -0.19947215914726257, "rewards/margins": 0.3620762526988983, "rewards/rejected": -0.5615484714508057, "step": 890 }, { "epoch": 0.17, "learning_rate": 4.922693215572695e-06, "logits/chosen": -1.7864484786987305, "logits/rejected": -1.1300532817840576, "logps/chosen": -627.9884643554688, "logps/rejected": -1298.03076171875, "loss": 0.0583, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.15097679197788239, "rewards/margins": 0.30441802740097046, "rewards/rejected": -0.45539480447769165, "step": 900 }, { "epoch": 0.17, "learning_rate": 4.91853804865716e-06, "logits/chosen": -1.5407073497772217, "logits/rejected": -1.0573087930679321, "logps/chosen": -626.0189208984375, "logps/rejected": -1293.064697265625, "loss": 0.0591, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.18492639064788818, "rewards/margins": 0.2943907380104065, "rewards/rejected": -0.4793171286582947, "step": 910 }, { "epoch": 0.18, "learning_rate": 4.91427596457432e-06, "logits/chosen": -1.7004728317260742, "logits/rejected": -1.099687099456787, "logps/chosen": -651.4698486328125, "logps/rejected": -1385.514892578125, "loss": 0.1016, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.19086621701717377, "rewards/margins": 0.3267431855201721, "rewards/rejected": -0.5176093578338623, "step": 920 }, { "epoch": 0.18, "learning_rate": 4.909907151739634e-06, "logits/chosen": -1.6458594799041748, "logits/rejected": -1.0120216608047485, "logps/chosen": -739.9207763671875, "logps/rejected": -1332.773681640625, "loss": 0.0612, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.21796786785125732, "rewards/margins": 0.27975529432296753, "rewards/rejected": -0.4977231025695801, "step": 930 }, { "epoch": 0.18, "learning_rate": 4.905431803286756e-06, "logits/chosen": -1.6875731945037842, "logits/rejected": -1.1233506202697754, "logps/chosen": -620.7035522460938, "logps/rejected": -1154.03125, "loss": 0.069, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14572550356388092, "rewards/margins": 0.25223031640052795, "rewards/rejected": -0.3979558050632477, "step": 940 }, { "epoch": 0.18, "learning_rate": 4.900850117059e-06, "logits/chosen": -1.7308450937271118, "logits/rejected": -1.3464564085006714, "logps/chosen": -573.27587890625, "logps/rejected": -1219.830810546875, "loss": 0.0703, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.10996156930923462, "rewards/margins": 0.28513264656066895, "rewards/rejected": -0.39509421586990356, "step": 950 }, { "epoch": 0.18, "learning_rate": 4.8961622956005895e-06, "logits/chosen": -1.813080072402954, "logits/rejected": -1.3277075290679932, "logps/chosen": -530.7855224609375, "logps/rejected": -1027.4674072265625, "loss": 0.1403, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08806713670492172, "rewards/margins": 0.2371738851070404, "rewards/rejected": -0.3252410292625427, "step": 960 }, { "epoch": 0.18, "learning_rate": 4.891368546147707e-06, "logits/chosen": -1.595203161239624, "logits/rejected": -1.1859229803085327, "logps/chosen": -552.8775634765625, "logps/rejected": -1224.351318359375, "loss": 0.0807, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12314770370721817, "rewards/margins": 0.304676353931427, "rewards/rejected": -0.42782407999038696, "step": 970 }, { "epoch": 0.19, "learning_rate": 4.88646908061933e-06, "logits/chosen": -1.6990375518798828, "logits/rejected": -0.8397040367126465, "logps/chosen": -629.1605224609375, "logps/rejected": -1143.335693359375, "loss": 0.077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.17983214557170868, "rewards/margins": 0.2676909565925598, "rewards/rejected": -0.4475231170654297, "step": 980 }, { "epoch": 0.19, "learning_rate": 4.881464115607866e-06, "logits/chosen": -1.6023046970367432, "logits/rejected": -0.7092984318733215, "logps/chosen": -804.279296875, "logps/rejected": -1217.7445068359375, "loss": 0.1267, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2759309411048889, "rewards/margins": 0.2225939780473709, "rewards/rejected": -0.49852484464645386, "step": 990 }, { "epoch": 0.19, "learning_rate": 4.876353872369573e-06, "logits/chosen": -1.6359355449676514, "logits/rejected": -0.9567705392837524, "logps/chosen": -709.3012084960938, "logps/rejected": -1238.928955078125, "loss": 0.0841, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2836172878742218, "rewards/margins": 0.24482114613056183, "rewards/rejected": -0.5284383296966553, "step": 1000 }, { "epoch": 0.19, "learning_rate": 4.871138576814782e-06, "logits/chosen": -1.6317775249481201, "logits/rejected": -0.8598468899726868, "logps/chosen": -672.0849609375, "logps/rejected": -1243.135009765625, "loss": 0.069, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.24557438492774963, "rewards/margins": 0.3051298260688782, "rewards/rejected": -0.5507042407989502, "step": 1010 }, { "epoch": 0.19, "learning_rate": 4.865818459497911e-06, "logits/chosen": -1.388197898864746, "logits/rejected": -0.9666770696640015, "logps/chosen": -787.582763671875, "logps/rejected": -1298.2857666015625, "loss": 0.117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.28274062275886536, "rewards/margins": 0.24993351101875305, "rewards/rejected": -0.5326741337776184, "step": 1020 }, { "epoch": 0.2, "learning_rate": 4.860393755607266e-06, "logits/chosen": -1.4828870296478271, "logits/rejected": -0.9877294301986694, "logps/chosen": -555.7489013671875, "logps/rejected": -1203.155517578125, "loss": 0.0766, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10617247968912125, "rewards/margins": 0.2726615071296692, "rewards/rejected": -0.37883394956588745, "step": 1030 }, { "epoch": 0.2, "learning_rate": 4.854864704954654e-06, "logits/chosen": -1.5119549036026, "logits/rejected": -1.1713879108428955, "logps/chosen": -597.2371215820312, "logps/rejected": -1206.311279296875, "loss": 0.071, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11909804493188858, "rewards/margins": 0.23684187233448029, "rewards/rejected": -0.35593992471694946, "step": 1040 }, { "epoch": 0.2, "learning_rate": 4.849231551964771e-06, "logits/chosen": -1.4709835052490234, "logits/rejected": -1.1632763147354126, "logps/chosen": -424.6221618652344, "logps/rejected": -979.3152465820312, "loss": 0.1164, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08865557610988617, "rewards/margins": 0.20411546528339386, "rewards/rejected": -0.29277104139328003, "step": 1050 }, { "epoch": 0.2, "learning_rate": 4.843494545664407e-06, "logits/chosen": -1.7648117542266846, "logits/rejected": -1.3866671323776245, "logps/chosen": -586.7568969726562, "logps/rejected": -1174.9310302734375, "loss": 0.1011, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.14828075468540192, "rewards/margins": 0.24527840316295624, "rewards/rejected": -0.39355915784835815, "step": 1060 }, { "epoch": 0.2, "learning_rate": 4.837653939671427e-06, "logits/chosen": -1.6241607666015625, "logits/rejected": -0.9201037287712097, "logps/chosen": -670.8914794921875, "logps/rejected": -1177.223388671875, "loss": 0.081, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12418527901172638, "rewards/margins": 0.2541216015815735, "rewards/rejected": -0.37830686569213867, "step": 1070 }, { "epoch": 0.21, "learning_rate": 4.8317099921835695e-06, "logits/chosen": -1.8830773830413818, "logits/rejected": -1.1888630390167236, "logps/chosen": -533.8602905273438, "logps/rejected": -1146.5938720703125, "loss": 0.0695, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.06473751366138458, "rewards/margins": 0.30346792936325073, "rewards/rejected": -0.3682054281234741, "step": 1080 }, { "epoch": 0.21, "learning_rate": 4.825662965967023e-06, "logits/chosen": -1.9320052862167358, "logits/rejected": -1.1186505556106567, "logps/chosen": -634.6085205078125, "logps/rejected": -1212.7740478515625, "loss": 0.0585, "rewards/accuracies": 0.875, "rewards/chosen": -0.10977531969547272, "rewards/margins": 0.29483669996261597, "rewards/rejected": -0.40461206436157227, "step": 1090 }, { "epoch": 0.21, "learning_rate": 4.819513128344814e-06, "logits/chosen": -1.7645708322525024, "logits/rejected": -1.1508641242980957, "logps/chosen": -611.5857543945312, "logps/rejected": -1242.5738525390625, "loss": 0.0742, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12029217183589935, "rewards/margins": 0.2845746874809265, "rewards/rejected": -0.4048668444156647, "step": 1100 }, { "epoch": 0.21, "learning_rate": 4.813260751184992e-06, "logits/chosen": -1.7668542861938477, "logits/rejected": -1.070345163345337, "logps/chosen": -608.2420654296875, "logps/rejected": -1294.886962890625, "loss": 0.0723, "rewards/accuracies": 0.875, "rewards/chosen": -0.10366489738225937, "rewards/margins": 0.3099459111690521, "rewards/rejected": -0.4136108458042145, "step": 1110 }, { "epoch": 0.21, "learning_rate": 4.806906110888606e-06, "logits/chosen": -1.5412280559539795, "logits/rejected": -0.9960755109786987, "logps/chosen": -532.5230712890625, "logps/rejected": -1119.4359130859375, "loss": 0.0574, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1133192628622055, "rewards/margins": 0.2607772648334503, "rewards/rejected": -0.374096542596817, "step": 1120 }, { "epoch": 0.22, "learning_rate": 4.8004494883774885e-06, "logits/chosen": -1.9325168132781982, "logits/rejected": -1.2252503633499146, "logps/chosen": -583.9013061523438, "logps/rejected": -1194.834716796875, "loss": 0.0702, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.062377817928791046, "rewards/margins": 0.28015127778053284, "rewards/rejected": -0.3425291180610657, "step": 1130 }, { "epoch": 0.22, "learning_rate": 4.793891169081835e-06, "logits/chosen": -1.8249927759170532, "logits/rejected": -1.0183128118515015, "logps/chosen": -555.833984375, "logps/rejected": -1278.7852783203125, "loss": 0.0369, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.036963075399398804, "rewards/margins": 0.34940817952156067, "rewards/rejected": -0.38637128472328186, "step": 1140 }, { "epoch": 0.22, "learning_rate": 4.787231442927587e-06, "logits/chosen": -1.399440050125122, "logits/rejected": -1.2327266931533813, "logps/chosen": -471.8321228027344, "logps/rejected": -1072.462646484375, "loss": 0.12, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10728304088115692, "rewards/margins": 0.23449170589447021, "rewards/rejected": -0.34177470207214355, "step": 1150 }, { "epoch": 0.22, "learning_rate": 4.780470604323616e-06, "logits/chosen": -1.7676588296890259, "logits/rejected": -1.3527311086654663, "logps/chosen": -471.47772216796875, "logps/rejected": -1098.208984375, "loss": 0.0938, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08965645730495453, "rewards/margins": 0.2666322588920593, "rewards/rejected": -0.35628873109817505, "step": 1160 }, { "epoch": 0.22, "learning_rate": 4.773608952148706e-06, "logits/chosen": -1.6790664196014404, "logits/rejected": -1.1355869770050049, "logps/chosen": -555.5061645507812, "logps/rejected": -1149.15234375, "loss": 0.0837, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.12319140136241913, "rewards/margins": 0.26579806208610535, "rewards/rejected": -0.38898950815200806, "step": 1170 }, { "epoch": 0.22, "learning_rate": 4.766646789738342e-06, "logits/chosen": -1.850873351097107, "logits/rejected": -0.993525505065918, "logps/chosen": -642.1211547851562, "logps/rejected": -1213.765869140625, "loss": 0.0629, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.14874321222305298, "rewards/margins": 0.297253280878067, "rewards/rejected": -0.4459964632987976, "step": 1180 }, { "epoch": 0.23, "learning_rate": 4.759584424871302e-06, "logits/chosen": -1.6824783086776733, "logits/rejected": -1.339135766029358, "logps/chosen": -650.1998901367188, "logps/rejected": -1217.5582275390625, "loss": 0.0813, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1805349886417389, "rewards/margins": 0.2520584166049957, "rewards/rejected": -0.43259334564208984, "step": 1190 }, { "epoch": 0.23, "learning_rate": 4.752422169756048e-06, "logits/chosen": -1.4297927618026733, "logits/rejected": -0.9142922163009644, "logps/chosen": -641.1162109375, "logps/rejected": -1092.507080078125, "loss": 0.1013, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.17083248496055603, "rewards/margins": 0.227576345205307, "rewards/rejected": -0.39840883016586304, "step": 1200 }, { "epoch": 0.23, "learning_rate": 4.745160341016927e-06, "logits/chosen": -1.6701313257217407, "logits/rejected": -1.366464376449585, "logps/chosen": -591.3519287109375, "logps/rejected": -1383.019287109375, "loss": 0.042, "rewards/accuracies": 0.875, "rewards/chosen": -0.13523560762405396, "rewards/margins": 0.34861212968826294, "rewards/rejected": -0.4838477075099945, "step": 1210 }, { "epoch": 0.23, "learning_rate": 4.737799259680172e-06, "logits/chosen": -1.6060030460357666, "logits/rejected": -1.1717652082443237, "logps/chosen": -569.553466796875, "logps/rejected": -1148.726806640625, "loss": 0.1077, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08871385455131531, "rewards/margins": 0.2584499716758728, "rewards/rejected": -0.3471638262271881, "step": 1220 }, { "epoch": 0.23, "learning_rate": 4.730339251159709e-06, "logits/chosen": -1.9548372030258179, "logits/rejected": -1.2164928913116455, "logps/chosen": -605.7603759765625, "logps/rejected": -1120.601318359375, "loss": 0.0943, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08201462775468826, "rewards/margins": 0.25623396039009094, "rewards/rejected": -0.3382485806941986, "step": 1230 }, { "epoch": 0.24, "learning_rate": 4.722780645242775e-06, "logits/chosen": -2.015829086303711, "logits/rejected": -1.1882562637329102, "logps/chosen": -563.5271606445312, "logps/rejected": -1133.745361328125, "loss": 0.0827, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11626216024160385, "rewards/margins": 0.25964677333831787, "rewards/rejected": -0.3759089410305023, "step": 1240 }, { "epoch": 0.24, "learning_rate": 4.715123776075337e-06, "logits/chosen": -1.601769208908081, "logits/rejected": -1.0593763589859009, "logps/chosen": -637.7056884765625, "logps/rejected": -1188.92333984375, "loss": 0.0784, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.17665165662765503, "rewards/margins": 0.2602120041847229, "rewards/rejected": -0.43686366081237793, "step": 1250 }, { "epoch": 0.24, "learning_rate": 4.707368982147318e-06, "logits/chosen": -1.898706078529358, "logits/rejected": -1.0478856563568115, "logps/chosen": -661.0333251953125, "logps/rejected": -1348.3873291015625, "loss": 0.0501, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.15230241417884827, "rewards/margins": 0.3299187421798706, "rewards/rejected": -0.4822211265563965, "step": 1260 }, { "epoch": 0.24, "learning_rate": 4.699516606277638e-06, "logits/chosen": -1.6510775089263916, "logits/rejected": -1.0354807376861572, "logps/chosen": -553.8106689453125, "logps/rejected": -1366.97802734375, "loss": 0.0515, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.08891726285219193, "rewards/margins": 0.36568087339401245, "rewards/rejected": -0.45459818840026855, "step": 1270 }, { "epoch": 0.24, "learning_rate": 4.691566995599056e-06, "logits/chosen": -1.5789378881454468, "logits/rejected": -0.8932808041572571, "logps/chosen": -612.9600830078125, "logps/rejected": -1157.551025390625, "loss": 0.0897, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.06621615588665009, "rewards/margins": 0.29278090596199036, "rewards/rejected": -0.35899704694747925, "step": 1280 }, { "epoch": 0.25, "learning_rate": 4.683520501542825e-06, "logits/chosen": -1.8256202936172485, "logits/rejected": -1.200262188911438, "logps/chosen": -573.4430541992188, "logps/rejected": -1288.5570068359375, "loss": 0.058, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.07125236839056015, "rewards/margins": 0.3250887393951416, "rewards/rejected": -0.39634108543395996, "step": 1290 }, { "epoch": 0.25, "learning_rate": 4.675377479823153e-06, "logits/chosen": -1.529521107673645, "logits/rejected": -1.1277577877044678, "logps/chosen": -562.3739013671875, "logps/rejected": -1271.357421875, "loss": 0.089, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14400474727153778, "rewards/margins": 0.28928202390670776, "rewards/rejected": -0.43328675627708435, "step": 1300 }, { "epoch": 0.25, "learning_rate": 4.667138290421483e-06, "logits/chosen": -1.5254117250442505, "logits/rejected": -1.0594402551651, "logps/chosen": -579.7008056640625, "logps/rejected": -1145.4326171875, "loss": 0.1012, "rewards/accuracies": 0.75, "rewards/chosen": -0.18182730674743652, "rewards/margins": 0.2299845665693283, "rewards/rejected": -0.41181182861328125, "step": 1310 }, { "epoch": 0.25, "learning_rate": 4.658803297570578e-06, "logits/chosen": -1.471721887588501, "logits/rejected": -1.128235936164856, "logps/chosen": -643.552734375, "logps/rejected": -1248.0694580078125, "loss": 0.0933, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.18766087293624878, "rewards/margins": 0.2622078061103821, "rewards/rejected": -0.44986867904663086, "step": 1320 }, { "epoch": 0.25, "learning_rate": 4.650372869738415e-06, "logits/chosen": -1.587103247642517, "logits/rejected": -0.9101946949958801, "logps/chosen": -575.8819580078125, "logps/rejected": -1206.3009033203125, "loss": 0.0616, "rewards/accuracies": 0.875, "rewards/chosen": -0.11521546542644501, "rewards/margins": 0.2791483402252197, "rewards/rejected": -0.39436379075050354, "step": 1330 }, { "epoch": 0.26, "learning_rate": 4.641847379611898e-06, "logits/chosen": -1.6752538681030273, "logits/rejected": -1.1627941131591797, "logps/chosen": -563.6756591796875, "logps/rejected": -1274.301025390625, "loss": 0.0582, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.07845500856637955, "rewards/margins": 0.29262250661849976, "rewards/rejected": -0.3710775077342987, "step": 1340 }, { "epoch": 0.26, "learning_rate": 4.633227204080389e-06, "logits/chosen": -1.5359529256820679, "logits/rejected": -1.0807862281799316, "logps/chosen": -550.9948120117188, "logps/rejected": -1217.546142578125, "loss": 0.0588, "rewards/accuracies": 0.875, "rewards/chosen": -0.1211240142583847, "rewards/margins": 0.2859167456626892, "rewards/rejected": -0.4070407450199127, "step": 1350 }, { "epoch": 0.26, "learning_rate": 4.624512724219038e-06, "logits/chosen": -1.829384207725525, "logits/rejected": -1.0859172344207764, "logps/chosen": -672.615966796875, "logps/rejected": -1285.6998291015625, "loss": 0.0727, "rewards/accuracies": 0.875, "rewards/chosen": -0.14164890348911285, "rewards/margins": 0.29436761140823364, "rewards/rejected": -0.4360164999961853, "step": 1360 }, { "epoch": 0.26, "learning_rate": 4.6157043252719374e-06, "logits/chosen": -1.4630229473114014, "logits/rejected": -1.0673094987869263, "logps/chosen": -489.92645263671875, "logps/rejected": -1276.04248046875, "loss": 0.0639, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.08652294427156448, "rewards/margins": 0.35399168729782104, "rewards/rejected": -0.44051462411880493, "step": 1370 }, { "epoch": 0.26, "learning_rate": 4.606802396635098e-06, "logits/chosen": -1.5900306701660156, "logits/rejected": -1.249037504196167, "logps/chosen": -558.1952514648438, "logps/rejected": -1157.880615234375, "loss": 0.0838, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1465294063091278, "rewards/margins": 0.26196879148483276, "rewards/rejected": -0.40849822759628296, "step": 1380 }, { "epoch": 0.26, "learning_rate": 4.597807331839229e-06, "logits/chosen": -2.041189670562744, "logits/rejected": -1.0724743604660034, "logps/chosen": -690.7709350585938, "logps/rejected": -1363.1270751953125, "loss": 0.0571, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1370098888874054, "rewards/margins": 0.33667826652526855, "rewards/rejected": -0.4736880660057068, "step": 1390 }, { "epoch": 0.27, "learning_rate": 4.588719528532342e-06, "logits/chosen": -1.5918890237808228, "logits/rejected": -0.6338850259780884, "logps/chosen": -736.466064453125, "logps/rejected": -1296.2232666015625, "loss": 0.086, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.16654495894908905, "rewards/margins": 0.3148220181465149, "rewards/rejected": -0.48136696219444275, "step": 1400 }, { "epoch": 0.27, "learning_rate": 4.5795393884621735e-06, "logits/chosen": -1.5696302652359009, "logits/rejected": -0.8134934306144714, "logps/chosen": -579.7128295898438, "logps/rejected": -1253.761474609375, "loss": 0.0707, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12203441560268402, "rewards/margins": 0.3168005347251892, "rewards/rejected": -0.43883490562438965, "step": 1410 }, { "epoch": 0.27, "learning_rate": 4.5702673174584236e-06, "logits/chosen": -1.6604053974151611, "logits/rejected": -0.9312642812728882, "logps/chosen": -553.2318115234375, "logps/rejected": -1271.9256591796875, "loss": 0.0582, "rewards/accuracies": 0.875, "rewards/chosen": -0.13113778829574585, "rewards/margins": 0.318024218082428, "rewards/rejected": -0.4491620659828186, "step": 1420 }, { "epoch": 0.27, "learning_rate": 4.560903725414816e-06, "logits/chosen": -1.559654951095581, "logits/rejected": -0.9024654626846313, "logps/chosen": -584.5823974609375, "logps/rejected": -1253.152099609375, "loss": 0.0734, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15965160727500916, "rewards/margins": 0.265408456325531, "rewards/rejected": -0.4250600337982178, "step": 1430 }, { "epoch": 0.27, "learning_rate": 4.551449026270979e-06, "logits/chosen": -1.6541213989257812, "logits/rejected": -1.139235258102417, "logps/chosen": -553.2503051757812, "logps/rejected": -1190.677490234375, "loss": 0.0709, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11237634718418121, "rewards/margins": 0.2757284343242645, "rewards/rejected": -0.38810476660728455, "step": 1440 }, { "epoch": 0.28, "learning_rate": 4.541903637994142e-06, "logits/chosen": -1.9984395503997803, "logits/rejected": -1.2730480432510376, "logps/chosen": -513.3294067382812, "logps/rejected": -1165.566650390625, "loss": 0.071, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05962224677205086, "rewards/margins": 0.3146773874759674, "rewards/rejected": -0.3742996156215668, "step": 1450 }, { "epoch": 0.28, "learning_rate": 4.532267982560662e-06, "logits/chosen": -1.7288652658462524, "logits/rejected": -1.0699656009674072, "logps/chosen": -536.3939208984375, "logps/rejected": -1226.364013671875, "loss": 0.0634, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.05779455229640007, "rewards/margins": 0.32264310121536255, "rewards/rejected": -0.3804376721382141, "step": 1460 }, { "epoch": 0.28, "learning_rate": 4.522542485937369e-06, "logits/chosen": -1.3464348316192627, "logits/rejected": -0.9208024740219116, "logps/chosen": -469.87860107421875, "logps/rejected": -1257.689697265625, "loss": 0.085, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09766849130392075, "rewards/margins": 0.3110795319080353, "rewards/rejected": -0.40874800086021423, "step": 1470 }, { "epoch": 0.28, "learning_rate": 4.512727578062733e-06, "logits/chosen": -1.3829745054244995, "logits/rejected": -0.9307807087898254, "logps/chosen": -621.6275024414062, "logps/rejected": -1248.703369140625, "loss": 0.0948, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13931798934936523, "rewards/margins": 0.27737680077552795, "rewards/rejected": -0.4166947901248932, "step": 1480 }, { "epoch": 0.28, "learning_rate": 4.502823692827859e-06, "logits/chosen": -1.621721863746643, "logits/rejected": -1.0642528533935547, "logps/chosen": -549.4501342773438, "logps/rejected": -1250.82958984375, "loss": 0.0621, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1722426563501358, "rewards/margins": 0.29302817583084106, "rewards/rejected": -0.46527084708213806, "step": 1490 }, { "epoch": 0.29, "learning_rate": 4.492831268057307e-06, "logits/chosen": -1.7337934970855713, "logits/rejected": -1.0224395990371704, "logps/chosen": -634.6095581054688, "logps/rejected": -1207.4476318359375, "loss": 0.0836, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.17793257534503937, "rewards/margins": 0.25644272565841675, "rewards/rejected": -0.4343752861022949, "step": 1500 }, { "epoch": 0.29, "learning_rate": 4.482750745489733e-06, "logits/chosen": -1.4732965230941772, "logits/rejected": -0.9633675813674927, "logps/chosen": -550.6846923828125, "logps/rejected": -1190.68701171875, "loss": 0.0834, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13295753300189972, "rewards/margins": 0.29836997389793396, "rewards/rejected": -0.4313275218009949, "step": 1510 }, { "epoch": 0.29, "learning_rate": 4.472582570758367e-06, "logits/chosen": -1.7081899642944336, "logits/rejected": -0.7727378606796265, "logps/chosen": -666.4199829101562, "logps/rejected": -1332.3919677734375, "loss": 0.0735, "rewards/accuracies": 0.875, "rewards/chosen": -0.15550708770751953, "rewards/margins": 0.30337247252464294, "rewards/rejected": -0.4588795602321625, "step": 1520 }, { "epoch": 0.29, "learning_rate": 4.4623271933713065e-06, "logits/chosen": -1.5421324968338013, "logits/rejected": -1.1563916206359863, "logps/chosen": -582.2352294921875, "logps/rejected": -1248.558837890625, "loss": 0.106, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13999691605567932, "rewards/margins": 0.28287771344184875, "rewards/rejected": -0.4228746294975281, "step": 1530 }, { "epoch": 0.29, "learning_rate": 4.451985066691649e-06, "logits/chosen": -1.4042928218841553, "logits/rejected": -0.6304243803024292, "logps/chosen": -652.6393432617188, "logps/rejected": -1314.00341796875, "loss": 0.0531, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.20784349739551544, "rewards/margins": 0.3043650984764099, "rewards/rejected": -0.5122085809707642, "step": 1540 }, { "epoch": 0.3, "learning_rate": 4.441556647917447e-06, "logits/chosen": -1.6000726222991943, "logits/rejected": -0.8292378187179565, "logps/chosen": -885.3034057617188, "logps/rejected": -1478.057861328125, "loss": 0.0593, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.2784448564052582, "rewards/margins": 0.2961861491203308, "rewards/rejected": -0.5746309757232666, "step": 1550 }, { "epoch": 0.3, "learning_rate": 4.431042398061499e-06, "logits/chosen": -1.3935855627059937, "logits/rejected": -0.8830353021621704, "logps/chosen": -572.7501831054688, "logps/rejected": -1221.902099609375, "loss": 0.0715, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.19357424974441528, "rewards/margins": 0.28787490725517273, "rewards/rejected": -0.4814492166042328, "step": 1560 }, { "epoch": 0.3, "learning_rate": 4.420442781930971e-06, "logits/chosen": -1.596678614616394, "logits/rejected": -0.9144500494003296, "logps/chosen": -583.464111328125, "logps/rejected": -1321.6702880859375, "loss": 0.0503, "rewards/accuracies": 0.875, "rewards/chosen": -0.1923380047082901, "rewards/margins": 0.3100276291370392, "rewards/rejected": -0.5023655891418457, "step": 1570 }, { "epoch": 0.3, "learning_rate": 4.409758268106842e-06, "logits/chosen": -1.463576078414917, "logits/rejected": -0.7617993950843811, "logps/chosen": -594.7489013671875, "logps/rejected": -1261.5989990234375, "loss": 0.0685, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17580083012580872, "rewards/margins": 0.3210605978965759, "rewards/rejected": -0.49686145782470703, "step": 1580 }, { "epoch": 0.3, "learning_rate": 4.398989328923196e-06, "logits/chosen": -1.3283376693725586, "logits/rejected": -0.7213112115859985, "logps/chosen": -590.7708740234375, "logps/rejected": -1166.47314453125, "loss": 0.0792, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.14475390315055847, "rewards/margins": 0.29773521423339844, "rewards/rejected": -0.4424891471862793, "step": 1590 }, { "epoch": 0.3, "learning_rate": 4.388136440446338e-06, "logits/chosen": -1.483410358428955, "logits/rejected": -1.0970790386199951, "logps/chosen": -583.5833740234375, "logps/rejected": -1287.4739990234375, "loss": 0.07, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.156617671251297, "rewards/margins": 0.31275057792663574, "rewards/rejected": -0.4693682789802551, "step": 1600 }, { "epoch": 0.31, "learning_rate": 4.377200082453748e-06, "logits/chosen": -1.7334868907928467, "logits/rejected": -0.9580786824226379, "logps/chosen": -571.3174438476562, "logps/rejected": -1233.8887939453125, "loss": 0.0556, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13076187670230865, "rewards/margins": 0.319111704826355, "rewards/rejected": -0.44987359642982483, "step": 1610 }, { "epoch": 0.31, "learning_rate": 4.366180738412876e-06, "logits/chosen": -1.4784181118011475, "logits/rejected": -0.8439235687255859, "logps/chosen": -716.2511596679688, "logps/rejected": -1182.4652099609375, "loss": 0.1236, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22516557574272156, "rewards/margins": 0.23013241589069366, "rewards/rejected": -0.4552980065345764, "step": 1620 }, { "epoch": 0.31, "learning_rate": 4.355078895459761e-06, "logits/chosen": -1.4879430532455444, "logits/rejected": -0.943886399269104, "logps/chosen": -542.371337890625, "logps/rejected": -1122.3577880859375, "loss": 0.0909, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1614888310432434, "rewards/margins": 0.24517297744750977, "rewards/rejected": -0.40666183829307556, "step": 1630 }, { "epoch": 0.31, "learning_rate": 4.343895044377504e-06, "logits/chosen": -1.4709045886993408, "logits/rejected": -0.8647719621658325, "logps/chosen": -612.67138671875, "logps/rejected": -1236.822998046875, "loss": 0.0837, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13855595886707306, "rewards/margins": 0.28404054045677185, "rewards/rejected": -0.42259645462036133, "step": 1640 }, { "epoch": 0.31, "learning_rate": 4.332629679574566e-06, "logits/chosen": -1.5026487112045288, "logits/rejected": -0.8090435266494751, "logps/chosen": -554.9136962890625, "logps/rejected": -1203.1842041015625, "loss": 0.0815, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11568528413772583, "rewards/margins": 0.3077128231525421, "rewards/rejected": -0.42339807748794556, "step": 1650 }, { "epoch": 0.32, "learning_rate": 4.321283299062916e-06, "logits/chosen": -1.6613775491714478, "logits/rejected": -1.0173249244689941, "logps/chosen": -496.1644592285156, "logps/rejected": -1142.2369384765625, "loss": 0.0793, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11908264458179474, "rewards/margins": 0.31051716208457947, "rewards/rejected": -0.4295998215675354, "step": 1660 }, { "epoch": 0.32, "learning_rate": 4.309856404436013e-06, "logits/chosen": -1.7883861064910889, "logits/rejected": -1.2383636236190796, "logps/chosen": -562.5878295898438, "logps/rejected": -1178.5867919921875, "loss": 0.082, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10906285047531128, "rewards/margins": 0.296166330575943, "rewards/rejected": -0.4052291512489319, "step": 1670 }, { "epoch": 0.32, "learning_rate": 4.2983495008466285e-06, "logits/chosen": -1.5402190685272217, "logits/rejected": -1.1174333095550537, "logps/chosen": -574.9996337890625, "logps/rejected": -1121.726318359375, "loss": 0.083, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.151823490858078, "rewards/margins": 0.23926305770874023, "rewards/rejected": -0.39108654856681824, "step": 1680 }, { "epoch": 0.32, "learning_rate": 4.2867630969845235e-06, "logits/chosen": -1.6920133829116821, "logits/rejected": -0.9446209073066711, "logps/chosen": -681.039794921875, "logps/rejected": -1275.439453125, "loss": 0.0705, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15800414979457855, "rewards/margins": 0.2925296425819397, "rewards/rejected": -0.45053377747535706, "step": 1690 }, { "epoch": 0.32, "learning_rate": 4.275097705053951e-06, "logits/chosen": -1.7884811162948608, "logits/rejected": -1.1299197673797607, "logps/chosen": -559.9410400390625, "logps/rejected": -1295.1170654296875, "loss": 0.0706, "rewards/accuracies": 0.875, "rewards/chosen": -0.12000130116939545, "rewards/margins": 0.3231123387813568, "rewards/rejected": -0.44311365485191345, "step": 1700 }, { "epoch": 0.33, "learning_rate": 4.263353840751023e-06, "logits/chosen": -1.5231735706329346, "logits/rejected": -0.906470775604248, "logps/chosen": -560.7247924804688, "logps/rejected": -1323.56494140625, "loss": 0.0441, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.11194105446338654, "rewards/margins": 0.3386703431606293, "rewards/rejected": -0.450611412525177, "step": 1710 }, { "epoch": 0.33, "learning_rate": 4.251532023240901e-06, "logits/chosen": -1.5177276134490967, "logits/rejected": -1.092954397201538, "logps/chosen": -654.8155517578125, "logps/rejected": -1194.3641357421875, "loss": 0.0935, "rewards/accuracies": 0.75, "rewards/chosen": -0.17031201720237732, "rewards/margins": 0.2336687594652176, "rewards/rejected": -0.4039807915687561, "step": 1720 }, { "epoch": 0.33, "learning_rate": 4.239632775134857e-06, "logits/chosen": -1.595540165901184, "logits/rejected": -0.9981781244277954, "logps/chosen": -609.3163452148438, "logps/rejected": -1126.6153564453125, "loss": 0.0942, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14758938550949097, "rewards/margins": 0.2600153684616089, "rewards/rejected": -0.40760475397109985, "step": 1730 }, { "epoch": 0.33, "learning_rate": 4.227656622467162e-06, "logits/chosen": -1.6388444900512695, "logits/rejected": -1.2317636013031006, "logps/chosen": -632.548828125, "logps/rejected": -1173.0673828125, "loss": 0.0929, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13355585932731628, "rewards/margins": 0.26612335443496704, "rewards/rejected": -0.3996792137622833, "step": 1740 }, { "epoch": 0.33, "learning_rate": 4.215604094671835e-06, "logits/chosen": -1.3533117771148682, "logits/rejected": -0.6486954689025879, "logps/chosen": -734.0782470703125, "logps/rejected": -1259.840576171875, "loss": 0.1028, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.21947124600410461, "rewards/margins": 0.27327650785446167, "rewards/rejected": -0.49274778366088867, "step": 1750 }, { "epoch": 0.34, "learning_rate": 4.203475724559235e-06, "logits/chosen": -1.5916444063186646, "logits/rejected": -1.0552420616149902, "logps/chosen": -757.8905639648438, "logps/rejected": -1302.722900390625, "loss": 0.1028, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2682662308216095, "rewards/margins": 0.2391747236251831, "rewards/rejected": -0.507440984249115, "step": 1760 }, { "epoch": 0.34, "learning_rate": 4.191272048292514e-06, "logits/chosen": -1.5339223146438599, "logits/rejected": -0.9809187650680542, "logps/chosen": -633.8641357421875, "logps/rejected": -1267.8603515625, "loss": 0.0968, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.20641283690929413, "rewards/margins": 0.2913345396518707, "rewards/rejected": -0.49774736166000366, "step": 1770 }, { "epoch": 0.34, "learning_rate": 4.178993605363904e-06, "logits/chosen": -1.763193130493164, "logits/rejected": -1.235198736190796, "logps/chosen": -482.87615966796875, "logps/rejected": -1054.1783447265625, "loss": 0.1001, "rewards/accuracies": 0.75, "rewards/chosen": -0.10275360196828842, "rewards/margins": 0.2246766984462738, "rewards/rejected": -0.3274303078651428, "step": 1780 }, { "epoch": 0.34, "learning_rate": 4.166640938570879e-06, "logits/chosen": -1.6158673763275146, "logits/rejected": -0.9096325039863586, "logps/chosen": -688.8383178710938, "logps/rejected": -1214.840576171875, "loss": 0.0637, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13154849410057068, "rewards/margins": 0.2507414221763611, "rewards/rejected": -0.38228991627693176, "step": 1790 }, { "epoch": 0.34, "learning_rate": 4.154214593992149e-06, "logits/chosen": -1.6515785455703735, "logits/rejected": -1.28810715675354, "logps/chosen": -586.6110229492188, "logps/rejected": -1224.0474853515625, "loss": 0.0645, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15291659533977509, "rewards/margins": 0.26770395040512085, "rewards/rejected": -0.42062050104141235, "step": 1800 }, { "epoch": 0.34, "learning_rate": 4.1417151209635265e-06, "logits/chosen": -1.343052864074707, "logits/rejected": -1.1219394207000732, "logps/chosen": -727.157470703125, "logps/rejected": -1348.6068115234375, "loss": 0.1003, "rewards/accuracies": 0.875, "rewards/chosen": -0.22418935596942902, "rewards/margins": 0.24681849777698517, "rewards/rejected": -0.4710078239440918, "step": 1810 }, { "epoch": 0.35, "learning_rate": 4.129143072053639e-06, "logits/chosen": -1.6531670093536377, "logits/rejected": -1.0683776140213013, "logps/chosen": -628.3314208984375, "logps/rejected": -1343.596435546875, "loss": 0.061, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1800796240568161, "rewards/margins": 0.3129463195800781, "rewards/rejected": -0.4930259585380554, "step": 1820 }, { "epoch": 0.35, "learning_rate": 4.116499003039499e-06, "logits/chosen": -1.6415526866912842, "logits/rejected": -0.8921445608139038, "logps/chosen": -508.5306701660156, "logps/rejected": -1111.977783203125, "loss": 0.0744, "rewards/accuracies": 0.75, "rewards/chosen": -0.1160668283700943, "rewards/margins": 0.27721890807151794, "rewards/rejected": -0.39328575134277344, "step": 1830 }, { "epoch": 0.35, "learning_rate": 4.103783472881942e-06, "logits/chosen": -1.6161365509033203, "logits/rejected": -0.9930692911148071, "logps/chosen": -510.691162109375, "logps/rejected": -1178.771240234375, "loss": 0.0701, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12652714550495148, "rewards/margins": 0.2798650562763214, "rewards/rejected": -0.4063921868801117, "step": 1840 }, { "epoch": 0.35, "learning_rate": 4.0909970437009094e-06, "logits/chosen": -1.407378077507019, "logits/rejected": -1.0627632141113281, "logps/chosen": -604.5719604492188, "logps/rejected": -1218.077880859375, "loss": 0.0961, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.22636625170707703, "rewards/margins": 0.26213282346725464, "rewards/rejected": -0.4884990155696869, "step": 1850 }, { "epoch": 0.35, "learning_rate": 4.078140280750598e-06, "logits/chosen": -1.472339153289795, "logits/rejected": -1.0137022733688354, "logps/chosen": -691.4371948242188, "logps/rejected": -1220.0880126953125, "loss": 0.1188, "rewards/accuracies": 0.75, "rewards/chosen": -0.23141947388648987, "rewards/margins": 0.21889667212963104, "rewards/rejected": -0.4503161907196045, "step": 1860 }, { "epoch": 0.36, "learning_rate": 4.065213752394478e-06, "logits/chosen": -1.5468190908432007, "logits/rejected": -0.9151954650878906, "logps/chosen": -664.4764404296875, "logps/rejected": -1273.167724609375, "loss": 0.0767, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2180008590221405, "rewards/margins": 0.28027474880218506, "rewards/rejected": -0.4982755780220032, "step": 1870 }, { "epoch": 0.36, "learning_rate": 4.052218030080162e-06, "logits/chosen": -1.7460209131240845, "logits/rejected": -0.8932159543037415, "logps/chosen": -689.2654418945312, "logps/rejected": -1396.868896484375, "loss": 0.0715, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2002066820859909, "rewards/margins": 0.335799902677536, "rewards/rejected": -0.5360065698623657, "step": 1880 }, { "epoch": 0.36, "learning_rate": 4.039153688314146e-06, "logits/chosen": -1.3742153644561768, "logits/rejected": -0.9114626049995422, "logps/chosen": -634.3856811523438, "logps/rejected": -1262.1175537109375, "loss": 0.1016, "rewards/accuracies": 0.75, "rewards/chosen": -0.25341543555259705, "rewards/margins": 0.24519459903240204, "rewards/rejected": -0.49861007928848267, "step": 1890 }, { "epoch": 0.36, "learning_rate": 4.026021304636408e-06, "logits/chosen": -1.4285153150558472, "logits/rejected": -0.9699563980102539, "logps/chosen": -621.96533203125, "logps/rejected": -1336.2926025390625, "loss": 0.0789, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.19991159439086914, "rewards/margins": 0.2923136055469513, "rewards/rejected": -0.49222517013549805, "step": 1900 }, { "epoch": 0.36, "learning_rate": 4.012821459594881e-06, "logits/chosen": -1.630352258682251, "logits/rejected": -1.1409363746643066, "logps/chosen": -484.24365234375, "logps/rejected": -1044.751953125, "loss": 0.0633, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1116863489151001, "rewards/margins": 0.25152185559272766, "rewards/rejected": -0.36320820450782776, "step": 1910 }, { "epoch": 0.37, "learning_rate": 3.999554736719785e-06, "logits/chosen": -1.7693662643432617, "logits/rejected": -1.1132917404174805, "logps/chosen": -622.8857421875, "logps/rejected": -1309.3828125, "loss": 0.0626, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14811810851097107, "rewards/margins": 0.32902562618255615, "rewards/rejected": -0.47714370489120483, "step": 1920 }, { "epoch": 0.37, "learning_rate": 3.986221722497832e-06, "logits/chosen": -1.7638092041015625, "logits/rejected": -1.087059736251831, "logps/chosen": -632.57666015625, "logps/rejected": -1238.3343505859375, "loss": 0.0726, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.16446995735168457, "rewards/margins": 0.29051706194877625, "rewards/rejected": -0.4549869894981384, "step": 1930 }, { "epoch": 0.37, "learning_rate": 3.9728230063463e-06, "logits/chosen": -1.2721593379974365, "logits/rejected": -0.6474477052688599, "logps/chosen": -648.2594604492188, "logps/rejected": -1158.4620361328125, "loss": 0.0716, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1905958205461502, "rewards/margins": 0.24646992981433868, "rewards/rejected": -0.4370657801628113, "step": 1940 }, { "epoch": 0.37, "learning_rate": 3.9593591805869755e-06, "logits/chosen": -1.4620612859725952, "logits/rejected": -1.2620489597320557, "logps/chosen": -688.633056640625, "logps/rejected": -1529.7515869140625, "loss": 0.0975, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1938236951828003, "rewards/margins": 0.35576432943344116, "rewards/rejected": -0.5495880246162415, "step": 1950 }, { "epoch": 0.37, "learning_rate": 3.945830840419966e-06, "logits/chosen": -1.7115497589111328, "logits/rejected": -1.2753444910049438, "logps/chosen": -479.00384521484375, "logps/rejected": -1069.37548828125, "loss": 0.0806, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.021248655393719673, "rewards/margins": 0.23637816309928894, "rewards/rejected": -0.25762683153152466, "step": 1960 }, { "epoch": 0.38, "learning_rate": 3.932238583897395e-06, "logits/chosen": -1.6209239959716797, "logits/rejected": -1.3102586269378662, "logps/chosen": -497.66693115234375, "logps/rejected": -1194.21435546875, "loss": 0.0496, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.04905269294977188, "rewards/margins": 0.28805986046791077, "rewards/rejected": -0.33711254596710205, "step": 1970 }, { "epoch": 0.38, "learning_rate": 3.918583011896955e-06, "logits/chosen": -1.5563997030258179, "logits/rejected": -1.0230815410614014, "logps/chosen": -551.2222900390625, "logps/rejected": -1139.404052734375, "loss": 0.0892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.036595430225133896, "rewards/margins": 0.27522173523902893, "rewards/rejected": -0.31181710958480835, "step": 1980 }, { "epoch": 0.38, "learning_rate": 3.904864728095349e-06, "logits/chosen": -1.6914150714874268, "logits/rejected": -1.3743771314620972, "logps/chosen": -563.36865234375, "logps/rejected": -1147.3048095703125, "loss": 0.0887, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.06336866319179535, "rewards/margins": 0.24804475903511047, "rewards/rejected": -0.311413437128067, "step": 1990 }, { "epoch": 0.38, "learning_rate": 3.891084338941603e-06, "logits/chosen": -1.4719524383544922, "logits/rejected": -1.0884759426116943, "logps/chosen": -480.8564453125, "logps/rejected": -1265.7791748046875, "loss": 0.1026, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.04422132298350334, "rewards/margins": 0.32557159662246704, "rewards/rejected": -0.3697928786277771, "step": 2000 }, { "epoch": 0.38, "learning_rate": 3.8772424536302565e-06, "logits/chosen": -1.7504775524139404, "logits/rejected": -1.0374706983566284, "logps/chosen": -581.1937255859375, "logps/rejected": -1179.3349609375, "loss": 0.0854, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07610450685024261, "rewards/margins": 0.26427727937698364, "rewards/rejected": -0.34038177132606506, "step": 2010 }, { "epoch": 0.38, "learning_rate": 3.863339684074432e-06, "logits/chosen": -1.444999098777771, "logits/rejected": -1.0082156658172607, "logps/chosen": -567.4451904296875, "logps/rejected": -1163.037109375, "loss": 0.0942, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09151064604520798, "rewards/margins": 0.25389835238456726, "rewards/rejected": -0.34540897607803345, "step": 2020 }, { "epoch": 0.39, "learning_rate": 3.849376644878783e-06, "logits/chosen": -1.7114700078964233, "logits/rejected": -1.2078514099121094, "logps/chosen": -580.4812622070312, "logps/rejected": -1253.7298583984375, "loss": 0.0739, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.11515311151742935, "rewards/margins": 0.26693490147590637, "rewards/rejected": -0.3820880353450775, "step": 2030 }, { "epoch": 0.39, "learning_rate": 3.835353953312322e-06, "logits/chosen": -1.5887187719345093, "logits/rejected": -1.0290062427520752, "logps/chosen": -550.4044189453125, "logps/rejected": -1165.352783203125, "loss": 0.0892, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11724672466516495, "rewards/margins": 0.26691046357154846, "rewards/rejected": -0.3841571807861328, "step": 2040 }, { "epoch": 0.39, "learning_rate": 3.821272229281139e-06, "logits/chosen": -1.243851900100708, "logits/rejected": -0.8313790559768677, "logps/chosen": -619.6917724609375, "logps/rejected": -1220.0845947265625, "loss": 0.1013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13393720984458923, "rewards/margins": 0.27672356367111206, "rewards/rejected": -0.41066068410873413, "step": 2050 }, { "epoch": 0.39, "learning_rate": 3.8071320953009906e-06, "logits/chosen": -1.6292989253997803, "logits/rejected": -1.1258556842803955, "logps/chosen": -542.9365234375, "logps/rejected": -1215.368896484375, "loss": 0.0616, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09711267799139023, "rewards/margins": 0.28968364000320435, "rewards/rejected": -0.38679632544517517, "step": 2060 }, { "epoch": 0.39, "learning_rate": 3.792934176469782e-06, "logits/chosen": -1.644221544265747, "logits/rejected": -0.9224053621292114, "logps/chosen": -615.8048706054688, "logps/rejected": -1117.5472412109375, "loss": 0.0962, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09015320241451263, "rewards/margins": 0.2451203316450119, "rewards/rejected": -0.3352735638618469, "step": 2070 }, { "epoch": 0.4, "learning_rate": 3.7786791004399353e-06, "logits/chosen": -1.5849639177322388, "logits/rejected": -0.9769964218139648, "logps/chosen": -630.0858154296875, "logps/rejected": -1170.2745361328125, "loss": 0.081, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.12655183672904968, "rewards/margins": 0.2449195832014084, "rewards/rejected": -0.37147143483161926, "step": 2080 }, { "epoch": 0.4, "learning_rate": 3.764367497390642e-06, "logits/chosen": -1.6721210479736328, "logits/rejected": -1.1214954853057861, "logps/chosen": -552.5462646484375, "logps/rejected": -1353.6265869140625, "loss": 0.0585, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.14976239204406738, "rewards/margins": 0.30934378504753113, "rewards/rejected": -0.4591061472892761, "step": 2090 }, { "epoch": 0.4, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -1.4793144464492798, "logits/rejected": -0.9002103805541992, "logps/chosen": -640.5985107421875, "logps/rejected": -1225.0712890625, "loss": 0.0905, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.14169077575206757, "rewards/margins": 0.2867027223110199, "rewards/rejected": -0.4283934533596039, "step": 2100 }, { "epoch": 0.4, "learning_rate": 3.7355772434170523e-06, "logits/chosen": -1.7350581884384155, "logits/rejected": -1.0579384565353394, "logps/chosen": -617.4492797851562, "logps/rejected": -1214.168212890625, "loss": 0.0467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13681480288505554, "rewards/margins": 0.29050007462501526, "rewards/rejected": -0.4273148477077484, "step": 2110 }, { "epoch": 0.4, "learning_rate": 3.7210998652337016e-06, "logits/chosen": -1.676896095275879, "logits/rejected": -0.9806238412857056, "logps/chosen": -592.6138916015625, "logps/rejected": -1362.531982421875, "loss": 0.0429, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.13515350222587585, "rewards/margins": 0.3417731821537018, "rewards/rejected": -0.47692665457725525, "step": 2120 }, { "epoch": 0.41, "learning_rate": 3.7065685054565277e-06, "logits/chosen": -1.7495753765106201, "logits/rejected": -0.8682845830917358, "logps/chosen": -585.2340087890625, "logps/rejected": -1222.46044921875, "loss": 0.0467, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.10164134204387665, "rewards/margins": 0.33252280950546265, "rewards/rejected": -0.4341641366481781, "step": 2130 }, { "epoch": 0.41, "learning_rate": 3.691983806478494e-06, "logits/chosen": -1.7462999820709229, "logits/rejected": -1.2676109075546265, "logps/chosen": -485.5740661621094, "logps/rejected": -1216.7451171875, "loss": 0.0894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07753627002239227, "rewards/margins": 0.3056955933570862, "rewards/rejected": -0.38323187828063965, "step": 2140 }, { "epoch": 0.41, "learning_rate": 3.677346413050551e-06, "logits/chosen": -1.674078345298767, "logits/rejected": -1.3559746742248535, "logps/chosen": -565.6333618164062, "logps/rejected": -1339.0338134765625, "loss": 0.039, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.06904712319374084, "rewards/margins": 0.356864869594574, "rewards/rejected": -0.4259119927883148, "step": 2150 }, { "epoch": 0.41, "learning_rate": 3.6626569722531268e-06, "logits/chosen": -1.594458818435669, "logits/rejected": -0.9288966059684753, "logps/chosen": -594.22216796875, "logps/rejected": -1178.4859619140625, "loss": 0.0878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10133261978626251, "rewards/margins": 0.2484547644853592, "rewards/rejected": -0.3497873842716217, "step": 2160 }, { "epoch": 0.41, "learning_rate": 3.6479161334675294e-06, "logits/chosen": -1.582834005355835, "logits/rejected": -0.854317843914032, "logps/chosen": -631.1778564453125, "logps/rejected": -1246.053466796875, "loss": 0.0852, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12199946492910385, "rewards/margins": 0.2943424880504608, "rewards/rejected": -0.41634196043014526, "step": 2170 }, { "epoch": 0.42, "learning_rate": 3.6331245483472353e-06, "logits/chosen": -1.458630919456482, "logits/rejected": -1.0171302556991577, "logps/chosen": -530.21142578125, "logps/rejected": -1186.492919921875, "loss": 0.0895, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10598714649677277, "rewards/margins": 0.29519587755203247, "rewards/rejected": -0.40118303894996643, "step": 2180 }, { "epoch": 0.42, "learning_rate": 3.6182828707890816e-06, "logits/chosen": -1.6425663232803345, "logits/rejected": -1.0837091207504272, "logps/chosen": -593.2979125976562, "logps/rejected": -1225.735595703125, "loss": 0.1029, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1416325867176056, "rewards/margins": 0.29666662216186523, "rewards/rejected": -0.43829917907714844, "step": 2190 }, { "epoch": 0.42, "learning_rate": 3.6033917569043604e-06, "logits/chosen": -1.7086814641952515, "logits/rejected": -1.0961934328079224, "logps/chosen": -586.9171142578125, "logps/rejected": -1241.936279296875, "loss": 0.0486, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.14558613300323486, "rewards/margins": 0.2909151613712311, "rewards/rejected": -0.43650132417678833, "step": 2200 }, { "epoch": 0.42, "learning_rate": 3.588451864989811e-06, "logits/chosen": -1.4291936159133911, "logits/rejected": -0.9254032373428345, "logps/chosen": -670.0648193359375, "logps/rejected": -1274.3505859375, "loss": 0.0869, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.16813287138938904, "rewards/margins": 0.2989405393600464, "rewards/rejected": -0.4670734405517578, "step": 2210 }, { "epoch": 0.42, "learning_rate": 3.5734638554985234e-06, "logits/chosen": -1.6968755722045898, "logits/rejected": -1.161507248878479, "logps/chosen": -432.891845703125, "logps/rejected": -1126.6932373046875, "loss": 0.0636, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.04656369611620903, "rewards/margins": 0.2924540936946869, "rewards/rejected": -0.339017778635025, "step": 2220 }, { "epoch": 0.42, "learning_rate": 3.5584283910107343e-06, "logits/chosen": -1.5758813619613647, "logits/rejected": -0.5977634787559509, "logps/chosen": -618.425537109375, "logps/rejected": -1015.8828125, "loss": 0.081, "rewards/accuracies": 0.75, "rewards/chosen": -0.09514687210321426, "rewards/margins": 0.2435726374387741, "rewards/rejected": -0.33871954679489136, "step": 2230 }, { "epoch": 0.43, "learning_rate": 3.543346136204545e-06, "logits/chosen": -1.59457528591156, "logits/rejected": -0.7562257647514343, "logps/chosen": -662.0821533203125, "logps/rejected": -1304.8880615234375, "loss": 0.0677, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12489262968301773, "rewards/margins": 0.32123300433158875, "rewards/rejected": -0.44612565636634827, "step": 2240 }, { "epoch": 0.43, "learning_rate": 3.5282177578265295e-06, "logits/chosen": -1.7696926593780518, "logits/rejected": -1.2749775648117065, "logps/chosen": -482.4493103027344, "logps/rejected": -1066.5418701171875, "loss": 0.0886, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10509966313838959, "rewards/margins": 0.2710351347923279, "rewards/rejected": -0.37613481283187866, "step": 2250 }, { "epoch": 0.43, "learning_rate": 3.5130439246622635e-06, "logits/chosen": -1.3018282651901245, "logits/rejected": -0.8078845143318176, "logps/chosen": -570.1189575195312, "logps/rejected": -1327.9266357421875, "loss": 0.067, "rewards/accuracies": 0.875, "rewards/chosen": -0.1495288759469986, "rewards/margins": 0.3377271592617035, "rewards/rejected": -0.4872560501098633, "step": 2260 }, { "epoch": 0.43, "learning_rate": 3.497825307506758e-06, "logits/chosen": -1.7156795263290405, "logits/rejected": -1.2350342273712158, "logps/chosen": -651.0853271484375, "logps/rejected": -1260.7733154296875, "loss": 0.0751, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15341240167617798, "rewards/margins": 0.28627315163612366, "rewards/rejected": -0.43968549370765686, "step": 2270 }, { "epoch": 0.43, "learning_rate": 3.4825625791348093e-06, "logits/chosen": -1.6436914205551147, "logits/rejected": -0.9447058439254761, "logps/chosen": -674.1998291015625, "logps/rejected": -1122.89599609375, "loss": 0.1037, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.17227864265441895, "rewards/margins": 0.2191500961780548, "rewards/rejected": -0.39142873883247375, "step": 2280 }, { "epoch": 0.44, "learning_rate": 3.467256414271249e-06, "logits/chosen": -1.7851943969726562, "logits/rejected": -0.9154273271560669, "logps/chosen": -685.0301513671875, "logps/rejected": -1168.9381103515625, "loss": 0.1071, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.15380102396011353, "rewards/margins": 0.24055905640125275, "rewards/rejected": -0.39436009526252747, "step": 2290 }, { "epoch": 0.44, "learning_rate": 3.4519074895611245e-06, "logits/chosen": -1.6110646724700928, "logits/rejected": -1.0128824710845947, "logps/chosen": -507.72900390625, "logps/rejected": -1104.7791748046875, "loss": 0.0777, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09803974628448486, "rewards/margins": 0.2795708179473877, "rewards/rejected": -0.37761059403419495, "step": 2300 }, { "epoch": 0.44, "learning_rate": 3.436516483539781e-06, "logits/chosen": -1.6653196811676025, "logits/rejected": -0.9805139303207397, "logps/chosen": -491.69525146484375, "logps/rejected": -1238.8643798828125, "loss": 0.0466, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.06906162202358246, "rewards/margins": 0.3539879322052002, "rewards/rejected": -0.42304953932762146, "step": 2310 }, { "epoch": 0.44, "learning_rate": 3.421084076602867e-06, "logits/chosen": -1.6048294305801392, "logits/rejected": -1.1272857189178467, "logps/chosen": -570.3504028320312, "logps/rejected": -1230.8138427734375, "loss": 0.0497, "rewards/accuracies": 0.875, "rewards/chosen": -0.11816605180501938, "rewards/margins": 0.32945695519447327, "rewards/rejected": -0.44762295484542847, "step": 2320 }, { "epoch": 0.44, "learning_rate": 3.405610950976257e-06, "logits/chosen": -1.7304108142852783, "logits/rejected": -1.2529585361480713, "logps/chosen": -631.9100341796875, "logps/rejected": -1261.7708740234375, "loss": 0.0932, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14346104860305786, "rewards/margins": 0.3222948908805847, "rewards/rejected": -0.4657559394836426, "step": 2330 }, { "epoch": 0.45, "learning_rate": 3.3900977906858923e-06, "logits/chosen": -1.6869083642959595, "logits/rejected": -1.047593355178833, "logps/chosen": -709.0650024414062, "logps/rejected": -1288.22021484375, "loss": 0.0822, "rewards/accuracies": 0.875, "rewards/chosen": -0.19333884119987488, "rewards/margins": 0.26803427934646606, "rewards/rejected": -0.46137315034866333, "step": 2340 }, { "epoch": 0.45, "learning_rate": 3.3745452815275375e-06, "logits/chosen": -1.5970579385757446, "logits/rejected": -1.0045435428619385, "logps/chosen": -671.1852416992188, "logps/rejected": -1301.151611328125, "loss": 0.0775, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16181443631649017, "rewards/margins": 0.26783424615859985, "rewards/rejected": -0.4296487271785736, "step": 2350 }, { "epoch": 0.45, "learning_rate": 3.3589541110364678e-06, "logits/chosen": -1.5840835571289062, "logits/rejected": -1.0760002136230469, "logps/chosen": -536.934814453125, "logps/rejected": -1156.111572265625, "loss": 0.0945, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10547558218240738, "rewards/margins": 0.2672649025917053, "rewards/rejected": -0.3727405071258545, "step": 2360 }, { "epoch": 0.45, "learning_rate": 3.3433249684570757e-06, "logits/chosen": -1.7080332040786743, "logits/rejected": -0.9376260042190552, "logps/chosen": -451.25927734375, "logps/rejected": -1115.48681640625, "loss": 0.0874, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.03897695988416672, "rewards/margins": 0.29128187894821167, "rewards/rejected": -0.330258846282959, "step": 2370 }, { "epoch": 0.45, "learning_rate": 3.3276585447123957e-06, "logits/chosen": -1.6589637994766235, "logits/rejected": -1.1556400060653687, "logps/chosen": -603.0030517578125, "logps/rejected": -1086.166015625, "loss": 0.1274, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1070268377661705, "rewards/margins": 0.220013827085495, "rewards/rejected": -0.3270407021045685, "step": 2380 }, { "epoch": 0.46, "learning_rate": 3.3119555323735664e-06, "logits/chosen": -1.5776820182800293, "logits/rejected": -1.1128264665603638, "logps/chosen": -654.4660034179688, "logps/rejected": -1288.406005859375, "loss": 0.0588, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.16737082600593567, "rewards/margins": 0.2906762957572937, "rewards/rejected": -0.458047091960907, "step": 2390 }, { "epoch": 0.46, "learning_rate": 3.2962166256292116e-06, "logits/chosen": -1.3844215869903564, "logits/rejected": -1.0129426717758179, "logps/chosen": -630.933837890625, "logps/rejected": -1127.8221435546875, "loss": 0.1021, "rewards/accuracies": 0.75, "rewards/chosen": -0.18399353325366974, "rewards/margins": 0.21213479340076447, "rewards/rejected": -0.3961283266544342, "step": 2400 }, { "epoch": 0.46, "learning_rate": 3.2804425202547494e-06, "logits/chosen": -1.4825665950775146, "logits/rejected": -0.9888661503791809, "logps/chosen": -685.001220703125, "logps/rejected": -1302.7615966796875, "loss": 0.09, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.19598498940467834, "rewards/margins": 0.27030500769615173, "rewards/rejected": -0.46629005670547485, "step": 2410 }, { "epoch": 0.46, "learning_rate": 3.2646339135816386e-06, "logits/chosen": -1.4710817337036133, "logits/rejected": -0.8951467275619507, "logps/chosen": -549.333740234375, "logps/rejected": -1265.0982666015625, "loss": 0.0674, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09500465542078018, "rewards/margins": 0.3293246626853943, "rewards/rejected": -0.42432934045791626, "step": 2420 }, { "epoch": 0.46, "learning_rate": 3.2487915044665485e-06, "logits/chosen": -1.4130220413208008, "logits/rejected": -1.1104599237442017, "logps/chosen": -567.1212158203125, "logps/rejected": -1161.703857421875, "loss": 0.08, "rewards/accuracies": 0.875, "rewards/chosen": -0.10543553531169891, "rewards/margins": 0.2853809893131256, "rewards/rejected": -0.39081650972366333, "step": 2430 }, { "epoch": 0.46, "learning_rate": 3.2329159932604638e-06, "logits/chosen": -1.5364049673080444, "logits/rejected": -0.9546027183532715, "logps/chosen": -546.604736328125, "logps/rejected": -1226.5767822265625, "loss": 0.069, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11398911476135254, "rewards/margins": 0.3227608799934387, "rewards/rejected": -0.43675002455711365, "step": 2440 }, { "epoch": 0.47, "learning_rate": 3.217008081777726e-06, "logits/chosen": -1.4038457870483398, "logits/rejected": -0.9470579028129578, "logps/chosen": -529.3914184570312, "logps/rejected": -1212.7598876953125, "loss": 0.078, "rewards/accuracies": 0.875, "rewards/chosen": -0.10971584171056747, "rewards/margins": 0.30522698163986206, "rewards/rejected": -0.41494280099868774, "step": 2450 }, { "epoch": 0.47, "learning_rate": 3.201068473265007e-06, "logits/chosen": -1.6278892755508423, "logits/rejected": -0.9200417399406433, "logps/chosen": -640.5203247070312, "logps/rejected": -1258.502197265625, "loss": 0.0703, "rewards/accuracies": 0.875, "rewards/chosen": -0.1412404179573059, "rewards/margins": 0.2885693311691284, "rewards/rejected": -0.42980971932411194, "step": 2460 }, { "epoch": 0.47, "learning_rate": 3.1850978723702213e-06, "logits/chosen": -1.5890612602233887, "logits/rejected": -0.9910513758659363, "logps/chosen": -669.634521484375, "logps/rejected": -1339.1484375, "loss": 0.0862, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17336814105510712, "rewards/margins": 0.3014371395111084, "rewards/rejected": -0.4748052656650543, "step": 2470 }, { "epoch": 0.47, "learning_rate": 3.1690969851113724e-06, "logits/chosen": -1.754757285118103, "logits/rejected": -1.1542155742645264, "logps/chosen": -693.3397216796875, "logps/rejected": -1303.1055908203125, "loss": 0.0825, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1847262680530548, "rewards/margins": 0.2816571593284607, "rewards/rejected": -0.4663833975791931, "step": 2480 }, { "epoch": 0.47, "learning_rate": 3.1530665188453463e-06, "logits/chosen": -1.4916189908981323, "logits/rejected": -0.8297182321548462, "logps/chosen": -726.0441284179688, "logps/rejected": -1387.1719970703125, "loss": 0.0609, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.22441205382347107, "rewards/margins": 0.29351991415023804, "rewards/rejected": -0.5179319381713867, "step": 2490 }, { "epoch": 0.48, "learning_rate": 3.137007182236637e-06, "logits/chosen": -1.4697926044464111, "logits/rejected": -1.0665897130966187, "logps/chosen": -555.7542724609375, "logps/rejected": -1162.6217041015625, "loss": 0.0632, "rewards/accuracies": 0.75, "rewards/chosen": -0.11064179986715317, "rewards/margins": 0.2883552014827728, "rewards/rejected": -0.39899706840515137, "step": 2500 }, { "epoch": 0.48, "learning_rate": 3.1209196852260204e-06, "logits/chosen": -1.5638642311096191, "logits/rejected": -0.9049351811408997, "logps/chosen": -530.5607299804688, "logps/rejected": -1162.5194091796875, "loss": 0.084, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10229738056659698, "rewards/margins": 0.29367098212242126, "rewards/rejected": -0.39596837759017944, "step": 2510 }, { "epoch": 0.48, "learning_rate": 3.1048047389991693e-06, "logits/chosen": -1.5408356189727783, "logits/rejected": -0.8372970819473267, "logps/chosen": -528.8843994140625, "logps/rejected": -1207.514404296875, "loss": 0.0431, "rewards/accuracies": 0.875, "rewards/chosen": -0.1121770516037941, "rewards/margins": 0.3144737184047699, "rewards/rejected": -0.4266508221626282, "step": 2520 }, { "epoch": 0.48, "learning_rate": 3.0886630559552144e-06, "logits/chosen": -1.7452852725982666, "logits/rejected": -0.9543391466140747, "logps/chosen": -668.1608276367188, "logps/rejected": -1297.2008056640625, "loss": 0.0579, "rewards/accuracies": 0.875, "rewards/chosen": -0.1479906588792801, "rewards/margins": 0.3234715759754181, "rewards/rejected": -0.4714622497558594, "step": 2530 }, { "epoch": 0.48, "learning_rate": 3.072495349675249e-06, "logits/chosen": -1.7660051584243774, "logits/rejected": -0.9404336214065552, "logps/chosen": -675.3543090820312, "logps/rejected": -1376.85986328125, "loss": 0.0867, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1329096257686615, "rewards/margins": 0.3379344046115875, "rewards/rejected": -0.470844030380249, "step": 2540 }, { "epoch": 0.49, "learning_rate": 3.056302334890786e-06, "logits/chosen": -1.5463120937347412, "logits/rejected": -0.9701935648918152, "logps/chosen": -559.4407958984375, "logps/rejected": -1268.9881591796875, "loss": 0.0539, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10604407638311386, "rewards/margins": 0.3344349265098572, "rewards/rejected": -0.440479040145874, "step": 2550 }, { "epoch": 0.49, "learning_rate": 3.04008472745216e-06, "logits/chosen": -1.4827139377593994, "logits/rejected": -0.9881393313407898, "logps/chosen": -527.420166015625, "logps/rejected": -1177.581298828125, "loss": 0.0891, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.079104483127594, "rewards/margins": 0.29415425658226013, "rewards/rejected": -0.3732587695121765, "step": 2560 }, { "epoch": 0.49, "learning_rate": 3.0238432442968803e-06, "logits/chosen": -1.557047724723816, "logits/rejected": -0.9846252202987671, "logps/chosen": -551.916259765625, "logps/rejected": -1224.5875244140625, "loss": 0.0524, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09614777565002441, "rewards/margins": 0.30018511414527893, "rewards/rejected": -0.39633288979530334, "step": 2570 }, { "epoch": 0.49, "learning_rate": 3.0075786034179407e-06, "logits/chosen": -1.450969934463501, "logits/rejected": -1.3005410432815552, "logps/chosen": -476.8797912597656, "logps/rejected": -1157.756103515625, "loss": 0.0783, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0792437344789505, "rewards/margins": 0.27935245633125305, "rewards/rejected": -0.35859617590904236, "step": 2580 }, { "epoch": 0.49, "learning_rate": 2.9912915238320755e-06, "logits/chosen": -1.64337158203125, "logits/rejected": -1.1302156448364258, "logps/chosen": -503.0484924316406, "logps/rejected": -1008.0091552734375, "loss": 0.101, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07734871655702591, "rewards/margins": 0.2203529179096222, "rewards/rejected": -0.2977016568183899, "step": 2590 }, { "epoch": 0.5, "learning_rate": 2.974982725547976e-06, "logits/chosen": -1.5070769786834717, "logits/rejected": -1.2087517976760864, "logps/chosen": -518.1423950195312, "logps/rejected": -1188.7457275390625, "loss": 0.0641, "rewards/accuracies": 0.875, "rewards/chosen": -0.06826291978359222, "rewards/margins": 0.28302261233329773, "rewards/rejected": -0.35128551721572876, "step": 2600 }, { "epoch": 0.5, "learning_rate": 2.958652929534456e-06, "logits/chosen": -1.5541181564331055, "logits/rejected": -0.9500897526741028, "logps/chosen": -427.2430725097656, "logps/rejected": -1138.5986328125, "loss": 0.0551, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.06738888472318649, "rewards/margins": 0.3063209652900696, "rewards/rejected": -0.37370985746383667, "step": 2610 }, { "epoch": 0.5, "learning_rate": 2.9423028576885894e-06, "logits/chosen": -1.421770453453064, "logits/rejected": -0.9556745290756226, "logps/chosen": -574.5811767578125, "logps/rejected": -1172.9805908203125, "loss": 0.1039, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.12262606620788574, "rewards/margins": 0.26561444997787476, "rewards/rejected": -0.3882405161857605, "step": 2620 }, { "epoch": 0.5, "learning_rate": 2.9259332328037852e-06, "logits/chosen": -1.5648980140686035, "logits/rejected": -0.9057445526123047, "logps/chosen": -605.000732421875, "logps/rejected": -1209.2060546875, "loss": 0.0589, "rewards/accuracies": 0.875, "rewards/chosen": -0.08145221322774887, "rewards/margins": 0.3243905007839203, "rewards/rejected": -0.40584272146224976, "step": 2630 }, { "epoch": 0.5, "learning_rate": 2.9095447785378446e-06, "logits/chosen": -1.631359338760376, "logits/rejected": -0.9921914339065552, "logps/chosen": -550.8638305664062, "logps/rejected": -1227.114501953125, "loss": 0.0745, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10412851721048355, "rewards/margins": 0.3038374185562134, "rewards/rejected": -0.4079659581184387, "step": 2640 }, { "epoch": 0.5, "learning_rate": 2.893138219380964e-06, "logits/chosen": -1.8246917724609375, "logits/rejected": -1.1634905338287354, "logps/chosen": -476.13397216796875, "logps/rejected": -1130.2000732421875, "loss": 0.0595, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.034350063651800156, "rewards/margins": 0.33886438608169556, "rewards/rejected": -0.37321439385414124, "step": 2650 }, { "epoch": 0.51, "learning_rate": 2.876714280623708e-06, "logits/chosen": -1.4653544425964355, "logits/rejected": -0.8596148490905762, "logps/chosen": -475.05853271484375, "logps/rejected": -1144.7532958984375, "loss": 0.0852, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07599582523107529, "rewards/margins": 0.2964363992214203, "rewards/rejected": -0.37243229150772095, "step": 2660 }, { "epoch": 0.51, "learning_rate": 2.8602736883249504e-06, "logits/chosen": -1.6749900579452515, "logits/rejected": -1.0397599935531616, "logps/chosen": -632.7877807617188, "logps/rejected": -1282.6087646484375, "loss": 0.0934, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13829271495342255, "rewards/margins": 0.28984689712524414, "rewards/rejected": -0.4281395971775055, "step": 2670 }, { "epoch": 0.51, "learning_rate": 2.843817169279772e-06, "logits/chosen": -1.4789026975631714, "logits/rejected": -1.0164978504180908, "logps/chosen": -588.529052734375, "logps/rejected": -1097.4697265625, "loss": 0.0939, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11081542819738388, "rewards/margins": 0.2728538513183594, "rewards/rejected": -0.38366931676864624, "step": 2680 }, { "epoch": 0.51, "learning_rate": 2.8273454509873333e-06, "logits/chosen": -1.5279271602630615, "logits/rejected": -1.0477749109268188, "logps/chosen": -694.4437866210938, "logps/rejected": -1294.1767578125, "loss": 0.0812, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15735623240470886, "rewards/margins": 0.27929773926734924, "rewards/rejected": -0.4366539418697357, "step": 2690 }, { "epoch": 0.51, "learning_rate": 2.8108592616187135e-06, "logits/chosen": -1.6551815271377563, "logits/rejected": -0.8189488649368286, "logps/chosen": -744.4472045898438, "logps/rejected": -1390.954345703125, "loss": 0.0533, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1515381634235382, "rewards/margins": 0.3364045023918152, "rewards/rejected": -0.4879426956176758, "step": 2700 }, { "epoch": 0.52, "learning_rate": 2.7943593299847186e-06, "logits/chosen": -1.5556144714355469, "logits/rejected": -0.8782920837402344, "logps/chosen": -578.2349853515625, "logps/rejected": -1223.9837646484375, "loss": 0.0502, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.12136213481426239, "rewards/margins": 0.32430487871170044, "rewards/rejected": -0.4456670880317688, "step": 2710 }, { "epoch": 0.52, "learning_rate": 2.7778463855036656e-06, "logits/chosen": -1.4753865003585815, "logits/rejected": -0.8386212587356567, "logps/chosen": -651.51171875, "logps/rejected": -1406.922119140625, "loss": 0.0434, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1467413604259491, "rewards/margins": 0.3557416498661041, "rewards/rejected": -0.5024830102920532, "step": 2720 }, { "epoch": 0.52, "learning_rate": 2.761321158169134e-06, "logits/chosen": -1.6173675060272217, "logits/rejected": -1.0392745733261108, "logps/chosen": -618.5396118164062, "logps/rejected": -1164.376708984375, "loss": 0.07, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1464042365550995, "rewards/margins": 0.26444289088249207, "rewards/rejected": -0.41084709763526917, "step": 2730 }, { "epoch": 0.52, "learning_rate": 2.7447843785176958e-06, "logits/chosen": -1.4622766971588135, "logits/rejected": -0.898713231086731, "logps/chosen": -549.015625, "logps/rejected": -1307.8948974609375, "loss": 0.0594, "rewards/accuracies": 0.875, "rewards/chosen": -0.12090383470058441, "rewards/margins": 0.34172388911247253, "rewards/rejected": -0.46262773871421814, "step": 2740 }, { "epoch": 0.52, "learning_rate": 2.728236777596621e-06, "logits/chosen": -1.4183399677276611, "logits/rejected": -0.8845669031143188, "logps/chosen": -644.03662109375, "logps/rejected": -1180.503173828125, "loss": 0.1105, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.14675351977348328, "rewards/margins": 0.2584415078163147, "rewards/rejected": -0.40519505739212036, "step": 2750 }, { "epoch": 0.53, "learning_rate": 2.7116790869315583e-06, "logits/chosen": -1.459571361541748, "logits/rejected": -0.855141282081604, "logps/chosen": -544.542236328125, "logps/rejected": -1199.67919921875, "loss": 0.0687, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10830040276050568, "rewards/margins": 0.31138211488723755, "rewards/rejected": -0.4196825623512268, "step": 2760 }, { "epoch": 0.53, "learning_rate": 2.695112038494198e-06, "logits/chosen": -1.556896448135376, "logits/rejected": -1.0641980171203613, "logps/chosen": -526.887939453125, "logps/rejected": -1217.894775390625, "loss": 0.0929, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11782244592905045, "rewards/margins": 0.3022124171257019, "rewards/rejected": -0.42003482580184937, "step": 2770 }, { "epoch": 0.53, "learning_rate": 2.6785363646699125e-06, "logits/chosen": -1.475042700767517, "logits/rejected": -0.8031581044197083, "logps/chosen": -695.498046875, "logps/rejected": -1166.9180908203125, "loss": 0.0699, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.12228862196207047, "rewards/margins": 0.24580267071723938, "rewards/rejected": -0.36809128522872925, "step": 2780 }, { "epoch": 0.53, "learning_rate": 2.6619527982253796e-06, "logits/chosen": -1.4715737104415894, "logits/rejected": -1.0043838024139404, "logps/chosen": -579.2100830078125, "logps/rejected": -1229.07470703125, "loss": 0.0635, "rewards/accuracies": 0.875, "rewards/chosen": -0.11626414954662323, "rewards/margins": 0.3015051782131195, "rewards/rejected": -0.41776934266090393, "step": 2790 }, { "epoch": 0.53, "learning_rate": 2.6453620722761897e-06, "logits/chosen": -1.4490864276885986, "logits/rejected": -0.9089977145195007, "logps/chosen": -579.8900146484375, "logps/rejected": -1265.926025390625, "loss": 0.0658, "rewards/accuracies": 0.875, "rewards/chosen": -0.11970784515142441, "rewards/margins": 0.310236394405365, "rewards/rejected": -0.4299442172050476, "step": 2800 }, { "epoch": 0.54, "learning_rate": 2.628764920254435e-06, "logits/chosen": -1.4776760339736938, "logits/rejected": -0.9225019216537476, "logps/chosen": -567.9808959960938, "logps/rejected": -1210.098876953125, "loss": 0.077, "rewards/accuracies": 0.875, "rewards/chosen": -0.0780453160405159, "rewards/margins": 0.3206408619880676, "rewards/rejected": -0.39868617057800293, "step": 2810 }, { "epoch": 0.54, "learning_rate": 2.6121620758762877e-06, "logits/chosen": -1.4901140928268433, "logits/rejected": -0.8745683431625366, "logps/chosen": -634.2081298828125, "logps/rejected": -1223.249755859375, "loss": 0.0933, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13513752818107605, "rewards/margins": 0.25405651330947876, "rewards/rejected": -0.3891941010951996, "step": 2820 }, { "epoch": 0.54, "learning_rate": 2.595554273109564e-06, "logits/chosen": -1.6821086406707764, "logits/rejected": -1.0679817199707031, "logps/chosen": -539.3198852539062, "logps/rejected": -1233.5755615234375, "loss": 0.049, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.09231746941804886, "rewards/margins": 0.336186945438385, "rewards/rejected": -0.4285043776035309, "step": 2830 }, { "epoch": 0.54, "learning_rate": 2.5789422461412776e-06, "logits/chosen": -1.4972844123840332, "logits/rejected": -0.905117392539978, "logps/chosen": -463.593994140625, "logps/rejected": -1129.23388671875, "loss": 0.0552, "rewards/accuracies": 0.875, "rewards/chosen": -0.05808283016085625, "rewards/margins": 0.323687881231308, "rewards/rejected": -0.3817707300186157, "step": 2840 }, { "epoch": 0.54, "learning_rate": 2.5623267293451827e-06, "logits/chosen": -1.6810455322265625, "logits/rejected": -1.0212938785552979, "logps/chosen": -548.9854125976562, "logps/rejected": -1143.27099609375, "loss": 0.0821, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10023714601993561, "rewards/margins": 0.2769266366958618, "rewards/rejected": -0.3771637976169586, "step": 2850 }, { "epoch": 0.54, "learning_rate": 2.5457084572493094e-06, "logits/chosen": -1.5808911323547363, "logits/rejected": -0.896012008190155, "logps/chosen": -595.2738037109375, "logps/rejected": -1394.0679931640625, "loss": 0.0315, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.0908048152923584, "rewards/margins": 0.36780017614364624, "rewards/rejected": -0.45860499143600464, "step": 2860 }, { "epoch": 0.55, "learning_rate": 2.5290881645034932e-06, "logits/chosen": -1.5079399347305298, "logits/rejected": -1.0337510108947754, "logps/chosen": -497.4072265625, "logps/rejected": -1198.070556640625, "loss": 0.0651, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.06317411363124847, "rewards/margins": 0.32769569754600525, "rewards/rejected": -0.3908698558807373, "step": 2870 }, { "epoch": 0.55, "learning_rate": 2.5124665858468956e-06, "logits/chosen": -1.6744495630264282, "logits/rejected": -1.0496574640274048, "logps/chosen": -609.0294189453125, "logps/rejected": -1278.1341552734375, "loss": 0.0615, "rewards/accuracies": 0.875, "rewards/chosen": -0.12275202572345734, "rewards/margins": 0.33510714769363403, "rewards/rejected": -0.45785918831825256, "step": 2880 }, { "epoch": 0.55, "learning_rate": 2.4958444560755268e-06, "logits/chosen": -1.5687849521636963, "logits/rejected": -1.0864049196243286, "logps/chosen": -631.5806884765625, "logps/rejected": -1329.831298828125, "loss": 0.0664, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1420772522687912, "rewards/margins": 0.3117358386516571, "rewards/rejected": -0.4538130760192871, "step": 2890 }, { "epoch": 0.55, "learning_rate": 2.479222510009758e-06, "logits/chosen": -1.3493144512176514, "logits/rejected": -0.8287954330444336, "logps/chosen": -622.3375244140625, "logps/rejected": -1253.612548828125, "loss": 0.0811, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15970450639724731, "rewards/margins": 0.3076263666152954, "rewards/rejected": -0.4673308730125427, "step": 2900 }, { "epoch": 0.55, "learning_rate": 2.4626014824618418e-06, "logits/chosen": -1.257110834121704, "logits/rejected": -0.9077134132385254, "logps/chosen": -573.6275634765625, "logps/rejected": -1228.2496337890625, "loss": 0.1086, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13805031776428223, "rewards/margins": 0.27737957239151, "rewards/rejected": -0.41542989015579224, "step": 2910 }, { "epoch": 0.56, "learning_rate": 2.445982108203422e-06, "logits/chosen": -1.6566803455352783, "logits/rejected": -0.9453924298286438, "logps/chosen": -645.748046875, "logps/rejected": -1354.3961181640625, "loss": 0.0505, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.11871250718832016, "rewards/margins": 0.35229814052581787, "rewards/rejected": -0.471010684967041, "step": 2920 }, { "epoch": 0.56, "learning_rate": 2.4293651219330614e-06, "logits/chosen": -1.4606636762619019, "logits/rejected": -0.8227102160453796, "logps/chosen": -565.6244506835938, "logps/rejected": -1142.8575439453125, "loss": 0.0706, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09208562225103378, "rewards/margins": 0.28996017575263977, "rewards/rejected": -0.38204577565193176, "step": 2930 }, { "epoch": 0.56, "learning_rate": 2.4127512582437486e-06, "logits/chosen": -1.4561045169830322, "logits/rejected": -1.000232458114624, "logps/chosen": -633.3966064453125, "logps/rejected": -1223.904296875, "loss": 0.0647, "rewards/accuracies": 0.875, "rewards/chosen": -0.16247665882110596, "rewards/margins": 0.2604535222053528, "rewards/rejected": -0.42293015122413635, "step": 2940 }, { "epoch": 0.56, "learning_rate": 2.3961412515904337e-06, "logits/chosen": -1.4311959743499756, "logits/rejected": -0.9970407485961914, "logps/chosen": -565.9105224609375, "logps/rejected": -1229.419677734375, "loss": 0.0594, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09461134672164917, "rewards/margins": 0.3016055226325989, "rewards/rejected": -0.39621683955192566, "step": 2950 }, { "epoch": 0.56, "learning_rate": 2.3795358362575618e-06, "logits/chosen": -1.6998859643936157, "logits/rejected": -1.164505958557129, "logps/chosen": -563.6510009765625, "logps/rejected": -1211.5599365234375, "loss": 0.0694, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08975648880004883, "rewards/margins": 0.2907021641731262, "rewards/rejected": -0.38045868277549744, "step": 2960 }, { "epoch": 0.57, "learning_rate": 2.3629357463266e-06, "logits/chosen": -1.4463509321212769, "logits/rejected": -1.0091583728790283, "logps/chosen": -492.5101013183594, "logps/rejected": -1144.726806640625, "loss": 0.0828, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09430743008852005, "rewards/margins": 0.2873263657093048, "rewards/rejected": -0.38163381814956665, "step": 2970 }, { "epoch": 0.57, "learning_rate": 2.346341715643601e-06, "logits/chosen": -1.5341848134994507, "logits/rejected": -1.1294262409210205, "logps/chosen": -613.6388549804688, "logps/rejected": -1147.2083740234375, "loss": 0.1093, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1467583328485489, "rewards/margins": 0.22035908699035645, "rewards/rejected": -0.3671174645423889, "step": 2980 }, { "epoch": 0.57, "learning_rate": 2.32975447778675e-06, "logits/chosen": -1.6252291202545166, "logits/rejected": -1.1982320547103882, "logps/chosen": -710.5306396484375, "logps/rejected": -1275.906494140625, "loss": 0.074, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16013281047344208, "rewards/margins": 0.2782554030418396, "rewards/rejected": -0.4383881986141205, "step": 2990 }, { "epoch": 0.57, "learning_rate": 2.3131747660339396e-06, "logits/chosen": -1.457285761833191, "logits/rejected": -1.0221116542816162, "logps/chosen": -597.4036254882812, "logps/rejected": -1166.0738525390625, "loss": 0.084, "rewards/accuracies": 0.75, "rewards/chosen": -0.1460018903017044, "rewards/margins": 0.2565058767795563, "rewards/rejected": -0.4025077223777771, "step": 3000 }, { "epoch": 0.57, "learning_rate": 2.296603313330355e-06, "logits/chosen": -1.3567924499511719, "logits/rejected": -1.0843842029571533, "logps/chosen": -500.361328125, "logps/rejected": -1220.2147216796875, "loss": 0.0836, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1001061201095581, "rewards/margins": 0.3180326521396637, "rewards/rejected": -0.4181387424468994, "step": 3010 }, { "epoch": 0.58, "learning_rate": 2.280040852256068e-06, "logits/chosen": -1.7149471044540405, "logits/rejected": -1.1004184484481812, "logps/chosen": -613.2303466796875, "logps/rejected": -1190.1524658203125, "loss": 0.0794, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.12324889749288559, "rewards/margins": 0.29667380452156067, "rewards/rejected": -0.41992267966270447, "step": 3020 }, { "epoch": 0.58, "learning_rate": 2.2634881149936576e-06, "logits/chosen": -1.6608912944793701, "logits/rejected": -0.9288992881774902, "logps/chosen": -696.2246704101562, "logps/rejected": -1400.7724609375, "loss": 0.0615, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1452145129442215, "rewards/margins": 0.3502580523490906, "rewards/rejected": -0.4954725205898285, "step": 3030 }, { "epoch": 0.58, "learning_rate": 2.246945833295836e-06, "logits/chosen": -1.8040908575057983, "logits/rejected": -0.9266663789749146, "logps/chosen": -639.0257568359375, "logps/rejected": -1127.646728515625, "loss": 0.1096, "rewards/accuracies": 0.75, "rewards/chosen": -0.16084113717079163, "rewards/margins": 0.26114019751548767, "rewards/rejected": -0.4219813346862793, "step": 3040 }, { "epoch": 0.58, "learning_rate": 2.230414738453104e-06, "logits/chosen": -1.4765208959579468, "logits/rejected": -0.8586834073066711, "logps/chosen": -596.9618530273438, "logps/rejected": -1303.4659423828125, "loss": 0.0685, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11683762073516846, "rewards/margins": 0.3540942668914795, "rewards/rejected": -0.47093185782432556, "step": 3050 }, { "epoch": 0.58, "learning_rate": 2.2138955612614206e-06, "logits/chosen": -1.2896515130996704, "logits/rejected": -0.8482069969177246, "logps/chosen": -584.8584594726562, "logps/rejected": -1247.270263671875, "loss": 0.0827, "rewards/accuracies": 0.875, "rewards/chosen": -0.12709662318229675, "rewards/margins": 0.2933301329612732, "rewards/rejected": -0.42042669653892517, "step": 3060 }, { "epoch": 0.58, "learning_rate": 2.1973890319898965e-06, "logits/chosen": -1.9013397693634033, "logits/rejected": -1.2246954441070557, "logps/chosen": -570.5123291015625, "logps/rejected": -1332.769287109375, "loss": 0.0537, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11388106644153595, "rewards/margins": 0.33937424421310425, "rewards/rejected": -0.4532553553581238, "step": 3070 }, { "epoch": 0.59, "learning_rate": 2.1808958803485134e-06, "logits/chosen": -1.4231480360031128, "logits/rejected": -1.1156870126724243, "logps/chosen": -568.2221069335938, "logps/rejected": -1169.4871826171875, "loss": 0.0791, "rewards/accuracies": 0.75, "rewards/chosen": -0.11777313798666, "rewards/margins": 0.28890877962112427, "rewards/rejected": -0.40668192505836487, "step": 3080 }, { "epoch": 0.59, "learning_rate": 2.1644168354558623e-06, "logits/chosen": -1.703005075454712, "logits/rejected": -1.1666195392608643, "logps/chosen": -596.4285278320312, "logps/rejected": -1018.8997192382812, "loss": 0.105, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.13551798462867737, "rewards/margins": 0.22267785668373108, "rewards/rejected": -0.35819584131240845, "step": 3090 }, { "epoch": 0.59, "learning_rate": 2.1479526258069086e-06, "logits/chosen": -1.3815600872039795, "logits/rejected": -0.6034301519393921, "logps/chosen": -637.18603515625, "logps/rejected": -1317.694580078125, "loss": 0.0633, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.13466182351112366, "rewards/margins": 0.3213178813457489, "rewards/rejected": -0.45597973465919495, "step": 3100 }, { "epoch": 0.59, "learning_rate": 2.1315039792407975e-06, "logits/chosen": -1.5379188060760498, "logits/rejected": -1.128328561782837, "logps/chosen": -510.88555908203125, "logps/rejected": -1054.700927734375, "loss": 0.0981, "rewards/accuracies": 0.75, "rewards/chosen": -0.11464317888021469, "rewards/margins": 0.24528208374977112, "rewards/rejected": -0.3599252998828888, "step": 3110 }, { "epoch": 0.59, "learning_rate": 2.115071622908666e-06, "logits/chosen": -1.3771955966949463, "logits/rejected": -1.0849876403808594, "logps/chosen": -440.3738708496094, "logps/rejected": -1029.5709228515625, "loss": 0.102, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1041545420885086, "rewards/margins": 0.2349095344543457, "rewards/rejected": -0.3390640616416931, "step": 3120 }, { "epoch": 0.6, "learning_rate": 2.0986562832415063e-06, "logits/chosen": -1.4015318155288696, "logits/rejected": -1.1533081531524658, "logps/chosen": -526.330322265625, "logps/rejected": -1111.6566162109375, "loss": 0.0952, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14694051444530487, "rewards/margins": 0.2503613829612732, "rewards/rejected": -0.3973018527030945, "step": 3130 }, { "epoch": 0.6, "learning_rate": 2.082258685918047e-06, "logits/chosen": -1.5659064054489136, "logits/rejected": -1.0318849086761475, "logps/chosen": -659.6588134765625, "logps/rejected": -1159.3125, "loss": 0.0962, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1373148411512375, "rewards/margins": 0.2489592283964157, "rewards/rejected": -0.386274129152298, "step": 3140 }, { "epoch": 0.6, "learning_rate": 2.0658795558326745e-06, "logits/chosen": -1.5127919912338257, "logits/rejected": -0.9335476160049438, "logps/chosen": -520.0562744140625, "logps/rejected": -1174.3912353515625, "loss": 0.0813, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10218697786331177, "rewards/margins": 0.2773703634738922, "rewards/rejected": -0.37955737113952637, "step": 3150 }, { "epoch": 0.6, "learning_rate": 2.049519617063389e-06, "logits/chosen": -1.5129424333572388, "logits/rejected": -1.0811700820922852, "logps/chosen": -586.3703002929688, "logps/rejected": -1365.397705078125, "loss": 0.0335, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.10181821882724762, "rewards/margins": 0.34911856055259705, "rewards/rejected": -0.45093679428100586, "step": 3160 }, { "epoch": 0.6, "learning_rate": 2.033179592839792e-06, "logits/chosen": -1.4758931398391724, "logits/rejected": -0.8931997418403625, "logps/chosen": -583.43310546875, "logps/rejected": -1255.8292236328125, "loss": 0.0603, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11339731514453888, "rewards/margins": 0.32942765951156616, "rewards/rejected": -0.44282498955726624, "step": 3170 }, { "epoch": 0.61, "learning_rate": 2.0168602055111175e-06, "logits/chosen": -1.6357719898223877, "logits/rejected": -1.1608362197875977, "logps/chosen": -438.1368713378906, "logps/rejected": -1034.555419921875, "loss": 0.0944, "rewards/accuracies": 0.75, "rewards/chosen": -0.08265723288059235, "rewards/margins": 0.26866570115089417, "rewards/rejected": -0.3513229489326477, "step": 3180 }, { "epoch": 0.61, "learning_rate": 2.0005621765142942e-06, "logits/chosen": -1.407368779182434, "logits/rejected": -0.7318683862686157, "logps/chosen": -598.01171875, "logps/rejected": -1273.376708984375, "loss": 0.0509, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.14980801939964294, "rewards/margins": 0.3117080628871918, "rewards/rejected": -0.46151605248451233, "step": 3190 }, { "epoch": 0.61, "learning_rate": 1.9842862263420565e-06, "logits/chosen": -1.8645998239517212, "logits/rejected": -0.8983448147773743, "logps/chosen": -590.1622924804688, "logps/rejected": -1256.179443359375, "loss": 0.0676, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13834363222122192, "rewards/margins": 0.3233782947063446, "rewards/rejected": -0.4617219567298889, "step": 3200 }, { "epoch": 0.61, "learning_rate": 1.9680330745110954e-06, "logits/chosen": -1.6306655406951904, "logits/rejected": -0.9761863946914673, "logps/chosen": -572.4918212890625, "logps/rejected": -1248.41796875, "loss": 0.06, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1350272297859192, "rewards/margins": 0.3037089407444, "rewards/rejected": -0.43873611092567444, "step": 3210 }, { "epoch": 0.61, "learning_rate": 1.9518034395302413e-06, "logits/chosen": -1.4094593524932861, "logits/rejected": -1.0162584781646729, "logps/chosen": -634.5624389648438, "logps/rejected": -1191.04638671875, "loss": 0.0957, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.16423244774341583, "rewards/margins": 0.2612122893333435, "rewards/rejected": -0.42544469237327576, "step": 3220 }, { "epoch": 0.62, "learning_rate": 1.9355980388687145e-06, "logits/chosen": -1.4995319843292236, "logits/rejected": -1.1388894319534302, "logps/chosen": -529.622314453125, "logps/rejected": -1198.6038818359375, "loss": 0.0595, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1255326271057129, "rewards/margins": 0.30638033151626587, "rewards/rejected": -0.43191295862197876, "step": 3230 }, { "epoch": 0.62, "learning_rate": 1.9194175889243942e-06, "logits/chosen": -1.6592363119125366, "logits/rejected": -0.9633617401123047, "logps/chosen": -595.4632568359375, "logps/rejected": -1168.793701171875, "loss": 0.0916, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.152585968375206, "rewards/margins": 0.2846713662147522, "rewards/rejected": -0.4372572898864746, "step": 3240 }, { "epoch": 0.62, "learning_rate": 1.903262804992156e-06, "logits/chosen": -1.6367032527923584, "logits/rejected": -1.0083138942718506, "logps/chosen": -616.48681640625, "logps/rejected": -1353.048095703125, "loss": 0.0572, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13493946194648743, "rewards/margins": 0.34219256043434143, "rewards/rejected": -0.47713202238082886, "step": 3250 }, { "epoch": 0.62, "learning_rate": 1.8871344012322504e-06, "logits/chosen": -1.1986305713653564, "logits/rejected": -0.8573341369628906, "logps/chosen": -563.2327270507812, "logps/rejected": -1138.6436767578125, "loss": 0.0999, "rewards/accuracies": 0.875, "rewards/chosen": -0.1444614678621292, "rewards/margins": 0.2410765141248703, "rewards/rejected": -0.3855380117893219, "step": 3260 }, { "epoch": 0.62, "learning_rate": 1.8710330906387288e-06, "logits/chosen": -1.242150068283081, "logits/rejected": -0.8809477686882019, "logps/chosen": -565.7841186523438, "logps/rejected": -1144.38818359375, "loss": 0.0931, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13574641942977905, "rewards/margins": 0.26915091276168823, "rewards/rejected": -0.4048973023891449, "step": 3270 }, { "epoch": 0.62, "learning_rate": 1.8549595850079272e-06, "logits/chosen": -1.6145598888397217, "logits/rejected": -1.0401558876037598, "logps/chosen": -727.5928955078125, "logps/rejected": -1343.35302734375, "loss": 0.0752, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.17629578709602356, "rewards/margins": 0.29485493898391724, "rewards/rejected": -0.4711507260799408, "step": 3280 }, { "epoch": 0.63, "learning_rate": 1.8389145949069953e-06, "logits/chosen": -1.747926950454712, "logits/rejected": -0.7480489015579224, "logps/chosen": -552.2578125, "logps/rejected": -1244.274169921875, "loss": 0.0511, "rewards/accuracies": 0.875, "rewards/chosen": -0.11557585000991821, "rewards/margins": 0.3372827470302582, "rewards/rejected": -0.4528585970401764, "step": 3290 }, { "epoch": 0.63, "learning_rate": 1.8228988296424877e-06, "logits/chosen": -1.5611345767974854, "logits/rejected": -1.1634414196014404, "logps/chosen": -483.97607421875, "logps/rejected": -1324.218505859375, "loss": 0.05, "rewards/accuracies": 0.875, "rewards/chosen": -0.12933804094791412, "rewards/margins": 0.3463062345981598, "rewards/rejected": -0.4756442606449127, "step": 3300 }, { "epoch": 0.63, "learning_rate": 1.806912997229008e-06, "logits/chosen": -1.5756072998046875, "logits/rejected": -1.0822118520736694, "logps/chosen": -690.589599609375, "logps/rejected": -1392.318115234375, "loss": 0.0727, "rewards/accuracies": 0.875, "rewards/chosen": -0.19598130881786346, "rewards/margins": 0.3206397294998169, "rewards/rejected": -0.5166210532188416, "step": 3310 }, { "epoch": 0.63, "learning_rate": 1.7909578043579037e-06, "logits/chosen": -1.2178738117218018, "logits/rejected": -0.8397973775863647, "logps/chosen": -597.6338500976562, "logps/rejected": -1330.0804443359375, "loss": 0.0676, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.16162751615047455, "rewards/margins": 0.312694787979126, "rewards/rejected": -0.47432225942611694, "step": 3320 }, { "epoch": 0.63, "learning_rate": 1.7750339563660346e-06, "logits/chosen": -1.4151805639266968, "logits/rejected": -0.889962375164032, "logps/chosen": -618.2703857421875, "logps/rejected": -1273.2015380859375, "loss": 0.0841, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1959114372730255, "rewards/margins": 0.2935883402824402, "rewards/rejected": -0.4894997477531433, "step": 3330 }, { "epoch": 0.64, "learning_rate": 1.759142157204583e-06, "logits/chosen": -1.6737163066864014, "logits/rejected": -0.7453832030296326, "logps/chosen": -753.6701049804688, "logps/rejected": -1307.577880859375, "loss": 0.068, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2204083949327469, "rewards/margins": 0.2919270992279053, "rewards/rejected": -0.5123355388641357, "step": 3340 }, { "epoch": 0.64, "learning_rate": 1.7432831094079357e-06, "logits/chosen": -1.4744651317596436, "logits/rejected": -0.9374326467514038, "logps/chosen": -700.9337768554688, "logps/rejected": -1292.358642578125, "loss": 0.0965, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.21388837695121765, "rewards/margins": 0.28661873936653137, "rewards/rejected": -0.5005070567131042, "step": 3350 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -1.5619527101516724, "logits/rejected": -0.9883157014846802, "logps/chosen": -719.3538208007812, "logps/rejected": -1369.1768798828125, "loss": 0.0752, "rewards/accuracies": 0.875, "rewards/chosen": -0.17765849828720093, "rewards/margins": 0.314145565032959, "rewards/rejected": -0.4918040633201599, "step": 3360 }, { "epoch": 0.64, "learning_rate": 1.7116660707763637e-06, "logits/chosen": -1.7237812280654907, "logits/rejected": -1.041046380996704, "logps/chosen": -772.4146728515625, "logps/rejected": -1400.0970458984375, "loss": 0.0663, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.17615190148353577, "rewards/margins": 0.32200318574905396, "rewards/rejected": -0.49815505743026733, "step": 3370 }, { "epoch": 0.64, "learning_rate": 1.695909477647054e-06, "logits/chosen": -1.4328358173370361, "logits/rejected": -0.9994909167289734, "logps/chosen": -480.7454528808594, "logps/rejected": -1162.206298828125, "loss": 0.0577, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.07886415719985962, "rewards/margins": 0.31568989157676697, "rewards/rejected": -0.394554078578949, "step": 3380 }, { "epoch": 0.65, "learning_rate": 1.6801884312319893e-06, "logits/chosen": -1.4784748554229736, "logits/rejected": -1.1017252206802368, "logps/chosen": -540.9151611328125, "logps/rejected": -1170.0943603515625, "loss": 0.0879, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11197110265493393, "rewards/margins": 0.2765769362449646, "rewards/rejected": -0.3885480463504791, "step": 3390 }, { "epoch": 0.65, "learning_rate": 1.6645036265170314e-06, "logits/chosen": -1.538333773612976, "logits/rejected": -0.7612485289573669, "logps/chosen": -702.917236328125, "logps/rejected": -1379.6119384765625, "loss": 0.0517, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.15875710546970367, "rewards/margins": 0.3219629228115082, "rewards/rejected": -0.48071998357772827, "step": 3400 }, { "epoch": 0.65, "learning_rate": 1.648855756885893e-06, "logits/chosen": -1.2829601764678955, "logits/rejected": -0.7338923215866089, "logps/chosen": -631.0278930664062, "logps/rejected": -1219.173095703125, "loss": 0.0764, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.17343007028102875, "rewards/margins": 0.2668561637401581, "rewards/rejected": -0.440286248922348, "step": 3410 }, { "epoch": 0.65, "learning_rate": 1.633245514089482e-06, "logits/chosen": -1.5770516395568848, "logits/rejected": -1.0756621360778809, "logps/chosen": -651.3983764648438, "logps/rejected": -1466.8804931640625, "loss": 0.0435, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.14593417942523956, "rewards/margins": 0.37087613344192505, "rewards/rejected": -0.5168102979660034, "step": 3420 }, { "epoch": 0.65, "learning_rate": 1.6176735882153284e-06, "logits/chosen": -1.3698972463607788, "logits/rejected": -0.9525673985481262, "logps/chosen": -620.1276245117188, "logps/rejected": -1276.9837646484375, "loss": 0.0513, "rewards/accuracies": 0.875, "rewards/chosen": -0.1597699522972107, "rewards/margins": 0.31164002418518066, "rewards/rejected": -0.47140997648239136, "step": 3430 }, { "epoch": 0.66, "learning_rate": 1.6021406676570667e-06, "logits/chosen": -1.5987428426742554, "logits/rejected": -1.0756624937057495, "logps/chosen": -580.5691528320312, "logps/rejected": -1147.496826171875, "loss": 0.0691, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.16023506224155426, "rewards/margins": 0.2685411274433136, "rewards/rejected": -0.42877617478370667, "step": 3440 }, { "epoch": 0.66, "learning_rate": 1.5866474390840126e-06, "logits/chosen": -1.8586333990097046, "logits/rejected": -1.290020227432251, "logps/chosen": -646.0453491210938, "logps/rejected": -1250.796630859375, "loss": 0.0797, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1845080554485321, "rewards/margins": 0.30654436349868774, "rewards/rejected": -0.49105244874954224, "step": 3450 }, { "epoch": 0.66, "learning_rate": 1.5711945874108053e-06, "logits/chosen": -1.4048064947128296, "logits/rejected": -0.7289602160453796, "logps/chosen": -714.6954345703125, "logps/rejected": -1281.2022705078125, "loss": 0.0725, "rewards/accuracies": 0.875, "rewards/chosen": -0.20739439129829407, "rewards/margins": 0.28787916898727417, "rewards/rejected": -0.49527353048324585, "step": 3460 }, { "epoch": 0.66, "learning_rate": 1.5557827957671249e-06, "logits/chosen": -1.6873209476470947, "logits/rejected": -0.9829764366149902, "logps/chosen": -678.6327514648438, "logps/rejected": -1256.5675048828125, "loss": 0.0803, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15068724751472473, "rewards/margins": 0.30040639638900757, "rewards/rejected": -0.4510936737060547, "step": 3470 }, { "epoch": 0.66, "learning_rate": 1.5404127454674994e-06, "logits/chosen": -1.405564308166504, "logits/rejected": -0.9794474840164185, "logps/chosen": -574.632568359375, "logps/rejected": -1390.491455078125, "loss": 0.0701, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.14562784135341644, "rewards/margins": 0.3308250308036804, "rewards/rejected": -0.47645288705825806, "step": 3480 }, { "epoch": 0.66, "learning_rate": 1.5250851159811809e-06, "logits/chosen": -1.5441749095916748, "logits/rejected": -1.0190322399139404, "logps/chosen": -519.0746459960938, "logps/rejected": -1312.9346923828125, "loss": 0.0534, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13192638754844666, "rewards/margins": 0.35666215419769287, "rewards/rejected": -0.48858851194381714, "step": 3490 }, { "epoch": 0.67, "learning_rate": 1.509800584902108e-06, "logits/chosen": -1.6326671838760376, "logits/rejected": -1.2049169540405273, "logps/chosen": -533.8888549804688, "logps/rejected": -1213.007080078125, "loss": 0.0797, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.126430943608284, "rewards/margins": 0.29648357629776, "rewards/rejected": -0.42291444540023804, "step": 3500 }, { "epoch": 0.67, "learning_rate": 1.4945598279189565e-06, "logits/chosen": -1.4451848268508911, "logits/rejected": -0.8163963556289673, "logps/chosen": -687.5403442382812, "logps/rejected": -1368.297119140625, "loss": 0.0548, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.20320598781108856, "rewards/margins": 0.3315770626068115, "rewards/rejected": -0.534783124923706, "step": 3510 }, { "epoch": 0.67, "learning_rate": 1.4793635187852622e-06, "logits/chosen": -1.4984921216964722, "logits/rejected": -0.7433010339736938, "logps/chosen": -727.9129638671875, "logps/rejected": -1375.246337890625, "loss": 0.06, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2313799411058426, "rewards/margins": 0.3215225338935852, "rewards/rejected": -0.5529025197029114, "step": 3520 }, { "epoch": 0.67, "learning_rate": 1.4642123292896406e-06, "logits/chosen": -1.4329923391342163, "logits/rejected": -0.9319679141044617, "logps/chosen": -674.9684448242188, "logps/rejected": -1338.0560302734375, "loss": 0.0653, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.21940989792346954, "rewards/margins": 0.30023401975631714, "rewards/rejected": -0.5196439027786255, "step": 3530 }, { "epoch": 0.67, "learning_rate": 1.4491069292260867e-06, "logits/chosen": -1.4247496128082275, "logits/rejected": -1.11383855342865, "logps/chosen": -593.4032592773438, "logps/rejected": -1343.6129150390625, "loss": 0.0415, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.19223792850971222, "rewards/margins": 0.3195153474807739, "rewards/rejected": -0.5117533206939697, "step": 3540 }, { "epoch": 0.68, "learning_rate": 1.4340479863643658e-06, "logits/chosen": -1.6909887790679932, "logits/rejected": -1.1834156513214111, "logps/chosen": -716.8977661132812, "logps/rejected": -1389.580322265625, "loss": 0.0618, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1833467036485672, "rewards/margins": 0.31689900159835815, "rewards/rejected": -0.5002456903457642, "step": 3550 }, { "epoch": 0.68, "learning_rate": 1.4190361664204936e-06, "logits/chosen": -1.6068832874298096, "logits/rejected": -0.9983807802200317, "logps/chosen": -531.9963989257812, "logps/rejected": -1197.336669921875, "loss": 0.069, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13076290488243103, "rewards/margins": 0.31974467635154724, "rewards/rejected": -0.45050764083862305, "step": 3560 }, { "epoch": 0.68, "learning_rate": 1.4040721330273063e-06, "logits/chosen": -1.47737717628479, "logits/rejected": -0.8850958943367004, "logps/chosen": -646.0311889648438, "logps/rejected": -1259.364501953125, "loss": 0.0716, "rewards/accuracies": 0.875, "rewards/chosen": -0.1776318997144699, "rewards/margins": 0.2986915707588196, "rewards/rejected": -0.4763234555721283, "step": 3570 }, { "epoch": 0.68, "learning_rate": 1.3891565477051242e-06, "logits/chosen": -1.8412586450576782, "logits/rejected": -1.2059457302093506, "logps/chosen": -585.1314697265625, "logps/rejected": -1392.73046875, "loss": 0.0587, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1477021872997284, "rewards/margins": 0.34714001417160034, "rewards/rejected": -0.49484220147132874, "step": 3580 }, { "epoch": 0.68, "learning_rate": 1.3742900698325034e-06, "logits/chosen": -1.4215539693832397, "logits/rejected": -0.7746947407722473, "logps/chosen": -621.46044921875, "logps/rejected": -1410.5491943359375, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": -0.18173085153102875, "rewards/margins": 0.3614484369754791, "rewards/rejected": -0.5431792736053467, "step": 3590 }, { "epoch": 0.69, "learning_rate": 1.3594733566170925e-06, "logits/chosen": -1.562574028968811, "logits/rejected": -0.843326210975647, "logps/chosen": -620.8270263671875, "logps/rejected": -1377.2640380859375, "loss": 0.059, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.16770535707473755, "rewards/margins": 0.3291049897670746, "rewards/rejected": -0.4968103766441345, "step": 3600 }, { "epoch": 0.69, "learning_rate": 1.3447070630665771e-06, "logits/chosen": -1.5109186172485352, "logits/rejected": -1.1313542127609253, "logps/chosen": -586.7550659179688, "logps/rejected": -1346.669189453125, "loss": 0.0605, "rewards/accuracies": 0.875, "rewards/chosen": -0.1870235800743103, "rewards/margins": 0.3292957544326782, "rewards/rejected": -0.5163193345069885, "step": 3610 }, { "epoch": 0.69, "learning_rate": 1.329991841959717e-06, "logits/chosen": -1.3923102617263794, "logits/rejected": -0.8757355809211731, "logps/chosen": -666.1890258789062, "logps/rejected": -1339.994873046875, "loss": 0.0668, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1841500997543335, "rewards/margins": 0.3030817210674286, "rewards/rejected": -0.4872317910194397, "step": 3620 }, { "epoch": 0.69, "learning_rate": 1.3153283438175036e-06, "logits/chosen": -1.271019697189331, "logits/rejected": -0.8956009745597839, "logps/chosen": -613.0701904296875, "logps/rejected": -1512.3135986328125, "loss": 0.0526, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1648632436990738, "rewards/margins": 0.3938170373439789, "rewards/rejected": -0.5586802959442139, "step": 3630 }, { "epoch": 0.69, "learning_rate": 1.3007172168743854e-06, "logits/chosen": -1.6713581085205078, "logits/rejected": -1.1510488986968994, "logps/chosen": -534.3785400390625, "logps/rejected": -1104.552978515625, "loss": 0.079, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.12797674536705017, "rewards/margins": 0.2560235857963562, "rewards/rejected": -0.38400033116340637, "step": 3640 }, { "epoch": 0.7, "learning_rate": 1.2861591070496193e-06, "logits/chosen": -1.5630991458892822, "logits/rejected": -0.9822925329208374, "logps/chosen": -699.5548706054688, "logps/rejected": -1303.892333984375, "loss": 0.0574, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.17523935437202454, "rewards/margins": 0.3121577501296997, "rewards/rejected": -0.48739710450172424, "step": 3650 }, { "epoch": 0.7, "learning_rate": 1.271654657918722e-06, "logits/chosen": -1.5802091360092163, "logits/rejected": -0.9614111185073853, "logps/chosen": -617.6490478515625, "logps/rejected": -1294.764404296875, "loss": 0.0456, "rewards/accuracies": 0.875, "rewards/chosen": -0.1591751128435135, "rewards/margins": 0.3368551731109619, "rewards/rejected": -0.496030330657959, "step": 3660 }, { "epoch": 0.7, "learning_rate": 1.2572045106850051e-06, "logits/chosen": -1.7605321407318115, "logits/rejected": -0.6013051867485046, "logps/chosen": -715.6431884765625, "logps/rejected": -1291.907958984375, "loss": 0.0522, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.19700202345848083, "rewards/margins": 0.3319992125034332, "rewards/rejected": -0.5290011167526245, "step": 3670 }, { "epoch": 0.7, "learning_rate": 1.2428093041512418e-06, "logits/chosen": -1.5259217023849487, "logits/rejected": -1.1222094297409058, "logps/chosen": -633.7317504882812, "logps/rejected": -1275.761474609375, "loss": 0.0866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1652495414018631, "rewards/margins": 0.31429868936538696, "rewards/rejected": -0.47954821586608887, "step": 3680 }, { "epoch": 0.7, "learning_rate": 1.2284696746914216e-06, "logits/chosen": -1.5855969190597534, "logits/rejected": -1.0040165185928345, "logps/chosen": -776.5299072265625, "logps/rejected": -1395.8131103515625, "loss": 0.0519, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.20241470634937286, "rewards/margins": 0.31013619899749756, "rewards/rejected": -0.512550950050354, "step": 3690 }, { "epoch": 0.7, "learning_rate": 1.2141862562226164e-06, "logits/chosen": -1.536430835723877, "logits/rejected": -0.9561670422554016, "logps/chosen": -641.7059936523438, "logps/rejected": -1212.5941162109375, "loss": 0.1015, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15809763967990875, "rewards/margins": 0.27246394753456116, "rewards/rejected": -0.4305616021156311, "step": 3700 }, { "epoch": 0.71, "learning_rate": 1.1999596801769617e-06, "logits/chosen": -1.5653895139694214, "logits/rejected": -1.0047844648361206, "logps/chosen": -617.3829345703125, "logps/rejected": -1279.5341796875, "loss": 0.0666, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1557629555463791, "rewards/margins": 0.29767635464668274, "rewards/rejected": -0.45343929529190063, "step": 3710 }, { "epoch": 0.71, "learning_rate": 1.185790575473738e-06, "logits/chosen": -1.5880681276321411, "logits/rejected": -1.3316981792449951, "logps/chosen": -556.1331176757812, "logps/rejected": -1228.2022705078125, "loss": 0.075, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.12465610355138779, "rewards/margins": 0.29106903076171875, "rewards/rejected": -0.41572514176368713, "step": 3720 }, { "epoch": 0.71, "learning_rate": 1.1716795684915728e-06, "logits/chosen": -1.4888153076171875, "logits/rejected": -1.1844497919082642, "logps/chosen": -580.6350708007812, "logps/rejected": -1149.899169921875, "loss": 0.0691, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1421315222978592, "rewards/margins": 0.22847023606300354, "rewards/rejected": -0.37060171365737915, "step": 3730 }, { "epoch": 0.71, "learning_rate": 1.1576272830407418e-06, "logits/chosen": -1.7607017755508423, "logits/rejected": -0.8743270039558411, "logps/chosen": -670.7908325195312, "logps/rejected": -1380.0850830078125, "loss": 0.0511, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.14333508908748627, "rewards/margins": 0.34049367904663086, "rewards/rejected": -0.4838287830352783, "step": 3740 }, { "epoch": 0.71, "learning_rate": 1.1436343403356019e-06, "logits/chosen": -1.5949684381484985, "logits/rejected": -1.1345860958099365, "logps/chosen": -587.1764526367188, "logps/rejected": -1332.0303955078125, "loss": 0.0514, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1502644568681717, "rewards/margins": 0.338855117559433, "rewards/rejected": -0.48911961913108826, "step": 3750 }, { "epoch": 0.72, "learning_rate": 1.129701358967123e-06, "logits/chosen": -1.551690697669983, "logits/rejected": -0.9563910365104675, "logps/chosen": -690.2678833007812, "logps/rejected": -1210.73974609375, "loss": 0.0844, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.20310266315937042, "rewards/margins": 0.24316063523292542, "rewards/rejected": -0.44626325368881226, "step": 3760 }, { "epoch": 0.72, "learning_rate": 1.11582895487554e-06, "logits/chosen": -1.4407981634140015, "logits/rejected": -0.8519558906555176, "logps/chosen": -559.5155639648438, "logps/rejected": -1227.58935546875, "loss": 0.0618, "rewards/accuracies": 0.875, "rewards/chosen": -0.13855567574501038, "rewards/margins": 0.3181162178516388, "rewards/rejected": -0.45667189359664917, "step": 3770 }, { "epoch": 0.72, "learning_rate": 1.1020177413231334e-06, "logits/chosen": -1.3093456029891968, "logits/rejected": -0.7478567361831665, "logps/chosen": -676.5855102539062, "logps/rejected": -1445.390869140625, "loss": 0.0529, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.18879786133766174, "rewards/margins": 0.3267724812030792, "rewards/rejected": -0.515570342540741, "step": 3780 }, { "epoch": 0.72, "learning_rate": 1.0882683288671041e-06, "logits/chosen": -1.4136669635772705, "logits/rejected": -1.191960334777832, "logps/chosen": -555.474609375, "logps/rejected": -1154.699951171875, "loss": 0.0892, "rewards/accuracies": 0.75, "rewards/chosen": -0.11162900924682617, "rewards/margins": 0.25221413373947144, "rewards/rejected": -0.3638431131839752, "step": 3790 }, { "epoch": 0.72, "learning_rate": 1.0745813253325957e-06, "logits/chosen": -1.4562398195266724, "logits/rejected": -1.0913138389587402, "logps/chosen": -537.0150146484375, "logps/rejected": -1177.9039306640625, "loss": 0.0686, "rewards/accuracies": 0.75, "rewards/chosen": -0.1197771430015564, "rewards/margins": 0.2807857394218445, "rewards/rejected": -0.4005628526210785, "step": 3800 }, { "epoch": 0.73, "learning_rate": 1.0609573357858166e-06, "logits/chosen": -1.4760421514511108, "logits/rejected": -0.8424805402755737, "logps/chosen": -640.1893920898438, "logps/rejected": -1284.22998046875, "loss": 0.054, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.15238045156002045, "rewards/margins": 0.2920035719871521, "rewards/rejected": -0.44438403844833374, "step": 3810 }, { "epoch": 0.73, "learning_rate": 1.0473969625072922e-06, "logits/chosen": -1.4656455516815186, "logits/rejected": -0.8885079622268677, "logps/chosen": -703.3538208007812, "logps/rejected": -1381.887939453125, "loss": 0.0727, "rewards/accuracies": 0.875, "rewards/chosen": -0.17700466513633728, "rewards/margins": 0.30304640531539917, "rewards/rejected": -0.48005104064941406, "step": 3820 }, { "epoch": 0.73, "learning_rate": 1.0339008049652427e-06, "logits/chosen": -1.68500554561615, "logits/rejected": -1.075042963027954, "logps/chosen": -619.4908447265625, "logps/rejected": -1248.362060546875, "loss": 0.0699, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.14864543080329895, "rewards/margins": 0.2982180118560791, "rewards/rejected": -0.44686347246170044, "step": 3830 }, { "epoch": 0.73, "learning_rate": 1.0204694597890814e-06, "logits/chosen": -1.7366454601287842, "logits/rejected": -1.0493816137313843, "logps/chosen": -547.7674560546875, "logps/rejected": -1223.79833984375, "loss": 0.049, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09998428821563721, "rewards/margins": 0.3351575434207916, "rewards/rejected": -0.4351418614387512, "step": 3840 }, { "epoch": 0.73, "learning_rate": 1.0071035207430352e-06, "logits/chosen": -1.3894609212875366, "logits/rejected": -1.1518490314483643, "logps/chosen": -581.4041748046875, "logps/rejected": -1280.5262451171875, "loss": 0.0502, "rewards/accuracies": 0.875, "rewards/chosen": -0.13739798963069916, "rewards/margins": 0.3110824525356293, "rewards/rejected": -0.44848042726516724, "step": 3850 }, { "epoch": 0.74, "learning_rate": 9.938035786999018e-07, "logits/chosen": -1.5209680795669556, "logits/rejected": -0.8864911794662476, "logps/chosen": -659.5616455078125, "logps/rejected": -1236.633056640625, "loss": 0.0933, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.17649760842323303, "rewards/margins": 0.27773815393447876, "rewards/rejected": -0.4542357921600342, "step": 3860 }, { "epoch": 0.74, "learning_rate": 9.805702216149252e-07, "logits/chosen": -1.458902359008789, "logits/rejected": -1.0474576950073242, "logps/chosen": -530.28515625, "logps/rejected": -1270.1309814453125, "loss": 0.0446, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1434863954782486, "rewards/margins": 0.303547203540802, "rewards/rejected": -0.4470335841178894, "step": 3870 }, { "epoch": 0.74, "learning_rate": 9.674040344998056e-07, "logits/chosen": -1.5949146747589111, "logits/rejected": -0.9253498911857605, "logps/chosen": -625.6763916015625, "logps/rejected": -1221.9095458984375, "loss": 0.0658, "rewards/accuracies": 0.875, "rewards/chosen": -0.1248815506696701, "rewards/margins": 0.302616685628891, "rewards/rejected": -0.4274982511997223, "step": 3880 }, { "epoch": 0.74, "learning_rate": 9.543055993968339e-07, "logits/chosen": -1.3639336824417114, "logits/rejected": -0.909598708152771, "logps/chosen": -634.0882568359375, "logps/rejected": -1329.5985107421875, "loss": 0.0642, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.15912790596485138, "rewards/margins": 0.32093101739883423, "rewards/rejected": -0.4800589680671692, "step": 3890 }, { "epoch": 0.74, "learning_rate": 9.412754953531664e-07, "logits/chosen": -1.477228045463562, "logits/rejected": -0.9538853764533997, "logps/chosen": -521.6915893554688, "logps/rejected": -1194.7880859375, "loss": 0.0739, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1504165530204773, "rewards/margins": 0.2940949499607086, "rewards/rejected": -0.4445114731788635, "step": 3900 }, { "epoch": 0.74, "learning_rate": 9.283142983952231e-07, "logits/chosen": -1.6704351902008057, "logits/rejected": -0.9384806752204895, "logps/chosen": -702.3519287109375, "logps/rejected": -1316.037353515625, "loss": 0.064, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.16525360941886902, "rewards/margins": 0.308427631855011, "rewards/rejected": -0.4736812710762024, "step": 3910 }, { "epoch": 0.75, "learning_rate": 9.154225815032242e-07, "logits/chosen": -1.5252714157104492, "logits/rejected": -1.1429945230484009, "logps/chosen": -606.1965942382812, "logps/rejected": -1182.3236083984375, "loss": 0.1002, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1689486801624298, "rewards/margins": 0.2557165026664734, "rewards/rejected": -0.4246651530265808, "step": 3920 }, { "epoch": 0.75, "learning_rate": 9.026009145858608e-07, "logits/chosen": -1.4882303476333618, "logits/rejected": -1.0083402395248413, "logps/chosen": -578.5821533203125, "logps/rejected": -1250.6842041015625, "loss": 0.0846, "rewards/accuracies": 0.875, "rewards/chosen": -0.14970554411411285, "rewards/margins": 0.30618828535079956, "rewards/rejected": -0.4558938443660736, "step": 3930 }, { "epoch": 0.75, "learning_rate": 8.898498644550973e-07, "logits/chosen": -1.6834971904754639, "logits/rejected": -1.1314411163330078, "logps/chosen": -688.9229736328125, "logps/rejected": -1269.325927734375, "loss": 0.056, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.16720907390117645, "rewards/margins": 0.29985159635543823, "rewards/rejected": -0.4670606553554535, "step": 3940 }, { "epoch": 0.75, "learning_rate": 8.771699948011203e-07, "logits/chosen": -1.572192668914795, "logits/rejected": -0.983639121055603, "logps/chosen": -495.31317138671875, "logps/rejected": -1169.4541015625, "loss": 0.0708, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11250798404216766, "rewards/margins": 0.3055083155632019, "rewards/rejected": -0.4180162847042084, "step": 3950 }, { "epoch": 0.75, "learning_rate": 8.645618661674144e-07, "logits/chosen": -1.3754651546478271, "logits/rejected": -1.0094373226165771, "logps/chosen": -727.6187744140625, "logps/rejected": -1270.610107421875, "loss": 0.0845, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.18097665905952454, "rewards/margins": 0.2794339954853058, "rewards/rejected": -0.4604106545448303, "step": 3960 }, { "epoch": 0.76, "learning_rate": 8.520260359259822e-07, "logits/chosen": -1.5351568460464478, "logits/rejected": -1.067787528038025, "logps/chosen": -595.1707763671875, "logps/rejected": -1178.1083984375, "loss": 0.0752, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15523064136505127, "rewards/margins": 0.2633352279663086, "rewards/rejected": -0.4185658395290375, "step": 3970 }, { "epoch": 0.76, "learning_rate": 8.395630582527075e-07, "logits/chosen": -1.5825989246368408, "logits/rejected": -1.1352492570877075, "logps/chosen": -509.427978515625, "logps/rejected": -1257.157958984375, "loss": 0.0771, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13573279976844788, "rewards/margins": 0.3318541347980499, "rewards/rejected": -0.4675869345664978, "step": 3980 }, { "epoch": 0.76, "learning_rate": 8.271734841028553e-07, "logits/chosen": -1.5050828456878662, "logits/rejected": -0.9955355525016785, "logps/chosen": -755.04931640625, "logps/rejected": -1391.7149658203125, "loss": 0.074, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.22364020347595215, "rewards/margins": 0.31569841504096985, "rewards/rejected": -0.5393385291099548, "step": 3990 }, { "epoch": 0.76, "learning_rate": 8.148578611867114e-07, "logits/chosen": -1.5540293455123901, "logits/rejected": -1.1930735111236572, "logps/chosen": -556.0262451171875, "logps/rejected": -1192.371337890625, "loss": 0.0738, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1541956663131714, "rewards/margins": 0.28172677755355835, "rewards/rejected": -0.43592238426208496, "step": 4000 }, { "epoch": 0.76, "learning_rate": 8.026167339453792e-07, "logits/chosen": -1.4712079763412476, "logits/rejected": -0.9827225804328918, "logps/chosen": -662.958251953125, "logps/rejected": -1321.115966796875, "loss": 0.0744, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.19676116108894348, "rewards/margins": 0.2918079197406769, "rewards/rejected": -0.48856911063194275, "step": 4010 }, { "epoch": 0.77, "learning_rate": 7.904506435266998e-07, "logits/chosen": -1.639409065246582, "logits/rejected": -1.1449193954467773, "logps/chosen": -582.4534912109375, "logps/rejected": -1267.9315185546875, "loss": 0.0727, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1511387825012207, "rewards/margins": 0.3111017644405365, "rewards/rejected": -0.4622405469417572, "step": 4020 }, { "epoch": 0.77, "learning_rate": 7.783601277613378e-07, "logits/chosen": -1.5379669666290283, "logits/rejected": -1.0621458292007446, "logps/chosen": -552.1363525390625, "logps/rejected": -1202.6796875, "loss": 0.0671, "rewards/accuracies": 0.875, "rewards/chosen": -0.14880362153053284, "rewards/margins": 0.3249647617340088, "rewards/rejected": -0.47376832365989685, "step": 4030 }, { "epoch": 0.77, "learning_rate": 7.66345721139003e-07, "logits/chosen": -1.3945927619934082, "logits/rejected": -1.0799853801727295, "logps/chosen": -627.4310302734375, "logps/rejected": -1245.181396484375, "loss": 0.0679, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1866941750049591, "rewards/margins": 0.2926812171936035, "rewards/rejected": -0.47937536239624023, "step": 4040 }, { "epoch": 0.77, "learning_rate": 7.544079547848183e-07, "logits/chosen": -1.474120855331421, "logits/rejected": -0.9780243635177612, "logps/chosen": -651.1190795898438, "logps/rejected": -1327.578857421875, "loss": 0.0686, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.18713738024234772, "rewards/margins": 0.31425991654396057, "rewards/rejected": -0.5013972520828247, "step": 4050 }, { "epoch": 0.77, "learning_rate": 7.425473564358457e-07, "logits/chosen": -1.621582269668579, "logits/rejected": -1.1246297359466553, "logps/chosen": -581.9295654296875, "logps/rejected": -1041.8333740234375, "loss": 0.1021, "rewards/accuracies": 0.75, "rewards/chosen": -0.16944041848182678, "rewards/margins": 0.22699041664600372, "rewards/rejected": -0.3964308202266693, "step": 4060 }, { "epoch": 0.78, "learning_rate": 7.307644504177539e-07, "logits/chosen": -1.3632748126983643, "logits/rejected": -0.8100827932357788, "logps/chosen": -676.1319580078125, "logps/rejected": -1327.113525390625, "loss": 0.0714, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.19097070395946503, "rewards/margins": 0.3000251352787018, "rewards/rejected": -0.4909958839416504, "step": 4070 }, { "epoch": 0.78, "learning_rate": 7.190597576216385e-07, "logits/chosen": -1.6218783855438232, "logits/rejected": -1.0738632678985596, "logps/chosen": -640.3385620117188, "logps/rejected": -1275.8609619140625, "loss": 0.0817, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1903303861618042, "rewards/margins": 0.2934018075466156, "rewards/rejected": -0.4837321639060974, "step": 4080 }, { "epoch": 0.78, "learning_rate": 7.074337954809945e-07, "logits/chosen": -1.6243613958358765, "logits/rejected": -1.0085480213165283, "logps/chosen": -593.51025390625, "logps/rejected": -1382.5362548828125, "loss": 0.0511, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1565353125333786, "rewards/margins": 0.3839346170425415, "rewards/rejected": -0.5404700040817261, "step": 4090 }, { "epoch": 0.78, "learning_rate": 6.958870779488447e-07, "logits/chosen": -1.8175923824310303, "logits/rejected": -0.9546326398849487, "logps/chosen": -654.0431518554688, "logps/rejected": -1326.212890625, "loss": 0.0702, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1587834656238556, "rewards/margins": 0.3209785521030426, "rewards/rejected": -0.4797619879245758, "step": 4100 }, { "epoch": 0.78, "learning_rate": 6.844201154750176e-07, "logits/chosen": -1.4412815570831299, "logits/rejected": -0.8726056218147278, "logps/chosen": -592.4741821289062, "logps/rejected": -1309.003662109375, "loss": 0.0606, "rewards/accuracies": 0.875, "rewards/chosen": -0.161200150847435, "rewards/margins": 0.32679271697998047, "rewards/rejected": -0.4879928529262543, "step": 4110 }, { "epoch": 0.78, "learning_rate": 6.730334149835788e-07, "logits/chosen": -1.6734164953231812, "logits/rejected": -1.2678444385528564, "logps/chosen": -545.2771606445312, "logps/rejected": -1146.37890625, "loss": 0.0729, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15675309300422668, "rewards/margins": 0.28524309396743774, "rewards/rejected": -0.44199615716934204, "step": 4120 }, { "epoch": 0.79, "learning_rate": 6.617274798504286e-07, "logits/chosen": -1.476194977760315, "logits/rejected": -0.7991756200790405, "logps/chosen": -661.20166015625, "logps/rejected": -1299.816162109375, "loss": 0.086, "rewards/accuracies": 0.875, "rewards/chosen": -0.1620057076215744, "rewards/margins": 0.303924024105072, "rewards/rejected": -0.4659297466278076, "step": 4130 }, { "epoch": 0.79, "learning_rate": 6.505028098810407e-07, "logits/chosen": -1.4836968183517456, "logits/rejected": -0.7697210311889648, "logps/chosen": -673.612060546875, "logps/rejected": -1344.207763671875, "loss": 0.0575, "rewards/accuracies": 0.875, "rewards/chosen": -0.1491011381149292, "rewards/margins": 0.332655131816864, "rewards/rejected": -0.4817562699317932, "step": 4140 }, { "epoch": 0.79, "learning_rate": 6.393599012883709e-07, "logits/chosen": -1.7030470371246338, "logits/rejected": -0.9431253671646118, "logps/chosen": -698.5299682617188, "logps/rejected": -1296.638427734375, "loss": 0.0826, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17169280350208282, "rewards/margins": 0.28853195905685425, "rewards/rejected": -0.46022477746009827, "step": 4150 }, { "epoch": 0.79, "learning_rate": 6.282992466709247e-07, "logits/chosen": -1.876447319984436, "logits/rejected": -1.2787059545516968, "logps/chosen": -610.8107299804688, "logps/rejected": -1240.9732666015625, "loss": 0.0764, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15487447381019592, "rewards/margins": 0.2935008704662323, "rewards/rejected": -0.4483753740787506, "step": 4160 }, { "epoch": 0.79, "learning_rate": 6.17321334990973e-07, "logits/chosen": -1.378609538078308, "logits/rejected": -1.188201665878296, "logps/chosen": -569.0336303710938, "logps/rejected": -1221.126220703125, "loss": 0.1055, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.18387310206890106, "rewards/margins": 0.2487374097108841, "rewards/rejected": -0.43261051177978516, "step": 4170 }, { "epoch": 0.8, "learning_rate": 6.064266515529419e-07, "logits/chosen": -1.5385997295379639, "logits/rejected": -0.8599594235420227, "logps/chosen": -761.7626953125, "logps/rejected": -1329.2393798828125, "loss": 0.059, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1731938272714615, "rewards/margins": 0.30250200629234314, "rewards/rejected": -0.47569578886032104, "step": 4180 }, { "epoch": 0.8, "learning_rate": 5.956156779819586e-07, "logits/chosen": -1.5333473682403564, "logits/rejected": -0.9734029769897461, "logps/chosen": -637.819091796875, "logps/rejected": -1298.0240478515625, "loss": 0.0533, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.17162570357322693, "rewards/margins": 0.32062727212905884, "rewards/rejected": -0.49225300550460815, "step": 4190 }, { "epoch": 0.8, "learning_rate": 5.848888922025553e-07, "logits/chosen": -1.6408611536026, "logits/rejected": -1.1250369548797607, "logps/chosen": -636.6069946289062, "logps/rejected": -1349.956787109375, "loss": 0.0653, "rewards/accuracies": 0.875, "rewards/chosen": -0.1537880152463913, "rewards/margins": 0.3114011287689209, "rewards/rejected": -0.4651891589164734, "step": 4200 }, { "epoch": 0.8, "learning_rate": 5.742467684175473e-07, "logits/chosen": -1.7282884120941162, "logits/rejected": -0.9091650247573853, "logps/chosen": -654.1572875976562, "logps/rejected": -1369.8782958984375, "loss": 0.0578, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1490090787410736, "rewards/margins": 0.3345029652118683, "rewards/rejected": -0.4835120141506195, "step": 4210 }, { "epoch": 0.8, "learning_rate": 5.636897770870667e-07, "logits/chosen": -1.5277897119522095, "logits/rejected": -1.0950522422790527, "logps/chosen": -621.8223876953125, "logps/rejected": -1188.8853759765625, "loss": 0.0727, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.16900405287742615, "rewards/margins": 0.27010002732276917, "rewards/rejected": -0.4391040802001953, "step": 4220 }, { "epoch": 0.81, "learning_rate": 5.532183849077651e-07, "logits/chosen": -1.478398323059082, "logits/rejected": -0.7619593739509583, "logps/chosen": -704.496826171875, "logps/rejected": -1236.497314453125, "loss": 0.0832, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1664319932460785, "rewards/margins": 0.30853888392448425, "rewards/rejected": -0.47497090697288513, "step": 4230 }, { "epoch": 0.81, "learning_rate": 5.428330547921809e-07, "logits/chosen": -1.4886611700057983, "logits/rejected": -1.0063542127609253, "logps/chosen": -659.728515625, "logps/rejected": -1120.3165283203125, "loss": 0.1056, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.18079857528209686, "rewards/margins": 0.2402995079755783, "rewards/rejected": -0.4210980534553528, "step": 4240 }, { "epoch": 0.81, "learning_rate": 5.32534245848278e-07, "logits/chosen": -1.6218599081039429, "logits/rejected": -1.0438997745513916, "logps/chosen": -647.7769165039062, "logps/rejected": -1325.986572265625, "loss": 0.0708, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1493169367313385, "rewards/margins": 0.3103331923484802, "rewards/rejected": -0.45965009927749634, "step": 4250 }, { "epoch": 0.81, "learning_rate": 5.223224133591475e-07, "logits/chosen": -1.5376250743865967, "logits/rejected": -1.0985280275344849, "logps/chosen": -623.9844970703125, "logps/rejected": -1423.852783203125, "loss": 0.0325, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.15335962176322937, "rewards/margins": 0.3516461253166199, "rewards/rejected": -0.5050057172775269, "step": 4260 }, { "epoch": 0.81, "learning_rate": 5.121980087628802e-07, "logits/chosen": -1.5560121536254883, "logits/rejected": -1.2112586498260498, "logps/chosen": -535.8369140625, "logps/rejected": -1187.4632568359375, "loss": 0.0843, "rewards/accuracies": 0.875, "rewards/chosen": -0.14772644639015198, "rewards/margins": 0.28468599915504456, "rewards/rejected": -0.43241238594055176, "step": 4270 }, { "epoch": 0.82, "learning_rate": 5.021614796326155e-07, "logits/chosen": -1.6284434795379639, "logits/rejected": -0.9607194662094116, "logps/chosen": -645.3671264648438, "logps/rejected": -1252.421630859375, "loss": 0.0695, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1374301016330719, "rewards/margins": 0.2948032021522522, "rewards/rejected": -0.43223339319229126, "step": 4280 }, { "epoch": 0.82, "learning_rate": 4.922132696567463e-07, "logits/chosen": -1.6314195394515991, "logits/rejected": -1.032524824142456, "logps/chosen": -592.7301025390625, "logps/rejected": -1306.8658447265625, "loss": 0.0372, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12905898690223694, "rewards/margins": 0.3327252268791199, "rewards/rejected": -0.4617842137813568, "step": 4290 }, { "epoch": 0.82, "learning_rate": 4.823538186193097e-07, "logits/chosen": -1.5528547763824463, "logits/rejected": -1.1627963781356812, "logps/chosen": -598.49267578125, "logps/rejected": -1283.17138671875, "loss": 0.0738, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.13689306378364563, "rewards/margins": 0.32022175192832947, "rewards/rejected": -0.4571148753166199, "step": 4300 }, { "epoch": 0.82, "learning_rate": 4.725835623805494e-07, "logits/chosen": -1.7339166402816772, "logits/rejected": -1.2147337198257446, "logps/chosen": -593.4891357421875, "logps/rejected": -1166.099609375, "loss": 0.057, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1361609250307083, "rewards/margins": 0.2860185503959656, "rewards/rejected": -0.42217954993247986, "step": 4310 }, { "epoch": 0.82, "learning_rate": 4.6290293285763816e-07, "logits/chosen": -1.5797579288482666, "logits/rejected": -0.8604723215103149, "logps/chosen": -692.3845825195312, "logps/rejected": -1367.1331787109375, "loss": 0.0554, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1601569652557373, "rewards/margins": 0.337593138217926, "rewards/rejected": -0.49775010347366333, "step": 4320 }, { "epoch": 0.82, "learning_rate": 4.533123580055909e-07, "logits/chosen": -1.7544052600860596, "logits/rejected": -1.1722230911254883, "logps/chosen": -622.83203125, "logps/rejected": -1235.314208984375, "loss": 0.0853, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1056605726480484, "rewards/margins": 0.32745251059532166, "rewards/rejected": -0.4331130385398865, "step": 4330 }, { "epoch": 0.83, "learning_rate": 4.438122617983442e-07, "logits/chosen": -1.6120226383209229, "logits/rejected": -0.8405524492263794, "logps/chosen": -629.9571533203125, "logps/rejected": -1155.5582275390625, "loss": 0.0841, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13840670883655548, "rewards/margins": 0.28906354308128357, "rewards/rejected": -0.42747029662132263, "step": 4340 }, { "epoch": 0.83, "learning_rate": 4.344030642100133e-07, "logits/chosen": -1.6726274490356445, "logits/rejected": -1.0421268939971924, "logps/chosen": -618.4395751953125, "logps/rejected": -1348.0491943359375, "loss": 0.0565, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.14258643984794617, "rewards/margins": 0.3459554612636566, "rewards/rejected": -0.4885419011116028, "step": 4350 }, { "epoch": 0.83, "learning_rate": 4.250851811963236e-07, "logits/chosen": -1.7452752590179443, "logits/rejected": -1.0764765739440918, "logps/chosen": -628.9611206054688, "logps/rejected": -1291.5279541015625, "loss": 0.0524, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.13416460156440735, "rewards/margins": 0.33325523138046265, "rewards/rejected": -0.4674198031425476, "step": 4360 }, { "epoch": 0.83, "learning_rate": 4.158590246762278e-07, "logits/chosen": -1.4894119501113892, "logits/rejected": -1.0432569980621338, "logps/chosen": -569.1922607421875, "logps/rejected": -1308.944580078125, "loss": 0.0654, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14465302228927612, "rewards/margins": 0.32924753427505493, "rewards/rejected": -0.4739004969596863, "step": 4370 }, { "epoch": 0.83, "learning_rate": 4.0672500251369204e-07, "logits/chosen": -1.4088810682296753, "logits/rejected": -0.937311053276062, "logps/chosen": -637.6170043945312, "logps/rejected": -1380.856689453125, "loss": 0.0363, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1410633772611618, "rewards/margins": 0.3729560971260071, "rewards/rejected": -0.5140194892883301, "step": 4380 }, { "epoch": 0.84, "learning_rate": 3.976835184996644e-07, "logits/chosen": -1.6045525074005127, "logits/rejected": -1.086709976196289, "logps/chosen": -607.0584716796875, "logps/rejected": -1337.4051513671875, "loss": 0.0612, "rewards/accuracies": 0.875, "rewards/chosen": -0.155624121427536, "rewards/margins": 0.3269900679588318, "rewards/rejected": -0.482614129781723, "step": 4390 }, { "epoch": 0.84, "learning_rate": 3.887349723342304e-07, "logits/chosen": -1.848745346069336, "logits/rejected": -1.1570312976837158, "logps/chosen": -744.2691650390625, "logps/rejected": -1386.67724609375, "loss": 0.0422, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.18313486874103546, "rewards/margins": 0.3392416536808014, "rewards/rejected": -0.5223765969276428, "step": 4400 }, { "epoch": 0.84, "learning_rate": 3.798797596089351e-07, "logits/chosen": -1.6429369449615479, "logits/rejected": -1.0298631191253662, "logps/chosen": -593.02685546875, "logps/rejected": -1241.770751953125, "loss": 0.0617, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1318318098783493, "rewards/margins": 0.3228714168071747, "rewards/rejected": -0.4547031819820404, "step": 4410 }, { "epoch": 0.84, "learning_rate": 3.711182717893011e-07, "logits/chosen": -1.507433295249939, "logits/rejected": -0.786311686038971, "logps/chosen": -643.9988403320312, "logps/rejected": -1273.1219482421875, "loss": 0.071, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17599640786647797, "rewards/margins": 0.2902034521102905, "rewards/rejected": -0.46619993448257446, "step": 4420 }, { "epoch": 0.84, "learning_rate": 3.624508961975215e-07, "logits/chosen": -1.328204870223999, "logits/rejected": -1.0989032983779907, "logps/chosen": -629.1240234375, "logps/rejected": -1093.088623046875, "loss": 0.1225, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.18698564171791077, "rewards/margins": 0.21734988689422607, "rewards/rejected": -0.40433555841445923, "step": 4430 }, { "epoch": 0.85, "learning_rate": 3.538780159953348e-07, "logits/chosen": -1.5716381072998047, "logits/rejected": -0.8230584859848022, "logps/chosen": -647.3327026367188, "logps/rejected": -1299.0325927734375, "loss": 0.0529, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.14860112965106964, "rewards/margins": 0.3516216576099396, "rewards/rejected": -0.5002228021621704, "step": 4440 }, { "epoch": 0.85, "learning_rate": 3.454000101670901e-07, "logits/chosen": -1.4817876815795898, "logits/rejected": -0.9819103479385376, "logps/chosen": -719.157958984375, "logps/rejected": -1345.7685546875, "loss": 0.0874, "rewards/accuracies": 0.875, "rewards/chosen": -0.17831948399543762, "rewards/margins": 0.2975057065486908, "rewards/rejected": -0.4758252203464508, "step": 4450 }, { "epoch": 0.85, "learning_rate": 3.3701725350299143e-07, "logits/chosen": -1.5737899541854858, "logits/rejected": -0.9576247930526733, "logps/chosen": -624.8809204101562, "logps/rejected": -1283.3790283203125, "loss": 0.0594, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.15519653260707855, "rewards/margins": 0.31781700253486633, "rewards/rejected": -0.4730134904384613, "step": 4460 }, { "epoch": 0.85, "learning_rate": 3.2873011658252796e-07, "logits/chosen": -1.6041154861450195, "logits/rejected": -0.9867879748344421, "logps/chosen": -616.98876953125, "logps/rejected": -1315.84765625, "loss": 0.0685, "rewards/accuracies": 0.875, "rewards/chosen": -0.16912952065467834, "rewards/margins": 0.3227527439594269, "rewards/rejected": -0.49188223481178284, "step": 4470 }, { "epoch": 0.85, "learning_rate": 3.2053896575809426e-07, "logits/chosen": -1.7311811447143555, "logits/rejected": -1.067882776260376, "logps/chosen": -649.1055908203125, "logps/rejected": -1258.640869140625, "loss": 0.0901, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17431046068668365, "rewards/margins": 0.3118690252304077, "rewards/rejected": -0.48617953062057495, "step": 4480 }, { "epoch": 0.86, "learning_rate": 3.124441631387931e-07, "logits/chosen": -1.523450255393982, "logits/rejected": -1.0439492464065552, "logps/chosen": -491.62249755859375, "logps/rejected": -1279.021484375, "loss": 0.0445, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.12501509487628937, "rewards/margins": 0.3468592166900635, "rewards/rejected": -0.47187429666519165, "step": 4490 }, { "epoch": 0.86, "learning_rate": 3.044460665744284e-07, "logits/chosen": -1.5553473234176636, "logits/rejected": -1.015411138534546, "logps/chosen": -621.4290771484375, "logps/rejected": -1193.322265625, "loss": 0.0624, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17289629578590393, "rewards/margins": 0.2942846715450287, "rewards/rejected": -0.4671810269355774, "step": 4500 }, { "epoch": 0.86, "learning_rate": 2.9654502963968575e-07, "logits/chosen": -1.6609184741973877, "logits/rejected": -0.9597708582878113, "logps/chosen": -598.8553466796875, "logps/rejected": -1200.1343994140625, "loss": 0.0725, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14991316199302673, "rewards/margins": 0.2949966788291931, "rewards/rejected": -0.44490987062454224, "step": 4510 }, { "epoch": 0.86, "learning_rate": 2.8874140161849915e-07, "logits/chosen": -1.6514486074447632, "logits/rejected": -0.9042898416519165, "logps/chosen": -669.432861328125, "logps/rejected": -1172.0797119140625, "loss": 0.0877, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16250768303871155, "rewards/margins": 0.28728577494621277, "rewards/rejected": -0.4497934877872467, "step": 4520 }, { "epoch": 0.86, "learning_rate": 2.810355274886148e-07, "logits/chosen": -1.4443236589431763, "logits/rejected": -1.129891037940979, "logps/chosen": -550.1930541992188, "logps/rejected": -1096.160888671875, "loss": 0.0956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1617933064699173, "rewards/margins": 0.23670299351215363, "rewards/rejected": -0.39849624037742615, "step": 4530 }, { "epoch": 0.86, "learning_rate": 2.7342774790633686e-07, "logits/chosen": -1.6275829076766968, "logits/rejected": -1.0061298608779907, "logps/chosen": -649.8400268554688, "logps/rejected": -1309.129150390625, "loss": 0.0815, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.17947590351104736, "rewards/margins": 0.28794533014297485, "rewards/rejected": -0.4674212336540222, "step": 4540 }, { "epoch": 0.87, "learning_rate": 2.6591839919146963e-07, "logits/chosen": -1.280903697013855, "logits/rejected": -0.8944345712661743, "logps/chosen": -608.0111083984375, "logps/rejected": -1276.568115234375, "loss": 0.0597, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1773298680782318, "rewards/margins": 0.3045726418495178, "rewards/rejected": -0.48190250992774963, "step": 4550 }, { "epoch": 0.87, "learning_rate": 2.58507813312448e-07, "logits/chosen": -1.6265952587127686, "logits/rejected": -0.855617344379425, "logps/chosen": -606.3870849609375, "logps/rejected": -1226.21533203125, "loss": 0.0519, "rewards/accuracies": 0.875, "rewards/chosen": -0.11800136417150497, "rewards/margins": 0.32250529527664185, "rewards/rejected": -0.4405066967010498, "step": 4560 }, { "epoch": 0.87, "learning_rate": 2.511963178716648e-07, "logits/chosen": -1.8203041553497314, "logits/rejected": -1.3360464572906494, "logps/chosen": -547.7073974609375, "logps/rejected": -1084.524658203125, "loss": 0.0827, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.17347629368305206, "rewards/margins": 0.2538864314556122, "rewards/rejected": -0.42736274003982544, "step": 4570 }, { "epoch": 0.87, "learning_rate": 2.439842360909864e-07, "logits/chosen": -1.8694589138031006, "logits/rejected": -1.0154645442962646, "logps/chosen": -750.2979736328125, "logps/rejected": -1419.476806640625, "loss": 0.0595, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.18350553512573242, "rewards/margins": 0.32696714997291565, "rewards/rejected": -0.5104726552963257, "step": 4580 }, { "epoch": 0.87, "learning_rate": 2.3687188679746314e-07, "logits/chosen": -1.623376488685608, "logits/rejected": -0.7891206741333008, "logps/chosen": -701.0450439453125, "logps/rejected": -1194.2939453125, "loss": 0.0759, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.19170424342155457, "rewards/margins": 0.27900928258895874, "rewards/rejected": -0.4707135558128357, "step": 4590 }, { "epoch": 0.88, "learning_rate": 2.2985958440923772e-07, "logits/chosen": -1.697003960609436, "logits/rejected": -1.1143566370010376, "logps/chosen": -708.6148071289062, "logps/rejected": -1290.4073486328125, "loss": 0.0789, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.183920219540596, "rewards/margins": 0.282669335603714, "rewards/rejected": -0.4665895402431488, "step": 4600 }, { "epoch": 0.88, "learning_rate": 2.2294763892164284e-07, "logits/chosen": -1.4680585861206055, "logits/rejected": -0.733687162399292, "logps/chosen": -641.66748046875, "logps/rejected": -1253.496826171875, "loss": 0.0761, "rewards/accuracies": 0.875, "rewards/chosen": -0.17868050932884216, "rewards/margins": 0.30219537019729614, "rewards/rejected": -0.4808759093284607, "step": 4610 }, { "epoch": 0.88, "learning_rate": 2.1613635589349756e-07, "logits/chosen": -1.4635512828826904, "logits/rejected": -1.1326031684875488, "logps/chosen": -508.17340087890625, "logps/rejected": -1159.115966796875, "loss": 0.0711, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.12790942192077637, "rewards/margins": 0.3032786250114441, "rewards/rejected": -0.43118801712989807, "step": 4620 }, { "epoch": 0.88, "learning_rate": 2.094260364336026e-07, "logits/chosen": -1.6897389888763428, "logits/rejected": -1.156468391418457, "logps/chosen": -556.9920043945312, "logps/rejected": -1199.290771484375, "loss": 0.088, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14459089934825897, "rewards/margins": 0.27455028891563416, "rewards/rejected": -0.4191412031650543, "step": 4630 }, { "epoch": 0.88, "learning_rate": 2.0281697718742333e-07, "logits/chosen": -1.6482601165771484, "logits/rejected": -0.9699563980102539, "logps/chosen": -712.9674072265625, "logps/rejected": -1220.4947509765625, "loss": 0.0685, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.17597351968288422, "rewards/margins": 0.28940483927726746, "rewards/rejected": -0.4653783440589905, "step": 4640 }, { "epoch": 0.89, "learning_rate": 1.9630947032398068e-07, "logits/chosen": -1.8412061929702759, "logits/rejected": -1.1319139003753662, "logps/chosen": -749.4605712890625, "logps/rejected": -1322.244384765625, "loss": 0.06, "rewards/accuracies": 0.875, "rewards/chosen": -0.14990632236003876, "rewards/margins": 0.32965174317359924, "rewards/rejected": -0.4795580804347992, "step": 4650 }, { "epoch": 0.89, "learning_rate": 1.899038035229342e-07, "logits/chosen": -1.5682942867279053, "logits/rejected": -0.939906895160675, "logps/chosen": -631.8695678710938, "logps/rejected": -1207.721435546875, "loss": 0.073, "rewards/accuracies": 0.875, "rewards/chosen": -0.14423295855522156, "rewards/margins": 0.2942690849304199, "rewards/rejected": -0.4385020136833191, "step": 4660 }, { "epoch": 0.89, "learning_rate": 1.8360025996186138e-07, "logits/chosen": -1.2831768989562988, "logits/rejected": -0.8119879961013794, "logps/chosen": -607.0415649414062, "logps/rejected": -1206.13720703125, "loss": 0.0877, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1657409518957138, "rewards/margins": 0.2819930911064148, "rewards/rejected": -0.44773411750793457, "step": 4670 }, { "epoch": 0.89, "learning_rate": 1.7739911830374352e-07, "logits/chosen": -1.6281124353408813, "logits/rejected": -1.1297900676727295, "logps/chosen": -720.3666381835938, "logps/rejected": -1451.5751953125, "loss": 0.0598, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.18103721737861633, "rewards/margins": 0.3197404742240906, "rewards/rejected": -0.5007776618003845, "step": 4680 }, { "epoch": 0.89, "learning_rate": 1.713006526846439e-07, "logits/chosen": -1.5305252075195312, "logits/rejected": -1.1208873987197876, "logps/chosen": -668.1603393554688, "logps/rejected": -1300.840087890625, "loss": 0.0905, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.18247172236442566, "rewards/margins": 0.2804272174835205, "rewards/rejected": -0.46289896965026855, "step": 4690 }, { "epoch": 0.9, "learning_rate": 1.6530513270159116e-07, "logits/chosen": -1.6757923364639282, "logits/rejected": -1.1312105655670166, "logps/chosen": -585.9445190429688, "logps/rejected": -1365.5906982421875, "loss": 0.0651, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.12228063493967056, "rewards/margins": 0.36077576875686646, "rewards/rejected": -0.48305636644363403, "step": 4700 }, { "epoch": 0.9, "learning_rate": 1.59412823400657e-07, "logits/chosen": -1.3578886985778809, "logits/rejected": -0.8729084730148315, "logps/chosen": -595.7457885742188, "logps/rejected": -1204.9522705078125, "loss": 0.0826, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17481914162635803, "rewards/margins": 0.26105111837387085, "rewards/rejected": -0.43587031960487366, "step": 4710 }, { "epoch": 0.9, "learning_rate": 1.5362398526524463e-07, "logits/chosen": -1.5783363580703735, "logits/rejected": -0.9245842099189758, "logps/chosen": -709.2633056640625, "logps/rejected": -1288.1170654296875, "loss": 0.0861, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.16297248005867004, "rewards/margins": 0.2818123698234558, "rewards/rejected": -0.44478487968444824, "step": 4720 }, { "epoch": 0.9, "learning_rate": 1.4793887420457008e-07, "logits/chosen": -1.2609602212905884, "logits/rejected": -0.8738704919815063, "logps/chosen": -675.8863525390625, "logps/rejected": -1262.517333984375, "loss": 0.0905, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.18329861760139465, "rewards/margins": 0.28873276710510254, "rewards/rejected": -0.4720313549041748, "step": 4730 }, { "epoch": 0.9, "learning_rate": 1.4235774154234855e-07, "logits/chosen": -1.5686185359954834, "logits/rejected": -0.8845101594924927, "logps/chosen": -705.612548828125, "logps/rejected": -1317.673828125, "loss": 0.0753, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1770937442779541, "rewards/margins": 0.28883716464042664, "rewards/rejected": -0.4659309387207031, "step": 4740 }, { "epoch": 0.9, "learning_rate": 1.368808340056879e-07, "logits/chosen": -1.370813012123108, "logits/rejected": -1.0830169916152954, "logps/chosen": -518.2303466796875, "logps/rejected": -1131.841064453125, "loss": 0.0874, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12415562570095062, "rewards/margins": 0.28626319766044617, "rewards/rejected": -0.4104188084602356, "step": 4750 }, { "epoch": 0.91, "learning_rate": 1.31508393714177e-07, "logits/chosen": -1.6772340536117554, "logits/rejected": -1.01282799243927, "logps/chosen": -624.1659545898438, "logps/rejected": -1270.8428955078125, "loss": 0.0725, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1336231231689453, "rewards/margins": 0.3113330006599426, "rewards/rejected": -0.44495612382888794, "step": 4760 }, { "epoch": 0.91, "learning_rate": 1.2624065816918414e-07, "logits/chosen": -1.4747987985610962, "logits/rejected": -0.975101113319397, "logps/chosen": -650.6563720703125, "logps/rejected": -1342.7335205078125, "loss": 0.0655, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15981096029281616, "rewards/margins": 0.30302125215530396, "rewards/rejected": -0.4628322124481201, "step": 4770 }, { "epoch": 0.91, "learning_rate": 1.210778602433596e-07, "logits/chosen": -1.564624547958374, "logits/rejected": -1.0374842882156372, "logps/chosen": -614.6488037109375, "logps/rejected": -1203.357421875, "loss": 0.0821, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15904925763607025, "rewards/margins": 0.27073806524276733, "rewards/rejected": -0.4297873079776764, "step": 4780 }, { "epoch": 0.91, "learning_rate": 1.1602022817033709e-07, "logits/chosen": -1.6003332138061523, "logits/rejected": -0.8900405764579773, "logps/chosen": -649.1990966796875, "logps/rejected": -1338.3707275390625, "loss": 0.0775, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15437167882919312, "rewards/margins": 0.34441012144088745, "rewards/rejected": -0.49878183007240295, "step": 4790 }, { "epoch": 0.91, "learning_rate": 1.1106798553464804e-07, "logits/chosen": -1.5427950620651245, "logits/rejected": -0.9831592440605164, "logps/chosen": -677.7523193359375, "logps/rejected": -1397.6802978515625, "loss": 0.0643, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.18140892684459686, "rewards/margins": 0.33899572491645813, "rewards/rejected": -0.5204046964645386, "step": 4800 }, { "epoch": 0.92, "learning_rate": 1.0622135126183514e-07, "logits/chosen": -1.4138776063919067, "logits/rejected": -0.6474016308784485, "logps/chosen": -728.1995849609375, "logps/rejected": -1283.4852294921875, "loss": 0.0618, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.192392498254776, "rewards/margins": 0.2717262804508209, "rewards/rejected": -0.4641187787055969, "step": 4810 }, { "epoch": 0.92, "learning_rate": 1.0148053960877396e-07, "logits/chosen": -1.398097276687622, "logits/rejected": -0.9553489685058594, "logps/chosen": -585.2307739257812, "logps/rejected": -1273.9364013671875, "loss": 0.0691, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1406571865081787, "rewards/margins": 0.3191017210483551, "rewards/rejected": -0.4597589373588562, "step": 4820 }, { "epoch": 0.92, "learning_rate": 9.684576015420277e-08, "logits/chosen": -1.5144402980804443, "logits/rejected": -0.7728510499000549, "logps/chosen": -749.4060668945312, "logps/rejected": -1322.1160888671875, "loss": 0.0965, "rewards/accuracies": 0.75, "rewards/chosen": -0.2126808613538742, "rewards/margins": 0.2606582045555115, "rewards/rejected": -0.4733390808105469, "step": 4830 }, { "epoch": 0.92, "learning_rate": 9.23172177894574e-08, "logits/chosen": -1.6027752161026, "logits/rejected": -0.8554502725601196, "logps/chosen": -739.6917724609375, "logps/rejected": -1458.238037109375, "loss": 0.0339, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.17026378214359283, "rewards/margins": 0.35099369287490845, "rewards/rejected": -0.5212574601173401, "step": 4840 }, { "epoch": 0.92, "learning_rate": 8.78951127094127e-08, "logits/chosen": -1.6803109645843506, "logits/rejected": -0.9320834279060364, "logps/chosen": -662.111572265625, "logps/rejected": -1162.46484375, "loss": 0.0716, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15743741393089294, "rewards/margins": 0.2909800708293915, "rewards/rejected": -0.4484175145626068, "step": 4850 }, { "epoch": 0.93, "learning_rate": 8.357964040363209e-08, "logits/chosen": -1.7106218338012695, "logits/rejected": -0.9925411343574524, "logps/chosen": -710.9759521484375, "logps/rejected": -1317.148193359375, "loss": 0.0773, "rewards/accuracies": 0.875, "rewards/chosen": -0.16667839884757996, "rewards/margins": 0.2834816575050354, "rewards/rejected": -0.45016008615493774, "step": 4860 }, { "epoch": 0.93, "learning_rate": 7.937099164772699e-08, "logits/chosen": -1.3424303531646729, "logits/rejected": -1.0499579906463623, "logps/chosen": -523.9136962890625, "logps/rejected": -1224.3201904296875, "loss": 0.0886, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15465199947357178, "rewards/margins": 0.29365110397338867, "rewards/rejected": -0.44830313324928284, "step": 4870 }, { "epoch": 0.93, "learning_rate": 7.526935249492245e-08, "logits/chosen": -1.6472418308258057, "logits/rejected": -0.9855870008468628, "logps/chosen": -636.9083251953125, "logps/rejected": -1183.853515625, "loss": 0.0971, "rewards/accuracies": 0.875, "rewards/chosen": -0.15656232833862305, "rewards/margins": 0.2960873544216156, "rewards/rejected": -0.45264968276023865, "step": 4880 }, { "epoch": 0.93, "learning_rate": 7.127490426783124e-08, "logits/chosen": -1.6242201328277588, "logits/rejected": -1.0149205923080444, "logps/chosen": -678.8060302734375, "logps/rejected": -1352.32470703125, "loss": 0.0638, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.14568215608596802, "rewards/margins": 0.3156759738922119, "rewards/rejected": -0.4613581597805023, "step": 4890 }, { "epoch": 0.93, "learning_rate": 6.738782355044048e-08, "logits/chosen": -1.3191697597503662, "logits/rejected": -1.201279878616333, "logps/chosen": -721.3082275390625, "logps/rejected": -1354.2021484375, "loss": 0.076, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.17443808913230896, "rewards/margins": 0.2681798040866852, "rewards/rejected": -0.44261789321899414, "step": 4900 }, { "epoch": 0.94, "learning_rate": 6.360828218030191e-08, "logits/chosen": -1.8827226161956787, "logits/rejected": -1.1188879013061523, "logps/chosen": -634.8402709960938, "logps/rejected": -1260.1407470703125, "loss": 0.0734, "rewards/accuracies": 0.875, "rewards/chosen": -0.1493465006351471, "rewards/margins": 0.29650747776031494, "rewards/rejected": -0.44585394859313965, "step": 4910 }, { "epoch": 0.94, "learning_rate": 5.993644724093889e-08, "logits/chosen": -1.6838546991348267, "logits/rejected": -0.9050960540771484, "logps/chosen": -648.4398803710938, "logps/rejected": -1251.8382568359375, "loss": 0.0554, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1434290111064911, "rewards/margins": 0.3137533664703369, "rewards/rejected": -0.4571823477745056, "step": 4920 }, { "epoch": 0.94, "learning_rate": 5.637248105445775e-08, "logits/chosen": -1.7320117950439453, "logits/rejected": -1.1256321668624878, "logps/chosen": -539.5606079101562, "logps/rejected": -1056.583251953125, "loss": 0.0972, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1480817347764969, "rewards/margins": 0.2356722354888916, "rewards/rejected": -0.3837539255619049, "step": 4930 }, { "epoch": 0.94, "learning_rate": 5.291654117437262e-08, "logits/chosen": -1.7257087230682373, "logits/rejected": -0.9479360580444336, "logps/chosen": -557.167724609375, "logps/rejected": -1281.1934814453125, "loss": 0.0506, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11244988441467285, "rewards/margins": 0.33333298563957214, "rewards/rejected": -0.4457828402519226, "step": 4940 }, { "epoch": 0.94, "learning_rate": 4.956878037864044e-08, "logits/chosen": -1.407482385635376, "logits/rejected": -0.8275833129882812, "logps/chosen": -626.5960693359375, "logps/rejected": -1172.99755859375, "loss": 0.0545, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1533767133951187, "rewards/margins": 0.2846202254295349, "rewards/rejected": -0.4379969537258148, "step": 4950 }, { "epoch": 0.94, "learning_rate": 4.632934666290778e-08, "logits/chosen": -1.4791433811187744, "logits/rejected": -1.235567569732666, "logps/chosen": -508.14501953125, "logps/rejected": -1220.041259765625, "loss": 0.0576, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.13363158702850342, "rewards/margins": 0.29521042108535767, "rewards/rejected": -0.4288419783115387, "step": 4960 }, { "epoch": 0.95, "learning_rate": 4.319838323396691e-08, "logits/chosen": -1.6887500286102295, "logits/rejected": -1.1927210092544556, "logps/chosen": -635.3477783203125, "logps/rejected": -1253.641357421875, "loss": 0.1026, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16893738508224487, "rewards/margins": 0.25178489089012146, "rewards/rejected": -0.4207223057746887, "step": 4970 }, { "epoch": 0.95, "learning_rate": 4.017602850342584e-08, "logits/chosen": -1.5601575374603271, "logits/rejected": -0.8450828790664673, "logps/chosen": -624.2867431640625, "logps/rejected": -1262.434814453125, "loss": 0.0557, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1422443836927414, "rewards/margins": 0.29309436678886414, "rewards/rejected": -0.43533873558044434, "step": 4980 }, { "epoch": 0.95, "learning_rate": 3.7262416081589866e-08, "logits/chosen": -1.684685468673706, "logits/rejected": -1.0103908777236938, "logps/chosen": -624.5326538085938, "logps/rejected": -1345.8338623046875, "loss": 0.0487, "rewards/accuracies": 0.875, "rewards/chosen": -0.14531221985816956, "rewards/margins": 0.3458930253982544, "rewards/rejected": -0.49120527505874634, "step": 4990 }, { "epoch": 0.95, "learning_rate": 3.445767477155443e-08, "logits/chosen": -1.6047395467758179, "logits/rejected": -1.083069086074829, "logps/chosen": -439.0870666503906, "logps/rejected": -1007.619140625, "loss": 0.088, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10899704694747925, "rewards/margins": 0.26882943511009216, "rewards/rejected": -0.3778265118598938, "step": 5000 }, { "epoch": 0.95, "learning_rate": 3.1761928563510956e-08, "logits/chosen": -1.5430387258529663, "logits/rejected": -1.1478708982467651, "logps/chosen": -566.592041015625, "logps/rejected": -1102.06201171875, "loss": 0.0955, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12917637825012207, "rewards/margins": 0.2524365484714508, "rewards/rejected": -0.38161295652389526, "step": 5010 }, { "epoch": 0.96, "learning_rate": 2.917529662926549e-08, "logits/chosen": -1.4810032844543457, "logits/rejected": -1.0692861080169678, "logps/chosen": -544.7088623046875, "logps/rejected": -1293.71435546875, "loss": 0.044, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11322245746850967, "rewards/margins": 0.34361839294433594, "rewards/rejected": -0.4568409025669098, "step": 5020 }, { "epoch": 0.96, "learning_rate": 2.669789331697148e-08, "logits/chosen": -1.4107751846313477, "logits/rejected": -0.8879677057266235, "logps/chosen": -596.4759521484375, "logps/rejected": -1232.4378662109375, "loss": 0.0773, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15274113416671753, "rewards/margins": 0.30707958340644836, "rewards/rejected": -0.4598206877708435, "step": 5030 }, { "epoch": 0.96, "learning_rate": 2.4329828146074096e-08, "logits/chosen": -1.6700639724731445, "logits/rejected": -1.160444974899292, "logps/chosen": -638.5947265625, "logps/rejected": -1382.1214599609375, "loss": 0.0714, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.16618075966835022, "rewards/margins": 0.3303069770336151, "rewards/rejected": -0.49648770689964294, "step": 5040 }, { "epoch": 0.96, "learning_rate": 2.20712058024683e-08, "logits/chosen": -1.7391884326934814, "logits/rejected": -1.0803216695785522, "logps/chosen": -749.6804809570312, "logps/rejected": -1278.514404296875, "loss": 0.0449, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1625753939151764, "rewards/margins": 0.3114554286003113, "rewards/rejected": -0.47403082251548767, "step": 5050 }, { "epoch": 0.96, "learning_rate": 1.9922126133870568e-08, "logits/chosen": -1.579906702041626, "logits/rejected": -1.032274603843689, "logps/chosen": -662.6795654296875, "logps/rejected": -1298.6641845703125, "loss": 0.0915, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.15554921329021454, "rewards/margins": 0.2925891578197479, "rewards/rejected": -0.4481383264064789, "step": 5060 }, { "epoch": 0.97, "learning_rate": 1.7882684145406616e-08, "logits/chosen": -1.3243744373321533, "logits/rejected": -1.0081312656402588, "logps/chosen": -598.328857421875, "logps/rejected": -1325.436279296875, "loss": 0.0686, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1774941235780716, "rewards/margins": 0.2830308973789215, "rewards/rejected": -0.4605250358581543, "step": 5070 }, { "epoch": 0.97, "learning_rate": 1.595296999541057e-08, "logits/chosen": -1.549423336982727, "logits/rejected": -0.9221774935722351, "logps/chosen": -638.889404296875, "logps/rejected": -1374.27783203125, "loss": 0.074, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.17353455722332, "rewards/margins": 0.3390573561191559, "rewards/rejected": -0.5125918984413147, "step": 5080 }, { "epoch": 0.97, "learning_rate": 1.4133068991437903e-08, "logits/chosen": -1.5194743871688843, "logits/rejected": -1.1621986627578735, "logps/chosen": -572.447509765625, "logps/rejected": -1069.026123046875, "loss": 0.0848, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14077235758304596, "rewards/margins": 0.23328928649425507, "rewards/rejected": -0.374061644077301, "step": 5090 }, { "epoch": 0.97, "learning_rate": 1.2423061586496476e-08, "logits/chosen": -1.5293216705322266, "logits/rejected": -1.161948561668396, "logps/chosen": -603.0601806640625, "logps/rejected": -1281.250244140625, "loss": 0.0547, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15855684876441956, "rewards/margins": 0.3035425543785095, "rewards/rejected": -0.46209946274757385, "step": 5100 }, { "epoch": 0.97, "learning_rate": 1.0823023375489128e-08, "logits/chosen": -1.5255159139633179, "logits/rejected": -1.010140299797058, "logps/chosen": -623.25244140625, "logps/rejected": -1228.103759765625, "loss": 0.0726, "rewards/accuracies": 0.875, "rewards/chosen": -0.15898485481739044, "rewards/margins": 0.28266894817352295, "rewards/rejected": -0.4416537880897522, "step": 5110 }, { "epoch": 0.98, "learning_rate": 9.333025091870507e-09, "logits/chosen": -1.6246131658554077, "logits/rejected": -0.83955317735672, "logps/chosen": -676.0503540039062, "logps/rejected": -1231.9215087890625, "loss": 0.0798, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1609552502632141, "rewards/margins": 0.28405335545539856, "rewards/rejected": -0.44500860571861267, "step": 5120 }, { "epoch": 0.98, "learning_rate": 7.95313260452263e-09, "logits/chosen": -1.470959186553955, "logits/rejected": -0.8849459886550903, "logps/chosen": -626.1944580078125, "logps/rejected": -1259.2894287109375, "loss": 0.0592, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15113981068134308, "rewards/margins": 0.2945057451725006, "rewards/rejected": -0.4456455707550049, "step": 5130 }, { "epoch": 0.98, "learning_rate": 6.683406914840818e-09, "logits/chosen": -1.423765778541565, "logits/rejected": -0.8854363560676575, "logps/chosen": -525.4611206054688, "logps/rejected": -1161.06005859375, "loss": 0.0656, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12872517108917236, "rewards/margins": 0.2833263576030731, "rewards/rejected": -0.4120515286922455, "step": 5140 }, { "epoch": 0.98, "learning_rate": 5.523904154037529e-09, "logits/chosen": -1.4726181030273438, "logits/rejected": -1.0987781286239624, "logps/chosen": -612.3428344726562, "logps/rejected": -1266.933837890625, "loss": 0.0767, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.17042508721351624, "rewards/margins": 0.2595577538013458, "rewards/rejected": -0.42998284101486206, "step": 5150 }, { "epoch": 0.98, "learning_rate": 4.474675580662113e-09, "logits/chosen": -1.48684561252594, "logits/rejected": -0.8791742324829102, "logps/chosen": -561.6771850585938, "logps/rejected": -1201.2552490234375, "loss": 0.0813, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.145443856716156, "rewards/margins": 0.2917470335960388, "rewards/rejected": -0.4371909201145172, "step": 5160 }, { "epoch": 0.98, "learning_rate": 3.5357675783331823e-09, "logits/chosen": -1.7405544519424438, "logits/rejected": -1.0294724702835083, "logps/chosen": -674.488037109375, "logps/rejected": -1273.2628173828125, "loss": 0.0624, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.15219458937644958, "rewards/margins": 0.3184397518634796, "rewards/rejected": -0.4706343710422516, "step": 5170 }, { "epoch": 0.99, "learning_rate": 2.7072216536885855e-09, "logits/chosen": -1.6870332956314087, "logits/rejected": -0.7788273096084595, "logps/chosen": -656.57421875, "logps/rejected": -1370.481201171875, "loss": 0.0638, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.15338242053985596, "rewards/margins": 0.3432921767234802, "rewards/rejected": -0.4966745972633362, "step": 5180 }, { "epoch": 0.99, "learning_rate": 1.989074434551874e-09, "logits/chosen": -1.3593934774398804, "logits/rejected": -0.986729621887207, "logps/chosen": -659.7386474609375, "logps/rejected": -1238.4710693359375, "loss": 0.0713, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.17776012420654297, "rewards/margins": 0.27383026480674744, "rewards/rejected": -0.4515903890132904, "step": 5190 }, { "epoch": 0.99, "learning_rate": 1.3813576683111007e-09, "logits/chosen": -1.5167112350463867, "logits/rejected": -0.8397325277328491, "logps/chosen": -599.6024169921875, "logps/rejected": -1328.2471923828125, "loss": 0.0568, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13601288199424744, "rewards/margins": 0.34515735507011414, "rewards/rejected": -0.48117026686668396, "step": 5200 }, { "epoch": 0.99, "learning_rate": 8.840982205160498e-10, "logits/chosen": -1.4586691856384277, "logits/rejected": -0.971697211265564, "logps/chosen": -614.6513671875, "logps/rejected": -1244.3607177734375, "loss": 0.0793, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14397016167640686, "rewards/margins": 0.2893129885196686, "rewards/rejected": -0.43328309059143066, "step": 5210 }, { "epoch": 0.99, "learning_rate": 4.973180736911332e-10, "logits/chosen": -1.6655791997909546, "logits/rejected": -1.1275146007537842, "logps/chosen": -627.2606201171875, "logps/rejected": -1310.1925048828125, "loss": 0.0681, "rewards/accuracies": 0.875, "rewards/chosen": -0.14530961215496063, "rewards/margins": 0.30237385630607605, "rewards/rejected": -0.4476834833621979, "step": 5220 }, { "epoch": 1.0, "learning_rate": 2.2103432636366718e-10, "logits/chosen": -1.3939180374145508, "logits/rejected": -0.9449717402458191, "logps/chosen": -691.179443359375, "logps/rejected": -1282.6971435546875, "loss": 0.0802, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.180943563580513, "rewards/margins": 0.28968390822410583, "rewards/rejected": -0.47062739729881287, "step": 5230 }, { "epoch": 1.0, "learning_rate": 5.525919230670029e-11, "logits/chosen": -1.4143139123916626, "logits/rejected": -1.0170581340789795, "logps/chosen": -506.2384338378906, "logps/rejected": -1102.7977294921875, "loss": 0.0889, "rewards/accuracies": 0.75, "rewards/chosen": -0.15023784339427948, "rewards/margins": 0.26667970418930054, "rewards/rejected": -0.4169175624847412, "step": 5240 }, { "epoch": 1.0, "learning_rate": 0.0, "logits/chosen": -1.594805359840393, "logits/rejected": -0.6739069223403931, "logps/chosen": -637.98779296875, "logps/rejected": -1227.662841796875, "loss": 0.0656, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.17544154822826385, "rewards/margins": 0.29885655641555786, "rewards/rejected": -0.4742980897426605, "step": 5250 }, { "epoch": 1.0, "step": 5250, "total_flos": 0.0, "train_loss": 0.07836547029586065, "train_runtime": 22365.132, "train_samples_per_second": 0.939, "train_steps_per_second": 0.235 } ], "logging_steps": 10, "max_steps": 5250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }