diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7394 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 9.523809523809524e-09, + "logits/chosen": -1.2850065231323242, + "logits/rejected": -0.5420928001403809, + "logps/chosen": -849.5712890625, + "logps/rejected": -659.8780517578125, + "loss": 0.2593, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 9.523809523809525e-08, + "logits/chosen": -1.6733099222183228, + "logits/rejected": -1.1435589790344238, + "logps/chosen": -407.6390075683594, + "logps/rejected": -750.5330200195312, + "loss": 0.1933, + "rewards/accuracies": 0.4722222089767456, + "rewards/chosen": 0.0002648972731549293, + "rewards/margins": 0.0002683588827494532, + "rewards/rejected": -3.461622554823407e-06, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 1.904761904761905e-07, + "logits/chosen": -1.5639336109161377, + "logits/rejected": -1.1012383699417114, + "logps/chosen": -465.546875, + "logps/rejected": -766.64697265625, + "loss": 0.2397, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": -0.00022372114472091198, + "rewards/margins": -2.5915365768014453e-05, + "rewards/rejected": -0.00019780578440986574, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 2.8571428571428575e-07, + "logits/chosen": -1.6892967224121094, + "logits/rejected": -1.0408269166946411, + "logps/chosen": -473.25262451171875, + "logps/rejected": -808.1934204101562, + "loss": 0.2234, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0015700621297582984, + "rewards/margins": 0.0015648994594812393, + "rewards/rejected": 5.1628012442961335e-06, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 3.80952380952381e-07, + "logits/chosen": -1.7225767374038696, + "logits/rejected": -1.1273142099380493, + "logps/chosen": -457.7410583496094, + "logps/rejected": -813.6912841796875, + "loss": 0.2055, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.002732831286266446, + "rewards/margins": 0.005175677128136158, + "rewards/rejected": -0.0024428460747003555, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 4.7619047619047623e-07, + "logits/chosen": -1.5604654550552368, + "logits/rejected": -0.9060885310173035, + "logps/chosen": -554.2913208007812, + "logps/rejected": -849.5115356445312, + "loss": 0.1971, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.00811075884848833, + "rewards/margins": 0.014176970347762108, + "rewards/rejected": -0.006066213361918926, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 5.714285714285715e-07, + "logits/chosen": -1.687990427017212, + "logits/rejected": -1.2494174242019653, + "logps/chosen": -386.49517822265625, + "logps/rejected": -765.5401611328125, + "loss": 0.2104, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.011224821209907532, + "rewards/margins": 0.017030417919158936, + "rewards/rejected": -0.005805597640573978, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 6.666666666666667e-07, + "logits/chosen": -1.2933578491210938, + "logits/rejected": -1.1579430103302002, + "logps/chosen": -321.789306640625, + "logps/rejected": -766.0114135742188, + "loss": 0.1858, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.009114318527281284, + "rewards/margins": 0.020903298631310463, + "rewards/rejected": -0.011788980104029179, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 7.61904761904762e-07, + "logits/chosen": -1.6945091485977173, + "logits/rejected": -1.1569772958755493, + "logps/chosen": -363.9807434082031, + "logps/rejected": -795.9345703125, + "loss": 0.1731, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.010179867967963219, + "rewards/margins": 0.04508482292294502, + "rewards/rejected": -0.034904953092336655, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 8.571428571428572e-07, + "logits/chosen": -1.481457233428955, + "logits/rejected": -1.0560935735702515, + "logps/chosen": -575.8737182617188, + "logps/rejected": -885.8480224609375, + "loss": 0.1728, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0052037788555026054, + "rewards/margins": 0.05343114212155342, + "rewards/rejected": -0.04822736978530884, + "step": 90 + }, + { + "epoch": 0.02, + "learning_rate": 9.523809523809525e-07, + "logits/chosen": -1.6370322704315186, + "logits/rejected": -0.8078746795654297, + "logps/chosen": -443.8648986816406, + "logps/rejected": -836.8392333984375, + "loss": 0.1715, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.012666692025959492, + "rewards/margins": 0.07541676610708237, + "rewards/rejected": -0.06275007873773575, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 1.0476190476190478e-06, + "logits/chosen": -1.531712293624878, + "logits/rejected": -1.1594440937042236, + "logps/chosen": -445.06427001953125, + "logps/rejected": -885.134765625, + "loss": 0.1563, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.0009547686204314232, + "rewards/margins": 0.08301910012960434, + "rewards/rejected": -0.08206433802843094, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 1.142857142857143e-06, + "logits/chosen": -1.5637714862823486, + "logits/rejected": -1.008846640586853, + "logps/chosen": -559.6546020507812, + "logps/rejected": -1037.636962890625, + "loss": 0.1188, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.025890862569212914, + "rewards/margins": 0.11646576225757599, + "rewards/rejected": -0.14235660433769226, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 1.2380952380952382e-06, + "logits/chosen": -1.6181094646453857, + "logits/rejected": -0.9313281178474426, + "logps/chosen": -543.5655517578125, + "logps/rejected": -926.2550048828125, + "loss": 0.1378, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04493476450443268, + "rewards/margins": 0.11141650378704071, + "rewards/rejected": -0.1563512682914734, + "step": 130 + }, + { + "epoch": 0.03, + "learning_rate": 1.3333333333333334e-06, + "logits/chosen": -1.6120615005493164, + "logits/rejected": -1.0635985136032104, + "logps/chosen": -567.2598876953125, + "logps/rejected": -1004.9212646484375, + "loss": 0.0996, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.09899488091468811, + "rewards/margins": 0.13413195312023163, + "rewards/rejected": -0.23312684893608093, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 1.4285714285714286e-06, + "logits/chosen": -1.6825717687606812, + "logits/rejected": -1.081813097000122, + "logps/chosen": -560.16943359375, + "logps/rejected": -930.1064453125, + "loss": 0.1257, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.0978316217660904, + "rewards/margins": 0.12988914549350739, + "rewards/rejected": -0.22772076725959778, + "step": 150 + }, + { + "epoch": 0.03, + "learning_rate": 1.523809523809524e-06, + "logits/chosen": -1.777329683303833, + "logits/rejected": -1.2507864236831665, + "logps/chosen": -512.7439575195312, + "logps/rejected": -984.9051513671875, + "loss": 0.1239, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.05691201239824295, + "rewards/margins": 0.1501971185207367, + "rewards/rejected": -0.20710912346839905, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 1.6190476190476193e-06, + "logits/chosen": -1.6942923069000244, + "logits/rejected": -1.1983150243759155, + "logps/chosen": -547.2807006835938, + "logps/rejected": -1033.548095703125, + "loss": 0.0948, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09753384441137314, + "rewards/margins": 0.16666623950004578, + "rewards/rejected": -0.2642000913619995, + "step": 170 + }, + { + "epoch": 0.03, + "learning_rate": 1.7142857142857145e-06, + "logits/chosen": -1.689398169517517, + "logits/rejected": -1.2449653148651123, + "logps/chosen": -630.5906982421875, + "logps/rejected": -1181.1785888671875, + "loss": 0.0868, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1752820611000061, + "rewards/margins": 0.17259114980697632, + "rewards/rejected": -0.3478732109069824, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 1.8095238095238097e-06, + "logits/chosen": -1.5203440189361572, + "logits/rejected": -0.9198516011238098, + "logps/chosen": -666.8179931640625, + "logps/rejected": -1204.99072265625, + "loss": 0.1113, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19398172199726105, + "rewards/margins": 0.22316697239875793, + "rewards/rejected": -0.4171486794948578, + "step": 190 + }, + { + "epoch": 0.04, + "learning_rate": 1.904761904761905e-06, + "logits/chosen": -1.682255744934082, + "logits/rejected": -1.4462255239486694, + "logps/chosen": -530.3055419921875, + "logps/rejected": -1100.1158447265625, + "loss": 0.0997, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13800662755966187, + "rewards/margins": 0.17010769248008728, + "rewards/rejected": -0.30811434984207153, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -1.6906871795654297, + "logits/rejected": -0.9419037103652954, + "logps/chosen": -618.3939819335938, + "logps/rejected": -1106.7994384765625, + "loss": 0.0988, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13316693902015686, + "rewards/margins": 0.18527300655841827, + "rewards/rejected": -0.3184399902820587, + "step": 210 + }, + { + "epoch": 0.04, + "learning_rate": 2.0952380952380955e-06, + "logits/chosen": -1.7209393978118896, + "logits/rejected": -0.9391362071037292, + "logps/chosen": -616.0027465820312, + "logps/rejected": -1283.7279052734375, + "loss": 0.0825, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.16547217965126038, + "rewards/margins": 0.23828014731407166, + "rewards/rejected": -0.40375232696533203, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 2.1904761904761908e-06, + "logits/chosen": -1.472318410873413, + "logits/rejected": -0.8261554837226868, + "logps/chosen": -656.9349365234375, + "logps/rejected": -1206.586669921875, + "loss": 0.0923, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16559532284736633, + "rewards/margins": 0.2132759392261505, + "rewards/rejected": -0.37887123227119446, + "step": 230 + }, + { + "epoch": 0.05, + "learning_rate": 2.285714285714286e-06, + "logits/chosen": -1.5672019720077515, + "logits/rejected": -1.0501900911331177, + "logps/chosen": -499.3685607910156, + "logps/rejected": -1164.9814453125, + "loss": 0.0738, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.08818192780017853, + "rewards/margins": 0.24911494553089142, + "rewards/rejected": -0.33729690313339233, + "step": 240 + }, + { + "epoch": 0.05, + "learning_rate": 2.380952380952381e-06, + "logits/chosen": -1.6451988220214844, + "logits/rejected": -1.2338879108428955, + "logps/chosen": -619.9409790039062, + "logps/rejected": -1280.0615234375, + "loss": 0.0972, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15977323055267334, + "rewards/margins": 0.23027309775352478, + "rewards/rejected": -0.3900463879108429, + "step": 250 + }, + { + "epoch": 0.05, + "learning_rate": 2.4761904761904764e-06, + "logits/chosen": -1.847777009010315, + "logits/rejected": -0.9876956939697266, + "logps/chosen": -674.3220825195312, + "logps/rejected": -1227.2720947265625, + "loss": 0.0709, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18131622672080994, + "rewards/margins": 0.23836851119995117, + "rewards/rejected": -0.4196847975254059, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 2.571428571428571e-06, + "logits/chosen": -1.6134955883026123, + "logits/rejected": -1.0790008306503296, + "logps/chosen": -647.0198974609375, + "logps/rejected": -1282.9854736328125, + "loss": 0.0938, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.209293395280838, + "rewards/margins": 0.25759097933769226, + "rewards/rejected": -0.4668843150138855, + "step": 270 + }, + { + "epoch": 0.05, + "learning_rate": 2.666666666666667e-06, + "logits/chosen": -1.8159784078598022, + "logits/rejected": -1.351468801498413, + "logps/chosen": -561.7814331054688, + "logps/rejected": -1274.318603515625, + "loss": 0.067, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14992132782936096, + "rewards/margins": 0.2866609990596771, + "rewards/rejected": -0.43658238649368286, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 2.7619047619047625e-06, + "logits/chosen": -1.7803318500518799, + "logits/rejected": -1.1618740558624268, + "logps/chosen": -521.2305908203125, + "logps/rejected": -1099.5322265625, + "loss": 0.0655, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09962662309408188, + "rewards/margins": 0.25189143419265747, + "rewards/rejected": -0.35151809453964233, + "step": 290 + }, + { + "epoch": 0.06, + "learning_rate": 2.8571428571428573e-06, + "logits/chosen": -1.6528995037078857, + "logits/rejected": -1.113143801689148, + "logps/chosen": -613.2098999023438, + "logps/rejected": -1197.0404052734375, + "loss": 0.0966, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1718660593032837, + "rewards/margins": 0.22291307151317596, + "rewards/rejected": -0.39477914571762085, + "step": 300 + }, + { + "epoch": 0.06, + "learning_rate": 2.9523809523809525e-06, + "logits/chosen": -1.746258020401001, + "logits/rejected": -1.273449182510376, + "logps/chosen": -666.4026489257812, + "logps/rejected": -1100.668212890625, + "loss": 0.1579, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.18353891372680664, + "rewards/margins": 0.17098166048526764, + "rewards/rejected": -0.3545205295085907, + "step": 310 + }, + { + "epoch": 0.06, + "learning_rate": 3.047619047619048e-06, + "logits/chosen": -1.8009140491485596, + "logits/rejected": -0.9418787956237793, + "logps/chosen": -602.3572387695312, + "logps/rejected": -1305.9537353515625, + "loss": 0.059, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.09984590113162994, + "rewards/margins": 0.3039381206035614, + "rewards/rejected": -0.40378403663635254, + "step": 320 + }, + { + "epoch": 0.06, + "learning_rate": 3.142857142857143e-06, + "logits/chosen": -1.5505433082580566, + "logits/rejected": -1.3617745637893677, + "logps/chosen": -469.3348083496094, + "logps/rejected": -1237.8416748046875, + "loss": 0.0744, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.10851670801639557, + "rewards/margins": 0.29302558302879333, + "rewards/rejected": -0.4015422761440277, + "step": 330 + }, + { + "epoch": 0.06, + "learning_rate": 3.2380952380952385e-06, + "logits/chosen": -1.691335678100586, + "logits/rejected": -1.183778166770935, + "logps/chosen": -566.4237060546875, + "logps/rejected": -1063.508544921875, + "loss": 0.081, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.13905121386051178, + "rewards/margins": 0.18825221061706543, + "rewards/rejected": -0.3273034691810608, + "step": 340 + }, + { + "epoch": 0.07, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -1.6669670343399048, + "logits/rejected": -1.1208057403564453, + "logps/chosen": -542.494384765625, + "logps/rejected": -1139.1849365234375, + "loss": 0.09, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13446304202079773, + "rewards/margins": 0.23915879428386688, + "rewards/rejected": -0.3736218512058258, + "step": 350 + }, + { + "epoch": 0.07, + "learning_rate": 3.428571428571429e-06, + "logits/chosen": -1.7811037302017212, + "logits/rejected": -1.2279610633850098, + "logps/chosen": -610.7708129882812, + "logps/rejected": -1237.0147705078125, + "loss": 0.0792, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15628622472286224, + "rewards/margins": 0.25117939710617065, + "rewards/rejected": -0.4074656069278717, + "step": 360 + }, + { + "epoch": 0.07, + "learning_rate": 3.523809523809524e-06, + "logits/chosen": -1.7308986186981201, + "logits/rejected": -1.027928113937378, + "logps/chosen": -556.2271728515625, + "logps/rejected": -1258.7716064453125, + "loss": 0.0658, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.10915093123912811, + "rewards/margins": 0.3074415326118469, + "rewards/rejected": -0.4165925085544586, + "step": 370 + }, + { + "epoch": 0.07, + "learning_rate": 3.6190476190476194e-06, + "logits/chosen": -1.7067712545394897, + "logits/rejected": -1.2722686529159546, + "logps/chosen": -495.68060302734375, + "logps/rejected": -1135.3636474609375, + "loss": 0.1078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09428682923316956, + "rewards/margins": 0.24129095673561096, + "rewards/rejected": -0.33557775616645813, + "step": 380 + }, + { + "epoch": 0.07, + "learning_rate": 3.7142857142857146e-06, + "logits/chosen": -1.8631842136383057, + "logits/rejected": -1.3056199550628662, + "logps/chosen": -689.7511596679688, + "logps/rejected": -1181.241455078125, + "loss": 0.0914, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2153368443250656, + "rewards/margins": 0.2260267436504364, + "rewards/rejected": -0.4413636326789856, + "step": 390 + }, + { + "epoch": 0.08, + "learning_rate": 3.80952380952381e-06, + "logits/chosen": -1.782406210899353, + "logits/rejected": -1.061374545097351, + "logps/chosen": -587.0792236328125, + "logps/rejected": -1274.716796875, + "loss": 0.0899, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16100212931632996, + "rewards/margins": 0.29258546233177185, + "rewards/rejected": -0.45358753204345703, + "step": 400 + }, + { + "epoch": 0.08, + "learning_rate": 3.9047619047619055e-06, + "logits/chosen": -1.5641822814941406, + "logits/rejected": -1.193485975265503, + "logps/chosen": -569.2401733398438, + "logps/rejected": -1220.4403076171875, + "loss": 0.0666, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.15877744555473328, + "rewards/margins": 0.2378130853176117, + "rewards/rejected": -0.39659056067466736, + "step": 410 + }, + { + "epoch": 0.08, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -1.828481674194336, + "logits/rejected": -0.8795900344848633, + "logps/chosen": -746.7064208984375, + "logps/rejected": -1278.959716796875, + "loss": 0.0972, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2285885065793991, + "rewards/margins": 0.2384357899427414, + "rewards/rejected": -0.4670243263244629, + "step": 420 + }, + { + "epoch": 0.08, + "learning_rate": 4.095238095238096e-06, + "logits/chosen": -1.5313599109649658, + "logits/rejected": -0.8583769798278809, + "logps/chosen": -704.6945190429688, + "logps/rejected": -1280.5216064453125, + "loss": 0.0709, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18623504042625427, + "rewards/margins": 0.2717072367668152, + "rewards/rejected": -0.45794230699539185, + "step": 430 + }, + { + "epoch": 0.08, + "learning_rate": 4.190476190476191e-06, + "logits/chosen": -1.716683030128479, + "logits/rejected": -1.3741168975830078, + "logps/chosen": -499.2115173339844, + "logps/rejected": -1127.7908935546875, + "loss": 0.0951, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10336490720510483, + "rewards/margins": 0.2679286599159241, + "rewards/rejected": -0.3712936043739319, + "step": 440 + }, + { + "epoch": 0.09, + "learning_rate": 4.2857142857142855e-06, + "logits/chosen": -1.720665693283081, + "logits/rejected": -1.3028696775436401, + "logps/chosen": -550.9224243164062, + "logps/rejected": -1267.247802734375, + "loss": 0.0647, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11992333084344864, + "rewards/margins": 0.24694469571113586, + "rewards/rejected": -0.3668680191040039, + "step": 450 + }, + { + "epoch": 0.09, + "learning_rate": 4.3809523809523815e-06, + "logits/chosen": -1.8857879638671875, + "logits/rejected": -1.194883108139038, + "logps/chosen": -605.60009765625, + "logps/rejected": -1175.531494140625, + "loss": 0.0844, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13926714658737183, + "rewards/margins": 0.2119566947221756, + "rewards/rejected": -0.3512238562107086, + "step": 460 + }, + { + "epoch": 0.09, + "learning_rate": 4.476190476190477e-06, + "logits/chosen": -1.8993419408798218, + "logits/rejected": -1.109785795211792, + "logps/chosen": -661.8026733398438, + "logps/rejected": -1132.317138671875, + "loss": 0.0918, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.12035910040140152, + "rewards/margins": 0.21994027495384216, + "rewards/rejected": -0.3402993679046631, + "step": 470 + }, + { + "epoch": 0.09, + "learning_rate": 4.571428571428572e-06, + "logits/chosen": -1.7157402038574219, + "logits/rejected": -1.2350364923477173, + "logps/chosen": -580.3271484375, + "logps/rejected": -1264.6790771484375, + "loss": 0.0755, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12759478390216827, + "rewards/margins": 0.2707314193248749, + "rewards/rejected": -0.39832618832588196, + "step": 480 + }, + { + "epoch": 0.09, + "learning_rate": 4.666666666666667e-06, + "logits/chosen": -1.7638877630233765, + "logits/rejected": -1.207639455795288, + "logps/chosen": -577.3447265625, + "logps/rejected": -1109.933349609375, + "loss": 0.0964, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11431191861629486, + "rewards/margins": 0.21803171932697296, + "rewards/rejected": -0.3323436379432678, + "step": 490 + }, + { + "epoch": 0.1, + "learning_rate": 4.761904761904762e-06, + "logits/chosen": -1.8138492107391357, + "logits/rejected": -1.0965001583099365, + "logps/chosen": -514.5714111328125, + "logps/rejected": -1028.302001953125, + "loss": 0.0831, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09624107927083969, + "rewards/margins": 0.2407519370317459, + "rewards/rejected": -0.336993008852005, + "step": 500 + }, + { + "epoch": 0.1, + "learning_rate": 4.857142857142858e-06, + "logits/chosen": -1.8310391902923584, + "logits/rejected": -1.2400341033935547, + "logps/chosen": -746.3948974609375, + "logps/rejected": -1365.250732421875, + "loss": 0.1075, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3059869408607483, + "rewards/margins": 0.22657544910907745, + "rewards/rejected": -0.5325623750686646, + "step": 510 + }, + { + "epoch": 0.1, + "learning_rate": 4.952380952380953e-06, + "logits/chosen": -1.9007635116577148, + "logits/rejected": -1.0593281984329224, + "logps/chosen": -776.7916259765625, + "logps/rejected": -1380.4420166015625, + "loss": 0.0684, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.275787889957428, + "rewards/margins": 0.271630197763443, + "rewards/rejected": -0.5474181175231934, + "step": 520 + }, + { + "epoch": 0.1, + "learning_rate": 4.999986185163754e-06, + "logits/chosen": -1.52447509765625, + "logits/rejected": -0.8935421705245972, + "logps/chosen": -769.381103515625, + "logps/rejected": -1351.2783203125, + "loss": 0.0936, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2779494822025299, + "rewards/margins": 0.24540336430072784, + "rewards/rejected": -0.5233529210090637, + "step": 530 + }, + { + "epoch": 0.1, + "learning_rate": 4.999875667389858e-06, + "logits/chosen": -1.5643224716186523, + "logits/rejected": -0.9529625773429871, + "logps/chosen": -711.6848754882812, + "logps/rejected": -1355.2978515625, + "loss": 0.0916, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19870035350322723, + "rewards/margins": 0.30797114968299866, + "rewards/rejected": -0.5066714882850647, + "step": 540 + }, + { + "epoch": 0.1, + "learning_rate": 4.999654636727765e-06, + "logits/chosen": -1.834673285484314, + "logits/rejected": -0.9252250790596008, + "logps/chosen": -665.4678955078125, + "logps/rejected": -1259.138916015625, + "loss": 0.0857, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13571450114250183, + "rewards/margins": 0.24520739912986755, + "rewards/rejected": -0.3809219300746918, + "step": 550 + }, + { + "epoch": 0.11, + "learning_rate": 4.999323102948655e-06, + "logits/chosen": -1.6249576807022095, + "logits/rejected": -0.773546576499939, + "logps/chosen": -658.3365478515625, + "logps/rejected": -1433.79443359375, + "loss": 0.0534, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.19249185919761658, + "rewards/margins": 0.33588099479675293, + "rewards/rejected": -0.5283728837966919, + "step": 560 + }, + { + "epoch": 0.11, + "learning_rate": 4.998881080708759e-06, + "logits/chosen": -1.996830701828003, + "logits/rejected": -1.0194107294082642, + "logps/chosen": -766.8067626953125, + "logps/rejected": -1454.341552734375, + "loss": 0.054, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.21933317184448242, + "rewards/margins": 0.32533079385757446, + "rewards/rejected": -0.5446639657020569, + "step": 570 + }, + { + "epoch": 0.11, + "learning_rate": 4.998328589548711e-06, + "logits/chosen": -1.7593481540679932, + "logits/rejected": -1.2708923816680908, + "logps/chosen": -606.2332763671875, + "logps/rejected": -1212.849365234375, + "loss": 0.0793, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14150679111480713, + "rewards/margins": 0.25646698474884033, + "rewards/rejected": -0.39797380566596985, + "step": 580 + }, + { + "epoch": 0.11, + "learning_rate": 4.997665653892682e-06, + "logits/chosen": -1.8112876415252686, + "logits/rejected": -1.1319668292999268, + "logps/chosen": -566.7156982421875, + "logps/rejected": -1126.965087890625, + "loss": 0.1102, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1547660082578659, + "rewards/margins": 0.23159603774547577, + "rewards/rejected": -0.3863620460033417, + "step": 590 + }, + { + "epoch": 0.11, + "learning_rate": 4.996892303047306e-06, + "logits/chosen": -1.696459174156189, + "logits/rejected": -1.433434009552002, + "logps/chosen": -578.3829345703125, + "logps/rejected": -1214.95654296875, + "loss": 0.0854, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.14466938376426697, + "rewards/margins": 0.26130378246307373, + "rewards/rejected": -0.4059731364250183, + "step": 600 + }, + { + "epoch": 0.12, + "learning_rate": 4.996008571200375e-06, + "logits/chosen": -1.578669786453247, + "logits/rejected": -1.2551274299621582, + "logps/chosen": -541.39208984375, + "logps/rejected": -1256.5968017578125, + "loss": 0.1037, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15888190269470215, + "rewards/margins": 0.25673192739486694, + "rewards/rejected": -0.4156138300895691, + "step": 610 + }, + { + "epoch": 0.12, + "learning_rate": 4.995014497419336e-06, + "logits/chosen": -1.7449144124984741, + "logits/rejected": -1.0009429454803467, + "logps/chosen": -669.4453735351562, + "logps/rejected": -1329.12744140625, + "loss": 0.095, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16107605397701263, + "rewards/margins": 0.2727533280849457, + "rewards/rejected": -0.4338293671607971, + "step": 620 + }, + { + "epoch": 0.12, + "learning_rate": 4.993910125649561e-06, + "logits/chosen": -1.494553804397583, + "logits/rejected": -0.9566832780838013, + "logps/chosen": -674.8986206054688, + "logps/rejected": -1222.6563720703125, + "loss": 0.1148, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.19383887946605682, + "rewards/margins": 0.21913234889507294, + "rewards/rejected": -0.41297125816345215, + "step": 630 + }, + { + "epoch": 0.12, + "learning_rate": 4.992695504712402e-06, + "logits/chosen": -1.4821733236312866, + "logits/rejected": -0.8176189661026001, + "logps/chosen": -528.9064331054688, + "logps/rejected": -1169.622314453125, + "loss": 0.076, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1681780219078064, + "rewards/margins": 0.26834970712661743, + "rewards/rejected": -0.43652772903442383, + "step": 640 + }, + { + "epoch": 0.12, + "learning_rate": 4.9913706883030385e-06, + "logits/chosen": -1.5352599620819092, + "logits/rejected": -1.108190894126892, + "logps/chosen": -747.9698486328125, + "logps/rejected": -1408.2620849609375, + "loss": 0.0947, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24020352959632874, + "rewards/margins": 0.2763820290565491, + "rewards/rejected": -0.5165855288505554, + "step": 650 + }, + { + "epoch": 0.13, + "learning_rate": 4.989935734988098e-06, + "logits/chosen": -1.8327178955078125, + "logits/rejected": -1.1306883096694946, + "logps/chosen": -656.5528564453125, + "logps/rejected": -1185.5146484375, + "loss": 0.1075, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2202688455581665, + "rewards/margins": 0.25351718068122864, + "rewards/rejected": -0.47378596663475037, + "step": 660 + }, + { + "epoch": 0.13, + "learning_rate": 4.988390708203068e-06, + "logits/chosen": -1.4716339111328125, + "logits/rejected": -0.9073241353034973, + "logps/chosen": -670.7069091796875, + "logps/rejected": -1161.494384765625, + "loss": 0.1167, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.175540030002594, + "rewards/margins": 0.22774294018745422, + "rewards/rejected": -0.4032829701900482, + "step": 670 + }, + { + "epoch": 0.13, + "learning_rate": 4.9867356762494955e-06, + "logits/chosen": -1.644885778427124, + "logits/rejected": -0.9986754655838013, + "logps/chosen": -596.4325561523438, + "logps/rejected": -1208.408447265625, + "loss": 0.0909, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11105004698038101, + "rewards/margins": 0.2835782468318939, + "rewards/rejected": -0.39462828636169434, + "step": 680 + }, + { + "epoch": 0.13, + "learning_rate": 4.984970712291963e-06, + "logits/chosen": -1.9281013011932373, + "logits/rejected": -1.0944923162460327, + "logps/chosen": -665.7894287109375, + "logps/rejected": -1237.457275390625, + "loss": 0.0685, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1318742334842682, + "rewards/margins": 0.2946074903011322, + "rewards/rejected": -0.4264817237854004, + "step": 690 + }, + { + "epoch": 0.13, + "learning_rate": 4.983095894354858e-06, + "logits/chosen": -1.8215450048446655, + "logits/rejected": -1.1661744117736816, + "logps/chosen": -684.5850219726562, + "logps/rejected": -1356.67919921875, + "loss": 0.0862, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2371421754360199, + "rewards/margins": 0.28387588262557983, + "rewards/rejected": -0.5210181474685669, + "step": 700 + }, + { + "epoch": 0.14, + "learning_rate": 4.981111305318918e-06, + "logits/chosen": -1.8345987796783447, + "logits/rejected": -0.9164802432060242, + "logps/chosen": -689.55810546875, + "logps/rejected": -1234.62890625, + "loss": 0.0965, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2495756447315216, + "rewards/margins": 0.27059414982795715, + "rewards/rejected": -0.5201697945594788, + "step": 710 + }, + { + "epoch": 0.14, + "learning_rate": 4.979017032917576e-06, + "logits/chosen": -1.808998465538025, + "logits/rejected": -1.2228367328643799, + "logps/chosen": -628.9651489257812, + "logps/rejected": -1082.283935546875, + "loss": 0.1377, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.18333646655082703, + "rewards/margins": 0.1819809228181839, + "rewards/rejected": -0.3653174042701721, + "step": 720 + }, + { + "epoch": 0.14, + "learning_rate": 4.97681316973307e-06, + "logits/chosen": -1.6953952312469482, + "logits/rejected": -1.284063458442688, + "logps/chosen": -554.18896484375, + "logps/rejected": -1180.788330078125, + "loss": 0.0847, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09495721012353897, + "rewards/margins": 0.2516638934612274, + "rewards/rejected": -0.3466210961341858, + "step": 730 + }, + { + "epoch": 0.14, + "learning_rate": 4.9744998131923625e-06, + "logits/chosen": -1.8052667379379272, + "logits/rejected": -1.09946870803833, + "logps/chosen": -620.55517578125, + "logps/rejected": -1198.6644287109375, + "loss": 0.08, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09908594191074371, + "rewards/margins": 0.27245354652404785, + "rewards/rejected": -0.37153950333595276, + "step": 740 + }, + { + "epoch": 0.14, + "learning_rate": 4.9720770655628216e-06, + "logits/chosen": -1.8670793771743774, + "logits/rejected": -1.3132587671279907, + "logps/chosen": -594.9564208984375, + "logps/rejected": -1287.123291015625, + "loss": 0.0757, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14726175367832184, + "rewards/margins": 0.29822516441345215, + "rewards/rejected": -0.44548696279525757, + "step": 750 + }, + { + "epoch": 0.14, + "learning_rate": 4.969545033947711e-06, + "logits/chosen": -1.7041022777557373, + "logits/rejected": -1.1802165508270264, + "logps/chosen": -660.1702270507812, + "logps/rejected": -1215.850830078125, + "loss": 0.1037, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22151121497154236, + "rewards/margins": 0.22661535441875458, + "rewards/rejected": -0.44812655448913574, + "step": 760 + }, + { + "epoch": 0.15, + "learning_rate": 4.966903830281449e-06, + "logits/chosen": -1.8590688705444336, + "logits/rejected": -1.039477825164795, + "logps/chosen": -683.9407348632812, + "logps/rejected": -1311.1129150390625, + "loss": 0.0793, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1789616197347641, + "rewards/margins": 0.28517478704452515, + "rewards/rejected": -0.46413642168045044, + "step": 770 + }, + { + "epoch": 0.15, + "learning_rate": 4.964153571324658e-06, + "logits/chosen": -1.400686502456665, + "logits/rejected": -0.961429238319397, + "logps/chosen": -567.7184448242188, + "logps/rejected": -1213.0838623046875, + "loss": 0.0663, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13110116124153137, + "rewards/margins": 0.28832724690437317, + "rewards/rejected": -0.41942834854125977, + "step": 780 + }, + { + "epoch": 0.15, + "learning_rate": 4.96129437865901e-06, + "logits/chosen": -1.7145271301269531, + "logits/rejected": -1.0960595607757568, + "logps/chosen": -660.9743041992188, + "logps/rejected": -1443.8707275390625, + "loss": 0.053, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.20914039015769958, + "rewards/margins": 0.3439396321773529, + "rewards/rejected": -0.5530800819396973, + "step": 790 + }, + { + "epoch": 0.15, + "learning_rate": 4.958326378681849e-06, + "logits/chosen": -1.4642733335494995, + "logits/rejected": -0.7573977708816528, + "logps/chosen": -715.9859619140625, + "logps/rejected": -1360.409912109375, + "loss": 0.0773, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2055422067642212, + "rewards/margins": 0.2881919741630554, + "rewards/rejected": -0.493734210729599, + "step": 800 + }, + { + "epoch": 0.15, + "learning_rate": 4.955249702600598e-06, + "logits/chosen": -1.6208610534667969, + "logits/rejected": -0.9218361973762512, + "logps/chosen": -623.2794799804688, + "logps/rejected": -1308.27294921875, + "loss": 0.0591, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1836785078048706, + "rewards/margins": 0.28122177720069885, + "rewards/rejected": -0.46490031480789185, + "step": 810 + }, + { + "epoch": 0.16, + "learning_rate": 4.952064486426965e-06, + "logits/chosen": -1.536716341972351, + "logits/rejected": -1.1593210697174072, + "logps/chosen": -590.9392700195312, + "logps/rejected": -1318.531005859375, + "loss": 0.0776, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1551203429698944, + "rewards/margins": 0.28595298528671265, + "rewards/rejected": -0.44107332825660706, + "step": 820 + }, + { + "epoch": 0.16, + "learning_rate": 4.948770870970929e-06, + "logits/chosen": -1.698622465133667, + "logits/rejected": -1.0573958158493042, + "logps/chosen": -677.7445068359375, + "logps/rejected": -1307.689453125, + "loss": 0.06, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.20002396404743195, + "rewards/margins": 0.27892419695854187, + "rewards/rejected": -0.47894811630249023, + "step": 830 + }, + { + "epoch": 0.16, + "learning_rate": 4.9453690018345144e-06, + "logits/chosen": -1.729888677597046, + "logits/rejected": -1.1597099304199219, + "logps/chosen": -571.38818359375, + "logps/rejected": -1457.5614013671875, + "loss": 0.052, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.15872427821159363, + "rewards/margins": 0.3792124390602112, + "rewards/rejected": -0.5379367470741272, + "step": 840 + }, + { + "epoch": 0.16, + "learning_rate": 4.941859029405354e-06, + "logits/chosen": -1.7805073261260986, + "logits/rejected": -1.0309171676635742, + "logps/chosen": -588.3792724609375, + "logps/rejected": -1260.41357421875, + "loss": 0.0693, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13170531392097473, + "rewards/margins": 0.2891574501991272, + "rewards/rejected": -0.42086273431777954, + "step": 850 + }, + { + "epoch": 0.16, + "learning_rate": 4.938241108850039e-06, + "logits/chosen": -1.6909992694854736, + "logits/rejected": -1.0959560871124268, + "logps/chosen": -590.9859619140625, + "logps/rejected": -1218.2630615234375, + "loss": 0.0891, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.11560378968715668, + "rewards/margins": 0.2818681001663208, + "rewards/rejected": -0.3974718451499939, + "step": 860 + }, + { + "epoch": 0.17, + "learning_rate": 4.934515400107266e-06, + "logits/chosen": -1.5700453519821167, + "logits/rejected": -0.813576340675354, + "logps/chosen": -640.3110961914062, + "logps/rejected": -1193.339599609375, + "loss": 0.0861, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11941035836935043, + "rewards/margins": 0.25531017780303955, + "rewards/rejected": -0.3747205436229706, + "step": 870 + }, + { + "epoch": 0.17, + "learning_rate": 4.930682067880759e-06, + "logits/chosen": -1.4913710355758667, + "logits/rejected": -1.155098557472229, + "logps/chosen": -568.5694580078125, + "logps/rejected": -1217.7532958984375, + "loss": 0.0765, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15131355822086334, + "rewards/margins": 0.26503461599349976, + "rewards/rejected": -0.4163481593132019, + "step": 880 + }, + { + "epoch": 0.17, + "learning_rate": 4.926741281631991e-06, + "logits/chosen": -1.4744211435317993, + "logits/rejected": -0.8086342811584473, + "logps/chosen": -660.7673950195312, + "logps/rejected": -1496.2003173828125, + "loss": 0.0474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19947215914726257, + "rewards/margins": 0.3620762526988983, + "rewards/rejected": -0.5615484714508057, + "step": 890 + }, + { + "epoch": 0.17, + "learning_rate": 4.922693215572695e-06, + "logits/chosen": -1.7864484786987305, + "logits/rejected": -1.1300532817840576, + "logps/chosen": -627.9884643554688, + "logps/rejected": -1298.03076171875, + "loss": 0.0583, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15097679197788239, + "rewards/margins": 0.30441802740097046, + "rewards/rejected": -0.45539480447769165, + "step": 900 + }, + { + "epoch": 0.17, + "learning_rate": 4.91853804865716e-06, + "logits/chosen": -1.5407073497772217, + "logits/rejected": -1.0573087930679321, + "logps/chosen": -626.0189208984375, + "logps/rejected": -1293.064697265625, + "loss": 0.0591, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18492639064788818, + "rewards/margins": 0.2943907380104065, + "rewards/rejected": -0.4793171286582947, + "step": 910 + }, + { + "epoch": 0.18, + "learning_rate": 4.91427596457432e-06, + "logits/chosen": -1.7004728317260742, + "logits/rejected": -1.099687099456787, + "logps/chosen": -651.4698486328125, + "logps/rejected": -1385.514892578125, + "loss": 0.1016, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19086621701717377, + "rewards/margins": 0.3267431855201721, + "rewards/rejected": -0.5176093578338623, + "step": 920 + }, + { + "epoch": 0.18, + "learning_rate": 4.909907151739634e-06, + "logits/chosen": -1.6458594799041748, + "logits/rejected": -1.0120216608047485, + "logps/chosen": -739.9207763671875, + "logps/rejected": -1332.773681640625, + "loss": 0.0612, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21796786785125732, + "rewards/margins": 0.27975529432296753, + "rewards/rejected": -0.4977231025695801, + "step": 930 + }, + { + "epoch": 0.18, + "learning_rate": 4.905431803286756e-06, + "logits/chosen": -1.6875731945037842, + "logits/rejected": -1.1233506202697754, + "logps/chosen": -620.7035522460938, + "logps/rejected": -1154.03125, + "loss": 0.069, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.14572550356388092, + "rewards/margins": 0.25223031640052795, + "rewards/rejected": -0.3979558050632477, + "step": 940 + }, + { + "epoch": 0.18, + "learning_rate": 4.900850117059e-06, + "logits/chosen": -1.7308450937271118, + "logits/rejected": -1.3464564085006714, + "logps/chosen": -573.27587890625, + "logps/rejected": -1219.830810546875, + "loss": 0.0703, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.10996156930923462, + "rewards/margins": 0.28513264656066895, + "rewards/rejected": -0.39509421586990356, + "step": 950 + }, + { + "epoch": 0.18, + "learning_rate": 4.8961622956005895e-06, + "logits/chosen": -1.813080072402954, + "logits/rejected": -1.3277075290679932, + "logps/chosen": -530.7855224609375, + "logps/rejected": -1027.4674072265625, + "loss": 0.1403, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08806713670492172, + "rewards/margins": 0.2371738851070404, + "rewards/rejected": -0.3252410292625427, + "step": 960 + }, + { + "epoch": 0.18, + "learning_rate": 4.891368546147707e-06, + "logits/chosen": -1.595203161239624, + "logits/rejected": -1.1859229803085327, + "logps/chosen": -552.8775634765625, + "logps/rejected": -1224.351318359375, + "loss": 0.0807, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12314770370721817, + "rewards/margins": 0.304676353931427, + "rewards/rejected": -0.42782407999038696, + "step": 970 + }, + { + "epoch": 0.19, + "learning_rate": 4.88646908061933e-06, + "logits/chosen": -1.6990375518798828, + "logits/rejected": -0.8397040367126465, + "logps/chosen": -629.1605224609375, + "logps/rejected": -1143.335693359375, + "loss": 0.077, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17983214557170868, + "rewards/margins": 0.2676909565925598, + "rewards/rejected": -0.4475231170654297, + "step": 980 + }, + { + "epoch": 0.19, + "learning_rate": 4.881464115607866e-06, + "logits/chosen": -1.6023046970367432, + "logits/rejected": -0.7092984318733215, + "logps/chosen": -804.279296875, + "logps/rejected": -1217.7445068359375, + "loss": 0.1267, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2759309411048889, + "rewards/margins": 0.2225939780473709, + "rewards/rejected": -0.49852484464645386, + "step": 990 + }, + { + "epoch": 0.19, + "learning_rate": 4.876353872369573e-06, + "logits/chosen": -1.6359355449676514, + "logits/rejected": -0.9567705392837524, + "logps/chosen": -709.3012084960938, + "logps/rejected": -1238.928955078125, + "loss": 0.0841, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2836172878742218, + "rewards/margins": 0.24482114613056183, + "rewards/rejected": -0.5284383296966553, + "step": 1000 + }, + { + "epoch": 0.19, + "learning_rate": 4.871138576814782e-06, + "logits/chosen": -1.6317775249481201, + "logits/rejected": -0.8598468899726868, + "logps/chosen": -672.0849609375, + "logps/rejected": -1243.135009765625, + "loss": 0.069, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.24557438492774963, + "rewards/margins": 0.3051298260688782, + "rewards/rejected": -0.5507042407989502, + "step": 1010 + }, + { + "epoch": 0.19, + "learning_rate": 4.865818459497911e-06, + "logits/chosen": -1.388197898864746, + "logits/rejected": -0.9666770696640015, + "logps/chosen": -787.582763671875, + "logps/rejected": -1298.2857666015625, + "loss": 0.117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.28274062275886536, + "rewards/margins": 0.24993351101875305, + "rewards/rejected": -0.5326741337776184, + "step": 1020 + }, + { + "epoch": 0.2, + "learning_rate": 4.860393755607266e-06, + "logits/chosen": -1.4828870296478271, + "logits/rejected": -0.9877294301986694, + "logps/chosen": -555.7489013671875, + "logps/rejected": -1203.155517578125, + "loss": 0.0766, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.10617247968912125, + "rewards/margins": 0.2726615071296692, + "rewards/rejected": -0.37883394956588745, + "step": 1030 + }, + { + "epoch": 0.2, + "learning_rate": 4.854864704954654e-06, + "logits/chosen": -1.5119549036026, + "logits/rejected": -1.1713879108428955, + "logps/chosen": -597.2371215820312, + "logps/rejected": -1206.311279296875, + "loss": 0.071, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11909804493188858, + "rewards/margins": 0.23684187233448029, + "rewards/rejected": -0.35593992471694946, + "step": 1040 + }, + { + "epoch": 0.2, + "learning_rate": 4.849231551964771e-06, + "logits/chosen": -1.4709835052490234, + "logits/rejected": -1.1632763147354126, + "logps/chosen": -424.6221618652344, + "logps/rejected": -979.3152465820312, + "loss": 0.1164, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08865557610988617, + "rewards/margins": 0.20411546528339386, + "rewards/rejected": -0.29277104139328003, + "step": 1050 + }, + { + "epoch": 0.2, + "learning_rate": 4.843494545664407e-06, + "logits/chosen": -1.7648117542266846, + "logits/rejected": -1.3866671323776245, + "logps/chosen": -586.7568969726562, + "logps/rejected": -1174.9310302734375, + "loss": 0.1011, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.14828075468540192, + "rewards/margins": 0.24527840316295624, + "rewards/rejected": -0.39355915784835815, + "step": 1060 + }, + { + "epoch": 0.2, + "learning_rate": 4.837653939671427e-06, + "logits/chosen": -1.6241607666015625, + "logits/rejected": -0.9201037287712097, + "logps/chosen": -670.8914794921875, + "logps/rejected": -1177.223388671875, + "loss": 0.081, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12418527901172638, + "rewards/margins": 0.2541216015815735, + "rewards/rejected": -0.37830686569213867, + "step": 1070 + }, + { + "epoch": 0.21, + "learning_rate": 4.8317099921835695e-06, + "logits/chosen": -1.8830773830413818, + "logits/rejected": -1.1888630390167236, + "logps/chosen": -533.8602905273438, + "logps/rejected": -1146.5938720703125, + "loss": 0.0695, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.06473751366138458, + "rewards/margins": 0.30346792936325073, + "rewards/rejected": -0.3682054281234741, + "step": 1080 + }, + { + "epoch": 0.21, + "learning_rate": 4.825662965967023e-06, + "logits/chosen": -1.9320052862167358, + "logits/rejected": -1.1186505556106567, + "logps/chosen": -634.6085205078125, + "logps/rejected": -1212.7740478515625, + "loss": 0.0585, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10977531969547272, + "rewards/margins": 0.29483669996261597, + "rewards/rejected": -0.40461206436157227, + "step": 1090 + }, + { + "epoch": 0.21, + "learning_rate": 4.819513128344814e-06, + "logits/chosen": -1.7645708322525024, + "logits/rejected": -1.1508641242980957, + "logps/chosen": -611.5857543945312, + "logps/rejected": -1242.5738525390625, + "loss": 0.0742, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.12029217183589935, + "rewards/margins": 0.2845746874809265, + "rewards/rejected": -0.4048668444156647, + "step": 1100 + }, + { + "epoch": 0.21, + "learning_rate": 4.813260751184992e-06, + "logits/chosen": -1.7668542861938477, + "logits/rejected": -1.070345163345337, + "logps/chosen": -608.2420654296875, + "logps/rejected": -1294.886962890625, + "loss": 0.0723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10366489738225937, + "rewards/margins": 0.3099459111690521, + "rewards/rejected": -0.4136108458042145, + "step": 1110 + }, + { + "epoch": 0.21, + "learning_rate": 4.806906110888606e-06, + "logits/chosen": -1.5412280559539795, + "logits/rejected": -0.9960755109786987, + "logps/chosen": -532.5230712890625, + "logps/rejected": -1119.4359130859375, + "loss": 0.0574, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1133192628622055, + "rewards/margins": 0.2607772648334503, + "rewards/rejected": -0.374096542596817, + "step": 1120 + }, + { + "epoch": 0.22, + "learning_rate": 4.8004494883774885e-06, + "logits/chosen": -1.9325168132781982, + "logits/rejected": -1.2252503633499146, + "logps/chosen": -583.9013061523438, + "logps/rejected": -1194.834716796875, + "loss": 0.0702, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.062377817928791046, + "rewards/margins": 0.28015127778053284, + "rewards/rejected": -0.3425291180610657, + "step": 1130 + }, + { + "epoch": 0.22, + "learning_rate": 4.793891169081835e-06, + "logits/chosen": -1.8249927759170532, + "logits/rejected": -1.0183128118515015, + "logps/chosen": -555.833984375, + "logps/rejected": -1278.7852783203125, + "loss": 0.0369, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.036963075399398804, + "rewards/margins": 0.34940817952156067, + "rewards/rejected": -0.38637128472328186, + "step": 1140 + }, + { + "epoch": 0.22, + "learning_rate": 4.787231442927587e-06, + "logits/chosen": -1.399440050125122, + "logits/rejected": -1.2327266931533813, + "logps/chosen": -471.8321228027344, + "logps/rejected": -1072.462646484375, + "loss": 0.12, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.10728304088115692, + "rewards/margins": 0.23449170589447021, + "rewards/rejected": -0.34177470207214355, + "step": 1150 + }, + { + "epoch": 0.22, + "learning_rate": 4.780470604323616e-06, + "logits/chosen": -1.7676588296890259, + "logits/rejected": -1.3527311086654663, + "logps/chosen": -471.47772216796875, + "logps/rejected": -1098.208984375, + "loss": 0.0938, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08965645730495453, + "rewards/margins": 0.2666322588920593, + "rewards/rejected": -0.35628873109817505, + "step": 1160 + }, + { + "epoch": 0.22, + "learning_rate": 4.773608952148706e-06, + "logits/chosen": -1.6790664196014404, + "logits/rejected": -1.1355869770050049, + "logps/chosen": -555.5061645507812, + "logps/rejected": -1149.15234375, + "loss": 0.0837, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12319140136241913, + "rewards/margins": 0.26579806208610535, + "rewards/rejected": -0.38898950815200806, + "step": 1170 + }, + { + "epoch": 0.22, + "learning_rate": 4.766646789738342e-06, + "logits/chosen": -1.850873351097107, + "logits/rejected": -0.993525505065918, + "logps/chosen": -642.1211547851562, + "logps/rejected": -1213.765869140625, + "loss": 0.0629, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14874321222305298, + "rewards/margins": 0.297253280878067, + "rewards/rejected": -0.4459964632987976, + "step": 1180 + }, + { + "epoch": 0.23, + "learning_rate": 4.759584424871302e-06, + "logits/chosen": -1.6824783086776733, + "logits/rejected": -1.339135766029358, + "logps/chosen": -650.1998901367188, + "logps/rejected": -1217.5582275390625, + "loss": 0.0813, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1805349886417389, + "rewards/margins": 0.2520584166049957, + "rewards/rejected": -0.43259334564208984, + "step": 1190 + }, + { + "epoch": 0.23, + "learning_rate": 4.752422169756048e-06, + "logits/chosen": -1.4297927618026733, + "logits/rejected": -0.9142922163009644, + "logps/chosen": -641.1162109375, + "logps/rejected": -1092.507080078125, + "loss": 0.1013, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.17083248496055603, + "rewards/margins": 0.227576345205307, + "rewards/rejected": -0.39840883016586304, + "step": 1200 + }, + { + "epoch": 0.23, + "learning_rate": 4.745160341016927e-06, + "logits/chosen": -1.6701313257217407, + "logits/rejected": -1.366464376449585, + "logps/chosen": -591.3519287109375, + "logps/rejected": -1383.019287109375, + "loss": 0.042, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13523560762405396, + "rewards/margins": 0.34861212968826294, + "rewards/rejected": -0.4838477075099945, + "step": 1210 + }, + { + "epoch": 0.23, + "learning_rate": 4.737799259680172e-06, + "logits/chosen": -1.6060030460357666, + "logits/rejected": -1.1717652082443237, + "logps/chosen": -569.553466796875, + "logps/rejected": -1148.726806640625, + "loss": 0.1077, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08871385455131531, + "rewards/margins": 0.2584499716758728, + "rewards/rejected": -0.3471638262271881, + "step": 1220 + }, + { + "epoch": 0.23, + "learning_rate": 4.730339251159709e-06, + "logits/chosen": -1.9548372030258179, + "logits/rejected": -1.2164928913116455, + "logps/chosen": -605.7603759765625, + "logps/rejected": -1120.601318359375, + "loss": 0.0943, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.08201462775468826, + "rewards/margins": 0.25623396039009094, + "rewards/rejected": -0.3382485806941986, + "step": 1230 + }, + { + "epoch": 0.24, + "learning_rate": 4.722780645242775e-06, + "logits/chosen": -2.015829086303711, + "logits/rejected": -1.1882562637329102, + "logps/chosen": -563.5271606445312, + "logps/rejected": -1133.745361328125, + "loss": 0.0827, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11626216024160385, + "rewards/margins": 0.25964677333831787, + "rewards/rejected": -0.3759089410305023, + "step": 1240 + }, + { + "epoch": 0.24, + "learning_rate": 4.715123776075337e-06, + "logits/chosen": -1.601769208908081, + "logits/rejected": -1.0593763589859009, + "logps/chosen": -637.7056884765625, + "logps/rejected": -1188.92333984375, + "loss": 0.0784, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17665165662765503, + "rewards/margins": 0.2602120041847229, + "rewards/rejected": -0.43686366081237793, + "step": 1250 + }, + { + "epoch": 0.24, + "learning_rate": 4.707368982147318e-06, + "logits/chosen": -1.898706078529358, + "logits/rejected": -1.0478856563568115, + "logps/chosen": -661.0333251953125, + "logps/rejected": -1348.3873291015625, + "loss": 0.0501, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.15230241417884827, + "rewards/margins": 0.3299187421798706, + "rewards/rejected": -0.4822211265563965, + "step": 1260 + }, + { + "epoch": 0.24, + "learning_rate": 4.699516606277638e-06, + "logits/chosen": -1.6510775089263916, + "logits/rejected": -1.0354807376861572, + "logps/chosen": -553.8106689453125, + "logps/rejected": -1366.97802734375, + "loss": 0.0515, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.08891726285219193, + "rewards/margins": 0.36568087339401245, + "rewards/rejected": -0.45459818840026855, + "step": 1270 + }, + { + "epoch": 0.24, + "learning_rate": 4.691566995599056e-06, + "logits/chosen": -1.5789378881454468, + "logits/rejected": -0.8932808041572571, + "logps/chosen": -612.9600830078125, + "logps/rejected": -1157.551025390625, + "loss": 0.0897, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.06621615588665009, + "rewards/margins": 0.29278090596199036, + "rewards/rejected": -0.35899704694747925, + "step": 1280 + }, + { + "epoch": 0.25, + "learning_rate": 4.683520501542825e-06, + "logits/chosen": -1.8256202936172485, + "logits/rejected": -1.200262188911438, + "logps/chosen": -573.4430541992188, + "logps/rejected": -1288.5570068359375, + "loss": 0.058, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.07125236839056015, + "rewards/margins": 0.3250887393951416, + "rewards/rejected": -0.39634108543395996, + "step": 1290 + }, + { + "epoch": 0.25, + "learning_rate": 4.675377479823153e-06, + "logits/chosen": -1.529521107673645, + "logits/rejected": -1.1277577877044678, + "logps/chosen": -562.3739013671875, + "logps/rejected": -1271.357421875, + "loss": 0.089, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14400474727153778, + "rewards/margins": 0.28928202390670776, + "rewards/rejected": -0.43328675627708435, + "step": 1300 + }, + { + "epoch": 0.25, + "learning_rate": 4.667138290421483e-06, + "logits/chosen": -1.5254117250442505, + "logits/rejected": -1.0594402551651, + "logps/chosen": -579.7008056640625, + "logps/rejected": -1145.4326171875, + "loss": 0.1012, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18182730674743652, + "rewards/margins": 0.2299845665693283, + "rewards/rejected": -0.41181182861328125, + "step": 1310 + }, + { + "epoch": 0.25, + "learning_rate": 4.658803297570578e-06, + "logits/chosen": -1.471721887588501, + "logits/rejected": -1.128235936164856, + "logps/chosen": -643.552734375, + "logps/rejected": -1248.0694580078125, + "loss": 0.0933, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18766087293624878, + "rewards/margins": 0.2622078061103821, + "rewards/rejected": -0.44986867904663086, + "step": 1320 + }, + { + "epoch": 0.25, + "learning_rate": 4.650372869738415e-06, + "logits/chosen": -1.587103247642517, + "logits/rejected": -0.9101946949958801, + "logps/chosen": -575.8819580078125, + "logps/rejected": -1206.3009033203125, + "loss": 0.0616, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11521546542644501, + "rewards/margins": 0.2791483402252197, + "rewards/rejected": -0.39436379075050354, + "step": 1330 + }, + { + "epoch": 0.26, + "learning_rate": 4.641847379611898e-06, + "logits/chosen": -1.6752538681030273, + "logits/rejected": -1.1627941131591797, + "logps/chosen": -563.6756591796875, + "logps/rejected": -1274.301025390625, + "loss": 0.0582, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.07845500856637955, + "rewards/margins": 0.29262250661849976, + "rewards/rejected": -0.3710775077342987, + "step": 1340 + }, + { + "epoch": 0.26, + "learning_rate": 4.633227204080389e-06, + "logits/chosen": -1.5359529256820679, + "logits/rejected": -1.0807862281799316, + "logps/chosen": -550.9948120117188, + "logps/rejected": -1217.546142578125, + "loss": 0.0588, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1211240142583847, + "rewards/margins": 0.2859167456626892, + "rewards/rejected": -0.4070407450199127, + "step": 1350 + }, + { + "epoch": 0.26, + "learning_rate": 4.624512724219038e-06, + "logits/chosen": -1.829384207725525, + "logits/rejected": -1.0859172344207764, + "logps/chosen": -672.615966796875, + "logps/rejected": -1285.6998291015625, + "loss": 0.0727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14164890348911285, + "rewards/margins": 0.29436761140823364, + "rewards/rejected": -0.4360164999961853, + "step": 1360 + }, + { + "epoch": 0.26, + "learning_rate": 4.6157043252719374e-06, + "logits/chosen": -1.4630229473114014, + "logits/rejected": -1.0673094987869263, + "logps/chosen": -489.92645263671875, + "logps/rejected": -1276.04248046875, + "loss": 0.0639, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.08652294427156448, + "rewards/margins": 0.35399168729782104, + "rewards/rejected": -0.44051462411880493, + "step": 1370 + }, + { + "epoch": 0.26, + "learning_rate": 4.606802396635098e-06, + "logits/chosen": -1.5900306701660156, + "logits/rejected": -1.249037504196167, + "logps/chosen": -558.1952514648438, + "logps/rejected": -1157.880615234375, + "loss": 0.0838, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1465294063091278, + "rewards/margins": 0.26196879148483276, + "rewards/rejected": -0.40849822759628296, + "step": 1380 + }, + { + "epoch": 0.26, + "learning_rate": 4.597807331839229e-06, + "logits/chosen": -2.041189670562744, + "logits/rejected": -1.0724743604660034, + "logps/chosen": -690.7709350585938, + "logps/rejected": -1363.1270751953125, + "loss": 0.0571, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1370098888874054, + "rewards/margins": 0.33667826652526855, + "rewards/rejected": -0.4736880660057068, + "step": 1390 + }, + { + "epoch": 0.27, + "learning_rate": 4.588719528532342e-06, + "logits/chosen": -1.5918890237808228, + "logits/rejected": -0.6338850259780884, + "logps/chosen": -736.466064453125, + "logps/rejected": -1296.2232666015625, + "loss": 0.086, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16654495894908905, + "rewards/margins": 0.3148220181465149, + "rewards/rejected": -0.48136696219444275, + "step": 1400 + }, + { + "epoch": 0.27, + "learning_rate": 4.5795393884621735e-06, + "logits/chosen": -1.5696302652359009, + "logits/rejected": -0.8134934306144714, + "logps/chosen": -579.7128295898438, + "logps/rejected": -1253.761474609375, + "loss": 0.0707, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.12203441560268402, + "rewards/margins": 0.3168005347251892, + "rewards/rejected": -0.43883490562438965, + "step": 1410 + }, + { + "epoch": 0.27, + "learning_rate": 4.5702673174584236e-06, + "logits/chosen": -1.6604053974151611, + "logits/rejected": -0.9312642812728882, + "logps/chosen": -553.2318115234375, + "logps/rejected": -1271.9256591796875, + "loss": 0.0582, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13113778829574585, + "rewards/margins": 0.318024218082428, + "rewards/rejected": -0.4491620659828186, + "step": 1420 + }, + { + "epoch": 0.27, + "learning_rate": 4.560903725414816e-06, + "logits/chosen": -1.559654951095581, + "logits/rejected": -0.9024654626846313, + "logps/chosen": -584.5823974609375, + "logps/rejected": -1253.152099609375, + "loss": 0.0734, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15965160727500916, + "rewards/margins": 0.265408456325531, + "rewards/rejected": -0.4250600337982178, + "step": 1430 + }, + { + "epoch": 0.27, + "learning_rate": 4.551449026270979e-06, + "logits/chosen": -1.6541213989257812, + "logits/rejected": -1.139235258102417, + "logps/chosen": -553.2503051757812, + "logps/rejected": -1190.677490234375, + "loss": 0.0709, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11237634718418121, + "rewards/margins": 0.2757284343242645, + "rewards/rejected": -0.38810476660728455, + "step": 1440 + }, + { + "epoch": 0.28, + "learning_rate": 4.541903637994142e-06, + "logits/chosen": -1.9984395503997803, + "logits/rejected": -1.2730480432510376, + "logps/chosen": -513.3294067382812, + "logps/rejected": -1165.566650390625, + "loss": 0.071, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.05962224677205086, + "rewards/margins": 0.3146773874759674, + "rewards/rejected": -0.3742996156215668, + "step": 1450 + }, + { + "epoch": 0.28, + "learning_rate": 4.532267982560662e-06, + "logits/chosen": -1.7288652658462524, + "logits/rejected": -1.0699656009674072, + "logps/chosen": -536.3939208984375, + "logps/rejected": -1226.364013671875, + "loss": 0.0634, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.05779455229640007, + "rewards/margins": 0.32264310121536255, + "rewards/rejected": -0.3804376721382141, + "step": 1460 + }, + { + "epoch": 0.28, + "learning_rate": 4.522542485937369e-06, + "logits/chosen": -1.3464348316192627, + "logits/rejected": -0.9208024740219116, + "logps/chosen": -469.87860107421875, + "logps/rejected": -1257.689697265625, + "loss": 0.085, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09766849130392075, + "rewards/margins": 0.3110795319080353, + "rewards/rejected": -0.40874800086021423, + "step": 1470 + }, + { + "epoch": 0.28, + "learning_rate": 4.512727578062733e-06, + "logits/chosen": -1.3829745054244995, + "logits/rejected": -0.9307807087898254, + "logps/chosen": -621.6275024414062, + "logps/rejected": -1248.703369140625, + "loss": 0.0948, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13931798934936523, + "rewards/margins": 0.27737680077552795, + "rewards/rejected": -0.4166947901248932, + "step": 1480 + }, + { + "epoch": 0.28, + "learning_rate": 4.502823692827859e-06, + "logits/chosen": -1.621721863746643, + "logits/rejected": -1.0642528533935547, + "logps/chosen": -549.4501342773438, + "logps/rejected": -1250.82958984375, + "loss": 0.0621, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1722426563501358, + "rewards/margins": 0.29302817583084106, + "rewards/rejected": -0.46527084708213806, + "step": 1490 + }, + { + "epoch": 0.29, + "learning_rate": 4.492831268057307e-06, + "logits/chosen": -1.7337934970855713, + "logits/rejected": -1.0224395990371704, + "logps/chosen": -634.6095581054688, + "logps/rejected": -1207.4476318359375, + "loss": 0.0836, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17793257534503937, + "rewards/margins": 0.25644272565841675, + "rewards/rejected": -0.4343752861022949, + "step": 1500 + }, + { + "epoch": 0.29, + "learning_rate": 4.482750745489733e-06, + "logits/chosen": -1.4732965230941772, + "logits/rejected": -0.9633675813674927, + "logps/chosen": -550.6846923828125, + "logps/rejected": -1190.68701171875, + "loss": 0.0834, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13295753300189972, + "rewards/margins": 0.29836997389793396, + "rewards/rejected": -0.4313275218009949, + "step": 1510 + }, + { + "epoch": 0.29, + "learning_rate": 4.472582570758367e-06, + "logits/chosen": -1.7081899642944336, + "logits/rejected": -0.7727378606796265, + "logps/chosen": -666.4199829101562, + "logps/rejected": -1332.3919677734375, + "loss": 0.0735, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15550708770751953, + "rewards/margins": 0.30337247252464294, + "rewards/rejected": -0.4588795602321625, + "step": 1520 + }, + { + "epoch": 0.29, + "learning_rate": 4.4623271933713065e-06, + "logits/chosen": -1.5421324968338013, + "logits/rejected": -1.1563916206359863, + "logps/chosen": -582.2352294921875, + "logps/rejected": -1248.558837890625, + "loss": 0.106, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13999691605567932, + "rewards/margins": 0.28287771344184875, + "rewards/rejected": -0.4228746294975281, + "step": 1530 + }, + { + "epoch": 0.29, + "learning_rate": 4.451985066691649e-06, + "logits/chosen": -1.4042928218841553, + "logits/rejected": -0.6304243803024292, + "logps/chosen": -652.6393432617188, + "logps/rejected": -1314.00341796875, + "loss": 0.0531, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.20784349739551544, + "rewards/margins": 0.3043650984764099, + "rewards/rejected": -0.5122085809707642, + "step": 1540 + }, + { + "epoch": 0.3, + "learning_rate": 4.441556647917447e-06, + "logits/chosen": -1.6000726222991943, + "logits/rejected": -0.8292378187179565, + "logps/chosen": -885.3034057617188, + "logps/rejected": -1478.057861328125, + "loss": 0.0593, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.2784448564052582, + "rewards/margins": 0.2961861491203308, + "rewards/rejected": -0.5746309757232666, + "step": 1550 + }, + { + "epoch": 0.3, + "learning_rate": 4.431042398061499e-06, + "logits/chosen": -1.3935855627059937, + "logits/rejected": -0.8830353021621704, + "logps/chosen": -572.7501831054688, + "logps/rejected": -1221.902099609375, + "loss": 0.0715, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19357424974441528, + "rewards/margins": 0.28787490725517273, + "rewards/rejected": -0.4814492166042328, + "step": 1560 + }, + { + "epoch": 0.3, + "learning_rate": 4.420442781930971e-06, + "logits/chosen": -1.596678614616394, + "logits/rejected": -0.9144500494003296, + "logps/chosen": -583.464111328125, + "logps/rejected": -1321.6702880859375, + "loss": 0.0503, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1923380047082901, + "rewards/margins": 0.3100276291370392, + "rewards/rejected": -0.5023655891418457, + "step": 1570 + }, + { + "epoch": 0.3, + "learning_rate": 4.409758268106842e-06, + "logits/chosen": -1.463576078414917, + "logits/rejected": -0.7617993950843811, + "logps/chosen": -594.7489013671875, + "logps/rejected": -1261.5989990234375, + "loss": 0.0685, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17580083012580872, + "rewards/margins": 0.3210605978965759, + "rewards/rejected": -0.49686145782470703, + "step": 1580 + }, + { + "epoch": 0.3, + "learning_rate": 4.398989328923196e-06, + "logits/chosen": -1.3283376693725586, + "logits/rejected": -0.7213112115859985, + "logps/chosen": -590.7708740234375, + "logps/rejected": -1166.47314453125, + "loss": 0.0792, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14475390315055847, + "rewards/margins": 0.29773521423339844, + "rewards/rejected": -0.4424891471862793, + "step": 1590 + }, + { + "epoch": 0.3, + "learning_rate": 4.388136440446338e-06, + "logits/chosen": -1.483410358428955, + "logits/rejected": -1.0970790386199951, + "logps/chosen": -583.5833740234375, + "logps/rejected": -1287.4739990234375, + "loss": 0.07, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.156617671251297, + "rewards/margins": 0.31275057792663574, + "rewards/rejected": -0.4693682789802551, + "step": 1600 + }, + { + "epoch": 0.31, + "learning_rate": 4.377200082453748e-06, + "logits/chosen": -1.7334868907928467, + "logits/rejected": -0.9580786824226379, + "logps/chosen": -571.3174438476562, + "logps/rejected": -1233.8887939453125, + "loss": 0.0556, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13076187670230865, + "rewards/margins": 0.319111704826355, + "rewards/rejected": -0.44987359642982483, + "step": 1610 + }, + { + "epoch": 0.31, + "learning_rate": 4.366180738412876e-06, + "logits/chosen": -1.4784181118011475, + "logits/rejected": -0.8439235687255859, + "logps/chosen": -716.2511596679688, + "logps/rejected": -1182.4652099609375, + "loss": 0.1236, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22516557574272156, + "rewards/margins": 0.23013241589069366, + "rewards/rejected": -0.4552980065345764, + "step": 1620 + }, + { + "epoch": 0.31, + "learning_rate": 4.355078895459761e-06, + "logits/chosen": -1.4879430532455444, + "logits/rejected": -0.943886399269104, + "logps/chosen": -542.371337890625, + "logps/rejected": -1122.3577880859375, + "loss": 0.0909, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1614888310432434, + "rewards/margins": 0.24517297744750977, + "rewards/rejected": -0.40666183829307556, + "step": 1630 + }, + { + "epoch": 0.31, + "learning_rate": 4.343895044377504e-06, + "logits/chosen": -1.4709045886993408, + "logits/rejected": -0.8647719621658325, + "logps/chosen": -612.67138671875, + "logps/rejected": -1236.822998046875, + "loss": 0.0837, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13855595886707306, + "rewards/margins": 0.28404054045677185, + "rewards/rejected": -0.42259645462036133, + "step": 1640 + }, + { + "epoch": 0.31, + "learning_rate": 4.332629679574566e-06, + "logits/chosen": -1.5026487112045288, + "logits/rejected": -0.8090435266494751, + "logps/chosen": -554.9136962890625, + "logps/rejected": -1203.1842041015625, + "loss": 0.0815, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11568528413772583, + "rewards/margins": 0.3077128231525421, + "rewards/rejected": -0.42339807748794556, + "step": 1650 + }, + { + "epoch": 0.32, + "learning_rate": 4.321283299062916e-06, + "logits/chosen": -1.6613775491714478, + "logits/rejected": -1.0173249244689941, + "logps/chosen": -496.1644592285156, + "logps/rejected": -1142.2369384765625, + "loss": 0.0793, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11908264458179474, + "rewards/margins": 0.31051716208457947, + "rewards/rejected": -0.4295998215675354, + "step": 1660 + }, + { + "epoch": 0.32, + "learning_rate": 4.309856404436013e-06, + "logits/chosen": -1.7883861064910889, + "logits/rejected": -1.2383636236190796, + "logps/chosen": -562.5878295898438, + "logps/rejected": -1178.5867919921875, + "loss": 0.082, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.10906285047531128, + "rewards/margins": 0.296166330575943, + "rewards/rejected": -0.4052291512489319, + "step": 1670 + }, + { + "epoch": 0.32, + "learning_rate": 4.2983495008466285e-06, + "logits/chosen": -1.5402190685272217, + "logits/rejected": -1.1174333095550537, + "logps/chosen": -574.9996337890625, + "logps/rejected": -1121.726318359375, + "loss": 0.083, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.151823490858078, + "rewards/margins": 0.23926305770874023, + "rewards/rejected": -0.39108654856681824, + "step": 1680 + }, + { + "epoch": 0.32, + "learning_rate": 4.2867630969845235e-06, + "logits/chosen": -1.6920133829116821, + "logits/rejected": -0.9446209073066711, + "logps/chosen": -681.039794921875, + "logps/rejected": -1275.439453125, + "loss": 0.0705, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15800414979457855, + "rewards/margins": 0.2925296425819397, + "rewards/rejected": -0.45053377747535706, + "step": 1690 + }, + { + "epoch": 0.32, + "learning_rate": 4.275097705053951e-06, + "logits/chosen": -1.7884811162948608, + "logits/rejected": -1.1299197673797607, + "logps/chosen": -559.9410400390625, + "logps/rejected": -1295.1170654296875, + "loss": 0.0706, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12000130116939545, + "rewards/margins": 0.3231123387813568, + "rewards/rejected": -0.44311365485191345, + "step": 1700 + }, + { + "epoch": 0.33, + "learning_rate": 4.263353840751023e-06, + "logits/chosen": -1.5231735706329346, + "logits/rejected": -0.906470775604248, + "logps/chosen": -560.7247924804688, + "logps/rejected": -1323.56494140625, + "loss": 0.0441, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.11194105446338654, + "rewards/margins": 0.3386703431606293, + "rewards/rejected": -0.450611412525177, + "step": 1710 + }, + { + "epoch": 0.33, + "learning_rate": 4.251532023240901e-06, + "logits/chosen": -1.5177276134490967, + "logits/rejected": -1.092954397201538, + "logps/chosen": -654.8155517578125, + "logps/rejected": -1194.3641357421875, + "loss": 0.0935, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17031201720237732, + "rewards/margins": 0.2336687594652176, + "rewards/rejected": -0.4039807915687561, + "step": 1720 + }, + { + "epoch": 0.33, + "learning_rate": 4.239632775134857e-06, + "logits/chosen": -1.595540165901184, + "logits/rejected": -0.9981781244277954, + "logps/chosen": -609.3163452148438, + "logps/rejected": -1126.6153564453125, + "loss": 0.0942, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14758938550949097, + "rewards/margins": 0.2600153684616089, + "rewards/rejected": -0.40760475397109985, + "step": 1730 + }, + { + "epoch": 0.33, + "learning_rate": 4.227656622467162e-06, + "logits/chosen": -1.6388444900512695, + "logits/rejected": -1.2317636013031006, + "logps/chosen": -632.548828125, + "logps/rejected": -1173.0673828125, + "loss": 0.0929, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13355585932731628, + "rewards/margins": 0.26612335443496704, + "rewards/rejected": -0.3996792137622833, + "step": 1740 + }, + { + "epoch": 0.33, + "learning_rate": 4.215604094671835e-06, + "logits/chosen": -1.3533117771148682, + "logits/rejected": -0.6486954689025879, + "logps/chosen": -734.0782470703125, + "logps/rejected": -1259.840576171875, + "loss": 0.1028, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21947124600410461, + "rewards/margins": 0.27327650785446167, + "rewards/rejected": -0.49274778366088867, + "step": 1750 + }, + { + "epoch": 0.34, + "learning_rate": 4.203475724559235e-06, + "logits/chosen": -1.5916444063186646, + "logits/rejected": -1.0552420616149902, + "logps/chosen": -757.8905639648438, + "logps/rejected": -1302.722900390625, + "loss": 0.1028, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2682662308216095, + "rewards/margins": 0.2391747236251831, + "rewards/rejected": -0.507440984249115, + "step": 1760 + }, + { + "epoch": 0.34, + "learning_rate": 4.191272048292514e-06, + "logits/chosen": -1.5339223146438599, + "logits/rejected": -0.9809187650680542, + "logps/chosen": -633.8641357421875, + "logps/rejected": -1267.8603515625, + "loss": 0.0968, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.20641283690929413, + "rewards/margins": 0.2913345396518707, + "rewards/rejected": -0.49774736166000366, + "step": 1770 + }, + { + "epoch": 0.34, + "learning_rate": 4.178993605363904e-06, + "logits/chosen": -1.763193130493164, + "logits/rejected": -1.235198736190796, + "logps/chosen": -482.87615966796875, + "logps/rejected": -1054.1783447265625, + "loss": 0.1001, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10275360196828842, + "rewards/margins": 0.2246766984462738, + "rewards/rejected": -0.3274303078651428, + "step": 1780 + }, + { + "epoch": 0.34, + "learning_rate": 4.166640938570879e-06, + "logits/chosen": -1.6158673763275146, + "logits/rejected": -0.9096325039863586, + "logps/chosen": -688.8383178710938, + "logps/rejected": -1214.840576171875, + "loss": 0.0637, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13154849410057068, + "rewards/margins": 0.2507414221763611, + "rewards/rejected": -0.38228991627693176, + "step": 1790 + }, + { + "epoch": 0.34, + "learning_rate": 4.154214593992149e-06, + "logits/chosen": -1.6515785455703735, + "logits/rejected": -1.28810715675354, + "logps/chosen": -586.6110229492188, + "logps/rejected": -1224.0474853515625, + "loss": 0.0645, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15291659533977509, + "rewards/margins": 0.26770395040512085, + "rewards/rejected": -0.42062050104141235, + "step": 1800 + }, + { + "epoch": 0.34, + "learning_rate": 4.1417151209635265e-06, + "logits/chosen": -1.343052864074707, + "logits/rejected": -1.1219394207000732, + "logps/chosen": -727.157470703125, + "logps/rejected": -1348.6068115234375, + "loss": 0.1003, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22418935596942902, + "rewards/margins": 0.24681849777698517, + "rewards/rejected": -0.4710078239440918, + "step": 1810 + }, + { + "epoch": 0.35, + "learning_rate": 4.129143072053639e-06, + "logits/chosen": -1.6531670093536377, + "logits/rejected": -1.0683776140213013, + "logps/chosen": -628.3314208984375, + "logps/rejected": -1343.596435546875, + "loss": 0.061, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1800796240568161, + "rewards/margins": 0.3129463195800781, + "rewards/rejected": -0.4930259585380554, + "step": 1820 + }, + { + "epoch": 0.35, + "learning_rate": 4.116499003039499e-06, + "logits/chosen": -1.6415526866912842, + "logits/rejected": -0.8921445608139038, + "logps/chosen": -508.5306701660156, + "logps/rejected": -1111.977783203125, + "loss": 0.0744, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1160668283700943, + "rewards/margins": 0.27721890807151794, + "rewards/rejected": -0.39328575134277344, + "step": 1830 + }, + { + "epoch": 0.35, + "learning_rate": 4.103783472881942e-06, + "logits/chosen": -1.6161365509033203, + "logits/rejected": -0.9930692911148071, + "logps/chosen": -510.691162109375, + "logps/rejected": -1178.771240234375, + "loss": 0.0701, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12652714550495148, + "rewards/margins": 0.2798650562763214, + "rewards/rejected": -0.4063921868801117, + "step": 1840 + }, + { + "epoch": 0.35, + "learning_rate": 4.0909970437009094e-06, + "logits/chosen": -1.407378077507019, + "logits/rejected": -1.0627632141113281, + "logps/chosen": -604.5719604492188, + "logps/rejected": -1218.077880859375, + "loss": 0.0961, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22636625170707703, + "rewards/margins": 0.26213282346725464, + "rewards/rejected": -0.4884990155696869, + "step": 1850 + }, + { + "epoch": 0.35, + "learning_rate": 4.078140280750598e-06, + "logits/chosen": -1.472339153289795, + "logits/rejected": -1.0137022733688354, + "logps/chosen": -691.4371948242188, + "logps/rejected": -1220.0880126953125, + "loss": 0.1188, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23141947388648987, + "rewards/margins": 0.21889667212963104, + "rewards/rejected": -0.4503161907196045, + "step": 1860 + }, + { + "epoch": 0.36, + "learning_rate": 4.065213752394478e-06, + "logits/chosen": -1.5468190908432007, + "logits/rejected": -0.9151954650878906, + "logps/chosen": -664.4764404296875, + "logps/rejected": -1273.167724609375, + "loss": 0.0767, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2180008590221405, + "rewards/margins": 0.28027474880218506, + "rewards/rejected": -0.4982755780220032, + "step": 1870 + }, + { + "epoch": 0.36, + "learning_rate": 4.052218030080162e-06, + "logits/chosen": -1.7460209131240845, + "logits/rejected": -0.8932159543037415, + "logps/chosen": -689.2654418945312, + "logps/rejected": -1396.868896484375, + "loss": 0.0715, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2002066820859909, + "rewards/margins": 0.335799902677536, + "rewards/rejected": -0.5360065698623657, + "step": 1880 + }, + { + "epoch": 0.36, + "learning_rate": 4.039153688314146e-06, + "logits/chosen": -1.3742153644561768, + "logits/rejected": -0.9114626049995422, + "logps/chosen": -634.3856811523438, + "logps/rejected": -1262.1175537109375, + "loss": 0.1016, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25341543555259705, + "rewards/margins": 0.24519459903240204, + "rewards/rejected": -0.49861007928848267, + "step": 1890 + }, + { + "epoch": 0.36, + "learning_rate": 4.026021304636408e-06, + "logits/chosen": -1.4285153150558472, + "logits/rejected": -0.9699563980102539, + "logps/chosen": -621.96533203125, + "logps/rejected": -1336.2926025390625, + "loss": 0.0789, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19991159439086914, + "rewards/margins": 0.2923136055469513, + "rewards/rejected": -0.49222517013549805, + "step": 1900 + }, + { + "epoch": 0.36, + "learning_rate": 4.012821459594881e-06, + "logits/chosen": -1.630352258682251, + "logits/rejected": -1.1409363746643066, + "logps/chosen": -484.24365234375, + "logps/rejected": -1044.751953125, + "loss": 0.0633, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1116863489151001, + "rewards/margins": 0.25152185559272766, + "rewards/rejected": -0.36320820450782776, + "step": 1910 + }, + { + "epoch": 0.37, + "learning_rate": 3.999554736719785e-06, + "logits/chosen": -1.7693662643432617, + "logits/rejected": -1.1132917404174805, + "logps/chosen": -622.8857421875, + "logps/rejected": -1309.3828125, + "loss": 0.0626, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.14811810851097107, + "rewards/margins": 0.32902562618255615, + "rewards/rejected": -0.47714370489120483, + "step": 1920 + }, + { + "epoch": 0.37, + "learning_rate": 3.986221722497832e-06, + "logits/chosen": -1.7638092041015625, + "logits/rejected": -1.087059736251831, + "logps/chosen": -632.57666015625, + "logps/rejected": -1238.3343505859375, + "loss": 0.0726, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16446995735168457, + "rewards/margins": 0.29051706194877625, + "rewards/rejected": -0.4549869894981384, + "step": 1930 + }, + { + "epoch": 0.37, + "learning_rate": 3.9728230063463e-06, + "logits/chosen": -1.2721593379974365, + "logits/rejected": -0.6474477052688599, + "logps/chosen": -648.2594604492188, + "logps/rejected": -1158.4620361328125, + "loss": 0.0716, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1905958205461502, + "rewards/margins": 0.24646992981433868, + "rewards/rejected": -0.4370657801628113, + "step": 1940 + }, + { + "epoch": 0.37, + "learning_rate": 3.9593591805869755e-06, + "logits/chosen": -1.4620612859725952, + "logits/rejected": -1.2620489597320557, + "logps/chosen": -688.633056640625, + "logps/rejected": -1529.7515869140625, + "loss": 0.0975, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1938236951828003, + "rewards/margins": 0.35576432943344116, + "rewards/rejected": -0.5495880246162415, + "step": 1950 + }, + { + "epoch": 0.37, + "learning_rate": 3.945830840419966e-06, + "logits/chosen": -1.7115497589111328, + "logits/rejected": -1.2753444910049438, + "logps/chosen": -479.00384521484375, + "logps/rejected": -1069.37548828125, + "loss": 0.0806, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.021248655393719673, + "rewards/margins": 0.23637816309928894, + "rewards/rejected": -0.25762683153152466, + "step": 1960 + }, + { + "epoch": 0.38, + "learning_rate": 3.932238583897395e-06, + "logits/chosen": -1.6209239959716797, + "logits/rejected": -1.3102586269378662, + "logps/chosen": -497.66693115234375, + "logps/rejected": -1194.21435546875, + "loss": 0.0496, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.04905269294977188, + "rewards/margins": 0.28805986046791077, + "rewards/rejected": -0.33711254596710205, + "step": 1970 + }, + { + "epoch": 0.38, + "learning_rate": 3.918583011896955e-06, + "logits/chosen": -1.5563997030258179, + "logits/rejected": -1.0230815410614014, + "logps/chosen": -551.2222900390625, + "logps/rejected": -1139.404052734375, + "loss": 0.0892, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.036595430225133896, + "rewards/margins": 0.27522173523902893, + "rewards/rejected": -0.31181710958480835, + "step": 1980 + }, + { + "epoch": 0.38, + "learning_rate": 3.904864728095349e-06, + "logits/chosen": -1.6914150714874268, + "logits/rejected": -1.3743771314620972, + "logps/chosen": -563.36865234375, + "logps/rejected": -1147.3048095703125, + "loss": 0.0887, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.06336866319179535, + "rewards/margins": 0.24804475903511047, + "rewards/rejected": -0.311413437128067, + "step": 1990 + }, + { + "epoch": 0.38, + "learning_rate": 3.891084338941603e-06, + "logits/chosen": -1.4719524383544922, + "logits/rejected": -1.0884759426116943, + "logps/chosen": -480.8564453125, + "logps/rejected": -1265.7791748046875, + "loss": 0.1026, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.04422132298350334, + "rewards/margins": 0.32557159662246704, + "rewards/rejected": -0.3697928786277771, + "step": 2000 + }, + { + "epoch": 0.38, + "learning_rate": 3.8772424536302565e-06, + "logits/chosen": -1.7504775524139404, + "logits/rejected": -1.0374706983566284, + "logps/chosen": -581.1937255859375, + "logps/rejected": -1179.3349609375, + "loss": 0.0854, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07610450685024261, + "rewards/margins": 0.26427727937698364, + "rewards/rejected": -0.34038177132606506, + "step": 2010 + }, + { + "epoch": 0.38, + "learning_rate": 3.863339684074432e-06, + "logits/chosen": -1.444999098777771, + "logits/rejected": -1.0082156658172607, + "logps/chosen": -567.4451904296875, + "logps/rejected": -1163.037109375, + "loss": 0.0942, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09151064604520798, + "rewards/margins": 0.25389835238456726, + "rewards/rejected": -0.34540897607803345, + "step": 2020 + }, + { + "epoch": 0.39, + "learning_rate": 3.849376644878783e-06, + "logits/chosen": -1.7114700078964233, + "logits/rejected": -1.2078514099121094, + "logps/chosen": -580.4812622070312, + "logps/rejected": -1253.7298583984375, + "loss": 0.0739, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.11515311151742935, + "rewards/margins": 0.26693490147590637, + "rewards/rejected": -0.3820880353450775, + "step": 2030 + }, + { + "epoch": 0.39, + "learning_rate": 3.835353953312322e-06, + "logits/chosen": -1.5887187719345093, + "logits/rejected": -1.0290062427520752, + "logps/chosen": -550.4044189453125, + "logps/rejected": -1165.352783203125, + "loss": 0.0892, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11724672466516495, + "rewards/margins": 0.26691046357154846, + "rewards/rejected": -0.3841571807861328, + "step": 2040 + }, + { + "epoch": 0.39, + "learning_rate": 3.821272229281139e-06, + "logits/chosen": -1.243851900100708, + "logits/rejected": -0.8313790559768677, + "logps/chosen": -619.6917724609375, + "logps/rejected": -1220.0845947265625, + "loss": 0.1013, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13393720984458923, + "rewards/margins": 0.27672356367111206, + "rewards/rejected": -0.41066068410873413, + "step": 2050 + }, + { + "epoch": 0.39, + "learning_rate": 3.8071320953009906e-06, + "logits/chosen": -1.6292989253997803, + "logits/rejected": -1.1258556842803955, + "logps/chosen": -542.9365234375, + "logps/rejected": -1215.368896484375, + "loss": 0.0616, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.09711267799139023, + "rewards/margins": 0.28968364000320435, + "rewards/rejected": -0.38679632544517517, + "step": 2060 + }, + { + "epoch": 0.39, + "learning_rate": 3.792934176469782e-06, + "logits/chosen": -1.644221544265747, + "logits/rejected": -0.9224053621292114, + "logps/chosen": -615.8048706054688, + "logps/rejected": -1117.5472412109375, + "loss": 0.0962, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09015320241451263, + "rewards/margins": 0.2451203316450119, + "rewards/rejected": -0.3352735638618469, + "step": 2070 + }, + { + "epoch": 0.4, + "learning_rate": 3.7786791004399353e-06, + "logits/chosen": -1.5849639177322388, + "logits/rejected": -0.9769964218139648, + "logps/chosen": -630.0858154296875, + "logps/rejected": -1170.2745361328125, + "loss": 0.081, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12655183672904968, + "rewards/margins": 0.2449195832014084, + "rewards/rejected": -0.37147143483161926, + "step": 2080 + }, + { + "epoch": 0.4, + "learning_rate": 3.764367497390642e-06, + "logits/chosen": -1.6721210479736328, + "logits/rejected": -1.1214954853057861, + "logps/chosen": -552.5462646484375, + "logps/rejected": -1353.6265869140625, + "loss": 0.0585, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.14976239204406738, + "rewards/margins": 0.30934378504753113, + "rewards/rejected": -0.4591061472892761, + "step": 2090 + }, + { + "epoch": 0.4, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": -1.4793144464492798, + "logits/rejected": -0.9002103805541992, + "logps/chosen": -640.5985107421875, + "logps/rejected": -1225.0712890625, + "loss": 0.0905, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.14169077575206757, + "rewards/margins": 0.2867027223110199, + "rewards/rejected": -0.4283934533596039, + "step": 2100 + }, + { + "epoch": 0.4, + "learning_rate": 3.7355772434170523e-06, + "logits/chosen": -1.7350581884384155, + "logits/rejected": -1.0579384565353394, + "logps/chosen": -617.4492797851562, + "logps/rejected": -1214.168212890625, + "loss": 0.0467, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13681480288505554, + "rewards/margins": 0.29050007462501526, + "rewards/rejected": -0.4273148477077484, + "step": 2110 + }, + { + "epoch": 0.4, + "learning_rate": 3.7210998652337016e-06, + "logits/chosen": -1.676896095275879, + "logits/rejected": -0.9806238412857056, + "logps/chosen": -592.6138916015625, + "logps/rejected": -1362.531982421875, + "loss": 0.0429, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.13515350222587585, + "rewards/margins": 0.3417731821537018, + "rewards/rejected": -0.47692665457725525, + "step": 2120 + }, + { + "epoch": 0.41, + "learning_rate": 3.7065685054565277e-06, + "logits/chosen": -1.7495753765106201, + "logits/rejected": -0.8682845830917358, + "logps/chosen": -585.2340087890625, + "logps/rejected": -1222.46044921875, + "loss": 0.0467, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.10164134204387665, + "rewards/margins": 0.33252280950546265, + "rewards/rejected": -0.4341641366481781, + "step": 2130 + }, + { + "epoch": 0.41, + "learning_rate": 3.691983806478494e-06, + "logits/chosen": -1.7462999820709229, + "logits/rejected": -1.2676109075546265, + "logps/chosen": -485.5740661621094, + "logps/rejected": -1216.7451171875, + "loss": 0.0894, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07753627002239227, + "rewards/margins": 0.3056955933570862, + "rewards/rejected": -0.38323187828063965, + "step": 2140 + }, + { + "epoch": 0.41, + "learning_rate": 3.677346413050551e-06, + "logits/chosen": -1.674078345298767, + "logits/rejected": -1.3559746742248535, + "logps/chosen": -565.6333618164062, + "logps/rejected": -1339.0338134765625, + "loss": 0.039, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.06904712319374084, + "rewards/margins": 0.356864869594574, + "rewards/rejected": -0.4259119927883148, + "step": 2150 + }, + { + "epoch": 0.41, + "learning_rate": 3.6626569722531268e-06, + "logits/chosen": -1.594458818435669, + "logits/rejected": -0.9288966059684753, + "logps/chosen": -594.22216796875, + "logps/rejected": -1178.4859619140625, + "loss": 0.0878, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10133261978626251, + "rewards/margins": 0.2484547644853592, + "rewards/rejected": -0.3497873842716217, + "step": 2160 + }, + { + "epoch": 0.41, + "learning_rate": 3.6479161334675294e-06, + "logits/chosen": -1.582834005355835, + "logits/rejected": -0.854317843914032, + "logps/chosen": -631.1778564453125, + "logps/rejected": -1246.053466796875, + "loss": 0.0852, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12199946492910385, + "rewards/margins": 0.2943424880504608, + "rewards/rejected": -0.41634196043014526, + "step": 2170 + }, + { + "epoch": 0.42, + "learning_rate": 3.6331245483472353e-06, + "logits/chosen": -1.458630919456482, + "logits/rejected": -1.0171302556991577, + "logps/chosen": -530.21142578125, + "logps/rejected": -1186.492919921875, + "loss": 0.0895, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10598714649677277, + "rewards/margins": 0.29519587755203247, + "rewards/rejected": -0.40118303894996643, + "step": 2180 + }, + { + "epoch": 0.42, + "learning_rate": 3.6182828707890816e-06, + "logits/chosen": -1.6425663232803345, + "logits/rejected": -1.0837091207504272, + "logps/chosen": -593.2979125976562, + "logps/rejected": -1225.735595703125, + "loss": 0.1029, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1416325867176056, + "rewards/margins": 0.29666662216186523, + "rewards/rejected": -0.43829917907714844, + "step": 2190 + }, + { + "epoch": 0.42, + "learning_rate": 3.6033917569043604e-06, + "logits/chosen": -1.7086814641952515, + "logits/rejected": -1.0961934328079224, + "logps/chosen": -586.9171142578125, + "logps/rejected": -1241.936279296875, + "loss": 0.0486, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.14558613300323486, + "rewards/margins": 0.2909151613712311, + "rewards/rejected": -0.43650132417678833, + "step": 2200 + }, + { + "epoch": 0.42, + "learning_rate": 3.588451864989811e-06, + "logits/chosen": -1.4291936159133911, + "logits/rejected": -0.9254032373428345, + "logps/chosen": -670.0648193359375, + "logps/rejected": -1274.3505859375, + "loss": 0.0869, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16813287138938904, + "rewards/margins": 0.2989405393600464, + "rewards/rejected": -0.4670734405517578, + "step": 2210 + }, + { + "epoch": 0.42, + "learning_rate": 3.5734638554985234e-06, + "logits/chosen": -1.6968755722045898, + "logits/rejected": -1.161507248878479, + "logps/chosen": -432.891845703125, + "logps/rejected": -1126.6932373046875, + "loss": 0.0636, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.04656369611620903, + "rewards/margins": 0.2924540936946869, + "rewards/rejected": -0.339017778635025, + "step": 2220 + }, + { + "epoch": 0.42, + "learning_rate": 3.5584283910107343e-06, + "logits/chosen": -1.5758813619613647, + "logits/rejected": -0.5977634787559509, + "logps/chosen": -618.425537109375, + "logps/rejected": -1015.8828125, + "loss": 0.081, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09514687210321426, + "rewards/margins": 0.2435726374387741, + "rewards/rejected": -0.33871954679489136, + "step": 2230 + }, + { + "epoch": 0.43, + "learning_rate": 3.543346136204545e-06, + "logits/chosen": -1.59457528591156, + "logits/rejected": -0.7562257647514343, + "logps/chosen": -662.0821533203125, + "logps/rejected": -1304.8880615234375, + "loss": 0.0677, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.12489262968301773, + "rewards/margins": 0.32123300433158875, + "rewards/rejected": -0.44612565636634827, + "step": 2240 + }, + { + "epoch": 0.43, + "learning_rate": 3.5282177578265295e-06, + "logits/chosen": -1.7696926593780518, + "logits/rejected": -1.2749775648117065, + "logps/chosen": -482.4493103027344, + "logps/rejected": -1066.5418701171875, + "loss": 0.0886, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.10509966313838959, + "rewards/margins": 0.2710351347923279, + "rewards/rejected": -0.37613481283187866, + "step": 2250 + }, + { + "epoch": 0.43, + "learning_rate": 3.5130439246622635e-06, + "logits/chosen": -1.3018282651901245, + "logits/rejected": -0.8078845143318176, + "logps/chosen": -570.1189575195312, + "logps/rejected": -1327.9266357421875, + "loss": 0.067, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1495288759469986, + "rewards/margins": 0.3377271592617035, + "rewards/rejected": -0.4872560501098633, + "step": 2260 + }, + { + "epoch": 0.43, + "learning_rate": 3.497825307506758e-06, + "logits/chosen": -1.7156795263290405, + "logits/rejected": -1.2350342273712158, + "logps/chosen": -651.0853271484375, + "logps/rejected": -1260.7733154296875, + "loss": 0.0751, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15341240167617798, + "rewards/margins": 0.28627315163612366, + "rewards/rejected": -0.43968549370765686, + "step": 2270 + }, + { + "epoch": 0.43, + "learning_rate": 3.4825625791348093e-06, + "logits/chosen": -1.6436914205551147, + "logits/rejected": -0.9447058439254761, + "logps/chosen": -674.1998291015625, + "logps/rejected": -1122.89599609375, + "loss": 0.1037, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17227864265441895, + "rewards/margins": 0.2191500961780548, + "rewards/rejected": -0.39142873883247375, + "step": 2280 + }, + { + "epoch": 0.44, + "learning_rate": 3.467256414271249e-06, + "logits/chosen": -1.7851943969726562, + "logits/rejected": -0.9154273271560669, + "logps/chosen": -685.0301513671875, + "logps/rejected": -1168.9381103515625, + "loss": 0.1071, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.15380102396011353, + "rewards/margins": 0.24055905640125275, + "rewards/rejected": -0.39436009526252747, + "step": 2290 + }, + { + "epoch": 0.44, + "learning_rate": 3.4519074895611245e-06, + "logits/chosen": -1.6110646724700928, + "logits/rejected": -1.0128824710845947, + "logps/chosen": -507.72900390625, + "logps/rejected": -1104.7791748046875, + "loss": 0.0777, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09803974628448486, + "rewards/margins": 0.2795708179473877, + "rewards/rejected": -0.37761059403419495, + "step": 2300 + }, + { + "epoch": 0.44, + "learning_rate": 3.436516483539781e-06, + "logits/chosen": -1.6653196811676025, + "logits/rejected": -0.9805139303207397, + "logps/chosen": -491.69525146484375, + "logps/rejected": -1238.8643798828125, + "loss": 0.0466, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.06906162202358246, + "rewards/margins": 0.3539879322052002, + "rewards/rejected": -0.42304953932762146, + "step": 2310 + }, + { + "epoch": 0.44, + "learning_rate": 3.421084076602867e-06, + "logits/chosen": -1.6048294305801392, + "logits/rejected": -1.1272857189178467, + "logps/chosen": -570.3504028320312, + "logps/rejected": -1230.8138427734375, + "loss": 0.0497, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11816605180501938, + "rewards/margins": 0.32945695519447327, + "rewards/rejected": -0.44762295484542847, + "step": 2320 + }, + { + "epoch": 0.44, + "learning_rate": 3.405610950976257e-06, + "logits/chosen": -1.7304108142852783, + "logits/rejected": -1.2529585361480713, + "logps/chosen": -631.9100341796875, + "logps/rejected": -1261.7708740234375, + "loss": 0.0932, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14346104860305786, + "rewards/margins": 0.3222948908805847, + "rewards/rejected": -0.4657559394836426, + "step": 2330 + }, + { + "epoch": 0.45, + "learning_rate": 3.3900977906858923e-06, + "logits/chosen": -1.6869083642959595, + "logits/rejected": -1.047593355178833, + "logps/chosen": -709.0650024414062, + "logps/rejected": -1288.22021484375, + "loss": 0.0822, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19333884119987488, + "rewards/margins": 0.26803427934646606, + "rewards/rejected": -0.46137315034866333, + "step": 2340 + }, + { + "epoch": 0.45, + "learning_rate": 3.3745452815275375e-06, + "logits/chosen": -1.5970579385757446, + "logits/rejected": -1.0045435428619385, + "logps/chosen": -671.1852416992188, + "logps/rejected": -1301.151611328125, + "loss": 0.0775, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.16181443631649017, + "rewards/margins": 0.26783424615859985, + "rewards/rejected": -0.4296487271785736, + "step": 2350 + }, + { + "epoch": 0.45, + "learning_rate": 3.3589541110364678e-06, + "logits/chosen": -1.5840835571289062, + "logits/rejected": -1.0760002136230469, + "logps/chosen": -536.934814453125, + "logps/rejected": -1156.111572265625, + "loss": 0.0945, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10547558218240738, + "rewards/margins": 0.2672649025917053, + "rewards/rejected": -0.3727405071258545, + "step": 2360 + }, + { + "epoch": 0.45, + "learning_rate": 3.3433249684570757e-06, + "logits/chosen": -1.7080332040786743, + "logits/rejected": -0.9376260042190552, + "logps/chosen": -451.25927734375, + "logps/rejected": -1115.48681640625, + "loss": 0.0874, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.03897695988416672, + "rewards/margins": 0.29128187894821167, + "rewards/rejected": -0.330258846282959, + "step": 2370 + }, + { + "epoch": 0.45, + "learning_rate": 3.3276585447123957e-06, + "logits/chosen": -1.6589637994766235, + "logits/rejected": -1.1556400060653687, + "logps/chosen": -603.0030517578125, + "logps/rejected": -1086.166015625, + "loss": 0.1274, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1070268377661705, + "rewards/margins": 0.220013827085495, + "rewards/rejected": -0.3270407021045685, + "step": 2380 + }, + { + "epoch": 0.46, + "learning_rate": 3.3119555323735664e-06, + "logits/chosen": -1.5776820182800293, + "logits/rejected": -1.1128264665603638, + "logps/chosen": -654.4660034179688, + "logps/rejected": -1288.406005859375, + "loss": 0.0588, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16737082600593567, + "rewards/margins": 0.2906762957572937, + "rewards/rejected": -0.458047091960907, + "step": 2390 + }, + { + "epoch": 0.46, + "learning_rate": 3.2962166256292116e-06, + "logits/chosen": -1.3844215869903564, + "logits/rejected": -1.0129426717758179, + "logps/chosen": -630.933837890625, + "logps/rejected": -1127.8221435546875, + "loss": 0.1021, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18399353325366974, + "rewards/margins": 0.21213479340076447, + "rewards/rejected": -0.3961283266544342, + "step": 2400 + }, + { + "epoch": 0.46, + "learning_rate": 3.2804425202547494e-06, + "logits/chosen": -1.4825665950775146, + "logits/rejected": -0.9888661503791809, + "logps/chosen": -685.001220703125, + "logps/rejected": -1302.7615966796875, + "loss": 0.09, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19598498940467834, + "rewards/margins": 0.27030500769615173, + "rewards/rejected": -0.46629005670547485, + "step": 2410 + }, + { + "epoch": 0.46, + "learning_rate": 3.2646339135816386e-06, + "logits/chosen": -1.4710817337036133, + "logits/rejected": -0.8951467275619507, + "logps/chosen": -549.333740234375, + "logps/rejected": -1265.0982666015625, + "loss": 0.0674, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.09500465542078018, + "rewards/margins": 0.3293246626853943, + "rewards/rejected": -0.42432934045791626, + "step": 2420 + }, + { + "epoch": 0.46, + "learning_rate": 3.2487915044665485e-06, + "logits/chosen": -1.4130220413208008, + "logits/rejected": -1.1104599237442017, + "logps/chosen": -567.1212158203125, + "logps/rejected": -1161.703857421875, + "loss": 0.08, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10543553531169891, + "rewards/margins": 0.2853809893131256, + "rewards/rejected": -0.39081650972366333, + "step": 2430 + }, + { + "epoch": 0.46, + "learning_rate": 3.2329159932604638e-06, + "logits/chosen": -1.5364049673080444, + "logits/rejected": -0.9546027183532715, + "logps/chosen": -546.604736328125, + "logps/rejected": -1226.5767822265625, + "loss": 0.069, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.11398911476135254, + "rewards/margins": 0.3227608799934387, + "rewards/rejected": -0.43675002455711365, + "step": 2440 + }, + { + "epoch": 0.47, + "learning_rate": 3.217008081777726e-06, + "logits/chosen": -1.4038457870483398, + "logits/rejected": -0.9470579028129578, + "logps/chosen": -529.3914184570312, + "logps/rejected": -1212.7598876953125, + "loss": 0.078, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10971584171056747, + "rewards/margins": 0.30522698163986206, + "rewards/rejected": -0.41494280099868774, + "step": 2450 + }, + { + "epoch": 0.47, + "learning_rate": 3.201068473265007e-06, + "logits/chosen": -1.6278892755508423, + "logits/rejected": -0.9200417399406433, + "logps/chosen": -640.5203247070312, + "logps/rejected": -1258.502197265625, + "loss": 0.0703, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1412404179573059, + "rewards/margins": 0.2885693311691284, + "rewards/rejected": -0.42980971932411194, + "step": 2460 + }, + { + "epoch": 0.47, + "learning_rate": 3.1850978723702213e-06, + "logits/chosen": -1.5890612602233887, + "logits/rejected": -0.9910513758659363, + "logps/chosen": -669.634521484375, + "logps/rejected": -1339.1484375, + "loss": 0.0862, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17336814105510712, + "rewards/margins": 0.3014371395111084, + "rewards/rejected": -0.4748052656650543, + "step": 2470 + }, + { + "epoch": 0.47, + "learning_rate": 3.1690969851113724e-06, + "logits/chosen": -1.754757285118103, + "logits/rejected": -1.1542155742645264, + "logps/chosen": -693.3397216796875, + "logps/rejected": -1303.1055908203125, + "loss": 0.0825, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1847262680530548, + "rewards/margins": 0.2816571593284607, + "rewards/rejected": -0.4663833975791931, + "step": 2480 + }, + { + "epoch": 0.47, + "learning_rate": 3.1530665188453463e-06, + "logits/chosen": -1.4916189908981323, + "logits/rejected": -0.8297182321548462, + "logps/chosen": -726.0441284179688, + "logps/rejected": -1387.1719970703125, + "loss": 0.0609, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.22441205382347107, + "rewards/margins": 0.29351991415023804, + "rewards/rejected": -0.5179319381713867, + "step": 2490 + }, + { + "epoch": 0.48, + "learning_rate": 3.137007182236637e-06, + "logits/chosen": -1.4697926044464111, + "logits/rejected": -1.0665897130966187, + "logps/chosen": -555.7542724609375, + "logps/rejected": -1162.6217041015625, + "loss": 0.0632, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11064179986715317, + "rewards/margins": 0.2883552014827728, + "rewards/rejected": -0.39899706840515137, + "step": 2500 + }, + { + "epoch": 0.48, + "learning_rate": 3.1209196852260204e-06, + "logits/chosen": -1.5638642311096191, + "logits/rejected": -0.9049351811408997, + "logps/chosen": -530.5607299804688, + "logps/rejected": -1162.5194091796875, + "loss": 0.084, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.10229738056659698, + "rewards/margins": 0.29367098212242126, + "rewards/rejected": -0.39596837759017944, + "step": 2510 + }, + { + "epoch": 0.48, + "learning_rate": 3.1048047389991693e-06, + "logits/chosen": -1.5408356189727783, + "logits/rejected": -0.8372970819473267, + "logps/chosen": -528.8843994140625, + "logps/rejected": -1207.514404296875, + "loss": 0.0431, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1121770516037941, + "rewards/margins": 0.3144737184047699, + "rewards/rejected": -0.4266508221626282, + "step": 2520 + }, + { + "epoch": 0.48, + "learning_rate": 3.0886630559552144e-06, + "logits/chosen": -1.7452852725982666, + "logits/rejected": -0.9543391466140747, + "logps/chosen": -668.1608276367188, + "logps/rejected": -1297.2008056640625, + "loss": 0.0579, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1479906588792801, + "rewards/margins": 0.3234715759754181, + "rewards/rejected": -0.4714622497558594, + "step": 2530 + }, + { + "epoch": 0.48, + "learning_rate": 3.072495349675249e-06, + "logits/chosen": -1.7660051584243774, + "logits/rejected": -0.9404336214065552, + "logps/chosen": -675.3543090820312, + "logps/rejected": -1376.85986328125, + "loss": 0.0867, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1329096257686615, + "rewards/margins": 0.3379344046115875, + "rewards/rejected": -0.470844030380249, + "step": 2540 + }, + { + "epoch": 0.49, + "learning_rate": 3.056302334890786e-06, + "logits/chosen": -1.5463120937347412, + "logits/rejected": -0.9701935648918152, + "logps/chosen": -559.4407958984375, + "logps/rejected": -1268.9881591796875, + "loss": 0.0539, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.10604407638311386, + "rewards/margins": 0.3344349265098572, + "rewards/rejected": -0.440479040145874, + "step": 2550 + }, + { + "epoch": 0.49, + "learning_rate": 3.04008472745216e-06, + "logits/chosen": -1.4827139377593994, + "logits/rejected": -0.9881393313407898, + "logps/chosen": -527.420166015625, + "logps/rejected": -1177.581298828125, + "loss": 0.0891, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.079104483127594, + "rewards/margins": 0.29415425658226013, + "rewards/rejected": -0.3732587695121765, + "step": 2560 + }, + { + "epoch": 0.49, + "learning_rate": 3.0238432442968803e-06, + "logits/chosen": -1.557047724723816, + "logits/rejected": -0.9846252202987671, + "logps/chosen": -551.916259765625, + "logps/rejected": -1224.5875244140625, + "loss": 0.0524, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.09614777565002441, + "rewards/margins": 0.30018511414527893, + "rewards/rejected": -0.39633288979530334, + "step": 2570 + }, + { + "epoch": 0.49, + "learning_rate": 3.0075786034179407e-06, + "logits/chosen": -1.450969934463501, + "logits/rejected": -1.3005410432815552, + "logps/chosen": -476.8797912597656, + "logps/rejected": -1157.756103515625, + "loss": 0.0783, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.0792437344789505, + "rewards/margins": 0.27935245633125305, + "rewards/rejected": -0.35859617590904236, + "step": 2580 + }, + { + "epoch": 0.49, + "learning_rate": 2.9912915238320755e-06, + "logits/chosen": -1.64337158203125, + "logits/rejected": -1.1302156448364258, + "logps/chosen": -503.0484924316406, + "logps/rejected": -1008.0091552734375, + "loss": 0.101, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07734871655702591, + "rewards/margins": 0.2203529179096222, + "rewards/rejected": -0.2977016568183899, + "step": 2590 + }, + { + "epoch": 0.5, + "learning_rate": 2.974982725547976e-06, + "logits/chosen": -1.5070769786834717, + "logits/rejected": -1.2087517976760864, + "logps/chosen": -518.1423950195312, + "logps/rejected": -1188.7457275390625, + "loss": 0.0641, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06826291978359222, + "rewards/margins": 0.28302261233329773, + "rewards/rejected": -0.35128551721572876, + "step": 2600 + }, + { + "epoch": 0.5, + "learning_rate": 2.958652929534456e-06, + "logits/chosen": -1.5541181564331055, + "logits/rejected": -0.9500897526741028, + "logps/chosen": -427.2430725097656, + "logps/rejected": -1138.5986328125, + "loss": 0.0551, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.06738888472318649, + "rewards/margins": 0.3063209652900696, + "rewards/rejected": -0.37370985746383667, + "step": 2610 + }, + { + "epoch": 0.5, + "learning_rate": 2.9423028576885894e-06, + "logits/chosen": -1.421770453453064, + "logits/rejected": -0.9556745290756226, + "logps/chosen": -574.5811767578125, + "logps/rejected": -1172.9805908203125, + "loss": 0.1039, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12262606620788574, + "rewards/margins": 0.26561444997787476, + "rewards/rejected": -0.3882405161857605, + "step": 2620 + }, + { + "epoch": 0.5, + "learning_rate": 2.9259332328037852e-06, + "logits/chosen": -1.5648980140686035, + "logits/rejected": -0.9057445526123047, + "logps/chosen": -605.000732421875, + "logps/rejected": -1209.2060546875, + "loss": 0.0589, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08145221322774887, + "rewards/margins": 0.3243905007839203, + "rewards/rejected": -0.40584272146224976, + "step": 2630 + }, + { + "epoch": 0.5, + "learning_rate": 2.9095447785378446e-06, + "logits/chosen": -1.631359338760376, + "logits/rejected": -0.9921914339065552, + "logps/chosen": -550.8638305664062, + "logps/rejected": -1227.114501953125, + "loss": 0.0745, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.10412851721048355, + "rewards/margins": 0.3038374185562134, + "rewards/rejected": -0.4079659581184387, + "step": 2640 + }, + { + "epoch": 0.5, + "learning_rate": 2.893138219380964e-06, + "logits/chosen": -1.8246917724609375, + "logits/rejected": -1.1634905338287354, + "logps/chosen": -476.13397216796875, + "logps/rejected": -1130.2000732421875, + "loss": 0.0595, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.034350063651800156, + "rewards/margins": 0.33886438608169556, + "rewards/rejected": -0.37321439385414124, + "step": 2650 + }, + { + "epoch": 0.51, + "learning_rate": 2.876714280623708e-06, + "logits/chosen": -1.4653544425964355, + "logits/rejected": -0.8596148490905762, + "logps/chosen": -475.05853271484375, + "logps/rejected": -1144.7532958984375, + "loss": 0.0852, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07599582523107529, + "rewards/margins": 0.2964363992214203, + "rewards/rejected": -0.37243229150772095, + "step": 2660 + }, + { + "epoch": 0.51, + "learning_rate": 2.8602736883249504e-06, + "logits/chosen": -1.6749900579452515, + "logits/rejected": -1.0397599935531616, + "logps/chosen": -632.7877807617188, + "logps/rejected": -1282.6087646484375, + "loss": 0.0934, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13829271495342255, + "rewards/margins": 0.28984689712524414, + "rewards/rejected": -0.4281395971775055, + "step": 2670 + }, + { + "epoch": 0.51, + "learning_rate": 2.843817169279772e-06, + "logits/chosen": -1.4789026975631714, + "logits/rejected": -1.0164978504180908, + "logps/chosen": -588.529052734375, + "logps/rejected": -1097.4697265625, + "loss": 0.0939, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11081542819738388, + "rewards/margins": 0.2728538513183594, + "rewards/rejected": -0.38366931676864624, + "step": 2680 + }, + { + "epoch": 0.51, + "learning_rate": 2.8273454509873333e-06, + "logits/chosen": -1.5279271602630615, + "logits/rejected": -1.0477749109268188, + "logps/chosen": -694.4437866210938, + "logps/rejected": -1294.1767578125, + "loss": 0.0812, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15735623240470886, + "rewards/margins": 0.27929773926734924, + "rewards/rejected": -0.4366539418697357, + "step": 2690 + }, + { + "epoch": 0.51, + "learning_rate": 2.8108592616187135e-06, + "logits/chosen": -1.6551815271377563, + "logits/rejected": -0.8189488649368286, + "logps/chosen": -744.4472045898438, + "logps/rejected": -1390.954345703125, + "loss": 0.0533, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1515381634235382, + "rewards/margins": 0.3364045023918152, + "rewards/rejected": -0.4879426956176758, + "step": 2700 + }, + { + "epoch": 0.52, + "learning_rate": 2.7943593299847186e-06, + "logits/chosen": -1.5556144714355469, + "logits/rejected": -0.8782920837402344, + "logps/chosen": -578.2349853515625, + "logps/rejected": -1223.9837646484375, + "loss": 0.0502, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.12136213481426239, + "rewards/margins": 0.32430487871170044, + "rewards/rejected": -0.4456670880317688, + "step": 2710 + }, + { + "epoch": 0.52, + "learning_rate": 2.7778463855036656e-06, + "logits/chosen": -1.4753865003585815, + "logits/rejected": -0.8386212587356567, + "logps/chosen": -651.51171875, + "logps/rejected": -1406.922119140625, + "loss": 0.0434, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1467413604259491, + "rewards/margins": 0.3557416498661041, + "rewards/rejected": -0.5024830102920532, + "step": 2720 + }, + { + "epoch": 0.52, + "learning_rate": 2.761321158169134e-06, + "logits/chosen": -1.6173675060272217, + "logits/rejected": -1.0392745733261108, + "logps/chosen": -618.5396118164062, + "logps/rejected": -1164.376708984375, + "loss": 0.07, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1464042365550995, + "rewards/margins": 0.26444289088249207, + "rewards/rejected": -0.41084709763526917, + "step": 2730 + }, + { + "epoch": 0.52, + "learning_rate": 2.7447843785176958e-06, + "logits/chosen": -1.4622766971588135, + "logits/rejected": -0.898713231086731, + "logps/chosen": -549.015625, + "logps/rejected": -1307.8948974609375, + "loss": 0.0594, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12090383470058441, + "rewards/margins": 0.34172388911247253, + "rewards/rejected": -0.46262773871421814, + "step": 2740 + }, + { + "epoch": 0.52, + "learning_rate": 2.728236777596621e-06, + "logits/chosen": -1.4183399677276611, + "logits/rejected": -0.8845669031143188, + "logps/chosen": -644.03662109375, + "logps/rejected": -1180.503173828125, + "loss": 0.1105, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.14675351977348328, + "rewards/margins": 0.2584415078163147, + "rewards/rejected": -0.40519505739212036, + "step": 2750 + }, + { + "epoch": 0.53, + "learning_rate": 2.7116790869315583e-06, + "logits/chosen": -1.459571361541748, + "logits/rejected": -0.855141282081604, + "logps/chosen": -544.542236328125, + "logps/rejected": -1199.67919921875, + "loss": 0.0687, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.10830040276050568, + "rewards/margins": 0.31138211488723755, + "rewards/rejected": -0.4196825623512268, + "step": 2760 + }, + { + "epoch": 0.53, + "learning_rate": 2.695112038494198e-06, + "logits/chosen": -1.556896448135376, + "logits/rejected": -1.0641980171203613, + "logps/chosen": -526.887939453125, + "logps/rejected": -1217.894775390625, + "loss": 0.0929, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11782244592905045, + "rewards/margins": 0.3022124171257019, + "rewards/rejected": -0.42003482580184937, + "step": 2770 + }, + { + "epoch": 0.53, + "learning_rate": 2.6785363646699125e-06, + "logits/chosen": -1.475042700767517, + "logits/rejected": -0.8031581044197083, + "logps/chosen": -695.498046875, + "logps/rejected": -1166.9180908203125, + "loss": 0.0699, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12228862196207047, + "rewards/margins": 0.24580267071723938, + "rewards/rejected": -0.36809128522872925, + "step": 2780 + }, + { + "epoch": 0.53, + "learning_rate": 2.6619527982253796e-06, + "logits/chosen": -1.4715737104415894, + "logits/rejected": -1.0043838024139404, + "logps/chosen": -579.2100830078125, + "logps/rejected": -1229.07470703125, + "loss": 0.0635, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11626414954662323, + "rewards/margins": 0.3015051782131195, + "rewards/rejected": -0.41776934266090393, + "step": 2790 + }, + { + "epoch": 0.53, + "learning_rate": 2.6453620722761897e-06, + "logits/chosen": -1.4490864276885986, + "logits/rejected": -0.9089977145195007, + "logps/chosen": -579.8900146484375, + "logps/rejected": -1265.926025390625, + "loss": 0.0658, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11970784515142441, + "rewards/margins": 0.310236394405365, + "rewards/rejected": -0.4299442172050476, + "step": 2800 + }, + { + "epoch": 0.54, + "learning_rate": 2.628764920254435e-06, + "logits/chosen": -1.4776760339736938, + "logits/rejected": -0.9225019216537476, + "logps/chosen": -567.9808959960938, + "logps/rejected": -1210.098876953125, + "loss": 0.077, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0780453160405159, + "rewards/margins": 0.3206408619880676, + "rewards/rejected": -0.39868617057800293, + "step": 2810 + }, + { + "epoch": 0.54, + "learning_rate": 2.6121620758762877e-06, + "logits/chosen": -1.4901140928268433, + "logits/rejected": -0.8745683431625366, + "logps/chosen": -634.2081298828125, + "logps/rejected": -1223.249755859375, + "loss": 0.0933, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13513752818107605, + "rewards/margins": 0.25405651330947876, + "rewards/rejected": -0.3891941010951996, + "step": 2820 + }, + { + "epoch": 0.54, + "learning_rate": 2.595554273109564e-06, + "logits/chosen": -1.6821086406707764, + "logits/rejected": -1.0679817199707031, + "logps/chosen": -539.3198852539062, + "logps/rejected": -1233.5755615234375, + "loss": 0.049, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.09231746941804886, + "rewards/margins": 0.336186945438385, + "rewards/rejected": -0.4285043776035309, + "step": 2830 + }, + { + "epoch": 0.54, + "learning_rate": 2.5789422461412776e-06, + "logits/chosen": -1.4972844123840332, + "logits/rejected": -0.905117392539978, + "logps/chosen": -463.593994140625, + "logps/rejected": -1129.23388671875, + "loss": 0.0552, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05808283016085625, + "rewards/margins": 0.323687881231308, + "rewards/rejected": -0.3817707300186157, + "step": 2840 + }, + { + "epoch": 0.54, + "learning_rate": 2.5623267293451827e-06, + "logits/chosen": -1.6810455322265625, + "logits/rejected": -1.0212938785552979, + "logps/chosen": -548.9854125976562, + "logps/rejected": -1143.27099609375, + "loss": 0.0821, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10023714601993561, + "rewards/margins": 0.2769266366958618, + "rewards/rejected": -0.3771637976169586, + "step": 2850 + }, + { + "epoch": 0.54, + "learning_rate": 2.5457084572493094e-06, + "logits/chosen": -1.5808911323547363, + "logits/rejected": -0.896012008190155, + "logps/chosen": -595.2738037109375, + "logps/rejected": -1394.0679931640625, + "loss": 0.0315, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.0908048152923584, + "rewards/margins": 0.36780017614364624, + "rewards/rejected": -0.45860499143600464, + "step": 2860 + }, + { + "epoch": 0.55, + "learning_rate": 2.5290881645034932e-06, + "logits/chosen": -1.5079399347305298, + "logits/rejected": -1.0337510108947754, + "logps/chosen": -497.4072265625, + "logps/rejected": -1198.070556640625, + "loss": 0.0651, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.06317411363124847, + "rewards/margins": 0.32769569754600525, + "rewards/rejected": -0.3908698558807373, + "step": 2870 + }, + { + "epoch": 0.55, + "learning_rate": 2.5124665858468956e-06, + "logits/chosen": -1.6744495630264282, + "logits/rejected": -1.0496574640274048, + "logps/chosen": -609.0294189453125, + "logps/rejected": -1278.1341552734375, + "loss": 0.0615, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12275202572345734, + "rewards/margins": 0.33510714769363403, + "rewards/rejected": -0.45785918831825256, + "step": 2880 + }, + { + "epoch": 0.55, + "learning_rate": 2.4958444560755268e-06, + "logits/chosen": -1.5687849521636963, + "logits/rejected": -1.0864049196243286, + "logps/chosen": -631.5806884765625, + "logps/rejected": -1329.831298828125, + "loss": 0.0664, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1420772522687912, + "rewards/margins": 0.3117358386516571, + "rewards/rejected": -0.4538130760192871, + "step": 2890 + }, + { + "epoch": 0.55, + "learning_rate": 2.479222510009758e-06, + "logits/chosen": -1.3493144512176514, + "logits/rejected": -0.8287954330444336, + "logps/chosen": -622.3375244140625, + "logps/rejected": -1253.612548828125, + "loss": 0.0811, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15970450639724731, + "rewards/margins": 0.3076263666152954, + "rewards/rejected": -0.4673308730125427, + "step": 2900 + }, + { + "epoch": 0.55, + "learning_rate": 2.4626014824618418e-06, + "logits/chosen": -1.257110834121704, + "logits/rejected": -0.9077134132385254, + "logps/chosen": -573.6275634765625, + "logps/rejected": -1228.2496337890625, + "loss": 0.1086, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13805031776428223, + "rewards/margins": 0.27737957239151, + "rewards/rejected": -0.41542989015579224, + "step": 2910 + }, + { + "epoch": 0.56, + "learning_rate": 2.445982108203422e-06, + "logits/chosen": -1.6566803455352783, + "logits/rejected": -0.9453924298286438, + "logps/chosen": -645.748046875, + "logps/rejected": -1354.3961181640625, + "loss": 0.0505, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.11871250718832016, + "rewards/margins": 0.35229814052581787, + "rewards/rejected": -0.471010684967041, + "step": 2920 + }, + { + "epoch": 0.56, + "learning_rate": 2.4293651219330614e-06, + "logits/chosen": -1.4606636762619019, + "logits/rejected": -0.8227102160453796, + "logps/chosen": -565.6244506835938, + "logps/rejected": -1142.8575439453125, + "loss": 0.0706, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09208562225103378, + "rewards/margins": 0.28996017575263977, + "rewards/rejected": -0.38204577565193176, + "step": 2930 + }, + { + "epoch": 0.56, + "learning_rate": 2.4127512582437486e-06, + "logits/chosen": -1.4561045169830322, + "logits/rejected": -1.000232458114624, + "logps/chosen": -633.3966064453125, + "logps/rejected": -1223.904296875, + "loss": 0.0647, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16247665882110596, + "rewards/margins": 0.2604535222053528, + "rewards/rejected": -0.42293015122413635, + "step": 2940 + }, + { + "epoch": 0.56, + "learning_rate": 2.3961412515904337e-06, + "logits/chosen": -1.4311959743499756, + "logits/rejected": -0.9970407485961914, + "logps/chosen": -565.9105224609375, + "logps/rejected": -1229.419677734375, + "loss": 0.0594, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.09461134672164917, + "rewards/margins": 0.3016055226325989, + "rewards/rejected": -0.39621683955192566, + "step": 2950 + }, + { + "epoch": 0.56, + "learning_rate": 2.3795358362575618e-06, + "logits/chosen": -1.6998859643936157, + "logits/rejected": -1.164505958557129, + "logps/chosen": -563.6510009765625, + "logps/rejected": -1211.5599365234375, + "loss": 0.0694, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.08975648880004883, + "rewards/margins": 0.2907021641731262, + "rewards/rejected": -0.38045868277549744, + "step": 2960 + }, + { + "epoch": 0.57, + "learning_rate": 2.3629357463266e-06, + "logits/chosen": -1.4463509321212769, + "logits/rejected": -1.0091583728790283, + "logps/chosen": -492.5101013183594, + "logps/rejected": -1144.726806640625, + "loss": 0.0828, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09430743008852005, + "rewards/margins": 0.2873263657093048, + "rewards/rejected": -0.38163381814956665, + "step": 2970 + }, + { + "epoch": 0.57, + "learning_rate": 2.346341715643601e-06, + "logits/chosen": -1.5341848134994507, + "logits/rejected": -1.1294262409210205, + "logps/chosen": -613.6388549804688, + "logps/rejected": -1147.2083740234375, + "loss": 0.1093, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1467583328485489, + "rewards/margins": 0.22035908699035645, + "rewards/rejected": -0.3671174645423889, + "step": 2980 + }, + { + "epoch": 0.57, + "learning_rate": 2.32975447778675e-06, + "logits/chosen": -1.6252291202545166, + "logits/rejected": -1.1982320547103882, + "logps/chosen": -710.5306396484375, + "logps/rejected": -1275.906494140625, + "loss": 0.074, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.16013281047344208, + "rewards/margins": 0.2782554030418396, + "rewards/rejected": -0.4383881986141205, + "step": 2990 + }, + { + "epoch": 0.57, + "learning_rate": 2.3131747660339396e-06, + "logits/chosen": -1.457285761833191, + "logits/rejected": -1.0221116542816162, + "logps/chosen": -597.4036254882812, + "logps/rejected": -1166.0738525390625, + "loss": 0.084, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1460018903017044, + "rewards/margins": 0.2565058767795563, + "rewards/rejected": -0.4025077223777771, + "step": 3000 + }, + { + "epoch": 0.57, + "learning_rate": 2.296603313330355e-06, + "logits/chosen": -1.3567924499511719, + "logits/rejected": -1.0843842029571533, + "logps/chosen": -500.361328125, + "logps/rejected": -1220.2147216796875, + "loss": 0.0836, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1001061201095581, + "rewards/margins": 0.3180326521396637, + "rewards/rejected": -0.4181387424468994, + "step": 3010 + }, + { + "epoch": 0.58, + "learning_rate": 2.280040852256068e-06, + "logits/chosen": -1.7149471044540405, + "logits/rejected": -1.1004184484481812, + "logps/chosen": -613.2303466796875, + "logps/rejected": -1190.1524658203125, + "loss": 0.0794, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12324889749288559, + "rewards/margins": 0.29667380452156067, + "rewards/rejected": -0.41992267966270447, + "step": 3020 + }, + { + "epoch": 0.58, + "learning_rate": 2.2634881149936576e-06, + "logits/chosen": -1.6608912944793701, + "logits/rejected": -0.9288992881774902, + "logps/chosen": -696.2246704101562, + "logps/rejected": -1400.7724609375, + "loss": 0.0615, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1452145129442215, + "rewards/margins": 0.3502580523490906, + "rewards/rejected": -0.4954725205898285, + "step": 3030 + }, + { + "epoch": 0.58, + "learning_rate": 2.246945833295836e-06, + "logits/chosen": -1.8040908575057983, + "logits/rejected": -0.9266663789749146, + "logps/chosen": -639.0257568359375, + "logps/rejected": -1127.646728515625, + "loss": 0.1096, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16084113717079163, + "rewards/margins": 0.26114019751548767, + "rewards/rejected": -0.4219813346862793, + "step": 3040 + }, + { + "epoch": 0.58, + "learning_rate": 2.230414738453104e-06, + "logits/chosen": -1.4765208959579468, + "logits/rejected": -0.8586834073066711, + "logps/chosen": -596.9618530273438, + "logps/rejected": -1303.4659423828125, + "loss": 0.0685, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.11683762073516846, + "rewards/margins": 0.3540942668914795, + "rewards/rejected": -0.47093185782432556, + "step": 3050 + }, + { + "epoch": 0.58, + "learning_rate": 2.2138955612614206e-06, + "logits/chosen": -1.2896515130996704, + "logits/rejected": -0.8482069969177246, + "logps/chosen": -584.8584594726562, + "logps/rejected": -1247.270263671875, + "loss": 0.0827, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12709662318229675, + "rewards/margins": 0.2933301329612732, + "rewards/rejected": -0.42042669653892517, + "step": 3060 + }, + { + "epoch": 0.58, + "learning_rate": 2.1973890319898965e-06, + "logits/chosen": -1.9013397693634033, + "logits/rejected": -1.2246954441070557, + "logps/chosen": -570.5123291015625, + "logps/rejected": -1332.769287109375, + "loss": 0.0537, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.11388106644153595, + "rewards/margins": 0.33937424421310425, + "rewards/rejected": -0.4532553553581238, + "step": 3070 + }, + { + "epoch": 0.59, + "learning_rate": 2.1808958803485134e-06, + "logits/chosen": -1.4231480360031128, + "logits/rejected": -1.1156870126724243, + "logps/chosen": -568.2221069335938, + "logps/rejected": -1169.4871826171875, + "loss": 0.0791, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11777313798666, + "rewards/margins": 0.28890877962112427, + "rewards/rejected": -0.40668192505836487, + "step": 3080 + }, + { + "epoch": 0.59, + "learning_rate": 2.1644168354558623e-06, + "logits/chosen": -1.703005075454712, + "logits/rejected": -1.1666195392608643, + "logps/chosen": -596.4285278320312, + "logps/rejected": -1018.8997192382812, + "loss": 0.105, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.13551798462867737, + "rewards/margins": 0.22267785668373108, + "rewards/rejected": -0.35819584131240845, + "step": 3090 + }, + { + "epoch": 0.59, + "learning_rate": 2.1479526258069086e-06, + "logits/chosen": -1.3815600872039795, + "logits/rejected": -0.6034301519393921, + "logps/chosen": -637.18603515625, + "logps/rejected": -1317.694580078125, + "loss": 0.0633, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.13466182351112366, + "rewards/margins": 0.3213178813457489, + "rewards/rejected": -0.45597973465919495, + "step": 3100 + }, + { + "epoch": 0.59, + "learning_rate": 2.1315039792407975e-06, + "logits/chosen": -1.5379188060760498, + "logits/rejected": -1.128328561782837, + "logps/chosen": -510.88555908203125, + "logps/rejected": -1054.700927734375, + "loss": 0.0981, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11464317888021469, + "rewards/margins": 0.24528208374977112, + "rewards/rejected": -0.3599252998828888, + "step": 3110 + }, + { + "epoch": 0.59, + "learning_rate": 2.115071622908666e-06, + "logits/chosen": -1.3771955966949463, + "logits/rejected": -1.0849876403808594, + "logps/chosen": -440.3738708496094, + "logps/rejected": -1029.5709228515625, + "loss": 0.102, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1041545420885086, + "rewards/margins": 0.2349095344543457, + "rewards/rejected": -0.3390640616416931, + "step": 3120 + }, + { + "epoch": 0.6, + "learning_rate": 2.0986562832415063e-06, + "logits/chosen": -1.4015318155288696, + "logits/rejected": -1.1533081531524658, + "logps/chosen": -526.330322265625, + "logps/rejected": -1111.6566162109375, + "loss": 0.0952, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14694051444530487, + "rewards/margins": 0.2503613829612732, + "rewards/rejected": -0.3973018527030945, + "step": 3130 + }, + { + "epoch": 0.6, + "learning_rate": 2.082258685918047e-06, + "logits/chosen": -1.5659064054489136, + "logits/rejected": -1.0318849086761475, + "logps/chosen": -659.6588134765625, + "logps/rejected": -1159.3125, + "loss": 0.0962, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1373148411512375, + "rewards/margins": 0.2489592283964157, + "rewards/rejected": -0.386274129152298, + "step": 3140 + }, + { + "epoch": 0.6, + "learning_rate": 2.0658795558326745e-06, + "logits/chosen": -1.5127919912338257, + "logits/rejected": -0.9335476160049438, + "logps/chosen": -520.0562744140625, + "logps/rejected": -1174.3912353515625, + "loss": 0.0813, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10218697786331177, + "rewards/margins": 0.2773703634738922, + "rewards/rejected": -0.37955737113952637, + "step": 3150 + }, + { + "epoch": 0.6, + "learning_rate": 2.049519617063389e-06, + "logits/chosen": -1.5129424333572388, + "logits/rejected": -1.0811700820922852, + "logps/chosen": -586.3703002929688, + "logps/rejected": -1365.397705078125, + "loss": 0.0335, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.10181821882724762, + "rewards/margins": 0.34911856055259705, + "rewards/rejected": -0.45093679428100586, + "step": 3160 + }, + { + "epoch": 0.6, + "learning_rate": 2.033179592839792e-06, + "logits/chosen": -1.4758931398391724, + "logits/rejected": -0.8931997418403625, + "logps/chosen": -583.43310546875, + "logps/rejected": -1255.8292236328125, + "loss": 0.0603, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.11339731514453888, + "rewards/margins": 0.32942765951156616, + "rewards/rejected": -0.44282498955726624, + "step": 3170 + }, + { + "epoch": 0.61, + "learning_rate": 2.0168602055111175e-06, + "logits/chosen": -1.6357719898223877, + "logits/rejected": -1.1608362197875977, + "logps/chosen": -438.1368713378906, + "logps/rejected": -1034.555419921875, + "loss": 0.0944, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08265723288059235, + "rewards/margins": 0.26866570115089417, + "rewards/rejected": -0.3513229489326477, + "step": 3180 + }, + { + "epoch": 0.61, + "learning_rate": 2.0005621765142942e-06, + "logits/chosen": -1.407368779182434, + "logits/rejected": -0.7318683862686157, + "logps/chosen": -598.01171875, + "logps/rejected": -1273.376708984375, + "loss": 0.0509, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.14980801939964294, + "rewards/margins": 0.3117080628871918, + "rewards/rejected": -0.46151605248451233, + "step": 3190 + }, + { + "epoch": 0.61, + "learning_rate": 1.9842862263420565e-06, + "logits/chosen": -1.8645998239517212, + "logits/rejected": -0.8983448147773743, + "logps/chosen": -590.1622924804688, + "logps/rejected": -1256.179443359375, + "loss": 0.0676, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13834363222122192, + "rewards/margins": 0.3233782947063446, + "rewards/rejected": -0.4617219567298889, + "step": 3200 + }, + { + "epoch": 0.61, + "learning_rate": 1.9680330745110954e-06, + "logits/chosen": -1.6306655406951904, + "logits/rejected": -0.9761863946914673, + "logps/chosen": -572.4918212890625, + "logps/rejected": -1248.41796875, + "loss": 0.06, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1350272297859192, + "rewards/margins": 0.3037089407444, + "rewards/rejected": -0.43873611092567444, + "step": 3210 + }, + { + "epoch": 0.61, + "learning_rate": 1.9518034395302413e-06, + "logits/chosen": -1.4094593524932861, + "logits/rejected": -1.0162584781646729, + "logps/chosen": -634.5624389648438, + "logps/rejected": -1191.04638671875, + "loss": 0.0957, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16423244774341583, + "rewards/margins": 0.2612122893333435, + "rewards/rejected": -0.42544469237327576, + "step": 3220 + }, + { + "epoch": 0.62, + "learning_rate": 1.9355980388687145e-06, + "logits/chosen": -1.4995319843292236, + "logits/rejected": -1.1388894319534302, + "logps/chosen": -529.622314453125, + "logps/rejected": -1198.6038818359375, + "loss": 0.0595, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1255326271057129, + "rewards/margins": 0.30638033151626587, + "rewards/rejected": -0.43191295862197876, + "step": 3230 + }, + { + "epoch": 0.62, + "learning_rate": 1.9194175889243942e-06, + "logits/chosen": -1.6592363119125366, + "logits/rejected": -0.9633617401123047, + "logps/chosen": -595.4632568359375, + "logps/rejected": -1168.793701171875, + "loss": 0.0916, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.152585968375206, + "rewards/margins": 0.2846713662147522, + "rewards/rejected": -0.4372572898864746, + "step": 3240 + }, + { + "epoch": 0.62, + "learning_rate": 1.903262804992156e-06, + "logits/chosen": -1.6367032527923584, + "logits/rejected": -1.0083138942718506, + "logps/chosen": -616.48681640625, + "logps/rejected": -1353.048095703125, + "loss": 0.0572, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13493946194648743, + "rewards/margins": 0.34219256043434143, + "rewards/rejected": -0.47713202238082886, + "step": 3250 + }, + { + "epoch": 0.62, + "learning_rate": 1.8871344012322504e-06, + "logits/chosen": -1.1986305713653564, + "logits/rejected": -0.8573341369628906, + "logps/chosen": -563.2327270507812, + "logps/rejected": -1138.6436767578125, + "loss": 0.0999, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1444614678621292, + "rewards/margins": 0.2410765141248703, + "rewards/rejected": -0.3855380117893219, + "step": 3260 + }, + { + "epoch": 0.62, + "learning_rate": 1.8710330906387288e-06, + "logits/chosen": -1.242150068283081, + "logits/rejected": -0.8809477686882019, + "logps/chosen": -565.7841186523438, + "logps/rejected": -1144.38818359375, + "loss": 0.0931, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13574641942977905, + "rewards/margins": 0.26915091276168823, + "rewards/rejected": -0.4048973023891449, + "step": 3270 + }, + { + "epoch": 0.62, + "learning_rate": 1.8549595850079272e-06, + "logits/chosen": -1.6145598888397217, + "logits/rejected": -1.0401558876037598, + "logps/chosen": -727.5928955078125, + "logps/rejected": -1343.35302734375, + "loss": 0.0752, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17629578709602356, + "rewards/margins": 0.29485493898391724, + "rewards/rejected": -0.4711507260799408, + "step": 3280 + }, + { + "epoch": 0.63, + "learning_rate": 1.8389145949069953e-06, + "logits/chosen": -1.747926950454712, + "logits/rejected": -0.7480489015579224, + "logps/chosen": -552.2578125, + "logps/rejected": -1244.274169921875, + "loss": 0.0511, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11557585000991821, + "rewards/margins": 0.3372827470302582, + "rewards/rejected": -0.4528585970401764, + "step": 3290 + }, + { + "epoch": 0.63, + "learning_rate": 1.8228988296424877e-06, + "logits/chosen": -1.5611345767974854, + "logits/rejected": -1.1634414196014404, + "logps/chosen": -483.97607421875, + "logps/rejected": -1324.218505859375, + "loss": 0.05, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12933804094791412, + "rewards/margins": 0.3463062345981598, + "rewards/rejected": -0.4756442606449127, + "step": 3300 + }, + { + "epoch": 0.63, + "learning_rate": 1.806912997229008e-06, + "logits/chosen": -1.5756072998046875, + "logits/rejected": -1.0822118520736694, + "logps/chosen": -690.589599609375, + "logps/rejected": -1392.318115234375, + "loss": 0.0727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19598130881786346, + "rewards/margins": 0.3206397294998169, + "rewards/rejected": -0.5166210532188416, + "step": 3310 + }, + { + "epoch": 0.63, + "learning_rate": 1.7909578043579037e-06, + "logits/chosen": -1.2178738117218018, + "logits/rejected": -0.8397973775863647, + "logps/chosen": -597.6338500976562, + "logps/rejected": -1330.0804443359375, + "loss": 0.0676, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16162751615047455, + "rewards/margins": 0.312694787979126, + "rewards/rejected": -0.47432225942611694, + "step": 3320 + }, + { + "epoch": 0.63, + "learning_rate": 1.7750339563660346e-06, + "logits/chosen": -1.4151805639266968, + "logits/rejected": -0.889962375164032, + "logps/chosen": -618.2703857421875, + "logps/rejected": -1273.2015380859375, + "loss": 0.0841, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1959114372730255, + "rewards/margins": 0.2935883402824402, + "rewards/rejected": -0.4894997477531433, + "step": 3330 + }, + { + "epoch": 0.64, + "learning_rate": 1.759142157204583e-06, + "logits/chosen": -1.6737163066864014, + "logits/rejected": -0.7453832030296326, + "logps/chosen": -753.6701049804688, + "logps/rejected": -1307.577880859375, + "loss": 0.068, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2204083949327469, + "rewards/margins": 0.2919270992279053, + "rewards/rejected": -0.5123355388641357, + "step": 3340 + }, + { + "epoch": 0.64, + "learning_rate": 1.7432831094079357e-06, + "logits/chosen": -1.4744651317596436, + "logits/rejected": -0.9374326467514038, + "logps/chosen": -700.9337768554688, + "logps/rejected": -1292.358642578125, + "loss": 0.0965, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.21388837695121765, + "rewards/margins": 0.28661873936653137, + "rewards/rejected": -0.5005070567131042, + "step": 3350 + }, + { + "epoch": 0.64, + "learning_rate": 1.7274575140626318e-06, + "logits/chosen": -1.5619527101516724, + "logits/rejected": -0.9883157014846802, + "logps/chosen": -719.3538208007812, + "logps/rejected": -1369.1768798828125, + "loss": 0.0752, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17765849828720093, + "rewards/margins": 0.314145565032959, + "rewards/rejected": -0.4918040633201599, + "step": 3360 + }, + { + "epoch": 0.64, + "learning_rate": 1.7116660707763637e-06, + "logits/chosen": -1.7237812280654907, + "logits/rejected": -1.041046380996704, + "logps/chosen": -772.4146728515625, + "logps/rejected": -1400.0970458984375, + "loss": 0.0663, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17615190148353577, + "rewards/margins": 0.32200318574905396, + "rewards/rejected": -0.49815505743026733, + "step": 3370 + }, + { + "epoch": 0.64, + "learning_rate": 1.695909477647054e-06, + "logits/chosen": -1.4328358173370361, + "logits/rejected": -0.9994909167289734, + "logps/chosen": -480.7454528808594, + "logps/rejected": -1162.206298828125, + "loss": 0.0577, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.07886415719985962, + "rewards/margins": 0.31568989157676697, + "rewards/rejected": -0.394554078578949, + "step": 3380 + }, + { + "epoch": 0.65, + "learning_rate": 1.6801884312319893e-06, + "logits/chosen": -1.4784748554229736, + "logits/rejected": -1.1017252206802368, + "logps/chosen": -540.9151611328125, + "logps/rejected": -1170.0943603515625, + "loss": 0.0879, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11197110265493393, + "rewards/margins": 0.2765769362449646, + "rewards/rejected": -0.3885480463504791, + "step": 3390 + }, + { + "epoch": 0.65, + "learning_rate": 1.6645036265170314e-06, + "logits/chosen": -1.538333773612976, + "logits/rejected": -0.7612485289573669, + "logps/chosen": -702.917236328125, + "logps/rejected": -1379.6119384765625, + "loss": 0.0517, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15875710546970367, + "rewards/margins": 0.3219629228115082, + "rewards/rejected": -0.48071998357772827, + "step": 3400 + }, + { + "epoch": 0.65, + "learning_rate": 1.648855756885893e-06, + "logits/chosen": -1.2829601764678955, + "logits/rejected": -0.7338923215866089, + "logps/chosen": -631.0278930664062, + "logps/rejected": -1219.173095703125, + "loss": 0.0764, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17343007028102875, + "rewards/margins": 0.2668561637401581, + "rewards/rejected": -0.440286248922348, + "step": 3410 + }, + { + "epoch": 0.65, + "learning_rate": 1.633245514089482e-06, + "logits/chosen": -1.5770516395568848, + "logits/rejected": -1.0756621360778809, + "logps/chosen": -651.3983764648438, + "logps/rejected": -1466.8804931640625, + "loss": 0.0435, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.14593417942523956, + "rewards/margins": 0.37087613344192505, + "rewards/rejected": -0.5168102979660034, + "step": 3420 + }, + { + "epoch": 0.65, + "learning_rate": 1.6176735882153284e-06, + "logits/chosen": -1.3698972463607788, + "logits/rejected": -0.9525673985481262, + "logps/chosen": -620.1276245117188, + "logps/rejected": -1276.9837646484375, + "loss": 0.0513, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1597699522972107, + "rewards/margins": 0.31164002418518066, + "rewards/rejected": -0.47140997648239136, + "step": 3430 + }, + { + "epoch": 0.66, + "learning_rate": 1.6021406676570667e-06, + "logits/chosen": -1.5987428426742554, + "logits/rejected": -1.0756624937057495, + "logps/chosen": -580.5691528320312, + "logps/rejected": -1147.496826171875, + "loss": 0.0691, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16023506224155426, + "rewards/margins": 0.2685411274433136, + "rewards/rejected": -0.42877617478370667, + "step": 3440 + }, + { + "epoch": 0.66, + "learning_rate": 1.5866474390840126e-06, + "logits/chosen": -1.8586333990097046, + "logits/rejected": -1.290020227432251, + "logps/chosen": -646.0453491210938, + "logps/rejected": -1250.796630859375, + "loss": 0.0797, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1845080554485321, + "rewards/margins": 0.30654436349868774, + "rewards/rejected": -0.49105244874954224, + "step": 3450 + }, + { + "epoch": 0.66, + "learning_rate": 1.5711945874108053e-06, + "logits/chosen": -1.4048064947128296, + "logits/rejected": -0.7289602160453796, + "logps/chosen": -714.6954345703125, + "logps/rejected": -1281.2022705078125, + "loss": 0.0725, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20739439129829407, + "rewards/margins": 0.28787916898727417, + "rewards/rejected": -0.49527353048324585, + "step": 3460 + }, + { + "epoch": 0.66, + "learning_rate": 1.5557827957671249e-06, + "logits/chosen": -1.6873209476470947, + "logits/rejected": -0.9829764366149902, + "logps/chosen": -678.6327514648438, + "logps/rejected": -1256.5675048828125, + "loss": 0.0803, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15068724751472473, + "rewards/margins": 0.30040639638900757, + "rewards/rejected": -0.4510936737060547, + "step": 3470 + }, + { + "epoch": 0.66, + "learning_rate": 1.5404127454674994e-06, + "logits/chosen": -1.405564308166504, + "logits/rejected": -0.9794474840164185, + "logps/chosen": -574.632568359375, + "logps/rejected": -1390.491455078125, + "loss": 0.0701, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.14562784135341644, + "rewards/margins": 0.3308250308036804, + "rewards/rejected": -0.47645288705825806, + "step": 3480 + }, + { + "epoch": 0.66, + "learning_rate": 1.5250851159811809e-06, + "logits/chosen": -1.5441749095916748, + "logits/rejected": -1.0190322399139404, + "logps/chosen": -519.0746459960938, + "logps/rejected": -1312.9346923828125, + "loss": 0.0534, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13192638754844666, + "rewards/margins": 0.35666215419769287, + "rewards/rejected": -0.48858851194381714, + "step": 3490 + }, + { + "epoch": 0.67, + "learning_rate": 1.509800584902108e-06, + "logits/chosen": -1.6326671838760376, + "logits/rejected": -1.2049169540405273, + "logps/chosen": -533.8888549804688, + "logps/rejected": -1213.007080078125, + "loss": 0.0797, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.126430943608284, + "rewards/margins": 0.29648357629776, + "rewards/rejected": -0.42291444540023804, + "step": 3500 + }, + { + "epoch": 0.67, + "learning_rate": 1.4945598279189565e-06, + "logits/chosen": -1.4451848268508911, + "logits/rejected": -0.8163963556289673, + "logps/chosen": -687.5403442382812, + "logps/rejected": -1368.297119140625, + "loss": 0.0548, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.20320598781108856, + "rewards/margins": 0.3315770626068115, + "rewards/rejected": -0.534783124923706, + "step": 3510 + }, + { + "epoch": 0.67, + "learning_rate": 1.4793635187852622e-06, + "logits/chosen": -1.4984921216964722, + "logits/rejected": -0.7433010339736938, + "logps/chosen": -727.9129638671875, + "logps/rejected": -1375.246337890625, + "loss": 0.06, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2313799411058426, + "rewards/margins": 0.3215225338935852, + "rewards/rejected": -0.5529025197029114, + "step": 3520 + }, + { + "epoch": 0.67, + "learning_rate": 1.4642123292896406e-06, + "logits/chosen": -1.4329923391342163, + "logits/rejected": -0.9319679141044617, + "logps/chosen": -674.9684448242188, + "logps/rejected": -1338.0560302734375, + "loss": 0.0653, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.21940989792346954, + "rewards/margins": 0.30023401975631714, + "rewards/rejected": -0.5196439027786255, + "step": 3530 + }, + { + "epoch": 0.67, + "learning_rate": 1.4491069292260867e-06, + "logits/chosen": -1.4247496128082275, + "logits/rejected": -1.11383855342865, + "logps/chosen": -593.4032592773438, + "logps/rejected": -1343.6129150390625, + "loss": 0.0415, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19223792850971222, + "rewards/margins": 0.3195153474807739, + "rewards/rejected": -0.5117533206939697, + "step": 3540 + }, + { + "epoch": 0.68, + "learning_rate": 1.4340479863643658e-06, + "logits/chosen": -1.6909887790679932, + "logits/rejected": -1.1834156513214111, + "logps/chosen": -716.8977661132812, + "logps/rejected": -1389.580322265625, + "loss": 0.0618, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1833467036485672, + "rewards/margins": 0.31689900159835815, + "rewards/rejected": -0.5002456903457642, + "step": 3550 + }, + { + "epoch": 0.68, + "learning_rate": 1.4190361664204936e-06, + "logits/chosen": -1.6068832874298096, + "logits/rejected": -0.9983807802200317, + "logps/chosen": -531.9963989257812, + "logps/rejected": -1197.336669921875, + "loss": 0.069, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13076290488243103, + "rewards/margins": 0.31974467635154724, + "rewards/rejected": -0.45050764083862305, + "step": 3560 + }, + { + "epoch": 0.68, + "learning_rate": 1.4040721330273063e-06, + "logits/chosen": -1.47737717628479, + "logits/rejected": -0.8850958943367004, + "logps/chosen": -646.0311889648438, + "logps/rejected": -1259.364501953125, + "loss": 0.0716, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1776318997144699, + "rewards/margins": 0.2986915707588196, + "rewards/rejected": -0.4763234555721283, + "step": 3570 + }, + { + "epoch": 0.68, + "learning_rate": 1.3891565477051242e-06, + "logits/chosen": -1.8412586450576782, + "logits/rejected": -1.2059457302093506, + "logps/chosen": -585.1314697265625, + "logps/rejected": -1392.73046875, + "loss": 0.0587, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1477021872997284, + "rewards/margins": 0.34714001417160034, + "rewards/rejected": -0.49484220147132874, + "step": 3580 + }, + { + "epoch": 0.68, + "learning_rate": 1.3742900698325034e-06, + "logits/chosen": -1.4215539693832397, + "logits/rejected": -0.7746947407722473, + "logps/chosen": -621.46044921875, + "logps/rejected": -1410.5491943359375, + "loss": 0.0402, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18173085153102875, + "rewards/margins": 0.3614484369754791, + "rewards/rejected": -0.5431792736053467, + "step": 3590 + }, + { + "epoch": 0.69, + "learning_rate": 1.3594733566170925e-06, + "logits/chosen": -1.562574028968811, + "logits/rejected": -0.843326210975647, + "logps/chosen": -620.8270263671875, + "logps/rejected": -1377.2640380859375, + "loss": 0.059, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.16770535707473755, + "rewards/margins": 0.3291049897670746, + "rewards/rejected": -0.4968103766441345, + "step": 3600 + }, + { + "epoch": 0.69, + "learning_rate": 1.3447070630665771e-06, + "logits/chosen": -1.5109186172485352, + "logits/rejected": -1.1313542127609253, + "logps/chosen": -586.7550659179688, + "logps/rejected": -1346.669189453125, + "loss": 0.0605, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1870235800743103, + "rewards/margins": 0.3292957544326782, + "rewards/rejected": -0.5163193345069885, + "step": 3610 + }, + { + "epoch": 0.69, + "learning_rate": 1.329991841959717e-06, + "logits/chosen": -1.3923102617263794, + "logits/rejected": -0.8757355809211731, + "logps/chosen": -666.1890258789062, + "logps/rejected": -1339.994873046875, + "loss": 0.0668, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1841500997543335, + "rewards/margins": 0.3030817210674286, + "rewards/rejected": -0.4872317910194397, + "step": 3620 + }, + { + "epoch": 0.69, + "learning_rate": 1.3153283438175036e-06, + "logits/chosen": -1.271019697189331, + "logits/rejected": -0.8956009745597839, + "logps/chosen": -613.0701904296875, + "logps/rejected": -1512.3135986328125, + "loss": 0.0526, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1648632436990738, + "rewards/margins": 0.3938170373439789, + "rewards/rejected": -0.5586802959442139, + "step": 3630 + }, + { + "epoch": 0.69, + "learning_rate": 1.3007172168743854e-06, + "logits/chosen": -1.6713581085205078, + "logits/rejected": -1.1510488986968994, + "logps/chosen": -534.3785400390625, + "logps/rejected": -1104.552978515625, + "loss": 0.079, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12797674536705017, + "rewards/margins": 0.2560235857963562, + "rewards/rejected": -0.38400033116340637, + "step": 3640 + }, + { + "epoch": 0.7, + "learning_rate": 1.2861591070496193e-06, + "logits/chosen": -1.5630991458892822, + "logits/rejected": -0.9822925329208374, + "logps/chosen": -699.5548706054688, + "logps/rejected": -1303.892333984375, + "loss": 0.0574, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.17523935437202454, + "rewards/margins": 0.3121577501296997, + "rewards/rejected": -0.48739710450172424, + "step": 3650 + }, + { + "epoch": 0.7, + "learning_rate": 1.271654657918722e-06, + "logits/chosen": -1.5802091360092163, + "logits/rejected": -0.9614111185073853, + "logps/chosen": -617.6490478515625, + "logps/rejected": -1294.764404296875, + "loss": 0.0456, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1591751128435135, + "rewards/margins": 0.3368551731109619, + "rewards/rejected": -0.496030330657959, + "step": 3660 + }, + { + "epoch": 0.7, + "learning_rate": 1.2572045106850051e-06, + "logits/chosen": -1.7605321407318115, + "logits/rejected": -0.6013051867485046, + "logps/chosen": -715.6431884765625, + "logps/rejected": -1291.907958984375, + "loss": 0.0522, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.19700202345848083, + "rewards/margins": 0.3319992125034332, + "rewards/rejected": -0.5290011167526245, + "step": 3670 + }, + { + "epoch": 0.7, + "learning_rate": 1.2428093041512418e-06, + "logits/chosen": -1.5259217023849487, + "logits/rejected": -1.1222094297409058, + "logps/chosen": -633.7317504882812, + "logps/rejected": -1275.761474609375, + "loss": 0.0866, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1652495414018631, + "rewards/margins": 0.31429868936538696, + "rewards/rejected": -0.47954821586608887, + "step": 3680 + }, + { + "epoch": 0.7, + "learning_rate": 1.2284696746914216e-06, + "logits/chosen": -1.5855969190597534, + "logits/rejected": -1.0040165185928345, + "logps/chosen": -776.5299072265625, + "logps/rejected": -1395.8131103515625, + "loss": 0.0519, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.20241470634937286, + "rewards/margins": 0.31013619899749756, + "rewards/rejected": -0.512550950050354, + "step": 3690 + }, + { + "epoch": 0.7, + "learning_rate": 1.2141862562226164e-06, + "logits/chosen": -1.536430835723877, + "logits/rejected": -0.9561670422554016, + "logps/chosen": -641.7059936523438, + "logps/rejected": -1212.5941162109375, + "loss": 0.1015, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15809763967990875, + "rewards/margins": 0.27246394753456116, + "rewards/rejected": -0.4305616021156311, + "step": 3700 + }, + { + "epoch": 0.71, + "learning_rate": 1.1999596801769617e-06, + "logits/chosen": -1.5653895139694214, + "logits/rejected": -1.0047844648361206, + "logps/chosen": -617.3829345703125, + "logps/rejected": -1279.5341796875, + "loss": 0.0666, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1557629555463791, + "rewards/margins": 0.29767635464668274, + "rewards/rejected": -0.45343929529190063, + "step": 3710 + }, + { + "epoch": 0.71, + "learning_rate": 1.185790575473738e-06, + "logits/chosen": -1.5880681276321411, + "logits/rejected": -1.3316981792449951, + "logps/chosen": -556.1331176757812, + "logps/rejected": -1228.2022705078125, + "loss": 0.075, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12465610355138779, + "rewards/margins": 0.29106903076171875, + "rewards/rejected": -0.41572514176368713, + "step": 3720 + }, + { + "epoch": 0.71, + "learning_rate": 1.1716795684915728e-06, + "logits/chosen": -1.4888153076171875, + "logits/rejected": -1.1844497919082642, + "logps/chosen": -580.6350708007812, + "logps/rejected": -1149.899169921875, + "loss": 0.0691, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1421315222978592, + "rewards/margins": 0.22847023606300354, + "rewards/rejected": -0.37060171365737915, + "step": 3730 + }, + { + "epoch": 0.71, + "learning_rate": 1.1576272830407418e-06, + "logits/chosen": -1.7607017755508423, + "logits/rejected": -0.8743270039558411, + "logps/chosen": -670.7908325195312, + "logps/rejected": -1380.0850830078125, + "loss": 0.0511, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.14333508908748627, + "rewards/margins": 0.34049367904663086, + "rewards/rejected": -0.4838287830352783, + "step": 3740 + }, + { + "epoch": 0.71, + "learning_rate": 1.1436343403356019e-06, + "logits/chosen": -1.5949684381484985, + "logits/rejected": -1.1345860958099365, + "logps/chosen": -587.1764526367188, + "logps/rejected": -1332.0303955078125, + "loss": 0.0514, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1502644568681717, + "rewards/margins": 0.338855117559433, + "rewards/rejected": -0.48911961913108826, + "step": 3750 + }, + { + "epoch": 0.72, + "learning_rate": 1.129701358967123e-06, + "logits/chosen": -1.551690697669983, + "logits/rejected": -0.9563910365104675, + "logps/chosen": -690.2678833007812, + "logps/rejected": -1210.73974609375, + "loss": 0.0844, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20310266315937042, + "rewards/margins": 0.24316063523292542, + "rewards/rejected": -0.44626325368881226, + "step": 3760 + }, + { + "epoch": 0.72, + "learning_rate": 1.11582895487554e-06, + "logits/chosen": -1.4407981634140015, + "logits/rejected": -0.8519558906555176, + "logps/chosen": -559.5155639648438, + "logps/rejected": -1227.58935546875, + "loss": 0.0618, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13855567574501038, + "rewards/margins": 0.3181162178516388, + "rewards/rejected": -0.45667189359664917, + "step": 3770 + }, + { + "epoch": 0.72, + "learning_rate": 1.1020177413231334e-06, + "logits/chosen": -1.3093456029891968, + "logits/rejected": -0.7478567361831665, + "logps/chosen": -676.5855102539062, + "logps/rejected": -1445.390869140625, + "loss": 0.0529, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18879786133766174, + "rewards/margins": 0.3267724812030792, + "rewards/rejected": -0.515570342540741, + "step": 3780 + }, + { + "epoch": 0.72, + "learning_rate": 1.0882683288671041e-06, + "logits/chosen": -1.4136669635772705, + "logits/rejected": -1.191960334777832, + "logps/chosen": -555.474609375, + "logps/rejected": -1154.699951171875, + "loss": 0.0892, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11162900924682617, + "rewards/margins": 0.25221413373947144, + "rewards/rejected": -0.3638431131839752, + "step": 3790 + }, + { + "epoch": 0.72, + "learning_rate": 1.0745813253325957e-06, + "logits/chosen": -1.4562398195266724, + "logits/rejected": -1.0913138389587402, + "logps/chosen": -537.0150146484375, + "logps/rejected": -1177.9039306640625, + "loss": 0.0686, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1197771430015564, + "rewards/margins": 0.2807857394218445, + "rewards/rejected": -0.4005628526210785, + "step": 3800 + }, + { + "epoch": 0.73, + "learning_rate": 1.0609573357858166e-06, + "logits/chosen": -1.4760421514511108, + "logits/rejected": -0.8424805402755737, + "logps/chosen": -640.1893920898438, + "logps/rejected": -1284.22998046875, + "loss": 0.054, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.15238045156002045, + "rewards/margins": 0.2920035719871521, + "rewards/rejected": -0.44438403844833374, + "step": 3810 + }, + { + "epoch": 0.73, + "learning_rate": 1.0473969625072922e-06, + "logits/chosen": -1.4656455516815186, + "logits/rejected": -0.8885079622268677, + "logps/chosen": -703.3538208007812, + "logps/rejected": -1381.887939453125, + "loss": 0.0727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17700466513633728, + "rewards/margins": 0.30304640531539917, + "rewards/rejected": -0.48005104064941406, + "step": 3820 + }, + { + "epoch": 0.73, + "learning_rate": 1.0339008049652427e-06, + "logits/chosen": -1.68500554561615, + "logits/rejected": -1.075042963027954, + "logps/chosen": -619.4908447265625, + "logps/rejected": -1248.362060546875, + "loss": 0.0699, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.14864543080329895, + "rewards/margins": 0.2982180118560791, + "rewards/rejected": -0.44686347246170044, + "step": 3830 + }, + { + "epoch": 0.73, + "learning_rate": 1.0204694597890814e-06, + "logits/chosen": -1.7366454601287842, + "logits/rejected": -1.0493816137313843, + "logps/chosen": -547.7674560546875, + "logps/rejected": -1223.79833984375, + "loss": 0.049, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09998428821563721, + "rewards/margins": 0.3351575434207916, + "rewards/rejected": -0.4351418614387512, + "step": 3840 + }, + { + "epoch": 0.73, + "learning_rate": 1.0071035207430352e-06, + "logits/chosen": -1.3894609212875366, + "logits/rejected": -1.1518490314483643, + "logps/chosen": -581.4041748046875, + "logps/rejected": -1280.5262451171875, + "loss": 0.0502, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13739798963069916, + "rewards/margins": 0.3110824525356293, + "rewards/rejected": -0.44848042726516724, + "step": 3850 + }, + { + "epoch": 0.74, + "learning_rate": 9.938035786999018e-07, + "logits/chosen": -1.5209680795669556, + "logits/rejected": -0.8864911794662476, + "logps/chosen": -659.5616455078125, + "logps/rejected": -1236.633056640625, + "loss": 0.0933, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17649760842323303, + "rewards/margins": 0.27773815393447876, + "rewards/rejected": -0.4542357921600342, + "step": 3860 + }, + { + "epoch": 0.74, + "learning_rate": 9.805702216149252e-07, + "logits/chosen": -1.458902359008789, + "logits/rejected": -1.0474576950073242, + "logps/chosen": -530.28515625, + "logps/rejected": -1270.1309814453125, + "loss": 0.0446, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1434863954782486, + "rewards/margins": 0.303547203540802, + "rewards/rejected": -0.4470335841178894, + "step": 3870 + }, + { + "epoch": 0.74, + "learning_rate": 9.674040344998056e-07, + "logits/chosen": -1.5949146747589111, + "logits/rejected": -0.9253498911857605, + "logps/chosen": -625.6763916015625, + "logps/rejected": -1221.9095458984375, + "loss": 0.0658, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1248815506696701, + "rewards/margins": 0.302616685628891, + "rewards/rejected": -0.4274982511997223, + "step": 3880 + }, + { + "epoch": 0.74, + "learning_rate": 9.543055993968339e-07, + "logits/chosen": -1.3639336824417114, + "logits/rejected": -0.909598708152771, + "logps/chosen": -634.0882568359375, + "logps/rejected": -1329.5985107421875, + "loss": 0.0642, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.15912790596485138, + "rewards/margins": 0.32093101739883423, + "rewards/rejected": -0.4800589680671692, + "step": 3890 + }, + { + "epoch": 0.74, + "learning_rate": 9.412754953531664e-07, + "logits/chosen": -1.477228045463562, + "logits/rejected": -0.9538853764533997, + "logps/chosen": -521.6915893554688, + "logps/rejected": -1194.7880859375, + "loss": 0.0739, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1504165530204773, + "rewards/margins": 0.2940949499607086, + "rewards/rejected": -0.4445114731788635, + "step": 3900 + }, + { + "epoch": 0.74, + "learning_rate": 9.283142983952231e-07, + "logits/chosen": -1.6704351902008057, + "logits/rejected": -0.9384806752204895, + "logps/chosen": -702.3519287109375, + "logps/rejected": -1316.037353515625, + "loss": 0.064, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.16525360941886902, + "rewards/margins": 0.308427631855011, + "rewards/rejected": -0.4736812710762024, + "step": 3910 + }, + { + "epoch": 0.75, + "learning_rate": 9.154225815032242e-07, + "logits/chosen": -1.5252714157104492, + "logits/rejected": -1.1429945230484009, + "logps/chosen": -606.1965942382812, + "logps/rejected": -1182.3236083984375, + "loss": 0.1002, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1689486801624298, + "rewards/margins": 0.2557165026664734, + "rewards/rejected": -0.4246651530265808, + "step": 3920 + }, + { + "epoch": 0.75, + "learning_rate": 9.026009145858608e-07, + "logits/chosen": -1.4882303476333618, + "logits/rejected": -1.0083402395248413, + "logps/chosen": -578.5821533203125, + "logps/rejected": -1250.6842041015625, + "loss": 0.0846, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14970554411411285, + "rewards/margins": 0.30618828535079956, + "rewards/rejected": -0.4558938443660736, + "step": 3930 + }, + { + "epoch": 0.75, + "learning_rate": 8.898498644550973e-07, + "logits/chosen": -1.6834971904754639, + "logits/rejected": -1.1314411163330078, + "logps/chosen": -688.9229736328125, + "logps/rejected": -1269.325927734375, + "loss": 0.056, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.16720907390117645, + "rewards/margins": 0.29985159635543823, + "rewards/rejected": -0.4670606553554535, + "step": 3940 + }, + { + "epoch": 0.75, + "learning_rate": 8.771699948011203e-07, + "logits/chosen": -1.572192668914795, + "logits/rejected": -0.983639121055603, + "logps/chosen": -495.31317138671875, + "logps/rejected": -1169.4541015625, + "loss": 0.0708, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11250798404216766, + "rewards/margins": 0.3055083155632019, + "rewards/rejected": -0.4180162847042084, + "step": 3950 + }, + { + "epoch": 0.75, + "learning_rate": 8.645618661674144e-07, + "logits/chosen": -1.3754651546478271, + "logits/rejected": -1.0094373226165771, + "logps/chosen": -727.6187744140625, + "logps/rejected": -1270.610107421875, + "loss": 0.0845, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18097665905952454, + "rewards/margins": 0.2794339954853058, + "rewards/rejected": -0.4604106545448303, + "step": 3960 + }, + { + "epoch": 0.76, + "learning_rate": 8.520260359259822e-07, + "logits/chosen": -1.5351568460464478, + "logits/rejected": -1.067787528038025, + "logps/chosen": -595.1707763671875, + "logps/rejected": -1178.1083984375, + "loss": 0.0752, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15523064136505127, + "rewards/margins": 0.2633352279663086, + "rewards/rejected": -0.4185658395290375, + "step": 3970 + }, + { + "epoch": 0.76, + "learning_rate": 8.395630582527075e-07, + "logits/chosen": -1.5825989246368408, + "logits/rejected": -1.1352492570877075, + "logps/chosen": -509.427978515625, + "logps/rejected": -1257.157958984375, + "loss": 0.0771, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13573279976844788, + "rewards/margins": 0.3318541347980499, + "rewards/rejected": -0.4675869345664978, + "step": 3980 + }, + { + "epoch": 0.76, + "learning_rate": 8.271734841028553e-07, + "logits/chosen": -1.5050828456878662, + "logits/rejected": -0.9955355525016785, + "logps/chosen": -755.04931640625, + "logps/rejected": -1391.7149658203125, + "loss": 0.074, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.22364020347595215, + "rewards/margins": 0.31569841504096985, + "rewards/rejected": -0.5393385291099548, + "step": 3990 + }, + { + "epoch": 0.76, + "learning_rate": 8.148578611867114e-07, + "logits/chosen": -1.5540293455123901, + "logits/rejected": -1.1930735111236572, + "logps/chosen": -556.0262451171875, + "logps/rejected": -1192.371337890625, + "loss": 0.0738, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1541956663131714, + "rewards/margins": 0.28172677755355835, + "rewards/rejected": -0.43592238426208496, + "step": 4000 + }, + { + "epoch": 0.76, + "learning_rate": 8.026167339453792e-07, + "logits/chosen": -1.4712079763412476, + "logits/rejected": -0.9827225804328918, + "logps/chosen": -662.958251953125, + "logps/rejected": -1321.115966796875, + "loss": 0.0744, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19676116108894348, + "rewards/margins": 0.2918079197406769, + "rewards/rejected": -0.48856911063194275, + "step": 4010 + }, + { + "epoch": 0.77, + "learning_rate": 7.904506435266998e-07, + "logits/chosen": -1.639409065246582, + "logits/rejected": -1.1449193954467773, + "logps/chosen": -582.4534912109375, + "logps/rejected": -1267.9315185546875, + "loss": 0.0727, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1511387825012207, + "rewards/margins": 0.3111017644405365, + "rewards/rejected": -0.4622405469417572, + "step": 4020 + }, + { + "epoch": 0.77, + "learning_rate": 7.783601277613378e-07, + "logits/chosen": -1.5379669666290283, + "logits/rejected": -1.0621458292007446, + "logps/chosen": -552.1363525390625, + "logps/rejected": -1202.6796875, + "loss": 0.0671, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14880362153053284, + "rewards/margins": 0.3249647617340088, + "rewards/rejected": -0.47376832365989685, + "step": 4030 + }, + { + "epoch": 0.77, + "learning_rate": 7.66345721139003e-07, + "logits/chosen": -1.3945927619934082, + "logits/rejected": -1.0799853801727295, + "logps/chosen": -627.4310302734375, + "logps/rejected": -1245.181396484375, + "loss": 0.0679, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1866941750049591, + "rewards/margins": 0.2926812171936035, + "rewards/rejected": -0.47937536239624023, + "step": 4040 + }, + { + "epoch": 0.77, + "learning_rate": 7.544079547848183e-07, + "logits/chosen": -1.474120855331421, + "logits/rejected": -0.9780243635177612, + "logps/chosen": -651.1190795898438, + "logps/rejected": -1327.578857421875, + "loss": 0.0686, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18713738024234772, + "rewards/margins": 0.31425991654396057, + "rewards/rejected": -0.5013972520828247, + "step": 4050 + }, + { + "epoch": 0.77, + "learning_rate": 7.425473564358457e-07, + "logits/chosen": -1.621582269668579, + "logits/rejected": -1.1246297359466553, + "logps/chosen": -581.9295654296875, + "logps/rejected": -1041.8333740234375, + "loss": 0.1021, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16944041848182678, + "rewards/margins": 0.22699041664600372, + "rewards/rejected": -0.3964308202266693, + "step": 4060 + }, + { + "epoch": 0.78, + "learning_rate": 7.307644504177539e-07, + "logits/chosen": -1.3632748126983643, + "logits/rejected": -0.8100827932357788, + "logps/chosen": -676.1319580078125, + "logps/rejected": -1327.113525390625, + "loss": 0.0714, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19097070395946503, + "rewards/margins": 0.3000251352787018, + "rewards/rejected": -0.4909958839416504, + "step": 4070 + }, + { + "epoch": 0.78, + "learning_rate": 7.190597576216385e-07, + "logits/chosen": -1.6218783855438232, + "logits/rejected": -1.0738632678985596, + "logps/chosen": -640.3385620117188, + "logps/rejected": -1275.8609619140625, + "loss": 0.0817, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1903303861618042, + "rewards/margins": 0.2934018075466156, + "rewards/rejected": -0.4837321639060974, + "step": 4080 + }, + { + "epoch": 0.78, + "learning_rate": 7.074337954809945e-07, + "logits/chosen": -1.6243613958358765, + "logits/rejected": -1.0085480213165283, + "logps/chosen": -593.51025390625, + "logps/rejected": -1382.5362548828125, + "loss": 0.0511, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1565353125333786, + "rewards/margins": 0.3839346170425415, + "rewards/rejected": -0.5404700040817261, + "step": 4090 + }, + { + "epoch": 0.78, + "learning_rate": 6.958870779488447e-07, + "logits/chosen": -1.8175923824310303, + "logits/rejected": -0.9546326398849487, + "logps/chosen": -654.0431518554688, + "logps/rejected": -1326.212890625, + "loss": 0.0702, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1587834656238556, + "rewards/margins": 0.3209785521030426, + "rewards/rejected": -0.4797619879245758, + "step": 4100 + }, + { + "epoch": 0.78, + "learning_rate": 6.844201154750176e-07, + "logits/chosen": -1.4412815570831299, + "logits/rejected": -0.8726056218147278, + "logps/chosen": -592.4741821289062, + "logps/rejected": -1309.003662109375, + "loss": 0.0606, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.161200150847435, + "rewards/margins": 0.32679271697998047, + "rewards/rejected": -0.4879928529262543, + "step": 4110 + }, + { + "epoch": 0.78, + "learning_rate": 6.730334149835788e-07, + "logits/chosen": -1.6734164953231812, + "logits/rejected": -1.2678444385528564, + "logps/chosen": -545.2771606445312, + "logps/rejected": -1146.37890625, + "loss": 0.0729, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15675309300422668, + "rewards/margins": 0.28524309396743774, + "rewards/rejected": -0.44199615716934204, + "step": 4120 + }, + { + "epoch": 0.79, + "learning_rate": 6.617274798504286e-07, + "logits/chosen": -1.476194977760315, + "logits/rejected": -0.7991756200790405, + "logps/chosen": -661.20166015625, + "logps/rejected": -1299.816162109375, + "loss": 0.086, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1620057076215744, + "rewards/margins": 0.303924024105072, + "rewards/rejected": -0.4659297466278076, + "step": 4130 + }, + { + "epoch": 0.79, + "learning_rate": 6.505028098810407e-07, + "logits/chosen": -1.4836968183517456, + "logits/rejected": -0.7697210311889648, + "logps/chosen": -673.612060546875, + "logps/rejected": -1344.207763671875, + "loss": 0.0575, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1491011381149292, + "rewards/margins": 0.332655131816864, + "rewards/rejected": -0.4817562699317932, + "step": 4140 + }, + { + "epoch": 0.79, + "learning_rate": 6.393599012883709e-07, + "logits/chosen": -1.7030470371246338, + "logits/rejected": -0.9431253671646118, + "logps/chosen": -698.5299682617188, + "logps/rejected": -1296.638427734375, + "loss": 0.0826, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17169280350208282, + "rewards/margins": 0.28853195905685425, + "rewards/rejected": -0.46022477746009827, + "step": 4150 + }, + { + "epoch": 0.79, + "learning_rate": 6.282992466709247e-07, + "logits/chosen": -1.876447319984436, + "logits/rejected": -1.2787059545516968, + "logps/chosen": -610.8107299804688, + "logps/rejected": -1240.9732666015625, + "loss": 0.0764, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15487447381019592, + "rewards/margins": 0.2935008704662323, + "rewards/rejected": -0.4483753740787506, + "step": 4160 + }, + { + "epoch": 0.79, + "learning_rate": 6.17321334990973e-07, + "logits/chosen": -1.378609538078308, + "logits/rejected": -1.188201665878296, + "logps/chosen": -569.0336303710938, + "logps/rejected": -1221.126220703125, + "loss": 0.1055, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18387310206890106, + "rewards/margins": 0.2487374097108841, + "rewards/rejected": -0.43261051177978516, + "step": 4170 + }, + { + "epoch": 0.8, + "learning_rate": 6.064266515529419e-07, + "logits/chosen": -1.5385997295379639, + "logits/rejected": -0.8599594235420227, + "logps/chosen": -761.7626953125, + "logps/rejected": -1329.2393798828125, + "loss": 0.059, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1731938272714615, + "rewards/margins": 0.30250200629234314, + "rewards/rejected": -0.47569578886032104, + "step": 4180 + }, + { + "epoch": 0.8, + "learning_rate": 5.956156779819586e-07, + "logits/chosen": -1.5333473682403564, + "logits/rejected": -0.9734029769897461, + "logps/chosen": -637.819091796875, + "logps/rejected": -1298.0240478515625, + "loss": 0.0533, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17162570357322693, + "rewards/margins": 0.32062727212905884, + "rewards/rejected": -0.49225300550460815, + "step": 4190 + }, + { + "epoch": 0.8, + "learning_rate": 5.848888922025553e-07, + "logits/chosen": -1.6408611536026, + "logits/rejected": -1.1250369548797607, + "logps/chosen": -636.6069946289062, + "logps/rejected": -1349.956787109375, + "loss": 0.0653, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1537880152463913, + "rewards/margins": 0.3114011287689209, + "rewards/rejected": -0.4651891589164734, + "step": 4200 + }, + { + "epoch": 0.8, + "learning_rate": 5.742467684175473e-07, + "logits/chosen": -1.7282884120941162, + "logits/rejected": -0.9091650247573853, + "logps/chosen": -654.1572875976562, + "logps/rejected": -1369.8782958984375, + "loss": 0.0578, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1490090787410736, + "rewards/margins": 0.3345029652118683, + "rewards/rejected": -0.4835120141506195, + "step": 4210 + }, + { + "epoch": 0.8, + "learning_rate": 5.636897770870667e-07, + "logits/chosen": -1.5277897119522095, + "logits/rejected": -1.0950522422790527, + "logps/chosen": -621.8223876953125, + "logps/rejected": -1188.8853759765625, + "loss": 0.0727, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.16900405287742615, + "rewards/margins": 0.27010002732276917, + "rewards/rejected": -0.4391040802001953, + "step": 4220 + }, + { + "epoch": 0.81, + "learning_rate": 5.532183849077651e-07, + "logits/chosen": -1.478398323059082, + "logits/rejected": -0.7619593739509583, + "logps/chosen": -704.496826171875, + "logps/rejected": -1236.497314453125, + "loss": 0.0832, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1664319932460785, + "rewards/margins": 0.30853888392448425, + "rewards/rejected": -0.47497090697288513, + "step": 4230 + }, + { + "epoch": 0.81, + "learning_rate": 5.428330547921809e-07, + "logits/chosen": -1.4886611700057983, + "logits/rejected": -1.0063542127609253, + "logps/chosen": -659.728515625, + "logps/rejected": -1120.3165283203125, + "loss": 0.1056, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18079857528209686, + "rewards/margins": 0.2402995079755783, + "rewards/rejected": -0.4210980534553528, + "step": 4240 + }, + { + "epoch": 0.81, + "learning_rate": 5.32534245848278e-07, + "logits/chosen": -1.6218599081039429, + "logits/rejected": -1.0438997745513916, + "logps/chosen": -647.7769165039062, + "logps/rejected": -1325.986572265625, + "loss": 0.0708, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1493169367313385, + "rewards/margins": 0.3103331923484802, + "rewards/rejected": -0.45965009927749634, + "step": 4250 + }, + { + "epoch": 0.81, + "learning_rate": 5.223224133591475e-07, + "logits/chosen": -1.5376250743865967, + "logits/rejected": -1.0985280275344849, + "logps/chosen": -623.9844970703125, + "logps/rejected": -1423.852783203125, + "loss": 0.0325, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15335962176322937, + "rewards/margins": 0.3516461253166199, + "rewards/rejected": -0.5050057172775269, + "step": 4260 + }, + { + "epoch": 0.81, + "learning_rate": 5.121980087628802e-07, + "logits/chosen": -1.5560121536254883, + "logits/rejected": -1.2112586498260498, + "logps/chosen": -535.8369140625, + "logps/rejected": -1187.4632568359375, + "loss": 0.0843, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14772644639015198, + "rewards/margins": 0.28468599915504456, + "rewards/rejected": -0.43241238594055176, + "step": 4270 + }, + { + "epoch": 0.82, + "learning_rate": 5.021614796326155e-07, + "logits/chosen": -1.6284434795379639, + "logits/rejected": -0.9607194662094116, + "logps/chosen": -645.3671264648438, + "logps/rejected": -1252.421630859375, + "loss": 0.0695, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1374301016330719, + "rewards/margins": 0.2948032021522522, + "rewards/rejected": -0.43223339319229126, + "step": 4280 + }, + { + "epoch": 0.82, + "learning_rate": 4.922132696567463e-07, + "logits/chosen": -1.6314195394515991, + "logits/rejected": -1.032524824142456, + "logps/chosen": -592.7301025390625, + "logps/rejected": -1306.8658447265625, + "loss": 0.0372, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.12905898690223694, + "rewards/margins": 0.3327252268791199, + "rewards/rejected": -0.4617842137813568, + "step": 4290 + }, + { + "epoch": 0.82, + "learning_rate": 4.823538186193097e-07, + "logits/chosen": -1.5528547763824463, + "logits/rejected": -1.1627963781356812, + "logps/chosen": -598.49267578125, + "logps/rejected": -1283.17138671875, + "loss": 0.0738, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.13689306378364563, + "rewards/margins": 0.32022175192832947, + "rewards/rejected": -0.4571148753166199, + "step": 4300 + }, + { + "epoch": 0.82, + "learning_rate": 4.725835623805494e-07, + "logits/chosen": -1.7339166402816772, + "logits/rejected": -1.2147337198257446, + "logps/chosen": -593.4891357421875, + "logps/rejected": -1166.099609375, + "loss": 0.057, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1361609250307083, + "rewards/margins": 0.2860185503959656, + "rewards/rejected": -0.42217954993247986, + "step": 4310 + }, + { + "epoch": 0.82, + "learning_rate": 4.6290293285763816e-07, + "logits/chosen": -1.5797579288482666, + "logits/rejected": -0.8604723215103149, + "logps/chosen": -692.3845825195312, + "logps/rejected": -1367.1331787109375, + "loss": 0.0554, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1601569652557373, + "rewards/margins": 0.337593138217926, + "rewards/rejected": -0.49775010347366333, + "step": 4320 + }, + { + "epoch": 0.82, + "learning_rate": 4.533123580055909e-07, + "logits/chosen": -1.7544052600860596, + "logits/rejected": -1.1722230911254883, + "logps/chosen": -622.83203125, + "logps/rejected": -1235.314208984375, + "loss": 0.0853, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1056605726480484, + "rewards/margins": 0.32745251059532166, + "rewards/rejected": -0.4331130385398865, + "step": 4330 + }, + { + "epoch": 0.83, + "learning_rate": 4.438122617983442e-07, + "logits/chosen": -1.6120226383209229, + "logits/rejected": -0.8405524492263794, + "logps/chosen": -629.9571533203125, + "logps/rejected": -1155.5582275390625, + "loss": 0.0841, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13840670883655548, + "rewards/margins": 0.28906354308128357, + "rewards/rejected": -0.42747029662132263, + "step": 4340 + }, + { + "epoch": 0.83, + "learning_rate": 4.344030642100133e-07, + "logits/chosen": -1.6726274490356445, + "logits/rejected": -1.0421268939971924, + "logps/chosen": -618.4395751953125, + "logps/rejected": -1348.0491943359375, + "loss": 0.0565, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.14258643984794617, + "rewards/margins": 0.3459554612636566, + "rewards/rejected": -0.4885419011116028, + "step": 4350 + }, + { + "epoch": 0.83, + "learning_rate": 4.250851811963236e-07, + "logits/chosen": -1.7452752590179443, + "logits/rejected": -1.0764765739440918, + "logps/chosen": -628.9611206054688, + "logps/rejected": -1291.5279541015625, + "loss": 0.0524, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.13416460156440735, + "rewards/margins": 0.33325523138046265, + "rewards/rejected": -0.4674198031425476, + "step": 4360 + }, + { + "epoch": 0.83, + "learning_rate": 4.158590246762278e-07, + "logits/chosen": -1.4894119501113892, + "logits/rejected": -1.0432569980621338, + "logps/chosen": -569.1922607421875, + "logps/rejected": -1308.944580078125, + "loss": 0.0654, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.14465302228927612, + "rewards/margins": 0.32924753427505493, + "rewards/rejected": -0.4739004969596863, + "step": 4370 + }, + { + "epoch": 0.83, + "learning_rate": 4.0672500251369204e-07, + "logits/chosen": -1.4088810682296753, + "logits/rejected": -0.937311053276062, + "logps/chosen": -637.6170043945312, + "logps/rejected": -1380.856689453125, + "loss": 0.0363, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1410633772611618, + "rewards/margins": 0.3729560971260071, + "rewards/rejected": -0.5140194892883301, + "step": 4380 + }, + { + "epoch": 0.84, + "learning_rate": 3.976835184996644e-07, + "logits/chosen": -1.6045525074005127, + "logits/rejected": -1.086709976196289, + "logps/chosen": -607.0584716796875, + "logps/rejected": -1337.4051513671875, + "loss": 0.0612, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.155624121427536, + "rewards/margins": 0.3269900679588318, + "rewards/rejected": -0.482614129781723, + "step": 4390 + }, + { + "epoch": 0.84, + "learning_rate": 3.887349723342304e-07, + "logits/chosen": -1.848745346069336, + "logits/rejected": -1.1570312976837158, + "logps/chosen": -744.2691650390625, + "logps/rejected": -1386.67724609375, + "loss": 0.0422, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.18313486874103546, + "rewards/margins": 0.3392416536808014, + "rewards/rejected": -0.5223765969276428, + "step": 4400 + }, + { + "epoch": 0.84, + "learning_rate": 3.798797596089351e-07, + "logits/chosen": -1.6429369449615479, + "logits/rejected": -1.0298631191253662, + "logps/chosen": -593.02685546875, + "logps/rejected": -1241.770751953125, + "loss": 0.0617, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1318318098783493, + "rewards/margins": 0.3228714168071747, + "rewards/rejected": -0.4547031819820404, + "step": 4410 + }, + { + "epoch": 0.84, + "learning_rate": 3.711182717893011e-07, + "logits/chosen": -1.507433295249939, + "logits/rejected": -0.786311686038971, + "logps/chosen": -643.9988403320312, + "logps/rejected": -1273.1219482421875, + "loss": 0.071, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17599640786647797, + "rewards/margins": 0.2902034521102905, + "rewards/rejected": -0.46619993448257446, + "step": 4420 + }, + { + "epoch": 0.84, + "learning_rate": 3.624508961975215e-07, + "logits/chosen": -1.328204870223999, + "logits/rejected": -1.0989032983779907, + "logps/chosen": -629.1240234375, + "logps/rejected": -1093.088623046875, + "loss": 0.1225, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18698564171791077, + "rewards/margins": 0.21734988689422607, + "rewards/rejected": -0.40433555841445923, + "step": 4430 + }, + { + "epoch": 0.85, + "learning_rate": 3.538780159953348e-07, + "logits/chosen": -1.5716381072998047, + "logits/rejected": -0.8230584859848022, + "logps/chosen": -647.3327026367188, + "logps/rejected": -1299.0325927734375, + "loss": 0.0529, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.14860112965106964, + "rewards/margins": 0.3516216576099396, + "rewards/rejected": -0.5002228021621704, + "step": 4440 + }, + { + "epoch": 0.85, + "learning_rate": 3.454000101670901e-07, + "logits/chosen": -1.4817876815795898, + "logits/rejected": -0.9819103479385376, + "logps/chosen": -719.157958984375, + "logps/rejected": -1345.7685546875, + "loss": 0.0874, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17831948399543762, + "rewards/margins": 0.2975057065486908, + "rewards/rejected": -0.4758252203464508, + "step": 4450 + }, + { + "epoch": 0.85, + "learning_rate": 3.3701725350299143e-07, + "logits/chosen": -1.5737899541854858, + "logits/rejected": -0.9576247930526733, + "logps/chosen": -624.8809204101562, + "logps/rejected": -1283.3790283203125, + "loss": 0.0594, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15519653260707855, + "rewards/margins": 0.31781700253486633, + "rewards/rejected": -0.4730134904384613, + "step": 4460 + }, + { + "epoch": 0.85, + "learning_rate": 3.2873011658252796e-07, + "logits/chosen": -1.6041154861450195, + "logits/rejected": -0.9867879748344421, + "logps/chosen": -616.98876953125, + "logps/rejected": -1315.84765625, + "loss": 0.0685, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16912952065467834, + "rewards/margins": 0.3227527439594269, + "rewards/rejected": -0.49188223481178284, + "step": 4470 + }, + { + "epoch": 0.85, + "learning_rate": 3.2053896575809426e-07, + "logits/chosen": -1.7311811447143555, + "logits/rejected": -1.067882776260376, + "logps/chosen": -649.1055908203125, + "logps/rejected": -1258.640869140625, + "loss": 0.0901, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17431046068668365, + "rewards/margins": 0.3118690252304077, + "rewards/rejected": -0.48617953062057495, + "step": 4480 + }, + { + "epoch": 0.86, + "learning_rate": 3.124441631387931e-07, + "logits/chosen": -1.523450255393982, + "logits/rejected": -1.0439492464065552, + "logps/chosen": -491.62249755859375, + "logps/rejected": -1279.021484375, + "loss": 0.0445, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12501509487628937, + "rewards/margins": 0.3468592166900635, + "rewards/rejected": -0.47187429666519165, + "step": 4490 + }, + { + "epoch": 0.86, + "learning_rate": 3.044460665744284e-07, + "logits/chosen": -1.5553473234176636, + "logits/rejected": -1.015411138534546, + "logps/chosen": -621.4290771484375, + "logps/rejected": -1193.322265625, + "loss": 0.0624, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17289629578590393, + "rewards/margins": 0.2942846715450287, + "rewards/rejected": -0.4671810269355774, + "step": 4500 + }, + { + "epoch": 0.86, + "learning_rate": 2.9654502963968575e-07, + "logits/chosen": -1.6609184741973877, + "logits/rejected": -0.9597708582878113, + "logps/chosen": -598.8553466796875, + "logps/rejected": -1200.1343994140625, + "loss": 0.0725, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14991316199302673, + "rewards/margins": 0.2949966788291931, + "rewards/rejected": -0.44490987062454224, + "step": 4510 + }, + { + "epoch": 0.86, + "learning_rate": 2.8874140161849915e-07, + "logits/chosen": -1.6514486074447632, + "logits/rejected": -0.9042898416519165, + "logps/chosen": -669.432861328125, + "logps/rejected": -1172.0797119140625, + "loss": 0.0877, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16250768303871155, + "rewards/margins": 0.28728577494621277, + "rewards/rejected": -0.4497934877872467, + "step": 4520 + }, + { + "epoch": 0.86, + "learning_rate": 2.810355274886148e-07, + "logits/chosen": -1.4443236589431763, + "logits/rejected": -1.129891037940979, + "logps/chosen": -550.1930541992188, + "logps/rejected": -1096.160888671875, + "loss": 0.0956, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1617933064699173, + "rewards/margins": 0.23670299351215363, + "rewards/rejected": -0.39849624037742615, + "step": 4530 + }, + { + "epoch": 0.86, + "learning_rate": 2.7342774790633686e-07, + "logits/chosen": -1.6275829076766968, + "logits/rejected": -1.0061298608779907, + "logps/chosen": -649.8400268554688, + "logps/rejected": -1309.129150390625, + "loss": 0.0815, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17947590351104736, + "rewards/margins": 0.28794533014297485, + "rewards/rejected": -0.4674212336540222, + "step": 4540 + }, + { + "epoch": 0.87, + "learning_rate": 2.6591839919146963e-07, + "logits/chosen": -1.280903697013855, + "logits/rejected": -0.8944345712661743, + "logps/chosen": -608.0111083984375, + "logps/rejected": -1276.568115234375, + "loss": 0.0597, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1773298680782318, + "rewards/margins": 0.3045726418495178, + "rewards/rejected": -0.48190250992774963, + "step": 4550 + }, + { + "epoch": 0.87, + "learning_rate": 2.58507813312448e-07, + "logits/chosen": -1.6265952587127686, + "logits/rejected": -0.855617344379425, + "logps/chosen": -606.3870849609375, + "logps/rejected": -1226.21533203125, + "loss": 0.0519, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11800136417150497, + "rewards/margins": 0.32250529527664185, + "rewards/rejected": -0.4405066967010498, + "step": 4560 + }, + { + "epoch": 0.87, + "learning_rate": 2.511963178716648e-07, + "logits/chosen": -1.8203041553497314, + "logits/rejected": -1.3360464572906494, + "logps/chosen": -547.7073974609375, + "logps/rejected": -1084.524658203125, + "loss": 0.0827, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.17347629368305206, + "rewards/margins": 0.2538864314556122, + "rewards/rejected": -0.42736274003982544, + "step": 4570 + }, + { + "epoch": 0.87, + "learning_rate": 2.439842360909864e-07, + "logits/chosen": -1.8694589138031006, + "logits/rejected": -1.0154645442962646, + "logps/chosen": -750.2979736328125, + "logps/rejected": -1419.476806640625, + "loss": 0.0595, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.18350553512573242, + "rewards/margins": 0.32696714997291565, + "rewards/rejected": -0.5104726552963257, + "step": 4580 + }, + { + "epoch": 0.87, + "learning_rate": 2.3687188679746314e-07, + "logits/chosen": -1.623376488685608, + "logits/rejected": -0.7891206741333008, + "logps/chosen": -701.0450439453125, + "logps/rejected": -1194.2939453125, + "loss": 0.0759, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19170424342155457, + "rewards/margins": 0.27900928258895874, + "rewards/rejected": -0.4707135558128357, + "step": 4590 + }, + { + "epoch": 0.88, + "learning_rate": 2.2985958440923772e-07, + "logits/chosen": -1.697003960609436, + "logits/rejected": -1.1143566370010376, + "logps/chosen": -708.6148071289062, + "logps/rejected": -1290.4073486328125, + "loss": 0.0789, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.183920219540596, + "rewards/margins": 0.282669335603714, + "rewards/rejected": -0.4665895402431488, + "step": 4600 + }, + { + "epoch": 0.88, + "learning_rate": 2.2294763892164284e-07, + "logits/chosen": -1.4680585861206055, + "logits/rejected": -0.733687162399292, + "logps/chosen": -641.66748046875, + "logps/rejected": -1253.496826171875, + "loss": 0.0761, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17868050932884216, + "rewards/margins": 0.30219537019729614, + "rewards/rejected": -0.4808759093284607, + "step": 4610 + }, + { + "epoch": 0.88, + "learning_rate": 2.1613635589349756e-07, + "logits/chosen": -1.4635512828826904, + "logits/rejected": -1.1326031684875488, + "logps/chosen": -508.17340087890625, + "logps/rejected": -1159.115966796875, + "loss": 0.0711, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12790942192077637, + "rewards/margins": 0.3032786250114441, + "rewards/rejected": -0.43118801712989807, + "step": 4620 + }, + { + "epoch": 0.88, + "learning_rate": 2.094260364336026e-07, + "logits/chosen": -1.6897389888763428, + "logits/rejected": -1.156468391418457, + "logps/chosen": -556.9920043945312, + "logps/rejected": -1199.290771484375, + "loss": 0.088, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14459089934825897, + "rewards/margins": 0.27455028891563416, + "rewards/rejected": -0.4191412031650543, + "step": 4630 + }, + { + "epoch": 0.88, + "learning_rate": 2.0281697718742333e-07, + "logits/chosen": -1.6482601165771484, + "logits/rejected": -0.9699563980102539, + "logps/chosen": -712.9674072265625, + "logps/rejected": -1220.4947509765625, + "loss": 0.0685, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17597351968288422, + "rewards/margins": 0.28940483927726746, + "rewards/rejected": -0.4653783440589905, + "step": 4640 + }, + { + "epoch": 0.89, + "learning_rate": 1.9630947032398068e-07, + "logits/chosen": -1.8412061929702759, + "logits/rejected": -1.1319139003753662, + "logps/chosen": -749.4605712890625, + "logps/rejected": -1322.244384765625, + "loss": 0.06, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14990632236003876, + "rewards/margins": 0.32965174317359924, + "rewards/rejected": -0.4795580804347992, + "step": 4650 + }, + { + "epoch": 0.89, + "learning_rate": 1.899038035229342e-07, + "logits/chosen": -1.5682942867279053, + "logits/rejected": -0.939906895160675, + "logps/chosen": -631.8695678710938, + "logps/rejected": -1207.721435546875, + "loss": 0.073, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14423295855522156, + "rewards/margins": 0.2942690849304199, + "rewards/rejected": -0.4385020136833191, + "step": 4660 + }, + { + "epoch": 0.89, + "learning_rate": 1.8360025996186138e-07, + "logits/chosen": -1.2831768989562988, + "logits/rejected": -0.8119879961013794, + "logps/chosen": -607.0415649414062, + "logps/rejected": -1206.13720703125, + "loss": 0.0877, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1657409518957138, + "rewards/margins": 0.2819930911064148, + "rewards/rejected": -0.44773411750793457, + "step": 4670 + }, + { + "epoch": 0.89, + "learning_rate": 1.7739911830374352e-07, + "logits/chosen": -1.6281124353408813, + "logits/rejected": -1.1297900676727295, + "logps/chosen": -720.3666381835938, + "logps/rejected": -1451.5751953125, + "loss": 0.0598, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18103721737861633, + "rewards/margins": 0.3197404742240906, + "rewards/rejected": -0.5007776618003845, + "step": 4680 + }, + { + "epoch": 0.89, + "learning_rate": 1.713006526846439e-07, + "logits/chosen": -1.5305252075195312, + "logits/rejected": -1.1208873987197876, + "logps/chosen": -668.1603393554688, + "logps/rejected": -1300.840087890625, + "loss": 0.0905, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18247172236442566, + "rewards/margins": 0.2804272174835205, + "rewards/rejected": -0.46289896965026855, + "step": 4690 + }, + { + "epoch": 0.9, + "learning_rate": 1.6530513270159116e-07, + "logits/chosen": -1.6757923364639282, + "logits/rejected": -1.1312105655670166, + "logps/chosen": -585.9445190429688, + "logps/rejected": -1365.5906982421875, + "loss": 0.0651, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.12228063493967056, + "rewards/margins": 0.36077576875686646, + "rewards/rejected": -0.48305636644363403, + "step": 4700 + }, + { + "epoch": 0.9, + "learning_rate": 1.59412823400657e-07, + "logits/chosen": -1.3578886985778809, + "logits/rejected": -0.8729084730148315, + "logps/chosen": -595.7457885742188, + "logps/rejected": -1204.9522705078125, + "loss": 0.0826, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17481914162635803, + "rewards/margins": 0.26105111837387085, + "rewards/rejected": -0.43587031960487366, + "step": 4710 + }, + { + "epoch": 0.9, + "learning_rate": 1.5362398526524463e-07, + "logits/chosen": -1.5783363580703735, + "logits/rejected": -0.9245842099189758, + "logps/chosen": -709.2633056640625, + "logps/rejected": -1288.1170654296875, + "loss": 0.0861, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.16297248005867004, + "rewards/margins": 0.2818123698234558, + "rewards/rejected": -0.44478487968444824, + "step": 4720 + }, + { + "epoch": 0.9, + "learning_rate": 1.4793887420457008e-07, + "logits/chosen": -1.2609602212905884, + "logits/rejected": -0.8738704919815063, + "logps/chosen": -675.8863525390625, + "logps/rejected": -1262.517333984375, + "loss": 0.0905, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18329861760139465, + "rewards/margins": 0.28873276710510254, + "rewards/rejected": -0.4720313549041748, + "step": 4730 + }, + { + "epoch": 0.9, + "learning_rate": 1.4235774154234855e-07, + "logits/chosen": -1.5686185359954834, + "logits/rejected": -0.8845101594924927, + "logps/chosen": -705.612548828125, + "logps/rejected": -1317.673828125, + "loss": 0.0753, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1770937442779541, + "rewards/margins": 0.28883716464042664, + "rewards/rejected": -0.4659309387207031, + "step": 4740 + }, + { + "epoch": 0.9, + "learning_rate": 1.368808340056879e-07, + "logits/chosen": -1.370813012123108, + "logits/rejected": -1.0830169916152954, + "logps/chosen": -518.2303466796875, + "logps/rejected": -1131.841064453125, + "loss": 0.0874, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12415562570095062, + "rewards/margins": 0.28626319766044617, + "rewards/rejected": -0.4104188084602356, + "step": 4750 + }, + { + "epoch": 0.91, + "learning_rate": 1.31508393714177e-07, + "logits/chosen": -1.6772340536117554, + "logits/rejected": -1.01282799243927, + "logps/chosen": -624.1659545898438, + "logps/rejected": -1270.8428955078125, + "loss": 0.0725, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1336231231689453, + "rewards/margins": 0.3113330006599426, + "rewards/rejected": -0.44495612382888794, + "step": 4760 + }, + { + "epoch": 0.91, + "learning_rate": 1.2624065816918414e-07, + "logits/chosen": -1.4747987985610962, + "logits/rejected": -0.975101113319397, + "logps/chosen": -650.6563720703125, + "logps/rejected": -1342.7335205078125, + "loss": 0.0655, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15981096029281616, + "rewards/margins": 0.30302125215530396, + "rewards/rejected": -0.4628322124481201, + "step": 4770 + }, + { + "epoch": 0.91, + "learning_rate": 1.210778602433596e-07, + "logits/chosen": -1.564624547958374, + "logits/rejected": -1.0374842882156372, + "logps/chosen": -614.6488037109375, + "logps/rejected": -1203.357421875, + "loss": 0.0821, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15904925763607025, + "rewards/margins": 0.27073806524276733, + "rewards/rejected": -0.4297873079776764, + "step": 4780 + }, + { + "epoch": 0.91, + "learning_rate": 1.1602022817033709e-07, + "logits/chosen": -1.6003332138061523, + "logits/rejected": -0.8900405764579773, + "logps/chosen": -649.1990966796875, + "logps/rejected": -1338.3707275390625, + "loss": 0.0775, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15437167882919312, + "rewards/margins": 0.34441012144088745, + "rewards/rejected": -0.49878183007240295, + "step": 4790 + }, + { + "epoch": 0.91, + "learning_rate": 1.1106798553464804e-07, + "logits/chosen": -1.5427950620651245, + "logits/rejected": -0.9831592440605164, + "logps/chosen": -677.7523193359375, + "logps/rejected": -1397.6802978515625, + "loss": 0.0643, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.18140892684459686, + "rewards/margins": 0.33899572491645813, + "rewards/rejected": -0.5204046964645386, + "step": 4800 + }, + { + "epoch": 0.92, + "learning_rate": 1.0622135126183514e-07, + "logits/chosen": -1.4138776063919067, + "logits/rejected": -0.6474016308784485, + "logps/chosen": -728.1995849609375, + "logps/rejected": -1283.4852294921875, + "loss": 0.0618, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.192392498254776, + "rewards/margins": 0.2717262804508209, + "rewards/rejected": -0.4641187787055969, + "step": 4810 + }, + { + "epoch": 0.92, + "learning_rate": 1.0148053960877396e-07, + "logits/chosen": -1.398097276687622, + "logits/rejected": -0.9553489685058594, + "logps/chosen": -585.2307739257812, + "logps/rejected": -1273.9364013671875, + "loss": 0.0691, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1406571865081787, + "rewards/margins": 0.3191017210483551, + "rewards/rejected": -0.4597589373588562, + "step": 4820 + }, + { + "epoch": 0.92, + "learning_rate": 9.684576015420277e-08, + "logits/chosen": -1.5144402980804443, + "logits/rejected": -0.7728510499000549, + "logps/chosen": -749.4060668945312, + "logps/rejected": -1322.1160888671875, + "loss": 0.0965, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2126808613538742, + "rewards/margins": 0.2606582045555115, + "rewards/rejected": -0.4733390808105469, + "step": 4830 + }, + { + "epoch": 0.92, + "learning_rate": 9.23172177894574e-08, + "logits/chosen": -1.6027752161026, + "logits/rejected": -0.8554502725601196, + "logps/chosen": -739.6917724609375, + "logps/rejected": -1458.238037109375, + "loss": 0.0339, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.17026378214359283, + "rewards/margins": 0.35099369287490845, + "rewards/rejected": -0.5212574601173401, + "step": 4840 + }, + { + "epoch": 0.92, + "learning_rate": 8.78951127094127e-08, + "logits/chosen": -1.6803109645843506, + "logits/rejected": -0.9320834279060364, + "logps/chosen": -662.111572265625, + "logps/rejected": -1162.46484375, + "loss": 0.0716, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15743741393089294, + "rewards/margins": 0.2909800708293915, + "rewards/rejected": -0.4484175145626068, + "step": 4850 + }, + { + "epoch": 0.93, + "learning_rate": 8.357964040363209e-08, + "logits/chosen": -1.7106218338012695, + "logits/rejected": -0.9925411343574524, + "logps/chosen": -710.9759521484375, + "logps/rejected": -1317.148193359375, + "loss": 0.0773, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16667839884757996, + "rewards/margins": 0.2834816575050354, + "rewards/rejected": -0.45016008615493774, + "step": 4860 + }, + { + "epoch": 0.93, + "learning_rate": 7.937099164772699e-08, + "logits/chosen": -1.3424303531646729, + "logits/rejected": -1.0499579906463623, + "logps/chosen": -523.9136962890625, + "logps/rejected": -1224.3201904296875, + "loss": 0.0886, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15465199947357178, + "rewards/margins": 0.29365110397338867, + "rewards/rejected": -0.44830313324928284, + "step": 4870 + }, + { + "epoch": 0.93, + "learning_rate": 7.526935249492245e-08, + "logits/chosen": -1.6472418308258057, + "logits/rejected": -0.9855870008468628, + "logps/chosen": -636.9083251953125, + "logps/rejected": -1183.853515625, + "loss": 0.0971, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15656232833862305, + "rewards/margins": 0.2960873544216156, + "rewards/rejected": -0.45264968276023865, + "step": 4880 + }, + { + "epoch": 0.93, + "learning_rate": 7.127490426783124e-08, + "logits/chosen": -1.6242201328277588, + "logits/rejected": -1.0149205923080444, + "logps/chosen": -678.8060302734375, + "logps/rejected": -1352.32470703125, + "loss": 0.0638, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14568215608596802, + "rewards/margins": 0.3156759738922119, + "rewards/rejected": -0.4613581597805023, + "step": 4890 + }, + { + "epoch": 0.93, + "learning_rate": 6.738782355044048e-08, + "logits/chosen": -1.3191697597503662, + "logits/rejected": -1.201279878616333, + "logps/chosen": -721.3082275390625, + "logps/rejected": -1354.2021484375, + "loss": 0.076, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17443808913230896, + "rewards/margins": 0.2681798040866852, + "rewards/rejected": -0.44261789321899414, + "step": 4900 + }, + { + "epoch": 0.94, + "learning_rate": 6.360828218030191e-08, + "logits/chosen": -1.8827226161956787, + "logits/rejected": -1.1188879013061523, + "logps/chosen": -634.8402709960938, + "logps/rejected": -1260.1407470703125, + "loss": 0.0734, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1493465006351471, + "rewards/margins": 0.29650747776031494, + "rewards/rejected": -0.44585394859313965, + "step": 4910 + }, + { + "epoch": 0.94, + "learning_rate": 5.993644724093889e-08, + "logits/chosen": -1.6838546991348267, + "logits/rejected": -0.9050960540771484, + "logps/chosen": -648.4398803710938, + "logps/rejected": -1251.8382568359375, + "loss": 0.0554, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1434290111064911, + "rewards/margins": 0.3137533664703369, + "rewards/rejected": -0.4571823477745056, + "step": 4920 + }, + { + "epoch": 0.94, + "learning_rate": 5.637248105445775e-08, + "logits/chosen": -1.7320117950439453, + "logits/rejected": -1.1256321668624878, + "logps/chosen": -539.5606079101562, + "logps/rejected": -1056.583251953125, + "loss": 0.0972, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1480817347764969, + "rewards/margins": 0.2356722354888916, + "rewards/rejected": -0.3837539255619049, + "step": 4930 + }, + { + "epoch": 0.94, + "learning_rate": 5.291654117437262e-08, + "logits/chosen": -1.7257087230682373, + "logits/rejected": -0.9479360580444336, + "logps/chosen": -557.167724609375, + "logps/rejected": -1281.1934814453125, + "loss": 0.0506, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11244988441467285, + "rewards/margins": 0.33333298563957214, + "rewards/rejected": -0.4457828402519226, + "step": 4940 + }, + { + "epoch": 0.94, + "learning_rate": 4.956878037864044e-08, + "logits/chosen": -1.407482385635376, + "logits/rejected": -0.8275833129882812, + "logps/chosen": -626.5960693359375, + "logps/rejected": -1172.99755859375, + "loss": 0.0545, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1533767133951187, + "rewards/margins": 0.2846202254295349, + "rewards/rejected": -0.4379969537258148, + "step": 4950 + }, + { + "epoch": 0.94, + "learning_rate": 4.632934666290778e-08, + "logits/chosen": -1.4791433811187744, + "logits/rejected": -1.235567569732666, + "logps/chosen": -508.14501953125, + "logps/rejected": -1220.041259765625, + "loss": 0.0576, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.13363158702850342, + "rewards/margins": 0.29521042108535767, + "rewards/rejected": -0.4288419783115387, + "step": 4960 + }, + { + "epoch": 0.95, + "learning_rate": 4.319838323396691e-08, + "logits/chosen": -1.6887500286102295, + "logits/rejected": -1.1927210092544556, + "logps/chosen": -635.3477783203125, + "logps/rejected": -1253.641357421875, + "loss": 0.1026, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.16893738508224487, + "rewards/margins": 0.25178489089012146, + "rewards/rejected": -0.4207223057746887, + "step": 4970 + }, + { + "epoch": 0.95, + "learning_rate": 4.017602850342584e-08, + "logits/chosen": -1.5601575374603271, + "logits/rejected": -0.8450828790664673, + "logps/chosen": -624.2867431640625, + "logps/rejected": -1262.434814453125, + "loss": 0.0557, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1422443836927414, + "rewards/margins": 0.29309436678886414, + "rewards/rejected": -0.43533873558044434, + "step": 4980 + }, + { + "epoch": 0.95, + "learning_rate": 3.7262416081589866e-08, + "logits/chosen": -1.684685468673706, + "logits/rejected": -1.0103908777236938, + "logps/chosen": -624.5326538085938, + "logps/rejected": -1345.8338623046875, + "loss": 0.0487, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14531221985816956, + "rewards/margins": 0.3458930253982544, + "rewards/rejected": -0.49120527505874634, + "step": 4990 + }, + { + "epoch": 0.95, + "learning_rate": 3.445767477155443e-08, + "logits/chosen": -1.6047395467758179, + "logits/rejected": -1.083069086074829, + "logps/chosen": -439.0870666503906, + "logps/rejected": -1007.619140625, + "loss": 0.088, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.10899704694747925, + "rewards/margins": 0.26882943511009216, + "rewards/rejected": -0.3778265118598938, + "step": 5000 + }, + { + "epoch": 0.95, + "learning_rate": 3.1761928563510956e-08, + "logits/chosen": -1.5430387258529663, + "logits/rejected": -1.1478708982467651, + "logps/chosen": -566.592041015625, + "logps/rejected": -1102.06201171875, + "loss": 0.0955, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12917637825012207, + "rewards/margins": 0.2524365484714508, + "rewards/rejected": -0.38161295652389526, + "step": 5010 + }, + { + "epoch": 0.96, + "learning_rate": 2.917529662926549e-08, + "logits/chosen": -1.4810032844543457, + "logits/rejected": -1.0692861080169678, + "logps/chosen": -544.7088623046875, + "logps/rejected": -1293.71435546875, + "loss": 0.044, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.11322245746850967, + "rewards/margins": 0.34361839294433594, + "rewards/rejected": -0.4568409025669098, + "step": 5020 + }, + { + "epoch": 0.96, + "learning_rate": 2.669789331697148e-08, + "logits/chosen": -1.4107751846313477, + "logits/rejected": -0.8879677057266235, + "logps/chosen": -596.4759521484375, + "logps/rejected": -1232.4378662109375, + "loss": 0.0773, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15274113416671753, + "rewards/margins": 0.30707958340644836, + "rewards/rejected": -0.4598206877708435, + "step": 5030 + }, + { + "epoch": 0.96, + "learning_rate": 2.4329828146074096e-08, + "logits/chosen": -1.6700639724731445, + "logits/rejected": -1.160444974899292, + "logps/chosen": -638.5947265625, + "logps/rejected": -1382.1214599609375, + "loss": 0.0714, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.16618075966835022, + "rewards/margins": 0.3303069770336151, + "rewards/rejected": -0.49648770689964294, + "step": 5040 + }, + { + "epoch": 0.96, + "learning_rate": 2.20712058024683e-08, + "logits/chosen": -1.7391884326934814, + "logits/rejected": -1.0803216695785522, + "logps/chosen": -749.6804809570312, + "logps/rejected": -1278.514404296875, + "loss": 0.0449, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1625753939151764, + "rewards/margins": 0.3114554286003113, + "rewards/rejected": -0.47403082251548767, + "step": 5050 + }, + { + "epoch": 0.96, + "learning_rate": 1.9922126133870568e-08, + "logits/chosen": -1.579906702041626, + "logits/rejected": -1.032274603843689, + "logps/chosen": -662.6795654296875, + "logps/rejected": -1298.6641845703125, + "loss": 0.0915, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15554921329021454, + "rewards/margins": 0.2925891578197479, + "rewards/rejected": -0.4481383264064789, + "step": 5060 + }, + { + "epoch": 0.97, + "learning_rate": 1.7882684145406616e-08, + "logits/chosen": -1.3243744373321533, + "logits/rejected": -1.0081312656402588, + "logps/chosen": -598.328857421875, + "logps/rejected": -1325.436279296875, + "loss": 0.0686, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1774941235780716, + "rewards/margins": 0.2830308973789215, + "rewards/rejected": -0.4605250358581543, + "step": 5070 + }, + { + "epoch": 0.97, + "learning_rate": 1.595296999541057e-08, + "logits/chosen": -1.549423336982727, + "logits/rejected": -0.9221774935722351, + "logps/chosen": -638.889404296875, + "logps/rejected": -1374.27783203125, + "loss": 0.074, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.17353455722332, + "rewards/margins": 0.3390573561191559, + "rewards/rejected": -0.5125918984413147, + "step": 5080 + }, + { + "epoch": 0.97, + "learning_rate": 1.4133068991437903e-08, + "logits/chosen": -1.5194743871688843, + "logits/rejected": -1.1621986627578735, + "logps/chosen": -572.447509765625, + "logps/rejected": -1069.026123046875, + "loss": 0.0848, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14077235758304596, + "rewards/margins": 0.23328928649425507, + "rewards/rejected": -0.374061644077301, + "step": 5090 + }, + { + "epoch": 0.97, + "learning_rate": 1.2423061586496476e-08, + "logits/chosen": -1.5293216705322266, + "logits/rejected": -1.161948561668396, + "logps/chosen": -603.0601806640625, + "logps/rejected": -1281.250244140625, + "loss": 0.0547, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15855684876441956, + "rewards/margins": 0.3035425543785095, + "rewards/rejected": -0.46209946274757385, + "step": 5100 + }, + { + "epoch": 0.97, + "learning_rate": 1.0823023375489128e-08, + "logits/chosen": -1.5255159139633179, + "logits/rejected": -1.010140299797058, + "logps/chosen": -623.25244140625, + "logps/rejected": -1228.103759765625, + "loss": 0.0726, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15898485481739044, + "rewards/margins": 0.28266894817352295, + "rewards/rejected": -0.4416537880897522, + "step": 5110 + }, + { + "epoch": 0.98, + "learning_rate": 9.333025091870507e-09, + "logits/chosen": -1.6246131658554077, + "logits/rejected": -0.83955317735672, + "logps/chosen": -676.0503540039062, + "logps/rejected": -1231.9215087890625, + "loss": 0.0798, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1609552502632141, + "rewards/margins": 0.28405335545539856, + "rewards/rejected": -0.44500860571861267, + "step": 5120 + }, + { + "epoch": 0.98, + "learning_rate": 7.95313260452263e-09, + "logits/chosen": -1.470959186553955, + "logits/rejected": -0.8849459886550903, + "logps/chosen": -626.1944580078125, + "logps/rejected": -1259.2894287109375, + "loss": 0.0592, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15113981068134308, + "rewards/margins": 0.2945057451725006, + "rewards/rejected": -0.4456455707550049, + "step": 5130 + }, + { + "epoch": 0.98, + "learning_rate": 6.683406914840818e-09, + "logits/chosen": -1.423765778541565, + "logits/rejected": -0.8854363560676575, + "logps/chosen": -525.4611206054688, + "logps/rejected": -1161.06005859375, + "loss": 0.0656, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12872517108917236, + "rewards/margins": 0.2833263576030731, + "rewards/rejected": -0.4120515286922455, + "step": 5140 + }, + { + "epoch": 0.98, + "learning_rate": 5.523904154037529e-09, + "logits/chosen": -1.4726181030273438, + "logits/rejected": -1.0987781286239624, + "logps/chosen": -612.3428344726562, + "logps/rejected": -1266.933837890625, + "loss": 0.0767, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17042508721351624, + "rewards/margins": 0.2595577538013458, + "rewards/rejected": -0.42998284101486206, + "step": 5150 + }, + { + "epoch": 0.98, + "learning_rate": 4.474675580662113e-09, + "logits/chosen": -1.48684561252594, + "logits/rejected": -0.8791742324829102, + "logps/chosen": -561.6771850585938, + "logps/rejected": -1201.2552490234375, + "loss": 0.0813, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.145443856716156, + "rewards/margins": 0.2917470335960388, + "rewards/rejected": -0.4371909201145172, + "step": 5160 + }, + { + "epoch": 0.98, + "learning_rate": 3.5357675783331823e-09, + "logits/chosen": -1.7405544519424438, + "logits/rejected": -1.0294724702835083, + "logps/chosen": -674.488037109375, + "logps/rejected": -1273.2628173828125, + "loss": 0.0624, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.15219458937644958, + "rewards/margins": 0.3184397518634796, + "rewards/rejected": -0.4706343710422516, + "step": 5170 + }, + { + "epoch": 0.99, + "learning_rate": 2.7072216536885855e-09, + "logits/chosen": -1.6870332956314087, + "logits/rejected": -0.7788273096084595, + "logps/chosen": -656.57421875, + "logps/rejected": -1370.481201171875, + "loss": 0.0638, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.15338242053985596, + "rewards/margins": 0.3432921767234802, + "rewards/rejected": -0.4966745972633362, + "step": 5180 + }, + { + "epoch": 0.99, + "learning_rate": 1.989074434551874e-09, + "logits/chosen": -1.3593934774398804, + "logits/rejected": -0.986729621887207, + "logps/chosen": -659.7386474609375, + "logps/rejected": -1238.4710693359375, + "loss": 0.0713, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17776012420654297, + "rewards/margins": 0.27383026480674744, + "rewards/rejected": -0.4515903890132904, + "step": 5190 + }, + { + "epoch": 0.99, + "learning_rate": 1.3813576683111007e-09, + "logits/chosen": -1.5167112350463867, + "logits/rejected": -0.8397325277328491, + "logps/chosen": -599.6024169921875, + "logps/rejected": -1328.2471923828125, + "loss": 0.0568, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13601288199424744, + "rewards/margins": 0.34515735507011414, + "rewards/rejected": -0.48117026686668396, + "step": 5200 + }, + { + "epoch": 0.99, + "learning_rate": 8.840982205160498e-10, + "logits/chosen": -1.4586691856384277, + "logits/rejected": -0.971697211265564, + "logps/chosen": -614.6513671875, + "logps/rejected": -1244.3607177734375, + "loss": 0.0793, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14397016167640686, + "rewards/margins": 0.2893129885196686, + "rewards/rejected": -0.43328309059143066, + "step": 5210 + }, + { + "epoch": 0.99, + "learning_rate": 4.973180736911332e-10, + "logits/chosen": -1.6655791997909546, + "logits/rejected": -1.1275146007537842, + "logps/chosen": -627.2606201171875, + "logps/rejected": -1310.1925048828125, + "loss": 0.0681, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14530961215496063, + "rewards/margins": 0.30237385630607605, + "rewards/rejected": -0.4476834833621979, + "step": 5220 + }, + { + "epoch": 1.0, + "learning_rate": 2.2103432636366718e-10, + "logits/chosen": -1.3939180374145508, + "logits/rejected": -0.9449717402458191, + "logps/chosen": -691.179443359375, + "logps/rejected": -1282.6971435546875, + "loss": 0.0802, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.180943563580513, + "rewards/margins": 0.28968390822410583, + "rewards/rejected": -0.47062739729881287, + "step": 5230 + }, + { + "epoch": 1.0, + "learning_rate": 5.525919230670029e-11, + "logits/chosen": -1.4143139123916626, + "logits/rejected": -1.0170581340789795, + "logps/chosen": -506.2384338378906, + "logps/rejected": -1102.7977294921875, + "loss": 0.0889, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15023784339427948, + "rewards/margins": 0.26667970418930054, + "rewards/rejected": -0.4169175624847412, + "step": 5240 + }, + { + "epoch": 1.0, + "learning_rate": 0.0, + "logits/chosen": -1.594805359840393, + "logits/rejected": -0.6739069223403931, + "logps/chosen": -637.98779296875, + "logps/rejected": -1227.662841796875, + "loss": 0.0656, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17544154822826385, + "rewards/margins": 0.29885655641555786, + "rewards/rejected": -0.4742980897426605, + "step": 5250 + }, + { + "epoch": 1.0, + "step": 5250, + "total_flos": 0.0, + "train_loss": 0.07836547029586065, + "train_runtime": 22365.132, + "train_samples_per_second": 0.939, + "train_steps_per_second": 0.235 + } + ], + "logging_steps": 10, + "max_steps": 5250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}