{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 100, "global_step": 1910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.617801047120419e-08, "logits/chosen": -0.22574472427368164, "logits/rejected": -0.2384113073348999, "logps/chosen": -1586.180908203125, "logps/rejected": -1626.5421142578125, "loss": 0.0638, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 2.617801047120419e-07, "logits/chosen": -0.1639188826084137, "logits/rejected": -0.1851254105567932, "logps/chosen": -2052.12841796875, "logps/rejected": -1800.1533203125, "loss": 0.0588, "rewards/accuracies": 0.4513888955116272, "rewards/chosen": 6.274010956985876e-05, "rewards/margins": -1.1924101272597909e-05, "rewards/rejected": 7.466421811841428e-05, "step": 10 }, { "epoch": 0.01, "learning_rate": 5.235602094240838e-07, "logits/chosen": -0.21358470618724823, "logits/rejected": -0.1908903419971466, "logps/chosen": -2196.85498046875, "logps/rejected": -1773.3756103515625, "loss": 0.0627, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": 0.00044371531112119555, "rewards/margins": 9.080490417545661e-05, "rewards/rejected": 0.00035291039966978133, "step": 20 }, { "epoch": 0.02, "learning_rate": 7.853403141361258e-07, "logits/chosen": -0.2191818505525589, "logits/rejected": -0.22062306106090546, "logps/chosen": -2141.364501953125, "logps/rejected": -1710.662353515625, "loss": 0.0522, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0025672917254269123, "rewards/margins": 0.0005076726665720344, "rewards/rejected": 0.0020596194081008434, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.0471204188481676e-06, "logits/chosen": -0.2520692050457001, "logits/rejected": -0.22583802044391632, "logps/chosen": -2189.7646484375, "logps/rejected": -1715.2425537109375, "loss": 0.0495, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00766522204503417, "rewards/margins": 0.0016571322921663523, "rewards/rejected": 0.0060080899856984615, "step": 40 }, { "epoch": 0.03, "learning_rate": 1.3089005235602096e-06, "logits/chosen": -0.17123639583587646, "logits/rejected": -0.19555726647377014, "logps/chosen": -2526.5703125, "logps/rejected": -2165.141845703125, "loss": 0.0538, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.02015666291117668, "rewards/margins": 0.0033235768787562847, "rewards/rejected": 0.01683308556675911, "step": 50 }, { "epoch": 0.03, "learning_rate": 1.5706806282722515e-06, "logits/chosen": -0.18598869442939758, "logits/rejected": -0.20677652955055237, "logps/chosen": -2151.3115234375, "logps/rejected": -1970.6624755859375, "loss": 0.0505, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.029178302735090256, "rewards/margins": 0.0026255736593157053, "rewards/rejected": 0.026552731171250343, "step": 60 }, { "epoch": 0.04, "learning_rate": 1.8324607329842933e-06, "logits/chosen": -0.18310071527957916, "logits/rejected": -0.20503754913806915, "logps/chosen": -1844.6480712890625, "logps/rejected": -1762.2308349609375, "loss": 0.056, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.028074929490685463, "rewards/margins": 0.001591854146681726, "rewards/rejected": 0.026483073830604553, "step": 70 }, { "epoch": 0.04, "learning_rate": 2.094240837696335e-06, "logits/chosen": -0.22824080288410187, "logits/rejected": -0.24587313830852509, "logps/chosen": -1901.586181640625, "logps/rejected": -1624.0626220703125, "loss": 0.064, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.031114792451262474, "rewards/margins": 0.005912109278142452, "rewards/rejected": 0.025202685967087746, "step": 80 }, { "epoch": 0.05, "learning_rate": 2.356020942408377e-06, "logits/chosen": -0.2366272509098053, "logits/rejected": -0.22877153754234314, "logps/chosen": -1691.4013671875, "logps/rejected": -1524.5679931640625, "loss": 0.0481, "rewards/accuracies": 0.46875, "rewards/chosen": 0.035714153200387955, "rewards/margins": 0.0030426979064941406, "rewards/rejected": 0.032671455293893814, "step": 90 }, { "epoch": 0.05, "learning_rate": 2.617801047120419e-06, "logits/chosen": -0.22739839553833008, "logits/rejected": -0.24034900963306427, "logps/chosen": -2141.99365234375, "logps/rejected": -2006.7513427734375, "loss": 0.0519, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.038611847907304764, "rewards/margins": 0.0053280796855688095, "rewards/rejected": 0.033283766359090805, "step": 100 }, { "epoch": 0.05, "eval_logits/chosen": -0.25320005416870117, "eval_logits/rejected": -0.25199252367019653, "eval_logps/chosen": -2183.76953125, "eval_logps/rejected": -1849.702880859375, "eval_loss": 0.052377186715602875, "eval_rewards/accuracies": 0.5254999995231628, "eval_rewards/chosen": 0.03263631835579872, "eval_rewards/margins": 0.00592681672424078, "eval_rewards/rejected": 0.026709498837590218, "eval_runtime": 510.4972, "eval_samples_per_second": 3.918, "eval_steps_per_second": 0.979, "step": 100 }, { "epoch": 0.06, "learning_rate": 2.8795811518324613e-06, "logits/chosen": -0.2320372760295868, "logits/rejected": -0.27123022079467773, "logps/chosen": -1939.3607177734375, "logps/rejected": -1764.5439453125, "loss": 0.0502, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.027445796877145767, "rewards/margins": 0.00373500632122159, "rewards/rejected": 0.023710791021585464, "step": 110 }, { "epoch": 0.06, "learning_rate": 3.141361256544503e-06, "logits/chosen": -0.2857373058795929, "logits/rejected": -0.26925256848335266, "logps/chosen": -2433.180419921875, "logps/rejected": -2053.70361328125, "loss": 0.0785, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.03826409578323364, "rewards/margins": 0.007337054703384638, "rewards/rejected": 0.030927041545510292, "step": 120 }, { "epoch": 0.07, "learning_rate": 3.403141361256545e-06, "logits/chosen": -0.27496081590652466, "logits/rejected": -0.30028867721557617, "logps/chosen": -2130.792236328125, "logps/rejected": -1784.03125, "loss": 0.0549, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.05773577094078064, "rewards/margins": 0.011168297380208969, "rewards/rejected": 0.04656747728586197, "step": 130 }, { "epoch": 0.07, "learning_rate": 3.6649214659685865e-06, "logits/chosen": -0.31289300322532654, "logits/rejected": -0.31437715888023376, "logps/chosen": -2071.06982421875, "logps/rejected": -1879.8802490234375, "loss": 0.055, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.07026473432779312, "rewards/margins": 0.007077778223901987, "rewards/rejected": 0.06318695098161697, "step": 140 }, { "epoch": 0.08, "learning_rate": 3.926701570680629e-06, "logits/chosen": -0.29269808530807495, "logits/rejected": -0.3180951476097107, "logps/chosen": -2014.0640869140625, "logps/rejected": -1808.185302734375, "loss": 0.0543, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.060369331389665604, "rewards/margins": 0.007043222431093454, "rewards/rejected": 0.05332610756158829, "step": 150 }, { "epoch": 0.08, "learning_rate": 4.18848167539267e-06, "logits/chosen": -0.2773135304450989, "logits/rejected": -0.2673946022987366, "logps/chosen": -2283.48779296875, "logps/rejected": -1938.6422119140625, "loss": 0.0524, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.05114240199327469, "rewards/margins": 0.008266921155154705, "rewards/rejected": 0.04287547618150711, "step": 160 }, { "epoch": 0.09, "learning_rate": 4.450261780104713e-06, "logits/chosen": -0.2700185179710388, "logits/rejected": -0.26662972569465637, "logps/chosen": -2404.58984375, "logps/rejected": -1977.1859130859375, "loss": 0.0624, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.05304870009422302, "rewards/margins": 0.011285845190286636, "rewards/rejected": 0.04176285117864609, "step": 170 }, { "epoch": 0.09, "learning_rate": 4.712041884816754e-06, "logits/chosen": -0.2975671887397766, "logits/rejected": -0.2988983690738678, "logps/chosen": -2047.671630859375, "logps/rejected": -1742.282470703125, "loss": 0.0418, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.05823253467679024, "rewards/margins": 0.01046661101281643, "rewards/rejected": 0.04776592180132866, "step": 180 }, { "epoch": 0.1, "learning_rate": 4.9738219895287965e-06, "logits/chosen": -0.2745932936668396, "logits/rejected": -0.2855191230773926, "logps/chosen": -2184.26220703125, "logps/rejected": -1788.6656494140625, "loss": 0.0408, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.06112230569124222, "rewards/margins": 0.012786999344825745, "rewards/rejected": 0.04833530634641647, "step": 190 }, { "epoch": 0.1, "learning_rate": 4.999661831436499e-06, "logits/chosen": -0.27325528860092163, "logits/rejected": -0.2756146490573883, "logps/chosen": -2187.59130859375, "logps/rejected": -2025.250732421875, "loss": 0.0379, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.05468825250864029, "rewards/margins": 0.006374381482601166, "rewards/rejected": 0.04831386357545853, "step": 200 }, { "epoch": 0.1, "eval_logits/chosen": -0.27396515011787415, "eval_logits/rejected": -0.2760486304759979, "eval_logps/chosen": -2172.962890625, "eval_logps/rejected": -1842.2476806640625, "eval_loss": 0.051403772085905075, "eval_rewards/accuracies": 0.5389999747276306, "eval_rewards/chosen": 0.043442659080028534, "eval_rewards/margins": 0.009277699515223503, "eval_rewards/rejected": 0.03416495770215988, "eval_runtime": 510.5925, "eval_samples_per_second": 3.917, "eval_steps_per_second": 0.979, "step": 200 }, { "epoch": 0.11, "learning_rate": 4.9984929711403395e-06, "logits/chosen": -0.24565927684307098, "logits/rejected": -0.24346761405467987, "logps/chosen": -2105.339111328125, "logps/rejected": -1993.477294921875, "loss": 0.0456, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.03817785158753395, "rewards/margins": 0.0046168239787220955, "rewards/rejected": 0.03356102854013443, "step": 210 }, { "epoch": 0.12, "learning_rate": 4.996489634487865e-06, "logits/chosen": -0.2854730486869812, "logits/rejected": -0.27373185753822327, "logps/chosen": -2071.35595703125, "logps/rejected": -1617.6314697265625, "loss": 0.0471, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.03769981488585472, "rewards/margins": 0.012330549769103527, "rewards/rejected": 0.02536926604807377, "step": 220 }, { "epoch": 0.12, "learning_rate": 4.9936524905772466e-06, "logits/chosen": -0.2610529661178589, "logits/rejected": -0.28053849935531616, "logps/chosen": -1956.2564697265625, "logps/rejected": -1615.5814208984375, "loss": 0.0735, "rewards/accuracies": 0.53125, "rewards/chosen": 0.04565655067563057, "rewards/margins": 0.011204726994037628, "rewards/rejected": 0.03445183113217354, "step": 230 }, { "epoch": 0.13, "learning_rate": 4.9899824869915e-06, "logits/chosen": -0.24108798801898956, "logits/rejected": -0.2399587333202362, "logps/chosen": -1775.907470703125, "logps/rejected": -1713.854736328125, "loss": 0.0715, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.03222992643713951, "rewards/margins": 0.008830582723021507, "rewards/rejected": 0.023399341851472855, "step": 240 }, { "epoch": 0.13, "learning_rate": 4.985480849482012e-06, "logits/chosen": -0.20024847984313965, "logits/rejected": -0.22306282818317413, "logps/chosen": -2255.089599609375, "logps/rejected": -1934.8642578125, "loss": 0.0577, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.030694425106048584, "rewards/margins": 0.01082837488502264, "rewards/rejected": 0.01986604928970337, "step": 250 }, { "epoch": 0.14, "learning_rate": 4.980149081559142e-06, "logits/chosen": -0.21732480823993683, "logits/rejected": -0.24718734622001648, "logps/chosen": -1957.7998046875, "logps/rejected": -1881.0550537109375, "loss": 0.056, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.05905503034591675, "rewards/margins": 0.0030602319166064262, "rewards/rejected": 0.05599479004740715, "step": 260 }, { "epoch": 0.14, "learning_rate": 4.9739889639900655e-06, "logits/chosen": -0.24414131045341492, "logits/rejected": -0.22118325531482697, "logps/chosen": -1925.445556640625, "logps/rejected": -1909.0667724609375, "loss": 0.0539, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.05300917103886604, "rewards/margins": 0.006324948277324438, "rewards/rejected": 0.04668421670794487, "step": 270 }, { "epoch": 0.15, "learning_rate": 4.967002554204009e-06, "logits/chosen": -0.25582337379455566, "logits/rejected": -0.2471769154071808, "logps/chosen": -2269.031982421875, "logps/rejected": -2033.6907958984375, "loss": 0.0621, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.01642546057701111, "rewards/margins": 0.0032355361618101597, "rewards/rejected": 0.013189923949539661, "step": 280 }, { "epoch": 0.15, "learning_rate": 4.959192185605089e-06, "logits/chosen": -0.30079659819602966, "logits/rejected": -0.28022244572639465, "logps/chosen": -1992.6142578125, "logps/rejected": -1820.0687255859375, "loss": 0.0584, "rewards/accuracies": 0.53125, "rewards/chosen": 0.02403336763381958, "rewards/margins": 0.00603306433185935, "rewards/rejected": 0.018000302836298943, "step": 290 }, { "epoch": 0.16, "learning_rate": 4.950560466792969e-06, "logits/chosen": -0.28634804487228394, "logits/rejected": -0.2918199896812439, "logps/chosen": -2390.38623046875, "logps/rejected": -1984.9703369140625, "loss": 0.0425, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.035278573632240295, "rewards/margins": 0.012735734693706036, "rewards/rejected": 0.022542843595147133, "step": 300 }, { "epoch": 0.16, "eval_logits/chosen": -0.29014500975608826, "eval_logits/rejected": -0.28990820050239563, "eval_logps/chosen": -2182.04541015625, "eval_logps/rejected": -1851.862060546875, "eval_loss": 0.05131419003009796, "eval_rewards/accuracies": 0.5630000233650208, "eval_rewards/chosen": 0.03436028212308884, "eval_rewards/margins": 0.009809814393520355, "eval_rewards/rejected": 0.02455046772956848, "eval_runtime": 510.7215, "eval_samples_per_second": 3.916, "eval_steps_per_second": 0.979, "step": 300 }, { "epoch": 0.16, "learning_rate": 4.9411102806916185e-06, "logits/chosen": -0.2964246869087219, "logits/rejected": -0.3249427080154419, "logps/chosen": -2153.874267578125, "logps/rejected": -1754.1324462890625, "loss": 0.0521, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.04025361314415932, "rewards/margins": 0.009616317227482796, "rewards/rejected": 0.03063729964196682, "step": 310 }, { "epoch": 0.17, "learning_rate": 4.930844783586424e-06, "logits/chosen": -0.26167505979537964, "logits/rejected": -0.2782900929450989, "logps/chosen": -2090.10986328125, "logps/rejected": -1866.400146484375, "loss": 0.0581, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.044223010540008545, "rewards/margins": 0.0066053010523319244, "rewards/rejected": 0.03761770576238632, "step": 320 }, { "epoch": 0.17, "learning_rate": 4.919767404070033e-06, "logits/chosen": -0.2866571545600891, "logits/rejected": -0.2904338836669922, "logps/chosen": -2089.8603515625, "logps/rejected": -1703.691650390625, "loss": 0.0577, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0637887567281723, "rewards/margins": 0.019382018595933914, "rewards/rejected": 0.04440673440694809, "step": 330 }, { "epoch": 0.18, "learning_rate": 4.907881841897216e-06, "logits/chosen": -0.2776980698108673, "logits/rejected": -0.2663383185863495, "logps/chosen": -1941.0628662109375, "logps/rejected": -1724.725830078125, "loss": 0.057, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.07600688189268112, "rewards/margins": 0.014780363067984581, "rewards/rejected": 0.06122652441263199, "step": 340 }, { "epoch": 0.18, "learning_rate": 4.89519206674919e-06, "logits/chosen": -0.28663453459739685, "logits/rejected": -0.2781517803668976, "logps/chosen": -2123.11865234375, "logps/rejected": -1684.65625, "loss": 0.0578, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.07864506542682648, "rewards/margins": 0.023925408720970154, "rewards/rejected": 0.05471965670585632, "step": 350 }, { "epoch": 0.19, "learning_rate": 4.881702316907769e-06, "logits/chosen": -0.2786110043525696, "logits/rejected": -0.2901211082935333, "logps/chosen": -2082.64208984375, "logps/rejected": -1863.649169921875, "loss": 0.067, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.052565790712833405, "rewards/margins": 0.014038707129657269, "rewards/rejected": 0.038527075201272964, "step": 360 }, { "epoch": 0.19, "learning_rate": 4.86741709783982e-06, "logits/chosen": -0.34627729654312134, "logits/rejected": -0.33580657839775085, "logps/chosen": -1979.3060302734375, "logps/rejected": -1685.088134765625, "loss": 0.0563, "rewards/accuracies": 0.5625, "rewards/chosen": 0.017150847241282463, "rewards/margins": 0.008625769056379795, "rewards/rejected": 0.008525079116225243, "step": 370 }, { "epoch": 0.2, "learning_rate": 4.852341180692471e-06, "logits/chosen": -0.28853368759155273, "logits/rejected": -0.33309391140937805, "logps/chosen": -2051.138671875, "logps/rejected": -1604.300537109375, "loss": 0.0629, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.03238735720515251, "rewards/margins": 0.011523480527102947, "rewards/rejected": 0.02086387760937214, "step": 380 }, { "epoch": 0.2, "learning_rate": 4.836479600699579e-06, "logits/chosen": -0.2653834819793701, "logits/rejected": -0.27924028038978577, "logps/chosen": -2167.791748046875, "logps/rejected": -1883.7181396484375, "loss": 0.0567, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.07641658931970596, "rewards/margins": 0.014170339331030846, "rewards/rejected": 0.06224624067544937, "step": 390 }, { "epoch": 0.21, "learning_rate": 4.819837655500014e-06, "logits/chosen": -0.23558492958545685, "logits/rejected": -0.252250611782074, "logps/chosen": -2008.2281494140625, "logps/rejected": -1735.037109375, "loss": 0.0522, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.08481944352388382, "rewards/margins": 0.018725356087088585, "rewards/rejected": 0.06609407812356949, "step": 400 }, { "epoch": 0.21, "eval_logits/chosen": -0.26833415031433105, "eval_logits/rejected": -0.27769944071769714, "eval_logps/chosen": -2134.577880859375, "eval_logps/rejected": -1810.503662109375, "eval_loss": 0.052033666521310806, "eval_rewards/accuracies": 0.5249999761581421, "eval_rewards/chosen": 0.08182776719331741, "eval_rewards/margins": 0.01591898687183857, "eval_rewards/rejected": 0.06590878218412399, "eval_runtime": 510.467, "eval_samples_per_second": 3.918, "eval_steps_per_second": 0.979, "step": 400 }, { "epoch": 0.21, "learning_rate": 4.802420903368286e-06, "logits/chosen": -0.22416555881500244, "logits/rejected": -0.23775295913219452, "logps/chosen": -2305.072265625, "logps/rejected": -2017.150390625, "loss": 0.055, "rewards/accuracies": 0.46875, "rewards/chosen": 0.07892463356256485, "rewards/margins": 0.010944006033241749, "rewards/rejected": 0.06798062473535538, "step": 410 }, { "epoch": 0.22, "learning_rate": 4.784235161358124e-06, "logits/chosen": -0.24204190075397491, "logits/rejected": -0.24225695431232452, "logps/chosen": -1825.3125, "logps/rejected": -1693.0045166015625, "loss": 0.0497, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.05524778366088867, "rewards/margins": 0.006465147249400616, "rewards/rejected": 0.048782628029584885, "step": 420 }, { "epoch": 0.23, "learning_rate": 4.765286503359632e-06, "logits/chosen": -0.23344504833221436, "logits/rejected": -0.27365198731422424, "logps/chosen": -2049.459716796875, "logps/rejected": -1840.787841796875, "loss": 0.0565, "rewards/accuracies": 0.53125, "rewards/chosen": 0.04559114947915077, "rewards/margins": 0.006585550494492054, "rewards/rejected": 0.039005596190690994, "step": 430 }, { "epoch": 0.23, "learning_rate": 4.745581258070654e-06, "logits/chosen": -0.27591726183891296, "logits/rejected": -0.2608277499675751, "logps/chosen": -1806.8870849609375, "logps/rejected": -1811.4437255859375, "loss": 0.0541, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.033870596438646317, "rewards/margins": 0.0030527892522513866, "rewards/rejected": 0.030817802995443344, "step": 440 }, { "epoch": 0.24, "learning_rate": 4.725126006883047e-06, "logits/chosen": -0.2728896141052246, "logits/rejected": -0.2633044123649597, "logps/chosen": -2298.3818359375, "logps/rejected": -2048.328125, "loss": 0.052, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.027686957269906998, "rewards/margins": 0.0040281787514686584, "rewards/rejected": 0.02365877851843834, "step": 450 }, { "epoch": 0.24, "learning_rate": 4.70392758168454e-06, "logits/chosen": -0.2538016438484192, "logits/rejected": -0.25012341141700745, "logps/chosen": -2255.5146484375, "logps/rejected": -1954.8531494140625, "loss": 0.0536, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.03767388314008713, "rewards/margins": 0.008208373561501503, "rewards/rejected": 0.029465511441230774, "step": 460 }, { "epoch": 0.25, "learning_rate": 4.68199306257695e-06, "logits/chosen": -0.2599068284034729, "logits/rejected": -0.26421061158180237, "logps/chosen": -2130.776123046875, "logps/rejected": -1925.4456787109375, "loss": 0.0521, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.05753815174102783, "rewards/margins": 0.011079727672040462, "rewards/rejected": 0.04645842686295509, "step": 470 }, { "epoch": 0.25, "learning_rate": 4.659329775511478e-06, "logits/chosen": -0.27710121870040894, "logits/rejected": -0.2857569754123688, "logps/chosen": -2018.772705078125, "logps/rejected": -1903.8472900390625, "loss": 0.0537, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.07305508106946945, "rewards/margins": 0.009427006356418133, "rewards/rejected": 0.06362807750701904, "step": 480 }, { "epoch": 0.26, "learning_rate": 4.635945289841902e-06, "logits/chosen": -0.28116849064826965, "logits/rejected": -0.2983720004558563, "logps/chosen": -1921.1497802734375, "logps/rejected": -1723.8843994140625, "loss": 0.0443, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0624106340110302, "rewards/margins": 0.009529463946819305, "rewards/rejected": 0.05288117378950119, "step": 490 }, { "epoch": 0.26, "learning_rate": 4.611847415796476e-06, "logits/chosen": -0.27481353282928467, "logits/rejected": -0.29158735275268555, "logps/chosen": -2325.54345703125, "logps/rejected": -2043.1536865234375, "loss": 0.0559, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0638991966843605, "rewards/margins": 0.01127773616462946, "rewards/rejected": 0.052621446549892426, "step": 500 }, { "epoch": 0.26, "eval_logits/chosen": -0.29030030965805054, "eval_logits/rejected": -0.29912662506103516, "eval_logps/chosen": -2155.4169921875, "eval_logps/rejected": -1828.8736572265625, "eval_loss": 0.05023103952407837, "eval_rewards/accuracies": 0.5625, "eval_rewards/chosen": 0.060988761484622955, "eval_rewards/margins": 0.013449816033244133, "eval_rewards/rejected": 0.04753894358873367, "eval_runtime": 510.5382, "eval_samples_per_second": 3.917, "eval_steps_per_second": 0.979, "step": 500 }, { "epoch": 0.27, "learning_rate": 4.587044201869378e-06, "logits/chosen": -0.2749403417110443, "logits/rejected": -0.28757306933403015, "logps/chosen": -2167.8203125, "logps/rejected": -1664.1771240234375, "loss": 0.0518, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.06645651906728745, "rewards/margins": 0.021903514862060547, "rewards/rejected": 0.0445530042052269, "step": 510 }, { "epoch": 0.27, "learning_rate": 4.561543932132574e-06, "logits/chosen": -0.3093597888946533, "logits/rejected": -0.3130527138710022, "logps/chosen": -2028.697509765625, "logps/rejected": -1775.0302734375, "loss": 0.0559, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.07832999527454376, "rewards/margins": 0.014924841932952404, "rewards/rejected": 0.06340514868497849, "step": 520 }, { "epoch": 0.28, "learning_rate": 4.535355123469009e-06, "logits/chosen": -0.32513946294784546, "logits/rejected": -0.34443390369415283, "logps/chosen": -2135.48974609375, "logps/rejected": -1824.90625, "loss": 0.0565, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.06139357015490532, "rewards/margins": 0.012111430056393147, "rewards/rejected": 0.04928214102983475, "step": 530 }, { "epoch": 0.28, "learning_rate": 4.508486522728037e-06, "logits/chosen": -0.34302735328674316, "logits/rejected": -0.36917632818222046, "logps/chosen": -2007.6627197265625, "logps/rejected": -1699.0699462890625, "loss": 0.0676, "rewards/accuracies": 0.53125, "rewards/chosen": 0.04311789572238922, "rewards/margins": 0.012735480442643166, "rewards/rejected": 0.030382419005036354, "step": 540 }, { "epoch": 0.29, "learning_rate": 4.480947103804044e-06, "logits/chosen": -0.35971927642822266, "logits/rejected": -0.36432451009750366, "logps/chosen": -2163.0068359375, "logps/rejected": -2066.22509765625, "loss": 0.0428, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.04490477591753006, "rewards/margins": 0.006312023848295212, "rewards/rejected": 0.03859275206923485, "step": 550 }, { "epoch": 0.29, "learning_rate": 4.452746064639239e-06, "logits/chosen": -0.38384127616882324, "logits/rejected": -0.3922134339809418, "logps/chosen": -2226.274658203125, "logps/rejected": -1989.887451171875, "loss": 0.0582, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.057973384857177734, "rewards/margins": 0.015295244753360748, "rewards/rejected": 0.042678140103816986, "step": 560 }, { "epoch": 0.3, "learning_rate": 4.423892824151617e-06, "logits/chosen": -0.37657466530799866, "logits/rejected": -0.38766008615493774, "logps/chosen": -1836.3118896484375, "logps/rejected": -1472.295654296875, "loss": 0.0701, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.07644511014223099, "rewards/margins": 0.02132570371031761, "rewards/rejected": 0.05511941760778427, "step": 570 }, { "epoch": 0.3, "learning_rate": 4.3943970190891164e-06, "logits/chosen": -0.37011387944221497, "logits/rejected": -0.42118391394615173, "logps/chosen": -2419.860107421875, "logps/rejected": -1769.7777099609375, "loss": 0.0626, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.12023104727268219, "rewards/margins": 0.03250167518854141, "rewards/rejected": 0.08772937208414078, "step": 580 }, { "epoch": 0.31, "learning_rate": 4.364268500811025e-06, "logits/chosen": -0.35418859124183655, "logits/rejected": -0.37661364674568176, "logps/chosen": -1887.2279052734375, "logps/rejected": -1624.3062744140625, "loss": 0.072, "rewards/accuracies": 0.5625, "rewards/chosen": 0.08441803604364395, "rewards/margins": 0.01872970722615719, "rewards/rejected": 0.06568832695484161, "step": 590 }, { "epoch": 0.31, "learning_rate": 4.333517331997704e-06, "logits/chosen": -0.36238303780555725, "logits/rejected": -0.36792057752609253, "logps/chosen": -1933.2572021484375, "logps/rejected": -1661.876953125, "loss": 0.0546, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.06127943471074104, "rewards/margins": 0.016731832176446915, "rewards/rejected": 0.044547609984874725, "step": 600 }, { "epoch": 0.31, "eval_logits/chosen": -0.37191054224967957, "eval_logits/rejected": -0.38397690653800964, "eval_logps/chosen": -2167.108642578125, "eval_logps/rejected": -1839.52685546875, "eval_loss": 0.05038134753704071, "eval_rewards/accuracies": 0.5525000095367432, "eval_rewards/chosen": 0.049297019839286804, "eval_rewards/margins": 0.01241131592541933, "eval_rewards/rejected": 0.03688570857048035, "eval_runtime": 510.5837, "eval_samples_per_second": 3.917, "eval_steps_per_second": 0.979, "step": 600 }, { "epoch": 0.32, "learning_rate": 4.302153783289737e-06, "logits/chosen": -0.3634631633758545, "logits/rejected": -0.37499555945396423, "logps/chosen": -2023.001220703125, "logps/rejected": -1739.4332275390625, "loss": 0.0544, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.041168130934238434, "rewards/margins": 0.010140376165509224, "rewards/rejected": 0.03102775290608406, "step": 610 }, { "epoch": 0.32, "learning_rate": 4.270188329857613e-06, "logits/chosen": -0.3298744261264801, "logits/rejected": -0.32282137870788574, "logps/chosen": -2020.5091552734375, "logps/rejected": -1689.3531494140625, "loss": 0.047, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.050172846764326096, "rewards/margins": 0.009077770635485649, "rewards/rejected": 0.041095077991485596, "step": 620 }, { "epoch": 0.33, "learning_rate": 4.237631647903115e-06, "logits/chosen": -0.321160227060318, "logits/rejected": -0.34205105900764465, "logps/chosen": -1793.309326171875, "logps/rejected": -1498.567626953125, "loss": 0.049, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.0489073321223259, "rewards/margins": 0.012575352564454079, "rewards/rejected": 0.03633198142051697, "step": 630 }, { "epoch": 0.33, "learning_rate": 4.204494611093548e-06, "logits/chosen": -0.32717442512512207, "logits/rejected": -0.34008845686912537, "logps/chosen": -1978.6207275390625, "logps/rejected": -1785.7669677734375, "loss": 0.0617, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 0.05437788367271423, "rewards/margins": 0.005566168110817671, "rewards/rejected": 0.048811715096235275, "step": 640 }, { "epoch": 0.34, "learning_rate": 4.170788286930024e-06, "logits/chosen": -0.3271678388118744, "logits/rejected": -0.3383072018623352, "logps/chosen": -2002.5355224609375, "logps/rejected": -1623.6373291015625, "loss": 0.0429, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.056448131799697876, "rewards/margins": 0.013460059650242329, "rewards/rejected": 0.04298807680606842, "step": 650 }, { "epoch": 0.35, "learning_rate": 4.136523933051005e-06, "logits/chosen": -0.28324219584465027, "logits/rejected": -0.2753041982650757, "logps/chosen": -1772.493896484375, "logps/rejected": -1581.4808349609375, "loss": 0.047, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.04456415772438049, "rewards/margins": 0.007894165813922882, "rewards/rejected": 0.03666999563574791, "step": 660 }, { "epoch": 0.35, "learning_rate": 4.101712993472348e-06, "logits/chosen": -0.286260187625885, "logits/rejected": -0.3045397698879242, "logps/chosen": -1830.4456787109375, "logps/rejected": -1603.759521484375, "loss": 0.0541, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.05162501335144043, "rewards/margins": 0.011338387615978718, "rewards/rejected": 0.040286630392074585, "step": 670 }, { "epoch": 0.36, "learning_rate": 4.066367094765091e-06, "logits/chosen": -0.2880704998970032, "logits/rejected": -0.2942127585411072, "logps/chosen": -2038.3916015625, "logps/rejected": -1857.885498046875, "loss": 0.0472, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.06582482159137726, "rewards/margins": 0.009796356782317162, "rewards/rejected": 0.05602846294641495, "step": 680 }, { "epoch": 0.36, "learning_rate": 4.030498042172277e-06, "logits/chosen": -0.29781144857406616, "logits/rejected": -0.3116939663887024, "logps/chosen": -2132.72802734375, "logps/rejected": -1934.0364990234375, "loss": 0.0439, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.06504924595355988, "rewards/margins": 0.00774806085973978, "rewards/rejected": 0.05730118602514267, "step": 690 }, { "epoch": 0.37, "learning_rate": 3.994117815666095e-06, "logits/chosen": -0.3007664084434509, "logits/rejected": -0.29853954911231995, "logps/chosen": -1988.636962890625, "logps/rejected": -1707.418212890625, "loss": 0.0443, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.05466890335083008, "rewards/margins": 0.013832475058734417, "rewards/rejected": 0.040836431086063385, "step": 700 }, { "epoch": 0.37, "eval_logits/chosen": -0.3144506812095642, "eval_logits/rejected": -0.3237921893596649, "eval_logps/chosen": -2163.694091796875, "eval_logps/rejected": -1836.1396484375, "eval_loss": 0.05007108300924301, "eval_rewards/accuracies": 0.5669999718666077, "eval_rewards/chosen": 0.052711814641952515, "eval_rewards/margins": 0.012439063750207424, "eval_rewards/rejected": 0.04027275741100311, "eval_runtime": 510.3528, "eval_samples_per_second": 3.919, "eval_steps_per_second": 0.98, "step": 700 }, { "epoch": 0.37, "learning_rate": 3.957238565946672e-06, "logits/chosen": -0.28171759843826294, "logits/rejected": -0.3016406297683716, "logps/chosen": -1951.7197265625, "logps/rejected": -1821.9302978515625, "loss": 0.0746, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.046342235058546066, "rewards/margins": 0.00639796257019043, "rewards/rejected": 0.03994427248835564, "step": 710 }, { "epoch": 0.38, "learning_rate": 3.919872610383831e-06, "logits/chosen": -0.30082041025161743, "logits/rejected": -0.3195782005786896, "logps/chosen": -2009.2193603515625, "logps/rejected": -1790.225830078125, "loss": 0.0684, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.04109364002943039, "rewards/margins": 0.007526120636612177, "rewards/rejected": 0.03356752544641495, "step": 720 }, { "epoch": 0.38, "learning_rate": 3.882032428903195e-06, "logits/chosen": -0.3266572058200836, "logits/rejected": -0.3410620093345642, "logps/chosen": -2097.94140625, "logps/rejected": -1642.9635009765625, "loss": 0.0475, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.06069540977478027, "rewards/margins": 0.01798270270228386, "rewards/rejected": 0.042712707072496414, "step": 730 }, { "epoch": 0.39, "learning_rate": 3.84373065981799e-06, "logits/chosen": -0.29377710819244385, "logits/rejected": -0.2976624369621277, "logps/chosen": -2122.676513671875, "logps/rejected": -1956.9495849609375, "loss": 0.0456, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0653495341539383, "rewards/margins": 0.013613177463412285, "rewards/rejected": 0.051736362278461456, "step": 740 }, { "epoch": 0.39, "learning_rate": 3.8049800956079552e-06, "logits/chosen": -0.33634868264198303, "logits/rejected": -0.3460080027580261, "logps/chosen": -1977.577392578125, "logps/rejected": -1716.836669921875, "loss": 0.0618, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.05900438502430916, "rewards/margins": 0.01787043735384941, "rewards/rejected": 0.04113394767045975, "step": 750 }, { "epoch": 0.4, "learning_rate": 3.765793678646753e-06, "logits/chosen": -0.3246403634548187, "logits/rejected": -0.3240343928337097, "logps/chosen": -2022.0374755859375, "logps/rejected": -1934.3929443359375, "loss": 0.0499, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.056883443146944046, "rewards/margins": 0.010135297663509846, "rewards/rejected": 0.046748142689466476, "step": 760 }, { "epoch": 0.4, "learning_rate": 3.726184496879323e-06, "logits/chosen": -0.32194751501083374, "logits/rejected": -0.3437530994415283, "logps/chosen": -2066.994873046875, "logps/rejected": -1785.517333984375, "loss": 0.0618, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.06701908260583878, "rewards/margins": 0.017347043380141258, "rewards/rejected": 0.04967203736305237, "step": 770 }, { "epoch": 0.41, "learning_rate": 3.686165779450619e-06, "logits/chosen": -0.32135313749313354, "logits/rejected": -0.33263832330703735, "logps/chosen": -2046.4375, "logps/rejected": -1752.5133056640625, "loss": 0.0629, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.07863454520702362, "rewards/margins": 0.013034949079155922, "rewards/rejected": 0.06559960544109344, "step": 780 }, { "epoch": 0.41, "learning_rate": 3.645750892287178e-06, "logits/chosen": -0.30609697103500366, "logits/rejected": -0.3328899145126343, "logps/chosen": -2209.924560546875, "logps/rejected": -1803.526123046875, "loss": 0.06, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.08860823512077332, "rewards/margins": 0.02287045121192932, "rewards/rejected": 0.065737783908844, "step": 790 }, { "epoch": 0.42, "learning_rate": 3.604953333633009e-06, "logits/chosen": -0.301249623298645, "logits/rejected": -0.3167082369327545, "logps/chosen": -1958.112548828125, "logps/rejected": -1749.8187255859375, "loss": 0.0583, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.0594819113612175, "rewards/margins": 0.00925761554390192, "rewards/rejected": 0.050224293023347855, "step": 800 }, { "epoch": 0.42, "eval_logits/chosen": -0.2990359365940094, "eval_logits/rejected": -0.3079277575016022, "eval_logps/chosen": -2160.533447265625, "eval_logps/rejected": -1833.001220703125, "eval_loss": 0.05018917843699455, "eval_rewards/accuracies": 0.5625, "eval_rewards/chosen": 0.055872511118650436, "eval_rewards/margins": 0.012461244128644466, "eval_rewards/rejected": 0.043411269783973694, "eval_runtime": 510.4542, "eval_samples_per_second": 3.918, "eval_steps_per_second": 0.98, "step": 800 }, { "epoch": 0.42, "learning_rate": 3.56378672954129e-06, "logits/chosen": -0.2567403316497803, "logits/rejected": -0.3088562786579132, "logps/chosen": -1969.7041015625, "logps/rejected": -1587.13134765625, "loss": 0.0589, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.05050656199455261, "rewards/margins": 0.01574171707034111, "rewards/rejected": 0.0347648449242115, "step": 810 }, { "epoch": 0.43, "learning_rate": 3.5222648293233806e-06, "logits/chosen": -0.3206945061683655, "logits/rejected": -0.32324275374412537, "logps/chosen": -2125.584228515625, "logps/rejected": -1908.9595947265625, "loss": 0.0508, "rewards/accuracies": 0.5625, "rewards/chosen": 0.07730694115161896, "rewards/margins": 0.020585492253303528, "rewards/rejected": 0.05672144889831543, "step": 820 }, { "epoch": 0.43, "learning_rate": 3.4804015009566573e-06, "logits/chosen": -0.30177921056747437, "logits/rejected": -0.30555492639541626, "logps/chosen": -2047.0084228515625, "logps/rejected": -1866.709228515625, "loss": 0.0529, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.080367811024189, "rewards/margins": 0.01644848845899105, "rewards/rejected": 0.0639193207025528, "step": 830 }, { "epoch": 0.44, "learning_rate": 3.4382107264527244e-06, "logits/chosen": -0.2914479076862335, "logits/rejected": -0.3034920394420624, "logps/chosen": -2094.360595703125, "logps/rejected": -1812.98046875, "loss": 0.0469, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.08005829900503159, "rewards/margins": 0.010312746278941631, "rewards/rejected": 0.06974555552005768, "step": 840 }, { "epoch": 0.44, "learning_rate": 3.3957065971875387e-06, "logits/chosen": -0.3109249472618103, "logits/rejected": -0.32668763399124146, "logps/chosen": -2224.466796875, "logps/rejected": -1824.3785400390625, "loss": 0.0493, "rewards/accuracies": 0.53125, "rewards/chosen": 0.06907899677753448, "rewards/margins": 0.012219742871820927, "rewards/rejected": 0.056859247386455536, "step": 850 }, { "epoch": 0.45, "learning_rate": 3.352903309194999e-06, "logits/chosen": -0.29552769660949707, "logits/rejected": -0.30279669165611267, "logps/chosen": -2010.127685546875, "logps/rejected": -1726.2581787109375, "loss": 0.0523, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.05864205211400986, "rewards/margins": 0.011820727959275246, "rewards/rejected": 0.04682133346796036, "step": 860 }, { "epoch": 0.46, "learning_rate": 3.309815158425591e-06, "logits/chosen": -0.30413001775741577, "logits/rejected": -0.317624032497406, "logps/chosen": -2200.095947265625, "logps/rejected": -1815.937744140625, "loss": 0.0634, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.07513566315174103, "rewards/margins": 0.01764606684446335, "rewards/rejected": 0.057489603757858276, "step": 870 }, { "epoch": 0.46, "learning_rate": 3.266456535971654e-06, "logits/chosen": -0.2624972462654114, "logits/rejected": -0.28810930252075195, "logps/chosen": -2114.169189453125, "logps/rejected": -1792.790771484375, "loss": 0.0522, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0786682516336441, "rewards/margins": 0.01509961299598217, "rewards/rejected": 0.06356863677501678, "step": 880 }, { "epoch": 0.47, "learning_rate": 3.2228419232608692e-06, "logits/chosen": -0.2368161380290985, "logits/rejected": -0.24519118666648865, "logps/chosen": -1967.4462890625, "logps/rejected": -1798.807373046875, "loss": 0.0492, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.07654932141304016, "rewards/margins": 0.007422330789268017, "rewards/rejected": 0.06912699341773987, "step": 890 }, { "epoch": 0.47, "learning_rate": 3.1789858872195888e-06, "logits/chosen": -0.21885935962200165, "logits/rejected": -0.24373655021190643, "logps/chosen": -2283.5390625, "logps/rejected": -1840.8265380859375, "loss": 0.0432, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.09643899649381638, "rewards/margins": 0.018616409972310066, "rewards/rejected": 0.07782258838415146, "step": 900 }, { "epoch": 0.47, "eval_logits/chosen": -0.24547961354255676, "eval_logits/rejected": -0.25286465883255005, "eval_logps/chosen": -2129.181884765625, "eval_logps/rejected": -1806.208740234375, "eval_loss": 0.050007544457912445, "eval_rewards/accuracies": 0.5485000014305115, "eval_rewards/chosen": 0.0872238427400589, "eval_rewards/margins": 0.017020048573613167, "eval_rewards/rejected": 0.07020379602909088, "eval_runtime": 510.5362, "eval_samples_per_second": 3.917, "eval_steps_per_second": 0.979, "step": 900 }, { "epoch": 0.48, "learning_rate": 3.1349030754075945e-06, "logits/chosen": -0.22288069128990173, "logits/rejected": -0.2447211742401123, "logps/chosen": -2140.885498046875, "logps/rejected": -1654.9674072265625, "loss": 0.0626, "rewards/accuracies": 0.59375, "rewards/chosen": 0.09542791545391083, "rewards/margins": 0.02674751542508602, "rewards/rejected": 0.06868041306734085, "step": 910 }, { "epoch": 0.48, "learning_rate": 3.0906082111259313e-06, "logits/chosen": -0.2237463891506195, "logits/rejected": -0.2500147521495819, "logps/chosen": -2403.999267578125, "logps/rejected": -1815.796142578125, "loss": 0.0436, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.09730223566293716, "rewards/margins": 0.025161966681480408, "rewards/rejected": 0.07214026153087616, "step": 920 }, { "epoch": 0.49, "learning_rate": 3.046116088499449e-06, "logits/chosen": -0.2534050941467285, "logits/rejected": -0.27334827184677124, "logps/chosen": -2099.66064453125, "logps/rejected": -1671.605712890625, "loss": 0.0409, "rewards/accuracies": 0.5625, "rewards/chosen": 0.09167732298374176, "rewards/margins": 0.01841827854514122, "rewards/rejected": 0.07325904071331024, "step": 930 }, { "epoch": 0.49, "learning_rate": 3.0014415675356813e-06, "logits/chosen": -0.24992087483406067, "logits/rejected": -0.2547626495361328, "logps/chosen": -2133.786376953125, "logps/rejected": -1852.0260009765625, "loss": 0.0401, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.09754703938961029, "rewards/margins": 0.015503397211432457, "rewards/rejected": 0.08204366266727448, "step": 940 }, { "epoch": 0.5, "learning_rate": 2.9565995691617242e-06, "logits/chosen": -0.23162353038787842, "logits/rejected": -0.24423262476921082, "logps/chosen": -1878.375732421875, "logps/rejected": -1515.8773193359375, "loss": 0.047, "rewards/accuracies": 0.5625, "rewards/chosen": 0.07727902382612228, "rewards/margins": 0.018643613904714584, "rewards/rejected": 0.058635413646698, "step": 950 }, { "epoch": 0.5, "learning_rate": 2.9116050702407706e-06, "logits/chosen": -0.2648778259754181, "logits/rejected": -0.2825019359588623, "logps/chosen": -2200.065185546875, "logps/rejected": -1777.919677734375, "loss": 0.0385, "rewards/accuracies": 0.5, "rewards/chosen": 0.07928620278835297, "rewards/margins": 0.01838754117488861, "rewards/rejected": 0.06089866906404495, "step": 960 }, { "epoch": 0.51, "learning_rate": 2.8664730985699537e-06, "logits/chosen": -0.23890802264213562, "logits/rejected": -0.2561323344707489, "logps/chosen": -2259.957763671875, "logps/rejected": -1922.253173828125, "loss": 0.0508, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.06860624998807907, "rewards/margins": 0.010691315867006779, "rewards/rejected": 0.05791493132710457, "step": 970 }, { "epoch": 0.51, "learning_rate": 2.8212187278611907e-06, "logits/chosen": -0.2580435872077942, "logits/rejected": -0.2608950734138489, "logps/chosen": -2114.46044921875, "logps/rejected": -1843.2421875, "loss": 0.0502, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.060202427208423615, "rewards/margins": 0.0087806461378932, "rewards/rejected": 0.05142177268862724, "step": 980 }, { "epoch": 0.52, "learning_rate": 2.7758570727066843e-06, "logits/chosen": -0.26601457595825195, "logits/rejected": -0.27015531063079834, "logps/chosen": -1853.6099853515625, "logps/rejected": -1555.1754150390625, "loss": 0.0605, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.060611844062805176, "rewards/margins": 0.013232124969363213, "rewards/rejected": 0.047379713505506516, "step": 990 }, { "epoch": 0.52, "learning_rate": 2.730403283530767e-06, "logits/chosen": -0.24036483466625214, "logits/rejected": -0.2455415278673172, "logps/chosen": -1859.8070068359375, "logps/rejected": -1691.760498046875, "loss": 0.0538, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.06643722951412201, "rewards/margins": 0.01359983254224062, "rewards/rejected": 0.052837394177913666, "step": 1000 }, { "epoch": 0.52, "eval_logits/chosen": -0.2564674913883209, "eval_logits/rejected": -0.25929296016693115, "eval_logps/chosen": -2156.65283203125, "eval_logps/rejected": -1829.5831298828125, "eval_loss": 0.04961266368627548, "eval_rewards/accuracies": 0.5649999976158142, "eval_rewards/chosen": 0.05975308269262314, "eval_rewards/margins": 0.012923642992973328, "eval_rewards/rejected": 0.04682943597435951, "eval_runtime": 510.5574, "eval_samples_per_second": 3.917, "eval_steps_per_second": 0.979, "step": 1000 }, { "epoch": 0.53, "learning_rate": 2.6848725415297888e-06, "logits/chosen": -0.251176655292511, "logits/rejected": -0.255452036857605, "logps/chosen": -2234.47119140625, "logps/rejected": -1848.127685546875, "loss": 0.0405, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06075858324766159, "rewards/margins": 0.016493605449795723, "rewards/rejected": 0.04426497966051102, "step": 1010 }, { "epoch": 0.53, "learning_rate": 2.639280053601719e-06, "logits/chosen": -0.2578621506690979, "logits/rejected": -0.26309382915496826, "logps/chosen": -2132.969970703125, "logps/rejected": -1790.293212890625, "loss": 0.034, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.06318069994449615, "rewards/margins": 0.01207827776670456, "rewards/rejected": 0.0511024184525013, "step": 1020 }, { "epoch": 0.54, "learning_rate": 2.59364104726716e-06, "logits/chosen": -0.25486692786216736, "logits/rejected": -0.24112336337566376, "logps/chosen": -1739.303955078125, "logps/rejected": -1716.464599609375, "loss": 0.0597, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.053387343883514404, "rewards/margins": 0.00615662382915616, "rewards/rejected": 0.04723071679472923, "step": 1030 }, { "epoch": 0.54, "learning_rate": 2.547970765583491e-06, "logits/chosen": -0.23829662799835205, "logits/rejected": -0.2544878125190735, "logps/chosen": -2119.11474609375, "logps/rejected": -1764.7984619140625, "loss": 0.0518, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.06005573272705078, "rewards/margins": 0.01636183261871338, "rewards/rejected": 0.0436939001083374, "step": 1040 }, { "epoch": 0.55, "learning_rate": 2.502284462053799e-06, "logits/chosen": -0.2548423409461975, "logits/rejected": -0.24885638058185577, "logps/chosen": -2136.72998046875, "logps/rejected": -1792.2359619140625, "loss": 0.0579, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0639239102602005, "rewards/margins": 0.014888137578964233, "rewards/rejected": 0.04903577268123627, "step": 1050 }, { "epoch": 0.55, "learning_rate": 2.456597395532338e-06, "logits/chosen": -0.2554526925086975, "logits/rejected": -0.29498496651649475, "logps/chosen": -1785.2249755859375, "logps/rejected": -1444.23291015625, "loss": 0.0467, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.051430024206638336, "rewards/margins": 0.014165714383125305, "rewards/rejected": 0.03726430982351303, "step": 1060 }, { "epoch": 0.56, "learning_rate": 2.4109248251281953e-06, "logits/chosen": -0.25295186042785645, "logits/rejected": -0.2443423569202423, "logps/chosen": -2214.04541015625, "logps/rejected": -1779.729248046875, "loss": 0.0427, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.07067938894033432, "rewards/margins": 0.0159921832382679, "rewards/rejected": 0.05468720197677612, "step": 1070 }, { "epoch": 0.57, "learning_rate": 2.365282005108875e-06, "logits/chosen": -0.21839866042137146, "logits/rejected": -0.22934658825397491, "logps/chosen": -2245.37646484375, "logps/rejected": -2051.3115234375, "loss": 0.0387, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.07870273292064667, "rewards/margins": 0.015385419130325317, "rewards/rejected": 0.06331731379032135, "step": 1080 }, { "epoch": 0.57, "learning_rate": 2.319684179805491e-06, "logits/chosen": -0.2654665410518646, "logits/rejected": -0.2958211302757263, "logps/chosen": -2201.913818359375, "logps/rejected": -1623.56298828125, "loss": 0.0428, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.08239830285310745, "rewards/margins": 0.022424213588237762, "rewards/rejected": 0.05997408553957939, "step": 1090 }, { "epoch": 0.58, "learning_rate": 2.2741465785212905e-06, "logits/chosen": -0.24744835495948792, "logits/rejected": -0.27335745096206665, "logps/chosen": -2357.655029296875, "logps/rejected": -1799.583740234375, "loss": 0.0545, "rewards/accuracies": 0.53125, "rewards/chosen": 0.09354601800441742, "rewards/margins": 0.019701533019542694, "rewards/rejected": 0.07384449243545532, "step": 1100 }, { "epoch": 0.58, "eval_logits/chosen": -0.23114541172981262, "eval_logits/rejected": -0.2394075095653534, "eval_logps/chosen": -2124.1083984375, "eval_logps/rejected": -1802.593505859375, "eval_loss": 0.04950037598609924, "eval_rewards/accuracies": 0.5559999942779541, "eval_rewards/chosen": 0.0922975018620491, "eval_rewards/margins": 0.018478482961654663, "eval_rewards/rejected": 0.07381902635097504, "eval_runtime": 510.4268, "eval_samples_per_second": 3.918, "eval_steps_per_second": 0.98, "step": 1100 }, { "epoch": 0.58, "learning_rate": 2.2286844104451848e-06, "logits/chosen": -0.2077624499797821, "logits/rejected": -0.2412451207637787, "logps/chosen": -2270.83935546875, "logps/rejected": -1800.899169921875, "loss": 0.049, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.09902598708868027, "rewards/margins": 0.026153406128287315, "rewards/rejected": 0.0728725865483284, "step": 1110 }, { "epoch": 0.59, "learning_rate": 2.183312859572008e-06, "logits/chosen": -0.20635256171226501, "logits/rejected": -0.19912874698638916, "logps/chosen": -2253.06689453125, "logps/rejected": -1995.773193359375, "loss": 0.0598, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.08980433642864227, "rewards/margins": 0.015359434299170971, "rewards/rejected": 0.07444489747285843, "step": 1120 }, { "epoch": 0.59, "learning_rate": 2.1380470796311843e-06, "logits/chosen": -0.21904154121875763, "logits/rejected": -0.24687853455543518, "logps/chosen": -2060.12109375, "logps/rejected": -1746.0015869140625, "loss": 0.0457, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0698038712143898, "rewards/margins": 0.01698939874768257, "rewards/rejected": 0.05281447246670723, "step": 1130 }, { "epoch": 0.6, "learning_rate": 2.092902189025507e-06, "logits/chosen": -0.2082248479127884, "logits/rejected": -0.21504366397857666, "logps/chosen": -2301.1181640625, "logps/rejected": -1757.7796630859375, "loss": 0.0446, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0717499703168869, "rewards/margins": 0.022804908454418182, "rewards/rejected": 0.04894506186246872, "step": 1140 }, { "epoch": 0.6, "learning_rate": 2.0478932657817105e-06, "logits/chosen": -0.21141843497753143, "logits/rejected": -0.2168281078338623, "logps/chosen": -2182.329345703125, "logps/rejected": -1772.6962890625, "loss": 0.0492, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.07858923077583313, "rewards/margins": 0.014738768339157104, "rewards/rejected": 0.06385046243667603, "step": 1150 }, { "epoch": 0.61, "learning_rate": 2.0030353425145376e-06, "logits/chosen": -0.21451938152313232, "logits/rejected": -0.23753699660301208, "logps/chosen": -2020.727783203125, "logps/rejected": -1757.3990478515625, "loss": 0.0512, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.07901870459318161, "rewards/margins": 0.01762666180729866, "rewards/rejected": 0.06139205023646355, "step": 1160 }, { "epoch": 0.61, "learning_rate": 1.958343401405964e-06, "logits/chosen": -0.18361331522464752, "logits/rejected": -0.1837645322084427, "logps/chosen": -2371.175537109375, "logps/rejected": -1958.777099609375, "loss": 0.0508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0850948616862297, "rewards/margins": 0.020093852654099464, "rewards/rejected": 0.06500101089477539, "step": 1170 }, { "epoch": 0.62, "learning_rate": 1.9138323692012734e-06, "logits/chosen": -0.22541293501853943, "logits/rejected": -0.23021379113197327, "logps/chosen": -2021.7099609375, "logps/rejected": -1991.3853759765625, "loss": 0.0582, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.06989626586437225, "rewards/margins": 0.008970921859145164, "rewards/rejected": 0.060925353318452835, "step": 1180 }, { "epoch": 0.62, "learning_rate": 1.8695171122236443e-06, "logits/chosen": -0.19789089262485504, "logits/rejected": -0.21101799607276917, "logps/chosen": -2177.219970703125, "logps/rejected": -1758.7890625, "loss": 0.0571, "rewards/accuracies": 0.53125, "rewards/chosen": 0.06635448336601257, "rewards/margins": 0.014402633532881737, "rewards/rejected": 0.05195184424519539, "step": 1190 }, { "epoch": 0.63, "learning_rate": 1.8254124314089225e-06, "logits/chosen": -0.2192670851945877, "logits/rejected": -0.20262674987316132, "logps/chosen": -2045.339111328125, "logps/rejected": -1922.4957275390625, "loss": 0.0481, "rewards/accuracies": 0.4375, "rewards/chosen": 0.06084597855806351, "rewards/margins": 0.0031062946654856205, "rewards/rejected": 0.05773968622088432, "step": 1200 }, { "epoch": 0.63, "eval_logits/chosen": -0.21473824977874756, "eval_logits/rejected": -0.2180851548910141, "eval_logps/chosen": -2155.742919921875, "eval_logps/rejected": -1829.7305908203125, "eval_loss": 0.04951399564743042, "eval_rewards/accuracies": 0.5684999823570251, "eval_rewards/chosen": 0.06066294014453888, "eval_rewards/margins": 0.013980962336063385, "eval_rewards/rejected": 0.046681977808475494, "eval_runtime": 510.4546, "eval_samples_per_second": 3.918, "eval_steps_per_second": 0.98, "step": 1200 }, { "epoch": 0.63, "learning_rate": 1.781533057362221e-06, "logits/chosen": -0.23097166419029236, "logits/rejected": -0.2347377985715866, "logps/chosen": -1948.361328125, "logps/rejected": -1586.739013671875, "loss": 0.0511, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.05206098034977913, "rewards/margins": 0.012983322143554688, "rewards/rejected": 0.03907765448093414, "step": 1210 }, { "epoch": 0.64, "learning_rate": 1.7378936454380277e-06, "logits/chosen": -0.20886722207069397, "logits/rejected": -0.21028542518615723, "logps/chosen": -2190.082763671875, "logps/rejected": -1998.083984375, "loss": 0.0517, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.060171376913785934, "rewards/margins": 0.008495164103806019, "rewards/rejected": 0.05167621374130249, "step": 1220 }, { "epoch": 0.64, "learning_rate": 1.6945087708454273e-06, "logits/chosen": -0.18295393884181976, "logits/rejected": -0.1880742609500885, "logps/chosen": -2117.80908203125, "logps/rejected": -1735.502197265625, "loss": 0.0499, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.060192208737134933, "rewards/margins": 0.009002082981169224, "rewards/rejected": 0.05119013041257858, "step": 1230 }, { "epoch": 0.65, "learning_rate": 1.651392923780105e-06, "logits/chosen": -0.19351014494895935, "logits/rejected": -0.20447520911693573, "logps/chosen": -2093.595703125, "logps/rejected": -1855.68359375, "loss": 0.0491, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.06083123758435249, "rewards/margins": 0.008342139422893524, "rewards/rejected": 0.05248909443616867, "step": 1240 }, { "epoch": 0.65, "learning_rate": 1.608560504584737e-06, "logits/chosen": -0.20279578864574432, "logits/rejected": -0.21171894669532776, "logps/chosen": -2053.47412109375, "logps/rejected": -1884.202392578125, "loss": 0.0516, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.06217293068766594, "rewards/margins": 0.01359265111386776, "rewards/rejected": 0.04858027398586273, "step": 1250 }, { "epoch": 0.66, "learning_rate": 1.5660258189393945e-06, "logits/chosen": -0.2138860523700714, "logits/rejected": -0.20899005234241486, "logps/chosen": -2356.08447265625, "logps/rejected": -2091.6435546875, "loss": 0.0455, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.06469549238681793, "rewards/margins": 0.009515106678009033, "rewards/rejected": 0.055180393159389496, "step": 1260 }, { "epoch": 0.66, "learning_rate": 1.5238030730835578e-06, "logits/chosen": -0.22949472069740295, "logits/rejected": -0.22715874016284943, "logps/chosen": -2223.537353515625, "logps/rejected": -1747.806640625, "loss": 0.0535, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06524848937988281, "rewards/margins": 0.01758180931210518, "rewards/rejected": 0.047666680067777634, "step": 1270 }, { "epoch": 0.67, "learning_rate": 1.4819063690713565e-06, "logits/chosen": -0.19447948038578033, "logits/rejected": -0.2098011076450348, "logps/chosen": -1938.2740478515625, "logps/rejected": -1719.427734375, "loss": 0.0583, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.057724129408597946, "rewards/margins": 0.012677346356213093, "rewards/rejected": 0.04504678025841713, "step": 1280 }, { "epoch": 0.68, "learning_rate": 1.4403497000615885e-06, "logits/chosen": -0.20776407420635223, "logits/rejected": -0.1970272809267044, "logps/chosen": -1996.076171875, "logps/rejected": -1722.409423828125, "loss": 0.0624, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.06180752441287041, "rewards/margins": 0.012057540938258171, "rewards/rejected": 0.049749989062547684, "step": 1290 }, { "epoch": 0.68, "learning_rate": 1.3991469456441273e-06, "logits/chosen": -0.19028015434741974, "logits/rejected": -0.18771126866340637, "logps/chosen": -2132.365966796875, "logps/rejected": -1654.494140625, "loss": 0.0441, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.053263597190380096, "rewards/margins": 0.01780819520354271, "rewards/rejected": 0.03545539826154709, "step": 1300 }, { "epoch": 0.68, "eval_logits/chosen": -0.21745455265045166, "eval_logits/rejected": -0.22021788358688354, "eval_logps/chosen": -2159.675537109375, "eval_logps/rejected": -1833.5484619140625, "eval_loss": 0.04945502430200577, "eval_rewards/accuracies": 0.5690000057220459, "eval_rewards/chosen": 0.05673002824187279, "eval_rewards/margins": 0.013866120018064976, "eval_rewards/rejected": 0.042863909155130386, "eval_runtime": 510.5607, "eval_samples_per_second": 3.917, "eval_steps_per_second": 0.979, "step": 1300 }, { "epoch": 0.69, "learning_rate": 1.3583118672042441e-06, "logits/chosen": -0.20240898430347443, "logits/rejected": -0.23169991374015808, "logps/chosen": -2309.421630859375, "logps/rejected": -1885.686279296875, "loss": 0.0545, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.06426791846752167, "rewards/margins": 0.016114329919219017, "rewards/rejected": 0.0481535978615284, "step": 1310 }, { "epoch": 0.69, "learning_rate": 1.3178581033264218e-06, "logits/chosen": -0.2012084424495697, "logits/rejected": -0.23439760506153107, "logps/chosen": -1958.7874755859375, "logps/rejected": -1563.35302734375, "loss": 0.0578, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.05583029240369797, "rewards/margins": 0.016821032389998436, "rewards/rejected": 0.03900925815105438, "step": 1320 }, { "epoch": 0.7, "learning_rate": 1.2777991652391757e-06, "logits/chosen": -0.2176096886396408, "logits/rejected": -0.23196351528167725, "logps/chosen": -2121.034912109375, "logps/rejected": -1711.7109375, "loss": 0.0395, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.06722841411828995, "rewards/margins": 0.011516690254211426, "rewards/rejected": 0.055711716413497925, "step": 1330 }, { "epoch": 0.7, "learning_rate": 1.2381484323024178e-06, "logits/chosen": -0.19108158349990845, "logits/rejected": -0.2023816853761673, "logps/chosen": -2302.89697265625, "logps/rejected": -1892.548095703125, "loss": 0.055, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.08529181778430939, "rewards/margins": 0.020634423941373825, "rewards/rejected": 0.06465739011764526, "step": 1340 }, { "epoch": 0.71, "learning_rate": 1.1989191475388518e-06, "logits/chosen": -0.2374308556318283, "logits/rejected": -0.2234220951795578, "logps/chosen": -2113.50146484375, "logps/rejected": -1874.0279541015625, "loss": 0.0679, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.07649590075016022, "rewards/margins": 0.014174291864037514, "rewards/rejected": 0.06232162192463875, "step": 1350 }, { "epoch": 0.71, "learning_rate": 1.160124413210918e-06, "logits/chosen": -0.24203363060951233, "logits/rejected": -0.2396487444639206, "logps/chosen": -2022.40625, "logps/rejected": -1880.9495849609375, "loss": 0.0406, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.07169513404369354, "rewards/margins": 0.015681343153119087, "rewards/rejected": 0.05601378530263901, "step": 1360 }, { "epoch": 0.72, "learning_rate": 1.1217771864447396e-06, "logits/chosen": -0.2442229688167572, "logits/rejected": -0.2381734549999237, "logps/chosen": -2145.31689453125, "logps/rejected": -1823.568359375, "loss": 0.0528, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.059424418956041336, "rewards/margins": 0.017248233780264854, "rewards/rejected": 0.042176179587841034, "step": 1370 }, { "epoch": 0.72, "learning_rate": 1.08389027490255e-06, "logits/chosen": -0.22975793480873108, "logits/rejected": -0.2311103641986847, "logps/chosen": -2078.482421875, "logps/rejected": -1920.7864990234375, "loss": 0.0372, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.05944003537297249, "rewards/margins": 0.012296736240386963, "rewards/rejected": 0.04714329540729523, "step": 1380 }, { "epoch": 0.73, "learning_rate": 1.046476332505036e-06, "logits/chosen": -0.22442571818828583, "logits/rejected": -0.24409636855125427, "logps/chosen": -1815.8922119140625, "logps/rejected": -1615.753173828125, "loss": 0.0427, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 0.0425129197537899, "rewards/margins": 0.005890417378395796, "rewards/rejected": 0.03662250563502312, "step": 1390 }, { "epoch": 0.73, "learning_rate": 1.0095478552050348e-06, "logits/chosen": -0.23266033828258514, "logits/rejected": -0.22884194552898407, "logps/chosen": -2215.30078125, "logps/rejected": -1916.711669921875, "loss": 0.0524, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.05568776652216911, "rewards/margins": 0.016158053651452065, "rewards/rejected": 0.0395297110080719, "step": 1400 }, { "epoch": 0.73, "eval_logits/chosen": -0.24221491813659668, "eval_logits/rejected": -0.24745041131973267, "eval_logps/chosen": -2163.659912109375, "eval_logps/rejected": -1837.5037841796875, "eval_loss": 0.04962093383073807, "eval_rewards/accuracies": 0.5684999823570251, "eval_rewards/chosen": 0.05274572595953941, "eval_rewards/margins": 0.013837032951414585, "eval_rewards/rejected": 0.0389086939394474, "eval_runtime": 510.7706, "eval_samples_per_second": 3.916, "eval_steps_per_second": 0.979, "step": 1400 }, { "epoch": 0.74, "learning_rate": 9.731171768139808e-07, "logits/chosen": -0.2232085019350052, "logits/rejected": -0.2379104197025299, "logps/chosen": -2600.76953125, "logps/rejected": -2177.601806640625, "loss": 0.0462, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.06885553896427155, "rewards/margins": 0.017164334654808044, "rewards/rejected": 0.0516912117600441, "step": 1410 }, { "epoch": 0.74, "learning_rate": 9.371964648825221e-07, "logits/chosen": -0.2546294629573822, "logits/rejected": -0.2645355761051178, "logps/chosen": -1949.8623046875, "logps/rejected": -1700.5302734375, "loss": 0.0471, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.05458490923047066, "rewards/margins": 0.015815045684576035, "rewards/rejected": 0.03876986354589462, "step": 1420 }, { "epoch": 0.75, "learning_rate": 9.017977166366445e-07, "logits/chosen": -0.2653118669986725, "logits/rejected": -0.26530537009239197, "logps/chosen": -2068.649169921875, "logps/rejected": -1767.718017578125, "loss": 0.0417, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.065969318151474, "rewards/margins": 0.021549370139837265, "rewards/rejected": 0.044419944286346436, "step": 1430 }, { "epoch": 0.75, "learning_rate": 8.669327549707096e-07, "logits/chosen": -0.24164719879627228, "logits/rejected": -0.2417771816253662, "logps/chosen": -2113.025390625, "logps/rejected": -1844.6246337890625, "loss": 0.037, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.07654988765716553, "rewards/margins": 0.013212883844971657, "rewards/rejected": 0.06333700567483902, "step": 1440 }, { "epoch": 0.76, "learning_rate": 8.326132244986932e-07, "logits/chosen": -0.22752514481544495, "logits/rejected": -0.23347719013690948, "logps/chosen": -1921.048828125, "logps/rejected": -1598.9993896484375, "loss": 0.0546, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.06702348589897156, "rewards/margins": 0.018387358635663986, "rewards/rejected": 0.048636119812726974, "step": 1450 }, { "epoch": 0.76, "learning_rate": 7.988505876649863e-07, "logits/chosen": -0.22021660208702087, "logits/rejected": -0.21698196232318878, "logps/chosen": -2022.852783203125, "logps/rejected": -1766.4072265625, "loss": 0.0638, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.06179197505116463, "rewards/margins": 0.01092799287289381, "rewards/rejected": 0.05086398124694824, "step": 1460 }, { "epoch": 0.77, "learning_rate": 7.656561209160248e-07, "logits/chosen": -0.22338895499706268, "logits/rejected": -0.2199423760175705, "logps/chosen": -1993.245361328125, "logps/rejected": -1980.831787109375, "loss": 0.0596, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.06557862460613251, "rewards/margins": 0.014192071743309498, "rewards/rejected": 0.051386553794145584, "step": 1470 }, { "epoch": 0.77, "learning_rate": 7.330409109340563e-07, "logits/chosen": -0.21173898875713348, "logits/rejected": -0.23590870201587677, "logps/chosen": -2017.1929931640625, "logps/rejected": -1657.1859130859375, "loss": 0.0577, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.055760689079761505, "rewards/margins": 0.013670523650944233, "rewards/rejected": 0.04209016636013985, "step": 1480 }, { "epoch": 0.78, "learning_rate": 7.010158509342682e-07, "logits/chosen": -0.23114773631095886, "logits/rejected": -0.23729057610034943, "logps/chosen": -2105.003173828125, "logps/rejected": -1778.635498046875, "loss": 0.0626, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.06233568117022514, "rewards/margins": 0.0166020505130291, "rewards/rejected": 0.04573363438248634, "step": 1490 }, { "epoch": 0.79, "learning_rate": 6.695916370265529e-07, "logits/chosen": -0.2337017059326172, "logits/rejected": -0.234249085187912, "logps/chosen": -2289.919189453125, "logps/rejected": -2049.482177734375, "loss": 0.0425, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.06871043145656586, "rewards/margins": 0.010876113548874855, "rewards/rejected": 0.05783431604504585, "step": 1500 }, { "epoch": 0.79, "eval_logits/chosen": -0.22742050886154175, "eval_logits/rejected": -0.23352740705013275, "eval_logps/chosen": -2154.34033203125, "eval_logps/rejected": -1829.7928466796875, "eval_loss": 0.04929284378886223, "eval_rewards/accuracies": 0.5674999952316284, "eval_rewards/chosen": 0.06206566095352173, "eval_rewards/margins": 0.015445946715772152, "eval_rewards/rejected": 0.0466197207570076, "eval_runtime": 510.6117, "eval_samples_per_second": 3.917, "eval_steps_per_second": 0.979, "step": 1500 }, { "epoch": 0.79, "learning_rate": 6.387787646430854e-07, "logits/chosen": -0.22113287448883057, "logits/rejected": -0.21311786770820618, "logps/chosen": -2256.49951171875, "logps/rejected": -1961.322509765625, "loss": 0.0569, "rewards/accuracies": 0.53125, "rewards/chosen": 0.06963467597961426, "rewards/margins": 0.015023264102637768, "rewards/rejected": 0.05461140722036362, "step": 1510 }, { "epoch": 0.8, "learning_rate": 6.085875250329401e-07, "logits/chosen": -0.24658381938934326, "logits/rejected": -0.2602604925632477, "logps/chosen": -1738.8958740234375, "logps/rejected": -1530.5950927734375, "loss": 0.0426, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.0541699044406414, "rewards/margins": 0.007052128203213215, "rewards/rejected": 0.04711777716875076, "step": 1520 }, { "epoch": 0.8, "learning_rate": 5.79028001824894e-07, "logits/chosen": -0.23341718316078186, "logits/rejected": -0.2346893846988678, "logps/chosen": -2004.2154541015625, "logps/rejected": -1676.0980224609375, "loss": 0.0508, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.06295724958181381, "rewards/margins": 0.0180866289883852, "rewards/rejected": 0.04487061873078346, "step": 1530 }, { "epoch": 0.81, "learning_rate": 5.501100676595761e-07, "logits/chosen": -0.23593036830425262, "logits/rejected": -0.2430458515882492, "logps/chosen": -2228.714111328125, "logps/rejected": -1928.757080078125, "loss": 0.0463, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.07435286045074463, "rewards/margins": 0.01624133810400963, "rewards/rejected": 0.0581115186214447, "step": 1540 }, { "epoch": 0.81, "learning_rate": 5.218433808920884e-07, "logits/chosen": -0.22192791104316711, "logits/rejected": -0.23522309958934784, "logps/chosen": -2106.592041015625, "logps/rejected": -1706.7427978515625, "loss": 0.0499, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0745842233300209, "rewards/margins": 0.023264039307832718, "rewards/rejected": 0.05132018402218819, "step": 1550 }, { "epoch": 0.82, "learning_rate": 4.942373823661928e-07, "logits/chosen": -0.22204573452472687, "logits/rejected": -0.22397270798683167, "logps/chosen": -1921.546875, "logps/rejected": -1686.9296875, "loss": 0.0498, "rewards/accuracies": 0.5, "rewards/chosen": 0.060963042080402374, "rewards/margins": 0.011381834745407104, "rewards/rejected": 0.04958119988441467, "step": 1560 }, { "epoch": 0.82, "learning_rate": 4.6730129226114363e-07, "logits/chosen": -0.22597956657409668, "logits/rejected": -0.24938449263572693, "logps/chosen": -1889.419189453125, "logps/rejected": -1659.8782958984375, "loss": 0.0636, "rewards/accuracies": 0.5625, "rewards/chosen": 0.06332211196422577, "rewards/margins": 0.013857582584023476, "rewards/rejected": 0.04946453124284744, "step": 1570 }, { "epoch": 0.83, "learning_rate": 4.4104410701222703e-07, "logits/chosen": -0.21659445762634277, "logits/rejected": -0.22937150299549103, "logps/chosen": -2046.7357177734375, "logps/rejected": -1769.0198974609375, "loss": 0.0451, "rewards/accuracies": 0.53125, "rewards/chosen": 0.06872855126857758, "rewards/margins": 0.014504766091704369, "rewards/rejected": 0.05422378331422806, "step": 1580 }, { "epoch": 0.83, "learning_rate": 4.154745963060197e-07, "logits/chosen": -0.21276862919330597, "logits/rejected": -0.2097276896238327, "logps/chosen": -2136.18505859375, "logps/rejected": -2049.29931640625, "loss": 0.0536, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 0.07395701855421066, "rewards/margins": 0.006907849106937647, "rewards/rejected": 0.06704917550086975, "step": 1590 }, { "epoch": 0.84, "learning_rate": 3.9060130015138863e-07, "logits/chosen": -0.22508184611797333, "logits/rejected": -0.23881450295448303, "logps/chosen": -1998.213623046875, "logps/rejected": -1759.222412109375, "loss": 0.0387, "rewards/accuracies": 0.53125, "rewards/chosen": 0.06664810329675674, "rewards/margins": 0.011612234637141228, "rewards/rejected": 0.05503587797284126, "step": 1600 }, { "epoch": 0.84, "eval_logits/chosen": -0.22299662232398987, "eval_logits/rejected": -0.2297811657190323, "eval_logps/chosen": -2145.159423828125, "eval_logps/rejected": -1821.890869140625, "eval_loss": 0.049171119928359985, "eval_rewards/accuracies": 0.5705000162124634, "eval_rewards/chosen": 0.07124640792608261, "eval_rewards/margins": 0.01672479324042797, "eval_rewards/rejected": 0.05452162027359009, "eval_runtime": 510.5649, "eval_samples_per_second": 3.917, "eval_steps_per_second": 0.979, "step": 1600 }, { "epoch": 0.84, "learning_rate": 3.664325260271953e-07, "logits/chosen": -0.21083417534828186, "logits/rejected": -0.21836061775684357, "logps/chosen": -1921.0771484375, "logps/rejected": -1615.395263671875, "loss": 0.0511, "rewards/accuracies": 0.53125, "rewards/chosen": 0.06516659259796143, "rewards/margins": 0.010829558596014977, "rewards/rejected": 0.0543370358645916, "step": 1610 }, { "epoch": 0.85, "learning_rate": 3.429763461076677e-07, "logits/chosen": -0.24559417366981506, "logits/rejected": -0.23943760991096497, "logps/chosen": -1962.534423828125, "logps/rejected": -1884.5029296875, "loss": 0.0509, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.06632138788700104, "rewards/margins": 0.007880722172558308, "rewards/rejected": 0.0584406740963459, "step": 1620 }, { "epoch": 0.85, "learning_rate": 3.202405945663556e-07, "logits/chosen": -0.1999385952949524, "logits/rejected": -0.22029852867126465, "logps/chosen": -2002.773193359375, "logps/rejected": -1645.963623046875, "loss": 0.0417, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.06611990928649902, "rewards/margins": 0.018743688240647316, "rewards/rejected": 0.04737623408436775, "step": 1630 }, { "epoch": 0.86, "learning_rate": 2.982328649595856e-07, "logits/chosen": -0.23098058998584747, "logits/rejected": -0.2513691782951355, "logps/chosen": -2234.155517578125, "logps/rejected": -1985.674560546875, "loss": 0.0346, "rewards/accuracies": 0.53125, "rewards/chosen": 0.08107596635818481, "rewards/margins": 0.014828977175056934, "rewards/rejected": 0.06624698638916016, "step": 1640 }, { "epoch": 0.86, "learning_rate": 2.7696050769026954e-07, "logits/chosen": -0.18273136019706726, "logits/rejected": -0.18867138028144836, "logps/chosen": -2058.58349609375, "logps/rejected": -1610.1470947265625, "loss": 0.054, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.06953487545251846, "rewards/margins": 0.014529886655509472, "rewards/rejected": 0.055004991590976715, "step": 1650 }, { "epoch": 0.87, "learning_rate": 2.564306275529341e-07, "logits/chosen": -0.21245570480823517, "logits/rejected": -0.23336832225322723, "logps/chosen": -1910.864013671875, "logps/rejected": -1668.5755615234375, "loss": 0.0652, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.06216276437044144, "rewards/margins": 0.015415112487971783, "rewards/rejected": 0.04674764350056648, "step": 1660 }, { "epoch": 0.87, "learning_rate": 2.3665008136077332e-07, "logits/chosen": -0.2325417697429657, "logits/rejected": -0.2111097276210785, "logps/chosen": -2226.126220703125, "logps/rejected": -2146.695556640625, "loss": 0.0541, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.07398100197315216, "rewards/margins": 0.01466774009168148, "rewards/rejected": 0.05931326001882553, "step": 1670 }, { "epoch": 0.88, "learning_rate": 2.1762547565553293e-07, "logits/chosen": -0.23489132523536682, "logits/rejected": -0.26181578636169434, "logps/chosen": -2163.59130859375, "logps/rejected": -1799.791015625, "loss": 0.045, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0649464800953865, "rewards/margins": 0.013443303294479847, "rewards/rejected": 0.051503174006938934, "step": 1680 }, { "epoch": 0.88, "learning_rate": 1.993631645009747e-07, "logits/chosen": -0.2324393093585968, "logits/rejected": -0.2406429946422577, "logps/chosen": -2243.14990234375, "logps/rejected": -1793.9261474609375, "loss": 0.0459, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.07493821531534195, "rewards/margins": 0.016798479482531548, "rewards/rejected": 0.05813973397016525, "step": 1690 }, { "epoch": 0.89, "learning_rate": 1.818692473606748e-07, "logits/chosen": -0.2428218573331833, "logits/rejected": -0.22781512141227722, "logps/chosen": -1967.1754150390625, "logps/rejected": -1719.901123046875, "loss": 0.0556, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.05938352271914482, "rewards/margins": 0.014253886416554451, "rewards/rejected": 0.04512963443994522, "step": 1700 }, { "epoch": 0.89, "eval_logits/chosen": -0.21960072219371796, "eval_logits/rejected": -0.22588692605495453, "eval_logps/chosen": -2149.13818359375, "eval_logps/rejected": -1825.278564453125, "eval_loss": 0.04918248951435089, "eval_rewards/accuracies": 0.5674999952316284, "eval_rewards/chosen": 0.06726768612861633, "eval_rewards/margins": 0.016133680939674377, "eval_rewards/rejected": 0.05113400146365166, "eval_runtime": 510.6325, "eval_samples_per_second": 3.917, "eval_steps_per_second": 0.979, "step": 1700 }, { "epoch": 0.9, "learning_rate": 1.6514956706084885e-07, "logits/chosen": -0.1846579611301422, "logits/rejected": -0.21179255843162537, "logps/chosen": -2128.083740234375, "logps/rejected": -1733.182861328125, "loss": 0.0404, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06305380910634995, "rewards/margins": 0.013125176541507244, "rewards/rejected": 0.049928631633520126, "step": 1710 }, { "epoch": 0.9, "learning_rate": 1.4920970783889737e-07, "logits/chosen": -0.19069206714630127, "logits/rejected": -0.2094193696975708, "logps/chosen": -2153.69580078125, "logps/rejected": -1683.8616943359375, "loss": 0.053, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.06470336019992828, "rewards/margins": 0.014464011415839195, "rewards/rejected": 0.05023934692144394, "step": 1720 }, { "epoch": 0.91, "learning_rate": 1.340549934783164e-07, "logits/chosen": -0.20567326247692108, "logits/rejected": -0.22026868164539337, "logps/chosen": -2285.531005859375, "logps/rejected": -1900.779052734375, "loss": 0.0393, "rewards/accuracies": 0.59375, "rewards/chosen": 0.07268913835287094, "rewards/margins": 0.022747965529561043, "rewards/rejected": 0.04994116351008415, "step": 1730 }, { "epoch": 0.91, "learning_rate": 1.196904855305961e-07, "logits/chosen": -0.21483811736106873, "logits/rejected": -0.23524871468544006, "logps/chosen": -2039.6002197265625, "logps/rejected": -1728.5648193359375, "loss": 0.0567, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.06144179031252861, "rewards/margins": 0.012769539840519428, "rewards/rejected": 0.04867224767804146, "step": 1740 }, { "epoch": 0.92, "learning_rate": 1.0612098162470302e-07, "logits/chosen": -0.209224671125412, "logits/rejected": -0.22041518986225128, "logps/chosen": -1966.912353515625, "logps/rejected": -1770.239990234375, "loss": 0.0473, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0644414871931076, "rewards/margins": 0.014524770900607109, "rewards/rejected": 0.049916718155145645, "step": 1750 }, { "epoch": 0.92, "learning_rate": 9.335101386471285e-08, "logits/chosen": -0.1998235136270523, "logits/rejected": -0.205234095454216, "logps/chosen": -2081.581787109375, "logps/rejected": -1733.4261474609375, "loss": 0.0385, "rewards/accuracies": 0.625, "rewards/chosen": 0.07012965530157089, "rewards/margins": 0.018411414697766304, "rewards/rejected": 0.05171824246644974, "step": 1760 }, { "epoch": 0.93, "learning_rate": 8.138484731612273e-08, "logits/chosen": -0.21281655132770538, "logits/rejected": -0.2385600358247757, "logps/chosen": -2208.79736328125, "logps/rejected": -1743.3955078125, "loss": 0.0508, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06378593295812607, "rewards/margins": 0.012727012857794762, "rewards/rejected": 0.051058925688266754, "step": 1770 }, { "epoch": 0.93, "learning_rate": 7.022647858135501e-08, "logits/chosen": -0.22884276509284973, "logits/rejected": -0.2317463457584381, "logps/chosen": -2076.50048828125, "logps/rejected": -1827.575439453125, "loss": 0.0497, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06807791441679001, "rewards/margins": 0.011940672062337399, "rewards/rejected": 0.05613725259900093, "step": 1780 }, { "epoch": 0.94, "learning_rate": 5.987963446492384e-08, "logits/chosen": -0.20104601979255676, "logits/rejected": -0.19782570004463196, "logps/chosen": -1880.1734619140625, "logps/rejected": -1685.8695068359375, "loss": 0.0577, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.05542607977986336, "rewards/margins": 0.011624794453382492, "rewards/rejected": 0.04380128160119057, "step": 1790 }, { "epoch": 0.94, "learning_rate": 5.034777072871394e-08, "logits/chosen": -0.1951800137758255, "logits/rejected": -0.21924810111522675, "logps/chosen": -1900.6998291015625, "logps/rejected": -1563.48876953125, "loss": 0.0519, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.05453474447131157, "rewards/margins": 0.008965181186795235, "rewards/rejected": 0.045569561421871185, "step": 1800 }, { "epoch": 0.94, "eval_logits/chosen": -0.21762163937091827, "eval_logits/rejected": -0.2241181582212448, "eval_logps/chosen": -2148.109619140625, "eval_logps/rejected": -1824.3348388671875, "eval_loss": 0.04916713759303093, "eval_rewards/accuracies": 0.5690000057220459, "eval_rewards/chosen": 0.0682961568236351, "eval_rewards/margins": 0.016218481585383415, "eval_rewards/rejected": 0.05207766965031624, "eval_runtime": 510.6606, "eval_samples_per_second": 3.916, "eval_steps_per_second": 0.979, "step": 1800 }, { "epoch": 0.95, "learning_rate": 4.163407093778243e-08, "logits/chosen": -0.1938626617193222, "logits/rejected": -0.21109886467456818, "logps/chosen": -1975.074951171875, "logps/rejected": -1626.160888671875, "loss": 0.0487, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.06530580669641495, "rewards/margins": 0.017866965383291245, "rewards/rejected": 0.0474388413131237, "step": 1810 }, { "epoch": 0.95, "learning_rate": 3.37414453970758e-08, "logits/chosen": -0.20253758132457733, "logits/rejected": -0.21794748306274414, "logps/chosen": -2259.384765625, "logps/rejected": -2048.346435546875, "loss": 0.055, "rewards/accuracies": 0.59375, "rewards/chosen": 0.07413917034864426, "rewards/margins": 0.02017979882657528, "rewards/rejected": 0.053959377110004425, "step": 1820 }, { "epoch": 0.96, "learning_rate": 2.6672530179410183e-08, "logits/chosen": -0.19098524749279022, "logits/rejected": -0.19925786554813385, "logps/chosen": -2076.921630859375, "logps/rejected": -1773.484619140625, "loss": 0.0567, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.06547501683235168, "rewards/margins": 0.016652025282382965, "rewards/rejected": 0.04882299154996872, "step": 1830 }, { "epoch": 0.96, "learning_rate": 2.04296862450451e-08, "logits/chosen": -0.20272760093212128, "logits/rejected": -0.23647110164165497, "logps/chosen": -2243.960205078125, "logps/rejected": -2028.1578369140625, "loss": 0.0483, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.07614084333181381, "rewards/margins": 0.01566244289278984, "rewards/rejected": 0.06047840043902397, "step": 1840 }, { "epoch": 0.97, "learning_rate": 1.501499865314171e-08, "logits/chosen": -0.22630052268505096, "logits/rejected": -0.20891804993152618, "logps/chosen": -1954.311279296875, "logps/rejected": -1862.181640625, "loss": 0.0493, "rewards/accuracies": 0.5625, "rewards/chosen": 0.06258489936590195, "rewards/margins": 0.01086291205137968, "rewards/rejected": 0.05172199010848999, "step": 1850 }, { "epoch": 0.97, "learning_rate": 1.0430275865371265e-08, "logits/chosen": -0.21302291750907898, "logits/rejected": -0.22670722007751465, "logps/chosen": -1873.691650390625, "logps/rejected": -1755.8060302734375, "loss": 0.0559, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.05816579982638359, "rewards/margins": 0.013311423361301422, "rewards/rejected": 0.04485438019037247, "step": 1860 }, { "epoch": 0.98, "learning_rate": 6.677049141901315e-09, "logits/chosen": -0.1987680345773697, "logits/rejected": -0.2282913625240326, "logps/chosen": -1935.877197265625, "logps/rejected": -1745.5570068359375, "loss": 0.0648, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06320817768573761, "rewards/margins": 0.013358126394450665, "rewards/rejected": 0.049850039184093475, "step": 1870 }, { "epoch": 0.98, "learning_rate": 3.756572029968708e-09, "logits/chosen": -0.21312955021858215, "logits/rejected": -0.22611579298973083, "logps/chosen": -2255.653564453125, "logps/rejected": -1939.3330078125, "loss": 0.0459, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.07814554870128632, "rewards/margins": 0.018966957926750183, "rewards/rejected": 0.05917859077453613, "step": 1880 }, { "epoch": 0.99, "learning_rate": 1.6698199452053199e-09, "logits/chosen": -0.2335490882396698, "logits/rejected": -0.22842903435230255, "logps/chosen": -2171.2451171875, "logps/rejected": -1897.8079833984375, "loss": 0.0534, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.07153952866792679, "rewards/margins": 0.017695123329758644, "rewards/rejected": 0.053844403475522995, "step": 1890 }, { "epoch": 0.99, "learning_rate": 4.1748984585560094e-10, "logits/chosen": -0.18400521576404572, "logits/rejected": -0.20925450325012207, "logps/chosen": -2247.09765625, "logps/rejected": -1860.405517578125, "loss": 0.05, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0739186555147171, "rewards/margins": 0.018731053918600082, "rewards/rejected": 0.05518760159611702, "step": 1900 }, { "epoch": 0.99, "eval_logits/chosen": -0.21872195601463318, "eval_logits/rejected": -0.22539223730564117, "eval_logps/chosen": -2148.457763671875, "eval_logps/rejected": -1824.6458740234375, "eval_loss": 0.04917627200484276, "eval_rewards/accuracies": 0.5669999718666077, "eval_rewards/chosen": 0.06794830411672592, "eval_rewards/margins": 0.016181621700525284, "eval_rewards/rejected": 0.05176668241620064, "eval_runtime": 511.1542, "eval_samples_per_second": 3.913, "eval_steps_per_second": 0.978, "step": 1900 }, { "epoch": 1.0, "learning_rate": 0.0, "logits/chosen": -0.21931472420692444, "logits/rejected": -0.21789617836475372, "logps/chosen": -2259.647216796875, "logps/rejected": -1940.0595703125, "loss": 0.0525, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.07358353585004807, "rewards/margins": 0.015202896669507027, "rewards/rejected": 0.05838064104318619, "step": 1910 }, { "epoch": 1.0, "step": 1910, "total_flos": 0.0, "train_loss": 0.05238237046290443, "train_runtime": 26355.2814, "train_samples_per_second": 1.16, "train_steps_per_second": 0.072 } ], "logging_steps": 10, "max_steps": 1910, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }