diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,2351 +1,1501 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.9988002399520095, - "eval_steps": 10000, - "global_step": 1666, + "epoch": 1.9968602825745683, + "eval_steps": 100, + "global_step": 954, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.01, - "learning_rate": 5.988023952095808e-08, - "logits/chosen": 0.08723282814025879, - "logits/rejected": 0.1474362313747406, - "logps/chosen": -289.8438415527344, - "logps/rejected": -246.7926788330078, - "loss": 0.3233, - "rewards/accuracies": 0.4000000059604645, - "rewards/chosen": 0.00020217681594658643, - "rewards/margins": 0.00030051826615817845, - "rewards/rejected": -9.834145021159202e-05, + "epoch": 0.02, + "learning_rate": 1.0416666666666667e-07, + "logits/chosen": 0.2701188623905182, + "logits/rejected": 0.3601176142692566, + "logps/chosen": -304.10650634765625, + "logps/rejected": -281.93804931640625, + "loss": 0.6931, + "rewards/accuracies": 0.39375001192092896, + "rewards/chosen": -0.00010811944957822561, + "rewards/margins": 0.00021427827596198767, + "rewards/rejected": -0.00032239771098829806, "step": 10 }, { - "epoch": 0.02, - "learning_rate": 1.1976047904191617e-07, - "logits/chosen": 0.08892221748828888, - "logits/rejected": 0.14665499329566956, - "logps/chosen": -336.4267272949219, - "logps/rejected": -287.6366271972656, - "loss": 0.326, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -0.00019070778216701, - "rewards/margins": 0.0006663546664640307, - "rewards/rejected": -0.0008570626378059387, + "epoch": 0.04, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": 0.33982083201408386, + "logits/rejected": 0.366676390171051, + "logps/chosen": -287.29412841796875, + "logps/rejected": -261.15667724609375, + "loss": 0.6929, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.00013875472359359264, + "rewards/margins": 0.00036414124770089984, + "rewards/rejected": -0.00022538653865922242, "step": 20 }, { - "epoch": 0.04, - "learning_rate": 1.7964071856287425e-07, - "logits/chosen": 0.03857799991965294, - "logits/rejected": 0.16380922496318817, - "logps/chosen": -342.10687255859375, - "logps/rejected": -264.2757263183594, - "loss": 0.3235, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": 0.0013111254666000605, - "rewards/margins": 0.0015496534761041403, - "rewards/rejected": -0.0002385281550232321, + "epoch": 0.06, + "learning_rate": 3.1249999999999997e-07, + "logits/chosen": 0.1978740394115448, + "logits/rejected": 0.29795143008232117, + "logps/chosen": -355.30877685546875, + "logps/rejected": -307.54730224609375, + "loss": 0.6919, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.0008478848030790687, + "rewards/margins": 0.0029860823415219784, + "rewards/rejected": -0.0021381976548582315, "step": 30 }, { - "epoch": 0.05, - "learning_rate": 2.3952095808383233e-07, - "logits/chosen": 0.09239836037158966, - "logits/rejected": 0.10474340617656708, - "logps/chosen": -246.67648315429688, - "logps/rejected": -242.5207061767578, - "loss": 0.324, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.0020021735690534115, - "rewards/margins": 0.0020810733549296856, - "rewards/rejected": -7.889991684351116e-05, + "epoch": 0.08, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": 0.203146293759346, + "logits/rejected": 0.27501967549324036, + "logps/chosen": -319.9775085449219, + "logps/rejected": -295.44622802734375, + "loss": 0.6883, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 2.8305406885920092e-05, + "rewards/margins": 0.007146574556827545, + "rewards/rejected": -0.00711826840415597, "step": 40 }, { - "epoch": 0.06, - "learning_rate": 2.9940119760479036e-07, - "logits/chosen": 0.08982165902853012, - "logits/rejected": 0.17564034461975098, - "logps/chosen": -295.698974609375, - "logps/rejected": -221.64443969726562, - "loss": 0.3224, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.007395695894956589, - "rewards/margins": 0.00678109098225832, - "rewards/rejected": 0.0006146054365672171, + "epoch": 0.1, + "learning_rate": 5.208333333333334e-07, + "logits/chosen": 0.3048178255558014, + "logits/rejected": 0.34185779094696045, + "logps/chosen": -328.4739074707031, + "logps/rejected": -330.8270568847656, + "loss": 0.6814, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.002576820319518447, + "rewards/margins": 0.029984716325998306, + "rewards/rejected": -0.027407895773649216, "step": 50 }, { - "epoch": 0.07, - "learning_rate": 3.592814371257485e-07, - "logits/chosen": 0.14322063326835632, - "logits/rejected": 0.20782272517681122, - "logps/chosen": -289.552978515625, - "logps/rejected": -268.31988525390625, - "loss": 0.3298, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": 0.015825165435671806, - "rewards/margins": 0.008219954557716846, - "rewards/rejected": 0.007605212740600109, + "epoch": 0.13, + "learning_rate": 6.249999999999999e-07, + "logits/chosen": 0.2885764241218567, + "logits/rejected": 0.38096073269844055, + "logps/chosen": -289.23529052734375, + "logps/rejected": -293.72637939453125, + "loss": 0.6732, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.015276116319000721, + "rewards/margins": 0.05601469799876213, + "rewards/rejected": -0.07129080593585968, "step": 60 }, { - "epoch": 0.08, - "learning_rate": 4.191616766467065e-07, - "logits/chosen": 0.06546024978160858, - "logits/rejected": 0.12448060512542725, - "logps/chosen": -317.5198974609375, - "logps/rejected": -250.50119018554688, - "loss": 0.3414, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.0307091623544693, - "rewards/margins": 0.03058113530278206, - "rewards/rejected": 0.00012802743003703654, + "epoch": 0.15, + "learning_rate": 7.291666666666666e-07, + "logits/chosen": 0.41116613149642944, + "logits/rejected": 0.44518256187438965, + "logps/chosen": -271.6816101074219, + "logps/rejected": -300.5724182128906, + "loss": 0.6566, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.051550477743148804, + "rewards/margins": 0.09335148334503174, + "rewards/rejected": -0.14490197598934174, "step": 70 }, { - "epoch": 0.1, - "learning_rate": 4.790419161676647e-07, - "logits/chosen": 0.0973881259560585, - "logits/rejected": 0.18290364742279053, - "logps/chosen": -333.90106201171875, - "logps/rejected": -289.8985290527344, - "loss": 0.3516, + "epoch": 0.17, + "learning_rate": 8.333333333333333e-07, + "logits/chosen": 0.38054439425468445, + "logits/rejected": 0.42509716749191284, + "logps/chosen": -306.13409423828125, + "logps/rejected": -310.5857849121094, + "loss": 0.6474, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.024836184456944466, - "rewards/margins": 0.036705560982227325, - "rewards/rejected": -0.011869379319250584, + "rewards/chosen": -0.11951945722103119, + "rewards/margins": 0.18832235038280487, + "rewards/rejected": -0.30784183740615845, "step": 80 }, { - "epoch": 0.11, - "learning_rate": 5.389221556886228e-07, - "logits/chosen": 0.07697711139917374, - "logits/rejected": 0.13395674526691437, - "logps/chosen": -350.3150939941406, - "logps/rejected": -295.5214538574219, - "loss": 0.3653, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 0.012771876528859138, - "rewards/margins": 0.04415798559784889, - "rewards/rejected": -0.031386107206344604, + "epoch": 0.19, + "learning_rate": 9.374999999999999e-07, + "logits/chosen": 0.3940204977989197, + "logits/rejected": 0.5040494203567505, + "logps/chosen": -303.87579345703125, + "logps/rejected": -295.0137023925781, + "loss": 0.6365, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.1944989413022995, + "rewards/margins": 0.25527024269104004, + "rewards/rejected": -0.44976916909217834, "step": 90 }, { - "epoch": 0.12, - "learning_rate": 5.988023952095807e-07, - "logits/chosen": 0.13352516293525696, - "logits/rejected": 0.18881280720233917, - "logps/chosen": -359.6278076171875, - "logps/rejected": -266.6817321777344, - "loss": 0.3554, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.03511623665690422, - "rewards/margins": 0.09815546870231628, - "rewards/rejected": -0.06303922832012177, + "epoch": 0.21, + "learning_rate": 9.999463737538052e-07, + "logits/chosen": 0.33193686604499817, + "logits/rejected": 0.4174256920814514, + "logps/chosen": -339.90606689453125, + "logps/rejected": -321.69439697265625, + "loss": 0.6209, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3108145296573639, + "rewards/margins": 0.22784237563610077, + "rewards/rejected": -0.5386568307876587, "step": 100 }, { - "epoch": 0.13, - "learning_rate": 6.586826347305389e-07, - "logits/chosen": 0.0884179100394249, - "logits/rejected": 0.21645644307136536, - "logps/chosen": -361.37237548828125, - "logps/rejected": -261.2263488769531, - "loss": 0.3636, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.05635715276002884, - "rewards/margins": 0.11102348566055298, - "rewards/rejected": -0.05466633290052414, + "epoch": 0.21, + "eval_logits/chosen": 0.3388860821723938, + "eval_logits/rejected": 0.39094123244285583, + "eval_logps/chosen": -329.033203125, + "eval_logps/rejected": -375.22589111328125, + "eval_loss": 0.6259796023368835, + "eval_rewards/accuracies": 0.6953125, + "eval_rewards/chosen": -0.2544853985309601, + "eval_rewards/margins": 0.38152238726615906, + "eval_rewards/rejected": -0.6360077857971191, + "eval_runtime": 66.4403, + "eval_samples_per_second": 30.102, + "eval_steps_per_second": 0.482, + "step": 100 + }, + { + "epoch": 0.23, + "learning_rate": 9.993432105822034e-07, + "logits/chosen": 0.24386301636695862, + "logits/rejected": 0.31328675150871277, + "logps/chosen": -339.8066711425781, + "logps/rejected": -345.63275146484375, + "loss": 0.618, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.30914023518562317, + "rewards/margins": 0.2655971348285675, + "rewards/rejected": -0.5747373700141907, "step": 110 }, { - "epoch": 0.14, - "learning_rate": 7.18562874251497e-07, - "logits/chosen": 0.18063569068908691, - "logits/rejected": 0.266376793384552, - "logps/chosen": -325.956787109375, - "logps/rejected": -271.6441345214844, - "loss": 0.3595, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.0006754301721230149, - "rewards/margins": 0.04642491415143013, - "rewards/rejected": -0.04710034281015396, + "epoch": 0.25, + "learning_rate": 9.980706626858607e-07, + "logits/chosen": 0.12805965542793274, + "logits/rejected": 0.18846169114112854, + "logps/chosen": -358.5738525390625, + "logps/rejected": -370.335693359375, + "loss": 0.5994, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3086175322532654, + "rewards/margins": 0.34092435240745544, + "rewards/rejected": -0.6495418548583984, "step": 120 }, { - "epoch": 0.16, - "learning_rate": 7.784431137724551e-07, - "logits/chosen": 0.14518184959888458, - "logits/rejected": 0.22254076600074768, - "logps/chosen": -319.9495544433594, - "logps/rejected": -263.5156555175781, - "loss": 0.3544, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.0138475950807333, - "rewards/margins": 0.12771600484848022, - "rewards/rejected": -0.11386840045452118, + "epoch": 0.27, + "learning_rate": 9.961304359538434e-07, + "logits/chosen": 0.1531093418598175, + "logits/rejected": 0.2172633707523346, + "logps/chosen": -346.32293701171875, + "logps/rejected": -343.2093200683594, + "loss": 0.5942, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.39903703331947327, + "rewards/margins": 0.2545042037963867, + "rewards/rejected": -0.6535412073135376, "step": 130 }, { - "epoch": 0.17, - "learning_rate": 8.38323353293413e-07, - "logits/chosen": 0.12264908850193024, - "logits/rejected": 0.18698899447917938, - "logps/chosen": -313.93426513671875, - "logps/rejected": -276.4668273925781, - "loss": 0.3494, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.01783711649477482, - "rewards/margins": 0.10452733933925629, - "rewards/rejected": -0.08669020980596542, + "epoch": 0.29, + "learning_rate": 9.935251313189563e-07, + "logits/chosen": 0.16342206299304962, + "logits/rejected": 0.2908289134502411, + "logps/chosen": -347.50091552734375, + "logps/rejected": -348.30712890625, + "loss": 0.5752, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.48607850074768066, + "rewards/margins": 0.3913617730140686, + "rewards/rejected": -0.8774402737617493, "step": 140 }, { - "epoch": 0.18, - "learning_rate": 8.982035928143712e-07, - "logits/chosen": 0.09591711312532425, - "logits/rejected": 0.17884746193885803, - "logps/chosen": -350.5014343261719, - "logps/rejected": -266.41552734375, - "loss": 0.3383, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.0012602738570421934, - "rewards/margins": 0.14207753539085388, - "rewards/rejected": -0.14081726968288422, + "epoch": 0.31, + "learning_rate": 9.902582412711118e-07, + "logits/chosen": 0.26018935441970825, + "logits/rejected": 0.3341614305973053, + "logps/chosen": -387.11895751953125, + "logps/rejected": -399.8661804199219, + "loss": 0.5578, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5754287242889404, + "rewards/margins": 0.44557419419288635, + "rewards/rejected": -1.0210028886795044, "step": 150 }, { - "epoch": 0.19, - "learning_rate": 9.580838323353293e-07, - "logits/chosen": 0.0989338606595993, - "logits/rejected": 0.13404981791973114, - "logps/chosen": -316.9004821777344, - "logps/rejected": -312.2667236328125, - "loss": 0.3203, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.0687781348824501, - "rewards/margins": 0.09935374557971954, - "rewards/rejected": -0.16813188791275024, + "epoch": 0.33, + "learning_rate": 9.86334145175542e-07, + "logits/chosen": 0.2852904796600342, + "logits/rejected": 0.4398563802242279, + "logps/chosen": -398.2310791015625, + "logps/rejected": -403.6026611328125, + "loss": 0.593, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7268518209457397, + "rewards/margins": 0.5443054437637329, + "rewards/rejected": -1.2711572647094727, "step": 160 }, { - "epoch": 0.2, - "learning_rate": 9.999901172555115e-07, - "logits/chosen": 0.15517865121364594, - "logits/rejected": 0.20303210616111755, - "logps/chosen": -282.88330078125, - "logps/rejected": -263.9994812011719, - "loss": 0.3198, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.11205162107944489, - "rewards/margins": 0.10533325374126434, - "rewards/rejected": -0.21738485991954803, + "epoch": 0.36, + "learning_rate": 9.817581034021272e-07, + "logits/chosen": 0.1794786900281906, + "logits/rejected": 0.33308178186416626, + "logps/chosen": -407.6117248535156, + "logps/rejected": -413.76080322265625, + "loss": 0.5684, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.44304603338241577, + "rewards/margins": 0.4069553017616272, + "rewards/rejected": -0.8500012159347534, "step": 170 }, { - "epoch": 0.22, - "learning_rate": 9.998144348880984e-07, - "logits/chosen": 0.12583962082862854, - "logits/rejected": 0.18330255150794983, - "logps/chosen": -332.62225341796875, - "logps/rejected": -299.07421875, - "loss": 0.3114, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.10971450805664062, - "rewards/margins": 0.15495118498802185, - "rewards/rejected": -0.2646656930446625, + "epoch": 0.38, + "learning_rate": 9.765362502737097e-07, + "logits/chosen": 0.42060035467147827, + "logits/rejected": 0.49758368730545044, + "logps/chosen": -387.24993896484375, + "logps/rejected": -434.32861328125, + "loss": 0.5524, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6233189702033997, + "rewards/margins": 0.6569790244102478, + "rewards/rejected": -1.2802979946136475, "step": 180 }, { - "epoch": 0.23, - "learning_rate": 9.994192247951515e-07, - "logits/chosen": 0.14090900123119354, - "logits/rejected": 0.1716761291027069, - "logps/chosen": -332.7867431640625, - "logps/rejected": -332.4990539550781, - "loss": 0.2964, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.17104236781597137, - "rewards/margins": 0.13000008463859558, - "rewards/rejected": -0.30104246735572815, + "epoch": 0.4, + "learning_rate": 9.706755858428485e-07, + "logits/chosen": 0.3799760341644287, + "logits/rejected": 0.4900673031806946, + "logps/chosen": -379.1689453125, + "logps/rejected": -424.69384765625, + "loss": 0.5687, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6506041884422302, + "rewards/margins": 0.5374189019203186, + "rewards/rejected": -1.1880230903625488, "step": 190 }, { - "epoch": 0.24, - "learning_rate": 9.988046605602389e-07, - "logits/chosen": 0.1038103699684143, - "logits/rejected": 0.15158522129058838, - "logps/chosen": -353.9814758300781, - "logps/rejected": -326.39556884765625, - "loss": 0.2851, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.19866228103637695, - "rewards/margins": 0.3053509593009949, - "rewards/rejected": -0.504013180732727, + "epoch": 0.42, + "learning_rate": 9.641839665080363e-07, + "logits/chosen": 0.46871694922447205, + "logits/rejected": 0.636328935623169, + "logps/chosen": -360.5708923339844, + "logps/rejected": -375.3180847167969, + "loss": 0.5447, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5866760611534119, + "rewards/margins": 0.48661094903945923, + "rewards/rejected": -1.0732871294021606, "step": 200 }, { - "epoch": 0.25, - "learning_rate": 9.979710121113161e-07, - "logits/chosen": 0.07421442121267319, - "logits/rejected": 0.10481990873813629, - "logps/chosen": -354.7379455566406, - "logps/rejected": -324.5375061035156, - "loss": 0.2649, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.1955946683883667, - "rewards/margins": 0.20845285058021545, - "rewards/rejected": -0.40404754877090454, + "epoch": 0.42, + "eval_logits/chosen": 0.5112473964691162, + "eval_logits/rejected": 0.5762841105461121, + "eval_logps/chosen": -365.8961486816406, + "eval_logps/rejected": -452.6488952636719, + "eval_loss": 0.5506153702735901, + "eval_rewards/accuracies": 0.78515625, + "eval_rewards/chosen": -0.6231149435043335, + "eval_rewards/margins": 0.787122905254364, + "eval_rewards/rejected": -1.4102377891540527, + "eval_runtime": 65.0437, + "eval_samples_per_second": 30.749, + "eval_steps_per_second": 0.492, + "step": 200 + }, + { + "epoch": 0.44, + "learning_rate": 9.570700944819582e-07, + "logits/chosen": 0.4811934530735016, + "logits/rejected": 0.5919052362442017, + "logps/chosen": -371.3291015625, + "logps/rejected": -403.1416015625, + "loss": 0.5727, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7050508260726929, + "rewards/margins": 0.6323877573013306, + "rewards/rejected": -1.337438702583313, "step": 210 }, { - "epoch": 0.26, - "learning_rate": 9.969186456021698e-07, - "logits/chosen": 0.09808576852083206, - "logits/rejected": 0.16462191939353943, - "logps/chosen": -335.7714538574219, - "logps/rejected": -331.1880798339844, - "loss": 0.2537, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.23376011848449707, - "rewards/margins": 0.29557761549949646, - "rewards/rejected": -0.5293377637863159, + "epoch": 0.46, + "learning_rate": 9.493435061259129e-07, + "logits/chosen": 0.3835849165916443, + "logits/rejected": 0.5547031164169312, + "logps/chosen": -366.06134033203125, + "logps/rejected": -362.992919921875, + "loss": 0.5383, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4402705132961273, + "rewards/margins": 0.5309184193611145, + "rewards/rejected": -0.9711888432502747, "step": 220 }, { - "epoch": 0.28, - "learning_rate": 9.956480232515958e-07, - "logits/chosen": 0.08053131401538849, - "logits/rejected": 0.2201889455318451, - "logps/chosen": -398.9234619140625, - "logps/rejected": -311.8883361816406, - "loss": 0.2271, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.2779925465583801, - "rewards/margins": 0.2911529839038849, - "rewards/rejected": -0.5691455006599426, + "epoch": 0.48, + "learning_rate": 9.4101455916603e-07, + "logits/chosen": 0.3896172046661377, + "logits/rejected": 0.4787639081478119, + "logps/chosen": -378.92718505859375, + "logps/rejected": -454.00848388671875, + "loss": 0.5109, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7004601359367371, + "rewards/margins": 0.8989294767379761, + "rewards/rejected": -1.5993895530700684, "step": 230 }, { - "epoch": 0.29, - "learning_rate": 9.941597031403838e-07, - "logits/chosen": 0.05627553537487984, - "logits/rejected": 0.22260212898254395, - "logps/chosen": -429.3963928222656, - "logps/rejected": -313.8590393066406, - "loss": 0.2285, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.370193213224411, - "rewards/margins": 0.20042189955711365, - "rewards/rejected": -0.5706151127815247, + "epoch": 0.5, + "learning_rate": 9.320944188084241e-07, + "logits/chosen": 0.30041638016700745, + "logits/rejected": 0.4409019351005554, + "logps/chosen": -457.48736572265625, + "logps/rejected": -434.3902893066406, + "loss": 0.5443, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7445753216743469, + "rewards/margins": 0.6970599293708801, + "rewards/rejected": -1.4416351318359375, "step": 240 }, { - "epoch": 0.3, - "learning_rate": 9.924543389661986e-07, - "logits/chosen": 0.11179877817630768, - "logits/rejected": 0.19428351521492004, - "logps/chosen": -391.66510009765625, - "logps/rejected": -318.9273376464844, - "loss": 0.2167, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.43162259459495544, - "rewards/margins": 0.27592721581459045, - "rewards/rejected": -0.7075497508049011, + "epoch": 0.52, + "learning_rate": 9.225950427718974e-07, + "logits/chosen": 0.2954685389995575, + "logits/rejected": 0.38950592279434204, + "logps/chosen": -369.9661560058594, + "logps/rejected": -399.80255126953125, + "loss": 0.5482, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5683563947677612, + "rewards/margins": 0.6092018485069275, + "rewards/rejected": -1.1775583028793335, "step": 250 }, { - "epoch": 0.31, - "learning_rate": 9.905326797564637e-07, - "logits/chosen": 0.11460906267166138, - "logits/rejected": 0.19496436417102814, - "logps/chosen": -373.25457763671875, - "logps/rejected": -342.4559020996094, - "loss": 0.2219, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.5576636791229248, - "rewards/margins": 0.3388262391090393, - "rewards/rejected": -0.8964899182319641, + "epoch": 0.54, + "learning_rate": 9.125291652582547e-07, + "logits/chosen": 0.25557464361190796, + "logits/rejected": 0.28696995973587036, + "logps/chosen": -393.4580383300781, + "logps/rejected": -450.3002014160156, + "loss": 0.5323, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8674219250679016, + "rewards/margins": 0.8023292422294617, + "rewards/rejected": -1.6697509288787842, "step": 260 }, { - "epoch": 0.32, - "learning_rate": 9.883955695393743e-07, - "logits/chosen": 0.07489059120416641, - "logits/rejected": 0.1965363323688507, - "logps/chosen": -456.4065856933594, - "logps/rejected": -375.62841796875, - "loss": 0.1962, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.47448381781578064, - "rewards/margins": 0.28402233123779297, - "rewards/rejected": -0.758506178855896, + "epoch": 0.57, + "learning_rate": 9.019102798817195e-07, + "logits/chosen": 0.32073482871055603, + "logits/rejected": 0.3113810420036316, + "logps/chosen": -385.3840637207031, + "logps/rejected": -446.5083923339844, + "loss": 0.5209, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9727315902709961, + "rewards/margins": 0.7288861274719238, + "rewards/rejected": -1.7016175985336304, "step": 270 }, { - "epoch": 0.34, - "learning_rate": 9.860439469731857e-07, - "logits/chosen": 0.08477760851383209, - "logits/rejected": 0.18167419731616974, - "logps/chosen": -363.19488525390625, - "logps/rejected": -343.9361877441406, - "loss": 0.1958, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.5575977563858032, - "rewards/margins": 0.1699770838022232, - "rewards/rejected": -0.7275748252868652, + "epoch": 0.59, + "learning_rate": 8.90752621580335e-07, + "logits/chosen": 0.2364472895860672, + "logits/rejected": 0.3129078447818756, + "logps/chosen": -436.9453125, + "logps/rejected": -469.55450439453125, + "loss": 0.5313, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0823299884796143, + "rewards/margins": 0.7604998350143433, + "rewards/rejected": -1.842829942703247, "step": 280 }, { - "epoch": 0.35, - "learning_rate": 9.834788449339357e-07, - "logits/chosen": 0.07804753631353378, - "logits/rejected": 0.10795494168996811, - "logps/chosen": -339.80242919921875, - "logps/rejected": -364.9732360839844, - "loss": 0.1985, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.6320484280586243, - "rewards/margins": 0.27737244963645935, - "rewards/rejected": -0.9094208478927612, + "epoch": 0.61, + "learning_rate": 8.79071147533597e-07, + "logits/chosen": 0.225694939494133, + "logits/rejected": 0.2702820301055908, + "logps/chosen": -382.134033203125, + "logps/rejected": -445.76873779296875, + "loss": 0.5355, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.5520836114883423, + "rewards/margins": 0.6939669847488403, + "rewards/rejected": -1.2460505962371826, "step": 290 }, { - "epoch": 0.36, - "learning_rate": 9.807013900617874e-07, - "logits/chosen": 0.04762539267539978, - "logits/rejected": 0.06572198867797852, - "logps/chosen": -371.88922119140625, - "logps/rejected": -344.0954284667969, - "loss": 0.2034, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.5951868295669556, - "rewards/margins": 0.362405002117157, - "rewards/rejected": -0.9575918912887573, + "epoch": 0.63, + "learning_rate": 8.668815171119019e-07, + "logits/chosen": 0.2409524917602539, + "logits/rejected": 0.2485930621623993, + "logps/chosen": -370.5341796875, + "logps/rejected": -434.1331481933594, + "loss": 0.5257, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7270389795303345, + "rewards/margins": 0.655768096446991, + "rewards/rejected": -1.3828070163726807, "step": 300 }, { - "epoch": 0.37, - "learning_rate": 9.777128022661876e-07, - "logits/chosen": 0.021948417648673058, - "logits/rejected": 0.0974685400724411, - "logps/chosen": -377.27667236328125, - "logps/rejected": -320.70843505859375, - "loss": 0.2046, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.5508317947387695, - "rewards/margins": 0.27007168531417847, - "rewards/rejected": -0.8209035992622375, + "epoch": 0.63, + "eval_logits/chosen": 0.2655918300151825, + "eval_logits/rejected": 0.2916402518749237, + "eval_logps/chosen": -397.6915283203125, + "eval_logps/rejected": -507.246826171875, + "eval_loss": 0.5325405597686768, + "eval_rewards/accuracies": 0.76171875, + "eval_rewards/chosen": -0.9410690069198608, + "eval_rewards/margins": 1.015148401260376, + "eval_rewards/rejected": -1.9562172889709473, + "eval_runtime": 66.9601, + "eval_samples_per_second": 29.869, + "eval_steps_per_second": 0.478, + "step": 300 + }, + { + "epoch": 0.65, + "learning_rate": 8.54200070884685e-07, + "logits/chosen": 0.30960172414779663, + "logits/rejected": 0.3277510702610016, + "logps/chosen": -356.87579345703125, + "logps/rejected": -420.3736877441406, + "loss": 0.5308, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7447237372398376, + "rewards/margins": 0.7556108832359314, + "rewards/rejected": -1.5003347396850586, "step": 310 }, { - "epoch": 0.38, - "learning_rate": 9.745143941900607e-07, - "logits/chosen": -0.05026810243725777, - "logits/rejected": -0.028773341327905655, - "logps/chosen": -397.15936279296875, - "logps/rejected": -438.9103088378906, - "loss": 0.1889, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.7560676336288452, - "rewards/margins": 0.41686925292015076, - "rewards/rejected": -1.172937035560608, + "epoch": 0.67, + "learning_rate": 8.410438087153911e-07, + "logits/chosen": 0.30577388405799866, + "logits/rejected": 0.3105422258377075, + "logps/chosen": -380.59344482421875, + "logps/rejected": -451.85919189453125, + "loss": 0.5101, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.856540322303772, + "rewards/margins": 0.8256477117538452, + "rewards/rejected": -1.682187795639038, "step": 320 }, { - "epoch": 0.4, - "learning_rate": 9.711075706332709e-07, - "logits/chosen": -0.04232814535498619, - "logits/rejected": 0.0720619410276413, - "logps/chosen": -409.37701416015625, - "logps/rejected": -379.40960693359375, - "loss": 0.1766, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.7803069353103638, - "rewards/margins": 0.3621472716331482, - "rewards/rejected": -1.1424543857574463, + "epoch": 0.69, + "learning_rate": 8.274303669726426e-07, + "logits/chosen": 0.15973469614982605, + "logits/rejected": 0.09850945323705673, + "logps/chosen": -430.9588928222656, + "logps/rejected": -490.13250732421875, + "loss": 0.5328, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0572891235351562, + "rewards/margins": 0.730115532875061, + "rewards/rejected": -1.7874046564102173, "step": 330 }, { - "epoch": 0.41, - "learning_rate": 9.674938279356085e-07, - "logits/chosen": 0.008946272544562817, - "logits/rejected": 0.05618705600500107, - "logps/chosen": -401.8056335449219, - "logps/rejected": -350.46331787109375, - "loss": 0.1868, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.7637485265731812, - "rewards/margins": 0.30835336446762085, - "rewards/rejected": -1.0721018314361572, + "epoch": 0.71, + "learning_rate": 8.133779948881513e-07, + "logits/chosen": 0.2449232041835785, + "logits/rejected": 0.23727181553840637, + "logps/chosen": -361.06561279296875, + "logps/rejected": -415.7433166503906, + "loss": 0.5382, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.7502766847610474, + "rewards/margins": 0.702269434928894, + "rewards/rejected": -1.4525461196899414, "step": 340 }, { - "epoch": 0.42, - "learning_rate": 9.636747533195696e-07, - "logits/chosen": -0.005753317382186651, - "logits/rejected": 0.10436991602182388, - "logps/chosen": -375.9329528808594, - "logps/rejected": -351.3202209472656, - "loss": 0.1826, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.6658205986022949, - "rewards/margins": 0.3570438027381897, - "rewards/rejected": -1.0228643417358398, + "epoch": 0.73, + "learning_rate": 7.989055300930704e-07, + "logits/chosen": 0.15988311171531677, + "logits/rejected": 0.13738436996936798, + "logps/chosen": -386.3119201660156, + "logps/rejected": -460.19012451171875, + "loss": 0.524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8789035081863403, + "rewards/margins": 0.6686772108078003, + "rewards/rejected": -1.547580599784851, "step": 350 }, { - "epoch": 0.43, - "learning_rate": 9.596520241932198e-07, - "logits/chosen": -0.10068678855895996, - "logits/rejected": -0.0736171081662178, - "logps/chosen": -413.36505126953125, - "logps/rejected": -384.11761474609375, - "loss": 0.1781, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.6729551553726196, - "rewards/margins": 0.567957878112793, - "rewards/rejected": -1.2409130334854126, + "epoch": 0.75, + "learning_rate": 7.840323733655778e-07, + "logits/chosen": 0.12602314352989197, + "logits/rejected": 0.1656724512577057, + "logps/chosen": -409.4278869628906, + "logps/rejected": -502.9934997558594, + "loss": 0.5049, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9467867612838745, + "rewards/margins": 0.9048227071762085, + "rewards/rejected": -1.8516095876693726, "step": 360 }, { - "epoch": 0.44, - "learning_rate": 9.554274074134438e-07, - "logits/chosen": -0.11362670361995697, - "logits/rejected": 0.022236399352550507, - "logps/chosen": -431.1710510253906, - "logps/rejected": -389.5150451660156, - "loss": 0.1684, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.9115616083145142, - "rewards/margins": 0.35060685873031616, - "rewards/rejected": -1.262168526649475, + "epoch": 0.77, + "learning_rate": 7.687784626235447e-07, + "logits/chosen": 0.14312385022640228, + "logits/rejected": 0.20448152720928192, + "logps/chosen": -431.67034912109375, + "logps/rejected": -471.8050231933594, + "loss": 0.5492, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0165982246398926, + "rewards/margins": 0.7825287580490112, + "rewards/rejected": -1.7991268634796143, "step": 370 }, { - "epoch": 0.46, - "learning_rate": 9.510027585099106e-07, - "logits/chosen": -0.11619666963815689, - "logits/rejected": -0.10023369640111923, - "logps/chosen": -390.26239013671875, - "logps/rejected": -380.4273376464844, - "loss": 0.1742, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.8604833483695984, - "rewards/margins": 0.4251517355442047, - "rewards/rejected": -1.2856351137161255, + "epoch": 0.8, + "learning_rate": 7.531642461971514e-07, + "logits/chosen": 0.2566662132740021, + "logits/rejected": 0.27717217803001404, + "logps/chosen": -353.5281066894531, + "logps/rejected": -436.3070373535156, + "loss": 0.5311, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.795007586479187, + "rewards/margins": 0.7797006964683533, + "rewards/rejected": -1.5747082233428955, "step": 380 }, { - "epoch": 0.47, - "learning_rate": 9.463800208700903e-07, - "logits/chosen": -0.13372397422790527, - "logits/rejected": -0.04371088743209839, - "logps/chosen": -406.59423828125, - "logps/rejected": -367.8764953613281, - "loss": 0.1872, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.8710261583328247, - "rewards/margins": 0.2594194710254669, - "rewards/rejected": -1.1304455995559692, + "epoch": 0.82, + "learning_rate": 7.372106554172801e-07, + "logits/chosen": 0.19471058249473572, + "logits/rejected": 0.25721031427383423, + "logps/chosen": -352.6814270019531, + "logps/rejected": -415.6796875, + "loss": 0.5278, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6281698346138, + "rewards/margins": 0.6472196578979492, + "rewards/rejected": -1.2753894329071045, "step": 390 }, { - "epoch": 0.48, - "learning_rate": 9.415612248856824e-07, - "logits/chosen": -0.059288300573825836, - "logits/rejected": -0.031034788116812706, - "logps/chosen": -350.6271057128906, - "logps/rejected": -349.37396240234375, - "loss": 0.2133, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.6463299989700317, - "rewards/margins": 0.44987383484840393, - "rewards/rejected": -1.0962039232254028, + "epoch": 0.84, + "learning_rate": 7.209390765564318e-07, + "logits/chosen": 0.16345885396003723, + "logits/rejected": 0.08809966593980789, + "logps/chosen": -390.4371643066406, + "logps/rejected": -480.34771728515625, + "loss": 0.5016, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0055781602859497, + "rewards/margins": 0.9243377447128296, + "rewards/rejected": -1.9299157857894897, "step": 400 }, { - "epoch": 0.49, - "learning_rate": 9.365484870608296e-07, - "logits/chosen": -0.13688340783119202, - "logits/rejected": -0.1447652131319046, - "logps/chosen": -356.8211364746094, - "logps/rejected": -380.91314697265625, - "loss": 0.2025, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.6279365420341492, - "rewards/margins": 0.3849129378795624, - "rewards/rejected": -1.0128495693206787, + "epoch": 0.84, + "eval_logits/chosen": 0.15811440348625183, + "eval_logits/rejected": 0.13012486696243286, + "eval_logps/chosen": -417.2896423339844, + "eval_logps/rejected": -539.0118408203125, + "eval_loss": 0.5168445110321045, + "eval_rewards/accuracies": 0.79296875, + "eval_rewards/chosen": -1.137049913406372, + "eval_rewards/margins": 1.136817216873169, + "eval_rewards/rejected": -2.273867130279541, + "eval_runtime": 66.6257, + "eval_samples_per_second": 30.018, + "eval_steps_per_second": 0.48, + "step": 400 + }, + { + "epoch": 0.86, + "learning_rate": 7.043713221597773e-07, + "logits/chosen": 0.014328557066619396, + "logits/rejected": 0.07505010813474655, + "logps/chosen": -482.8560485839844, + "logps/rejected": -536.3275756835938, + "loss": 0.5295, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4197783470153809, + "rewards/margins": 0.8773676753044128, + "rewards/rejected": -2.2971460819244385, "step": 410 }, { - "epoch": 0.5, - "learning_rate": 9.313440090825118e-07, - "logits/chosen": -0.12175603210926056, - "logits/rejected": -0.05414363741874695, - "logps/chosen": -396.71197509765625, - "logps/rejected": -352.76312255859375, - "loss": 0.185, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.8293508291244507, - "rewards/margins": 0.2594980001449585, - "rewards/rejected": -1.0888488292694092, + "epoch": 0.88, + "learning_rate": 6.875296018047809e-07, + "logits/chosen": 0.046616051346063614, + "logits/rejected": 0.05625314265489578, + "logps/chosen": -431.9371643066406, + "logps/rejected": -498.2359924316406, + "loss": 0.5346, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.14110267162323, + "rewards/margins": 0.6319513320922852, + "rewards/rejected": -1.7730541229248047, "step": 420 }, { - "epoch": 0.52, - "learning_rate": 9.259500768535226e-07, - "logits/chosen": -0.1462847888469696, - "logits/rejected": -0.1344456970691681, - "logps/chosen": -405.64959716796875, - "logps/rejected": -365.44989013671875, - "loss": 0.1819, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.7200776934623718, - "rewards/margins": 0.4672032296657562, - "rewards/rejected": -1.1872810125350952, + "epoch": 0.9, + "learning_rate": 6.704364923285857e-07, + "logits/chosen": 0.12441425025463104, + "logits/rejected": 0.273973673582077, + "logps/chosen": -408.7701110839844, + "logps/rejected": -417.74053955078125, + "loss": 0.4996, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0271834135055542, + "rewards/margins": 0.705409049987793, + "rewards/rejected": -1.7325923442840576, "step": 430 }, { - "epoch": 0.53, - "learning_rate": 9.203690594884599e-07, - "logits/chosen": -0.20732097327709198, - "logits/rejected": -0.10968241840600967, - "logps/chosen": -454.0621643066406, - "logps/rejected": -413.76629638671875, - "loss": 0.1467, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.8635724782943726, - "rewards/margins": 0.43014320731163025, - "rewards/rejected": -1.2937157154083252, + "epoch": 0.92, + "learning_rate": 6.531149075630796e-07, + "logits/chosen": 0.03405492380261421, + "logits/rejected": 0.1526843011379242, + "logps/chosen": -446.88970947265625, + "logps/rejected": -494.64923095703125, + "loss": 0.5164, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2411255836486816, + "rewards/margins": 0.8954195976257324, + "rewards/rejected": -2.136545181274414, "step": 440 }, { - "epoch": 0.54, - "learning_rate": 9.146034082731666e-07, - "logits/chosen": -0.24053998291492462, - "logits/rejected": -0.10800532251596451, - "logps/chosen": -424.5973205566406, - "logps/rejected": -410.4818420410156, - "loss": 0.1682, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.041223168373108, - "rewards/margins": 0.5180162191390991, - "rewards/rejected": -1.559239387512207, + "epoch": 0.94, + "learning_rate": 6.355880676182085e-07, + "logits/chosen": 0.09311509132385254, + "logits/rejected": 0.06277544796466827, + "logps/chosen": -402.934326171875, + "logps/rejected": -523.3145751953125, + "loss": 0.5185, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0525462627410889, + "rewards/margins": 1.0419820547103882, + "rewards/rejected": -2.0945284366607666, "step": 450 }, { - "epoch": 0.55, - "learning_rate": 9.086556555880808e-07, - "logits/chosen": -0.2650529742240906, - "logits/rejected": -0.13450463116168976, - "logps/chosen": -446.36468505859375, - "logps/rejected": -422.0096130371094, - "loss": 0.1489, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1759449243545532, - "rewards/margins": 0.5576841235160828, - "rewards/rejected": -1.7336289882659912, + "epoch": 0.96, + "learning_rate": 6.178794677547137e-07, + "logits/chosen": 0.1986207216978073, + "logits/rejected": 0.24227669835090637, + "logps/chosen": -439.70208740234375, + "logps/rejected": -479.1068420410156, + "loss": 0.5149, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.960884690284729, + "rewards/margins": 0.8626872897148132, + "rewards/rejected": -1.8235719203948975, "step": 460 }, { - "epoch": 0.56, - "learning_rate": 9.025284137959672e-07, - "logits/chosen": -0.08063942193984985, - "logits/rejected": -0.08374373614788055, - "logps/chosen": -368.17987060546875, - "logps/rejected": -371.68597412109375, - "loss": 0.1861, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.8910188674926758, - "rewards/margins": 0.3901366889476776, - "rewards/rejected": -1.2811555862426758, + "epoch": 0.98, + "learning_rate": 6.000128468880222e-07, + "logits/chosen": 0.21710078418254852, + "logits/rejected": 0.21177777647972107, + "logps/chosen": -434.6083068847656, + "logps/rejected": -511.0328063964844, + "loss": 0.5112, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.0398690700531006, + "rewards/margins": 0.9446641802787781, + "rewards/rejected": -1.9845333099365234, "step": 470 }, { - "epoch": 0.58, - "learning_rate": 8.962243740945193e-07, - "logits/chosen": -0.13440537452697754, - "logits/rejected": -0.04218355566263199, - "logps/chosen": -416.8321228027344, - "logps/rejected": -364.4325866699219, - "loss": 0.2108, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.6111631393432617, - "rewards/margins": 0.44943752884864807, - "rewards/rejected": -1.0606005191802979, + "epoch": 1.0, + "learning_rate": 5.820121557655108e-07, + "logits/chosen": 0.1629343330860138, + "logits/rejected": 0.19159266352653503, + "logps/chosen": -446.3037109375, + "logps/rejected": -550.802734375, + "loss": 0.4919, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.040998101234436, + "rewards/margins": 1.0897653102874756, + "rewards/rejected": -2.130763292312622, "step": 480 }, { - "epoch": 0.59, - "learning_rate": 8.897463053343362e-07, - "logits/chosen": -0.15430530905723572, - "logits/rejected": -0.07277049124240875, - "logps/chosen": -382.2574462890625, - "logps/rejected": -377.4708557128906, - "loss": 0.189, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.6694794297218323, - "rewards/margins": 0.4132021367549896, - "rewards/rejected": -1.08268141746521, + "epoch": 1.03, + "learning_rate": 5.639015248598023e-07, + "logits/chosen": 0.18678539991378784, + "logits/rejected": 0.09687422960996628, + "logps/chosen": -393.89947509765625, + "logps/rejected": -537.8324584960938, + "loss": 0.3439, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -0.9865323901176453, + "rewards/margins": 1.5319825410842896, + "rewards/rejected": -2.518515110015869, "step": 490 }, { - "epoch": 0.6, - "learning_rate": 8.83097052802791e-07, - "logits/chosen": -0.16823723912239075, - "logits/rejected": -0.05930133908987045, - "logps/chosen": -431.1293029785156, - "logps/rejected": -416.7881774902344, - "loss": 0.1843, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7873836159706116, - "rewards/margins": 0.39316868782043457, - "rewards/rejected": -1.1805522441864014, + "epoch": 1.05, + "learning_rate": 5.457052320211339e-07, + "logits/chosen": 0.05529998615384102, + "logits/rejected": 0.05775626748800278, + "logps/chosen": -478.3514709472656, + "logps/rejected": -619.7296142578125, + "loss": 0.3557, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.3684320449829102, + "rewards/margins": 1.7347371578216553, + "rewards/rejected": -3.1031689643859863, "step": 500 }, { - "epoch": 0.61, - "learning_rate": 8.762795369743302e-07, - "logits/chosen": -0.11715607345104218, - "logits/rejected": -0.08342987298965454, - "logps/chosen": -378.78887939453125, - "logps/rejected": -364.96392822265625, - "loss": 0.1598, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.930017352104187, - "rewards/margins": 0.35262879729270935, - "rewards/rejected": -1.2826461791992188, + "epoch": 1.05, + "eval_logits/chosen": 0.19708794355392456, + "eval_logits/rejected": 0.16058923304080963, + "eval_logps/chosen": -463.1487731933594, + "eval_logps/rejected": -613.962646484375, + "eval_loss": 0.5232166051864624, + "eval_rewards/accuracies": 0.78515625, + "eval_rewards/chosen": -1.5956411361694336, + "eval_rewards/margins": 1.4277347326278687, + "eval_rewards/rejected": -3.0233757495880127, + "eval_runtime": 65.6922, + "eval_samples_per_second": 30.445, + "eval_steps_per_second": 0.487, + "step": 500 + }, + { + "epoch": 1.07, + "learning_rate": 5.274476699321637e-07, + "logits/chosen": 0.0674450471997261, + "logits/rejected": 0.08521180599927902, + "logps/chosen": -445.9688415527344, + "logps/rejected": -562.4978637695312, + "loss": 0.3544, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.3177635669708252, + "rewards/margins": 1.4718105792999268, + "rewards/rejected": -2.789574146270752, "step": 510 }, { - "epoch": 0.62, - "learning_rate": 8.692967522277452e-07, - "logits/chosen": -0.2919595241546631, - "logits/rejected": -0.13776831328868866, - "logps/chosen": -471.62213134765625, - "logps/rejected": -412.658935546875, - "loss": 0.151, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.0413435697555542, - "rewards/margins": 0.5299603343009949, - "rewards/rejected": -1.5713039636611938, + "epoch": 1.09, + "learning_rate": 5.091533134088387e-07, + "logits/chosen": 0.05661384016275406, + "logits/rejected": 0.062000591307878494, + "logps/chosen": -445.4009704589844, + "logps/rejected": -600.1541748046875, + "loss": 0.3416, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.33919358253479, + "rewards/margins": 1.8340098857879639, + "rewards/rejected": -3.173203229904175, "step": 520 }, { - "epoch": 0.64, - "learning_rate": 8.621517655309871e-07, - "logits/chosen": -0.22538213431835175, - "logits/rejected": -0.16458283364772797, - "logps/chosen": -471.0306091308594, - "logps/rejected": -448.32958984375, - "loss": 0.1764, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.8633302450180054, - "rewards/margins": 0.5896649956703186, - "rewards/rejected": -1.4529950618743896, + "epoch": 1.11, + "learning_rate": 4.908466865911614e-07, + "logits/chosen": -0.030503610149025917, + "logits/rejected": -0.011303985491394997, + "logps/chosen": -524.8907470703125, + "logps/rejected": -669.0797119140625, + "loss": 0.3284, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8121287822723389, + "rewards/margins": 1.7762531042099, + "rewards/rejected": -3.5883820056915283, "step": 530 }, { - "epoch": 0.65, - "learning_rate": 8.548477150940976e-07, - "logits/chosen": -0.2914785146713257, - "logits/rejected": -0.2505750358104706, - "logps/chosen": -396.79302978515625, - "logps/rejected": -392.1474609375, - "loss": 0.1663, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.8635896444320679, - "rewards/margins": 0.45653051137924194, - "rewards/rejected": -1.320120096206665, + "epoch": 1.13, + "learning_rate": 4.7255233006783624e-07, + "logits/chosen": 0.03242509439587593, + "logits/rejected": -0.11479414999485016, + "logps/chosen": -485.2374572753906, + "logps/rejected": -665.2349243164062, + "loss": 0.3414, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.8120667934417725, + "rewards/margins": 1.935499906539917, + "rewards/rejected": -3.7475669384002686, "step": 540 }, { - "epoch": 0.66, - "learning_rate": 8.473878089908488e-07, - "logits/chosen": -0.23568303883075714, - "logits/rejected": -0.13943514227867126, - "logps/chosen": -386.1482849121094, - "logps/rejected": -365.82904052734375, - "loss": 0.1494, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.7458887696266174, - "rewards/margins": 0.29060885310173035, - "rewards/rejected": -1.0364975929260254, + "epoch": 1.15, + "learning_rate": 4.5429476797886617e-07, + "logits/chosen": -0.0022813579998910427, + "logits/rejected": -0.008704641833901405, + "logps/chosen": -475.7318420410156, + "logps/rejected": -642.6895751953125, + "loss": 0.3286, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6344352960586548, + "rewards/margins": 1.7103891372680664, + "rewards/rejected": -3.3448245525360107, "step": 550 }, { - "epoch": 0.67, - "learning_rate": 8.397753237496989e-07, - "logits/chosen": -0.16948673129081726, - "logits/rejected": -0.13608554005622864, - "logps/chosen": -403.01434326171875, - "logps/rejected": -411.62725830078125, - "loss": 0.1565, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9865185618400574, - "rewards/margins": 0.4449576735496521, - "rewards/rejected": -1.431476354598999, + "epoch": 1.17, + "learning_rate": 4.3609847514019763e-07, + "logits/chosen": -0.17216846346855164, + "logits/rejected": -0.13854144513607025, + "logps/chosen": -476.8624572753906, + "logps/rejected": -648.0611572265625, + "loss": 0.3404, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7177269458770752, + "rewards/margins": 1.7771364450454712, + "rewards/rejected": -3.4948630332946777, "step": 560 }, { - "epoch": 0.68, - "learning_rate": 8.320136029146792e-07, - "logits/chosen": -0.2247391641139984, - "logits/rejected": -0.1330215483903885, - "logps/chosen": -418.1533203125, - "logps/rejected": -402.0802307128906, - "loss": 0.1422, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.1958593130111694, - "rewards/margins": 0.4591103494167328, - "rewards/rejected": -1.654969573020935, + "epoch": 1.19, + "learning_rate": 4.179878442344892e-07, + "logits/chosen": -0.031789492815732956, + "logits/rejected": -0.02642265520989895, + "logps/chosen": -455.97052001953125, + "logps/rejected": -630.4293212890625, + "loss": 0.3518, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6433439254760742, + "rewards/margins": 1.930822730064392, + "rewards/rejected": -3.574166774749756, "step": 570 }, { - "epoch": 0.7, - "learning_rate": 8.241060555768485e-07, - "logits/chosen": -0.2574161887168884, - "logits/rejected": -0.2403256893157959, - "logps/chosen": -499.733642578125, - "logps/rejected": -511.239501953125, - "loss": 0.1269, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.4520691633224487, - "rewards/margins": 0.6710880994796753, - "rewards/rejected": -2.123157262802124, + "epoch": 1.21, + "learning_rate": 3.9998715311197783e-07, + "logits/chosen": -0.0298085268586874, + "logits/rejected": -0.03243950009346008, + "logps/chosen": -454.34228515625, + "logps/rejected": -626.9178466796875, + "loss": 0.331, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.680559754371643, + "rewards/margins": 1.679256796836853, + "rewards/rejected": -3.359816789627075, "step": 580 }, { - "epoch": 0.71, - "learning_rate": 8.160561548769579e-07, - "logits/chosen": -0.2796838879585266, - "logits/rejected": -0.18458350002765656, - "logps/chosen": -447.6980895996094, - "logps/rejected": -411.94921875, - "loss": 0.1419, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.0911153554916382, - "rewards/margins": 0.625370979309082, - "rewards/rejected": -1.7164863348007202, + "epoch": 1.23, + "learning_rate": 3.821205322452863e-07, + "logits/chosen": 0.0827791839838028, + "logits/rejected": 0.12741518020629883, + "logps/chosen": -477.8468322753906, + "logps/rejected": -667.1104736328125, + "loss": 0.3152, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.8902521133422852, + "rewards/margins": 2.016261339187622, + "rewards/rejected": -3.906513214111328, "step": 590 }, { - "epoch": 0.72, - "learning_rate": 8.078674364799822e-07, - "logits/chosen": -0.2084120213985443, - "logits/rejected": -0.13774996995925903, - "logps/chosen": -430.189697265625, - "logps/rejected": -379.07867431640625, - "loss": 0.1509, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.9517067074775696, - "rewards/margins": 0.3666831851005554, - "rewards/rejected": -1.318389654159546, + "epoch": 1.26, + "learning_rate": 3.6441193238179146e-07, + "logits/chosen": 0.15034182369709015, + "logits/rejected": 0.08470882475376129, + "logps/chosen": -504.47979736328125, + "logps/rejected": -689.1786499023438, + "loss": 0.3459, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7678565979003906, + "rewards/margins": 2.0546722412109375, + "rewards/rejected": -3.822528839111328, "step": 600 }, { - "epoch": 0.73, - "learning_rate": 7.995434970221915e-07, - "logits/chosen": -0.20178177952766418, - "logits/rejected": -0.08668573200702667, - "logps/chosen": -446.00244140625, - "logps/rejected": -375.57171630859375, - "loss": 0.1637, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.9166814088821411, - "rewards/margins": 0.42661458253860474, - "rewards/rejected": -1.3432958126068115, + "epoch": 1.26, + "eval_logits/chosen": 0.21488572657108307, + "eval_logits/rejected": 0.16770756244659424, + "eval_logps/chosen": -487.6650390625, + "eval_logps/rejected": -626.5206298828125, + "eval_loss": 0.5179377198219299, + "eval_rewards/accuracies": 0.796875, + "eval_rewards/chosen": -1.8408037424087524, + "eval_rewards/margins": 1.3081512451171875, + "eval_rewards/rejected": -3.1489551067352295, + "eval_runtime": 66.2317, + "eval_samples_per_second": 30.197, + "eval_steps_per_second": 0.483, + "step": 600 + }, + { + "epoch": 1.28, + "learning_rate": 3.4688509243692034e-07, + "logits/chosen": 0.02503601647913456, + "logits/rejected": 0.023386284708976746, + "logps/chosen": -503.43194580078125, + "logps/rejected": -680.9974975585938, + "loss": 0.3141, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6242990493774414, + "rewards/margins": 1.9624935388565063, + "rewards/rejected": -3.586792469024658, "step": 610 }, { - "epoch": 0.74, - "learning_rate": 7.910879925314412e-07, - "logits/chosen": -0.24198313057422638, - "logits/rejected": -0.14964896440505981, - "logps/chosen": -444.73773193359375, - "logps/rejected": -413.18994140625, - "loss": 0.16, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.036938190460205, - "rewards/margins": 0.41328781843185425, - "rewards/rejected": -1.450226068496704, + "epoch": 1.3, + "learning_rate": 3.295635076714144e-07, + "logits/chosen": -0.15497131645679474, + "logits/rejected": -0.10766881704330444, + "logps/chosen": -539.1582641601562, + "logps/rejected": -690.4927978515625, + "loss": 0.3141, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.8774610757827759, + "rewards/margins": 1.9384454488754272, + "rewards/rejected": -3.8159070014953613, "step": 620 }, { - "epoch": 0.76, - "learning_rate": 7.825046368213781e-07, - "logits/chosen": -0.13304699957370758, - "logits/rejected": -0.12154946476221085, - "logps/chosen": -403.96533203125, - "logps/rejected": -399.8603820800781, - "loss": 0.1637, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.8464315533638, - "rewards/margins": 0.501135528087616, - "rewards/rejected": -1.3475672006607056, + "epoch": 1.32, + "learning_rate": 3.12470398195219e-07, + "logits/chosen": -0.07300277799367905, + "logits/rejected": -0.11429448425769806, + "logps/chosen": -579.0623779296875, + "logps/rejected": -720.4669189453125, + "loss": 0.3392, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.068450689315796, + "rewards/margins": 2.211409330368042, + "rewards/rejected": -4.2798590660095215, "step": 630 }, { - "epoch": 0.77, - "learning_rate": 7.737971998602646e-07, - "logits/chosen": -0.12783931195735931, - "logits/rejected": -0.1469259113073349, - "logps/chosen": -413.7705993652344, - "logps/rejected": -416.5107421875, - "loss": 0.1468, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2587213516235352, - "rewards/margins": 0.4588088095188141, - "rewards/rejected": -1.7175302505493164, + "epoch": 1.34, + "learning_rate": 2.956286778402226e-07, + "logits/chosen": -0.019360024482011795, + "logits/rejected": -0.11440087854862213, + "logps/chosen": -461.2447204589844, + "logps/rejected": -671.9784545898438, + "loss": 0.3298, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7676658630371094, + "rewards/margins": 1.9985605478286743, + "rewards/rejected": -3.7662265300750732, "step": 640 }, { - "epoch": 0.78, - "learning_rate": 7.649695061151383e-07, - "logits/chosen": -0.22540828585624695, - "logits/rejected": -0.1876813918352127, - "logps/chosen": -507.74346923828125, - "logps/rejected": -520.5947265625, - "loss": 0.1349, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.4848111867904663, - "rewards/margins": 0.5542536973953247, - "rewards/rejected": -2.039064884185791, + "epoch": 1.36, + "learning_rate": 2.7906092344356826e-07, + "logits/chosen": 0.05438992381095886, + "logits/rejected": 0.030209839344024658, + "logps/chosen": -490.1392517089844, + "logps/rejected": -637.2672119140625, + "loss": 0.3327, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.465033769607544, + "rewards/margins": 1.7672345638275146, + "rewards/rejected": -3.2322685718536377, "step": 650 }, { - "epoch": 0.79, - "learning_rate": 7.560254328720362e-07, - "logits/chosen": -0.2085895836353302, - "logits/rejected": -0.042162101715803146, - "logps/chosen": -440.8785095214844, - "logps/rejected": -387.02215576171875, - "loss": 0.1585, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.233938455581665, - "rewards/margins": 0.3742896020412445, - "rewards/rejected": -1.6082279682159424, + "epoch": 1.38, + "learning_rate": 2.6278934458271996e-07, + "logits/chosen": -0.0673966184258461, + "logits/rejected": -0.08923710882663727, + "logps/chosen": -499.862548828125, + "logps/rejected": -689.426513671875, + "loss": 0.3354, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.7509543895721436, + "rewards/margins": 1.90044367313385, + "rewards/rejected": -3.6513984203338623, "step": 660 }, { - "epoch": 0.8, - "learning_rate": 7.469689085330195e-07, - "logits/chosen": -0.1412975937128067, - "logits/rejected": -0.17901337146759033, - "logps/chosen": -446.0218200683594, - "logps/rejected": -433.8721618652344, - "loss": 0.1612, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.8437545895576477, - "rewards/margins": 0.5731817483901978, - "rewards/rejected": -1.4169362783432007, + "epoch": 1.4, + "learning_rate": 2.468357538028487e-07, + "logits/chosen": -0.06420551240444183, + "logits/rejected": -0.05014914274215698, + "logps/chosen": -509.65020751953125, + "logps/rejected": -648.8363037109375, + "loss": 0.3347, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.751989722251892, + "rewards/margins": 1.888462781906128, + "rewards/rejected": -3.6404526233673096, "step": 670 }, { - "epoch": 0.82, - "learning_rate": 7.37803910890746e-07, - "logits/chosen": -0.17172157764434814, - "logits/rejected": -0.15589216351509094, - "logps/chosen": -413.99176025390625, - "logps/rejected": -443.12994384765625, - "loss": 0.145, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.0244253873825073, - "rewards/margins": 0.5451046824455261, - "rewards/rejected": -1.5695301294326782, + "epoch": 1.42, + "learning_rate": 2.312215373764551e-07, + "logits/chosen": -0.07077746093273163, + "logits/rejected": -0.07987900823354721, + "logps/chosen": -501.39105224609375, + "logps/rejected": -663.5813598632812, + "loss": 0.3273, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7682859897613525, + "rewards/margins": 1.806647539138794, + "rewards/rejected": -3.5749335289001465, "step": 680 }, { - "epoch": 0.83, - "learning_rate": 7.285344653813504e-07, - "logits/chosen": -0.16723224520683289, - "logits/rejected": -0.11218209564685822, - "logps/chosen": -459.5580139160156, - "logps/rejected": -502.71514892578125, - "loss": 0.1259, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.443068027496338, - "rewards/margins": 0.6126604676246643, - "rewards/rejected": -2.0557284355163574, + "epoch": 1.44, + "learning_rate": 2.1596762663442213e-07, + "logits/chosen": -0.10694106668233871, + "logits/rejected": -0.13092787563800812, + "logps/chosen": -485.4988708496094, + "logps/rejected": -654.7581176757812, + "loss": 0.3102, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.802573800086975, + "rewards/margins": 1.9674146175384521, + "rewards/rejected": -3.7699882984161377, "step": 690 }, { - "epoch": 0.84, - "learning_rate": 7.19164643316399e-07, - "logits/chosen": -0.1634540855884552, - "logits/rejected": -0.008752308785915375, - "logps/chosen": -477.576416015625, - "logps/rejected": -446.4156188964844, - "loss": 0.1141, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.5389097929000854, - "rewards/margins": 0.4564815163612366, - "rewards/rejected": -1.9953914880752563, + "epoch": 1.47, + "learning_rate": 2.0109446990692963e-07, + "logits/chosen": -0.13472847640514374, + "logits/rejected": -0.11757614463567734, + "logps/chosen": -529.8543701171875, + "logps/rejected": -666.8710327148438, + "loss": 0.3321, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.0080933570861816, + "rewards/margins": 1.8389003276824951, + "rewards/rejected": -3.8469932079315186, "step": 700 }, { - "epoch": 0.85, - "learning_rate": 7.096985600946937e-07, - "logits/chosen": -0.12237264961004257, - "logits/rejected": -0.10291185230016708, - "logps/chosen": -393.76837158203125, - "logps/rejected": -429.14447021484375, - "loss": 0.1637, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9208710789680481, - "rewards/margins": 0.6199368238449097, - "rewards/rejected": -1.5408079624176025, + "epoch": 1.47, + "eval_logits/chosen": 0.003292468376457691, + "eval_logits/rejected": -0.06900674104690552, + "eval_logps/chosen": -510.547607421875, + "eval_logps/rejected": -664.8507080078125, + "eval_loss": 0.5330736041069031, + "eval_rewards/accuracies": 0.7890625, + "eval_rewards/chosen": -2.069629192352295, + "eval_rewards/margins": 1.4626271724700928, + "eval_rewards/rejected": -3.532256603240967, + "eval_runtime": 66.7195, + "eval_samples_per_second": 29.976, + "eval_steps_per_second": 0.48, + "step": 700 + }, + { + "epoch": 1.49, + "learning_rate": 1.8662200511184872e-07, + "logits/chosen": -0.10703661292791367, + "logits/rejected": -0.13788972795009613, + "logps/chosen": -488.99029541015625, + "logps/rejected": -686.2767333984375, + "loss": 0.3475, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7703863382339478, + "rewards/margins": 2.0744519233703613, + "rewards/rejected": -3.8448383808135986, "step": 710 }, { - "epoch": 0.86, - "learning_rate": 7.001403733947133e-07, - "logits/chosen": -0.1201944500207901, - "logits/rejected": -0.05582220479846001, - "logps/chosen": -382.8240661621094, - "logps/rejected": -375.1324462890625, - "loss": 0.1732, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.789650559425354, - "rewards/margins": 0.3769993185997009, - "rewards/rejected": -1.1666499376296997, + "epoch": 1.51, + "learning_rate": 1.725696330273575e-07, + "logits/chosen": -0.08608827739953995, + "logits/rejected": -0.15545812249183655, + "logps/chosen": -515.587890625, + "logps/rejected": -661.5228881835938, + "loss": 0.3325, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.766017198562622, + "rewards/margins": 1.590886116027832, + "rewards/rejected": -3.356903553009033, "step": 720 }, { - "epoch": 0.88, - "learning_rate": 6.904942813484846e-07, - "logits/chosen": -0.08779577165842056, - "logits/rejected": -0.05727202445268631, - "logps/chosen": -375.1112976074219, - "logps/rejected": -363.4750061035156, - "loss": 0.1511, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.8722445368766785, - "rewards/margins": 0.4427576959133148, - "rewards/rejected": -1.315002202987671, + "epoch": 1.53, + "learning_rate": 1.589561912846089e-07, + "logits/chosen": -0.06848429143428802, + "logits/rejected": -0.056651823222637177, + "logps/chosen": -459.4026794433594, + "logps/rejected": -636.302001953125, + "loss": 0.3154, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.583231806755066, + "rewards/margins": 1.8955827951431274, + "rewards/rejected": -3.4788146018981934, "step": 730 }, { - "epoch": 0.89, - "learning_rate": 6.807645206976847e-07, - "logits/chosen": -0.15901021659374237, - "logits/rejected": -0.046349309384822845, - "logps/chosen": -454.17816162109375, - "logps/rejected": -410.53814697265625, - "loss": 0.139, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2105097770690918, - "rewards/margins": 0.5188924670219421, - "rewards/rejected": -1.7294021844863892, + "epoch": 1.55, + "learning_rate": 1.4579992911531496e-07, + "logits/chosen": -0.05496741458773613, + "logits/rejected": -0.1415387988090515, + "logps/chosen": -439.0248107910156, + "logps/rejected": -623.8389892578125, + "loss": 0.3286, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6619327068328857, + "rewards/margins": 1.9101438522338867, + "rewards/rejected": -3.5720763206481934, "step": 740 }, { - "epoch": 0.9, - "learning_rate": 6.709553649327864e-07, - "logits/chosen": -0.10003723204135895, - "logits/rejected": -0.11153779178857803, - "logps/chosen": -424.2566833496094, - "logps/rejected": -428.76263427734375, - "loss": 0.1449, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.9821775555610657, - "rewards/margins": 0.5280328989028931, - "rewards/rejected": -1.510210633277893, + "epoch": 1.57, + "learning_rate": 1.3311848288809813e-07, + "logits/chosen": -0.17769137024879456, + "logits/rejected": -0.21130616962909698, + "logps/chosen": -489.0089416503906, + "logps/rejected": -651.5428466796875, + "loss": 0.3275, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6139405965805054, + "rewards/margins": 1.9740289449691772, + "rewards/rejected": -3.5879695415496826, "step": 750 }, { - "epoch": 0.91, - "learning_rate": 6.610711224160624e-07, - "logits/chosen": -0.09928876161575317, - "logits/rejected": 0.037872180342674255, - "logps/chosen": -461.2574157714844, - "logps/rejected": -414.75433349609375, - "loss": 0.1549, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.9287399053573608, - "rewards/margins": 0.4935649037361145, - "rewards/rejected": -1.4223048686981201, + "epoch": 1.59, + "learning_rate": 1.209288524664029e-07, + "logits/chosen": -0.13984176516532898, + "logits/rejected": -0.21566995978355408, + "logps/chosen": -504.34814453125, + "logps/rejected": -702.9925537109375, + "loss": 0.3136, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8642832040786743, + "rewards/margins": 2.1428329944610596, + "rewards/rejected": -4.007115840911865, "step": 760 }, { - "epoch": 0.92, - "learning_rate": 6.51116134489272e-07, - "logits/chosen": -0.04129798337817192, - "logits/rejected": 0.029511254280805588, - "logps/chosen": -425.9366149902344, - "logps/rejected": -408.80267333984375, - "loss": 0.1517, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.9357036352157593, - "rewards/margins": 0.4151946008205414, - "rewards/rejected": -1.3508983850479126, + "epoch": 1.61, + "learning_rate": 1.0924737841966497e-07, + "logits/chosen": -0.08519931882619858, + "logits/rejected": -0.04222021624445915, + "logps/chosen": -487.46307373046875, + "logps/rejected": -656.1431884765625, + "loss": 0.3088, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.8077300786972046, + "rewards/margins": 1.9863264560699463, + "rewards/rejected": -3.7940566539764404, "step": 770 }, { - "epoch": 0.94, - "learning_rate": 6.410947735668653e-07, - "logits/chosen": -0.05360158160328865, - "logits/rejected": -0.07298514991998672, - "logps/chosen": -392.5763854980469, - "logps/rejected": -427.8981018066406, - "loss": 0.1527, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.8554953336715698, - "rewards/margins": 0.7457026243209839, - "rewards/rejected": -1.6011979579925537, + "epoch": 1.63, + "learning_rate": 9.808972011828054e-08, + "logits/chosen": -0.15101949870586395, + "logits/rejected": -0.13765974342823029, + "logps/chosen": -501.7498474121094, + "logps/rejected": -694.2449951171875, + "loss": 0.3388, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8827495574951172, + "rewards/margins": 2.0869569778442383, + "rewards/rejected": -3.9697060585021973, "step": 780 }, { - "epoch": 0.95, - "learning_rate": 6.310114412155368e-07, - "logits/chosen": -0.10374744981527328, - "logits/rejected": 0.0370178297162056, - "logps/chosen": -447.22381591796875, - "logps/rejected": -401.534423828125, - "loss": 0.1595, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.0632203817367554, - "rewards/margins": 0.2709798216819763, - "rewards/rejected": -1.334200143814087, + "epoch": 1.65, + "learning_rate": 8.747083474174527e-08, + "logits/chosen": -0.15161341428756714, + "logits/rejected": -0.15945735573768616, + "logps/chosen": -497.11602783203125, + "logps/rejected": -660.0712280273438, + "loss": 0.3148, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.8078014850616455, + "rewards/margins": 1.9871124029159546, + "rewards/rejected": -3.7949135303497314, "step": 790 }, { - "epoch": 0.96, - "learning_rate": 6.208705662209762e-07, - "logits/chosen": -0.08297502994537354, - "logits/rejected": -0.03830767422914505, - "logps/chosen": -397.39202880859375, - "logps/rejected": -369.2044372558594, - "loss": 0.1663, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.8531109094619751, - "rewards/margins": 0.4128567576408386, - "rewards/rejected": -1.265967607498169, + "epoch": 1.67, + "learning_rate": 7.740495722810269e-08, + "logits/chosen": -0.07586201280355453, + "logits/rejected": -0.13467435538768768, + "logps/chosen": -477.7305603027344, + "logps/rejected": -668.1069946289062, + "loss": 0.2983, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.8016554117202759, + "rewards/margins": 2.105151414871216, + "rewards/rejected": -3.906806468963623, "step": 800 }, - { - "epoch": 0.97, - "learning_rate": 6.106766026426648e-07, - "logits/chosen": -0.07533316314220428, - "logits/rejected": 0.044484030455350876, - "logps/chosen": -390.5148010253906, - "logps/rejected": -376.8379211425781, - "loss": 0.175, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.9820185899734497, - "rewards/margins": 0.4463561475276947, - "rewards/rejected": -1.4283746480941772, - "step": 810 - }, - { - "epoch": 0.98, - "learning_rate": 6.004340278575695e-07, - "logits/chosen": -0.0853220671415329, - "logits/rejected": -0.05402841418981552, - "logps/chosen": -432.50543212890625, - "logps/rejected": -429.68017578125, - "loss": 0.1552, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.9129465222358704, - "rewards/margins": 0.6217477321624756, - "rewards/rejected": -1.5346943140029907, - "step": 820 - }, - { - "epoch": 1.0, - "learning_rate": 5.901473405935966e-07, - "logits/chosen": -0.08009025454521179, - "logits/rejected": 0.029772957786917686, - "logps/chosen": -389.1134948730469, - "logps/rejected": -390.2154846191406, - "loss": 0.1582, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.0638234615325928, - "rewards/margins": 0.27301082015037537, - "rewards/rejected": -1.336834192276001, - "step": 830 - }, - { - "epoch": 1.01, - "learning_rate": 5.798210589536672e-07, - "logits/chosen": 0.038750506937503815, - "logits/rejected": 0.011047865264117718, - "logps/chosen": -398.5277099609375, - "logps/rejected": -468.46044921875, - "loss": 0.1019, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.989227294921875, - "rewards/margins": 0.8411723971366882, - "rewards/rejected": -1.830399751663208, - "step": 840 - }, - { - "epoch": 1.02, - "learning_rate": 5.694597184312832e-07, - "logits/chosen": -0.08361298590898514, - "logits/rejected": 0.025835633277893066, - "logps/chosen": -512.9071655273438, - "logps/rejected": -552.0848388671875, - "loss": 0.0639, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.7927268743515015, - "rewards/margins": 1.0089781284332275, - "rewards/rejected": -2.8017051219940186, - "step": 850 - }, - { - "epoch": 1.03, - "learning_rate": 5.590678699184552e-07, - "logits/chosen": -0.1973053216934204, - "logits/rejected": -0.08661861717700958, - "logps/chosen": -538.1658935546875, - "logps/rejected": -642.89697265625, - "loss": 0.0429, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.399050235748291, - "rewards/margins": 1.239048719406128, - "rewards/rejected": -3.638098955154419, - "step": 860 - }, - { - "epoch": 1.04, - "learning_rate": 5.486500777068659e-07, - "logits/chosen": -0.09753044694662094, - "logits/rejected": 0.013812586665153503, - "logps/chosen": -533.619873046875, - "logps/rejected": -590.58544921875, - "loss": 0.0332, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.461451530456543, - "rewards/margins": 1.150248646736145, - "rewards/rejected": -3.6117000579833984, - "step": 870 - }, - { - "epoch": 1.06, - "learning_rate": 5.382109174831493e-07, - "logits/chosen": -0.16656556725502014, - "logits/rejected": -0.014411838725209236, - "logps/chosen": -612.1375732421875, - "logps/rejected": -666.2962646484375, - "loss": 0.0369, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.350978374481201, - "rewards/margins": 1.2788021564483643, - "rewards/rejected": -3.6297805309295654, - "step": 880 - }, - { - "epoch": 1.07, - "learning_rate": 5.277549743191652e-07, - "logits/chosen": -0.04728949815034866, - "logits/rejected": 0.05177200958132744, - "logps/chosen": -481.9996032714844, - "logps/rejected": -563.8628540039062, - "loss": 0.0408, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.9757206439971924, - "rewards/margins": 1.2286030054092407, - "rewards/rejected": -3.2043235301971436, - "step": 890 - }, - { - "epoch": 1.08, - "learning_rate": 5.172868406581501e-07, - "logits/chosen": -0.10119952261447906, - "logits/rejected": 0.03440069407224655, - "logps/chosen": -575.5086059570312, - "logps/rejected": -601.7667236328125, - "loss": 0.0379, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.9103889465332031, - "rewards/margins": 1.1987693309783936, - "rewards/rejected": -3.109158515930176, - "step": 900 - }, - { - "epoch": 1.09, - "learning_rate": 5.068111142976319e-07, - "logits/chosen": -0.012911921367049217, - "logits/rejected": 0.05222976952791214, - "logps/chosen": -544.2637939453125, - "logps/rejected": -609.9290161132812, - "loss": 0.0308, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.4613072872161865, - "rewards/margins": 1.0719743967056274, - "rewards/rejected": -3.5332818031311035, - "step": 910 - }, - { - "epoch": 1.1, - "learning_rate": 4.963323963699926e-07, - "logits/chosen": 0.07444079965353012, - "logits/rejected": 0.07786104083061218, - "logps/chosen": -598.2421875, - "logps/rejected": -706.32470703125, - "loss": 0.0256, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.6978304386138916, - "rewards/margins": 1.5244108438491821, - "rewards/rejected": -4.222241401672363, - "step": 920 - }, - { - "epoch": 1.12, - "learning_rate": 4.858552893215655e-07, - "logits/chosen": 0.01419870276004076, - "logits/rejected": 0.17051884531974792, - "logps/chosen": -530.5960083007812, - "logps/rejected": -619.4927978515625, - "loss": 0.0215, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.6646533012390137, - "rewards/margins": 1.1269919872283936, - "rewards/rejected": -3.791645050048828, - "step": 930 - }, - { - "epoch": 1.13, - "learning_rate": 4.753843948911556e-07, - "logits/chosen": 0.08497779071331024, - "logits/rejected": 0.1790248453617096, - "logps/chosen": -576.8201293945312, - "logps/rejected": -674.81982421875, - "loss": 0.0222, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.776611804962158, - "rewards/margins": 1.3886988162994385, - "rewards/rejected": -4.165310382843018, - "step": 940 - }, - { - "epoch": 1.14, - "learning_rate": 4.649243120888722e-07, - "logits/chosen": 0.0691087394952774, - "logits/rejected": 0.14361132681369781, - "logps/chosen": -612.6062622070312, - "logps/rejected": -648.2825317382812, - "loss": 0.025, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.668504238128662, - "rewards/margins": 1.0527547597885132, - "rewards/rejected": -3.721259355545044, - "step": 950 - }, - { - "epoch": 1.15, - "learning_rate": 4.544796351761574e-07, - "logits/chosen": 0.07248688489198685, - "logits/rejected": 0.18278047442436218, - "logps/chosen": -539.9449462890625, - "logps/rejected": -617.9127197265625, - "loss": 0.0232, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.3308327198028564, - "rewards/margins": 1.2744853496551514, - "rewards/rejected": -3.605318069458008, - "step": 960 - }, - { - "epoch": 1.16, - "learning_rate": 4.440549516479022e-07, - "logits/chosen": 0.14086951315402985, - "logits/rejected": 0.1397445946931839, - "logps/chosen": -541.9815673828125, - "logps/rejected": -650.6157836914062, - "loss": 0.0259, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.4639604091644287, - "rewards/margins": 1.412184476852417, - "rewards/rejected": -3.876145124435425, - "step": 970 - }, - { - "epoch": 1.18, - "learning_rate": 4.336548402175345e-07, - "logits/chosen": 0.08946482837200165, - "logits/rejected": 0.23766390979290009, - "logps/chosen": -570.1082153320312, - "logps/rejected": -633.4486083984375, - "loss": 0.022, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.765277862548828, - "rewards/margins": 1.223432183265686, - "rewards/rejected": -3.9887099266052246, - "step": 980 - }, - { - "epoch": 1.19, - "learning_rate": 4.232838688059627e-07, - "logits/chosen": 0.20700442790985107, - "logits/rejected": 0.21234914660453796, - "logps/chosen": -642.5216064453125, - "logps/rejected": -769.6670532226562, - "loss": 0.0217, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -3.0695090293884277, - "rewards/margins": 1.9037091732025146, - "rewards/rejected": -4.9732184410095215, - "step": 990 - }, - { - "epoch": 1.2, - "learning_rate": 4.129465925352618e-07, - "logits/chosen": 0.16028758883476257, - "logits/rejected": 0.3449386954307556, - "logps/chosen": -624.1184692382812, - "logps/rejected": -672.5079956054688, - "loss": 0.0174, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -3.061570405960083, - "rewards/margins": 1.1443489789962769, - "rewards/rejected": -4.205918788909912, - "step": 1000 - }, - { - "epoch": 1.21, - "learning_rate": 4.0264755172797837e-07, - "logits/chosen": 0.052181851118803024, - "logits/rejected": 0.18743617832660675, - "logps/chosen": -607.5186767578125, - "logps/rejected": -672.8460693359375, - "loss": 0.0207, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.874335765838623, - "rewards/margins": 1.2407417297363281, - "rewards/rejected": -4.115077018737793, - "step": 1010 - }, - { - "epoch": 1.22, - "learning_rate": 3.9239126991293775e-07, - "logits/chosen": 0.24214263260364532, - "logits/rejected": 0.40768828988075256, - "logps/chosen": -620.0783081054688, - "logps/rejected": -685.48388671875, - "loss": 0.0195, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.8905162811279297, - "rewards/margins": 1.159411907196045, - "rewards/rejected": -4.049928188323975, - "step": 1020 - }, - { - "epoch": 1.24, - "learning_rate": 3.82182251838427e-07, - "logits/chosen": 0.1082601398229599, - "logits/rejected": 0.29715824127197266, - "logps/chosen": -589.5526123046875, - "logps/rejected": -628.5372924804688, - "loss": 0.0202, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.6682987213134766, - "rewards/margins": 1.2489776611328125, - "rewards/rejected": -3.9172768592834473, - "step": 1030 - }, - { - "epoch": 1.25, - "learning_rate": 3.720249814936255e-07, - "logits/chosen": 0.2514716684818268, - "logits/rejected": 0.27680593729019165, - "logps/chosen": -592.1260986328125, - "logps/rejected": -714.8380737304688, - "loss": 0.0218, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.839751720428467, - "rewards/margins": 1.4019855260849, - "rewards/rejected": -4.241737365722656, - "step": 1040 - }, - { - "epoch": 1.26, - "learning_rate": 3.6192392013915473e-07, - "logits/chosen": 0.16709819436073303, - "logits/rejected": 0.29886525869369507, - "logps/chosen": -598.5506591796875, - "logps/rejected": -646.1890258789062, - "loss": 0.0207, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.756328582763672, - "rewards/margins": 1.287087321281433, - "rewards/rejected": -4.0434160232543945, - "step": 1050 - }, - { - "epoch": 1.27, - "learning_rate": 3.5188350434761025e-07, - "logits/chosen": 0.20310468971729279, - "logits/rejected": 0.32862892746925354, - "logps/chosen": -669.3348388671875, - "logps/rejected": -772.3299560546875, - "loss": 0.0196, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -3.227342128753662, - "rewards/margins": 1.726514220237732, - "rewards/rejected": -4.953856468200684, - "step": 1060 - }, - { - "epoch": 1.28, - "learning_rate": 3.419081440549368e-07, - "logits/chosen": 0.12433931976556778, - "logits/rejected": 0.20679005980491638, - "logps/chosen": -597.89501953125, - "logps/rejected": -657.15869140625, - "loss": 0.0186, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.850302219390869, - "rewards/margins": 1.2778496742248535, - "rewards/rejected": -4.128152370452881, - "step": 1070 - }, - { - "epoch": 1.3, - "learning_rate": 3.3200222062350324e-07, - "logits/chosen": 0.20479054749011993, - "logits/rejected": 0.4336840510368347, - "logps/chosen": -650.9668579101562, - "logps/rejected": -683.4470825195312, - "loss": 0.0227, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.9142518043518066, - "rewards/margins": 1.1479203701019287, - "rewards/rejected": -4.062172889709473, - "step": 1080 - }, - { - "epoch": 1.31, - "learning_rate": 3.2217008491772724e-07, - "logits/chosen": 0.17460568249225616, - "logits/rejected": 0.32013821601867676, - "logps/chosen": -649.8709716796875, - "logps/rejected": -717.8926391601562, - "loss": 0.0204, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -3.0518133640289307, - "rewards/margins": 1.3449747562408447, - "rewards/rejected": -4.396788120269775, - "step": 1090 - }, - { - "epoch": 1.32, - "learning_rate": 3.124160553930953e-07, - "logits/chosen": 0.23403926193714142, - "logits/rejected": 0.4171646535396576, - "logps/chosen": -583.6199951171875, - "logps/rejected": -628.165771484375, - "loss": 0.0221, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.5831427574157715, - "rewards/margins": 1.416587471961975, - "rewards/rejected": -3.999729871749878, - "step": 1100 - }, - { - "epoch": 1.33, - "learning_rate": 3.027444161994178e-07, - "logits/chosen": 0.26827913522720337, - "logits/rejected": 0.3499876856803894, - "logps/chosen": -567.4503784179688, - "logps/rejected": -667.9777221679688, - "loss": 0.0182, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.5715322494506836, - "rewards/margins": 1.4084771871566772, - "rewards/rejected": -3.980009078979492, - "step": 1110 - }, - { - "epoch": 1.34, - "learning_rate": 2.9315941529915055e-07, - "logits/chosen": 0.2547352910041809, - "logits/rejected": 0.47194820642471313, - "logps/chosen": -698.35888671875, - "logps/rejected": -709.6544189453125, - "loss": 0.0169, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -3.165942668914795, - "rewards/margins": 1.3185456991195679, - "rewards/rejected": -4.484488487243652, - "step": 1120 - }, - { - "epoch": 1.36, - "learning_rate": 2.8366526260161205e-07, - "logits/chosen": 0.3811063766479492, - "logits/rejected": 0.5796335935592651, - "logps/chosen": -667.5139770507812, - "logps/rejected": -759.1563720703125, - "loss": 0.0147, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -3.5972373485565186, - "rewards/margins": 1.5377312898635864, - "rewards/rejected": -5.134967803955078, - "step": 1130 - }, - { - "epoch": 1.37, - "learning_rate": 2.742661281139129e-07, - "logits/chosen": 0.4097815155982971, - "logits/rejected": 0.5813673734664917, - "logps/chosen": -648.1516723632812, - "logps/rejected": -774.3548583984375, - "loss": 0.0125, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -3.3875465393066406, - "rewards/margins": 1.6946359872817993, - "rewards/rejected": -5.082181930541992, - "step": 1140 - }, - { - "epoch": 1.38, - "learning_rate": 2.6496614010941214e-07, - "logits/chosen": 0.40659332275390625, - "logits/rejected": 0.47693824768066406, - "logps/chosen": -579.2315673828125, - "logps/rejected": -690.2713623046875, - "loss": 0.0162, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.0188193321228027, - "rewards/margins": 1.4409698247909546, - "rewards/rejected": -4.459788799285889, - "step": 1150 - }, - { - "epoch": 1.39, - "learning_rate": 2.557693833145038e-07, - "logits/chosen": 0.3575282692909241, - "logits/rejected": 0.393863707780838, - "logps/chosen": -661.1494140625, - "logps/rejected": -756.2431640625, - "loss": 0.0165, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -3.1486072540283203, - "rewards/margins": 1.5837332010269165, - "rewards/rejected": -4.7323408126831055, - "step": 1160 - }, - { - "epoch": 1.4, - "learning_rate": 2.4667989711452873e-07, - "logits/chosen": 0.28119125962257385, - "logits/rejected": 0.5084460973739624, - "logps/chosen": -585.8961181640625, - "logps/rejected": -656.994873046875, - "loss": 0.0165, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.9292385578155518, - "rewards/margins": 1.1598174571990967, - "rewards/rejected": -4.089056015014648, - "step": 1170 - }, - { - "epoch": 1.42, - "learning_rate": 2.3770167377960237e-07, - "logits/chosen": 0.2689970135688782, - "logits/rejected": 0.5148590803146362, - "logps/chosen": -717.4390869140625, - "logps/rejected": -785.14404296875, - "loss": 0.015, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -3.3781027793884277, - "rewards/margins": 1.664996862411499, - "rewards/rejected": -5.043099880218506, - "step": 1180 - }, - { - "epoch": 1.43, - "learning_rate": 2.2883865671113633e-07, - "logits/chosen": 0.31557050347328186, - "logits/rejected": 0.411182701587677, - "logps/chosen": -648.5716552734375, - "logps/rejected": -728.0420532226562, - "loss": 0.0149, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -3.0543835163116455, - "rewards/margins": 1.5502524375915527, - "rewards/rejected": -4.604635715484619, - "step": 1190 - }, - { - "epoch": 1.44, - "learning_rate": 2.200947387098232e-07, - "logits/chosen": 0.26245275139808655, - "logits/rejected": 0.37014099955558777, - "logps/chosen": -645.890380859375, - "logps/rejected": -696.8558959960938, - "loss": 0.0165, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -3.092529773712158, - "rewards/margins": 1.4243347644805908, - "rewards/rejected": -4.516864776611328, - "step": 1200 - }, - { - "epoch": 1.45, - "learning_rate": 2.1147376026584757e-07, - "logits/chosen": 0.3540686070919037, - "logits/rejected": 0.44220852851867676, - "logps/chosen": -694.4542236328125, - "logps/rejected": -797.4610595703125, - "loss": 0.016, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -3.1828300952911377, - "rewards/margins": 1.86924147605896, - "rewards/rejected": -5.052072048187256, - "step": 1210 - }, - { - "epoch": 1.46, - "learning_rate": 2.0297950787207047e-07, - "logits/chosen": 0.36199456453323364, - "logits/rejected": 0.5422841906547546, - "logps/chosen": -615.1419677734375, - "logps/rejected": -665.1968383789062, - "loss": 0.0177, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.897984027862549, - "rewards/margins": 1.3999998569488525, - "rewards/rejected": -4.2979841232299805, - "step": 1220 - }, - { - "epoch": 1.48, - "learning_rate": 1.9461571236093288e-07, - "logits/chosen": 0.2255418598651886, - "logits/rejected": 0.3416988253593445, - "logps/chosen": -586.51806640625, - "logps/rejected": -626.4557495117188, - "loss": 0.02, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.6398327350616455, - "rewards/margins": 1.3156733512878418, - "rewards/rejected": -3.9555065631866455, - "step": 1230 - }, - { - "epoch": 1.49, - "learning_rate": 1.8638604726580476e-07, - "logits/chosen": 0.4877316355705261, - "logits/rejected": 0.5342193245887756, - "logps/chosen": -573.1345825195312, - "logps/rejected": -688.9697875976562, - "loss": 0.0201, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.821394205093384, - "rewards/margins": 1.4121348857879639, - "rewards/rejected": -4.233529090881348, - "step": 1240 - }, - { - "epoch": 1.5, - "learning_rate": 1.782941272075017e-07, - "logits/chosen": 0.3591156601905823, - "logits/rejected": 0.5589128732681274, - "logps/chosen": -630.3018798828125, - "logps/rejected": -729.1790771484375, - "loss": 0.017, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.9634406566619873, - "rewards/margins": 1.6115903854370117, - "rewards/rejected": -4.575031280517578, - "step": 1250 - }, - { - "epoch": 1.51, - "learning_rate": 1.7034350630667626e-07, - "logits/chosen": 0.4524804651737213, - "logits/rejected": 0.6036592721939087, - "logps/chosen": -576.6759033203125, - "logps/rejected": -693.260986328125, - "loss": 0.0143, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -3.0177433490753174, - "rewards/margins": 1.490710973739624, - "rewards/rejected": -4.508454322814941, - "step": 1260 - }, - { - "epoch": 1.52, - "learning_rate": 1.6253767662278345e-07, - "logits/chosen": 0.3014351427555084, - "logits/rejected": 0.5008861422538757, - "logps/chosen": -718.8760986328125, - "logps/rejected": -813.004150390625, - "loss": 0.0128, - "rewards/accuracies": 0.71875, - "rewards/chosen": -3.741528034210205, - "rewards/margins": 1.4162757396697998, - "rewards/rejected": -5.157803535461426, - "step": 1270 - }, - { - "epoch": 1.54, - "learning_rate": 1.548800666203028e-07, - "logits/chosen": 0.47732776403427124, - "logits/rejected": 0.6197739839553833, - "logps/chosen": -660.623291015625, - "logps/rejected": -757.375244140625, - "loss": 0.0109, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.632997989654541, - "rewards/margins": 1.6059176921844482, - "rewards/rejected": -5.23891544342041, - "step": 1280 - }, - { - "epoch": 1.55, - "learning_rate": 1.4737403966289385e-07, - "logits/chosen": 0.3244774639606476, - "logits/rejected": 0.5204547643661499, - "logps/chosen": -726.2711181640625, - "logps/rejected": -772.5819091796875, - "loss": 0.0123, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -4.009468078613281, - "rewards/margins": 1.0510145425796509, - "rewards/rejected": -5.060482501983643, - "step": 1290 - }, - { - "epoch": 1.56, - "learning_rate": 1.400228925361449e-07, - "logits/chosen": 0.47019681334495544, - "logits/rejected": 0.6322409510612488, - "logps/chosen": -661.2239379882812, - "logps/rejected": -696.6038818359375, - "loss": 0.0152, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -3.51452898979187, - "rewards/margins": 1.1311668157577515, - "rewards/rejected": -4.64569616317749, - "step": 1300 - }, - { - "epoch": 1.57, - "learning_rate": 1.328298539995637e-07, - "logits/chosen": 0.29750996828079224, - "logits/rejected": 0.3950595259666443, - "logps/chosen": -645.1038208007812, - "logps/rejected": -781.1411743164062, - "loss": 0.0151, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -3.2570998668670654, - "rewards/margins": 1.7285950183868408, - "rewards/rejected": -4.985694408416748, - "step": 1310 - }, - { - "epoch": 1.58, - "learning_rate": 1.257980833684471e-07, - "logits/chosen": 0.47976547479629517, - "logits/rejected": 0.51587975025177, - "logps/chosen": -616.9915771484375, - "logps/rejected": -749.0167236328125, - "loss": 0.0174, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -3.158031940460205, - "rewards/margins": 1.6786686182022095, - "rewards/rejected": -4.836700439453125, - "step": 1320 - }, - { - "epoch": 1.6, - "learning_rate": 1.1893066912625078e-07, - "logits/chosen": 0.40313810110092163, - "logits/rejected": 0.5334981083869934, - "logps/chosen": -605.7171020507812, - "logps/rejected": -690.5645141601562, - "loss": 0.015, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -3.071347951889038, - "rewards/margins": 1.3815619945526123, - "rewards/rejected": -4.45290994644165, - "step": 1330 - }, - { - "epoch": 1.61, - "learning_rate": 1.1223062756807078e-07, - "logits/chosen": 0.3190918564796448, - "logits/rejected": 0.4667654037475586, - "logps/chosen": -641.0958862304688, - "logps/rejected": -681.6056518554688, - "loss": 0.0151, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -3.0553221702575684, - "rewards/margins": 1.2857555150985718, - "rewards/rejected": -4.3410773277282715, - "step": 1340 - }, - { - "epoch": 1.62, - "learning_rate": 1.0570090147583088e-07, - "logits/chosen": 0.4918629229068756, - "logits/rejected": 0.6110485792160034, - "logps/chosen": -679.90185546875, - "logps/rejected": -766.8203735351562, - "loss": 0.0135, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -3.3473587036132812, - "rewards/margins": 1.5597641468048096, - "rewards/rejected": -4.90712308883667, - "step": 1350 - }, - { - "epoch": 1.63, - "learning_rate": 9.934435882575848e-08, - "logits/chosen": 0.32224616408348083, - "logits/rejected": 0.4978526532649994, - "logps/chosen": -746.5880737304688, - "logps/rejected": -807.6434326171875, - "loss": 0.0147, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -3.446526288986206, - "rewards/margins": 1.6646654605865479, - "rewards/rejected": -5.111191749572754, - "step": 1360 - }, - { - "epoch": 1.64, - "learning_rate": 9.316379152871668e-08, - "logits/chosen": 0.3561343550682068, - "logits/rejected": 0.5769251585006714, - "logps/chosen": -660.674072265625, - "logps/rejected": -767.1783447265625, - "loss": 0.0145, - "rewards/accuracies": 0.71875, - "rewards/chosen": -3.296705722808838, - "rewards/margins": 1.5563912391662598, - "rewards/rejected": -4.853096961975098, - "step": 1370 - }, - { - "epoch": 1.66, - "learning_rate": 8.716191420394509e-08, - "logits/chosen": 0.48394888639450073, - "logits/rejected": 0.5622594356536865, - "logps/chosen": -608.4400634765625, - "logps/rejected": -728.1359252929688, - "loss": 0.0144, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.9183599948883057, - "rewards/margins": 1.6131757497787476, - "rewards/rejected": -4.5315351486206055, - "step": 1380 - }, { "epoch": 1.67, - "learning_rate": 8.134136298674931e-08, - "logits/chosen": 0.4225463271141052, - "logits/rejected": 0.5670620799064636, - "logps/chosen": -591.5728149414062, - "logps/rejected": -708.0966186523438, - "loss": 0.0151, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -3.0601813793182373, - "rewards/margins": 1.5123151540756226, - "rewards/rejected": -4.57249641418457, - "step": 1390 - }, - { - "epoch": 1.68, - "learning_rate": 7.570469437066146e-08, - "logits/chosen": 0.5167518854141235, - "logits/rejected": 0.501775324344635, - "logps/chosen": -587.5382080078125, - "logps/rejected": -788.7468872070312, - "loss": 0.0139, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -3.152822971343994, - "rewards/margins": 1.8745685815811157, - "rewards/rejected": -5.02739143371582, - "step": 1400 - }, - { - "epoch": 1.69, - "learning_rate": 7.025438408458106e-08, - "logits/chosen": 0.5128912925720215, - "logits/rejected": 0.6313267350196838, - "logps/chosen": -668.2738037109375, - "logps/rejected": -725.1722412109375, - "loss": 0.0135, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -3.4178764820098877, - "rewards/margins": 1.2856941223144531, - "rewards/rejected": -4.70357084274292, - "step": 1410 + "eval_logits/chosen": 0.005962640047073364, + "eval_logits/rejected": -0.07456669211387634, + "eval_logps/chosen": -507.4386291503906, + "eval_logps/rejected": -661.1429443359375, + "eval_loss": 0.5288560390472412, + "eval_rewards/accuracies": 0.79296875, + "eval_rewards/chosen": -2.0385398864746094, + "eval_rewards/margins": 1.4566388130187988, + "eval_rewards/rejected": -3.495178699493408, + "eval_runtime": 66.4387, + "eval_samples_per_second": 30.103, + "eval_steps_per_second": 0.482, + "step": 800 }, { "epoch": 1.7, - "learning_rate": 6.49928260053893e-08, - "logits/chosen": 0.47072911262512207, - "logits/rejected": 0.5178387761116028, - "logps/chosen": -693.3670043945312, - "logps/rejected": -771.8135375976562, - "loss": 0.0168, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -3.376781940460205, - "rewards/margins": 1.6709791421890259, - "rewards/rejected": -5.0477614402771, - "step": 1420 + "learning_rate": 6.790558119157597e-08, + "logits/chosen": -0.1177646666765213, + "logits/rejected": -0.27872687578201294, + "logps/chosen": -533.6497802734375, + "logps/rejected": -695.5762939453125, + "loss": 0.3495, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8121811151504517, + "rewards/margins": 1.8153536319732666, + "rewards/rejected": -3.627534866333008, + "step": 810 }, { "epoch": 1.72, - "learning_rate": 5.992233110651412e-08, - "logits/chosen": 0.4310898184776306, - "logits/rejected": 0.5147913694381714, - "logps/chosen": -654.1636962890625, - "logps/rejected": -809.3177490234375, - "loss": 0.013, - "rewards/accuracies": 0.71875, - "rewards/chosen": -3.412320375442505, - "rewards/margins": 1.580108880996704, - "rewards/rejected": -4.992428779602051, - "step": 1430 - }, - { - "epoch": 1.73, - "learning_rate": 5.504512644290787e-08, - "logits/chosen": 0.34709280729293823, - "logits/rejected": 0.4276227056980133, - "logps/chosen": -631.5572509765625, - "logps/rejected": -723.4016723632812, - "loss": 0.0147, - "rewards/accuracies": 0.71875, - "rewards/chosen": -3.13346791267395, - "rewards/margins": 1.2867248058319092, - "rewards/rejected": -4.420193195343018, - "step": 1440 + "learning_rate": 5.898544083397e-08, + "logits/chosen": -0.14028558135032654, + "logits/rejected": -0.17688541114330292, + "logps/chosen": -478.90252685546875, + "logps/rejected": -667.4837646484375, + "loss": 0.3159, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7867857217788696, + "rewards/margins": 1.8063859939575195, + "rewards/rejected": -3.593172073364258, + "step": 820 }, { "epoch": 1.74, - "learning_rate": 5.036335417288373e-08, - "logits/chosen": 0.3249759376049042, - "logits/rejected": 0.5136023163795471, - "logps/chosen": -692.810791015625, - "logps/rejected": -764.9146728515625, - "loss": 0.0143, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -3.325841188430786, - "rewards/margins": 1.6227283477783203, - "rewards/rejected": -4.9485697746276855, - "step": 1450 - }, - { - "epoch": 1.75, - "learning_rate": 4.587907061724033e-08, - "logits/chosen": 0.4547889828681946, - "logits/rejected": 0.6101347804069519, - "logps/chosen": -648.64404296875, - "logps/rejected": -718.5267333984375, - "loss": 0.0157, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -3.184832811355591, - "rewards/margins": 1.4085993766784668, - "rewards/rejected": -4.593432426452637, - "step": 1460 + "learning_rate": 5.065649387408705e-08, + "logits/chosen": -0.10511211305856705, + "logits/rejected": -0.15123674273490906, + "logps/chosen": -486.4148864746094, + "logps/rejected": -686.9715576171875, + "loss": 0.299, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7775415182113647, + "rewards/margins": 2.1987690925598145, + "rewards/rejected": -3.9763107299804688, + "step": 830 }, { "epoch": 1.76, - "learning_rate": 4.1594245356087467e-08, - "logits/chosen": 0.3632095158100128, - "logits/rejected": 0.4777224659919739, - "logps/chosen": -618.84765625, - "logps/rejected": -726.1567993164062, - "loss": 0.0152, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -3.185579776763916, - "rewards/margins": 1.4200047254562378, - "rewards/rejected": -4.605584144592285, - "step": 1470 + "learning_rate": 4.292990551804171e-08, + "logits/chosen": -0.17270681262016296, + "logits/rejected": -0.17791520059108734, + "logps/chosen": -508.0003356933594, + "logps/rejected": -639.8585205078125, + "loss": 0.326, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6620067358016968, + "rewards/margins": 1.9285541772842407, + "rewards/rejected": -3.5905609130859375, + "step": 840 }, { "epoch": 1.78, - "learning_rate": 3.751076036377071e-08, - "logits/chosen": 0.3495107591152191, - "logits/rejected": 0.6066192388534546, - "logps/chosen": -640.0892333984375, - "logps/rejected": -659.4434204101562, - "loss": 0.0147, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -3.1215078830718994, - "rewards/margins": 1.1207990646362305, - "rewards/rejected": -4.242306709289551, - "step": 1480 - }, - { - "epoch": 1.79, - "learning_rate": 3.363040918227289e-08, - "logits/chosen": 0.47332292795181274, - "logits/rejected": 0.5948122143745422, - "logps/chosen": -615.6058349609375, - "logps/rejected": -682.501708984375, - "loss": 0.0137, - "rewards/accuracies": 0.6875, - "rewards/chosen": -3.3774936199188232, - "rewards/margins": 1.1772058010101318, - "rewards/rejected": -4.554699420928955, - "step": 1490 + "learning_rate": 3.581603349196371e-08, + "logits/chosen": -0.12471990287303925, + "logits/rejected": -0.14701852202415466, + "logps/chosen": -472.03851318359375, + "logps/rejected": -641.9672241210938, + "loss": 0.3071, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.6948821544647217, + "rewards/margins": 1.8576085567474365, + "rewards/rejected": -3.552490234375, + "step": 850 }, { "epoch": 1.8, - "learning_rate": 2.995489613345753e-08, - "logits/chosen": 0.4362161159515381, - "logits/rejected": 0.6808967590332031, - "logps/chosen": -674.7125244140625, - "logps/rejected": -721.2472534179688, - "loss": 0.0142, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -3.4577622413635254, - "rewards/margins": 1.0942115783691406, - "rewards/rejected": -4.551973342895508, - "step": 1500 - }, - { - "epoch": 1.81, - "learning_rate": 2.6485835570499494e-08, - "logits/chosen": 0.43375635147094727, - "logits/rejected": 0.6752065420150757, - "logps/chosen": -632.56396484375, - "logps/rejected": -739.25439453125, - "loss": 0.0147, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.9880669116973877, - "rewards/margins": 1.6647765636444092, - "rewards/rejected": -4.652843475341797, - "step": 1510 + "learning_rate": 2.9324414157151367e-08, + "logits/chosen": -0.04490917921066284, + "logits/rejected": -0.14014053344726562, + "logps/chosen": -501.17724609375, + "logps/rejected": -699.5184936523438, + "loss": 0.3172, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.888622522354126, + "rewards/margins": 1.9717451333999634, + "rewards/rejected": -3.8603675365448, + "step": 860 }, { "epoch": 1.82, - "learning_rate": 2.3224751168831048e-08, - "logits/chosen": 0.4767048954963684, - "logits/rejected": 0.7012344598770142, - "logps/chosen": -584.639892578125, - "logps/rejected": -666.232666015625, - "loss": 0.016, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.8932371139526367, - "rewards/margins": 1.4125299453735352, - "rewards/rejected": -4.305767059326172, - "step": 1520 + "learning_rate": 2.3463749726290284e-08, + "logits/chosen": -0.2078404426574707, + "logits/rejected": -0.2235480546951294, + "logps/chosen": -515.3598022460938, + "logps/rejected": -679.1048583984375, + "loss": 0.3106, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7966963052749634, + "rewards/margins": 2.0378241539001465, + "rewards/rejected": -3.8345208168029785, + "step": 870 }, { "epoch": 1.84, - "learning_rate": 2.0173075256915418e-08, - "logits/chosen": 0.5076589584350586, - "logits/rejected": 0.5357536673545837, - "logps/chosen": -660.1342163085938, - "logps/rejected": -769.7876586914062, - "loss": 0.0161, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -3.183320999145508, - "rewards/margins": 1.548128366470337, - "rewards/rejected": -4.731449127197266, - "step": 1530 - }, - { - "epoch": 1.85, - "learning_rate": 1.7332148187142126e-08, - "logits/chosen": 0.5189244151115417, - "logits/rejected": 0.5688571929931641, - "logps/chosen": -592.2298583984375, - "logps/rejected": -701.166748046875, - "loss": 0.0136, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.999229907989502, - "rewards/margins": 1.526184320449829, - "rewards/rejected": -4.525413990020752, - "step": 1540 + "learning_rate": 1.824189659787284e-08, + "logits/chosen": -0.13853205740451813, + "logits/rejected": -0.21100616455078125, + "logps/chosen": -472.497314453125, + "logps/rejected": -661.0221557617188, + "loss": 0.3159, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.8647406101226807, + "rewards/margins": 1.8935458660125732, + "rewards/rejected": -3.758286237716675, + "step": 880 }, { "epoch": 1.86, - "learning_rate": 1.4703217747118746e-08, - "logits/chosen": 0.47760123014450073, - "logits/rejected": 0.569862961769104, - "logps/chosen": -608.0677490234375, - "logps/rejected": -744.98974609375, - "loss": 0.0168, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -3.1274056434631348, - "rewards/margins": 1.7376220226287842, - "rewards/rejected": -4.86502742767334, - "step": 1550 - }, - { - "epoch": 1.87, - "learning_rate": 1.2287438611620182e-08, - "logits/chosen": 0.40803298354148865, - "logits/rejected": 0.47397923469543457, - "logps/chosen": -605.1934814453125, - "logps/rejected": -706.01416015625, - "loss": 0.0142, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.087914228439331, - "rewards/margins": 1.4495090246200562, - "rewards/rejected": -4.537423133850098, - "step": 1560 + "learning_rate": 1.3665854824458035e-08, + "logits/chosen": -0.1093883290886879, + "logits/rejected": -0.13345304131507874, + "logps/chosen": -502.65576171875, + "logps/rejected": -679.2914428710938, + "loss": 0.3287, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.9036403894424438, + "rewards/margins": 1.891225814819336, + "rewards/rejected": -3.7948665618896484, + "step": 890 }, { "epoch": 1.88, - "learning_rate": 1.0085871835434023e-08, - "logits/chosen": 0.457242488861084, - "logits/rejected": 0.5799147486686707, - "logps/chosen": -618.122802734375, - "logps/rejected": -734.541259765625, - "loss": 0.014, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -3.0382626056671143, - "rewards/margins": 1.7189610004425049, - "rewards/rejected": -4.757222652435303, - "step": 1570 + "learning_rate": 9.741758728888217e-09, + "logits/chosen": -0.17935501039028168, + "logits/rejected": -0.27814191579818726, + "logps/chosen": -524.1412353515625, + "logps/rejected": -714.8737182617188, + "loss": 0.3235, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8200117349624634, + "rewards/margins": 2.01808500289917, + "rewards/rejected": -3.838097095489502, + "step": 900 + }, + { + "epoch": 1.88, + "eval_logits/chosen": 0.004436488263309002, + "eval_logits/rejected": -0.08048967272043228, + "eval_logps/chosen": -512.359619140625, + "eval_logps/rejected": -669.754638671875, + "eval_loss": 0.5303945541381836, + "eval_rewards/accuracies": 0.796875, + "eval_rewards/chosen": -2.087749719619751, + "eval_rewards/margins": 1.4935452938079834, + "eval_rewards/rejected": -3.5812950134277344, + "eval_runtime": 65.9658, + "eval_samples_per_second": 30.319, + "eval_steps_per_second": 0.485, + "step": 900 }, { "epoch": 1.9, - "learning_rate": 8.099484387325494e-09, - "logits/chosen": 0.4558071494102478, - "logits/rejected": 0.5063390135765076, - "logps/chosen": -659.652099609375, - "logps/rejected": -760.4281005859375, - "loss": 0.0164, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -3.3247177600860596, - "rewards/margins": 1.6427574157714844, - "rewards/rejected": -4.967474937438965, - "step": 1580 - }, - { - "epoch": 1.91, - "learning_rate": 6.3291487253271936e-09, - "logits/chosen": 0.4046391546726227, - "logits/rejected": 0.537802517414093, - "logps/chosen": -669.2388916015625, - "logps/rejected": -760.1110229492188, - "loss": 0.0148, - "rewards/accuracies": 0.6875, - "rewards/chosen": -3.352482557296753, - "rewards/margins": 1.2877779006958008, - "rewards/rejected": -4.640260219573975, - "step": 1590 - }, - { - "epoch": 1.92, - "learning_rate": 4.775642413539338e-09, - "logits/chosen": 0.5053682327270508, - "logits/rejected": 0.637668251991272, - "logps/chosen": -612.4566650390625, - "logps/rejected": -682.6697998046875, - "loss": 0.0146, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -3.1416869163513184, - "rewards/margins": 1.0475047826766968, - "rewards/rejected": -4.189192295074463, - "step": 1600 + "learning_rate": 6.474868681043577e-09, + "logits/chosen": -0.07753603905439377, + "logits/rejected": -0.13247697055339813, + "logps/chosen": -449.83734130859375, + "logps/rejected": -685.8665161132812, + "loss": 0.3233, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.7036330699920654, + "rewards/margins": 2.366624355316162, + "rewards/rejected": -4.070257186889648, + "step": 910 }, { "epoch": 1.93, - "learning_rate": 3.4396477806090674e-09, - "logits/chosen": 0.4289971888065338, - "logits/rejected": 0.6064268350601196, - "logps/chosen": -650.7198486328125, - "logps/rejected": -691.7114868164062, - "loss": 0.0169, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -3.111639976501465, - "rewards/margins": 1.4212944507598877, - "rewards/rejected": -4.53293514251709, - "step": 1610 - }, - { - "epoch": 1.94, - "learning_rate": 2.321751620039447e-09, - "logits/chosen": 0.3730877935886383, - "logits/rejected": 0.5358615517616272, - "logps/chosen": -634.3133544921875, - "logps/rejected": -689.7657470703125, - "loss": 0.0136, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.0234384536743164, - "rewards/margins": 1.1467082500457764, - "rewards/rejected": -4.170146942138672, - "step": 1620 - }, - { - "epoch": 1.96, - "learning_rate": 1.422444932458633e-09, - "logits/chosen": 0.4879744052886963, - "logits/rejected": 0.5437642335891724, - "logps/chosen": -628.467529296875, - "logps/rejected": -736.6702880859375, - "loss": 0.0162, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -3.185553550720215, - "rewards/margins": 1.3367440700531006, - "rewards/rejected": -4.522297382354736, - "step": 1630 + "learning_rate": 3.869564046156459e-09, + "logits/chosen": -0.23725202679634094, + "logits/rejected": -0.2444530427455902, + "logps/chosen": -493.62469482421875, + "logps/rejected": -695.1348266601562, + "loss": 0.3278, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.83344304561615, + "rewards/margins": 2.064387798309326, + "rewards/rejected": -3.8978304862976074, + "step": 920 + }, + { + "epoch": 1.95, + "learning_rate": 1.929337314139412e-09, + "logits/chosen": -0.09604072570800781, + "logits/rejected": -0.23820845782756805, + "logps/chosen": -500.5563049316406, + "logps/rejected": -702.85693359375, + "loss": 0.311, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7390083074569702, + "rewards/margins": 2.2644007205963135, + "rewards/rejected": -4.003409385681152, + "step": 930 }, { "epoch": 1.97, - "learning_rate": 7.421227099634886e-10, - "logits/chosen": 0.3235490620136261, - "logits/rejected": 0.5868870615959167, - "logps/chosen": -694.3482055664062, - "logps/rejected": -766.0265502929688, - "loss": 0.0155, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -3.1539559364318848, - "rewards/margins": 1.6172927618026733, - "rewards/rejected": -4.771248817443848, - "step": 1640 - }, - { - "epoch": 1.98, - "learning_rate": 2.8108376263175083e-10, - "logits/chosen": 0.40071624517440796, - "logits/rejected": 0.5358114242553711, - "logps/chosen": -693.2542114257812, - "logps/rejected": -787.3406982421875, - "loss": 0.0156, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -3.3677265644073486, - "rewards/margins": 1.6144136190414429, - "rewards/rejected": -4.982139587402344, - "step": 1650 + "learning_rate": 6.567894177967325e-10, + "logits/chosen": -0.07687229663133621, + "logits/rejected": -0.13503529131412506, + "logps/chosen": -526.2444458007812, + "logps/rejected": -671.4005126953125, + "loss": 0.3099, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.8536115884780884, + "rewards/margins": 1.9118531942367554, + "rewards/rejected": -3.7654647827148438, + "step": 940 }, { "epoch": 1.99, - "learning_rate": 3.953058727912406e-11, - "logits/chosen": 0.3938693106174469, - "logits/rejected": 0.5522228479385376, - "logps/chosen": -619.6367797851562, - "logps/rejected": -711.4348754882812, - "loss": 0.017, - "rewards/accuracies": 0.71875, - "rewards/chosen": -3.093592882156372, - "rewards/margins": 1.4544086456298828, - "rewards/rejected": -4.548001289367676, - "step": 1660 + "learning_rate": 5.3626246194704575e-11, + "logits/chosen": -0.07911854982376099, + "logits/rejected": -0.14566132426261902, + "logps/chosen": -516.5146484375, + "logps/rejected": -673.5474853515625, + "loss": 0.328, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.8891903162002563, + "rewards/margins": 1.9256713390350342, + "rewards/rejected": -3.814861297607422, + "step": 950 }, { "epoch": 2.0, - "step": 1666, + "step": 954, "total_flos": 0.0, - "train_loss": 0.11547961056518669, - "train_runtime": 23573.0656, - "train_samples_per_second": 9.051, - "train_steps_per_second": 0.071 + "train_loss": 0.44723546317538376, + "train_runtime": 8668.6976, + "train_samples_per_second": 14.105, + "train_steps_per_second": 0.11 } ], "logging_steps": 10, - "max_steps": 1666, + "max_steps": 954, "num_train_epochs": 2, - "save_steps": 10000, + "save_steps": 1000, "total_flos": 0.0, "trial_name": null, "trial_params": null