{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9994765768123528, "eval_steps": 100, "global_step": 3820, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.3089005235602096e-08, "logits/chosen": 0.896942138671875, "logits/rejected": 0.9175108075141907, "logps/chosen": -192.32028198242188, "logps/rejected": -193.69876098632812, "loss": 2500.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 1.3089005235602095e-07, "logits/chosen": 0.903715968132019, "logits/rejected": 0.9309377670288086, "logps/chosen": -253.598876953125, "logps/rejected": -228.25482177734375, "loss": 2504.6897, "rewards/accuracies": 0.3819444477558136, "rewards/chosen": -0.0001807510998332873, "rewards/margins": -0.0004412428825162351, "rewards/rejected": 0.00026049179723486304, "step": 10 }, { "epoch": 0.01, "learning_rate": 2.617801047120419e-07, "logits/chosen": 0.8256899118423462, "logits/rejected": 0.9293961524963379, "logps/chosen": -252.84963989257812, "logps/rejected": -214.4913330078125, "loss": 2511.0686, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.0004701187717728317, "rewards/margins": -0.0010407656664028764, "rewards/rejected": 0.0005706468946300447, "step": 20 }, { "epoch": 0.02, "learning_rate": 3.926701570680629e-07, "logits/chosen": 0.8985889554023743, "logits/rejected": 0.8785662651062012, "logps/chosen": -236.40536499023438, "logps/rejected": -219.20285034179688, "loss": 2494.8072, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0004474873130675405, "rewards/margins": 0.0005769692361354828, "rewards/rejected": -0.00012948190851602703, "step": 30 }, { "epoch": 0.02, "learning_rate": 5.235602094240838e-07, "logits/chosen": 0.819919228553772, "logits/rejected": 0.9144619703292847, "logps/chosen": -252.99588012695312, "logps/rejected": -225.9224853515625, "loss": 2504.7604, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0003411176148802042, "rewards/margins": -0.0004235326196067035, "rewards/rejected": 8.241502655437216e-05, "step": 40 }, { "epoch": 0.03, "learning_rate": 6.544502617801048e-07, "logits/chosen": 0.7974398136138916, "logits/rejected": 0.8803712725639343, "logps/chosen": -254.3247528076172, "logps/rejected": -243.318603515625, "loss": 2498.3947, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.00023676609271205962, "rewards/margins": 0.0002204025659011677, "rewards/rejected": -0.00045716846943832934, "step": 50 }, { "epoch": 0.03, "learning_rate": 7.853403141361258e-07, "logits/chosen": 0.8626053929328918, "logits/rejected": 0.8485649824142456, "logps/chosen": -262.6585693359375, "logps/rejected": -248.63272094726562, "loss": 2500.4902, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0002292692952323705, "rewards/margins": 2.0124425645917654e-05, "rewards/rejected": -0.0002493937499821186, "step": 60 }, { "epoch": 0.04, "learning_rate": 9.162303664921466e-07, "logits/chosen": 0.8897444605827332, "logits/rejected": 0.8922082185745239, "logps/chosen": -232.531005859375, "logps/rejected": -234.0869903564453, "loss": 2496.8041, "rewards/accuracies": 0.53125, "rewards/chosen": 3.7896970752626657e-06, "rewards/margins": 0.00037148987757973373, "rewards/rejected": -0.00036770018050447106, "step": 70 }, { "epoch": 0.04, "learning_rate": 1.0471204188481676e-06, "logits/chosen": 0.8788009881973267, "logits/rejected": 0.8891068696975708, "logps/chosen": -242.5009765625, "logps/rejected": -229.5125732421875, "loss": 2508.6898, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.0007867829990573227, "rewards/margins": -0.0008174808463081717, "rewards/rejected": 3.069788363063708e-05, "step": 80 }, { "epoch": 0.05, "learning_rate": 1.1780104712041885e-06, "logits/chosen": 0.8606799840927124, "logits/rejected": 0.9575719833374023, "logps/chosen": -232.0597686767578, "logps/rejected": -218.4732666015625, "loss": 2496.8559, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0003882866003550589, "rewards/margins": 0.000359431782271713, "rewards/rejected": 2.8854870834038593e-05, "step": 90 }, { "epoch": 0.05, "learning_rate": 1.3089005235602096e-06, "logits/chosen": 0.8833224177360535, "logits/rejected": 0.8661258816719055, "logps/chosen": -245.799072265625, "logps/rejected": -249.2645721435547, "loss": 2496.843, "rewards/accuracies": 0.5, "rewards/chosen": -0.0005279771285131574, "rewards/margins": 0.00037464461638592184, "rewards/rejected": -0.0009026216575875878, "step": 100 }, { "epoch": 0.05, "eval_logits/chosen": 0.8318074345588684, "eval_logits/rejected": 0.8888298273086548, "eval_logps/chosen": -256.65057373046875, "eval_logps/rejected": -233.56494140625, "eval_loss": 2502.266845703125, "eval_rewards/accuracies": 0.5005000233650208, "eval_rewards/chosen": -0.0003313073539175093, "eval_rewards/margins": -0.00017098072567023337, "eval_rewards/rejected": -0.00016032661369536072, "eval_runtime": 416.835, "eval_samples_per_second": 4.798, "eval_steps_per_second": 1.2, "step": 100 }, { "epoch": 0.06, "learning_rate": 1.4397905759162306e-06, "logits/chosen": 0.9012953042984009, "logits/rejected": 0.8766192197799683, "logps/chosen": -229.46292114257812, "logps/rejected": -210.2642364501953, "loss": 2501.3449, "rewards/accuracies": 0.5, "rewards/chosen": -0.00010574392217677087, "rewards/margins": -7.803810149198398e-05, "rewards/rejected": -2.7705809770850465e-05, "step": 110 }, { "epoch": 0.06, "learning_rate": 1.5706806282722515e-06, "logits/chosen": 0.8108441233634949, "logits/rejected": 0.8906086087226868, "logps/chosen": -273.14385986328125, "logps/rejected": -259.1924133300781, "loss": 2494.2678, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.00021679741621483117, "rewards/margins": 0.0006280258530750871, "rewards/rejected": -0.00041122836410067976, "step": 120 }, { "epoch": 0.07, "learning_rate": 1.7015706806282726e-06, "logits/chosen": 0.8712674975395203, "logits/rejected": 0.9049458503723145, "logps/chosen": -277.8616943359375, "logps/rejected": -222.53662109375, "loss": 2489.5006, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0006732499459758401, "rewards/margins": 0.001107201213017106, "rewards/rejected": -0.0004339513252489269, "step": 130 }, { "epoch": 0.07, "learning_rate": 1.8324607329842933e-06, "logits/chosen": 0.7955681085586548, "logits/rejected": 0.8811987638473511, "logps/chosen": -248.83865356445312, "logps/rejected": -246.317138671875, "loss": 2504.0979, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00031032637343741953, "rewards/margins": -0.00034084441722370684, "rewards/rejected": 3.0518032872350886e-05, "step": 140 }, { "epoch": 0.08, "learning_rate": 1.9633507853403143e-06, "logits/chosen": 0.7933157086372375, "logits/rejected": 0.8591764569282532, "logps/chosen": -257.7363586425781, "logps/rejected": -217.54580688476562, "loss": 2507.8082, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0007291415822692215, "rewards/margins": -0.0007318807765841484, "rewards/rejected": 2.739173851296073e-06, "step": 150 }, { "epoch": 0.08, "learning_rate": 2.094240837696335e-06, "logits/chosen": 0.8041954040527344, "logits/rejected": 0.8887465596199036, "logps/chosen": -276.43304443359375, "logps/rejected": -250.4193572998047, "loss": 2504.2807, "rewards/accuracies": 0.5, "rewards/chosen": -0.00010556764755165204, "rewards/margins": -0.0003692187019623816, "rewards/rejected": 0.00026365104713477194, "step": 160 }, { "epoch": 0.09, "learning_rate": 2.2251308900523565e-06, "logits/chosen": 0.8059272766113281, "logits/rejected": 0.8950363993644714, "logps/chosen": -274.240234375, "logps/rejected": -247.8701171875, "loss": 2501.8248, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.001084670191630721, "rewards/margins": -0.0001141707762144506, "rewards/rejected": -0.0009704994154162705, "step": 170 }, { "epoch": 0.09, "learning_rate": 2.356020942408377e-06, "logits/chosen": 0.8783141374588013, "logits/rejected": 0.8253491520881653, "logps/chosen": -242.3585968017578, "logps/rejected": -221.0929718017578, "loss": 2496.3063, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0004814372514374554, "rewards/margins": 0.0004220888367854059, "rewards/rejected": -0.0009035261464305222, "step": 180 }, { "epoch": 0.1, "learning_rate": 2.4869109947643982e-06, "logits/chosen": 0.8767743110656738, "logits/rejected": 0.8822822570800781, "logps/chosen": -246.2511444091797, "logps/rejected": -224.4364471435547, "loss": 2498.3156, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.00032182232826016843, "rewards/margins": 0.00022237170196603984, "rewards/rejected": 9.945056081051007e-05, "step": 190 }, { "epoch": 0.1, "learning_rate": 2.617801047120419e-06, "logits/chosen": 0.855573296546936, "logits/rejected": 0.9106731414794922, "logps/chosen": -258.25885009765625, "logps/rejected": -236.6140594482422, "loss": 2499.2807, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.00018535128037910908, "rewards/margins": 0.0001189738031825982, "rewards/rejected": -0.0003043250762857497, "step": 200 }, { "epoch": 0.1, "eval_logits/chosen": 0.8310006856918335, "eval_logits/rejected": 0.8882209062576294, "eval_logps/chosen": -256.6106262207031, "eval_logps/rejected": -233.5994873046875, "eval_loss": 2494.83544921875, "eval_rewards/accuracies": 0.5189999938011169, "eval_rewards/chosen": 6.786447193007916e-05, "eval_rewards/margins": 0.0005738017498515546, "eval_rewards/rejected": -0.0005059372633695602, "eval_runtime": 416.4863, "eval_samples_per_second": 4.802, "eval_steps_per_second": 1.201, "step": 200 }, { "epoch": 0.11, "learning_rate": 2.74869109947644e-06, "logits/chosen": 0.9517833590507507, "logits/rejected": 0.910740852355957, "logps/chosen": -244.80032348632812, "logps/rejected": -232.45321655273438, "loss": 2500.5746, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -9.258640056941658e-05, "rewards/margins": -6.444106020353502e-06, "rewards/rejected": -8.61423322930932e-05, "step": 210 }, { "epoch": 0.12, "learning_rate": 2.8795811518324613e-06, "logits/chosen": 0.8369059562683105, "logits/rejected": 0.8937622904777527, "logps/chosen": -267.40948486328125, "logps/rejected": -209.13290405273438, "loss": 2487.1783, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0004562476242426783, "rewards/margins": 0.0013524172827601433, "rewards/rejected": -0.0008961696876212955, "step": 220 }, { "epoch": 0.12, "learning_rate": 3.010471204188482e-06, "logits/chosen": 0.8930699229240417, "logits/rejected": 0.9343907237052917, "logps/chosen": -258.6376037597656, "logps/rejected": -221.6857452392578, "loss": 2493.877, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.0002062669227598235, "rewards/margins": 0.0006668218411505222, "rewards/rejected": -0.00046055493294261396, "step": 230 }, { "epoch": 0.13, "learning_rate": 3.141361256544503e-06, "logits/chosen": 0.9376400113105774, "logits/rejected": 0.8995400667190552, "logps/chosen": -228.9315185546875, "logps/rejected": -242.112548828125, "loss": 2480.1844, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0004402367048896849, "rewards/margins": 0.002044759690761566, "rewards/rejected": -0.001604523160494864, "step": 240 }, { "epoch": 0.13, "learning_rate": 3.2722513089005235e-06, "logits/chosen": 0.9079924821853638, "logits/rejected": 0.8767238855361938, "logps/chosen": -242.71121215820312, "logps/rejected": -233.9228973388672, "loss": 2493.2939, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -6.936644058441743e-05, "rewards/margins": 0.0007372990949079394, "rewards/rejected": -0.0008066653972491622, "step": 250 }, { "epoch": 0.14, "learning_rate": 3.403141361256545e-06, "logits/chosen": 0.8814166784286499, "logits/rejected": 0.9410937428474426, "logps/chosen": -236.88095092773438, "logps/rejected": -223.930419921875, "loss": 2496.3828, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00017086375737562776, "rewards/margins": 0.0004164519195910543, "rewards/rejected": -0.0005873156478628516, "step": 260 }, { "epoch": 0.14, "learning_rate": 3.534031413612566e-06, "logits/chosen": 0.8372557759284973, "logits/rejected": 0.8741558194160461, "logps/chosen": -212.303466796875, "logps/rejected": -239.15158081054688, "loss": 2483.2088, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.00031574201420880854, "rewards/margins": 0.0017516377847641706, "rewards/rejected": -0.0020673798862844706, "step": 270 }, { "epoch": 0.15, "learning_rate": 3.6649214659685865e-06, "logits/chosen": 0.8835927844047546, "logits/rejected": 0.9312320947647095, "logps/chosen": -250.3417510986328, "logps/rejected": -263.51531982421875, "loss": 2492.1223, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0005325328675098717, "rewards/margins": 0.000863347842823714, "rewards/rejected": -0.0013958807103335857, "step": 280 }, { "epoch": 0.15, "learning_rate": 3.7958115183246074e-06, "logits/chosen": 0.8323150873184204, "logits/rejected": 0.8896921277046204, "logps/chosen": -250.1656951904297, "logps/rejected": -234.757568359375, "loss": 2483.249, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.00013801059685647488, "rewards/margins": 0.0017679758602753282, "rewards/rejected": -0.001905986457131803, "step": 290 }, { "epoch": 0.16, "learning_rate": 3.926701570680629e-06, "logits/chosen": 0.8511127233505249, "logits/rejected": 0.8209661245346069, "logps/chosen": -273.7172546386719, "logps/rejected": -250.82748413085938, "loss": 2477.7609, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0011967200553044677, "rewards/margins": 0.0023162723518908024, "rewards/rejected": -0.0035129922907799482, "step": 300 }, { "epoch": 0.16, "eval_logits/chosen": 0.8318725824356079, "eval_logits/rejected": 0.8892252445220947, "eval_logps/chosen": -256.7284851074219, "eval_logps/rejected": -233.8547821044922, "eval_loss": 2481.50146484375, "eval_rewards/accuracies": 0.559499979019165, "eval_rewards/chosen": -0.001110685057938099, "eval_rewards/margins": 0.0019479345064610243, "eval_rewards/rejected": -0.003058619564399123, "eval_runtime": 416.6935, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.2, "step": 300 }, { "epoch": 0.16, "learning_rate": 4.05759162303665e-06, "logits/chosen": 0.7432538866996765, "logits/rejected": 0.817090630531311, "logps/chosen": -274.85931396484375, "logps/rejected": -236.4228057861328, "loss": 2469.5742, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.001123375492170453, "rewards/margins": 0.003155052661895752, "rewards/rejected": -0.004278427921235561, "step": 310 }, { "epoch": 0.17, "learning_rate": 4.18848167539267e-06, "logits/chosen": 0.9147623777389526, "logits/rejected": 0.9334859848022461, "logps/chosen": -233.425537109375, "logps/rejected": -214.6092071533203, "loss": 2457.892, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.00017569802002981305, "rewards/margins": 0.004348465241491795, "rewards/rejected": -0.004172767512500286, "step": 320 }, { "epoch": 0.17, "learning_rate": 4.319371727748692e-06, "logits/chosen": 0.8749006390571594, "logits/rejected": 0.9252738952636719, "logps/chosen": -247.8308563232422, "logps/rejected": -218.9490203857422, "loss": 2463.7824, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0017152890795841813, "rewards/margins": 0.0037990615237504244, "rewards/rejected": -0.0055143507197499275, "step": 330 }, { "epoch": 0.18, "learning_rate": 4.450261780104713e-06, "logits/chosen": 0.8781224489212036, "logits/rejected": 0.9259663820266724, "logps/chosen": -253.4806365966797, "logps/rejected": -239.67434692382812, "loss": 2474.3055, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0034939595498144627, "rewards/margins": 0.0027302266098558903, "rewards/rejected": -0.00622418662533164, "step": 340 }, { "epoch": 0.18, "learning_rate": 4.5811518324607335e-06, "logits/chosen": 0.7855554223060608, "logits/rejected": 0.9314893484115601, "logps/chosen": -255.0915985107422, "logps/rejected": -206.8708038330078, "loss": 2432.458, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0009129707468673587, "rewards/margins": 0.007046517916023731, "rewards/rejected": -0.007959488779306412, "step": 350 }, { "epoch": 0.19, "learning_rate": 4.712041884816754e-06, "logits/chosen": 0.8957219123840332, "logits/rejected": 0.8874330520629883, "logps/chosen": -257.02764892578125, "logps/rejected": -230.2834014892578, "loss": 2441.5672, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.001695218845270574, "rewards/margins": 0.006147631909698248, "rewards/rejected": -0.007842850871384144, "step": 360 }, { "epoch": 0.19, "learning_rate": 4.842931937172775e-06, "logits/chosen": 0.9125442504882812, "logits/rejected": 0.8982815742492676, "logps/chosen": -230.16940307617188, "logps/rejected": -211.3495635986328, "loss": 2439.5756, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.004968739114701748, "rewards/margins": 0.006435071583837271, "rewards/rejected": -0.011403810232877731, "step": 370 }, { "epoch": 0.2, "learning_rate": 4.9738219895287965e-06, "logits/chosen": 0.9134615063667297, "logits/rejected": 0.8667083978652954, "logps/chosen": -267.7635192871094, "logps/rejected": -219.404541015625, "loss": 2438.1262, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.004255574196577072, "rewards/margins": 0.006601777859032154, "rewards/rejected": -0.0108573529869318, "step": 380 }, { "epoch": 0.2, "learning_rate": 4.999933200062888e-06, "logits/chosen": 0.8681972622871399, "logits/rejected": 0.8684479594230652, "logps/chosen": -253.1089324951172, "logps/rejected": -232.1811981201172, "loss": 2414.8473, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0038361139595508575, "rewards/margins": 0.009047028608620167, "rewards/rejected": -0.012883143499493599, "step": 390 }, { "epoch": 0.21, "learning_rate": 4.999661831436499e-06, "logits/chosen": 0.9156022071838379, "logits/rejected": 0.9197471737861633, "logps/chosen": -260.40093994140625, "logps/rejected": -238.50961303710938, "loss": 2428.4195, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.005848343018442392, "rewards/margins": 0.007789201103150845, "rewards/rejected": -0.013637542724609375, "step": 400 }, { "epoch": 0.21, "eval_logits/chosen": 0.8404272794723511, "eval_logits/rejected": 0.8983384966850281, "eval_logps/chosen": -257.29510498046875, "eval_logps/rejected": -235.11265563964844, "eval_loss": 2419.1044921875, "eval_rewards/accuracies": 0.6495000123977661, "eval_rewards/chosen": -0.006776793394237757, "eval_rewards/margins": 0.008860657922923565, "eval_rewards/rejected": -0.01563744992017746, "eval_runtime": 416.4578, "eval_samples_per_second": 4.802, "eval_steps_per_second": 1.201, "step": 400 }, { "epoch": 0.21, "learning_rate": 4.999181741766532e-06, "logits/chosen": 0.8992105722427368, "logits/rejected": 0.8969219923019409, "logps/chosen": -252.7702178955078, "logps/rejected": -249.8018798828125, "loss": 2438.1242, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.007479649968445301, "rewards/margins": 0.006951476447284222, "rewards/rejected": -0.014431129209697247, "step": 410 }, { "epoch": 0.22, "learning_rate": 4.9984929711403395e-06, "logits/chosen": 0.913814902305603, "logits/rejected": 0.9069592356681824, "logps/chosen": -255.3079833984375, "logps/rejected": -251.0774383544922, "loss": 2417.1992, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.011085378006100655, "rewards/margins": 0.009236546233296394, "rewards/rejected": -0.020321926102042198, "step": 420 }, { "epoch": 0.23, "learning_rate": 4.997595577070068e-06, "logits/chosen": 0.8943805694580078, "logits/rejected": 0.8994030952453613, "logps/chosen": -235.75009155273438, "logps/rejected": -232.7864990234375, "loss": 2429.1924, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.012942090630531311, "rewards/margins": 0.007976246066391468, "rewards/rejected": -0.020918335765600204, "step": 430 }, { "epoch": 0.23, "learning_rate": 4.996489634487865e-06, "logits/chosen": 0.8081871867179871, "logits/rejected": 0.8945296406745911, "logps/chosen": -231.6891632080078, "logps/rejected": -253.6363067626953, "loss": 2484.0605, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.012701654806733131, "rewards/margins": 0.003084682859480381, "rewards/rejected": -0.015786338597536087, "step": 440 }, { "epoch": 0.24, "learning_rate": 4.995175235739619e-06, "logits/chosen": 0.8565770983695984, "logits/rejected": 0.8623224496841431, "logps/chosen": -264.3777770996094, "logps/rejected": -267.23712158203125, "loss": 2392.5188, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.011126170866191387, "rewards/margins": 0.011980591341853142, "rewards/rejected": -0.023106763139367104, "step": 450 }, { "epoch": 0.24, "learning_rate": 4.9936524905772466e-06, "logits/chosen": 0.78350430727005, "logits/rejected": 0.8708200454711914, "logps/chosen": -267.3443298339844, "logps/rejected": -236.05459594726562, "loss": 2404.3129, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.015727100893855095, "rewards/margins": 0.010795451700687408, "rewards/rejected": -0.026522550731897354, "step": 460 }, { "epoch": 0.25, "learning_rate": 4.991921526149529e-06, "logits/chosen": 0.9162321090698242, "logits/rejected": 0.9280640482902527, "logps/chosen": -256.3532409667969, "logps/rejected": -247.89315795898438, "loss": 2386.3984, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.01587381586432457, "rewards/margins": 0.01281227171421051, "rewards/rejected": -0.02868608757853508, "step": 470 }, { "epoch": 0.25, "learning_rate": 4.9899824869915e-06, "logits/chosen": 0.8048622012138367, "logits/rejected": 0.8283928036689758, "logps/chosen": -246.1285858154297, "logps/rejected": -252.7605743408203, "loss": 2396.8398, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.021037602797150612, "rewards/margins": 0.012100132182240486, "rewards/rejected": -0.0331377312541008, "step": 480 }, { "epoch": 0.26, "learning_rate": 4.987835535012371e-06, "logits/chosen": 0.8453197479248047, "logits/rejected": 0.86089026927948, "logps/chosen": -240.1754913330078, "logps/rejected": -229.0949249267578, "loss": 2350.6213, "rewards/accuracies": 0.625, "rewards/chosen": -0.019980577751994133, "rewards/margins": 0.01681477203965187, "rewards/rejected": -0.03679535537958145, "step": 490 }, { "epoch": 0.26, "learning_rate": 4.985480849482012e-06, "logits/chosen": 0.8507258296012878, "logits/rejected": 0.8831195831298828, "logps/chosen": -264.3097839355469, "logps/rejected": -267.3841552734375, "loss": 2296.8842, "rewards/accuracies": 0.6875, "rewards/chosen": -0.019409244880080223, "rewards/margins": 0.023494381457567215, "rewards/rejected": -0.04290362820029259, "step": 500 }, { "epoch": 0.26, "eval_logits/chosen": 0.8214389681816101, "eval_logits/rejected": 0.8805551528930664, "eval_logps/chosen": -259.0124206542969, "eval_logps/rejected": -237.73793029785156, "eval_loss": 2349.435791015625, "eval_rewards/accuracies": 0.656499981880188, "eval_rewards/chosen": -0.023950034752488136, "eval_rewards/margins": 0.017940117046236992, "eval_rewards/rejected": -0.04189015179872513, "eval_runtime": 416.5178, "eval_samples_per_second": 4.802, "eval_steps_per_second": 1.2, "step": 500 }, { "epoch": 0.27, "learning_rate": 4.98291862701599e-06, "logits/chosen": 0.8344039916992188, "logits/rejected": 0.8795874714851379, "logps/chosen": -254.190673828125, "logps/rejected": -214.09396362304688, "loss": 2307.1967, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02996075712144375, "rewards/margins": 0.022381700575351715, "rewards/rejected": -0.052342455834150314, "step": 510 }, { "epoch": 0.27, "learning_rate": 4.980149081559142e-06, "logits/chosen": 0.8595240712165833, "logits/rejected": 0.903663158416748, "logps/chosen": -237.7533416748047, "logps/rejected": -241.7561492919922, "loss": 2330.1758, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02604847028851509, "rewards/margins": 0.020816484466195107, "rewards/rejected": -0.04686495289206505, "step": 520 }, { "epoch": 0.28, "learning_rate": 4.977172444367718e-06, "logits/chosen": 0.8232777714729309, "logits/rejected": 0.8955798149108887, "logps/chosen": -248.8101806640625, "logps/rejected": -226.21786499023438, "loss": 2284.3357, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0257116612046957, "rewards/margins": 0.026299094781279564, "rewards/rejected": -0.052010755985975266, "step": 530 }, { "epoch": 0.28, "learning_rate": 4.9739889639900655e-06, "logits/chosen": 0.8850505948066711, "logits/rejected": 0.9008530378341675, "logps/chosen": -260.18963623046875, "logps/rejected": -228.6940155029297, "loss": 2314.4127, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.028372693806886673, "rewards/margins": 0.02354586310684681, "rewards/rejected": -0.051918547600507736, "step": 540 }, { "epoch": 0.29, "learning_rate": 4.9705989062458805e-06, "logits/chosen": 0.8566417694091797, "logits/rejected": 0.8558026552200317, "logps/chosen": -242.9883270263672, "logps/rejected": -247.76370239257812, "loss": 2339.4164, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03327028453350067, "rewards/margins": 0.02076330967247486, "rewards/rejected": -0.054033588618040085, "step": 550 }, { "epoch": 0.29, "learning_rate": 4.967002554204009e-06, "logits/chosen": 0.7901058793067932, "logits/rejected": 0.8357075452804565, "logps/chosen": -263.3518371582031, "logps/rejected": -250.97958374023438, "loss": 2302.1379, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.04014524444937706, "rewards/margins": 0.025143718346953392, "rewards/rejected": -0.0652889683842659, "step": 560 }, { "epoch": 0.3, "learning_rate": 4.963200208158811e-06, "logits/chosen": 0.8461757898330688, "logits/rejected": 0.9372328519821167, "logps/chosen": -223.61373901367188, "logps/rejected": -212.1987762451172, "loss": 2320.3783, "rewards/accuracies": 0.65625, "rewards/chosen": -0.04337463900446892, "rewards/margins": 0.021725038066506386, "rewards/rejected": -0.06509967893362045, "step": 570 }, { "epoch": 0.3, "learning_rate": 4.959192185605089e-06, "logits/chosen": 0.7897135615348816, "logits/rejected": 0.9126697778701782, "logps/chosen": -294.5254821777344, "logps/rejected": -232.0474395751953, "loss": 2167.2896, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.03270890563726425, "rewards/margins": 0.041936445981264114, "rewards/rejected": -0.07464535534381866, "step": 580 }, { "epoch": 0.31, "learning_rate": 4.95497882121157e-06, "logits/chosen": 0.8067277669906616, "logits/rejected": 0.8418477773666382, "logps/chosen": -240.95639038085938, "logps/rejected": -217.8979949951172, "loss": 2249.4051, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.04393559694290161, "rewards/margins": 0.03141217678785324, "rewards/rejected": -0.07534776628017426, "step": 590 }, { "epoch": 0.31, "learning_rate": 4.950560466792969e-06, "logits/chosen": 0.8411356806755066, "logits/rejected": 0.8524805307388306, "logps/chosen": -241.7827606201172, "logps/rejected": -234.87985229492188, "loss": 2254.5846, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04195866733789444, "rewards/margins": 0.03153757005929947, "rewards/rejected": -0.07349623739719391, "step": 600 }, { "epoch": 0.31, "eval_logits/chosen": 0.7868022322654724, "eval_logits/rejected": 0.8478493094444275, "eval_logps/chosen": -261.86590576171875, "eval_logps/rejected": -241.83828735351562, "eval_loss": 2273.499267578125, "eval_rewards/accuracies": 0.6570000052452087, "eval_rewards/chosen": -0.05248467996716499, "eval_rewards/margins": 0.03040897473692894, "eval_rewards/rejected": -0.08289366215467453, "eval_runtime": 416.6239, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.2, "step": 600 }, { "epoch": 0.32, "learning_rate": 4.945937491280611e-06, "logits/chosen": 0.7756252288818359, "logits/rejected": 0.8814484477043152, "logps/chosen": -245.0117950439453, "logps/rejected": -217.2014923095703, "loss": 2348.3988, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.05819234997034073, "rewards/margins": 0.021564457565546036, "rewards/rejected": -0.07975681126117706, "step": 610 }, { "epoch": 0.32, "learning_rate": 4.9411102806916185e-06, "logits/chosen": 0.7903825640678406, "logits/rejected": 0.8663345575332642, "logps/chosen": -270.52239990234375, "logps/rejected": -254.25369262695312, "loss": 2181.5219, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05034894496202469, "rewards/margins": 0.042683206498622894, "rewards/rejected": -0.09303215146064758, "step": 620 }, { "epoch": 0.33, "learning_rate": 4.9360792380966875e-06, "logits/chosen": 0.8880133628845215, "logits/rejected": 0.8636928796768188, "logps/chosen": -241.08279418945312, "logps/rejected": -220.8309783935547, "loss": 2263.642, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.05800148844718933, "rewards/margins": 0.0320889875292778, "rewards/rejected": -0.09009047597646713, "step": 630 }, { "epoch": 0.33, "learning_rate": 4.930844783586424e-06, "logits/chosen": 0.8702048063278198, "logits/rejected": 0.8985433578491211, "logps/chosen": -240.00460815429688, "logps/rejected": -238.61227416992188, "loss": 2244.0928, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.05921437591314316, "rewards/margins": 0.032827965915203094, "rewards/rejected": -0.09204234182834625, "step": 640 }, { "epoch": 0.34, "learning_rate": 4.925407354236279e-06, "logits/chosen": 0.8151038885116577, "logits/rejected": 0.8790351152420044, "logps/chosen": -241.45755004882812, "logps/rejected": -226.37667846679688, "loss": 2236.8199, "rewards/accuracies": 0.65625, "rewards/chosen": -0.06360708922147751, "rewards/margins": 0.03408312052488327, "rewards/rejected": -0.09769020974636078, "step": 650 }, { "epoch": 0.35, "learning_rate": 4.919767404070033e-06, "logits/chosen": 0.8537635803222656, "logits/rejected": 0.8909260034561157, "logps/chosen": -221.7104034423828, "logps/rejected": -206.9849395751953, "loss": 2317.7387, "rewards/accuracies": 0.625, "rewards/chosen": -0.07372693717479706, "rewards/margins": 0.02632719837129116, "rewards/rejected": -0.10005412995815277, "step": 660 }, { "epoch": 0.35, "learning_rate": 4.913925404021905e-06, "logits/chosen": 0.8039971590042114, "logits/rejected": 0.8006811141967773, "logps/chosen": -240.97213745117188, "logps/rejected": -209.97219848632812, "loss": 2208.3297, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.07874181121587753, "rewards/margins": 0.03945617750287056, "rewards/rejected": -0.11819799244403839, "step": 670 }, { "epoch": 0.36, "learning_rate": 4.907881841897216e-06, "logits/chosen": 0.8469578623771667, "logits/rejected": 0.8435947299003601, "logps/chosen": -257.61810302734375, "logps/rejected": -245.639892578125, "loss": 2260.2275, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07468362152576447, "rewards/margins": 0.03678290545940399, "rewards/rejected": -0.11146652698516846, "step": 680 }, { "epoch": 0.36, "learning_rate": 4.901637222331665e-06, "logits/chosen": 0.7657278776168823, "logits/rejected": 0.7471415996551514, "logps/chosen": -259.5301513671875, "logps/rejected": -236.86032104492188, "loss": 2287.06, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.08623397350311279, "rewards/margins": 0.03296409547328949, "rewards/rejected": -0.11919806897640228, "step": 690 }, { "epoch": 0.37, "learning_rate": 4.89519206674919e-06, "logits/chosen": 0.7977254390716553, "logits/rejected": 0.8500420451164246, "logps/chosen": -244.900634765625, "logps/rejected": -251.4091339111328, "loss": 2330.7787, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08788236975669861, "rewards/margins": 0.02753649279475212, "rewards/rejected": -0.11541886627674103, "step": 700 }, { "epoch": 0.37, "eval_logits/chosen": 0.7516666650772095, "eval_logits/rejected": 0.812827467918396, "eval_logps/chosen": -264.809326171875, "eval_logps/rejected": -245.7631378173828, "eval_loss": 2224.3349609375, "eval_rewards/accuracies": 0.6629999876022339, "eval_rewards/chosen": -0.08191882818937302, "eval_rewards/margins": 0.0402236245572567, "eval_rewards/rejected": -0.12214244902133942, "eval_runtime": 416.6908, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.2, "step": 700 }, { "epoch": 0.37, "learning_rate": 4.8885469133184235e-06, "logits/chosen": 0.8586422204971313, "logits/rejected": 0.8278988003730774, "logps/chosen": -244.2412109375, "logps/rejected": -248.913818359375, "loss": 2265.9631, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0877029225230217, "rewards/margins": 0.03201908990740776, "rewards/rejected": -0.11972200870513916, "step": 710 }, { "epoch": 0.38, "learning_rate": 4.881702316907769e-06, "logits/chosen": 0.853478729724884, "logits/rejected": 0.9013971090316772, "logps/chosen": -240.85440063476562, "logps/rejected": -233.39761352539062, "loss": 2238.1352, "rewards/accuracies": 0.625, "rewards/chosen": -0.08841414749622345, "rewards/margins": 0.03821689262986183, "rewards/rejected": -0.12663105130195618, "step": 720 }, { "epoch": 0.38, "learning_rate": 4.874658849039054e-06, "logits/chosen": 0.6829933524131775, "logits/rejected": 0.7707113027572632, "logps/chosen": -274.42095947265625, "logps/rejected": -232.1612091064453, "loss": 2119.9938, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.07465031743049622, "rewards/margins": 0.049049459397792816, "rewards/rejected": -0.12369978427886963, "step": 730 }, { "epoch": 0.39, "learning_rate": 4.86741709783982e-06, "logits/chosen": 0.7617892026901245, "logits/rejected": 0.8164576292037964, "logps/chosen": -267.9073486328125, "logps/rejected": -240.621826171875, "loss": 2243.2557, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.085872121155262, "rewards/margins": 0.042114924639463425, "rewards/rejected": -0.1279870569705963, "step": 740 }, { "epoch": 0.39, "learning_rate": 4.859977667994209e-06, "logits/chosen": 0.74756920337677, "logits/rejected": 0.8244425654411316, "logps/chosen": -255.57754516601562, "logps/rejected": -242.3626251220703, "loss": 2231.4938, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.09324190765619278, "rewards/margins": 0.04055342823266983, "rewards/rejected": -0.1337953507900238, "step": 750 }, { "epoch": 0.4, "learning_rate": 4.852341180692471e-06, "logits/chosen": 0.7698923945426941, "logits/rejected": 0.7992275953292847, "logps/chosen": -256.96868896484375, "logps/rejected": -271.1941833496094, "loss": 2239.7266, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.08798633515834808, "rewards/margins": 0.0419507697224617, "rewards/rejected": -0.1299370974302292, "step": 760 }, { "epoch": 0.4, "learning_rate": 4.844508273579097e-06, "logits/chosen": 0.803545355796814, "logits/rejected": 0.7743754982948303, "logps/chosen": -249.5584716796875, "logps/rejected": -238.66683959960938, "loss": 2204.2545, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.08735480904579163, "rewards/margins": 0.04183940216898918, "rewards/rejected": -0.1291942000389099, "step": 770 }, { "epoch": 0.41, "learning_rate": 4.836479600699579e-06, "logits/chosen": 0.7211157083511353, "logits/rejected": 0.7573873400688171, "logps/chosen": -251.00076293945312, "logps/rejected": -248.5784454345703, "loss": 2175.5199, "rewards/accuracies": 0.625, "rewards/chosen": -0.09977405518293381, "rewards/margins": 0.047359712421894073, "rewards/rejected": -0.14713376760482788, "step": 780 }, { "epoch": 0.41, "learning_rate": 4.82825583244579e-06, "logits/chosen": 0.7303954362869263, "logits/rejected": 0.7423623204231262, "logps/chosen": -271.29888916015625, "logps/rejected": -255.95645141601562, "loss": 2097.0797, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.09688698500394821, "rewards/margins": 0.060234714299440384, "rewards/rejected": -0.1571216881275177, "step": 790 }, { "epoch": 0.42, "learning_rate": 4.819837655500014e-06, "logits/chosen": 0.7324298620223999, "logits/rejected": 0.8285747766494751, "logps/chosen": -259.47650146484375, "logps/rejected": -261.0966491699219, "loss": 2223.6863, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1077154278755188, "rewards/margins": 0.04667884111404419, "rewards/rejected": -0.154394268989563, "step": 800 }, { "epoch": 0.42, "eval_logits/chosen": 0.6992308497428894, "eval_logits/rejected": 0.7610952854156494, "eval_logps/chosen": -266.70574951171875, "eval_logps/rejected": -248.42222595214844, "eval_loss": 2196.09912109375, "eval_rewards/accuracies": 0.6675000190734863, "eval_rewards/chosen": -0.1008833572268486, "eval_rewards/margins": 0.04784964770078659, "eval_rewards/rejected": -0.1487330049276352, "eval_runtime": 416.5458, "eval_samples_per_second": 4.801, "eval_steps_per_second": 1.2, "step": 800 }, { "epoch": 0.42, "learning_rate": 4.811225772777603e-06, "logits/chosen": 0.8175959587097168, "logits/rejected": 0.7778623700141907, "logps/chosen": -281.6056823730469, "logps/rejected": -235.4732666015625, "loss": 2154.2184, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10602346807718277, "rewards/margins": 0.0522245354950428, "rewards/rejected": -0.15824799239635468, "step": 810 }, { "epoch": 0.43, "learning_rate": 4.802420903368286e-06, "logits/chosen": 0.6645683646202087, "logits/rejected": 0.7504470944404602, "logps/chosen": -262.6244812011719, "logps/rejected": -246.62728881835938, "loss": 2070.7916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08415599167346954, "rewards/margins": 0.06298204511404037, "rewards/rejected": -0.14713802933692932, "step": 820 }, { "epoch": 0.43, "learning_rate": 4.793423782476125e-06, "logits/chosen": 0.7014611959457397, "logits/rejected": 0.7595884203910828, "logps/chosen": -261.1951599121094, "logps/rejected": -237.8996124267578, "loss": 2303.9654, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11634109169244766, "rewards/margins": 0.03935299813747406, "rewards/rejected": -0.15569409728050232, "step": 830 }, { "epoch": 0.44, "learning_rate": 4.784235161358124e-06, "logits/chosen": 0.7220847010612488, "logits/rejected": 0.8208295702934265, "logps/chosen": -274.5125427246094, "logps/rejected": -267.01275634765625, "loss": 2374.0922, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.11991143226623535, "rewards/margins": 0.03418232128024101, "rewards/rejected": -0.15409375727176666, "step": 840 }, { "epoch": 0.44, "learning_rate": 4.774855807261504e-06, "logits/chosen": 0.771617591381073, "logits/rejected": 0.7759231925010681, "logps/chosen": -266.54156494140625, "logps/rejected": -232.45114135742188, "loss": 2085.9527, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.09387587755918503, "rewards/margins": 0.059171438217163086, "rewards/rejected": -0.1530473232269287, "step": 850 }, { "epoch": 0.45, "learning_rate": 4.765286503359632e-06, "logits/chosen": 0.7542043924331665, "logits/rejected": 0.7199236154556274, "logps/chosen": -247.95004272460938, "logps/rejected": -234.8477325439453, "loss": 2140.2543, "rewards/accuracies": 0.71875, "rewards/chosen": -0.09680913388729095, "rewards/margins": 0.05759376287460327, "rewards/rejected": -0.15440289676189423, "step": 860 }, { "epoch": 0.46, "learning_rate": 4.755528048686629e-06, "logits/chosen": 0.7054905891418457, "logits/rejected": 0.7627168297767639, "logps/chosen": -269.0238342285156, "logps/rejected": -231.31594848632812, "loss": 2018.9971, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09635048359632492, "rewards/margins": 0.07080944627523422, "rewards/rejected": -0.16715992987155914, "step": 870 }, { "epoch": 0.46, "learning_rate": 4.745581258070654e-06, "logits/chosen": 0.7587330937385559, "logits/rejected": 0.7288376092910767, "logps/chosen": -279.3045959472656, "logps/rejected": -248.2842559814453, "loss": 2182.2826, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.10375384986400604, "rewards/margins": 0.04985477030277252, "rewards/rejected": -0.15360862016677856, "step": 880 }, { "epoch": 0.47, "learning_rate": 4.73544696206586e-06, "logits/chosen": 0.7099634408950806, "logits/rejected": 0.7651978731155396, "logps/chosen": -241.7110137939453, "logps/rejected": -226.1469268798828, "loss": 2313.518, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.11171738058328629, "rewards/margins": 0.033566057682037354, "rewards/rejected": -0.14528343081474304, "step": 890 }, { "epoch": 0.47, "learning_rate": 4.725126006883047e-06, "logits/chosen": 0.6832414865493774, "logits/rejected": 0.6996358633041382, "logps/chosen": -287.02655029296875, "logps/rejected": -271.8457946777344, "loss": 2066.7418, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1082179993391037, "rewards/margins": 0.06335000693798065, "rewards/rejected": -0.17156800627708435, "step": 900 }, { "epoch": 0.47, "eval_logits/chosen": 0.6917389035224915, "eval_logits/rejected": 0.7518260478973389, "eval_logps/chosen": -267.73968505859375, "eval_logps/rejected": -250.13189697265625, "eval_loss": 2166.0732421875, "eval_rewards/accuracies": 0.6700000166893005, "eval_rewards/chosen": -0.11122233420610428, "eval_rewards/margins": 0.0546073243021965, "eval_rewards/rejected": -0.16582968831062317, "eval_runtime": 416.784, "eval_samples_per_second": 4.799, "eval_steps_per_second": 1.2, "step": 900 }, { "epoch": 0.48, "learning_rate": 4.7146192543190005e-06, "logits/chosen": 0.7068012952804565, "logits/rejected": 0.7604703307151794, "logps/chosen": -301.75897216796875, "logps/rejected": -260.5961608886719, "loss": 2114.2752, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10466556251049042, "rewards/margins": 0.06352122128009796, "rewards/rejected": -0.16818679869174957, "step": 910 }, { "epoch": 0.48, "learning_rate": 4.70392758168454e-06, "logits/chosen": 0.7013599872589111, "logits/rejected": 0.7524459362030029, "logps/chosen": -280.307861328125, "logps/rejected": -256.722900390625, "loss": 2081.7857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11534447968006134, "rewards/margins": 0.06791369616985321, "rewards/rejected": -0.18325819075107574, "step": 920 }, { "epoch": 0.49, "learning_rate": 4.693051881731251e-06, "logits/chosen": 0.6879482269287109, "logits/rejected": 0.7315651178359985, "logps/chosen": -267.4771728515625, "logps/rejected": -269.62249755859375, "loss": 2219.8021, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.11604435741901398, "rewards/margins": 0.04904730245471001, "rewards/rejected": -0.16509169340133667, "step": 930 }, { "epoch": 0.49, "learning_rate": 4.68199306257695e-06, "logits/chosen": 0.7221347093582153, "logits/rejected": 0.8089338541030884, "logps/chosen": -277.0709533691406, "logps/rejected": -288.2813415527344, "loss": 2066.718, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.09484682977199554, "rewards/margins": 0.066488116979599, "rewards/rejected": -0.16133496165275574, "step": 940 }, { "epoch": 0.5, "learning_rate": 4.670752047629855e-06, "logits/chosen": 0.7649358510971069, "logits/rejected": 0.8068546056747437, "logps/chosen": -289.7987976074219, "logps/rejected": -257.411865234375, "loss": 1989.5814, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09663524478673935, "rewards/margins": 0.07417033612728119, "rewards/rejected": -0.17080560326576233, "step": 950 }, { "epoch": 0.5, "learning_rate": 4.659329775511478e-06, "logits/chosen": 0.6801126599311829, "logits/rejected": 0.7201008796691895, "logps/chosen": -275.82244873046875, "logps/rejected": -263.333251953125, "loss": 2137.0553, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.10409847646951675, "rewards/margins": 0.06283750385046005, "rewards/rejected": -0.1669359654188156, "step": 960 }, { "epoch": 0.51, "learning_rate": 4.647727199978255e-06, "logits/chosen": 0.675479531288147, "logits/rejected": 0.755820095539093, "logps/chosen": -281.660888671875, "logps/rejected": -264.6354675292969, "loss": 2290.1266, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.11869573593139648, "rewards/margins": 0.0478622205555439, "rewards/rejected": -0.1665579378604889, "step": 970 }, { "epoch": 0.51, "learning_rate": 4.635945289841902e-06, "logits/chosen": 0.6294044852256775, "logits/rejected": 0.701261043548584, "logps/chosen": -249.8705596923828, "logps/rejected": -245.52346801757812, "loss": 2214.392, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.12245059013366699, "rewards/margins": 0.05106619745492935, "rewards/rejected": -0.17351679503917694, "step": 980 }, { "epoch": 0.52, "learning_rate": 4.623985028888527e-06, "logits/chosen": 0.7620214223861694, "logits/rejected": 0.799843430519104, "logps/chosen": -236.2934112548828, "logps/rejected": -222.5919189453125, "loss": 2149.6912, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.12090835720300674, "rewards/margins": 0.05957023426890373, "rewards/rejected": -0.18047860264778137, "step": 990 }, { "epoch": 0.52, "learning_rate": 4.611847415796476e-06, "logits/chosen": 0.7124743461608887, "logits/rejected": 0.6904253363609314, "logps/chosen": -265.0976257324219, "logps/rejected": -254.4890594482422, "loss": 2119.2691, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11531393229961395, "rewards/margins": 0.0636182576417923, "rewards/rejected": -0.17893218994140625, "step": 1000 }, { "epoch": 0.52, "eval_logits/chosen": 0.6619382500648499, "eval_logits/rejected": 0.721328854560852, "eval_logps/chosen": -268.7693176269531, "eval_logps/rejected": -251.76100158691406, "eval_loss": 2138.93115234375, "eval_rewards/accuracies": 0.671500027179718, "eval_rewards/chosen": -0.12151883542537689, "eval_rewards/margins": 0.060602057725191116, "eval_rewards/rejected": -0.1821209043264389, "eval_runtime": 416.4897, "eval_samples_per_second": 4.802, "eval_steps_per_second": 1.201, "step": 1000 }, { "epoch": 0.53, "learning_rate": 4.599533464052951e-06, "logits/chosen": 0.7095866203308105, "logits/rejected": 0.7142434120178223, "logps/chosen": -285.8958740234375, "logps/rejected": -269.59906005859375, "loss": 2002.4428, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.09995652735233307, "rewards/margins": 0.08161594718694687, "rewards/rejected": -0.18157246708869934, "step": 1010 }, { "epoch": 0.53, "learning_rate": 4.587044201869378e-06, "logits/chosen": 0.6786423921585083, "logits/rejected": 0.7168447375297546, "logps/chosen": -285.7081298828125, "logps/rejected": -245.5774688720703, "loss": 2104.4162, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10604969412088394, "rewards/margins": 0.06783930957317352, "rewards/rejected": -0.17388899624347687, "step": 1020 }, { "epoch": 0.54, "learning_rate": 4.574380672095555e-06, "logits/chosen": 0.6884575486183167, "logits/rejected": 0.7298802733421326, "logps/chosen": -223.19393920898438, "logps/rejected": -240.6363983154297, "loss": 2218.8305, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1402120441198349, "rewards/margins": 0.049252741038799286, "rewards/rejected": -0.1894647777080536, "step": 1030 }, { "epoch": 0.54, "learning_rate": 4.561543932132574e-06, "logits/chosen": 0.6861045360565186, "logits/rejected": 0.7231858968734741, "logps/chosen": -282.3648376464844, "logps/rejected": -247.057373046875, "loss": 2149.1324, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1236349567770958, "rewards/margins": 0.05487797409296036, "rewards/rejected": -0.17851293087005615, "step": 1040 }, { "epoch": 0.55, "learning_rate": 4.548535053844527e-06, "logits/chosen": 0.6396581530570984, "logits/rejected": 0.7092006206512451, "logps/chosen": -280.1047058105469, "logps/rejected": -267.3292541503906, "loss": 2057.9182, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.12618432939052582, "rewards/margins": 0.07123459875583649, "rewards/rejected": -0.1974189579486847, "step": 1050 }, { "epoch": 0.55, "learning_rate": 4.535355123469009e-06, "logits/chosen": 0.7520751357078552, "logits/rejected": 0.7533235549926758, "logps/chosen": -246.0561065673828, "logps/rejected": -210.5857391357422, "loss": 2157.4354, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.11811725795269012, "rewards/margins": 0.05882970616221428, "rewards/rejected": -0.1769469678401947, "step": 1060 }, { "epoch": 0.56, "learning_rate": 4.522005241526411e-06, "logits/chosen": 0.670494019985199, "logits/rejected": 0.7469106912612915, "logps/chosen": -281.83013916015625, "logps/rejected": -235.8207244873047, "loss": 2224.4785, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12622186541557312, "rewards/margins": 0.04977993294596672, "rewards/rejected": -0.17600181698799133, "step": 1070 }, { "epoch": 0.57, "learning_rate": 4.508486522728037e-06, "logits/chosen": 0.6910241842269897, "logits/rejected": 0.7257175445556641, "logps/chosen": -276.518798828125, "logps/rejected": -260.69781494140625, "loss": 2003.6445, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1153578907251358, "rewards/margins": 0.07534319162368774, "rewards/rejected": -0.19070109724998474, "step": 1080 }, { "epoch": 0.57, "learning_rate": 4.494800095883014e-06, "logits/chosen": 0.6132059097290039, "logits/rejected": 0.6958727836608887, "logps/chosen": -290.1937255859375, "logps/rejected": -244.7795867919922, "loss": 1914.4395, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.1075458973646164, "rewards/margins": 0.09307406842708588, "rewards/rejected": -0.20061998069286346, "step": 1090 }, { "epoch": 0.58, "learning_rate": 4.480947103804044e-06, "logits/chosen": 0.5848616361618042, "logits/rejected": 0.6844476461410522, "logps/chosen": -286.0004577636719, "logps/rejected": -235.7235565185547, "loss": 2191.7109, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.12465153634548187, "rewards/margins": 0.05475841090083122, "rewards/rejected": -0.1794099658727646, "step": 1100 }, { "epoch": 0.58, "eval_logits/chosen": 0.6583799123764038, "eval_logits/rejected": 0.7175658345222473, "eval_logps/chosen": -269.19097900390625, "eval_logps/rejected": -252.60589599609375, "eval_loss": 2121.8115234375, "eval_rewards/accuracies": 0.6694999933242798, "eval_rewards/chosen": -0.12573528289794922, "eval_rewards/margins": 0.06483451277017593, "eval_rewards/rejected": -0.19056977331638336, "eval_runtime": 416.4568, "eval_samples_per_second": 4.802, "eval_steps_per_second": 1.201, "step": 1100 }, { "epoch": 0.58, "learning_rate": 4.466928703211981e-06, "logits/chosen": 0.6980951428413391, "logits/rejected": 0.6862035989761353, "logps/chosen": -281.24700927734375, "logps/rejected": -248.4276123046875, "loss": 2063.3184, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.11639384180307388, "rewards/margins": 0.07433497160673141, "rewards/rejected": -0.1907288283109665, "step": 1110 }, { "epoch": 0.59, "learning_rate": 4.452746064639239e-06, "logits/chosen": 0.678636372089386, "logits/rejected": 0.6571283936500549, "logps/chosen": -258.97674560546875, "logps/rejected": -254.3134307861328, "loss": 2036.2967, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10998505353927612, "rewards/margins": 0.06968870759010315, "rewards/rejected": -0.17967377603054047, "step": 1120 }, { "epoch": 0.59, "learning_rate": 4.438400372332058e-06, "logits/chosen": 0.7093490958213806, "logits/rejected": 0.7673132419586182, "logps/chosen": -265.0453796386719, "logps/rejected": -250.15982055664062, "loss": 1899.6914, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.10779963433742523, "rewards/margins": 0.08556055277585983, "rewards/rejected": -0.19336020946502686, "step": 1130 }, { "epoch": 0.6, "learning_rate": 4.423892824151617e-06, "logits/chosen": 0.6779216527938843, "logits/rejected": 0.7495394945144653, "logps/chosen": -276.6440734863281, "logps/rejected": -247.07150268554688, "loss": 2002.1414, "rewards/accuracies": 0.71875, "rewards/chosen": -0.11242429912090302, "rewards/margins": 0.08376909792423248, "rewards/rejected": -0.19619342684745789, "step": 1140 }, { "epoch": 0.6, "learning_rate": 4.409224631474014e-06, "logits/chosen": 0.6950255632400513, "logits/rejected": 0.7308493256568909, "logps/chosen": -258.3533020019531, "logps/rejected": -235.54483032226562, "loss": 1995.5852, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12844698131084442, "rewards/margins": 0.07693418860435486, "rewards/rejected": -0.20538118481636047, "step": 1150 }, { "epoch": 0.61, "learning_rate": 4.3943970190891164e-06, "logits/chosen": 0.6389755010604858, "logits/rejected": 0.6700756549835205, "logps/chosen": -264.4709167480469, "logps/rejected": -256.2208557128906, "loss": 2016.508, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.13102933764457703, "rewards/margins": 0.07655525207519531, "rewards/rejected": -0.20758457481861115, "step": 1160 }, { "epoch": 0.61, "learning_rate": 4.379411225098292e-06, "logits/chosen": 0.6980705261230469, "logits/rejected": 0.7923838496208191, "logps/chosen": -283.1957702636719, "logps/rejected": -267.30938720703125, "loss": 2012.2674, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.138292133808136, "rewards/margins": 0.08078579604625702, "rewards/rejected": -0.2190779149532318, "step": 1170 }, { "epoch": 0.62, "learning_rate": 4.364268500811025e-06, "logits/chosen": 0.6915451288223267, "logits/rejected": 0.6770834922790527, "logps/chosen": -258.2509460449219, "logps/rejected": -266.7464294433594, "loss": 2101.8338, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.13359954953193665, "rewards/margins": 0.07587826251983643, "rewards/rejected": -0.20947781205177307, "step": 1180 }, { "epoch": 0.62, "learning_rate": 4.348970110640437e-06, "logits/chosen": 0.6509718298912048, "logits/rejected": 0.7129366993904114, "logps/chosen": -258.56280517578125, "logps/rejected": -237.4566192626953, "loss": 2027.6717, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.14995795488357544, "rewards/margins": 0.07759587466716766, "rewards/rejected": -0.2275538146495819, "step": 1190 }, { "epoch": 0.63, "learning_rate": 4.333517331997704e-06, "logits/chosen": 0.5978332161903381, "logits/rejected": 0.6565033793449402, "logps/chosen": -272.02166748046875, "logps/rejected": -271.3028259277344, "loss": 2308.1883, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.1387997567653656, "rewards/margins": 0.04619471728801727, "rewards/rejected": -0.18499447405338287, "step": 1200 }, { "epoch": 0.63, "eval_logits/chosen": 0.6329967379570007, "eval_logits/rejected": 0.6920445561408997, "eval_logps/chosen": -270.7044372558594, "eval_logps/rejected": -254.78115844726562, "eval_loss": 2110.306884765625, "eval_rewards/accuracies": 0.6664999723434448, "eval_rewards/chosen": -0.14087003469467163, "eval_rewards/margins": 0.07145243883132935, "eval_rewards/rejected": -0.21232248842716217, "eval_runtime": 416.6934, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.2, "step": 1200 }, { "epoch": 0.63, "learning_rate": 4.317911455185396e-06, "logits/chosen": 0.6959893703460693, "logits/rejected": 0.7259203791618347, "logps/chosen": -266.06829833984375, "logps/rejected": -238.3871307373047, "loss": 2262.5527, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.15047064423561096, "rewards/margins": 0.04972488805651665, "rewards/rejected": -0.2001955509185791, "step": 1210 }, { "epoch": 0.64, "learning_rate": 4.302153783289737e-06, "logits/chosen": 0.6275352239608765, "logits/rejected": 0.7193800806999207, "logps/chosen": -266.05908203125, "logps/rejected": -258.8971252441406, "loss": 2233.5896, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1438441276550293, "rewards/margins": 0.05622429400682449, "rewards/rejected": -0.2000684291124344, "step": 1220 }, { "epoch": 0.64, "learning_rate": 4.286245632071791e-06, "logits/chosen": 0.6443454623222351, "logits/rejected": 0.6870865225791931, "logps/chosen": -257.45611572265625, "logps/rejected": -253.72756958007812, "loss": 2126.4154, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.15624599158763885, "rewards/margins": 0.06528286635875702, "rewards/rejected": -0.22152885794639587, "step": 1230 }, { "epoch": 0.65, "learning_rate": 4.270188329857613e-06, "logits/chosen": 0.7713927626609802, "logits/rejected": 0.7753847241401672, "logps/chosen": -263.57708740234375, "logps/rejected": -270.3426513671875, "loss": 2108.1736, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.12420948594808578, "rewards/margins": 0.06607901304960251, "rewards/rejected": -0.1902884989976883, "step": 1240 }, { "epoch": 0.65, "learning_rate": 4.253983217427313e-06, "logits/chosen": 0.6878337860107422, "logits/rejected": 0.7090884447097778, "logps/chosen": -271.42657470703125, "logps/rejected": -288.615478515625, "loss": 2128.0695, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.13797307014465332, "rewards/margins": 0.06890544295310974, "rewards/rejected": -0.20687851309776306, "step": 1250 }, { "epoch": 0.66, "learning_rate": 4.237631647903115e-06, "logits/chosen": 0.6635148525238037, "logits/rejected": 0.6455484628677368, "logps/chosen": -266.98248291015625, "logps/rejected": -250.23403930664062, "loss": 2209.6877, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.13750961422920227, "rewards/margins": 0.05613657087087631, "rewards/rejected": -0.1936461478471756, "step": 1260 }, { "epoch": 0.66, "learning_rate": 4.221134986636371e-06, "logits/chosen": 0.6171488761901855, "logits/rejected": 0.6567360758781433, "logps/chosen": -273.8824157714844, "logps/rejected": -249.103515625, "loss": 1885.2906, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.11038468778133392, "rewards/margins": 0.10038020461797714, "rewards/rejected": -0.21076488494873047, "step": 1270 }, { "epoch": 0.67, "learning_rate": 4.204494611093548e-06, "logits/chosen": 0.7011705636978149, "logits/rejected": 0.6791177988052368, "logps/chosen": -251.37478637695312, "logps/rejected": -261.1529846191406, "loss": 2258.1799, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.14569328725337982, "rewards/margins": 0.0609690323472023, "rewards/rejected": -0.20666229724884033, "step": 1280 }, { "epoch": 0.68, "learning_rate": 4.1877119107412165e-06, "logits/chosen": 0.6343793869018555, "logits/rejected": 0.6927725672721863, "logps/chosen": -237.8370819091797, "logps/rejected": -256.54254150390625, "loss": 2061.6873, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1375429332256317, "rewards/margins": 0.07506345212459564, "rewards/rejected": -0.21260638535022736, "step": 1290 }, { "epoch": 0.68, "learning_rate": 4.170788286930024e-06, "logits/chosen": 0.6356396675109863, "logits/rejected": 0.7518913149833679, "logps/chosen": -275.9583435058594, "logps/rejected": -253.1591033935547, "loss": 1996.7178, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1259593516588211, "rewards/margins": 0.09192151576280594, "rewards/rejected": -0.21788087487220764, "step": 1300 }, { "epoch": 0.68, "eval_logits/chosen": 0.6140788793563843, "eval_logits/rejected": 0.6721699833869934, "eval_logps/chosen": -269.7620544433594, "eval_logps/rejected": -253.97259521484375, "eval_loss": 2095.31298828125, "eval_rewards/accuracies": 0.6754999756813049, "eval_rewards/chosen": -0.1314462274312973, "eval_rewards/margins": 0.07279053330421448, "eval_rewards/rejected": -0.20423679053783417, "eval_runtime": 416.679, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.2, "step": 1300 }, { "epoch": 0.69, "learning_rate": 4.15372515277769e-06, "logits/chosen": 0.6244436502456665, "logits/rejected": 0.6609630584716797, "logps/chosen": -280.32794189453125, "logps/rejected": -248.7582244873047, "loss": 2011.6244, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.11511021852493286, "rewards/margins": 0.08765153586864471, "rewards/rejected": -0.20276173949241638, "step": 1310 }, { "epoch": 0.69, "learning_rate": 4.136523933051005e-06, "logits/chosen": 0.6894992589950562, "logits/rejected": 0.6712801456451416, "logps/chosen": -263.05767822265625, "logps/rejected": -234.242431640625, "loss": 1956.5369, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.11226965487003326, "rewards/margins": 0.09646574407815933, "rewards/rejected": -0.208735391497612, "step": 1320 }, { "epoch": 0.7, "learning_rate": 4.119186064046868e-06, "logits/chosen": 0.6183528900146484, "logits/rejected": 0.644507110118866, "logps/chosen": -274.1941223144531, "logps/rejected": -245.35562133789062, "loss": 2166.2785, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.12718230485916138, "rewards/margins": 0.06536950916051865, "rewards/rejected": -0.19255182147026062, "step": 1330 }, { "epoch": 0.7, "learning_rate": 4.101712993472348e-06, "logits/chosen": 0.6670488715171814, "logits/rejected": 0.6754225492477417, "logps/chosen": -278.2078552246094, "logps/rejected": -241.5122528076172, "loss": 1883.2881, "rewards/accuracies": 0.75, "rewards/chosen": -0.119564950466156, "rewards/margins": 0.09150619804859161, "rewards/rejected": -0.2110711634159088, "step": 1340 }, { "epoch": 0.71, "learning_rate": 4.084106180323813e-06, "logits/chosen": 0.6214176416397095, "logits/rejected": 0.66867595911026, "logps/chosen": -261.7587890625, "logps/rejected": -251.6807098388672, "loss": 2017.3074, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1294572651386261, "rewards/margins": 0.08191975206136703, "rewards/rejected": -0.21137702465057373, "step": 1350 }, { "epoch": 0.71, "learning_rate": 4.066367094765091e-06, "logits/chosen": 0.6692546606063843, "logits/rejected": 0.6942587494850159, "logps/chosen": -259.8294982910156, "logps/rejected": -268.8597412109375, "loss": 2097.6473, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12980519235134125, "rewards/margins": 0.07286903262138367, "rewards/rejected": -0.20267422497272491, "step": 1360 }, { "epoch": 0.72, "learning_rate": 4.048497218004724e-06, "logits/chosen": 0.5632964968681335, "logits/rejected": 0.6666015386581421, "logps/chosen": -265.3409729003906, "logps/rejected": -252.78958129882812, "loss": 2076.2803, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.13252495229244232, "rewards/margins": 0.08360429853200912, "rewards/rejected": -0.21612922847270966, "step": 1370 }, { "epoch": 0.72, "learning_rate": 4.030498042172277e-06, "logits/chosen": 0.6174412369728088, "logits/rejected": 0.6783226728439331, "logps/chosen": -249.213134765625, "logps/rejected": -241.413330078125, "loss": 2285.0457, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.14881116151809692, "rewards/margins": 0.05003712326288223, "rewards/rejected": -0.19884827733039856, "step": 1380 }, { "epoch": 0.73, "learning_rate": 4.012371070193753e-06, "logits/chosen": 0.6269063353538513, "logits/rejected": 0.6346549391746521, "logps/chosen": -241.5870361328125, "logps/rejected": -245.96456909179688, "loss": 2133.8158, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1412040889263153, "rewards/margins": 0.06111832335591316, "rewards/rejected": -0.20232239365577698, "step": 1390 }, { "epoch": 0.73, "learning_rate": 3.994117815666095e-06, "logits/chosen": 0.6533576250076294, "logits/rejected": 0.6740087866783142, "logps/chosen": -283.4432678222656, "logps/rejected": -267.79071044921875, "loss": 2038.3844, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1330070197582245, "rewards/margins": 0.07623559981584549, "rewards/rejected": -0.20924265682697296, "step": 1400 }, { "epoch": 0.73, "eval_logits/chosen": 0.5933060050010681, "eval_logits/rejected": 0.6512511968612671, "eval_logps/chosen": -270.4488220214844, "eval_logps/rejected": -254.944091796875, "eval_loss": 2085.085205078125, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": -0.13831348717212677, "eval_rewards/margins": 0.07563827186822891, "eval_rewards/rejected": -0.21395176649093628, "eval_runtime": 416.609, "eval_samples_per_second": 4.801, "eval_steps_per_second": 1.2, "step": 1400 }, { "epoch": 0.74, "learning_rate": 3.975739802730805e-06, "logits/chosen": 0.5807250738143921, "logits/rejected": 0.6861320734024048, "logps/chosen": -298.25604248046875, "logps/rejected": -273.8665466308594, "loss": 2016.2207, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12952394783496857, "rewards/margins": 0.08187790215015411, "rewards/rejected": -0.21140184998512268, "step": 1410 }, { "epoch": 0.74, "learning_rate": 3.957238565946672e-06, "logits/chosen": 0.6601132750511169, "logits/rejected": 0.6705759763717651, "logps/chosen": -266.2096252441406, "logps/rejected": -249.5722198486328, "loss": 2228.1479, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1489478349685669, "rewards/margins": 0.05832044407725334, "rewards/rejected": -0.20726828277111053, "step": 1420 }, { "epoch": 0.75, "learning_rate": 3.938615650161645e-06, "logits/chosen": 0.6056556701660156, "logits/rejected": 0.5954689979553223, "logps/chosen": -244.7415771484375, "logps/rejected": -234.8352813720703, "loss": 2023.5078, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.11950768530368805, "rewards/margins": 0.0772583931684494, "rewards/rejected": -0.19676607847213745, "step": 1430 }, { "epoch": 0.75, "learning_rate": 3.919872610383831e-06, "logits/chosen": 0.5716265439987183, "logits/rejected": 0.6326289176940918, "logps/chosen": -257.7333984375, "logps/rejected": -234.2956085205078, "loss": 2199.8311, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.14616943895816803, "rewards/margins": 0.055809132754802704, "rewards/rejected": -0.20197856426239014, "step": 1440 }, { "epoch": 0.76, "learning_rate": 3.9010110116516595e-06, "logits/chosen": 0.655591607093811, "logits/rejected": 0.7094139456748962, "logps/chosen": -266.137939453125, "logps/rejected": -247.2853240966797, "loss": 1965.8004, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.13736537098884583, "rewards/margins": 0.0867539569735527, "rewards/rejected": -0.22411933541297913, "step": 1450 }, { "epoch": 0.76, "learning_rate": 3.882032428903195e-06, "logits/chosen": 0.6148696541786194, "logits/rejected": 0.6896382570266724, "logps/chosen": -256.70904541015625, "logps/rejected": -245.3892822265625, "loss": 2095.5068, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.12945787608623505, "rewards/margins": 0.07017168402671814, "rewards/rejected": -0.19962957501411438, "step": 1460 }, { "epoch": 0.77, "learning_rate": 3.8629384468446365e-06, "logits/chosen": 0.5744356513023376, "logits/rejected": 0.5954487919807434, "logps/chosen": -250.98178100585938, "logps/rejected": -272.93701171875, "loss": 2119.459, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.13649822771549225, "rewards/margins": 0.07962769269943237, "rewards/rejected": -0.21612592041492462, "step": 1470 }, { "epoch": 0.77, "learning_rate": 3.84373065981799e-06, "logits/chosen": 0.6630114912986755, "logits/rejected": 0.6675506830215454, "logps/chosen": -265.52447509765625, "logps/rejected": -247.32455444335938, "loss": 1991.6379, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.12121538817882538, "rewards/margins": 0.08006526529788971, "rewards/rejected": -0.20128066837787628, "step": 1480 }, { "epoch": 0.78, "learning_rate": 3.824410671667948e-06, "logits/chosen": 0.6106497645378113, "logits/rejected": 0.680738091468811, "logps/chosen": -260.89288330078125, "logps/rejected": -252.52017211914062, "loss": 1942.9977, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12056465446949005, "rewards/margins": 0.09015407413244247, "rewards/rejected": -0.21071875095367432, "step": 1490 }, { "epoch": 0.79, "learning_rate": 3.8049800956079552e-06, "logits/chosen": 0.5932056903839111, "logits/rejected": 0.6287232637405396, "logps/chosen": -291.2415771484375, "logps/rejected": -279.5646057128906, "loss": 2094.2182, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.13307559490203857, "rewards/margins": 0.0750352293252945, "rewards/rejected": -0.20811080932617188, "step": 1500 }, { "epoch": 0.79, "eval_logits/chosen": 0.5898318886756897, "eval_logits/rejected": 0.6474130749702454, "eval_logps/chosen": -270.5129089355469, "eval_logps/rejected": -255.21328735351562, "eval_loss": 2076.30419921875, "eval_rewards/accuracies": 0.6790000200271606, "eval_rewards/chosen": -0.13895468413829803, "eval_rewards/margins": 0.07768914848566055, "eval_rewards/rejected": -0.2166438102722168, "eval_runtime": 416.7121, "eval_samples_per_second": 4.799, "eval_steps_per_second": 1.2, "step": 1500 }, { "epoch": 0.79, "learning_rate": 3.7854405540855268e-06, "logits/chosen": 0.580877959728241, "logits/rejected": 0.6030541658401489, "logps/chosen": -255.82693481445312, "logps/rejected": -249.1620635986328, "loss": 2165.7623, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.1440289467573166, "rewards/margins": 0.06519783288240433, "rewards/rejected": -0.2092268019914627, "step": 1510 }, { "epoch": 0.8, "learning_rate": 3.765793678646753e-06, "logits/chosen": 0.612065851688385, "logits/rejected": 0.6108434200286865, "logps/chosen": -236.6591796875, "logps/rejected": -245.4730224609375, "loss": 2151.6375, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.13691964745521545, "rewards/margins": 0.06442641466856003, "rewards/rejected": -0.20134606957435608, "step": 1520 }, { "epoch": 0.8, "learning_rate": 3.7460411098000804e-06, "logits/chosen": 0.620397686958313, "logits/rejected": 0.6705790758132935, "logps/chosen": -279.47003173828125, "logps/rejected": -242.50320434570312, "loss": 2097.6518, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13826757669448853, "rewards/margins": 0.06730998307466507, "rewards/rejected": -0.2055775672197342, "step": 1530 }, { "epoch": 0.81, "learning_rate": 3.726184496879323e-06, "logits/chosen": 0.5731703042984009, "logits/rejected": 0.6038475036621094, "logps/chosen": -273.57684326171875, "logps/rejected": -263.6417541503906, "loss": 1980.56, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13297039270401, "rewards/margins": 0.08892510086297989, "rewards/rejected": -0.2218955010175705, "step": 1540 }, { "epoch": 0.81, "learning_rate": 3.706225497905946e-06, "logits/chosen": 0.5495398640632629, "logits/rejected": 0.6184272170066833, "logps/chosen": -278.1634521484375, "logps/rejected": -250.0457763671875, "loss": 1925.0881, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.14073289930820465, "rewards/margins": 0.08853240311145782, "rewards/rejected": -0.22926530241966248, "step": 1550 }, { "epoch": 0.82, "learning_rate": 3.686165779450619e-06, "logits/chosen": 0.6478545069694519, "logits/rejected": 0.6362086534500122, "logps/chosen": -267.02618408203125, "logps/rejected": -239.1699676513672, "loss": 2063.3338, "rewards/accuracies": 0.65625, "rewards/chosen": -0.13111211359500885, "rewards/margins": 0.07986196875572205, "rewards/rejected": -0.2109740674495697, "step": 1560 }, { "epoch": 0.82, "learning_rate": 3.6660070164940614e-06, "logits/chosen": 0.6316484808921814, "logits/rejected": 0.686813473701477, "logps/chosen": -254.73837280273438, "logps/rejected": -249.4886016845703, "loss": 1958.6893, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.1321488916873932, "rewards/margins": 0.09855500608682632, "rewards/rejected": -0.23070387542247772, "step": 1570 }, { "epoch": 0.83, "learning_rate": 3.645750892287178e-06, "logits/chosen": 0.6227657794952393, "logits/rejected": 0.642948567867279, "logps/chosen": -254.3902587890625, "logps/rejected": -239.3006134033203, "loss": 2093.4068, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.14098116755485535, "rewards/margins": 0.07492861151695251, "rewards/rejected": -0.21590976417064667, "step": 1580 }, { "epoch": 0.83, "learning_rate": 3.6253990982105114e-06, "logits/chosen": 0.5823426842689514, "logits/rejected": 0.6044851541519165, "logps/chosen": -282.6208801269531, "logps/rejected": -284.2301330566406, "loss": 2308.56, "rewards/accuracies": 0.625, "rewards/chosen": -0.1489720195531845, "rewards/margins": 0.050694145262241364, "rewards/rejected": -0.19966615736484528, "step": 1590 }, { "epoch": 0.84, "learning_rate": 3.604953333633009e-06, "logits/chosen": 0.6414502859115601, "logits/rejected": 0.6938506960868835, "logps/chosen": -254.117431640625, "logps/rejected": -234.0909423828125, "loss": 2171.3457, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.12779875099658966, "rewards/margins": 0.06595613807439804, "rewards/rejected": -0.1937548816204071, "step": 1600 }, { "epoch": 0.84, "eval_logits/chosen": 0.5817673802375793, "eval_logits/rejected": 0.6391910910606384, "eval_logps/chosen": -270.3594970703125, "eval_logps/rejected": -255.21298217773438, "eval_loss": 2069.375732421875, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": -0.137420654296875, "eval_rewards/margins": 0.07922003418207169, "eval_rewards/rejected": -0.2166406810283661, "eval_runtime": 416.6975, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.2, "step": 1600 }, { "epoch": 0.84, "learning_rate": 3.5844153057701303e-06, "logits/chosen": 0.6806268095970154, "logits/rejected": 0.6613883376121521, "logps/chosen": -293.35455322265625, "logps/rejected": -249.47317504882812, "loss": 2235.1336, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.13428549468517303, "rewards/margins": 0.06987977027893066, "rewards/rejected": -0.2041652649641037, "step": 1610 }, { "epoch": 0.85, "learning_rate": 3.56378672954129e-06, "logits/chosen": 0.5934259295463562, "logits/rejected": 0.6393053531646729, "logps/chosen": -263.8625183105469, "logps/rejected": -268.93646240234375, "loss": 2115.3395, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.12568698823451996, "rewards/margins": 0.06675116717815399, "rewards/rejected": -0.19243815541267395, "step": 1620 }, { "epoch": 0.85, "learning_rate": 3.5430693274266694e-06, "logits/chosen": 0.6212111711502075, "logits/rejected": 0.6776979565620422, "logps/chosen": -265.48065185546875, "logps/rejected": -242.78189086914062, "loss": 1885.1145, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.12569081783294678, "rewards/margins": 0.09405811876058578, "rewards/rejected": -0.21974892914295197, "step": 1630 }, { "epoch": 0.86, "learning_rate": 3.5222648293233806e-06, "logits/chosen": 0.5869291424751282, "logits/rejected": 0.614780843257904, "logps/chosen": -309.3984069824219, "logps/rejected": -281.76800537109375, "loss": 2295.801, "rewards/accuracies": 0.625, "rewards/chosen": -0.13158050179481506, "rewards/margins": 0.05296233296394348, "rewards/rejected": -0.18454284965991974, "step": 1640 }, { "epoch": 0.86, "learning_rate": 3.5013749724010298e-06, "logits/chosen": 0.6291738152503967, "logits/rejected": 0.6847606897354126, "logps/chosen": -269.4018249511719, "logps/rejected": -248.9547576904297, "loss": 1985.4633, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.12610626220703125, "rewards/margins": 0.08754386752843857, "rewards/rejected": -0.21365013718605042, "step": 1650 }, { "epoch": 0.87, "learning_rate": 3.4804015009566573e-06, "logits/chosen": 0.6193640828132629, "logits/rejected": 0.6583009958267212, "logps/chosen": -260.786865234375, "logps/rejected": -246.417724609375, "loss": 2067.8201, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.13031774759292603, "rewards/margins": 0.07650937139987946, "rewards/rejected": -0.20682711899280548, "step": 1660 }, { "epoch": 0.87, "learning_rate": 3.459346166269093e-06, "logits/chosen": 0.569218635559082, "logits/rejected": 0.6178910136222839, "logps/chosen": -286.010498046875, "logps/rejected": -284.7559509277344, "loss": 2057.5484, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.11479117721319199, "rewards/margins": 0.08598540723323822, "rewards/rejected": -0.20077654719352722, "step": 1670 }, { "epoch": 0.88, "learning_rate": 3.4382107264527244e-06, "logits/chosen": 0.6346784234046936, "logits/rejected": 0.7338213920593262, "logps/chosen": -300.38739013671875, "logps/rejected": -259.44525146484375, "loss": 1994.4248, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13437870144844055, "rewards/margins": 0.08268047124147415, "rewards/rejected": -0.2170591652393341, "step": 1680 }, { "epoch": 0.88, "learning_rate": 3.416996946310694e-06, "logits/chosen": 0.5468164086341858, "logits/rejected": 0.5939579010009766, "logps/chosen": -299.57061767578125, "logps/rejected": -265.4569396972656, "loss": 1896.8961, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.11913790553808212, "rewards/margins": 0.10197613388299942, "rewards/rejected": -0.22111406922340393, "step": 1690 }, { "epoch": 0.89, "learning_rate": 3.3957065971875387e-06, "logits/chosen": 0.5587860345840454, "logits/rejected": 0.6276572346687317, "logps/chosen": -253.8291473388672, "logps/rejected": -239.05685424804688, "loss": 2189.3863, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.14831021428108215, "rewards/margins": 0.06738562881946564, "rewards/rejected": -0.2156958281993866, "step": 1700 }, { "epoch": 0.89, "eval_logits/chosen": 0.5722830295562744, "eval_logits/rejected": 0.629075288772583, "eval_logps/chosen": -270.473876953125, "eval_logps/rejected": -255.46749877929688, "eval_loss": 2062.199462890625, "eval_rewards/accuracies": 0.6779999732971191, "eval_rewards/chosen": -0.13856419920921326, "eval_rewards/margins": 0.08062165975570679, "eval_rewards/rejected": -0.21918585896492004, "eval_runtime": 416.4246, "eval_samples_per_second": 4.803, "eval_steps_per_second": 1.201, "step": 1700 }, { "epoch": 0.9, "learning_rate": 3.3743414568212828e-06, "logits/chosen": 0.6158628463745117, "logits/rejected": 0.6673662066459656, "logps/chosen": -296.7020263671875, "logps/rejected": -242.6101837158203, "loss": 2082.5406, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13253936171531677, "rewards/margins": 0.07927088439464569, "rewards/rejected": -0.21181027591228485, "step": 1710 }, { "epoch": 0.9, "learning_rate": 3.352903309194999e-06, "logits/chosen": 0.6274576187133789, "logits/rejected": 0.6294026374816895, "logps/chosen": -293.9241638183594, "logps/rejected": -253.40036010742188, "loss": 2083.2709, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.13250832259655, "rewards/margins": 0.08279639482498169, "rewards/rejected": -0.21530470252037048, "step": 1720 }, { "epoch": 0.91, "learning_rate": 3.331393944387845e-06, "logits/chosen": 0.5965205430984497, "logits/rejected": 0.70032799243927, "logps/chosen": -291.1014099121094, "logps/rejected": -274.6158752441406, "loss": 2108.5279, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.12708380818367004, "rewards/margins": 0.08121255040168762, "rewards/rejected": -0.20829637348651886, "step": 1730 }, { "epoch": 0.91, "learning_rate": 3.309815158425591e-06, "logits/chosen": 0.588997483253479, "logits/rejected": 0.6009566783905029, "logps/chosen": -244.7908172607422, "logps/rejected": -238.26651000976562, "loss": 2017.7098, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11969755589962006, "rewards/margins": 0.07758014649152756, "rewards/rejected": -0.19727769494056702, "step": 1740 }, { "epoch": 0.92, "learning_rate": 3.288168753130657e-06, "logits/chosen": 0.6095719933509827, "logits/rejected": 0.6279308199882507, "logps/chosen": -250.91116333007812, "logps/rejected": -265.10302734375, "loss": 1975.5611, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1158142238855362, "rewards/margins": 0.08399386703968048, "rewards/rejected": -0.19980809092521667, "step": 1750 }, { "epoch": 0.92, "learning_rate": 3.266456535971654e-06, "logits/chosen": 0.5891221165657043, "logits/rejected": 0.5675392746925354, "logps/chosen": -283.29901123046875, "logps/rejected": -258.32562255859375, "loss": 1991.3586, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.1158837080001831, "rewards/margins": 0.09360859543085098, "rewards/rejected": -0.2094922959804535, "step": 1760 }, { "epoch": 0.93, "learning_rate": 3.2446803199124666e-06, "logits/chosen": 0.542614221572876, "logits/rejected": 0.5660384893417358, "logps/chosen": -260.6263427734375, "logps/rejected": -240.3424835205078, "loss": 2044.5988, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.12066911160945892, "rewards/margins": 0.0799749344587326, "rewards/rejected": -0.20064406096935272, "step": 1770 }, { "epoch": 0.93, "learning_rate": 3.2228419232608692e-06, "logits/chosen": 0.5963379144668579, "logits/rejected": 0.6299723386764526, "logps/chosen": -248.21920776367188, "logps/rejected": -235.8214874267578, "loss": 2095.258, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.12164044380187988, "rewards/margins": 0.07495652884244919, "rewards/rejected": -0.19659698009490967, "step": 1780 }, { "epoch": 0.94, "learning_rate": 3.2009431695166985e-06, "logits/chosen": 0.5749480724334717, "logits/rejected": 0.627223014831543, "logps/chosen": -239.6404266357422, "logps/rejected": -239.62014770507812, "loss": 1970.9955, "rewards/accuracies": 0.71875, "rewards/chosen": -0.11313848197460175, "rewards/margins": 0.08247244358062744, "rewards/rejected": -0.19561094045639038, "step": 1790 }, { "epoch": 0.94, "learning_rate": 3.1789858872195888e-06, "logits/chosen": 0.6324980854988098, "logits/rejected": 0.6260117888450623, "logps/chosen": -244.56362915039062, "logps/rejected": -245.474609375, "loss": 2292.8938, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.14712049067020416, "rewards/margins": 0.05482936650514603, "rewards/rejected": -0.2019498646259308, "step": 1800 }, { "epoch": 0.94, "eval_logits/chosen": 0.5703141689300537, "eval_logits/rejected": 0.627535343170166, "eval_logps/chosen": -268.5789489746094, "eval_logps/rejected": -253.6024627685547, "eval_loss": 2053.1298828125, "eval_rewards/accuracies": 0.6830000281333923, "eval_rewards/chosen": -0.11961515992879868, "eval_rewards/margins": 0.08092045783996582, "eval_rewards/rejected": -0.2005356103181839, "eval_runtime": 416.7248, "eval_samples_per_second": 4.799, "eval_steps_per_second": 1.2, "step": 1800 }, { "epoch": 0.95, "learning_rate": 3.156971909796295e-06, "logits/chosen": 0.6370185613632202, "logits/rejected": 0.7445378303527832, "logps/chosen": -265.6059265136719, "logps/rejected": -232.7034454345703, "loss": 1958.5383, "rewards/accuracies": 0.71875, "rewards/chosen": -0.11271758377552032, "rewards/margins": 0.08874475955963135, "rewards/rejected": -0.20146234333515167, "step": 1810 }, { "epoch": 0.95, "learning_rate": 3.1349030754075945e-06, "logits/chosen": 0.623261034488678, "logits/rejected": 0.6591364741325378, "logps/chosen": -263.49993896484375, "logps/rejected": -241.6349334716797, "loss": 1940.6969, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1059044599533081, "rewards/margins": 0.09299680590629578, "rewards/rejected": -0.19890126585960388, "step": 1820 }, { "epoch": 0.96, "learning_rate": 3.1127812267948095e-06, "logits/chosen": 0.6355741024017334, "logits/rejected": 0.6655168533325195, "logps/chosen": -264.20062255859375, "logps/rejected": -258.43310546875, "loss": 1993.0014, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.12248332798480988, "rewards/margins": 0.08727528899908066, "rewards/rejected": -0.20975859463214874, "step": 1830 }, { "epoch": 0.96, "learning_rate": 3.0906082111259313e-06, "logits/chosen": 0.548941433429718, "logits/rejected": 0.5715293884277344, "logps/chosen": -277.3153076171875, "logps/rejected": -248.1029510498047, "loss": 2108.9805, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.10435888916254044, "rewards/margins": 0.07835109531879425, "rewards/rejected": -0.18270999193191528, "step": 1840 }, { "epoch": 0.97, "learning_rate": 3.068385879841389e-06, "logits/chosen": 0.6165980100631714, "logits/rejected": 0.6937299966812134, "logps/chosen": -233.5325469970703, "logps/rejected": -246.7562255859375, "loss": 2168.8945, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10994801670312881, "rewards/margins": 0.06389383226633072, "rewards/rejected": -0.17384183406829834, "step": 1850 }, { "epoch": 0.97, "learning_rate": 3.046116088499449e-06, "logits/chosen": 0.6379483938217163, "logits/rejected": 0.6270259618759155, "logps/chosen": -266.16009521484375, "logps/rejected": -259.60919189453125, "loss": 2145.4496, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.11401374638080597, "rewards/margins": 0.07185572385787964, "rewards/rejected": -0.1858694702386856, "step": 1860 }, { "epoch": 0.98, "learning_rate": 3.02380069662128e-06, "logits/chosen": 0.623966634273529, "logits/rejected": 0.5938777327537537, "logps/chosen": -252.69869995117188, "logps/rejected": -245.094482421875, "loss": 2034.7914, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.10203299671411514, "rewards/margins": 0.07526172697544098, "rewards/rejected": -0.17729471623897552, "step": 1870 }, { "epoch": 0.98, "learning_rate": 3.0014415675356813e-06, "logits/chosen": 0.6284725069999695, "logits/rejected": 0.6297743916511536, "logps/chosen": -270.23333740234375, "logps/rejected": -252.12704467773438, "loss": 2150.0496, "rewards/accuracies": 0.625, "rewards/chosen": -0.10082075744867325, "rewards/margins": 0.06684517115354538, "rewards/rejected": -0.16766592860221863, "step": 1880 }, { "epoch": 0.99, "learning_rate": 2.979040568223498e-06, "logits/chosen": 0.5534299612045288, "logits/rejected": 0.675399661064148, "logps/chosen": -263.3745422363281, "logps/rejected": -266.8883972167969, "loss": 2205.4939, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.11857882887125015, "rewards/margins": 0.06667140126228333, "rewards/rejected": -0.18525022268295288, "step": 1890 }, { "epoch": 0.99, "learning_rate": 2.9565995691617242e-06, "logits/chosen": 0.6073340773582458, "logits/rejected": 0.6487486362457275, "logps/chosen": -292.90704345703125, "logps/rejected": -238.2965545654297, "loss": 2085.5805, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.09591428190469742, "rewards/margins": 0.08266115188598633, "rewards/rejected": -0.17857542634010315, "step": 1900 }, { "epoch": 0.99, "eval_logits/chosen": 0.5747328996658325, "eval_logits/rejected": 0.6318737864494324, "eval_logps/chosen": -267.4730224609375, "eval_logps/rejected": -252.61306762695312, "eval_loss": 2052.32373046875, "eval_rewards/accuracies": 0.6899999976158142, "eval_rewards/chosen": -0.10855603218078613, "eval_rewards/margins": 0.08208546042442322, "eval_rewards/rejected": -0.19064147770404816, "eval_runtime": 416.81, "eval_samples_per_second": 4.798, "eval_steps_per_second": 1.2, "step": 1900 }, { "epoch": 1.0, "learning_rate": 2.9341204441673267e-06, "logits/chosen": 0.5886205434799194, "logits/rejected": 0.6063315272331238, "logps/chosen": -281.51129150390625, "logps/rejected": -263.0057678222656, "loss": 1992.2906, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10820697247982025, "rewards/margins": 0.08012684434652328, "rewards/rejected": -0.18833380937576294, "step": 1910 }, { "epoch": 1.0, "learning_rate": 2.9116050702407706e-06, "logits/chosen": 0.6380060315132141, "logits/rejected": 0.6841104030609131, "logps/chosen": -267.55145263671875, "logps/rejected": -248.828369140625, "loss": 2045.3801, "rewards/accuracies": 0.6604167222976685, "rewards/chosen": -0.10555033385753632, "rewards/margins": 0.08623509109020233, "rewards/rejected": -0.19178542494773865, "step": 1920 }, { "epoch": 1.01, "learning_rate": 2.889055327409301e-06, "logits/chosen": 0.5285671353340149, "logits/rejected": 0.5704804062843323, "logps/chosen": -263.24725341796875, "logps/rejected": -248.02395629882812, "loss": 2023.134, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10273389518260956, "rewards/margins": 0.08054333180189133, "rewards/rejected": -0.1832772046327591, "step": 1930 }, { "epoch": 1.02, "learning_rate": 2.8664730985699537e-06, "logits/chosen": 0.5331718325614929, "logits/rejected": 0.5988043546676636, "logps/chosen": -242.79061889648438, "logps/rejected": -238.0269012451172, "loss": 1960.2693, "rewards/accuracies": 0.75, "rewards/chosen": -0.09900447726249695, "rewards/margins": 0.0868852287530899, "rewards/rejected": -0.18588972091674805, "step": 1940 }, { "epoch": 1.02, "learning_rate": 2.843860269332339e-06, "logits/chosen": 0.6072074174880981, "logits/rejected": 0.631058394908905, "logps/chosen": -273.4151306152344, "logps/rejected": -246.41238403320312, "loss": 1955.907, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.09934862703084946, "rewards/margins": 0.09240168333053589, "rewards/rejected": -0.19175033271312714, "step": 1950 }, { "epoch": 1.03, "learning_rate": 2.8212187278611907e-06, "logits/chosen": 0.6683967113494873, "logits/rejected": 0.6856907606124878, "logps/chosen": -257.2086181640625, "logps/rejected": -247.91683959960938, "loss": 1924.8703, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.1071348562836647, "rewards/margins": 0.09893598407506943, "rewards/rejected": -0.20607082545757294, "step": 1960 }, { "epoch": 1.03, "learning_rate": 2.7985503647187063e-06, "logits/chosen": 0.5825555920600891, "logits/rejected": 0.6476297378540039, "logps/chosen": -288.1867980957031, "logps/rejected": -254.5727996826172, "loss": 1907.2662, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.08619461953639984, "rewards/margins": 0.10769355297088623, "rewards/rejected": -0.19388815760612488, "step": 1970 }, { "epoch": 1.04, "learning_rate": 2.7758570727066843e-06, "logits/chosen": 0.5205335021018982, "logits/rejected": 0.6433119177818298, "logps/chosen": -261.37982177734375, "logps/rejected": -240.10952758789062, "loss": 2241.4863, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12351379543542862, "rewards/margins": 0.05891970917582512, "rewards/rejected": -0.18243351578712463, "step": 1980 }, { "epoch": 1.04, "learning_rate": 2.753140746708477e-06, "logits/chosen": 0.6216637492179871, "logits/rejected": 0.669810950756073, "logps/chosen": -282.500244140625, "logps/rejected": -273.12310791015625, "loss": 1961.7119, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09472217410802841, "rewards/margins": 0.09439438581466675, "rewards/rejected": -0.18911656737327576, "step": 1990 }, { "epoch": 1.05, "learning_rate": 2.730403283530767e-06, "logits/chosen": 0.638060986995697, "logits/rejected": 0.7034865617752075, "logps/chosen": -258.02447509765625, "logps/rejected": -246.9062957763672, "loss": 1847.759, "rewards/accuracies": 0.75, "rewards/chosen": -0.09892908483743668, "rewards/margins": 0.10315445810556412, "rewards/rejected": -0.2020835429430008, "step": 2000 }, { "epoch": 1.05, "eval_logits/chosen": 0.5763067603111267, "eval_logits/rejected": 0.6332587599754333, "eval_logps/chosen": -267.7949523925781, "eval_logps/rejected": -253.0826873779297, "eval_loss": 2050.417724609375, "eval_rewards/accuracies": 0.6850000023841858, "eval_rewards/chosen": -0.11177488416433334, "eval_rewards/margins": 0.0835629403591156, "eval_rewards/rejected": -0.19533783197402954, "eval_runtime": 416.585, "eval_samples_per_second": 4.801, "eval_steps_per_second": 1.2, "step": 2000 }, { "epoch": 1.05, "learning_rate": 2.707646581745188e-06, "logits/chosen": 0.6024230718612671, "logits/rejected": 0.6550949811935425, "logps/chosen": -275.99761962890625, "logps/rejected": -272.4120178222656, "loss": 1981.8529, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.0975821316242218, "rewards/margins": 0.08785782754421234, "rewards/rejected": -0.18543997406959534, "step": 2010 }, { "epoch": 1.06, "learning_rate": 2.6848725415297888e-06, "logits/chosen": 0.629960298538208, "logits/rejected": 0.6249616742134094, "logps/chosen": -256.87603759765625, "logps/rejected": -253.21109008789062, "loss": 2136.5217, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10265711694955826, "rewards/margins": 0.0642293393611908, "rewards/rejected": -0.16688646376132965, "step": 2020 }, { "epoch": 1.06, "learning_rate": 2.6620830645103753e-06, "logits/chosen": 0.6109344959259033, "logits/rejected": 0.6072026491165161, "logps/chosen": -266.4075012207031, "logps/rejected": -258.20208740234375, "loss": 1938.8361, "rewards/accuracies": 0.71875, "rewards/chosen": -0.09916529804468155, "rewards/margins": 0.09021677076816559, "rewards/rejected": -0.18938204646110535, "step": 2030 }, { "epoch": 1.07, "learning_rate": 2.639280053601719e-06, "logits/chosen": 0.566746711730957, "logits/rejected": 0.6063026189804077, "logps/chosen": -261.76739501953125, "logps/rejected": -270.6283874511719, "loss": 2085.5938, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12018795311450958, "rewards/margins": 0.08117975294589996, "rewards/rejected": -0.20136770606040955, "step": 2040 }, { "epoch": 1.07, "learning_rate": 2.6164654128486683e-06, "logits/chosen": 0.5058253407478333, "logits/rejected": 0.6028685569763184, "logps/chosen": -267.67376708984375, "logps/rejected": -230.2966766357422, "loss": 2055.1498, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11003967374563217, "rewards/margins": 0.08447955548763275, "rewards/rejected": -0.19451923668384552, "step": 2050 }, { "epoch": 1.08, "learning_rate": 2.59364104726716e-06, "logits/chosen": 0.5947778820991516, "logits/rejected": 0.5901384353637695, "logps/chosen": -278.0001525878906, "logps/rejected": -242.13583374023438, "loss": 1877.0174, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11745290458202362, "rewards/margins": 0.10013137012720108, "rewards/rejected": -0.2175842821598053, "step": 2060 }, { "epoch": 1.08, "learning_rate": 2.5708088626851546e-06, "logits/chosen": 0.5502884387969971, "logits/rejected": 0.603992760181427, "logps/chosen": -269.38360595703125, "logps/rejected": -244.87619018554688, "loss": 2015.0283, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.12011172622442245, "rewards/margins": 0.08565986156463623, "rewards/rejected": -0.20577159523963928, "step": 2070 }, { "epoch": 1.09, "learning_rate": 2.547970765583491e-06, "logits/chosen": 0.5619412064552307, "logits/rejected": 0.6468341946601868, "logps/chosen": -252.68115234375, "logps/rejected": -252.705322265625, "loss": 1855.9893, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10235454142093658, "rewards/margins": 0.10716482251882553, "rewards/rejected": -0.20951935648918152, "step": 2080 }, { "epoch": 1.09, "learning_rate": 2.525128662936707e-06, "logits/chosen": 0.512058436870575, "logits/rejected": 0.5677643418312073, "logps/chosen": -270.7825012207031, "logps/rejected": -260.822509765625, "loss": 1831.3346, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.1064097136259079, "rewards/margins": 0.10622493177652359, "rewards/rejected": -0.21263465285301208, "step": 2090 }, { "epoch": 1.1, "learning_rate": 2.502284462053799e-06, "logits/chosen": 0.620409369468689, "logits/rejected": 0.6358670592308044, "logps/chosen": -258.42706298828125, "logps/rejected": -258.72161865234375, "loss": 2024.9559, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10622759163379669, "rewards/margins": 0.08639432489871979, "rewards/rejected": -0.19262190163135529, "step": 2100 }, { "epoch": 1.1, "eval_logits/chosen": 0.5589507818222046, "eval_logits/rejected": 0.6156801581382751, "eval_logps/chosen": -268.8072814941406, "eval_logps/rejected": -254.37991333007812, "eval_loss": 2046.75927734375, "eval_rewards/accuracies": 0.6899999976158142, "eval_rewards/chosen": -0.12189868092536926, "eval_rewards/margins": 0.08641137927770615, "eval_rewards/rejected": -0.2083100527524948, "eval_runtime": 416.7259, "eval_samples_per_second": 4.799, "eval_steps_per_second": 1.2, "step": 2100 }, { "epoch": 1.1, "learning_rate": 2.479440070418967e-06, "logits/chosen": 0.5901846885681152, "logits/rejected": 0.6195170283317566, "logps/chosen": -249.45816040039062, "logps/rejected": -253.7944793701172, "loss": 2205.6043, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13660283386707306, "rewards/margins": 0.06599629670381546, "rewards/rejected": -0.20259912312030792, "step": 2110 }, { "epoch": 1.11, "learning_rate": 2.456597395532338e-06, "logits/chosen": 0.5504690408706665, "logits/rejected": 0.6531665921211243, "logps/chosen": -259.79010009765625, "logps/rejected": -284.0751647949219, "loss": 1959.0818, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.11810295283794403, "rewards/margins": 0.0964241549372673, "rewards/rejected": -0.21452713012695312, "step": 2120 }, { "epoch": 1.11, "learning_rate": 2.433758344750691e-06, "logits/chosen": 0.5741318464279175, "logits/rejected": 0.6458116173744202, "logps/chosen": -295.03192138671875, "logps/rejected": -276.64251708984375, "loss": 1911.3146, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.11703141778707504, "rewards/margins": 0.10501817613840103, "rewards/rejected": -0.2220495641231537, "step": 2130 }, { "epoch": 1.12, "learning_rate": 2.4109248251281953e-06, "logits/chosen": 0.5908122062683105, "logits/rejected": 0.6558480858802795, "logps/chosen": -283.0213928222656, "logps/rejected": -252.56600952148438, "loss": 1911.8793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11346153914928436, "rewards/margins": 0.10086224228143692, "rewards/rejected": -0.2143237590789795, "step": 2140 }, { "epoch": 1.13, "learning_rate": 2.3880987432571675e-06, "logits/chosen": 0.5616129040718079, "logits/rejected": 0.593204915523529, "logps/chosen": -268.16583251953125, "logps/rejected": -262.17755126953125, "loss": 1994.0697, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.11974780261516571, "rewards/margins": 0.09457085281610489, "rewards/rejected": -0.2143186628818512, "step": 2150 }, { "epoch": 1.13, "learning_rate": 2.365282005108875e-06, "logits/chosen": 0.5762392282485962, "logits/rejected": 0.615722119808197, "logps/chosen": -250.62509155273438, "logps/rejected": -253.91049194335938, "loss": 2099.1633, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1276218444108963, "rewards/margins": 0.07122951745986938, "rewards/rejected": -0.19885137677192688, "step": 2160 }, { "epoch": 1.14, "learning_rate": 2.3424765158743867e-06, "logits/chosen": 0.6059794425964355, "logits/rejected": 0.6645799875259399, "logps/chosen": -255.7693634033203, "logps/rejected": -251.96951293945312, "loss": 2010.217, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12624487280845642, "rewards/margins": 0.09581606835126877, "rewards/rejected": -0.2220609486103058, "step": 2170 }, { "epoch": 1.14, "learning_rate": 2.319684179805491e-06, "logits/chosen": 0.516992449760437, "logits/rejected": 0.5456847548484802, "logps/chosen": -265.10321044921875, "logps/rejected": -246.3701629638672, "loss": 1933.4984, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.11627723276615143, "rewards/margins": 0.0940864160656929, "rewards/rejected": -0.21036362648010254, "step": 2180 }, { "epoch": 1.15, "learning_rate": 2.296906900055691e-06, "logits/chosen": 0.596808135509491, "logits/rejected": 0.6393652558326721, "logps/chosen": -264.455810546875, "logps/rejected": -256.40667724609375, "loss": 2172.6984, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.12352782487869263, "rewards/margins": 0.07605434954166412, "rewards/rejected": -0.19958215951919556, "step": 2190 }, { "epoch": 1.15, "learning_rate": 2.2741465785212905e-06, "logits/chosen": 0.5940336585044861, "logits/rejected": 0.6305769085884094, "logps/chosen": -256.6434326171875, "logps/rejected": -245.391357421875, "loss": 2038.6354, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11867289245128632, "rewards/margins": 0.09147666394710541, "rewards/rejected": -0.21014957129955292, "step": 2200 }, { "epoch": 1.15, "eval_logits/chosen": 0.5517618656158447, "eval_logits/rejected": 0.6082795858383179, "eval_logps/chosen": -268.6722106933594, "eval_logps/rejected": -254.27310180664062, "eval_loss": 2043.57275390625, "eval_rewards/accuracies": 0.6880000233650208, "eval_rewards/chosen": -0.12054779380559921, "eval_rewards/margins": 0.08669425547122955, "eval_rewards/rejected": -0.20724207162857056, "eval_runtime": 416.7766, "eval_samples_per_second": 4.799, "eval_steps_per_second": 1.2, "step": 2200 }, { "epoch": 1.16, "learning_rate": 2.251405115682587e-06, "logits/chosen": 0.5902246236801147, "logits/rejected": 0.5983418822288513, "logps/chosen": -263.2071228027344, "logps/rejected": -272.0802307128906, "loss": 2017.8775, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.11642640829086304, "rewards/margins": 0.09181423485279083, "rewards/rejected": -0.20824062824249268, "step": 2210 }, { "epoch": 1.16, "learning_rate": 2.2286844104451848e-06, "logits/chosen": 0.5431746244430542, "logits/rejected": 0.6418278217315674, "logps/chosen": -264.33465576171875, "logps/rejected": -251.9208984375, "loss": 2095.6342, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.10533900558948517, "rewards/margins": 0.07776842266321182, "rewards/rejected": -0.1831074208021164, "step": 2220 }, { "epoch": 1.17, "learning_rate": 2.205986359981431e-06, "logits/chosen": 0.5207514762878418, "logits/rejected": 0.6270573139190674, "logps/chosen": -285.76849365234375, "logps/rejected": -277.93426513671875, "loss": 1912.0979, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.11343447118997574, "rewards/margins": 0.1047770231962204, "rewards/rejected": -0.21821150183677673, "step": 2230 }, { "epoch": 1.17, "learning_rate": 2.183312859572008e-06, "logits/chosen": 0.5806037783622742, "logits/rejected": 0.6543610095977783, "logps/chosen": -281.1925354003906, "logps/rejected": -278.2233581542969, "loss": 2272.857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11976996809244156, "rewards/margins": 0.06387045979499817, "rewards/rejected": -0.18364043533802032, "step": 2240 }, { "epoch": 1.18, "learning_rate": 2.1606658024476744e-06, "logits/chosen": 0.5554038286209106, "logits/rejected": 0.5429580211639404, "logps/chosen": -269.9796447753906, "logps/rejected": -250.5005340576172, "loss": 2188.1607, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1146581768989563, "rewards/margins": 0.07278671860694885, "rewards/rejected": -0.18744489550590515, "step": 2250 }, { "epoch": 1.18, "learning_rate": 2.1380470796311843e-06, "logits/chosen": 0.610127866268158, "logits/rejected": 0.6246207356452942, "logps/chosen": -274.421142578125, "logps/rejected": -259.88336181640625, "loss": 1878.2621, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10982956737279892, "rewards/margins": 0.10056765377521515, "rewards/rejected": -0.21039721369743347, "step": 2260 }, { "epoch": 1.19, "learning_rate": 2.1154585797793826e-06, "logits/chosen": 0.6410681009292603, "logits/rejected": 0.6446506977081299, "logps/chosen": -262.7099304199219, "logps/rejected": -243.33847045898438, "loss": 1969.0021, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.10027629137039185, "rewards/margins": 0.08689162135124207, "rewards/rejected": -0.1871679127216339, "step": 2270 }, { "epoch": 1.19, "learning_rate": 2.092902189025507e-06, "logits/chosen": 0.6191312670707703, "logits/rejected": 0.6812275648117065, "logps/chosen": -258.3789978027344, "logps/rejected": -247.3417510986328, "loss": 1765.025, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10864460468292236, "rewards/margins": 0.11250102519989014, "rewards/rejected": -0.2211456298828125, "step": 2280 }, { "epoch": 1.2, "learning_rate": 2.070379790821693e-06, "logits/chosen": 0.5654376745223999, "logits/rejected": 0.6462022066116333, "logps/chosen": -301.2412414550781, "logps/rejected": -276.8460998535156, "loss": 2045.4492, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10371474176645279, "rewards/margins": 0.09003494679927826, "rewards/rejected": -0.19374969601631165, "step": 2290 }, { "epoch": 1.2, "learning_rate": 2.0478932657817105e-06, "logits/chosen": 0.5810787081718445, "logits/rejected": 0.6360457539558411, "logps/chosen": -254.06838989257812, "logps/rejected": -243.23989868164062, "loss": 2022.9617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11232425272464752, "rewards/margins": 0.07984773069620132, "rewards/rejected": -0.19217197597026825, "step": 2300 }, { "epoch": 1.2, "eval_logits/chosen": 0.5535383820533752, "eval_logits/rejected": 0.6101322174072266, "eval_logps/chosen": -268.3490905761719, "eval_logps/rejected": -253.95965576171875, "eval_loss": 2035.585693359375, "eval_rewards/accuracies": 0.6894999742507935, "eval_rewards/chosen": -0.11731643229722977, "eval_rewards/margins": 0.08679118007421494, "eval_rewards/rejected": -0.2041076123714447, "eval_runtime": 416.4094, "eval_samples_per_second": 4.803, "eval_steps_per_second": 1.201, "step": 2300 }, { "epoch": 1.21, "learning_rate": 2.0254444915239287e-06, "logits/chosen": 0.5468884706497192, "logits/rejected": 0.5753307938575745, "logps/chosen": -271.94940185546875, "logps/rejected": -244.8318328857422, "loss": 1994.7408, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.1074294000864029, "rewards/margins": 0.08415937423706055, "rewards/rejected": -0.19158877432346344, "step": 2310 }, { "epoch": 1.21, "learning_rate": 2.0030353425145376e-06, "logits/chosen": 0.6623051762580872, "logits/rejected": 0.6782322525978088, "logps/chosen": -220.7380828857422, "logps/rejected": -242.05850219726562, "loss": 1864.7361, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10245666652917862, "rewards/margins": 0.10949740558862686, "rewards/rejected": -0.2119540423154831, "step": 2320 }, { "epoch": 1.22, "learning_rate": 1.9806676899110305e-06, "logits/chosen": 0.6308891773223877, "logits/rejected": 0.6477428674697876, "logps/chosen": -262.88897705078125, "logps/rejected": -255.49362182617188, "loss": 1843.8875, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10279623419046402, "rewards/margins": 0.11021213233470917, "rewards/rejected": -0.21300837397575378, "step": 2330 }, { "epoch": 1.22, "learning_rate": 1.958343401405964e-06, "logits/chosen": 0.5211482048034668, "logits/rejected": 0.6105703115463257, "logps/chosen": -272.09698486328125, "logps/rejected": -240.05392456054688, "loss": 1991.6119, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.12355583906173706, "rewards/margins": 0.08727772533893585, "rewards/rejected": -0.2108335793018341, "step": 2340 }, { "epoch": 1.23, "learning_rate": 1.9360643410710027e-06, "logits/chosen": 0.6230972409248352, "logits/rejected": 0.6428076028823853, "logps/chosen": -297.76300048828125, "logps/rejected": -262.3421325683594, "loss": 2047.1437, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10469541698694229, "rewards/margins": 0.09010159224271774, "rewards/rejected": -0.19479700922966003, "step": 2350 }, { "epoch": 1.24, "learning_rate": 1.9138323692012734e-06, "logits/chosen": 0.5903237462043762, "logits/rejected": 0.6455060243606567, "logps/chosen": -288.16815185546875, "logps/rejected": -289.3240966796875, "loss": 1579.4779, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.08982647955417633, "rewards/margins": 0.14532434940338135, "rewards/rejected": -0.23515084385871887, "step": 2360 }, { "epoch": 1.24, "learning_rate": 1.8916493421600287e-06, "logits/chosen": 0.5603612065315247, "logits/rejected": 0.5792626142501831, "logps/chosen": -243.1230010986328, "logps/rejected": -257.9002685546875, "loss": 2017.2086, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.12239034473896027, "rewards/margins": 0.0815977230668068, "rewards/rejected": -0.20398807525634766, "step": 2370 }, { "epoch": 1.25, "learning_rate": 1.8695171122236443e-06, "logits/chosen": 0.49118170142173767, "logits/rejected": 0.5506534576416016, "logps/chosen": -268.86822509765625, "logps/rejected": -274.64202880859375, "loss": 1977.4205, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.11895406246185303, "rewards/margins": 0.10293842852115631, "rewards/rejected": -0.22189247608184814, "step": 2380 }, { "epoch": 1.25, "learning_rate": 1.84743752742695e-06, "logits/chosen": 0.6215322613716125, "logits/rejected": 0.6151038408279419, "logps/chosen": -265.5515441894531, "logps/rejected": -278.13177490234375, "loss": 1889.5199, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.10073964297771454, "rewards/margins": 0.11166242510080338, "rewards/rejected": -0.2124020755290985, "step": 2390 }, { "epoch": 1.26, "learning_rate": 1.8254124314089225e-06, "logits/chosen": 0.6138418912887573, "logits/rejected": 0.6189366579055786, "logps/chosen": -263.2386169433594, "logps/rejected": -239.09963989257812, "loss": 1871.641, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12337759882211685, "rewards/margins": 0.09867466986179352, "rewards/rejected": -0.22205229103565216, "step": 2400 }, { "epoch": 1.26, "eval_logits/chosen": 0.5482152104377747, "eval_logits/rejected": 0.6045916676521301, "eval_logps/chosen": -268.51605224609375, "eval_logps/rejected": -254.28311157226562, "eval_loss": 2036.3372802734375, "eval_rewards/accuracies": 0.6894999742507935, "eval_rewards/chosen": -0.11898616701364517, "eval_rewards/margins": 0.0883559137582779, "eval_rewards/rejected": -0.20734207332134247, "eval_runtime": 416.6781, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.2, "step": 2400 }, { "epoch": 1.26, "learning_rate": 1.8034436632587394e-06, "logits/chosen": 0.5728852152824402, "logits/rejected": 0.6265703439712524, "logps/chosen": -237.0697784423828, "logps/rejected": -242.29367065429688, "loss": 1969.1203, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10528033971786499, "rewards/margins": 0.0880698561668396, "rewards/rejected": -0.1933501809835434, "step": 2410 }, { "epoch": 1.27, "learning_rate": 1.781533057362221e-06, "logits/chosen": 0.5749053359031677, "logits/rejected": 0.6042163372039795, "logps/chosen": -278.8114013671875, "logps/rejected": -279.18695068359375, "loss": 1906.6891, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1059660091996193, "rewards/margins": 0.10437663644552231, "rewards/rejected": -0.210342675447464, "step": 2420 }, { "epoch": 1.27, "learning_rate": 1.7596824432486537e-06, "logits/chosen": 0.5984959602355957, "logits/rejected": 0.6386197209358215, "logps/chosen": -292.53143310546875, "logps/rejected": -256.42620849609375, "loss": 2003.0641, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10370471328496933, "rewards/margins": 0.09288345277309418, "rewards/rejected": -0.1965881586074829, "step": 2430 }, { "epoch": 1.28, "learning_rate": 1.7378936454380277e-06, "logits/chosen": 0.5537322163581848, "logits/rejected": 0.5942158102989197, "logps/chosen": -246.1141815185547, "logps/rejected": -253.85617065429688, "loss": 2137.652, "rewards/accuracies": 0.625, "rewards/chosen": -0.13521219789981842, "rewards/margins": 0.0802813172340393, "rewards/rejected": -0.21549351513385773, "step": 2440 }, { "epoch": 1.28, "learning_rate": 1.7161684832886893e-06, "logits/chosen": 0.5406220555305481, "logits/rejected": 0.540827751159668, "logps/chosen": -242.9103546142578, "logps/rejected": -247.41921997070312, "loss": 2005.6266, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12078257650136948, "rewards/margins": 0.09046939015388489, "rewards/rejected": -0.21125197410583496, "step": 2450 }, { "epoch": 1.29, "learning_rate": 1.6945087708454273e-06, "logits/chosen": 0.5730911493301392, "logits/rejected": 0.5966663956642151, "logps/chosen": -276.0887145996094, "logps/rejected": -264.39910888671875, "loss": 2211.1135, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12395117431879044, "rewards/margins": 0.07060922682285309, "rewards/rejected": -0.19456037878990173, "step": 2460 }, { "epoch": 1.29, "learning_rate": 1.6729163166879964e-06, "logits/chosen": 0.5936635136604309, "logits/rejected": 0.6355383396148682, "logps/chosen": -258.3261413574219, "logps/rejected": -233.99075317382812, "loss": 1757.0482, "rewards/accuracies": 0.75, "rewards/chosen": -0.1076178103685379, "rewards/margins": 0.11206640303134918, "rewards/rejected": -0.21968421339988708, "step": 2470 }, { "epoch": 1.3, "learning_rate": 1.651392923780105e-06, "logits/chosen": 0.6025252342224121, "logits/rejected": 0.6687902808189392, "logps/chosen": -254.2070770263672, "logps/rejected": -237.63967895507812, "loss": 2055.3113, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1099240779876709, "rewards/margins": 0.09202177077531815, "rewards/rejected": -0.20194585621356964, "step": 2480 }, { "epoch": 1.3, "learning_rate": 1.629940389318867e-06, "logits/chosen": 0.5291022062301636, "logits/rejected": 0.616036593914032, "logps/chosen": -294.7336730957031, "logps/rejected": -240.91796875, "loss": 1902.4217, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10780209302902222, "rewards/margins": 0.10337891429662704, "rewards/rejected": -0.21118099987506866, "step": 2490 }, { "epoch": 1.31, "learning_rate": 1.608560504584737e-06, "logits/chosen": 0.5608310103416443, "logits/rejected": 0.6271076798439026, "logps/chosen": -256.45770263671875, "logps/rejected": -253.8623809814453, "loss": 1907.3463, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10689397901296616, "rewards/margins": 0.10536620765924454, "rewards/rejected": -0.2122601717710495, "step": 2500 }, { "epoch": 1.31, "eval_logits/chosen": 0.5460030436515808, "eval_logits/rejected": 0.6022311449050903, "eval_logps/chosen": -268.7764587402344, "eval_logps/rejected": -254.62974548339844, "eval_loss": 2034.7010498046875, "eval_rewards/accuracies": 0.6880000233650208, "eval_rewards/chosen": -0.12159038335084915, "eval_rewards/margins": 0.08921793848276138, "eval_rewards/rejected": -0.21080833673477173, "eval_runtime": 416.6626, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.2, "step": 2500 }, { "epoch": 1.31, "learning_rate": 1.587255054791937e-06, "logits/chosen": 0.5321905016899109, "logits/rejected": 0.589474081993103, "logps/chosen": -281.2105407714844, "logps/rejected": -264.56298828125, "loss": 2016.9854, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.10593251138925552, "rewards/margins": 0.08659417182207108, "rewards/rejected": -0.1925266534090042, "step": 2510 }, { "epoch": 1.32, "learning_rate": 1.5660258189393945e-06, "logits/chosen": 0.5880864262580872, "logits/rejected": 0.6149991750717163, "logps/chosen": -251.75973510742188, "logps/rejected": -262.3134765625, "loss": 2130.8975, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.12219718843698502, "rewards/margins": 0.08161304891109467, "rewards/rejected": -0.2038102149963379, "step": 2520 }, { "epoch": 1.32, "learning_rate": 1.5448745696621915e-06, "logits/chosen": 0.5654980540275574, "logits/rejected": 0.6478559970855713, "logps/chosen": -272.79864501953125, "logps/rejected": -258.56402587890625, "loss": 2114.0654, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11762849986553192, "rewards/margins": 0.08065593242645264, "rewards/rejected": -0.19828443229198456, "step": 2530 }, { "epoch": 1.33, "learning_rate": 1.5238030730835578e-06, "logits/chosen": 0.5662246942520142, "logits/rejected": 0.6270356178283691, "logps/chosen": -272.17449951171875, "logps/rejected": -237.2474365234375, "loss": 2106.9717, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11701546609401703, "rewards/margins": 0.0775529146194458, "rewards/rejected": -0.19456836581230164, "step": 2540 }, { "epoch": 1.33, "learning_rate": 1.5028130886673936e-06, "logits/chosen": 0.5928006172180176, "logits/rejected": 0.641442060470581, "logps/chosen": -263.0660705566406, "logps/rejected": -252.6539306640625, "loss": 2000.1971, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.111760713160038, "rewards/margins": 0.0842764601111412, "rewards/rejected": -0.1960371732711792, "step": 2550 }, { "epoch": 1.34, "learning_rate": 1.4819063690713565e-06, "logits/chosen": 0.5778559446334839, "logits/rejected": 0.6045337915420532, "logps/chosen": -284.07061767578125, "logps/rejected": -270.4360656738281, "loss": 1938.5168, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.12265495210886002, "rewards/margins": 0.09922391176223755, "rewards/rejected": -0.22187885642051697, "step": 2560 }, { "epoch": 1.35, "learning_rate": 1.4610846600005164e-06, "logits/chosen": 0.6385133862495422, "logits/rejected": 0.6164069175720215, "logps/chosen": -291.48590087890625, "logps/rejected": -241.2799072265625, "loss": 2040.5221, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.11285148561000824, "rewards/margins": 0.08863021433353424, "rewards/rejected": -0.20148172974586487, "step": 2570 }, { "epoch": 1.35, "learning_rate": 1.4403497000615885e-06, "logits/chosen": 0.6022018194198608, "logits/rejected": 0.6375949382781982, "logps/chosen": -250.8367156982422, "logps/rejected": -242.37881469726562, "loss": 2072.085, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.11340691894292831, "rewards/margins": 0.07300657033920288, "rewards/rejected": -0.1864134818315506, "step": 2580 }, { "epoch": 1.36, "learning_rate": 1.4197032206177618e-06, "logits/chosen": 0.6561594605445862, "logits/rejected": 0.7297431230545044, "logps/chosen": -249.74887084960938, "logps/rejected": -237.26779174804688, "loss": 2067.224, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.12197699397802353, "rewards/margins": 0.08562152087688446, "rewards/rejected": -0.2075985223054886, "step": 2590 }, { "epoch": 1.36, "learning_rate": 1.3991469456441273e-06, "logits/chosen": 0.6001744270324707, "logits/rejected": 0.6553865075111389, "logps/chosen": -252.2941436767578, "logps/rejected": -243.48025512695312, "loss": 1884.6086, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.11671899259090424, "rewards/margins": 0.10958864539861679, "rewards/rejected": -0.22630766034126282, "step": 2600 }, { "epoch": 1.36, "eval_logits/chosen": 0.545119047164917, "eval_logits/rejected": 0.601308286190033, "eval_logps/chosen": -268.77081298828125, "eval_logps/rejected": -254.6013946533203, "eval_loss": 2033.7977294921875, "eval_rewards/accuracies": 0.6909999847412109, "eval_rewards/chosen": -0.12153391540050507, "eval_rewards/margins": 0.08899100124835968, "eval_rewards/rejected": -0.21052493155002594, "eval_runtime": 416.4661, "eval_samples_per_second": 4.802, "eval_steps_per_second": 1.201, "step": 2600 }, { "epoch": 1.37, "learning_rate": 1.3786825915837299e-06, "logits/chosen": 0.6044927835464478, "logits/rejected": 0.608493447303772, "logps/chosen": -268.0179138183594, "logps/rejected": -251.26168823242188, "loss": 1763.491, "rewards/accuracies": 0.71875, "rewards/chosen": -0.08965936303138733, "rewards/margins": 0.12351739406585693, "rewards/rejected": -0.21317675709724426, "step": 2610 }, { "epoch": 1.37, "learning_rate": 1.3583118672042441e-06, "logits/chosen": 0.5879210233688354, "logits/rejected": 0.6254302263259888, "logps/chosen": -268.9471740722656, "logps/rejected": -246.9387664794922, "loss": 1806.3043, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.09920786321163177, "rewards/margins": 0.1189170852303505, "rewards/rejected": -0.21812494099140167, "step": 2620 }, { "epoch": 1.38, "learning_rate": 1.3380364734552935e-06, "logits/chosen": 0.6040158867835999, "logits/rejected": 0.6454821825027466, "logps/chosen": -239.55313110351562, "logps/rejected": -252.41641235351562, "loss": 1881.491, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11133377254009247, "rewards/margins": 0.10438641160726547, "rewards/rejected": -0.21572017669677734, "step": 2630 }, { "epoch": 1.38, "learning_rate": 1.3178581033264218e-06, "logits/chosen": 0.5422452688217163, "logits/rejected": 0.5576962232589722, "logps/chosen": -267.02020263671875, "logps/rejected": -233.59628295898438, "loss": 1922.1236, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.1054287701845169, "rewards/margins": 0.10195982456207275, "rewards/rejected": -0.20738859474658966, "step": 2640 }, { "epoch": 1.39, "learning_rate": 1.2977784417057262e-06, "logits/chosen": 0.5648713111877441, "logits/rejected": 0.5970919132232666, "logps/chosen": -266.724365234375, "logps/rejected": -252.4540557861328, "loss": 1788.8666, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.11531468480825424, "rewards/margins": 0.11205389350652695, "rewards/rejected": -0.22736859321594238, "step": 2650 }, { "epoch": 1.39, "learning_rate": 1.2777991652391757e-06, "logits/chosen": 0.5807424783706665, "logits/rejected": 0.64664626121521, "logps/chosen": -253.5209197998047, "logps/rejected": -251.5238800048828, "loss": 2035.5062, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.12570129334926605, "rewards/margins": 0.09053059667348862, "rewards/rejected": -0.21623189747333527, "step": 2660 }, { "epoch": 1.4, "learning_rate": 1.2579219421906049e-06, "logits/chosen": 0.612740159034729, "logits/rejected": 0.6295909285545349, "logps/chosen": -275.30938720703125, "logps/rejected": -246.85986328125, "loss": 1918.4975, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.11256787925958633, "rewards/margins": 0.1016065701842308, "rewards/rejected": -0.21417441964149475, "step": 2670 }, { "epoch": 1.4, "learning_rate": 1.2381484323024178e-06, "logits/chosen": 0.5338586568832397, "logits/rejected": 0.614523708820343, "logps/chosen": -248.32406616210938, "logps/rejected": -234.80862426757812, "loss": 2030.9229, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12608769536018372, "rewards/margins": 0.09196772426366806, "rewards/rejected": -0.21805541217327118, "step": 2680 }, { "epoch": 1.41, "learning_rate": 1.2184802866569991e-06, "logits/chosen": 0.5740771889686584, "logits/rejected": 0.5626708269119263, "logps/chosen": -256.43524169921875, "logps/rejected": -254.8041534423828, "loss": 1881.1102, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.12152848392724991, "rewards/margins": 0.1093037948012352, "rewards/rejected": -0.2308322638273239, "step": 2690 }, { "epoch": 1.41, "learning_rate": 1.1989191475388518e-06, "logits/chosen": 0.5784533023834229, "logits/rejected": 0.5800845623016357, "logps/chosen": -261.47900390625, "logps/rejected": -261.27618408203125, "loss": 2034.9129, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12217319011688232, "rewards/margins": 0.08460931479930878, "rewards/rejected": -0.2067825049161911, "step": 2700 }, { "epoch": 1.41, "eval_logits/chosen": 0.5425635576248169, "eval_logits/rejected": 0.5986801385879517, "eval_logps/chosen": -268.96331787109375, "eval_logps/rejected": -254.94712829589844, "eval_loss": 2032.544677734375, "eval_rewards/accuracies": 0.6899999976158142, "eval_rewards/chosen": -0.12345867604017258, "eval_rewards/margins": 0.0905236080288887, "eval_rewards/rejected": -0.21398229897022247, "eval_runtime": 416.8138, "eval_samples_per_second": 4.798, "eval_steps_per_second": 1.2, "step": 2700 }, { "epoch": 1.42, "learning_rate": 1.1794666482974617e-06, "logits/chosen": 0.5704789161682129, "logits/rejected": 0.6782268285751343, "logps/chosen": -282.65875244140625, "logps/rejected": -257.2977600097656, "loss": 1989.7289, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.11434787511825562, "rewards/margins": 0.0926680713891983, "rewards/rejected": -0.20701594650745392, "step": 2710 }, { "epoch": 1.42, "learning_rate": 1.160124413210918e-06, "logits/chosen": 0.5337072014808655, "logits/rejected": 0.5367878675460815, "logps/chosen": -264.4056701660156, "logps/rejected": -245.18301391601562, "loss": 1913.7168, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.10953982174396515, "rewards/margins": 0.10163428634405136, "rewards/rejected": -0.2111741304397583, "step": 2720 }, { "epoch": 1.43, "learning_rate": 1.1408940573502838e-06, "logits/chosen": 0.5485426783561707, "logits/rejected": 0.6499109864234924, "logps/chosen": -264.18505859375, "logps/rejected": -238.2395477294922, "loss": 1907.4213, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12582936882972717, "rewards/margins": 0.09968879073858261, "rewards/rejected": -0.22551818192005157, "step": 2730 }, { "epoch": 1.43, "learning_rate": 1.1217771864447396e-06, "logits/chosen": 0.5939881205558777, "logits/rejected": 0.6111994981765747, "logps/chosen": -261.0160827636719, "logps/rejected": -244.2744140625, "loss": 2013.2748, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.12591084837913513, "rewards/margins": 0.08752218633890152, "rewards/rejected": -0.21343302726745605, "step": 2740 }, { "epoch": 1.44, "learning_rate": 1.1027753967475046e-06, "logits/chosen": 0.5890164375305176, "logits/rejected": 0.6029259562492371, "logps/chosen": -259.98382568359375, "logps/rejected": -254.6918487548828, "loss": 1947.2754, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.1243789941072464, "rewards/margins": 0.09525910019874573, "rewards/rejected": -0.21963807940483093, "step": 2750 }, { "epoch": 1.44, "learning_rate": 1.08389027490255e-06, "logits/chosen": 0.5902668833732605, "logits/rejected": 0.6080381274223328, "logps/chosen": -248.3684844970703, "logps/rejected": -263.76776123046875, "loss": 2085.1875, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.13539178669452667, "rewards/margins": 0.07983563095331192, "rewards/rejected": -0.215227410197258, "step": 2760 }, { "epoch": 1.45, "learning_rate": 1.0651233978121145e-06, "logits/chosen": 0.5521366596221924, "logits/rejected": 0.5906900763511658, "logps/chosen": -300.05230712890625, "logps/rejected": -272.1240234375, "loss": 1883.4229, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.11396439373493195, "rewards/margins": 0.09765832126140594, "rewards/rejected": -0.2116227149963379, "step": 2770 }, { "epoch": 1.46, "learning_rate": 1.046476332505036e-06, "logits/chosen": 0.6347781419754028, "logits/rejected": 0.6657929420471191, "logps/chosen": -250.8879852294922, "logps/rejected": -223.33255004882812, "loss": 1819.1854, "rewards/accuracies": 0.75, "rewards/chosen": -0.11131460964679718, "rewards/margins": 0.11518070846796036, "rewards/rejected": -0.22649531066417694, "step": 2780 }, { "epoch": 1.46, "learning_rate": 1.0279506360059005e-06, "logits/chosen": 0.5551185011863708, "logits/rejected": 0.5792326331138611, "logps/chosen": -262.4249572753906, "logps/rejected": -267.63763427734375, "loss": 2209.3523, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.12959793210029602, "rewards/margins": 0.06600113213062286, "rewards/rejected": -0.19559906423091888, "step": 2790 }, { "epoch": 1.47, "learning_rate": 1.0095478552050348e-06, "logits/chosen": 0.6019959449768066, "logits/rejected": 0.6074908971786499, "logps/chosen": -273.5536193847656, "logps/rejected": -267.5385437011719, "loss": 2068.2822, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.13250732421875, "rewards/margins": 0.09504042565822601, "rewards/rejected": -0.2275477647781372, "step": 2800 }, { "epoch": 1.47, "eval_logits/chosen": 0.5382584929466248, "eval_logits/rejected": 0.5942660570144653, "eval_logps/chosen": -269.1269836425781, "eval_logps/rejected": -255.16705322265625, "eval_loss": 2030.8697509765625, "eval_rewards/accuracies": 0.6899999976158142, "eval_rewards/chosen": -0.1250954419374466, "eval_rewards/margins": 0.09108588099479675, "eval_rewards/rejected": -0.21618132293224335, "eval_runtime": 416.6389, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.2, "step": 2800 }, { "epoch": 1.47, "learning_rate": 9.912695267293383e-07, "logits/chosen": 0.5214653015136719, "logits/rejected": 0.5876752734184265, "logps/chosen": -265.47882080078125, "logps/rejected": -239.1663360595703, "loss": 1950.4564, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10924456268548965, "rewards/margins": 0.09817437827587128, "rewards/rejected": -0.20741891860961914, "step": 2810 }, { "epoch": 1.48, "learning_rate": 9.731171768139808e-07, "logits/chosen": 0.6136573553085327, "logits/rejected": 0.6188865900039673, "logps/chosen": -284.9826965332031, "logps/rejected": -261.3671875, "loss": 2210.9309, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.12047699838876724, "rewards/margins": 0.06873573362827301, "rewards/rejected": -0.18921272456645966, "step": 2820 }, { "epoch": 1.48, "learning_rate": 9.550923211749557e-07, "logits/chosen": 0.5326896905899048, "logits/rejected": 0.5845073461532593, "logps/chosen": -260.52069091796875, "logps/rejected": -268.2504577636719, "loss": 2028.4584, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.11890892684459686, "rewards/margins": 0.08525559306144714, "rewards/rejected": -0.2041645348072052, "step": 2830 }, { "epoch": 1.49, "learning_rate": 9.371964648825221e-07, "logits/chosen": 0.6162235736846924, "logits/rejected": 0.5728213787078857, "logps/chosen": -272.336181640625, "logps/rejected": -252.33346557617188, "loss": 1941.8066, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10710117965936661, "rewards/margins": 0.10685823112726212, "rewards/rejected": -0.21395941078662872, "step": 2840 }, { "epoch": 1.49, "learning_rate": 9.194311022355279e-07, "logits/chosen": 0.5015624761581421, "logits/rejected": 0.5448901057243347, "logps/chosen": -276.95538330078125, "logps/rejected": -250.8726348876953, "loss": 1832.3256, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.10919035971164703, "rewards/margins": 0.11640901863574982, "rewards/rejected": -0.22559937834739685, "step": 2850 }, { "epoch": 1.5, "learning_rate": 9.017977166366445e-07, "logits/chosen": 0.5708821415901184, "logits/rejected": 0.558485209941864, "logps/chosen": -258.96807861328125, "logps/rejected": -263.4228210449219, "loss": 1947.8791, "rewards/accuracies": 0.75, "rewards/chosen": -0.11477123200893402, "rewards/margins": 0.09229175001382828, "rewards/rejected": -0.2070629894733429, "step": 2860 }, { "epoch": 1.5, "learning_rate": 8.842977804684938e-07, "logits/chosen": 0.5845485925674438, "logits/rejected": 0.6778086423873901, "logps/chosen": -245.46102905273438, "logps/rejected": -233.6326141357422, "loss": 2066.3828, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.12042129039764404, "rewards/margins": 0.08121231943368912, "rewards/rejected": -0.20163361728191376, "step": 2870 }, { "epoch": 1.51, "learning_rate": 8.669327549707096e-07, "logits/chosen": 0.5501264929771423, "logits/rejected": 0.6287878751754761, "logps/chosen": -281.37371826171875, "logps/rejected": -252.8254852294922, "loss": 1868.0959, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1148761659860611, "rewards/margins": 0.1036820039153099, "rewards/rejected": -0.2185581624507904, "step": 2880 }, { "epoch": 1.51, "learning_rate": 8.497040901179232e-07, "logits/chosen": 0.5025564432144165, "logits/rejected": 0.5421415567398071, "logps/chosen": -276.861572265625, "logps/rejected": -267.47723388671875, "loss": 1753.2416, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.10830195993185043, "rewards/margins": 0.12366169691085815, "rewards/rejected": -0.2319636344909668, "step": 2890 }, { "epoch": 1.52, "learning_rate": 8.326132244986932e-07, "logits/chosen": 0.6039875745773315, "logits/rejected": 0.6563787460327148, "logps/chosen": -282.04266357421875, "logps/rejected": -257.56622314453125, "loss": 1977.4029, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12028930336236954, "rewards/margins": 0.09056351333856583, "rewards/rejected": -0.21085281670093536, "step": 2900 }, { "epoch": 1.52, "eval_logits/chosen": 0.5381389260292053, "eval_logits/rejected": 0.5940784811973572, "eval_logps/chosen": -269.125244140625, "eval_logps/rejected": -255.16897583007812, "eval_loss": 2030.603271484375, "eval_rewards/accuracies": 0.6894999742507935, "eval_rewards/chosen": -0.12507818639278412, "eval_rewards/margins": 0.09112255275249481, "eval_rewards/rejected": -0.21620073914527893, "eval_runtime": 416.5425, "eval_samples_per_second": 4.801, "eval_steps_per_second": 1.2, "step": 2900 }, { "epoch": 1.52, "learning_rate": 8.156615851953798e-07, "logits/chosen": 0.559486448764801, "logits/rejected": 0.5794366598129272, "logps/chosen": -256.5633239746094, "logps/rejected": -259.57696533203125, "loss": 1948.7941, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.10746540874242783, "rewards/margins": 0.10354423522949219, "rewards/rejected": -0.2110096514225006, "step": 2910 }, { "epoch": 1.53, "learning_rate": 7.988505876649863e-07, "logits/chosen": 0.6247807741165161, "logits/rejected": 0.6010321974754333, "logps/chosen": -271.8721923828125, "logps/rejected": -256.86480712890625, "loss": 2099.2482, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12578730285167694, "rewards/margins": 0.084382563829422, "rewards/rejected": -0.21016988158226013, "step": 2920 }, { "epoch": 1.53, "learning_rate": 7.821816356209677e-07, "logits/chosen": 0.5775936841964722, "logits/rejected": 0.6070097088813782, "logps/chosen": -272.50653076171875, "logps/rejected": -251.46243286132812, "loss": 2020.2645, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10367625951766968, "rewards/margins": 0.08927679061889648, "rewards/rejected": -0.19295303523540497, "step": 2930 }, { "epoch": 1.54, "learning_rate": 7.656561209160248e-07, "logits/chosen": 0.521769642829895, "logits/rejected": 0.5275167226791382, "logps/chosen": -289.0915222167969, "logps/rejected": -263.7314453125, "loss": 1947.6189, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.12388887256383896, "rewards/margins": 0.1033380776643753, "rewards/rejected": -0.22722692787647247, "step": 2940 }, { "epoch": 1.54, "learning_rate": 7.492754234258794e-07, "logits/chosen": 0.5926128625869751, "logits/rejected": 0.6193209886550903, "logps/chosen": -241.3407440185547, "logps/rejected": -225.17724609375, "loss": 1876.9375, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10547138750553131, "rewards/margins": 0.10478665679693222, "rewards/rejected": -0.21025805175304413, "step": 2950 }, { "epoch": 1.55, "learning_rate": 7.330409109340563e-07, "logits/chosen": 0.5721119046211243, "logits/rejected": 0.5718821287155151, "logps/chosen": -267.39471435546875, "logps/rejected": -244.5928192138672, "loss": 2126.3432, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1287916600704193, "rewards/margins": 0.07899868488311768, "rewards/rejected": -0.2077903300523758, "step": 2960 }, { "epoch": 1.55, "learning_rate": 7.169539390176769e-07, "logits/chosen": 0.5741583704948425, "logits/rejected": 0.5660156011581421, "logps/chosen": -219.59640502929688, "logps/rejected": -233.7797088623047, "loss": 1845.3854, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.12960752844810486, "rewards/margins": 0.10263533890247345, "rewards/rejected": -0.23224285244941711, "step": 2970 }, { "epoch": 1.56, "learning_rate": 7.010158509342682e-07, "logits/chosen": 0.5922077298164368, "logits/rejected": 0.6388793587684631, "logps/chosen": -258.38946533203125, "logps/rejected": -236.8799285888672, "loss": 1677.0656, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.11209128051996231, "rewards/margins": 0.13626167178153992, "rewards/rejected": -0.24835292994976044, "step": 2980 }, { "epoch": 1.57, "learning_rate": 6.852279775095976e-07, "logits/chosen": 0.6180992722511292, "logits/rejected": 0.6189014911651611, "logps/chosen": -272.6584167480469, "logps/rejected": -247.75033569335938, "loss": 1925.8682, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.11567474901676178, "rewards/margins": 0.09763548523187637, "rewards/rejected": -0.21331021189689636, "step": 2990 }, { "epoch": 1.57, "learning_rate": 6.695916370265529e-07, "logits/chosen": 0.6014515161514282, "logits/rejected": 0.5875986814498901, "logps/chosen": -265.0668029785156, "logps/rejected": -241.8825225830078, "loss": 2110.2887, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1249975711107254, "rewards/margins": 0.07782919704914093, "rewards/rejected": -0.20282676815986633, "step": 3000 }, { "epoch": 1.57, "eval_logits/chosen": 0.5348395109176636, "eval_logits/rejected": 0.5908406972885132, "eval_logps/chosen": -269.2049865722656, "eval_logps/rejected": -255.2820587158203, "eval_loss": 2030.5706787109375, "eval_rewards/accuracies": 0.690500020980835, "eval_rewards/chosen": -0.12587547302246094, "eval_rewards/margins": 0.09145611524581909, "eval_rewards/rejected": -0.21733158826828003, "eval_runtime": 416.6652, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.2, "step": 3000 }, { "epoch": 1.58, "learning_rate": 6.541081351150638e-07, "logits/chosen": 0.5409640669822693, "logits/rejected": 0.5331202149391174, "logps/chosen": -279.83941650390625, "logps/rejected": -291.9646301269531, "loss": 2035.416, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.10843896865844727, "rewards/margins": 0.09788022935390472, "rewards/rejected": -0.2063191831111908, "step": 3010 }, { "epoch": 1.58, "learning_rate": 6.387787646430854e-07, "logits/chosen": 0.5321037769317627, "logits/rejected": 0.5582699775695801, "logps/chosen": -267.9813232421875, "logps/rejected": -264.12567138671875, "loss": 2006.4391, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10174393653869629, "rewards/margins": 0.08532971143722534, "rewards/rejected": -0.18707364797592163, "step": 3020 }, { "epoch": 1.59, "learning_rate": 6.2360480560864e-07, "logits/chosen": 0.5698617696762085, "logits/rejected": 0.5839768648147583, "logps/chosen": -251.5703125, "logps/rejected": -235.4780731201172, "loss": 1821.7498, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10450971126556396, "rewards/margins": 0.10978861898183823, "rewards/rejected": -0.2142982929944992, "step": 3030 }, { "epoch": 1.59, "learning_rate": 6.085875250329401e-07, "logits/chosen": 0.5382856726646423, "logits/rejected": 0.6018794178962708, "logps/chosen": -304.603271484375, "logps/rejected": -263.90570068359375, "loss": 1820.3352, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10189330577850342, "rewards/margins": 0.12546256184577942, "rewards/rejected": -0.22735583782196045, "step": 3040 }, { "epoch": 1.6, "learning_rate": 5.937281768545919e-07, "logits/chosen": 0.600039005279541, "logits/rejected": 0.5895189046859741, "logps/chosen": -288.302734375, "logps/rejected": -266.3108215332031, "loss": 2142.2947, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1374974250793457, "rewards/margins": 0.08518020063638687, "rewards/rejected": -0.22267761826515198, "step": 3050 }, { "epoch": 1.6, "learning_rate": 5.79028001824894e-07, "logits/chosen": 0.577072024345398, "logits/rejected": 0.5782276391983032, "logps/chosen": -258.71734619140625, "logps/rejected": -252.88906860351562, "loss": 2083.8391, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.13044390082359314, "rewards/margins": 0.09016549587249756, "rewards/rejected": -0.22060942649841309, "step": 3060 }, { "epoch": 1.61, "learning_rate": 5.644882274042285e-07, "logits/chosen": 0.5784581303596497, "logits/rejected": 0.5805580019950867, "logps/chosen": -286.2455139160156, "logps/rejected": -253.4708251953125, "loss": 1979.6775, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11061519384384155, "rewards/margins": 0.10562906414270401, "rewards/rejected": -0.21624425053596497, "step": 3070 }, { "epoch": 1.61, "learning_rate": 5.501100676595761e-07, "logits/chosen": 0.5630078911781311, "logits/rejected": 0.5682691335678101, "logps/chosen": -267.6597900390625, "logps/rejected": -250.3827667236328, "loss": 1964.8426, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1262790709733963, "rewards/margins": 0.09482350945472717, "rewards/rejected": -0.22110256552696228, "step": 3080 }, { "epoch": 1.62, "learning_rate": 5.358947231631375e-07, "logits/chosen": 0.534908652305603, "logits/rejected": 0.5746644139289856, "logps/chosen": -283.8402404785156, "logps/rejected": -272.68670654296875, "loss": 1792.1418, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0998634472489357, "rewards/margins": 0.1285194605588913, "rewards/rejected": -0.2283829152584076, "step": 3090 }, { "epoch": 1.62, "learning_rate": 5.218433808920884e-07, "logits/chosen": 0.5141295194625854, "logits/rejected": 0.5280352234840393, "logps/chosen": -262.8772888183594, "logps/rejected": -246.84671020507812, "loss": 2068.2863, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.10809364169836044, "rewards/margins": 0.08412571996450424, "rewards/rejected": -0.19221936166286469, "step": 3100 }, { "epoch": 1.62, "eval_logits/chosen": 0.5356869697570801, "eval_logits/rejected": 0.5913118720054626, "eval_logps/chosen": -269.03900146484375, "eval_logps/rejected": -255.10865783691406, "eval_loss": 2029.4173583984375, "eval_rewards/accuracies": 0.6934999823570251, "eval_rewards/chosen": -0.12421557307243347, "eval_rewards/margins": 0.09138190746307373, "eval_rewards/rejected": -0.2155974805355072, "eval_runtime": 416.645, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.2, "step": 3100 }, { "epoch": 1.63, "learning_rate": 5.07957214129464e-07, "logits/chosen": 0.6343733072280884, "logits/rejected": 0.6377061605453491, "logps/chosen": -230.1392059326172, "logps/rejected": -217.2322540283203, "loss": 2110.5152, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.14483976364135742, "rewards/margins": 0.082811638712883, "rewards/rejected": -0.22765140235424042, "step": 3110 }, { "epoch": 1.63, "learning_rate": 4.942373823661928e-07, "logits/chosen": 0.5317670702934265, "logits/rejected": 0.5477628707885742, "logps/chosen": -253.5054931640625, "logps/rejected": -295.16021728515625, "loss": 2379.184, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.13306215405464172, "rewards/margins": 0.05652584508061409, "rewards/rejected": -0.18958799540996552, "step": 3120 }, { "epoch": 1.64, "learning_rate": 4.806850312042782e-07, "logits/chosen": 0.6451593637466431, "logits/rejected": 0.5899637937545776, "logps/chosen": -289.49151611328125, "logps/rejected": -257.98443603515625, "loss": 1992.5604, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.12486691772937775, "rewards/margins": 0.09148009121417999, "rewards/rejected": -0.21634697914123535, "step": 3130 }, { "epoch": 1.64, "learning_rate": 4.6730129226114363e-07, "logits/chosen": 0.5886529684066772, "logits/rejected": 0.5409609079360962, "logps/chosen": -258.1368103027344, "logps/rejected": -249.2908935546875, "loss": 1995.9428, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11228666454553604, "rewards/margins": 0.08399216085672379, "rewards/rejected": -0.19627881050109863, "step": 3140 }, { "epoch": 1.65, "learning_rate": 4.540872830751386e-07, "logits/chosen": 0.5374349355697632, "logits/rejected": 0.5601732134819031, "logps/chosen": -266.9260559082031, "logps/rejected": -266.8581237792969, "loss": 2206.0141, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12373526394367218, "rewards/margins": 0.060657333582639694, "rewards/rejected": -0.18439260125160217, "step": 3150 }, { "epoch": 1.65, "learning_rate": 4.4104410701222703e-07, "logits/chosen": 0.5198964476585388, "logits/rejected": 0.5673514008522034, "logps/chosen": -250.6959228515625, "logps/rejected": -233.34774780273438, "loss": 1759.9059, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11838851869106293, "rewards/margins": 0.12481342256069183, "rewards/rejected": -0.24320194125175476, "step": 3160 }, { "epoch": 1.66, "learning_rate": 4.281728531738563e-07, "logits/chosen": 0.597510814666748, "logits/rejected": 0.6312834620475769, "logps/chosen": -268.5950012207031, "logps/rejected": -250.58059692382812, "loss": 1960.2506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11448697745800018, "rewards/margins": 0.09015806764364243, "rewards/rejected": -0.204645037651062, "step": 3170 }, { "epoch": 1.66, "learning_rate": 4.154745963060197e-07, "logits/chosen": 0.507027268409729, "logits/rejected": 0.5732488632202148, "logps/chosen": -280.98382568359375, "logps/rejected": -286.677734375, "loss": 1965.5633, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12583956122398376, "rewards/margins": 0.10458560287952423, "rewards/rejected": -0.230425164103508, "step": 3180 }, { "epoch": 1.67, "learning_rate": 4.029503967095097e-07, "logits/chosen": 0.4729984402656555, "logits/rejected": 0.5827825665473938, "logps/chosen": -268.3514709472656, "logps/rejected": -247.7762451171875, "loss": 1878.176, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.1029181256890297, "rewards/margins": 0.09399056434631348, "rewards/rejected": -0.19690869748592377, "step": 3190 }, { "epoch": 1.67, "learning_rate": 3.9060130015138863e-07, "logits/chosen": 0.5795052647590637, "logits/rejected": 0.6329609155654907, "logps/chosen": -262.9614562988281, "logps/rejected": -240.7771453857422, "loss": 1977.8852, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1271854192018509, "rewards/margins": 0.08980287611484528, "rewards/rejected": -0.21698825061321259, "step": 3200 }, { "epoch": 1.67, "eval_logits/chosen": 0.5363709926605225, "eval_logits/rejected": 0.5920352935791016, "eval_logps/chosen": -269.10711669921875, "eval_logps/rejected": -255.2016143798828, "eval_loss": 2026.12890625, "eval_rewards/accuracies": 0.6959999799728394, "eval_rewards/chosen": -0.1248970478773117, "eval_rewards/margins": 0.09163003414869308, "eval_rewards/rejected": -0.21652711927890778, "eval_runtime": 416.4729, "eval_samples_per_second": 4.802, "eval_steps_per_second": 1.201, "step": 3200 }, { "epoch": 1.68, "learning_rate": 3.784283377776651e-07, "logits/chosen": 0.6236351728439331, "logits/rejected": 0.630204975605011, "logps/chosen": -267.3162536621094, "logps/rejected": -241.71484375, "loss": 2151.366, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.13855421543121338, "rewards/margins": 0.07831588387489319, "rewards/rejected": -0.21687009930610657, "step": 3210 }, { "epoch": 1.69, "learning_rate": 3.664325260271953e-07, "logits/chosen": 0.586463451385498, "logits/rejected": 0.6219819784164429, "logps/chosen": -240.7449493408203, "logps/rejected": -260.012451171875, "loss": 2145.4057, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.12818796932697296, "rewards/margins": 0.07563059777021408, "rewards/rejected": -0.20381855964660645, "step": 3220 }, { "epoch": 1.69, "learning_rate": 3.5461486654680746e-07, "logits/chosen": 0.5574949979782104, "logits/rejected": 0.6360602378845215, "logps/chosen": -262.3418884277344, "logps/rejected": -256.82366943359375, "loss": 2092.1422, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.10938684642314911, "rewards/margins": 0.08223484456539154, "rewards/rejected": -0.19162169098854065, "step": 3230 }, { "epoch": 1.7, "learning_rate": 3.429763461076677e-07, "logits/chosen": 0.5418500304222107, "logits/rejected": 0.5625206828117371, "logps/chosen": -271.43792724609375, "logps/rejected": -255.0489501953125, "loss": 2035.8221, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.13326099514961243, "rewards/margins": 0.08890596777200699, "rewards/rejected": -0.2221669703722, "step": 3240 }, { "epoch": 1.7, "learning_rate": 3.315179365228824e-07, "logits/chosen": 0.5612285733222961, "logits/rejected": 0.5996168851852417, "logps/chosen": -284.46612548828125, "logps/rejected": -261.4803771972656, "loss": 1981.793, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12592165172100067, "rewards/margins": 0.0976746454834938, "rewards/rejected": -0.22359630465507507, "step": 3250 }, { "epoch": 1.71, "learning_rate": 3.202405945663556e-07, "logits/chosen": 0.5869094729423523, "logits/rejected": 0.5811904668807983, "logps/chosen": -273.99249267578125, "logps/rejected": -274.64410400390625, "loss": 2093.5598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11422860622406006, "rewards/margins": 0.09214137494564056, "rewards/rejected": -0.20636996626853943, "step": 3260 }, { "epoch": 1.71, "learning_rate": 3.09145261892895e-07, "logits/chosen": 0.5232716798782349, "logits/rejected": 0.6522939801216125, "logps/chosen": -266.1856384277344, "logps/rejected": -255.4128875732422, "loss": 1884.3191, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12048964202404022, "rewards/margins": 0.1096540093421936, "rewards/rejected": -0.23014366626739502, "step": 3270 }, { "epoch": 1.72, "learning_rate": 2.982328649595856e-07, "logits/chosen": 0.5434025526046753, "logits/rejected": 0.5516559481620789, "logps/chosen": -261.6842346191406, "logps/rejected": -268.0997619628906, "loss": 2020.6832, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12252092361450195, "rewards/margins": 0.08777900040149689, "rewards/rejected": -0.21029992401599884, "step": 3280 }, { "epoch": 1.72, "learning_rate": 2.8750431494843076e-07, "logits/chosen": 0.5793955326080322, "logits/rejected": 0.5846759676933289, "logps/chosen": -256.3296813964844, "logps/rejected": -258.81512451171875, "loss": 2103.4049, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13012662529945374, "rewards/margins": 0.07379056513309479, "rewards/rejected": -0.20391719043254852, "step": 3290 }, { "epoch": 1.73, "learning_rate": 2.7696050769026954e-07, "logits/chosen": 0.5790996551513672, "logits/rejected": 0.5938167572021484, "logps/chosen": -229.91567993164062, "logps/rejected": -236.54916381835938, "loss": 2123.3787, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12688665091991425, "rewards/margins": 0.07247930765151978, "rewards/rejected": -0.19936595857143402, "step": 3300 }, { "epoch": 1.73, "eval_logits/chosen": 0.5370410680770874, "eval_logits/rejected": 0.5926198363304138, "eval_logps/chosen": -269.0932922363281, "eval_logps/rejected": -255.16659545898438, "eval_loss": 2027.355224609375, "eval_rewards/accuracies": 0.6930000185966492, "eval_rewards/chosen": -0.12475859373807907, "eval_rewards/margins": 0.09141821414232254, "eval_rewards/rejected": -0.2161768227815628, "eval_runtime": 416.6319, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.2, "step": 3300 }, { "epoch": 1.73, "learning_rate": 2.666023235899734e-07, "logits/chosen": 0.5439051389694214, "logits/rejected": 0.638985276222229, "logps/chosen": -249.70217895507812, "logps/rejected": -246.07040405273438, "loss": 1936.2746, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.13094648718833923, "rewards/margins": 0.10491780191659927, "rewards/rejected": -0.23586425185203552, "step": 3310 }, { "epoch": 1.74, "learning_rate": 2.564306275529341e-07, "logits/chosen": 0.5696260929107666, "logits/rejected": 0.6271142959594727, "logps/chosen": -288.08721923828125, "logps/rejected": -263.6356201171875, "loss": 1974.6082, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.11936695873737335, "rewards/margins": 0.10857198387384415, "rewards/rejected": -0.2279389351606369, "step": 3320 }, { "epoch": 1.74, "learning_rate": 2.4644626891284243e-07, "logits/chosen": 0.5715283155441284, "logits/rejected": 0.6530539393424988, "logps/chosen": -245.0167236328125, "logps/rejected": -238.81460571289062, "loss": 2066.4543, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12561528384685516, "rewards/margins": 0.08124671876430511, "rewards/rejected": -0.20686200261116028, "step": 3330 }, { "epoch": 1.75, "learning_rate": 2.3665008136077332e-07, "logits/chosen": 0.5698996186256409, "logits/rejected": 0.6029760837554932, "logps/chosen": -264.3577575683594, "logps/rejected": -271.068115234375, "loss": 2090.4994, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1360047161579132, "rewards/margins": 0.08008682727813721, "rewards/rejected": -0.2160915583372116, "step": 3340 }, { "epoch": 1.75, "learning_rate": 2.2704288287556718e-07, "logits/chosen": 0.5687640309333801, "logits/rejected": 0.5940347909927368, "logps/chosen": -257.6128845214844, "logps/rejected": -248.91015625, "loss": 2136.7539, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.12395147979259491, "rewards/margins": 0.0794539600610733, "rewards/rejected": -0.2034054547548294, "step": 3350 }, { "epoch": 1.76, "learning_rate": 2.1762547565553293e-07, "logits/chosen": 0.5388206839561462, "logits/rejected": 0.5561047792434692, "logps/chosen": -261.9334716796875, "logps/rejected": -260.14556884765625, "loss": 1989.5404, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1216350793838501, "rewards/margins": 0.09191958606243134, "rewards/rejected": -0.21355466544628143, "step": 3360 }, { "epoch": 1.76, "learning_rate": 2.083986460514631e-07, "logits/chosen": 0.5701113343238831, "logits/rejected": 0.6196510195732117, "logps/chosen": -251.4109344482422, "logps/rejected": -252.1616668701172, "loss": 1820.4641, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.12642905116081238, "rewards/margins": 0.10934920608997345, "rewards/rejected": -0.23577824234962463, "step": 3370 }, { "epoch": 1.77, "learning_rate": 1.993631645009747e-07, "logits/chosen": 0.5514119863510132, "logits/rejected": 0.5474542379379272, "logps/chosen": -256.45831298828125, "logps/rejected": -227.55514526367188, "loss": 1845.2707, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10441195964813232, "rewards/margins": 0.1090712919831276, "rewards/rejected": -0.21348324418067932, "step": 3380 }, { "epoch": 1.77, "learning_rate": 1.9051978546417715e-07, "logits/chosen": 0.5202070474624634, "logits/rejected": 0.5690991282463074, "logps/chosen": -260.46600341796875, "logps/rejected": -261.4082946777344, "loss": 1912.7502, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.10940267145633698, "rewards/margins": 0.10180971771478653, "rewards/rejected": -0.2112123966217041, "step": 3390 }, { "epoch": 1.78, "learning_rate": 1.818692473606748e-07, "logits/chosen": 0.5479332208633423, "logits/rejected": 0.5715588331222534, "logps/chosen": -258.5849609375, "logps/rejected": -264.29388427734375, "loss": 1945.4934, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.12341777980327606, "rewards/margins": 0.09188680350780487, "rewards/rejected": -0.21530456840991974, "step": 3400 }, { "epoch": 1.78, "eval_logits/chosen": 0.5352820754051208, "eval_logits/rejected": 0.5908908247947693, "eval_logps/chosen": -269.1009826660156, "eval_logps/rejected": -255.1898651123047, "eval_loss": 2025.7803955078125, "eval_rewards/accuracies": 0.6934999823570251, "eval_rewards/chosen": -0.12483509629964828, "eval_rewards/margins": 0.09157437831163406, "eval_rewards/rejected": -0.21640948951244354, "eval_runtime": 416.572, "eval_samples_per_second": 4.801, "eval_steps_per_second": 1.2, "step": 3400 }, { "epoch": 1.78, "learning_rate": 1.7341227250790989e-07, "logits/chosen": 0.5836583375930786, "logits/rejected": 0.632857084274292, "logps/chosen": -245.8205108642578, "logps/rejected": -252.48471069335938, "loss": 1828.1664, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.10635235160589218, "rewards/margins": 0.11655166000127792, "rewards/rejected": -0.2229039967060089, "step": 3410 }, { "epoch": 1.79, "learning_rate": 1.6514956706084885e-07, "logits/chosen": 0.6221760511398315, "logits/rejected": 0.5567342042922974, "logps/chosen": -266.02239990234375, "logps/rejected": -246.35385131835938, "loss": 1826.1057, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.09994658827781677, "rewards/margins": 0.1130019798874855, "rewards/rejected": -0.21294856071472168, "step": 3420 }, { "epoch": 1.8, "learning_rate": 1.5708182095301867e-07, "logits/chosen": 0.6005284190177917, "logits/rejected": 0.6083909869194031, "logps/chosen": -280.53741455078125, "logps/rejected": -261.88201904296875, "loss": 1851.2512, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.11887475103139877, "rewards/margins": 0.10133900493383408, "rewards/rejected": -0.22021374106407166, "step": 3430 }, { "epoch": 1.8, "learning_rate": 1.4920970783889737e-07, "logits/chosen": 0.5680890083312988, "logits/rejected": 0.5507141351699829, "logps/chosen": -271.96990966796875, "logps/rejected": -241.1654815673828, "loss": 2041.0072, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1112411841750145, "rewards/margins": 0.08822907507419586, "rewards/rejected": -0.19947026669979095, "step": 3440 }, { "epoch": 1.81, "learning_rate": 1.4153388503766492e-07, "logits/chosen": 0.5438860654830933, "logits/rejected": 0.5644111633300781, "logps/chosen": -279.3092346191406, "logps/rejected": -239.37167358398438, "loss": 1966.0102, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.1369594931602478, "rewards/margins": 0.09016064554452896, "rewards/rejected": -0.22712013125419617, "step": 3450 }, { "epoch": 1.81, "learning_rate": 1.340549934783164e-07, "logits/chosen": 0.6110261082649231, "logits/rejected": 0.6002285480499268, "logps/chosen": -255.5424346923828, "logps/rejected": -258.6153259277344, "loss": 1778.4668, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.10746339708566666, "rewards/margins": 0.12720224261283875, "rewards/rejected": -0.23466560244560242, "step": 3460 }, { "epoch": 1.82, "learning_rate": 1.2677365764614452e-07, "logits/chosen": 0.6116484999656677, "logits/rejected": 0.6142521500587463, "logps/chosen": -251.9376983642578, "logps/rejected": -247.8961639404297, "loss": 1943.7922, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.12416845560073853, "rewards/margins": 0.09600269794464111, "rewards/rejected": -0.22017112374305725, "step": 3470 }, { "epoch": 1.82, "learning_rate": 1.196904855305961e-07, "logits/chosen": 0.5488280057907104, "logits/rejected": 0.6329927444458008, "logps/chosen": -261.445068359375, "logps/rejected": -255.37142944335938, "loss": 2087.1572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11578680574893951, "rewards/margins": 0.08292602747678757, "rewards/rejected": -0.19871282577514648, "step": 3480 }, { "epoch": 1.83, "learning_rate": 1.1280606857450387e-07, "logits/chosen": 0.5712449550628662, "logits/rejected": 0.6291993856430054, "logps/chosen": -243.8418731689453, "logps/rejected": -233.1319122314453, "loss": 1809.2799, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.11564090102910995, "rewards/margins": 0.11533119529485703, "rewards/rejected": -0.23097209632396698, "step": 3490 }, { "epoch": 1.83, "learning_rate": 1.0612098162470302e-07, "logits/chosen": 0.5486131906509399, "logits/rejected": 0.6034047603607178, "logps/chosen": -253.94577026367188, "logps/rejected": -243.55593872070312, "loss": 1937.2627, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10992947965860367, "rewards/margins": 0.09977956861257553, "rewards/rejected": -0.2097090482711792, "step": 3500 }, { "epoch": 1.83, "eval_logits/chosen": 0.5346845984458923, "eval_logits/rejected": 0.5903106927871704, "eval_logps/chosen": -269.0877990722656, "eval_logps/rejected": -255.17501831054688, "eval_loss": 2027.823974609375, "eval_rewards/accuracies": 0.6930000185966492, "eval_rewards/chosen": -0.12470405548810959, "eval_rewards/margins": 0.09155706316232681, "eval_rewards/rejected": -0.2162611186504364, "eval_runtime": 416.489, "eval_samples_per_second": 4.802, "eval_steps_per_second": 1.201, "step": 3500 }, { "epoch": 1.84, "learning_rate": 9.96357828840297e-08, "logits/chosen": 0.5791751742362976, "logits/rejected": 0.6535072326660156, "logps/chosen": -262.40301513671875, "logps/rejected": -260.7125549316406, "loss": 1964.6746, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.1026170626282692, "rewards/margins": 0.08860354125499725, "rewards/rejected": -0.19122058153152466, "step": 3510 }, { "epoch": 1.84, "learning_rate": 9.335101386471285e-08, "logits/chosen": 0.5727615356445312, "logits/rejected": 0.5819220542907715, "logps/chosen": -284.75616455078125, "logps/rejected": -250.07083129882812, "loss": 2092.7152, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.12308394908905029, "rewards/margins": 0.08286546170711517, "rewards/rejected": -0.20594939589500427, "step": 3520 }, { "epoch": 1.85, "learning_rate": 8.726719934315648e-08, "logits/chosen": 0.5491209626197815, "logits/rejected": 0.5870348811149597, "logps/chosen": -249.99295043945312, "logps/rejected": -249.68679809570312, "loss": 1912.009, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10659299790859222, "rewards/margins": 0.09919796884059906, "rewards/rejected": -0.20579096674919128, "step": 3530 }, { "epoch": 1.85, "learning_rate": 8.138484731612273e-08, "logits/chosen": 0.6029896140098572, "logits/rejected": 0.6406581997871399, "logps/chosen": -256.2204284667969, "logps/rejected": -229.89990234375, "loss": 1913.6865, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10959267616271973, "rewards/margins": 0.1080545037984848, "rewards/rejected": -0.21764719486236572, "step": 3540 }, { "epoch": 1.86, "learning_rate": 7.57044489583128e-08, "logits/chosen": 0.5283448100090027, "logits/rejected": 0.5756082534790039, "logps/chosen": -266.2628479003906, "logps/rejected": -251.70962524414062, "loss": 2214.9162, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.12659896910190582, "rewards/margins": 0.06956067681312561, "rewards/rejected": -0.19615966081619263, "step": 3550 }, { "epoch": 1.86, "learning_rate": 7.022647858135501e-08, "logits/chosen": 0.5648905038833618, "logits/rejected": 0.5851987600326538, "logps/chosen": -255.75704956054688, "logps/rejected": -235.11666870117188, "loss": 1911.8838, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10966980457305908, "rewards/margins": 0.09904567152261734, "rewards/rejected": -0.20871546864509583, "step": 3560 }, { "epoch": 1.87, "learning_rate": 6.495139359419922e-08, "logits/chosen": 0.5362564921379089, "logits/rejected": 0.629612922668457, "logps/chosen": -303.055419921875, "logps/rejected": -271.39288330078125, "loss": 1875.824, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1098160371184349, "rewards/margins": 0.11708301305770874, "rewards/rejected": -0.22689905762672424, "step": 3570 }, { "epoch": 1.87, "learning_rate": 5.987963446492384e-08, "logits/chosen": 0.5597736239433289, "logits/rejected": 0.5716227889060974, "logps/chosen": -262.5528564453125, "logps/rejected": -251.4736785888672, "loss": 1887.5355, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10405333340167999, "rewards/margins": 0.11675725132226944, "rewards/rejected": -0.22081057727336884, "step": 3580 }, { "epoch": 1.88, "learning_rate": 5.501162468395688e-08, "logits/chosen": 0.5817372798919678, "logits/rejected": 0.5784239768981934, "logps/chosen": -251.4989013671875, "logps/rejected": -250.26522827148438, "loss": 1920.6814, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12673336267471313, "rewards/margins": 0.11053230613470078, "rewards/rejected": -0.2372656762599945, "step": 3590 }, { "epoch": 1.88, "learning_rate": 5.034777072871394e-08, "logits/chosen": 0.5656172037124634, "logits/rejected": 0.6273232102394104, "logps/chosen": -250.90109252929688, "logps/rejected": -256.27178955078125, "loss": 2007.2062, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.1188703402876854, "rewards/margins": 0.09515853226184845, "rewards/rejected": -0.21402888000011444, "step": 3600 }, { "epoch": 1.88, "eval_logits/chosen": 0.5352125763893127, "eval_logits/rejected": 0.5910032391548157, "eval_logps/chosen": -269.0622863769531, "eval_logps/rejected": -255.18426513671875, "eval_loss": 2025.32275390625, "eval_rewards/accuracies": 0.6894999742507935, "eval_rewards/chosen": -0.12444862723350525, "eval_rewards/margins": 0.0919048860669136, "eval_rewards/rejected": -0.21635350584983826, "eval_runtime": 416.4513, "eval_samples_per_second": 4.802, "eval_steps_per_second": 1.201, "step": 3600 }, { "epoch": 1.89, "learning_rate": 4.5888462029658186e-08, "logits/chosen": 0.5575802326202393, "logits/rejected": 0.5975883603096008, "logps/chosen": -251.73623657226562, "logps/rejected": -250.5563201904297, "loss": 1952.7262, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12859514355659485, "rewards/margins": 0.0945589691400528, "rewards/rejected": -0.22315411269664764, "step": 3610 }, { "epoch": 1.89, "learning_rate": 4.163407093778243e-08, "logits/chosen": 0.5034095048904419, "logits/rejected": 0.5554597973823547, "logps/chosen": -264.1334533691406, "logps/rejected": -260.69805908203125, "loss": 2102.8385, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1385246217250824, "rewards/margins": 0.08275660127401352, "rewards/rejected": -0.22128121554851532, "step": 3620 }, { "epoch": 1.9, "learning_rate": 3.7584952693519025e-08, "logits/chosen": 0.5984662175178528, "logits/rejected": 0.5975054502487183, "logps/chosen": -270.71661376953125, "logps/rejected": -260.6340026855469, "loss": 1962.2367, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11859796196222305, "rewards/margins": 0.0891033262014389, "rewards/rejected": -0.20770128071308136, "step": 3630 }, { "epoch": 1.91, "learning_rate": 3.37414453970758e-08, "logits/chosen": 0.5739267468452454, "logits/rejected": 0.577027440071106, "logps/chosen": -249.9311065673828, "logps/rejected": -226.3584442138672, "loss": 2134.9982, "rewards/accuracies": 0.65625, "rewards/chosen": -0.13262119889259338, "rewards/margins": 0.08364128321409225, "rewards/rejected": -0.21626248955726624, "step": 3640 }, { "epoch": 1.91, "learning_rate": 3.0103869980206145e-08, "logits/chosen": 0.6304140090942383, "logits/rejected": 0.63347989320755, "logps/chosen": -239.28915405273438, "logps/rejected": -258.83660888671875, "loss": 2049.8613, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.13652729988098145, "rewards/margins": 0.08251725137233734, "rewards/rejected": -0.2190445363521576, "step": 3650 }, { "epoch": 1.92, "learning_rate": 2.6672530179410183e-08, "logits/chosen": 0.5943226218223572, "logits/rejected": 0.6503596305847168, "logps/chosen": -264.0231018066406, "logps/rejected": -245.2039031982422, "loss": 2003.8398, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.12203504145145416, "rewards/margins": 0.09293092787265778, "rewards/rejected": -0.21496596932411194, "step": 3660 }, { "epoch": 1.92, "learning_rate": 2.3447712510573928e-08, "logits/chosen": 0.6132981777191162, "logits/rejected": 0.6643080115318298, "logps/chosen": -258.1444091796875, "logps/rejected": -241.70986938476562, "loss": 1827.9928, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12529726326465607, "rewards/margins": 0.11168257147073746, "rewards/rejected": -0.23697984218597412, "step": 3670 }, { "epoch": 1.93, "learning_rate": 2.04296862450451e-08, "logits/chosen": 0.5302231907844543, "logits/rejected": 0.5352843999862671, "logps/chosen": -270.13079833984375, "logps/rejected": -240.57931518554688, "loss": 2135.2258, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10986328125, "rewards/margins": 0.07277282327413559, "rewards/rejected": -0.182636097073555, "step": 3680 }, { "epoch": 1.93, "learning_rate": 1.7618703387147495e-08, "logits/chosen": 0.5543524622917175, "logits/rejected": 0.5519607663154602, "logps/chosen": -281.1496276855469, "logps/rejected": -274.3504943847656, "loss": 1951.3572, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10969813168048859, "rewards/margins": 0.09588075429201126, "rewards/rejected": -0.20557889342308044, "step": 3690 }, { "epoch": 1.94, "learning_rate": 1.501499865314171e-08, "logits/chosen": 0.5777779817581177, "logits/rejected": 0.6056709289550781, "logps/chosen": -258.98333740234375, "logps/rejected": -245.7554473876953, "loss": 2076.715, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.11231188476085663, "rewards/margins": 0.09381435066461563, "rewards/rejected": -0.20612624287605286, "step": 3700 }, { "epoch": 1.94, "eval_logits/chosen": 0.5358251333236694, "eval_logits/rejected": 0.5913307666778564, "eval_logps/chosen": -269.0487365722656, "eval_logps/rejected": -255.13833618164062, "eval_loss": 2027.4857177734375, "eval_rewards/accuracies": 0.6919999718666077, "eval_rewards/chosen": -0.12431324273347855, "eval_rewards/margins": 0.09158134460449219, "eval_rewards/rejected": -0.21589456498622894, "eval_runtime": 416.7132, "eval_samples_per_second": 4.799, "eval_steps_per_second": 1.2, "step": 3700 }, { "epoch": 1.94, "learning_rate": 1.2618789451623314e-08, "logits/chosen": 0.5645478963851929, "logits/rejected": 0.6109569072723389, "logps/chosen": -224.93197631835938, "logps/rejected": -236.52059936523438, "loss": 2014.5852, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.12038487195968628, "rewards/margins": 0.08836686611175537, "rewards/rejected": -0.20875172317028046, "step": 3710 }, { "epoch": 1.95, "learning_rate": 1.0430275865371265e-08, "logits/chosen": 0.5508732795715332, "logits/rejected": 0.6115376353263855, "logps/chosen": -280.9700927734375, "logps/rejected": -276.9327087402344, "loss": 2099.5396, "rewards/accuracies": 0.625, "rewards/chosen": -0.12365047633647919, "rewards/margins": 0.08925069868564606, "rewards/rejected": -0.21290118992328644, "step": 3720 }, { "epoch": 1.95, "learning_rate": 8.449640634639878e-09, "logits/chosen": 0.5355272889137268, "logits/rejected": 0.5811390280723572, "logps/chosen": -234.78927612304688, "logps/rejected": -228.4997100830078, "loss": 2043.5014, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1230451837182045, "rewards/margins": 0.08070604503154755, "rewards/rejected": -0.20375123620033264, "step": 3730 }, { "epoch": 1.96, "learning_rate": 6.677049141901315e-09, "logits/chosen": 0.5882354974746704, "logits/rejected": 0.572884738445282, "logps/chosen": -238.9161834716797, "logps/rejected": -247.5298309326172, "loss": 2155.4453, "rewards/accuracies": 0.65625, "rewards/chosen": -0.13735826313495636, "rewards/margins": 0.06890521943569183, "rewards/rejected": -0.2062634974718094, "step": 3740 }, { "epoch": 1.96, "learning_rate": 5.112649398034686e-09, "logits/chosen": 0.6161108016967773, "logits/rejected": 0.6904915571212769, "logps/chosen": -284.59112548828125, "logps/rejected": -254.5317840576172, "loss": 2025.8148, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.125542551279068, "rewards/margins": 0.1034855991601944, "rewards/rejected": -0.2290281355381012, "step": 3750 }, { "epoch": 1.97, "learning_rate": 3.756572029968708e-09, "logits/chosen": 0.5819270610809326, "logits/rejected": 0.552914023399353, "logps/chosen": -255.23886108398438, "logps/rejected": -249.76144409179688, "loss": 1779.8414, "rewards/accuracies": 0.75, "rewards/chosen": -0.11097989976406097, "rewards/margins": 0.1095919981598854, "rewards/rejected": -0.22057190537452698, "step": 3760 }, { "epoch": 1.97, "learning_rate": 2.6089302697732133e-09, "logits/chosen": 0.5825963020324707, "logits/rejected": 0.5435997843742371, "logps/chosen": -250.9562225341797, "logps/rejected": -227.83010864257812, "loss": 1853.0563, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.11256451904773712, "rewards/margins": 0.10355798900127411, "rewards/rejected": -0.21612253785133362, "step": 3770 }, { "epoch": 1.98, "learning_rate": 1.6698199452053199e-09, "logits/chosen": 0.6074197292327881, "logits/rejected": 0.6451854705810547, "logps/chosen": -269.6044616699219, "logps/rejected": -231.6178436279297, "loss": 1905.3814, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10596567392349243, "rewards/margins": 0.099820576608181, "rewards/rejected": -0.20578625798225403, "step": 3780 }, { "epoch": 1.98, "learning_rate": 9.393194717061127e-10, "logits/chosen": 0.5966477394104004, "logits/rejected": 0.57940673828125, "logps/chosen": -261.906982421875, "logps/rejected": -243.7214813232422, "loss": 2099.1896, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12468767166137695, "rewards/margins": 0.0848483294248581, "rewards/rejected": -0.20953598618507385, "step": 3790 }, { "epoch": 1.99, "learning_rate": 4.1748984585560094e-10, "logits/chosen": 0.5209355354309082, "logits/rejected": 0.6011817455291748, "logps/chosen": -257.47882080078125, "logps/rejected": -253.18017578125, "loss": 2055.2201, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.12113461643457413, "rewards/margins": 0.09108567237854004, "rewards/rejected": -0.21222028136253357, "step": 3800 }, { "epoch": 1.99, "eval_logits/chosen": 0.5346859693527222, "eval_logits/rejected": 0.5902337431907654, "eval_logps/chosen": -269.0542907714844, "eval_logps/rejected": -255.1454620361328, "eval_loss": 2027.8082275390625, "eval_rewards/accuracies": 0.6919999718666077, "eval_rewards/chosen": -0.12436838448047638, "eval_rewards/margins": 0.09159712493419647, "eval_rewards/rejected": -0.21596547961235046, "eval_runtime": 416.5485, "eval_samples_per_second": 4.801, "eval_steps_per_second": 1.2, "step": 3800 }, { "epoch": 1.99, "learning_rate": 1.0437464027707179e-10, "logits/chosen": 0.5776988863945007, "logits/rejected": 0.6123504638671875, "logps/chosen": -265.8362121582031, "logps/rejected": -237.8904571533203, "loss": 2055.609, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.1267283707857132, "rewards/margins": 0.08532574027776718, "rewards/rejected": -0.21205410361289978, "step": 3810 }, { "epoch": 2.0, "learning_rate": 0.0, "logits/chosen": 0.5118510127067566, "logits/rejected": 0.5845987200737, "logps/chosen": -274.1031188964844, "logps/rejected": -256.99688720703125, "loss": 2139.6068, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.13095693290233612, "rewards/margins": 0.07750894129276276, "rewards/rejected": -0.20846585929393768, "step": 3820 }, { "epoch": 2.0, "step": 3820, "total_flos": 0.0, "train_loss": 2099.8451463309884, "train_runtime": 42790.3459, "train_samples_per_second": 1.429, "train_steps_per_second": 0.089 } ], "logging_steps": 10, "max_steps": 3820, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }