diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6000 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9994765768123528, + "eval_steps": 100, + "global_step": 3820, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.3089005235602096e-08, + "logits/chosen": 0.896942138671875, + "logits/rejected": 0.9175108075141907, + "logps/chosen": -192.32028198242188, + "logps/rejected": -193.69876098632812, + "loss": 2500.0, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 1.3089005235602095e-07, + "logits/chosen": 0.903715968132019, + "logits/rejected": 0.9309377670288086, + "logps/chosen": -253.598876953125, + "logps/rejected": -228.25482177734375, + "loss": 2504.6897, + "rewards/accuracies": 0.3819444477558136, + "rewards/chosen": -0.0001807510998332873, + "rewards/margins": -0.0004412428825162351, + "rewards/rejected": 0.00026049179723486304, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 2.617801047120419e-07, + "logits/chosen": 0.8256899118423462, + "logits/rejected": 0.9293961524963379, + "logps/chosen": -252.84963989257812, + "logps/rejected": -214.4913330078125, + "loss": 2511.0686, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.0004701187717728317, + "rewards/margins": -0.0010407656664028764, + "rewards/rejected": 0.0005706468946300447, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 3.926701570680629e-07, + "logits/chosen": 0.8985889554023743, + "logits/rejected": 0.8785662651062012, + "logps/chosen": -236.40536499023438, + "logps/rejected": -219.20285034179688, + "loss": 2494.8072, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0004474873130675405, + "rewards/margins": 0.0005769692361354828, + "rewards/rejected": -0.00012948190851602703, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 5.235602094240838e-07, + "logits/chosen": 0.819919228553772, + "logits/rejected": 0.9144619703292847, + "logps/chosen": -252.99588012695312, + "logps/rejected": -225.9224853515625, + "loss": 2504.7604, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.0003411176148802042, + "rewards/margins": -0.0004235326196067035, + "rewards/rejected": 8.241502655437216e-05, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 6.544502617801048e-07, + "logits/chosen": 0.7974398136138916, + "logits/rejected": 0.8803712725639343, + "logps/chosen": -254.3247528076172, + "logps/rejected": -243.318603515625, + "loss": 2498.3947, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.00023676609271205962, + "rewards/margins": 0.0002204025659011677, + "rewards/rejected": -0.00045716846943832934, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 7.853403141361258e-07, + "logits/chosen": 0.8626053929328918, + "logits/rejected": 0.8485649824142456, + "logps/chosen": -262.6585693359375, + "logps/rejected": -248.63272094726562, + "loss": 2500.4902, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.0002292692952323705, + "rewards/margins": 2.0124425645917654e-05, + "rewards/rejected": -0.0002493937499821186, + "step": 60 + }, + { + "epoch": 0.04, + "learning_rate": 9.162303664921466e-07, + "logits/chosen": 0.8897444605827332, + "logits/rejected": 0.8922082185745239, + "logps/chosen": -232.531005859375, + "logps/rejected": -234.0869903564453, + "loss": 2496.8041, + "rewards/accuracies": 0.53125, + "rewards/chosen": 3.7896970752626657e-06, + "rewards/margins": 0.00037148987757973373, + "rewards/rejected": -0.00036770018050447106, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 1.0471204188481676e-06, + "logits/chosen": 0.8788009881973267, + "logits/rejected": 0.8891068696975708, + "logps/chosen": -242.5009765625, + "logps/rejected": -229.5125732421875, + "loss": 2508.6898, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -0.0007867829990573227, + "rewards/margins": -0.0008174808463081717, + "rewards/rejected": 3.069788363063708e-05, + "step": 80 + }, + { + "epoch": 0.05, + "learning_rate": 1.1780104712041885e-06, + "logits/chosen": 0.8606799840927124, + "logits/rejected": 0.9575719833374023, + "logps/chosen": -232.0597686767578, + "logps/rejected": -218.4732666015625, + "loss": 2496.8559, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0003882866003550589, + "rewards/margins": 0.000359431782271713, + "rewards/rejected": 2.8854870834038593e-05, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 1.3089005235602096e-06, + "logits/chosen": 0.8833224177360535, + "logits/rejected": 0.8661258816719055, + "logps/chosen": -245.799072265625, + "logps/rejected": -249.2645721435547, + "loss": 2496.843, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0005279771285131574, + "rewards/margins": 0.00037464461638592184, + "rewards/rejected": -0.0009026216575875878, + "step": 100 + }, + { + "epoch": 0.05, + "eval_logits/chosen": 0.8318074345588684, + "eval_logits/rejected": 0.8888298273086548, + "eval_logps/chosen": -256.65057373046875, + "eval_logps/rejected": -233.56494140625, + "eval_loss": 2502.266845703125, + "eval_rewards/accuracies": 0.5005000233650208, + "eval_rewards/chosen": -0.0003313073539175093, + "eval_rewards/margins": -0.00017098072567023337, + "eval_rewards/rejected": -0.00016032661369536072, + "eval_runtime": 416.835, + "eval_samples_per_second": 4.798, + "eval_steps_per_second": 1.2, + "step": 100 + }, + { + "epoch": 0.06, + "learning_rate": 1.4397905759162306e-06, + "logits/chosen": 0.9012953042984009, + "logits/rejected": 0.8766192197799683, + "logps/chosen": -229.46292114257812, + "logps/rejected": -210.2642364501953, + "loss": 2501.3449, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00010574392217677087, + "rewards/margins": -7.803810149198398e-05, + "rewards/rejected": -2.7705809770850465e-05, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 1.5706806282722515e-06, + "logits/chosen": 0.8108441233634949, + "logits/rejected": 0.8906086087226868, + "logps/chosen": -273.14385986328125, + "logps/rejected": -259.1924133300781, + "loss": 2494.2678, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.00021679741621483117, + "rewards/margins": 0.0006280258530750871, + "rewards/rejected": -0.00041122836410067976, + "step": 120 + }, + { + "epoch": 0.07, + "learning_rate": 1.7015706806282726e-06, + "logits/chosen": 0.8712674975395203, + "logits/rejected": 0.9049458503723145, + "logps/chosen": -277.8616943359375, + "logps/rejected": -222.53662109375, + "loss": 2489.5006, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.0006732499459758401, + "rewards/margins": 0.001107201213017106, + "rewards/rejected": -0.0004339513252489269, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 1.8324607329842933e-06, + "logits/chosen": 0.7955681085586548, + "logits/rejected": 0.8811987638473511, + "logps/chosen": -248.83865356445312, + "logps/rejected": -246.317138671875, + "loss": 2504.0979, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.00031032637343741953, + "rewards/margins": -0.00034084441722370684, + "rewards/rejected": 3.0518032872350886e-05, + "step": 140 + }, + { + "epoch": 0.08, + "learning_rate": 1.9633507853403143e-06, + "logits/chosen": 0.7933157086372375, + "logits/rejected": 0.8591764569282532, + "logps/chosen": -257.7363586425781, + "logps/rejected": -217.54580688476562, + "loss": 2507.8082, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.0007291415822692215, + "rewards/margins": -0.0007318807765841484, + "rewards/rejected": 2.739173851296073e-06, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 2.094240837696335e-06, + "logits/chosen": 0.8041954040527344, + "logits/rejected": 0.8887465596199036, + "logps/chosen": -276.43304443359375, + "logps/rejected": -250.4193572998047, + "loss": 2504.2807, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00010556764755165204, + "rewards/margins": -0.0003692187019623816, + "rewards/rejected": 0.00026365104713477194, + "step": 160 + }, + { + "epoch": 0.09, + "learning_rate": 2.2251308900523565e-06, + "logits/chosen": 0.8059272766113281, + "logits/rejected": 0.8950363993644714, + "logps/chosen": -274.240234375, + "logps/rejected": -247.8701171875, + "loss": 2501.8248, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.001084670191630721, + "rewards/margins": -0.0001141707762144506, + "rewards/rejected": -0.0009704994154162705, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 2.356020942408377e-06, + "logits/chosen": 0.8783141374588013, + "logits/rejected": 0.8253491520881653, + "logps/chosen": -242.3585968017578, + "logps/rejected": -221.0929718017578, + "loss": 2496.3063, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.0004814372514374554, + "rewards/margins": 0.0004220888367854059, + "rewards/rejected": -0.0009035261464305222, + "step": 180 + }, + { + "epoch": 0.1, + "learning_rate": 2.4869109947643982e-06, + "logits/chosen": 0.8767743110656738, + "logits/rejected": 0.8822822570800781, + "logps/chosen": -246.2511444091797, + "logps/rejected": -224.4364471435547, + "loss": 2498.3156, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.00032182232826016843, + "rewards/margins": 0.00022237170196603984, + "rewards/rejected": 9.945056081051007e-05, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 2.617801047120419e-06, + "logits/chosen": 0.855573296546936, + "logits/rejected": 0.9106731414794922, + "logps/chosen": -258.25885009765625, + "logps/rejected": -236.6140594482422, + "loss": 2499.2807, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.00018535128037910908, + "rewards/margins": 0.0001189738031825982, + "rewards/rejected": -0.0003043250762857497, + "step": 200 + }, + { + "epoch": 0.1, + "eval_logits/chosen": 0.8310006856918335, + "eval_logits/rejected": 0.8882209062576294, + "eval_logps/chosen": -256.6106262207031, + "eval_logps/rejected": -233.5994873046875, + "eval_loss": 2494.83544921875, + "eval_rewards/accuracies": 0.5189999938011169, + "eval_rewards/chosen": 6.786447193007916e-05, + "eval_rewards/margins": 0.0005738017498515546, + "eval_rewards/rejected": -0.0005059372633695602, + "eval_runtime": 416.4863, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 1.201, + "step": 200 + }, + { + "epoch": 0.11, + "learning_rate": 2.74869109947644e-06, + "logits/chosen": 0.9517833590507507, + "logits/rejected": 0.910740852355957, + "logps/chosen": -244.80032348632812, + "logps/rejected": -232.45321655273438, + "loss": 2500.5746, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -9.258640056941658e-05, + "rewards/margins": -6.444106020353502e-06, + "rewards/rejected": -8.61423322930932e-05, + "step": 210 + }, + { + "epoch": 0.12, + "learning_rate": 2.8795811518324613e-06, + "logits/chosen": 0.8369059562683105, + "logits/rejected": 0.8937622904777527, + "logps/chosen": -267.40948486328125, + "logps/rejected": -209.13290405273438, + "loss": 2487.1783, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0004562476242426783, + "rewards/margins": 0.0013524172827601433, + "rewards/rejected": -0.0008961696876212955, + "step": 220 + }, + { + "epoch": 0.12, + "learning_rate": 3.010471204188482e-06, + "logits/chosen": 0.8930699229240417, + "logits/rejected": 0.9343907237052917, + "logps/chosen": -258.6376037597656, + "logps/rejected": -221.6857452392578, + "loss": 2493.877, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 0.0002062669227598235, + "rewards/margins": 0.0006668218411505222, + "rewards/rejected": -0.00046055493294261396, + "step": 230 + }, + { + "epoch": 0.13, + "learning_rate": 3.141361256544503e-06, + "logits/chosen": 0.9376400113105774, + "logits/rejected": 0.8995400667190552, + "logps/chosen": -228.9315185546875, + "logps/rejected": -242.112548828125, + "loss": 2480.1844, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0004402367048896849, + "rewards/margins": 0.002044759690761566, + "rewards/rejected": -0.001604523160494864, + "step": 240 + }, + { + "epoch": 0.13, + "learning_rate": 3.2722513089005235e-06, + "logits/chosen": 0.9079924821853638, + "logits/rejected": 0.8767238855361938, + "logps/chosen": -242.71121215820312, + "logps/rejected": -233.9228973388672, + "loss": 2493.2939, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -6.936644058441743e-05, + "rewards/margins": 0.0007372990949079394, + "rewards/rejected": -0.0008066653972491622, + "step": 250 + }, + { + "epoch": 0.14, + "learning_rate": 3.403141361256545e-06, + "logits/chosen": 0.8814166784286499, + "logits/rejected": 0.9410937428474426, + "logps/chosen": -236.88095092773438, + "logps/rejected": -223.930419921875, + "loss": 2496.3828, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.00017086375737562776, + "rewards/margins": 0.0004164519195910543, + "rewards/rejected": -0.0005873156478628516, + "step": 260 + }, + { + "epoch": 0.14, + "learning_rate": 3.534031413612566e-06, + "logits/chosen": 0.8372557759284973, + "logits/rejected": 0.8741558194160461, + "logps/chosen": -212.303466796875, + "logps/rejected": -239.15158081054688, + "loss": 2483.2088, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.00031574201420880854, + "rewards/margins": 0.0017516377847641706, + "rewards/rejected": -0.0020673798862844706, + "step": 270 + }, + { + "epoch": 0.15, + "learning_rate": 3.6649214659685865e-06, + "logits/chosen": 0.8835927844047546, + "logits/rejected": 0.9312320947647095, + "logps/chosen": -250.3417510986328, + "logps/rejected": -263.51531982421875, + "loss": 2492.1223, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0005325328675098717, + "rewards/margins": 0.000863347842823714, + "rewards/rejected": -0.0013958807103335857, + "step": 280 + }, + { + "epoch": 0.15, + "learning_rate": 3.7958115183246074e-06, + "logits/chosen": 0.8323150873184204, + "logits/rejected": 0.8896921277046204, + "logps/chosen": -250.1656951904297, + "logps/rejected": -234.757568359375, + "loss": 2483.249, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.00013801059685647488, + "rewards/margins": 0.0017679758602753282, + "rewards/rejected": -0.001905986457131803, + "step": 290 + }, + { + "epoch": 0.16, + "learning_rate": 3.926701570680629e-06, + "logits/chosen": 0.8511127233505249, + "logits/rejected": 0.8209661245346069, + "logps/chosen": -273.7172546386719, + "logps/rejected": -250.82748413085938, + "loss": 2477.7609, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0011967200553044677, + "rewards/margins": 0.0023162723518908024, + "rewards/rejected": -0.0035129922907799482, + "step": 300 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 0.8318725824356079, + "eval_logits/rejected": 0.8892252445220947, + "eval_logps/chosen": -256.7284851074219, + "eval_logps/rejected": -233.8547821044922, + "eval_loss": 2481.50146484375, + "eval_rewards/accuracies": 0.559499979019165, + "eval_rewards/chosen": -0.001110685057938099, + "eval_rewards/margins": 0.0019479345064610243, + "eval_rewards/rejected": -0.003058619564399123, + "eval_runtime": 416.6935, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 1.2, + "step": 300 + }, + { + "epoch": 0.16, + "learning_rate": 4.05759162303665e-06, + "logits/chosen": 0.7432538866996765, + "logits/rejected": 0.817090630531311, + "logps/chosen": -274.85931396484375, + "logps/rejected": -236.4228057861328, + "loss": 2469.5742, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.001123375492170453, + "rewards/margins": 0.003155052661895752, + "rewards/rejected": -0.004278427921235561, + "step": 310 + }, + { + "epoch": 0.17, + "learning_rate": 4.18848167539267e-06, + "logits/chosen": 0.9147623777389526, + "logits/rejected": 0.9334859848022461, + "logps/chosen": -233.425537109375, + "logps/rejected": -214.6092071533203, + "loss": 2457.892, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.00017569802002981305, + "rewards/margins": 0.004348465241491795, + "rewards/rejected": -0.004172767512500286, + "step": 320 + }, + { + "epoch": 0.17, + "learning_rate": 4.319371727748692e-06, + "logits/chosen": 0.8749006390571594, + "logits/rejected": 0.9252738952636719, + "logps/chosen": -247.8308563232422, + "logps/rejected": -218.9490203857422, + "loss": 2463.7824, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0017152890795841813, + "rewards/margins": 0.0037990615237504244, + "rewards/rejected": -0.0055143507197499275, + "step": 330 + }, + { + "epoch": 0.18, + "learning_rate": 4.450261780104713e-06, + "logits/chosen": 0.8781224489212036, + "logits/rejected": 0.9259663820266724, + "logps/chosen": -253.4806365966797, + "logps/rejected": -239.67434692382812, + "loss": 2474.3055, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0034939595498144627, + "rewards/margins": 0.0027302266098558903, + "rewards/rejected": -0.00622418662533164, + "step": 340 + }, + { + "epoch": 0.18, + "learning_rate": 4.5811518324607335e-06, + "logits/chosen": 0.7855554223060608, + "logits/rejected": 0.9314893484115601, + "logps/chosen": -255.0915985107422, + "logps/rejected": -206.8708038330078, + "loss": 2432.458, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0009129707468673587, + "rewards/margins": 0.007046517916023731, + "rewards/rejected": -0.007959488779306412, + "step": 350 + }, + { + "epoch": 0.19, + "learning_rate": 4.712041884816754e-06, + "logits/chosen": 0.8957219123840332, + "logits/rejected": 0.8874330520629883, + "logps/chosen": -257.02764892578125, + "logps/rejected": -230.2834014892578, + "loss": 2441.5672, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.001695218845270574, + "rewards/margins": 0.006147631909698248, + "rewards/rejected": -0.007842850871384144, + "step": 360 + }, + { + "epoch": 0.19, + "learning_rate": 4.842931937172775e-06, + "logits/chosen": 0.9125442504882812, + "logits/rejected": 0.8982815742492676, + "logps/chosen": -230.16940307617188, + "logps/rejected": -211.3495635986328, + "loss": 2439.5756, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.004968739114701748, + "rewards/margins": 0.006435071583837271, + "rewards/rejected": -0.011403810232877731, + "step": 370 + }, + { + "epoch": 0.2, + "learning_rate": 4.9738219895287965e-06, + "logits/chosen": 0.9134615063667297, + "logits/rejected": 0.8667083978652954, + "logps/chosen": -267.7635192871094, + "logps/rejected": -219.404541015625, + "loss": 2438.1262, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.004255574196577072, + "rewards/margins": 0.006601777859032154, + "rewards/rejected": -0.0108573529869318, + "step": 380 + }, + { + "epoch": 0.2, + "learning_rate": 4.999933200062888e-06, + "logits/chosen": 0.8681972622871399, + "logits/rejected": 0.8684479594230652, + "logps/chosen": -253.1089324951172, + "logps/rejected": -232.1811981201172, + "loss": 2414.8473, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0038361139595508575, + "rewards/margins": 0.009047028608620167, + "rewards/rejected": -0.012883143499493599, + "step": 390 + }, + { + "epoch": 0.21, + "learning_rate": 4.999661831436499e-06, + "logits/chosen": 0.9156022071838379, + "logits/rejected": 0.9197471737861633, + "logps/chosen": -260.40093994140625, + "logps/rejected": -238.50961303710938, + "loss": 2428.4195, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.005848343018442392, + "rewards/margins": 0.007789201103150845, + "rewards/rejected": -0.013637542724609375, + "step": 400 + }, + { + "epoch": 0.21, + "eval_logits/chosen": 0.8404272794723511, + "eval_logits/rejected": 0.8983384966850281, + "eval_logps/chosen": -257.29510498046875, + "eval_logps/rejected": -235.11265563964844, + "eval_loss": 2419.1044921875, + "eval_rewards/accuracies": 0.6495000123977661, + "eval_rewards/chosen": -0.006776793394237757, + "eval_rewards/margins": 0.008860657922923565, + "eval_rewards/rejected": -0.01563744992017746, + "eval_runtime": 416.4578, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 1.201, + "step": 400 + }, + { + "epoch": 0.21, + "learning_rate": 4.999181741766532e-06, + "logits/chosen": 0.8992105722427368, + "logits/rejected": 0.8969219923019409, + "logps/chosen": -252.7702178955078, + "logps/rejected": -249.8018798828125, + "loss": 2438.1242, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.007479649968445301, + "rewards/margins": 0.006951476447284222, + "rewards/rejected": -0.014431129209697247, + "step": 410 + }, + { + "epoch": 0.22, + "learning_rate": 4.9984929711403395e-06, + "logits/chosen": 0.913814902305603, + "logits/rejected": 0.9069592356681824, + "logps/chosen": -255.3079833984375, + "logps/rejected": -251.0774383544922, + "loss": 2417.1992, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.011085378006100655, + "rewards/margins": 0.009236546233296394, + "rewards/rejected": -0.020321926102042198, + "step": 420 + }, + { + "epoch": 0.23, + "learning_rate": 4.997595577070068e-06, + "logits/chosen": 0.8943805694580078, + "logits/rejected": 0.8994030952453613, + "logps/chosen": -235.75009155273438, + "logps/rejected": -232.7864990234375, + "loss": 2429.1924, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.012942090630531311, + "rewards/margins": 0.007976246066391468, + "rewards/rejected": -0.020918335765600204, + "step": 430 + }, + { + "epoch": 0.23, + "learning_rate": 4.996489634487865e-06, + "logits/chosen": 0.8081871867179871, + "logits/rejected": 0.8945296406745911, + "logps/chosen": -231.6891632080078, + "logps/rejected": -253.6363067626953, + "loss": 2484.0605, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.012701654806733131, + "rewards/margins": 0.003084682859480381, + "rewards/rejected": -0.015786338597536087, + "step": 440 + }, + { + "epoch": 0.24, + "learning_rate": 4.995175235739619e-06, + "logits/chosen": 0.8565770983695984, + "logits/rejected": 0.8623224496841431, + "logps/chosen": -264.3777770996094, + "logps/rejected": -267.23712158203125, + "loss": 2392.5188, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.011126170866191387, + "rewards/margins": 0.011980591341853142, + "rewards/rejected": -0.023106763139367104, + "step": 450 + }, + { + "epoch": 0.24, + "learning_rate": 4.9936524905772466e-06, + "logits/chosen": 0.78350430727005, + "logits/rejected": 0.8708200454711914, + "logps/chosen": -267.3443298339844, + "logps/rejected": -236.05459594726562, + "loss": 2404.3129, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.015727100893855095, + "rewards/margins": 0.010795451700687408, + "rewards/rejected": -0.026522550731897354, + "step": 460 + }, + { + "epoch": 0.25, + "learning_rate": 4.991921526149529e-06, + "logits/chosen": 0.9162321090698242, + "logits/rejected": 0.9280640482902527, + "logps/chosen": -256.3532409667969, + "logps/rejected": -247.89315795898438, + "loss": 2386.3984, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.01587381586432457, + "rewards/margins": 0.01281227171421051, + "rewards/rejected": -0.02868608757853508, + "step": 470 + }, + { + "epoch": 0.25, + "learning_rate": 4.9899824869915e-06, + "logits/chosen": 0.8048622012138367, + "logits/rejected": 0.8283928036689758, + "logps/chosen": -246.1285858154297, + "logps/rejected": -252.7605743408203, + "loss": 2396.8398, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.021037602797150612, + "rewards/margins": 0.012100132182240486, + "rewards/rejected": -0.0331377312541008, + "step": 480 + }, + { + "epoch": 0.26, + "learning_rate": 4.987835535012371e-06, + "logits/chosen": 0.8453197479248047, + "logits/rejected": 0.86089026927948, + "logps/chosen": -240.1754913330078, + "logps/rejected": -229.0949249267578, + "loss": 2350.6213, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.019980577751994133, + "rewards/margins": 0.01681477203965187, + "rewards/rejected": -0.03679535537958145, + "step": 490 + }, + { + "epoch": 0.26, + "learning_rate": 4.985480849482012e-06, + "logits/chosen": 0.8507258296012878, + "logits/rejected": 0.8831195831298828, + "logps/chosen": -264.3097839355469, + "logps/rejected": -267.3841552734375, + "loss": 2296.8842, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.019409244880080223, + "rewards/margins": 0.023494381457567215, + "rewards/rejected": -0.04290362820029259, + "step": 500 + }, + { + "epoch": 0.26, + "eval_logits/chosen": 0.8214389681816101, + "eval_logits/rejected": 0.8805551528930664, + "eval_logps/chosen": -259.0124206542969, + "eval_logps/rejected": -237.73793029785156, + "eval_loss": 2349.435791015625, + "eval_rewards/accuracies": 0.656499981880188, + "eval_rewards/chosen": -0.023950034752488136, + "eval_rewards/margins": 0.017940117046236992, + "eval_rewards/rejected": -0.04189015179872513, + "eval_runtime": 416.5178, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 1.2, + "step": 500 + }, + { + "epoch": 0.27, + "learning_rate": 4.98291862701599e-06, + "logits/chosen": 0.8344039916992188, + "logits/rejected": 0.8795874714851379, + "logps/chosen": -254.190673828125, + "logps/rejected": -214.09396362304688, + "loss": 2307.1967, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02996075712144375, + "rewards/margins": 0.022381700575351715, + "rewards/rejected": -0.052342455834150314, + "step": 510 + }, + { + "epoch": 0.27, + "learning_rate": 4.980149081559142e-06, + "logits/chosen": 0.8595240712165833, + "logits/rejected": 0.903663158416748, + "logps/chosen": -237.7533416748047, + "logps/rejected": -241.7561492919922, + "loss": 2330.1758, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02604847028851509, + "rewards/margins": 0.020816484466195107, + "rewards/rejected": -0.04686495289206505, + "step": 520 + }, + { + "epoch": 0.28, + "learning_rate": 4.977172444367718e-06, + "logits/chosen": 0.8232777714729309, + "logits/rejected": 0.8955798149108887, + "logps/chosen": -248.8101806640625, + "logps/rejected": -226.21786499023438, + "loss": 2284.3357, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0257116612046957, + "rewards/margins": 0.026299094781279564, + "rewards/rejected": -0.052010755985975266, + "step": 530 + }, + { + "epoch": 0.28, + "learning_rate": 4.9739889639900655e-06, + "logits/chosen": 0.8850505948066711, + "logits/rejected": 0.9008530378341675, + "logps/chosen": -260.18963623046875, + "logps/rejected": -228.6940155029297, + "loss": 2314.4127, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.028372693806886673, + "rewards/margins": 0.02354586310684681, + "rewards/rejected": -0.051918547600507736, + "step": 540 + }, + { + "epoch": 0.29, + "learning_rate": 4.9705989062458805e-06, + "logits/chosen": 0.8566417694091797, + "logits/rejected": 0.8558026552200317, + "logps/chosen": -242.9883270263672, + "logps/rejected": -247.76370239257812, + "loss": 2339.4164, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03327028453350067, + "rewards/margins": 0.02076330967247486, + "rewards/rejected": -0.054033588618040085, + "step": 550 + }, + { + "epoch": 0.29, + "learning_rate": 4.967002554204009e-06, + "logits/chosen": 0.7901058793067932, + "logits/rejected": 0.8357075452804565, + "logps/chosen": -263.3518371582031, + "logps/rejected": -250.97958374023438, + "loss": 2302.1379, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.04014524444937706, + "rewards/margins": 0.025143718346953392, + "rewards/rejected": -0.0652889683842659, + "step": 560 + }, + { + "epoch": 0.3, + "learning_rate": 4.963200208158811e-06, + "logits/chosen": 0.8461757898330688, + "logits/rejected": 0.9372328519821167, + "logps/chosen": -223.61373901367188, + "logps/rejected": -212.1987762451172, + "loss": 2320.3783, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.04337463900446892, + "rewards/margins": 0.021725038066506386, + "rewards/rejected": -0.06509967893362045, + "step": 570 + }, + { + "epoch": 0.3, + "learning_rate": 4.959192185605089e-06, + "logits/chosen": 0.7897135615348816, + "logits/rejected": 0.9126697778701782, + "logps/chosen": -294.5254821777344, + "logps/rejected": -232.0474395751953, + "loss": 2167.2896, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.03270890563726425, + "rewards/margins": 0.041936445981264114, + "rewards/rejected": -0.07464535534381866, + "step": 580 + }, + { + "epoch": 0.31, + "learning_rate": 4.95497882121157e-06, + "logits/chosen": 0.8067277669906616, + "logits/rejected": 0.8418477773666382, + "logps/chosen": -240.95639038085938, + "logps/rejected": -217.8979949951172, + "loss": 2249.4051, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04393559694290161, + "rewards/margins": 0.03141217678785324, + "rewards/rejected": -0.07534776628017426, + "step": 590 + }, + { + "epoch": 0.31, + "learning_rate": 4.950560466792969e-06, + "logits/chosen": 0.8411356806755066, + "logits/rejected": 0.8524805307388306, + "logps/chosen": -241.7827606201172, + "logps/rejected": -234.87985229492188, + "loss": 2254.5846, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04195866733789444, + "rewards/margins": 0.03153757005929947, + "rewards/rejected": -0.07349623739719391, + "step": 600 + }, + { + "epoch": 0.31, + "eval_logits/chosen": 0.7868022322654724, + "eval_logits/rejected": 0.8478493094444275, + "eval_logps/chosen": -261.86590576171875, + "eval_logps/rejected": -241.83828735351562, + "eval_loss": 2273.499267578125, + "eval_rewards/accuracies": 0.6570000052452087, + "eval_rewards/chosen": -0.05248467996716499, + "eval_rewards/margins": 0.03040897473692894, + "eval_rewards/rejected": -0.08289366215467453, + "eval_runtime": 416.6239, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 1.2, + "step": 600 + }, + { + "epoch": 0.32, + "learning_rate": 4.945937491280611e-06, + "logits/chosen": 0.7756252288818359, + "logits/rejected": 0.8814484477043152, + "logps/chosen": -245.0117950439453, + "logps/rejected": -217.2014923095703, + "loss": 2348.3988, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.05819234997034073, + "rewards/margins": 0.021564457565546036, + "rewards/rejected": -0.07975681126117706, + "step": 610 + }, + { + "epoch": 0.32, + "learning_rate": 4.9411102806916185e-06, + "logits/chosen": 0.7903825640678406, + "logits/rejected": 0.8663345575332642, + "logps/chosen": -270.52239990234375, + "logps/rejected": -254.25369262695312, + "loss": 2181.5219, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05034894496202469, + "rewards/margins": 0.042683206498622894, + "rewards/rejected": -0.09303215146064758, + "step": 620 + }, + { + "epoch": 0.33, + "learning_rate": 4.9360792380966875e-06, + "logits/chosen": 0.8880133628845215, + "logits/rejected": 0.8636928796768188, + "logps/chosen": -241.08279418945312, + "logps/rejected": -220.8309783935547, + "loss": 2263.642, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.05800148844718933, + "rewards/margins": 0.0320889875292778, + "rewards/rejected": -0.09009047597646713, + "step": 630 + }, + { + "epoch": 0.33, + "learning_rate": 4.930844783586424e-06, + "logits/chosen": 0.8702048063278198, + "logits/rejected": 0.8985433578491211, + "logps/chosen": -240.00460815429688, + "logps/rejected": -238.61227416992188, + "loss": 2244.0928, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.05921437591314316, + "rewards/margins": 0.032827965915203094, + "rewards/rejected": -0.09204234182834625, + "step": 640 + }, + { + "epoch": 0.34, + "learning_rate": 4.925407354236279e-06, + "logits/chosen": 0.8151038885116577, + "logits/rejected": 0.8790351152420044, + "logps/chosen": -241.45755004882812, + "logps/rejected": -226.37667846679688, + "loss": 2236.8199, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.06360708922147751, + "rewards/margins": 0.03408312052488327, + "rewards/rejected": -0.09769020974636078, + "step": 650 + }, + { + "epoch": 0.35, + "learning_rate": 4.919767404070033e-06, + "logits/chosen": 0.8537635803222656, + "logits/rejected": 0.8909260034561157, + "logps/chosen": -221.7104034423828, + "logps/rejected": -206.9849395751953, + "loss": 2317.7387, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07372693717479706, + "rewards/margins": 0.02632719837129116, + "rewards/rejected": -0.10005412995815277, + "step": 660 + }, + { + "epoch": 0.35, + "learning_rate": 4.913925404021905e-06, + "logits/chosen": 0.8039971590042114, + "logits/rejected": 0.8006811141967773, + "logps/chosen": -240.97213745117188, + "logps/rejected": -209.97219848632812, + "loss": 2208.3297, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.07874181121587753, + "rewards/margins": 0.03945617750287056, + "rewards/rejected": -0.11819799244403839, + "step": 670 + }, + { + "epoch": 0.36, + "learning_rate": 4.907881841897216e-06, + "logits/chosen": 0.8469578623771667, + "logits/rejected": 0.8435947299003601, + "logps/chosen": -257.61810302734375, + "logps/rejected": -245.639892578125, + "loss": 2260.2275, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07468362152576447, + "rewards/margins": 0.03678290545940399, + "rewards/rejected": -0.11146652698516846, + "step": 680 + }, + { + "epoch": 0.36, + "learning_rate": 4.901637222331665e-06, + "logits/chosen": 0.7657278776168823, + "logits/rejected": 0.7471415996551514, + "logps/chosen": -259.5301513671875, + "logps/rejected": -236.86032104492188, + "loss": 2287.06, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.08623397350311279, + "rewards/margins": 0.03296409547328949, + "rewards/rejected": -0.11919806897640228, + "step": 690 + }, + { + "epoch": 0.37, + "learning_rate": 4.89519206674919e-06, + "logits/chosen": 0.7977254390716553, + "logits/rejected": 0.8500420451164246, + "logps/chosen": -244.900634765625, + "logps/rejected": -251.4091339111328, + "loss": 2330.7787, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.08788236975669861, + "rewards/margins": 0.02753649279475212, + "rewards/rejected": -0.11541886627674103, + "step": 700 + }, + { + "epoch": 0.37, + "eval_logits/chosen": 0.7516666650772095, + "eval_logits/rejected": 0.812827467918396, + "eval_logps/chosen": -264.809326171875, + "eval_logps/rejected": -245.7631378173828, + "eval_loss": 2224.3349609375, + "eval_rewards/accuracies": 0.6629999876022339, + "eval_rewards/chosen": -0.08191882818937302, + "eval_rewards/margins": 0.0402236245572567, + "eval_rewards/rejected": -0.12214244902133942, + "eval_runtime": 416.6908, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 1.2, + "step": 700 + }, + { + "epoch": 0.37, + "learning_rate": 4.8885469133184235e-06, + "logits/chosen": 0.8586422204971313, + "logits/rejected": 0.8278988003730774, + "logps/chosen": -244.2412109375, + "logps/rejected": -248.913818359375, + "loss": 2265.9631, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0877029225230217, + "rewards/margins": 0.03201908990740776, + "rewards/rejected": -0.11972200870513916, + "step": 710 + }, + { + "epoch": 0.38, + "learning_rate": 4.881702316907769e-06, + "logits/chosen": 0.853478729724884, + "logits/rejected": 0.9013971090316772, + "logps/chosen": -240.85440063476562, + "logps/rejected": -233.39761352539062, + "loss": 2238.1352, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08841414749622345, + "rewards/margins": 0.03821689262986183, + "rewards/rejected": -0.12663105130195618, + "step": 720 + }, + { + "epoch": 0.38, + "learning_rate": 4.874658849039054e-06, + "logits/chosen": 0.6829933524131775, + "logits/rejected": 0.7707113027572632, + "logps/chosen": -274.42095947265625, + "logps/rejected": -232.1612091064453, + "loss": 2119.9938, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.07465031743049622, + "rewards/margins": 0.049049459397792816, + "rewards/rejected": -0.12369978427886963, + "step": 730 + }, + { + "epoch": 0.39, + "learning_rate": 4.86741709783982e-06, + "logits/chosen": 0.7617892026901245, + "logits/rejected": 0.8164576292037964, + "logps/chosen": -267.9073486328125, + "logps/rejected": -240.621826171875, + "loss": 2243.2557, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.085872121155262, + "rewards/margins": 0.042114924639463425, + "rewards/rejected": -0.1279870569705963, + "step": 740 + }, + { + "epoch": 0.39, + "learning_rate": 4.859977667994209e-06, + "logits/chosen": 0.74756920337677, + "logits/rejected": 0.8244425654411316, + "logps/chosen": -255.57754516601562, + "logps/rejected": -242.3626251220703, + "loss": 2231.4938, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.09324190765619278, + "rewards/margins": 0.04055342823266983, + "rewards/rejected": -0.1337953507900238, + "step": 750 + }, + { + "epoch": 0.4, + "learning_rate": 4.852341180692471e-06, + "logits/chosen": 0.7698923945426941, + "logits/rejected": 0.7992275953292847, + "logps/chosen": -256.96868896484375, + "logps/rejected": -271.1941833496094, + "loss": 2239.7266, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.08798633515834808, + "rewards/margins": 0.0419507697224617, + "rewards/rejected": -0.1299370974302292, + "step": 760 + }, + { + "epoch": 0.4, + "learning_rate": 4.844508273579097e-06, + "logits/chosen": 0.803545355796814, + "logits/rejected": 0.7743754982948303, + "logps/chosen": -249.5584716796875, + "logps/rejected": -238.66683959960938, + "loss": 2204.2545, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.08735480904579163, + "rewards/margins": 0.04183940216898918, + "rewards/rejected": -0.1291942000389099, + "step": 770 + }, + { + "epoch": 0.41, + "learning_rate": 4.836479600699579e-06, + "logits/chosen": 0.7211157083511353, + "logits/rejected": 0.7573873400688171, + "logps/chosen": -251.00076293945312, + "logps/rejected": -248.5784454345703, + "loss": 2175.5199, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09977405518293381, + "rewards/margins": 0.047359712421894073, + "rewards/rejected": -0.14713376760482788, + "step": 780 + }, + { + "epoch": 0.41, + "learning_rate": 4.82825583244579e-06, + "logits/chosen": 0.7303954362869263, + "logits/rejected": 0.7423623204231262, + "logps/chosen": -271.29888916015625, + "logps/rejected": -255.95645141601562, + "loss": 2097.0797, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.09688698500394821, + "rewards/margins": 0.060234714299440384, + "rewards/rejected": -0.1571216881275177, + "step": 790 + }, + { + "epoch": 0.42, + "learning_rate": 4.819837655500014e-06, + "logits/chosen": 0.7324298620223999, + "logits/rejected": 0.8285747766494751, + "logps/chosen": -259.47650146484375, + "logps/rejected": -261.0966491699219, + "loss": 2223.6863, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1077154278755188, + "rewards/margins": 0.04667884111404419, + "rewards/rejected": -0.154394268989563, + "step": 800 + }, + { + "epoch": 0.42, + "eval_logits/chosen": 0.6992308497428894, + "eval_logits/rejected": 0.7610952854156494, + "eval_logps/chosen": -266.70574951171875, + "eval_logps/rejected": -248.42222595214844, + "eval_loss": 2196.09912109375, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -0.1008833572268486, + "eval_rewards/margins": 0.04784964770078659, + "eval_rewards/rejected": -0.1487330049276352, + "eval_runtime": 416.5458, + "eval_samples_per_second": 4.801, + "eval_steps_per_second": 1.2, + "step": 800 + }, + { + "epoch": 0.42, + "learning_rate": 4.811225772777603e-06, + "logits/chosen": 0.8175959587097168, + "logits/rejected": 0.7778623700141907, + "logps/chosen": -281.6056823730469, + "logps/rejected": -235.4732666015625, + "loss": 2154.2184, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.10602346807718277, + "rewards/margins": 0.0522245354950428, + "rewards/rejected": -0.15824799239635468, + "step": 810 + }, + { + "epoch": 0.43, + "learning_rate": 4.802420903368286e-06, + "logits/chosen": 0.6645683646202087, + "logits/rejected": 0.7504470944404602, + "logps/chosen": -262.6244812011719, + "logps/rejected": -246.62728881835938, + "loss": 2070.7916, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08415599167346954, + "rewards/margins": 0.06298204511404037, + "rewards/rejected": -0.14713802933692932, + "step": 820 + }, + { + "epoch": 0.43, + "learning_rate": 4.793423782476125e-06, + "logits/chosen": 0.7014611959457397, + "logits/rejected": 0.7595884203910828, + "logps/chosen": -261.1951599121094, + "logps/rejected": -237.8996124267578, + "loss": 2303.9654, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.11634109169244766, + "rewards/margins": 0.03935299813747406, + "rewards/rejected": -0.15569409728050232, + "step": 830 + }, + { + "epoch": 0.44, + "learning_rate": 4.784235161358124e-06, + "logits/chosen": 0.7220847010612488, + "logits/rejected": 0.8208295702934265, + "logps/chosen": -274.5125427246094, + "logps/rejected": -267.01275634765625, + "loss": 2374.0922, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.11991143226623535, + "rewards/margins": 0.03418232128024101, + "rewards/rejected": -0.15409375727176666, + "step": 840 + }, + { + "epoch": 0.44, + "learning_rate": 4.774855807261504e-06, + "logits/chosen": 0.771617591381073, + "logits/rejected": 0.7759231925010681, + "logps/chosen": -266.54156494140625, + "logps/rejected": -232.45114135742188, + "loss": 2085.9527, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.09387587755918503, + "rewards/margins": 0.059171438217163086, + "rewards/rejected": -0.1530473232269287, + "step": 850 + }, + { + "epoch": 0.45, + "learning_rate": 4.765286503359632e-06, + "logits/chosen": 0.7542043924331665, + "logits/rejected": 0.7199236154556274, + "logps/chosen": -247.95004272460938, + "logps/rejected": -234.8477325439453, + "loss": 2140.2543, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.09680913388729095, + "rewards/margins": 0.05759376287460327, + "rewards/rejected": -0.15440289676189423, + "step": 860 + }, + { + "epoch": 0.46, + "learning_rate": 4.755528048686629e-06, + "logits/chosen": 0.7054905891418457, + "logits/rejected": 0.7627168297767639, + "logps/chosen": -269.0238342285156, + "logps/rejected": -231.31594848632812, + "loss": 2018.9971, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.09635048359632492, + "rewards/margins": 0.07080944627523422, + "rewards/rejected": -0.16715992987155914, + "step": 870 + }, + { + "epoch": 0.46, + "learning_rate": 4.745581258070654e-06, + "logits/chosen": 0.7587330937385559, + "logits/rejected": 0.7288376092910767, + "logps/chosen": -279.3045959472656, + "logps/rejected": -248.2842559814453, + "loss": 2182.2826, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.10375384986400604, + "rewards/margins": 0.04985477030277252, + "rewards/rejected": -0.15360862016677856, + "step": 880 + }, + { + "epoch": 0.47, + "learning_rate": 4.73544696206586e-06, + "logits/chosen": 0.7099634408950806, + "logits/rejected": 0.7651978731155396, + "logps/chosen": -241.7110137939453, + "logps/rejected": -226.1469268798828, + "loss": 2313.518, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.11171738058328629, + "rewards/margins": 0.033566057682037354, + "rewards/rejected": -0.14528343081474304, + "step": 890 + }, + { + "epoch": 0.47, + "learning_rate": 4.725126006883047e-06, + "logits/chosen": 0.6832414865493774, + "logits/rejected": 0.6996358633041382, + "logps/chosen": -287.02655029296875, + "logps/rejected": -271.8457946777344, + "loss": 2066.7418, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1082179993391037, + "rewards/margins": 0.06335000693798065, + "rewards/rejected": -0.17156800627708435, + "step": 900 + }, + { + "epoch": 0.47, + "eval_logits/chosen": 0.6917389035224915, + "eval_logits/rejected": 0.7518260478973389, + "eval_logps/chosen": -267.73968505859375, + "eval_logps/rejected": -250.13189697265625, + "eval_loss": 2166.0732421875, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -0.11122233420610428, + "eval_rewards/margins": 0.0546073243021965, + "eval_rewards/rejected": -0.16582968831062317, + "eval_runtime": 416.784, + "eval_samples_per_second": 4.799, + "eval_steps_per_second": 1.2, + "step": 900 + }, + { + "epoch": 0.48, + "learning_rate": 4.7146192543190005e-06, + "logits/chosen": 0.7068012952804565, + "logits/rejected": 0.7604703307151794, + "logps/chosen": -301.75897216796875, + "logps/rejected": -260.5961608886719, + "loss": 2114.2752, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.10466556251049042, + "rewards/margins": 0.06352122128009796, + "rewards/rejected": -0.16818679869174957, + "step": 910 + }, + { + "epoch": 0.48, + "learning_rate": 4.70392758168454e-06, + "logits/chosen": 0.7013599872589111, + "logits/rejected": 0.7524459362030029, + "logps/chosen": -280.307861328125, + "logps/rejected": -256.722900390625, + "loss": 2081.7857, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11534447968006134, + "rewards/margins": 0.06791369616985321, + "rewards/rejected": -0.18325819075107574, + "step": 920 + }, + { + "epoch": 0.49, + "learning_rate": 4.693051881731251e-06, + "logits/chosen": 0.6879482269287109, + "logits/rejected": 0.7315651178359985, + "logps/chosen": -267.4771728515625, + "logps/rejected": -269.62249755859375, + "loss": 2219.8021, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.11604435741901398, + "rewards/margins": 0.04904730245471001, + "rewards/rejected": -0.16509169340133667, + "step": 930 + }, + { + "epoch": 0.49, + "learning_rate": 4.68199306257695e-06, + "logits/chosen": 0.7221347093582153, + "logits/rejected": 0.8089338541030884, + "logps/chosen": -277.0709533691406, + "logps/rejected": -288.2813415527344, + "loss": 2066.718, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.09484682977199554, + "rewards/margins": 0.066488116979599, + "rewards/rejected": -0.16133496165275574, + "step": 940 + }, + { + "epoch": 0.5, + "learning_rate": 4.670752047629855e-06, + "logits/chosen": 0.7649358510971069, + "logits/rejected": 0.8068546056747437, + "logps/chosen": -289.7987976074219, + "logps/rejected": -257.411865234375, + "loss": 1989.5814, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09663524478673935, + "rewards/margins": 0.07417033612728119, + "rewards/rejected": -0.17080560326576233, + "step": 950 + }, + { + "epoch": 0.5, + "learning_rate": 4.659329775511478e-06, + "logits/chosen": 0.6801126599311829, + "logits/rejected": 0.7201008796691895, + "logps/chosen": -275.82244873046875, + "logps/rejected": -263.333251953125, + "loss": 2137.0553, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.10409847646951675, + "rewards/margins": 0.06283750385046005, + "rewards/rejected": -0.1669359654188156, + "step": 960 + }, + { + "epoch": 0.51, + "learning_rate": 4.647727199978255e-06, + "logits/chosen": 0.675479531288147, + "logits/rejected": 0.755820095539093, + "logps/chosen": -281.660888671875, + "logps/rejected": -264.6354675292969, + "loss": 2290.1266, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.11869573593139648, + "rewards/margins": 0.0478622205555439, + "rewards/rejected": -0.1665579378604889, + "step": 970 + }, + { + "epoch": 0.51, + "learning_rate": 4.635945289841902e-06, + "logits/chosen": 0.6294044852256775, + "logits/rejected": 0.701261043548584, + "logps/chosen": -249.8705596923828, + "logps/rejected": -245.52346801757812, + "loss": 2214.392, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.12245059013366699, + "rewards/margins": 0.05106619745492935, + "rewards/rejected": -0.17351679503917694, + "step": 980 + }, + { + "epoch": 0.52, + "learning_rate": 4.623985028888527e-06, + "logits/chosen": 0.7620214223861694, + "logits/rejected": 0.799843430519104, + "logps/chosen": -236.2934112548828, + "logps/rejected": -222.5919189453125, + "loss": 2149.6912, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.12090835720300674, + "rewards/margins": 0.05957023426890373, + "rewards/rejected": -0.18047860264778137, + "step": 990 + }, + { + "epoch": 0.52, + "learning_rate": 4.611847415796476e-06, + "logits/chosen": 0.7124743461608887, + "logits/rejected": 0.6904253363609314, + "logps/chosen": -265.0976257324219, + "logps/rejected": -254.4890594482422, + "loss": 2119.2691, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11531393229961395, + "rewards/margins": 0.0636182576417923, + "rewards/rejected": -0.17893218994140625, + "step": 1000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": 0.6619382500648499, + "eval_logits/rejected": 0.721328854560852, + "eval_logps/chosen": -268.7693176269531, + "eval_logps/rejected": -251.76100158691406, + "eval_loss": 2138.93115234375, + "eval_rewards/accuracies": 0.671500027179718, + "eval_rewards/chosen": -0.12151883542537689, + "eval_rewards/margins": 0.060602057725191116, + "eval_rewards/rejected": -0.1821209043264389, + "eval_runtime": 416.4897, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 1.201, + "step": 1000 + }, + { + "epoch": 0.53, + "learning_rate": 4.599533464052951e-06, + "logits/chosen": 0.7095866203308105, + "logits/rejected": 0.7142434120178223, + "logps/chosen": -285.8958740234375, + "logps/rejected": -269.59906005859375, + "loss": 2002.4428, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.09995652735233307, + "rewards/margins": 0.08161594718694687, + "rewards/rejected": -0.18157246708869934, + "step": 1010 + }, + { + "epoch": 0.53, + "learning_rate": 4.587044201869378e-06, + "logits/chosen": 0.6786423921585083, + "logits/rejected": 0.7168447375297546, + "logps/chosen": -285.7081298828125, + "logps/rejected": -245.5774688720703, + "loss": 2104.4162, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.10604969412088394, + "rewards/margins": 0.06783930957317352, + "rewards/rejected": -0.17388899624347687, + "step": 1020 + }, + { + "epoch": 0.54, + "learning_rate": 4.574380672095555e-06, + "logits/chosen": 0.6884575486183167, + "logits/rejected": 0.7298802733421326, + "logps/chosen": -223.19393920898438, + "logps/rejected": -240.6363983154297, + "loss": 2218.8305, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1402120441198349, + "rewards/margins": 0.049252741038799286, + "rewards/rejected": -0.1894647777080536, + "step": 1030 + }, + { + "epoch": 0.54, + "learning_rate": 4.561543932132574e-06, + "logits/chosen": 0.6861045360565186, + "logits/rejected": 0.7231858968734741, + "logps/chosen": -282.3648376464844, + "logps/rejected": -247.057373046875, + "loss": 2149.1324, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1236349567770958, + "rewards/margins": 0.05487797409296036, + "rewards/rejected": -0.17851293087005615, + "step": 1040 + }, + { + "epoch": 0.55, + "learning_rate": 4.548535053844527e-06, + "logits/chosen": 0.6396581530570984, + "logits/rejected": 0.7092006206512451, + "logps/chosen": -280.1047058105469, + "logps/rejected": -267.3292541503906, + "loss": 2057.9182, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.12618432939052582, + "rewards/margins": 0.07123459875583649, + "rewards/rejected": -0.1974189579486847, + "step": 1050 + }, + { + "epoch": 0.55, + "learning_rate": 4.535355123469009e-06, + "logits/chosen": 0.7520751357078552, + "logits/rejected": 0.7533235549926758, + "logps/chosen": -246.0561065673828, + "logps/rejected": -210.5857391357422, + "loss": 2157.4354, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.11811725795269012, + "rewards/margins": 0.05882970616221428, + "rewards/rejected": -0.1769469678401947, + "step": 1060 + }, + { + "epoch": 0.56, + "learning_rate": 4.522005241526411e-06, + "logits/chosen": 0.670494019985199, + "logits/rejected": 0.7469106912612915, + "logps/chosen": -281.83013916015625, + "logps/rejected": -235.8207244873047, + "loss": 2224.4785, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.12622186541557312, + "rewards/margins": 0.04977993294596672, + "rewards/rejected": -0.17600181698799133, + "step": 1070 + }, + { + "epoch": 0.57, + "learning_rate": 4.508486522728037e-06, + "logits/chosen": 0.6910241842269897, + "logits/rejected": 0.7257175445556641, + "logps/chosen": -276.518798828125, + "logps/rejected": -260.69781494140625, + "loss": 2003.6445, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1153578907251358, + "rewards/margins": 0.07534319162368774, + "rewards/rejected": -0.19070109724998474, + "step": 1080 + }, + { + "epoch": 0.57, + "learning_rate": 4.494800095883014e-06, + "logits/chosen": 0.6132059097290039, + "logits/rejected": 0.6958727836608887, + "logps/chosen": -290.1937255859375, + "logps/rejected": -244.7795867919922, + "loss": 1914.4395, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.1075458973646164, + "rewards/margins": 0.09307406842708588, + "rewards/rejected": -0.20061998069286346, + "step": 1090 + }, + { + "epoch": 0.58, + "learning_rate": 4.480947103804044e-06, + "logits/chosen": 0.5848616361618042, + "logits/rejected": 0.6844476461410522, + "logps/chosen": -286.0004577636719, + "logps/rejected": -235.7235565185547, + "loss": 2191.7109, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.12465153634548187, + "rewards/margins": 0.05475841090083122, + "rewards/rejected": -0.1794099658727646, + "step": 1100 + }, + { + "epoch": 0.58, + "eval_logits/chosen": 0.6583799123764038, + "eval_logits/rejected": 0.7175658345222473, + "eval_logps/chosen": -269.19097900390625, + "eval_logps/rejected": -252.60589599609375, + "eval_loss": 2121.8115234375, + "eval_rewards/accuracies": 0.6694999933242798, + "eval_rewards/chosen": -0.12573528289794922, + "eval_rewards/margins": 0.06483451277017593, + "eval_rewards/rejected": -0.19056977331638336, + "eval_runtime": 416.4568, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 1.201, + "step": 1100 + }, + { + "epoch": 0.58, + "learning_rate": 4.466928703211981e-06, + "logits/chosen": 0.6980951428413391, + "logits/rejected": 0.6862035989761353, + "logps/chosen": -281.24700927734375, + "logps/rejected": -248.4276123046875, + "loss": 2063.3184, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.11639384180307388, + "rewards/margins": 0.07433497160673141, + "rewards/rejected": -0.1907288283109665, + "step": 1110 + }, + { + "epoch": 0.59, + "learning_rate": 4.452746064639239e-06, + "logits/chosen": 0.678636372089386, + "logits/rejected": 0.6571283936500549, + "logps/chosen": -258.97674560546875, + "logps/rejected": -254.3134307861328, + "loss": 2036.2967, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.10998505353927612, + "rewards/margins": 0.06968870759010315, + "rewards/rejected": -0.17967377603054047, + "step": 1120 + }, + { + "epoch": 0.59, + "learning_rate": 4.438400372332058e-06, + "logits/chosen": 0.7093490958213806, + "logits/rejected": 0.7673132419586182, + "logps/chosen": -265.0453796386719, + "logps/rejected": -250.15982055664062, + "loss": 1899.6914, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.10779963433742523, + "rewards/margins": 0.08556055277585983, + "rewards/rejected": -0.19336020946502686, + "step": 1130 + }, + { + "epoch": 0.6, + "learning_rate": 4.423892824151617e-06, + "logits/chosen": 0.6779216527938843, + "logits/rejected": 0.7495394945144653, + "logps/chosen": -276.6440734863281, + "logps/rejected": -247.07150268554688, + "loss": 2002.1414, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.11242429912090302, + "rewards/margins": 0.08376909792423248, + "rewards/rejected": -0.19619342684745789, + "step": 1140 + }, + { + "epoch": 0.6, + "learning_rate": 4.409224631474014e-06, + "logits/chosen": 0.6950255632400513, + "logits/rejected": 0.7308493256568909, + "logps/chosen": -258.3533020019531, + "logps/rejected": -235.54483032226562, + "loss": 1995.5852, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12844698131084442, + "rewards/margins": 0.07693418860435486, + "rewards/rejected": -0.20538118481636047, + "step": 1150 + }, + { + "epoch": 0.61, + "learning_rate": 4.3943970190891164e-06, + "logits/chosen": 0.6389755010604858, + "logits/rejected": 0.6700756549835205, + "logps/chosen": -264.4709167480469, + "logps/rejected": -256.2208557128906, + "loss": 2016.508, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.13102933764457703, + "rewards/margins": 0.07655525207519531, + "rewards/rejected": -0.20758457481861115, + "step": 1160 + }, + { + "epoch": 0.61, + "learning_rate": 4.379411225098292e-06, + "logits/chosen": 0.6980705261230469, + "logits/rejected": 0.7923838496208191, + "logps/chosen": -283.1957702636719, + "logps/rejected": -267.30938720703125, + "loss": 2012.2674, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.138292133808136, + "rewards/margins": 0.08078579604625702, + "rewards/rejected": -0.2190779149532318, + "step": 1170 + }, + { + "epoch": 0.62, + "learning_rate": 4.364268500811025e-06, + "logits/chosen": 0.6915451288223267, + "logits/rejected": 0.6770834922790527, + "logps/chosen": -258.2509460449219, + "logps/rejected": -266.7464294433594, + "loss": 2101.8338, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.13359954953193665, + "rewards/margins": 0.07587826251983643, + "rewards/rejected": -0.20947781205177307, + "step": 1180 + }, + { + "epoch": 0.62, + "learning_rate": 4.348970110640437e-06, + "logits/chosen": 0.6509718298912048, + "logits/rejected": 0.7129366993904114, + "logps/chosen": -258.56280517578125, + "logps/rejected": -237.4566192626953, + "loss": 2027.6717, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.14995795488357544, + "rewards/margins": 0.07759587466716766, + "rewards/rejected": -0.2275538146495819, + "step": 1190 + }, + { + "epoch": 0.63, + "learning_rate": 4.333517331997704e-06, + "logits/chosen": 0.5978332161903381, + "logits/rejected": 0.6565033793449402, + "logps/chosen": -272.02166748046875, + "logps/rejected": -271.3028259277344, + "loss": 2308.1883, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.1387997567653656, + "rewards/margins": 0.04619471728801727, + "rewards/rejected": -0.18499447405338287, + "step": 1200 + }, + { + "epoch": 0.63, + "eval_logits/chosen": 0.6329967379570007, + "eval_logits/rejected": 0.6920445561408997, + "eval_logps/chosen": -270.7044372558594, + "eval_logps/rejected": -254.78115844726562, + "eval_loss": 2110.306884765625, + "eval_rewards/accuracies": 0.6664999723434448, + "eval_rewards/chosen": -0.14087003469467163, + "eval_rewards/margins": 0.07145243883132935, + "eval_rewards/rejected": -0.21232248842716217, + "eval_runtime": 416.6934, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 1.2, + "step": 1200 + }, + { + "epoch": 0.63, + "learning_rate": 4.317911455185396e-06, + "logits/chosen": 0.6959893703460693, + "logits/rejected": 0.7259203791618347, + "logps/chosen": -266.06829833984375, + "logps/rejected": -238.3871307373047, + "loss": 2262.5527, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.15047064423561096, + "rewards/margins": 0.04972488805651665, + "rewards/rejected": -0.2001955509185791, + "step": 1210 + }, + { + "epoch": 0.64, + "learning_rate": 4.302153783289737e-06, + "logits/chosen": 0.6275352239608765, + "logits/rejected": 0.7193800806999207, + "logps/chosen": -266.05908203125, + "logps/rejected": -258.8971252441406, + "loss": 2233.5896, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.1438441276550293, + "rewards/margins": 0.05622429400682449, + "rewards/rejected": -0.2000684291124344, + "step": 1220 + }, + { + "epoch": 0.64, + "learning_rate": 4.286245632071791e-06, + "logits/chosen": 0.6443454623222351, + "logits/rejected": 0.6870865225791931, + "logps/chosen": -257.45611572265625, + "logps/rejected": -253.72756958007812, + "loss": 2126.4154, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.15624599158763885, + "rewards/margins": 0.06528286635875702, + "rewards/rejected": -0.22152885794639587, + "step": 1230 + }, + { + "epoch": 0.65, + "learning_rate": 4.270188329857613e-06, + "logits/chosen": 0.7713927626609802, + "logits/rejected": 0.7753847241401672, + "logps/chosen": -263.57708740234375, + "logps/rejected": -270.3426513671875, + "loss": 2108.1736, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.12420948594808578, + "rewards/margins": 0.06607901304960251, + "rewards/rejected": -0.1902884989976883, + "step": 1240 + }, + { + "epoch": 0.65, + "learning_rate": 4.253983217427313e-06, + "logits/chosen": 0.6878337860107422, + "logits/rejected": 0.7090884447097778, + "logps/chosen": -271.42657470703125, + "logps/rejected": -288.615478515625, + "loss": 2128.0695, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.13797307014465332, + "rewards/margins": 0.06890544295310974, + "rewards/rejected": -0.20687851309776306, + "step": 1250 + }, + { + "epoch": 0.66, + "learning_rate": 4.237631647903115e-06, + "logits/chosen": 0.6635148525238037, + "logits/rejected": 0.6455484628677368, + "logps/chosen": -266.98248291015625, + "logps/rejected": -250.23403930664062, + "loss": 2209.6877, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.13750961422920227, + "rewards/margins": 0.05613657087087631, + "rewards/rejected": -0.1936461478471756, + "step": 1260 + }, + { + "epoch": 0.66, + "learning_rate": 4.221134986636371e-06, + "logits/chosen": 0.6171488761901855, + "logits/rejected": 0.6567360758781433, + "logps/chosen": -273.8824157714844, + "logps/rejected": -249.103515625, + "loss": 1885.2906, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.11038468778133392, + "rewards/margins": 0.10038020461797714, + "rewards/rejected": -0.21076488494873047, + "step": 1270 + }, + { + "epoch": 0.67, + "learning_rate": 4.204494611093548e-06, + "logits/chosen": 0.7011705636978149, + "logits/rejected": 0.6791177988052368, + "logps/chosen": -251.37478637695312, + "logps/rejected": -261.1529846191406, + "loss": 2258.1799, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.14569328725337982, + "rewards/margins": 0.0609690323472023, + "rewards/rejected": -0.20666229724884033, + "step": 1280 + }, + { + "epoch": 0.68, + "learning_rate": 4.1877119107412165e-06, + "logits/chosen": 0.6343793869018555, + "logits/rejected": 0.6927725672721863, + "logps/chosen": -237.8370819091797, + "logps/rejected": -256.54254150390625, + "loss": 2061.6873, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1375429332256317, + "rewards/margins": 0.07506345212459564, + "rewards/rejected": -0.21260638535022736, + "step": 1290 + }, + { + "epoch": 0.68, + "learning_rate": 4.170788286930024e-06, + "logits/chosen": 0.6356396675109863, + "logits/rejected": 0.7518913149833679, + "logps/chosen": -275.9583435058594, + "logps/rejected": -253.1591033935547, + "loss": 1996.7178, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1259593516588211, + "rewards/margins": 0.09192151576280594, + "rewards/rejected": -0.21788087487220764, + "step": 1300 + }, + { + "epoch": 0.68, + "eval_logits/chosen": 0.6140788793563843, + "eval_logits/rejected": 0.6721699833869934, + "eval_logps/chosen": -269.7620544433594, + "eval_logps/rejected": -253.97259521484375, + "eval_loss": 2095.31298828125, + "eval_rewards/accuracies": 0.6754999756813049, + "eval_rewards/chosen": -0.1314462274312973, + "eval_rewards/margins": 0.07279053330421448, + "eval_rewards/rejected": -0.20423679053783417, + "eval_runtime": 416.679, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 1.2, + "step": 1300 + }, + { + "epoch": 0.69, + "learning_rate": 4.15372515277769e-06, + "logits/chosen": 0.6244436502456665, + "logits/rejected": 0.6609630584716797, + "logps/chosen": -280.32794189453125, + "logps/rejected": -248.7582244873047, + "loss": 2011.6244, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.11511021852493286, + "rewards/margins": 0.08765153586864471, + "rewards/rejected": -0.20276173949241638, + "step": 1310 + }, + { + "epoch": 0.69, + "learning_rate": 4.136523933051005e-06, + "logits/chosen": 0.6894992589950562, + "logits/rejected": 0.6712801456451416, + "logps/chosen": -263.05767822265625, + "logps/rejected": -234.242431640625, + "loss": 1956.5369, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.11226965487003326, + "rewards/margins": 0.09646574407815933, + "rewards/rejected": -0.208735391497612, + "step": 1320 + }, + { + "epoch": 0.7, + "learning_rate": 4.119186064046868e-06, + "logits/chosen": 0.6183528900146484, + "logits/rejected": 0.644507110118866, + "logps/chosen": -274.1941223144531, + "logps/rejected": -245.35562133789062, + "loss": 2166.2785, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.12718230485916138, + "rewards/margins": 0.06536950916051865, + "rewards/rejected": -0.19255182147026062, + "step": 1330 + }, + { + "epoch": 0.7, + "learning_rate": 4.101712993472348e-06, + "logits/chosen": 0.6670488715171814, + "logits/rejected": 0.6754225492477417, + "logps/chosen": -278.2078552246094, + "logps/rejected": -241.5122528076172, + "loss": 1883.2881, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.119564950466156, + "rewards/margins": 0.09150619804859161, + "rewards/rejected": -0.2110711634159088, + "step": 1340 + }, + { + "epoch": 0.71, + "learning_rate": 4.084106180323813e-06, + "logits/chosen": 0.6214176416397095, + "logits/rejected": 0.66867595911026, + "logps/chosen": -261.7587890625, + "logps/rejected": -251.6807098388672, + "loss": 2017.3074, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1294572651386261, + "rewards/margins": 0.08191975206136703, + "rewards/rejected": -0.21137702465057373, + "step": 1350 + }, + { + "epoch": 0.71, + "learning_rate": 4.066367094765091e-06, + "logits/chosen": 0.6692546606063843, + "logits/rejected": 0.6942587494850159, + "logps/chosen": -259.8294982910156, + "logps/rejected": -268.8597412109375, + "loss": 2097.6473, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.12980519235134125, + "rewards/margins": 0.07286903262138367, + "rewards/rejected": -0.20267422497272491, + "step": 1360 + }, + { + "epoch": 0.72, + "learning_rate": 4.048497218004724e-06, + "logits/chosen": 0.5632964968681335, + "logits/rejected": 0.6666015386581421, + "logps/chosen": -265.3409729003906, + "logps/rejected": -252.78958129882812, + "loss": 2076.2803, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.13252495229244232, + "rewards/margins": 0.08360429853200912, + "rewards/rejected": -0.21612922847270966, + "step": 1370 + }, + { + "epoch": 0.72, + "learning_rate": 4.030498042172277e-06, + "logits/chosen": 0.6174412369728088, + "logits/rejected": 0.6783226728439331, + "logps/chosen": -249.213134765625, + "logps/rejected": -241.413330078125, + "loss": 2285.0457, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.14881116151809692, + "rewards/margins": 0.05003712326288223, + "rewards/rejected": -0.19884827733039856, + "step": 1380 + }, + { + "epoch": 0.73, + "learning_rate": 4.012371070193753e-06, + "logits/chosen": 0.6269063353538513, + "logits/rejected": 0.6346549391746521, + "logps/chosen": -241.5870361328125, + "logps/rejected": -245.96456909179688, + "loss": 2133.8158, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1412040889263153, + "rewards/margins": 0.06111832335591316, + "rewards/rejected": -0.20232239365577698, + "step": 1390 + }, + { + "epoch": 0.73, + "learning_rate": 3.994117815666095e-06, + "logits/chosen": 0.6533576250076294, + "logits/rejected": 0.6740087866783142, + "logps/chosen": -283.4432678222656, + "logps/rejected": -267.79071044921875, + "loss": 2038.3844, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1330070197582245, + "rewards/margins": 0.07623559981584549, + "rewards/rejected": -0.20924265682697296, + "step": 1400 + }, + { + "epoch": 0.73, + "eval_logits/chosen": 0.5933060050010681, + "eval_logits/rejected": 0.6512511968612671, + "eval_logps/chosen": -270.4488220214844, + "eval_logps/rejected": -254.944091796875, + "eval_loss": 2085.085205078125, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -0.13831348717212677, + "eval_rewards/margins": 0.07563827186822891, + "eval_rewards/rejected": -0.21395176649093628, + "eval_runtime": 416.609, + "eval_samples_per_second": 4.801, + "eval_steps_per_second": 1.2, + "step": 1400 + }, + { + "epoch": 0.74, + "learning_rate": 3.975739802730805e-06, + "logits/chosen": 0.5807250738143921, + "logits/rejected": 0.6861320734024048, + "logps/chosen": -298.25604248046875, + "logps/rejected": -273.8665466308594, + "loss": 2016.2207, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12952394783496857, + "rewards/margins": 0.08187790215015411, + "rewards/rejected": -0.21140184998512268, + "step": 1410 + }, + { + "epoch": 0.74, + "learning_rate": 3.957238565946672e-06, + "logits/chosen": 0.6601132750511169, + "logits/rejected": 0.6705759763717651, + "logps/chosen": -266.2096252441406, + "logps/rejected": -249.5722198486328, + "loss": 2228.1479, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1489478349685669, + "rewards/margins": 0.05832044407725334, + "rewards/rejected": -0.20726828277111053, + "step": 1420 + }, + { + "epoch": 0.75, + "learning_rate": 3.938615650161645e-06, + "logits/chosen": 0.6056556701660156, + "logits/rejected": 0.5954689979553223, + "logps/chosen": -244.7415771484375, + "logps/rejected": -234.8352813720703, + "loss": 2023.5078, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.11950768530368805, + "rewards/margins": 0.0772583931684494, + "rewards/rejected": -0.19676607847213745, + "step": 1430 + }, + { + "epoch": 0.75, + "learning_rate": 3.919872610383831e-06, + "logits/chosen": 0.5716265439987183, + "logits/rejected": 0.6326289176940918, + "logps/chosen": -257.7333984375, + "logps/rejected": -234.2956085205078, + "loss": 2199.8311, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.14616943895816803, + "rewards/margins": 0.055809132754802704, + "rewards/rejected": -0.20197856426239014, + "step": 1440 + }, + { + "epoch": 0.76, + "learning_rate": 3.9010110116516595e-06, + "logits/chosen": 0.655591607093811, + "logits/rejected": 0.7094139456748962, + "logps/chosen": -266.137939453125, + "logps/rejected": -247.2853240966797, + "loss": 1965.8004, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.13736537098884583, + "rewards/margins": 0.0867539569735527, + "rewards/rejected": -0.22411933541297913, + "step": 1450 + }, + { + "epoch": 0.76, + "learning_rate": 3.882032428903195e-06, + "logits/chosen": 0.6148696541786194, + "logits/rejected": 0.6896382570266724, + "logps/chosen": -256.70904541015625, + "logps/rejected": -245.3892822265625, + "loss": 2095.5068, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.12945787608623505, + "rewards/margins": 0.07017168402671814, + "rewards/rejected": -0.19962957501411438, + "step": 1460 + }, + { + "epoch": 0.77, + "learning_rate": 3.8629384468446365e-06, + "logits/chosen": 0.5744356513023376, + "logits/rejected": 0.5954487919807434, + "logps/chosen": -250.98178100585938, + "logps/rejected": -272.93701171875, + "loss": 2119.459, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.13649822771549225, + "rewards/margins": 0.07962769269943237, + "rewards/rejected": -0.21612592041492462, + "step": 1470 + }, + { + "epoch": 0.77, + "learning_rate": 3.84373065981799e-06, + "logits/chosen": 0.6630114912986755, + "logits/rejected": 0.6675506830215454, + "logps/chosen": -265.52447509765625, + "logps/rejected": -247.32455444335938, + "loss": 1991.6379, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.12121538817882538, + "rewards/margins": 0.08006526529788971, + "rewards/rejected": -0.20128066837787628, + "step": 1480 + }, + { + "epoch": 0.78, + "learning_rate": 3.824410671667948e-06, + "logits/chosen": 0.6106497645378113, + "logits/rejected": 0.680738091468811, + "logps/chosen": -260.89288330078125, + "logps/rejected": -252.52017211914062, + "loss": 1942.9977, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12056465446949005, + "rewards/margins": 0.09015407413244247, + "rewards/rejected": -0.21071875095367432, + "step": 1490 + }, + { + "epoch": 0.79, + "learning_rate": 3.8049800956079552e-06, + "logits/chosen": 0.5932056903839111, + "logits/rejected": 0.6287232637405396, + "logps/chosen": -291.2415771484375, + "logps/rejected": -279.5646057128906, + "loss": 2094.2182, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.13307559490203857, + "rewards/margins": 0.0750352293252945, + "rewards/rejected": -0.20811080932617188, + "step": 1500 + }, + { + "epoch": 0.79, + "eval_logits/chosen": 0.5898318886756897, + "eval_logits/rejected": 0.6474130749702454, + "eval_logps/chosen": -270.5129089355469, + "eval_logps/rejected": -255.21328735351562, + "eval_loss": 2076.30419921875, + "eval_rewards/accuracies": 0.6790000200271606, + "eval_rewards/chosen": -0.13895468413829803, + "eval_rewards/margins": 0.07768914848566055, + "eval_rewards/rejected": -0.2166438102722168, + "eval_runtime": 416.7121, + "eval_samples_per_second": 4.799, + "eval_steps_per_second": 1.2, + "step": 1500 + }, + { + "epoch": 0.79, + "learning_rate": 3.7854405540855268e-06, + "logits/chosen": 0.580877959728241, + "logits/rejected": 0.6030541658401489, + "logps/chosen": -255.82693481445312, + "logps/rejected": -249.1620635986328, + "loss": 2165.7623, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.1440289467573166, + "rewards/margins": 0.06519783288240433, + "rewards/rejected": -0.2092268019914627, + "step": 1510 + }, + { + "epoch": 0.8, + "learning_rate": 3.765793678646753e-06, + "logits/chosen": 0.612065851688385, + "logits/rejected": 0.6108434200286865, + "logps/chosen": -236.6591796875, + "logps/rejected": -245.4730224609375, + "loss": 2151.6375, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.13691964745521545, + "rewards/margins": 0.06442641466856003, + "rewards/rejected": -0.20134606957435608, + "step": 1520 + }, + { + "epoch": 0.8, + "learning_rate": 3.7460411098000804e-06, + "logits/chosen": 0.620397686958313, + "logits/rejected": 0.6705790758132935, + "logps/chosen": -279.47003173828125, + "logps/rejected": -242.50320434570312, + "loss": 2097.6518, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.13826757669448853, + "rewards/margins": 0.06730998307466507, + "rewards/rejected": -0.2055775672197342, + "step": 1530 + }, + { + "epoch": 0.81, + "learning_rate": 3.726184496879323e-06, + "logits/chosen": 0.5731703042984009, + "logits/rejected": 0.6038475036621094, + "logps/chosen": -273.57684326171875, + "logps/rejected": -263.6417541503906, + "loss": 1980.56, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13297039270401, + "rewards/margins": 0.08892510086297989, + "rewards/rejected": -0.2218955010175705, + "step": 1540 + }, + { + "epoch": 0.81, + "learning_rate": 3.706225497905946e-06, + "logits/chosen": 0.5495398640632629, + "logits/rejected": 0.6184272170066833, + "logps/chosen": -278.1634521484375, + "logps/rejected": -250.0457763671875, + "loss": 1925.0881, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.14073289930820465, + "rewards/margins": 0.08853240311145782, + "rewards/rejected": -0.22926530241966248, + "step": 1550 + }, + { + "epoch": 0.82, + "learning_rate": 3.686165779450619e-06, + "logits/chosen": 0.6478545069694519, + "logits/rejected": 0.6362086534500122, + "logps/chosen": -267.02618408203125, + "logps/rejected": -239.1699676513672, + "loss": 2063.3338, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.13111211359500885, + "rewards/margins": 0.07986196875572205, + "rewards/rejected": -0.2109740674495697, + "step": 1560 + }, + { + "epoch": 0.82, + "learning_rate": 3.6660070164940614e-06, + "logits/chosen": 0.6316484808921814, + "logits/rejected": 0.686813473701477, + "logps/chosen": -254.73837280273438, + "logps/rejected": -249.4886016845703, + "loss": 1958.6893, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1321488916873932, + "rewards/margins": 0.09855500608682632, + "rewards/rejected": -0.23070387542247772, + "step": 1570 + }, + { + "epoch": 0.83, + "learning_rate": 3.645750892287178e-06, + "logits/chosen": 0.6227657794952393, + "logits/rejected": 0.642948567867279, + "logps/chosen": -254.3902587890625, + "logps/rejected": -239.3006134033203, + "loss": 2093.4068, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.14098116755485535, + "rewards/margins": 0.07492861151695251, + "rewards/rejected": -0.21590976417064667, + "step": 1580 + }, + { + "epoch": 0.83, + "learning_rate": 3.6253990982105114e-06, + "logits/chosen": 0.5823426842689514, + "logits/rejected": 0.6044851541519165, + "logps/chosen": -282.6208801269531, + "logps/rejected": -284.2301330566406, + "loss": 2308.56, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1489720195531845, + "rewards/margins": 0.050694145262241364, + "rewards/rejected": -0.19966615736484528, + "step": 1590 + }, + { + "epoch": 0.84, + "learning_rate": 3.604953333633009e-06, + "logits/chosen": 0.6414502859115601, + "logits/rejected": 0.6938506960868835, + "logps/chosen": -254.117431640625, + "logps/rejected": -234.0909423828125, + "loss": 2171.3457, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.12779875099658966, + "rewards/margins": 0.06595613807439804, + "rewards/rejected": -0.1937548816204071, + "step": 1600 + }, + { + "epoch": 0.84, + "eval_logits/chosen": 0.5817673802375793, + "eval_logits/rejected": 0.6391910910606384, + "eval_logps/chosen": -270.3594970703125, + "eval_logps/rejected": -255.21298217773438, + "eval_loss": 2069.375732421875, + "eval_rewards/accuracies": 0.6809999942779541, + "eval_rewards/chosen": -0.137420654296875, + "eval_rewards/margins": 0.07922003418207169, + "eval_rewards/rejected": -0.2166406810283661, + "eval_runtime": 416.6975, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 1.2, + "step": 1600 + }, + { + "epoch": 0.84, + "learning_rate": 3.5844153057701303e-06, + "logits/chosen": 0.6806268095970154, + "logits/rejected": 0.6613883376121521, + "logps/chosen": -293.35455322265625, + "logps/rejected": -249.47317504882812, + "loss": 2235.1336, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.13428549468517303, + "rewards/margins": 0.06987977027893066, + "rewards/rejected": -0.2041652649641037, + "step": 1610 + }, + { + "epoch": 0.85, + "learning_rate": 3.56378672954129e-06, + "logits/chosen": 0.5934259295463562, + "logits/rejected": 0.6393053531646729, + "logps/chosen": -263.8625183105469, + "logps/rejected": -268.93646240234375, + "loss": 2115.3395, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.12568698823451996, + "rewards/margins": 0.06675116717815399, + "rewards/rejected": -0.19243815541267395, + "step": 1620 + }, + { + "epoch": 0.85, + "learning_rate": 3.5430693274266694e-06, + "logits/chosen": 0.6212111711502075, + "logits/rejected": 0.6776979565620422, + "logps/chosen": -265.48065185546875, + "logps/rejected": -242.78189086914062, + "loss": 1885.1145, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.12569081783294678, + "rewards/margins": 0.09405811876058578, + "rewards/rejected": -0.21974892914295197, + "step": 1630 + }, + { + "epoch": 0.86, + "learning_rate": 3.5222648293233806e-06, + "logits/chosen": 0.5869291424751282, + "logits/rejected": 0.614780843257904, + "logps/chosen": -309.3984069824219, + "logps/rejected": -281.76800537109375, + "loss": 2295.801, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13158050179481506, + "rewards/margins": 0.05296233296394348, + "rewards/rejected": -0.18454284965991974, + "step": 1640 + }, + { + "epoch": 0.86, + "learning_rate": 3.5013749724010298e-06, + "logits/chosen": 0.6291738152503967, + "logits/rejected": 0.6847606897354126, + "logps/chosen": -269.4018249511719, + "logps/rejected": -248.9547576904297, + "loss": 1985.4633, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.12610626220703125, + "rewards/margins": 0.08754386752843857, + "rewards/rejected": -0.21365013718605042, + "step": 1650 + }, + { + "epoch": 0.87, + "learning_rate": 3.4804015009566573e-06, + "logits/chosen": 0.6193640828132629, + "logits/rejected": 0.6583009958267212, + "logps/chosen": -260.786865234375, + "logps/rejected": -246.417724609375, + "loss": 2067.8201, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.13031774759292603, + "rewards/margins": 0.07650937139987946, + "rewards/rejected": -0.20682711899280548, + "step": 1660 + }, + { + "epoch": 0.87, + "learning_rate": 3.459346166269093e-06, + "logits/chosen": 0.569218635559082, + "logits/rejected": 0.6178910136222839, + "logps/chosen": -286.010498046875, + "logps/rejected": -284.7559509277344, + "loss": 2057.5484, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.11479117721319199, + "rewards/margins": 0.08598540723323822, + "rewards/rejected": -0.20077654719352722, + "step": 1670 + }, + { + "epoch": 0.88, + "learning_rate": 3.4382107264527244e-06, + "logits/chosen": 0.6346784234046936, + "logits/rejected": 0.7338213920593262, + "logps/chosen": -300.38739013671875, + "logps/rejected": -259.44525146484375, + "loss": 1994.4248, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.13437870144844055, + "rewards/margins": 0.08268047124147415, + "rewards/rejected": -0.2170591652393341, + "step": 1680 + }, + { + "epoch": 0.88, + "learning_rate": 3.416996946310694e-06, + "logits/chosen": 0.5468164086341858, + "logits/rejected": 0.5939579010009766, + "logps/chosen": -299.57061767578125, + "logps/rejected": -265.4569396972656, + "loss": 1896.8961, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.11913790553808212, + "rewards/margins": 0.10197613388299942, + "rewards/rejected": -0.22111406922340393, + "step": 1690 + }, + { + "epoch": 0.89, + "learning_rate": 3.3957065971875387e-06, + "logits/chosen": 0.5587860345840454, + "logits/rejected": 0.6276572346687317, + "logps/chosen": -253.8291473388672, + "logps/rejected": -239.05685424804688, + "loss": 2189.3863, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.14831021428108215, + "rewards/margins": 0.06738562881946564, + "rewards/rejected": -0.2156958281993866, + "step": 1700 + }, + { + "epoch": 0.89, + "eval_logits/chosen": 0.5722830295562744, + "eval_logits/rejected": 0.629075288772583, + "eval_logps/chosen": -270.473876953125, + "eval_logps/rejected": -255.46749877929688, + "eval_loss": 2062.199462890625, + "eval_rewards/accuracies": 0.6779999732971191, + "eval_rewards/chosen": -0.13856419920921326, + "eval_rewards/margins": 0.08062165975570679, + "eval_rewards/rejected": -0.21918585896492004, + "eval_runtime": 416.4246, + "eval_samples_per_second": 4.803, + "eval_steps_per_second": 1.201, + "step": 1700 + }, + { + "epoch": 0.9, + "learning_rate": 3.3743414568212828e-06, + "logits/chosen": 0.6158628463745117, + "logits/rejected": 0.6673662066459656, + "logps/chosen": -296.7020263671875, + "logps/rejected": -242.6101837158203, + "loss": 2082.5406, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.13253936171531677, + "rewards/margins": 0.07927088439464569, + "rewards/rejected": -0.21181027591228485, + "step": 1710 + }, + { + "epoch": 0.9, + "learning_rate": 3.352903309194999e-06, + "logits/chosen": 0.6274576187133789, + "logits/rejected": 0.6294026374816895, + "logps/chosen": -293.9241638183594, + "logps/rejected": -253.40036010742188, + "loss": 2083.2709, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.13250832259655, + "rewards/margins": 0.08279639482498169, + "rewards/rejected": -0.21530470252037048, + "step": 1720 + }, + { + "epoch": 0.91, + "learning_rate": 3.331393944387845e-06, + "logits/chosen": 0.5965205430984497, + "logits/rejected": 0.70032799243927, + "logps/chosen": -291.1014099121094, + "logps/rejected": -274.6158752441406, + "loss": 2108.5279, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.12708380818367004, + "rewards/margins": 0.08121255040168762, + "rewards/rejected": -0.20829637348651886, + "step": 1730 + }, + { + "epoch": 0.91, + "learning_rate": 3.309815158425591e-06, + "logits/chosen": 0.588997483253479, + "logits/rejected": 0.6009566783905029, + "logps/chosen": -244.7908172607422, + "logps/rejected": -238.26651000976562, + "loss": 2017.7098, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11969755589962006, + "rewards/margins": 0.07758014649152756, + "rewards/rejected": -0.19727769494056702, + "step": 1740 + }, + { + "epoch": 0.92, + "learning_rate": 3.288168753130657e-06, + "logits/chosen": 0.6095719933509827, + "logits/rejected": 0.6279308199882507, + "logps/chosen": -250.91116333007812, + "logps/rejected": -265.10302734375, + "loss": 1975.5611, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1158142238855362, + "rewards/margins": 0.08399386703968048, + "rewards/rejected": -0.19980809092521667, + "step": 1750 + }, + { + "epoch": 0.92, + "learning_rate": 3.266456535971654e-06, + "logits/chosen": 0.5891221165657043, + "logits/rejected": 0.5675392746925354, + "logps/chosen": -283.29901123046875, + "logps/rejected": -258.32562255859375, + "loss": 1991.3586, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1158837080001831, + "rewards/margins": 0.09360859543085098, + "rewards/rejected": -0.2094922959804535, + "step": 1760 + }, + { + "epoch": 0.93, + "learning_rate": 3.2446803199124666e-06, + "logits/chosen": 0.542614221572876, + "logits/rejected": 0.5660384893417358, + "logps/chosen": -260.6263427734375, + "logps/rejected": -240.3424835205078, + "loss": 2044.5988, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.12066911160945892, + "rewards/margins": 0.0799749344587326, + "rewards/rejected": -0.20064406096935272, + "step": 1770 + }, + { + "epoch": 0.93, + "learning_rate": 3.2228419232608692e-06, + "logits/chosen": 0.5963379144668579, + "logits/rejected": 0.6299723386764526, + "logps/chosen": -248.21920776367188, + "logps/rejected": -235.8214874267578, + "loss": 2095.258, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.12164044380187988, + "rewards/margins": 0.07495652884244919, + "rewards/rejected": -0.19659698009490967, + "step": 1780 + }, + { + "epoch": 0.94, + "learning_rate": 3.2009431695166985e-06, + "logits/chosen": 0.5749480724334717, + "logits/rejected": 0.627223014831543, + "logps/chosen": -239.6404266357422, + "logps/rejected": -239.62014770507812, + "loss": 1970.9955, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.11313848197460175, + "rewards/margins": 0.08247244358062744, + "rewards/rejected": -0.19561094045639038, + "step": 1790 + }, + { + "epoch": 0.94, + "learning_rate": 3.1789858872195888e-06, + "logits/chosen": 0.6324980854988098, + "logits/rejected": 0.6260117888450623, + "logps/chosen": -244.56362915039062, + "logps/rejected": -245.474609375, + "loss": 2292.8938, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.14712049067020416, + "rewards/margins": 0.05482936650514603, + "rewards/rejected": -0.2019498646259308, + "step": 1800 + }, + { + "epoch": 0.94, + "eval_logits/chosen": 0.5703141689300537, + "eval_logits/rejected": 0.627535343170166, + "eval_logps/chosen": -268.5789489746094, + "eval_logps/rejected": -253.6024627685547, + "eval_loss": 2053.1298828125, + "eval_rewards/accuracies": 0.6830000281333923, + "eval_rewards/chosen": -0.11961515992879868, + "eval_rewards/margins": 0.08092045783996582, + "eval_rewards/rejected": -0.2005356103181839, + "eval_runtime": 416.7248, + "eval_samples_per_second": 4.799, + "eval_steps_per_second": 1.2, + "step": 1800 + }, + { + "epoch": 0.95, + "learning_rate": 3.156971909796295e-06, + "logits/chosen": 0.6370185613632202, + "logits/rejected": 0.7445378303527832, + "logps/chosen": -265.6059265136719, + "logps/rejected": -232.7034454345703, + "loss": 1958.5383, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.11271758377552032, + "rewards/margins": 0.08874475955963135, + "rewards/rejected": -0.20146234333515167, + "step": 1810 + }, + { + "epoch": 0.95, + "learning_rate": 3.1349030754075945e-06, + "logits/chosen": 0.623261034488678, + "logits/rejected": 0.6591364741325378, + "logps/chosen": -263.49993896484375, + "logps/rejected": -241.6349334716797, + "loss": 1940.6969, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1059044599533081, + "rewards/margins": 0.09299680590629578, + "rewards/rejected": -0.19890126585960388, + "step": 1820 + }, + { + "epoch": 0.96, + "learning_rate": 3.1127812267948095e-06, + "logits/chosen": 0.6355741024017334, + "logits/rejected": 0.6655168533325195, + "logps/chosen": -264.20062255859375, + "logps/rejected": -258.43310546875, + "loss": 1993.0014, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.12248332798480988, + "rewards/margins": 0.08727528899908066, + "rewards/rejected": -0.20975859463214874, + "step": 1830 + }, + { + "epoch": 0.96, + "learning_rate": 3.0906082111259313e-06, + "logits/chosen": 0.548941433429718, + "logits/rejected": 0.5715293884277344, + "logps/chosen": -277.3153076171875, + "logps/rejected": -248.1029510498047, + "loss": 2108.9805, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.10435888916254044, + "rewards/margins": 0.07835109531879425, + "rewards/rejected": -0.18270999193191528, + "step": 1840 + }, + { + "epoch": 0.97, + "learning_rate": 3.068385879841389e-06, + "logits/chosen": 0.6165980100631714, + "logits/rejected": 0.6937299966812134, + "logps/chosen": -233.5325469970703, + "logps/rejected": -246.7562255859375, + "loss": 2168.8945, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.10994801670312881, + "rewards/margins": 0.06389383226633072, + "rewards/rejected": -0.17384183406829834, + "step": 1850 + }, + { + "epoch": 0.97, + "learning_rate": 3.046116088499449e-06, + "logits/chosen": 0.6379483938217163, + "logits/rejected": 0.6270259618759155, + "logps/chosen": -266.16009521484375, + "logps/rejected": -259.60919189453125, + "loss": 2145.4496, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.11401374638080597, + "rewards/margins": 0.07185572385787964, + "rewards/rejected": -0.1858694702386856, + "step": 1860 + }, + { + "epoch": 0.98, + "learning_rate": 3.02380069662128e-06, + "logits/chosen": 0.623966634273529, + "logits/rejected": 0.5938777327537537, + "logps/chosen": -252.69869995117188, + "logps/rejected": -245.094482421875, + "loss": 2034.7914, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.10203299671411514, + "rewards/margins": 0.07526172697544098, + "rewards/rejected": -0.17729471623897552, + "step": 1870 + }, + { + "epoch": 0.98, + "learning_rate": 3.0014415675356813e-06, + "logits/chosen": 0.6284725069999695, + "logits/rejected": 0.6297743916511536, + "logps/chosen": -270.23333740234375, + "logps/rejected": -252.12704467773438, + "loss": 2150.0496, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10082075744867325, + "rewards/margins": 0.06684517115354538, + "rewards/rejected": -0.16766592860221863, + "step": 1880 + }, + { + "epoch": 0.99, + "learning_rate": 2.979040568223498e-06, + "logits/chosen": 0.5534299612045288, + "logits/rejected": 0.675399661064148, + "logps/chosen": -263.3745422363281, + "logps/rejected": -266.8883972167969, + "loss": 2205.4939, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.11857882887125015, + "rewards/margins": 0.06667140126228333, + "rewards/rejected": -0.18525022268295288, + "step": 1890 + }, + { + "epoch": 0.99, + "learning_rate": 2.9565995691617242e-06, + "logits/chosen": 0.6073340773582458, + "logits/rejected": 0.6487486362457275, + "logps/chosen": -292.90704345703125, + "logps/rejected": -238.2965545654297, + "loss": 2085.5805, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.09591428190469742, + "rewards/margins": 0.08266115188598633, + "rewards/rejected": -0.17857542634010315, + "step": 1900 + }, + { + "epoch": 0.99, + "eval_logits/chosen": 0.5747328996658325, + "eval_logits/rejected": 0.6318737864494324, + "eval_logps/chosen": -267.4730224609375, + "eval_logps/rejected": -252.61306762695312, + "eval_loss": 2052.32373046875, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": -0.10855603218078613, + "eval_rewards/margins": 0.08208546042442322, + "eval_rewards/rejected": -0.19064147770404816, + "eval_runtime": 416.81, + "eval_samples_per_second": 4.798, + "eval_steps_per_second": 1.2, + "step": 1900 + }, + { + "epoch": 1.0, + "learning_rate": 2.9341204441673267e-06, + "logits/chosen": 0.5886205434799194, + "logits/rejected": 0.6063315272331238, + "logps/chosen": -281.51129150390625, + "logps/rejected": -263.0057678222656, + "loss": 1992.2906, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.10820697247982025, + "rewards/margins": 0.08012684434652328, + "rewards/rejected": -0.18833380937576294, + "step": 1910 + }, + { + "epoch": 1.0, + "learning_rate": 2.9116050702407706e-06, + "logits/chosen": 0.6380060315132141, + "logits/rejected": 0.6841104030609131, + "logps/chosen": -267.55145263671875, + "logps/rejected": -248.828369140625, + "loss": 2045.3801, + "rewards/accuracies": 0.6604167222976685, + "rewards/chosen": -0.10555033385753632, + "rewards/margins": 0.08623509109020233, + "rewards/rejected": -0.19178542494773865, + "step": 1920 + }, + { + "epoch": 1.01, + "learning_rate": 2.889055327409301e-06, + "logits/chosen": 0.5285671353340149, + "logits/rejected": 0.5704804062843323, + "logps/chosen": -263.24725341796875, + "logps/rejected": -248.02395629882812, + "loss": 2023.134, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.10273389518260956, + "rewards/margins": 0.08054333180189133, + "rewards/rejected": -0.1832772046327591, + "step": 1930 + }, + { + "epoch": 1.02, + "learning_rate": 2.8664730985699537e-06, + "logits/chosen": 0.5331718325614929, + "logits/rejected": 0.5988043546676636, + "logps/chosen": -242.79061889648438, + "logps/rejected": -238.0269012451172, + "loss": 1960.2693, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09900447726249695, + "rewards/margins": 0.0868852287530899, + "rewards/rejected": -0.18588972091674805, + "step": 1940 + }, + { + "epoch": 1.02, + "learning_rate": 2.843860269332339e-06, + "logits/chosen": 0.6072074174880981, + "logits/rejected": 0.631058394908905, + "logps/chosen": -273.4151306152344, + "logps/rejected": -246.41238403320312, + "loss": 1955.907, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09934862703084946, + "rewards/margins": 0.09240168333053589, + "rewards/rejected": -0.19175033271312714, + "step": 1950 + }, + { + "epoch": 1.03, + "learning_rate": 2.8212187278611907e-06, + "logits/chosen": 0.6683967113494873, + "logits/rejected": 0.6856907606124878, + "logps/chosen": -257.2086181640625, + "logps/rejected": -247.91683959960938, + "loss": 1924.8703, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.1071348562836647, + "rewards/margins": 0.09893598407506943, + "rewards/rejected": -0.20607082545757294, + "step": 1960 + }, + { + "epoch": 1.03, + "learning_rate": 2.7985503647187063e-06, + "logits/chosen": 0.5825555920600891, + "logits/rejected": 0.6476297378540039, + "logps/chosen": -288.1867980957031, + "logps/rejected": -254.5727996826172, + "loss": 1907.2662, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.08619461953639984, + "rewards/margins": 0.10769355297088623, + "rewards/rejected": -0.19388815760612488, + "step": 1970 + }, + { + "epoch": 1.04, + "learning_rate": 2.7758570727066843e-06, + "logits/chosen": 0.5205335021018982, + "logits/rejected": 0.6433119177818298, + "logps/chosen": -261.37982177734375, + "logps/rejected": -240.10952758789062, + "loss": 2241.4863, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.12351379543542862, + "rewards/margins": 0.05891970917582512, + "rewards/rejected": -0.18243351578712463, + "step": 1980 + }, + { + "epoch": 1.04, + "learning_rate": 2.753140746708477e-06, + "logits/chosen": 0.6216637492179871, + "logits/rejected": 0.669810950756073, + "logps/chosen": -282.500244140625, + "logps/rejected": -273.12310791015625, + "loss": 1961.7119, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.09472217410802841, + "rewards/margins": 0.09439438581466675, + "rewards/rejected": -0.18911656737327576, + "step": 1990 + }, + { + "epoch": 1.05, + "learning_rate": 2.730403283530767e-06, + "logits/chosen": 0.638060986995697, + "logits/rejected": 0.7034865617752075, + "logps/chosen": -258.02447509765625, + "logps/rejected": -246.9062957763672, + "loss": 1847.759, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09892908483743668, + "rewards/margins": 0.10315445810556412, + "rewards/rejected": -0.2020835429430008, + "step": 2000 + }, + { + "epoch": 1.05, + "eval_logits/chosen": 0.5763067603111267, + "eval_logits/rejected": 0.6332587599754333, + "eval_logps/chosen": -267.7949523925781, + "eval_logps/rejected": -253.0826873779297, + "eval_loss": 2050.417724609375, + "eval_rewards/accuracies": 0.6850000023841858, + "eval_rewards/chosen": -0.11177488416433334, + "eval_rewards/margins": 0.0835629403591156, + "eval_rewards/rejected": -0.19533783197402954, + "eval_runtime": 416.585, + "eval_samples_per_second": 4.801, + "eval_steps_per_second": 1.2, + "step": 2000 + }, + { + "epoch": 1.05, + "learning_rate": 2.707646581745188e-06, + "logits/chosen": 0.6024230718612671, + "logits/rejected": 0.6550949811935425, + "logps/chosen": -275.99761962890625, + "logps/rejected": -272.4120178222656, + "loss": 1981.8529, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.0975821316242218, + "rewards/margins": 0.08785782754421234, + "rewards/rejected": -0.18543997406959534, + "step": 2010 + }, + { + "epoch": 1.06, + "learning_rate": 2.6848725415297888e-06, + "logits/chosen": 0.629960298538208, + "logits/rejected": 0.6249616742134094, + "logps/chosen": -256.87603759765625, + "logps/rejected": -253.21109008789062, + "loss": 2136.5217, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.10265711694955826, + "rewards/margins": 0.0642293393611908, + "rewards/rejected": -0.16688646376132965, + "step": 2020 + }, + { + "epoch": 1.06, + "learning_rate": 2.6620830645103753e-06, + "logits/chosen": 0.6109344959259033, + "logits/rejected": 0.6072026491165161, + "logps/chosen": -266.4075012207031, + "logps/rejected": -258.20208740234375, + "loss": 1938.8361, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.09916529804468155, + "rewards/margins": 0.09021677076816559, + "rewards/rejected": -0.18938204646110535, + "step": 2030 + }, + { + "epoch": 1.07, + "learning_rate": 2.639280053601719e-06, + "logits/chosen": 0.566746711730957, + "logits/rejected": 0.6063026189804077, + "logps/chosen": -261.76739501953125, + "logps/rejected": -270.6283874511719, + "loss": 2085.5938, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.12018795311450958, + "rewards/margins": 0.08117975294589996, + "rewards/rejected": -0.20136770606040955, + "step": 2040 + }, + { + "epoch": 1.07, + "learning_rate": 2.6164654128486683e-06, + "logits/chosen": 0.5058253407478333, + "logits/rejected": 0.6028685569763184, + "logps/chosen": -267.67376708984375, + "logps/rejected": -230.2966766357422, + "loss": 2055.1498, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11003967374563217, + "rewards/margins": 0.08447955548763275, + "rewards/rejected": -0.19451923668384552, + "step": 2050 + }, + { + "epoch": 1.08, + "learning_rate": 2.59364104726716e-06, + "logits/chosen": 0.5947778820991516, + "logits/rejected": 0.5901384353637695, + "logps/chosen": -278.0001525878906, + "logps/rejected": -242.13583374023438, + "loss": 1877.0174, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11745290458202362, + "rewards/margins": 0.10013137012720108, + "rewards/rejected": -0.2175842821598053, + "step": 2060 + }, + { + "epoch": 1.08, + "learning_rate": 2.5708088626851546e-06, + "logits/chosen": 0.5502884387969971, + "logits/rejected": 0.603992760181427, + "logps/chosen": -269.38360595703125, + "logps/rejected": -244.87619018554688, + "loss": 2015.0283, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.12011172622442245, + "rewards/margins": 0.08565986156463623, + "rewards/rejected": -0.20577159523963928, + "step": 2070 + }, + { + "epoch": 1.09, + "learning_rate": 2.547970765583491e-06, + "logits/chosen": 0.5619412064552307, + "logits/rejected": 0.6468341946601868, + "logps/chosen": -252.68115234375, + "logps/rejected": -252.705322265625, + "loss": 1855.9893, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10235454142093658, + "rewards/margins": 0.10716482251882553, + "rewards/rejected": -0.20951935648918152, + "step": 2080 + }, + { + "epoch": 1.09, + "learning_rate": 2.525128662936707e-06, + "logits/chosen": 0.512058436870575, + "logits/rejected": 0.5677643418312073, + "logps/chosen": -270.7825012207031, + "logps/rejected": -260.822509765625, + "loss": 1831.3346, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.1064097136259079, + "rewards/margins": 0.10622493177652359, + "rewards/rejected": -0.21263465285301208, + "step": 2090 + }, + { + "epoch": 1.1, + "learning_rate": 2.502284462053799e-06, + "logits/chosen": 0.620409369468689, + "logits/rejected": 0.6358670592308044, + "logps/chosen": -258.42706298828125, + "logps/rejected": -258.72161865234375, + "loss": 2024.9559, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.10622759163379669, + "rewards/margins": 0.08639432489871979, + "rewards/rejected": -0.19262190163135529, + "step": 2100 + }, + { + "epoch": 1.1, + "eval_logits/chosen": 0.5589507818222046, + "eval_logits/rejected": 0.6156801581382751, + "eval_logps/chosen": -268.8072814941406, + "eval_logps/rejected": -254.37991333007812, + "eval_loss": 2046.75927734375, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": -0.12189868092536926, + "eval_rewards/margins": 0.08641137927770615, + "eval_rewards/rejected": -0.2083100527524948, + "eval_runtime": 416.7259, + "eval_samples_per_second": 4.799, + "eval_steps_per_second": 1.2, + "step": 2100 + }, + { + "epoch": 1.1, + "learning_rate": 2.479440070418967e-06, + "logits/chosen": 0.5901846885681152, + "logits/rejected": 0.6195170283317566, + "logps/chosen": -249.45816040039062, + "logps/rejected": -253.7944793701172, + "loss": 2205.6043, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.13660283386707306, + "rewards/margins": 0.06599629670381546, + "rewards/rejected": -0.20259912312030792, + "step": 2110 + }, + { + "epoch": 1.11, + "learning_rate": 2.456597395532338e-06, + "logits/chosen": 0.5504690408706665, + "logits/rejected": 0.6531665921211243, + "logps/chosen": -259.79010009765625, + "logps/rejected": -284.0751647949219, + "loss": 1959.0818, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.11810295283794403, + "rewards/margins": 0.0964241549372673, + "rewards/rejected": -0.21452713012695312, + "step": 2120 + }, + { + "epoch": 1.11, + "learning_rate": 2.433758344750691e-06, + "logits/chosen": 0.5741318464279175, + "logits/rejected": 0.6458116173744202, + "logps/chosen": -295.03192138671875, + "logps/rejected": -276.64251708984375, + "loss": 1911.3146, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.11703141778707504, + "rewards/margins": 0.10501817613840103, + "rewards/rejected": -0.2220495641231537, + "step": 2130 + }, + { + "epoch": 1.12, + "learning_rate": 2.4109248251281953e-06, + "logits/chosen": 0.5908122062683105, + "logits/rejected": 0.6558480858802795, + "logps/chosen": -283.0213928222656, + "logps/rejected": -252.56600952148438, + "loss": 1911.8793, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11346153914928436, + "rewards/margins": 0.10086224228143692, + "rewards/rejected": -0.2143237590789795, + "step": 2140 + }, + { + "epoch": 1.13, + "learning_rate": 2.3880987432571675e-06, + "logits/chosen": 0.5616129040718079, + "logits/rejected": 0.593204915523529, + "logps/chosen": -268.16583251953125, + "logps/rejected": -262.17755126953125, + "loss": 1994.0697, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.11974780261516571, + "rewards/margins": 0.09457085281610489, + "rewards/rejected": -0.2143186628818512, + "step": 2150 + }, + { + "epoch": 1.13, + "learning_rate": 2.365282005108875e-06, + "logits/chosen": 0.5762392282485962, + "logits/rejected": 0.615722119808197, + "logps/chosen": -250.62509155273438, + "logps/rejected": -253.91049194335938, + "loss": 2099.1633, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.1276218444108963, + "rewards/margins": 0.07122951745986938, + "rewards/rejected": -0.19885137677192688, + "step": 2160 + }, + { + "epoch": 1.14, + "learning_rate": 2.3424765158743867e-06, + "logits/chosen": 0.6059794425964355, + "logits/rejected": 0.6645799875259399, + "logps/chosen": -255.7693634033203, + "logps/rejected": -251.96951293945312, + "loss": 2010.217, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.12624487280845642, + "rewards/margins": 0.09581606835126877, + "rewards/rejected": -0.2220609486103058, + "step": 2170 + }, + { + "epoch": 1.14, + "learning_rate": 2.319684179805491e-06, + "logits/chosen": 0.516992449760437, + "logits/rejected": 0.5456847548484802, + "logps/chosen": -265.10321044921875, + "logps/rejected": -246.3701629638672, + "loss": 1933.4984, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.11627723276615143, + "rewards/margins": 0.0940864160656929, + "rewards/rejected": -0.21036362648010254, + "step": 2180 + }, + { + "epoch": 1.15, + "learning_rate": 2.296906900055691e-06, + "logits/chosen": 0.596808135509491, + "logits/rejected": 0.6393652558326721, + "logps/chosen": -264.455810546875, + "logps/rejected": -256.40667724609375, + "loss": 2172.6984, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.12352782487869263, + "rewards/margins": 0.07605434954166412, + "rewards/rejected": -0.19958215951919556, + "step": 2190 + }, + { + "epoch": 1.15, + "learning_rate": 2.2741465785212905e-06, + "logits/chosen": 0.5940336585044861, + "logits/rejected": 0.6305769085884094, + "logps/chosen": -256.6434326171875, + "logps/rejected": -245.391357421875, + "loss": 2038.6354, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11867289245128632, + "rewards/margins": 0.09147666394710541, + "rewards/rejected": -0.21014957129955292, + "step": 2200 + }, + { + "epoch": 1.15, + "eval_logits/chosen": 0.5517618656158447, + "eval_logits/rejected": 0.6082795858383179, + "eval_logps/chosen": -268.6722106933594, + "eval_logps/rejected": -254.27310180664062, + "eval_loss": 2043.57275390625, + "eval_rewards/accuracies": 0.6880000233650208, + "eval_rewards/chosen": -0.12054779380559921, + "eval_rewards/margins": 0.08669425547122955, + "eval_rewards/rejected": -0.20724207162857056, + "eval_runtime": 416.7766, + "eval_samples_per_second": 4.799, + "eval_steps_per_second": 1.2, + "step": 2200 + }, + { + "epoch": 1.16, + "learning_rate": 2.251405115682587e-06, + "logits/chosen": 0.5902246236801147, + "logits/rejected": 0.5983418822288513, + "logps/chosen": -263.2071228027344, + "logps/rejected": -272.0802307128906, + "loss": 2017.8775, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.11642640829086304, + "rewards/margins": 0.09181423485279083, + "rewards/rejected": -0.20824062824249268, + "step": 2210 + }, + { + "epoch": 1.16, + "learning_rate": 2.2286844104451848e-06, + "logits/chosen": 0.5431746244430542, + "logits/rejected": 0.6418278217315674, + "logps/chosen": -264.33465576171875, + "logps/rejected": -251.9208984375, + "loss": 2095.6342, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.10533900558948517, + "rewards/margins": 0.07776842266321182, + "rewards/rejected": -0.1831074208021164, + "step": 2220 + }, + { + "epoch": 1.17, + "learning_rate": 2.205986359981431e-06, + "logits/chosen": 0.5207514762878418, + "logits/rejected": 0.6270573139190674, + "logps/chosen": -285.76849365234375, + "logps/rejected": -277.93426513671875, + "loss": 1912.0979, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.11343447118997574, + "rewards/margins": 0.1047770231962204, + "rewards/rejected": -0.21821150183677673, + "step": 2230 + }, + { + "epoch": 1.17, + "learning_rate": 2.183312859572008e-06, + "logits/chosen": 0.5806037783622742, + "logits/rejected": 0.6543610095977783, + "logps/chosen": -281.1925354003906, + "logps/rejected": -278.2233581542969, + "loss": 2272.857, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11976996809244156, + "rewards/margins": 0.06387045979499817, + "rewards/rejected": -0.18364043533802032, + "step": 2240 + }, + { + "epoch": 1.18, + "learning_rate": 2.1606658024476744e-06, + "logits/chosen": 0.5554038286209106, + "logits/rejected": 0.5429580211639404, + "logps/chosen": -269.9796447753906, + "logps/rejected": -250.5005340576172, + "loss": 2188.1607, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1146581768989563, + "rewards/margins": 0.07278671860694885, + "rewards/rejected": -0.18744489550590515, + "step": 2250 + }, + { + "epoch": 1.18, + "learning_rate": 2.1380470796311843e-06, + "logits/chosen": 0.610127866268158, + "logits/rejected": 0.6246207356452942, + "logps/chosen": -274.421142578125, + "logps/rejected": -259.88336181640625, + "loss": 1878.2621, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.10982956737279892, + "rewards/margins": 0.10056765377521515, + "rewards/rejected": -0.21039721369743347, + "step": 2260 + }, + { + "epoch": 1.19, + "learning_rate": 2.1154585797793826e-06, + "logits/chosen": 0.6410681009292603, + "logits/rejected": 0.6446506977081299, + "logps/chosen": -262.7099304199219, + "logps/rejected": -243.33847045898438, + "loss": 1969.0021, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.10027629137039185, + "rewards/margins": 0.08689162135124207, + "rewards/rejected": -0.1871679127216339, + "step": 2270 + }, + { + "epoch": 1.19, + "learning_rate": 2.092902189025507e-06, + "logits/chosen": 0.6191312670707703, + "logits/rejected": 0.6812275648117065, + "logps/chosen": -258.3789978027344, + "logps/rejected": -247.3417510986328, + "loss": 1765.025, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.10864460468292236, + "rewards/margins": 0.11250102519989014, + "rewards/rejected": -0.2211456298828125, + "step": 2280 + }, + { + "epoch": 1.2, + "learning_rate": 2.070379790821693e-06, + "logits/chosen": 0.5654376745223999, + "logits/rejected": 0.6462022066116333, + "logps/chosen": -301.2412414550781, + "logps/rejected": -276.8460998535156, + "loss": 2045.4492, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10371474176645279, + "rewards/margins": 0.09003494679927826, + "rewards/rejected": -0.19374969601631165, + "step": 2290 + }, + { + "epoch": 1.2, + "learning_rate": 2.0478932657817105e-06, + "logits/chosen": 0.5810787081718445, + "logits/rejected": 0.6360457539558411, + "logps/chosen": -254.06838989257812, + "logps/rejected": -243.23989868164062, + "loss": 2022.9617, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11232425272464752, + "rewards/margins": 0.07984773069620132, + "rewards/rejected": -0.19217197597026825, + "step": 2300 + }, + { + "epoch": 1.2, + "eval_logits/chosen": 0.5535383820533752, + "eval_logits/rejected": 0.6101322174072266, + "eval_logps/chosen": -268.3490905761719, + "eval_logps/rejected": -253.95965576171875, + "eval_loss": 2035.585693359375, + "eval_rewards/accuracies": 0.6894999742507935, + "eval_rewards/chosen": -0.11731643229722977, + "eval_rewards/margins": 0.08679118007421494, + "eval_rewards/rejected": -0.2041076123714447, + "eval_runtime": 416.4094, + "eval_samples_per_second": 4.803, + "eval_steps_per_second": 1.201, + "step": 2300 + }, + { + "epoch": 1.21, + "learning_rate": 2.0254444915239287e-06, + "logits/chosen": 0.5468884706497192, + "logits/rejected": 0.5753307938575745, + "logps/chosen": -271.94940185546875, + "logps/rejected": -244.8318328857422, + "loss": 1994.7408, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.1074294000864029, + "rewards/margins": 0.08415937423706055, + "rewards/rejected": -0.19158877432346344, + "step": 2310 + }, + { + "epoch": 1.21, + "learning_rate": 2.0030353425145376e-06, + "logits/chosen": 0.6623051762580872, + "logits/rejected": 0.6782322525978088, + "logps/chosen": -220.7380828857422, + "logps/rejected": -242.05850219726562, + "loss": 1864.7361, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.10245666652917862, + "rewards/margins": 0.10949740558862686, + "rewards/rejected": -0.2119540423154831, + "step": 2320 + }, + { + "epoch": 1.22, + "learning_rate": 1.9806676899110305e-06, + "logits/chosen": 0.6308891773223877, + "logits/rejected": 0.6477428674697876, + "logps/chosen": -262.88897705078125, + "logps/rejected": -255.49362182617188, + "loss": 1843.8875, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.10279623419046402, + "rewards/margins": 0.11021213233470917, + "rewards/rejected": -0.21300837397575378, + "step": 2330 + }, + { + "epoch": 1.22, + "learning_rate": 1.958343401405964e-06, + "logits/chosen": 0.5211482048034668, + "logits/rejected": 0.6105703115463257, + "logps/chosen": -272.09698486328125, + "logps/rejected": -240.05392456054688, + "loss": 1991.6119, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.12355583906173706, + "rewards/margins": 0.08727772533893585, + "rewards/rejected": -0.2108335793018341, + "step": 2340 + }, + { + "epoch": 1.23, + "learning_rate": 1.9360643410710027e-06, + "logits/chosen": 0.6230972409248352, + "logits/rejected": 0.6428076028823853, + "logps/chosen": -297.76300048828125, + "logps/rejected": -262.3421325683594, + "loss": 2047.1437, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.10469541698694229, + "rewards/margins": 0.09010159224271774, + "rewards/rejected": -0.19479700922966003, + "step": 2350 + }, + { + "epoch": 1.24, + "learning_rate": 1.9138323692012734e-06, + "logits/chosen": 0.5903237462043762, + "logits/rejected": 0.6455060243606567, + "logps/chosen": -288.16815185546875, + "logps/rejected": -289.3240966796875, + "loss": 1579.4779, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.08982647955417633, + "rewards/margins": 0.14532434940338135, + "rewards/rejected": -0.23515084385871887, + "step": 2360 + }, + { + "epoch": 1.24, + "learning_rate": 1.8916493421600287e-06, + "logits/chosen": 0.5603612065315247, + "logits/rejected": 0.5792626142501831, + "logps/chosen": -243.1230010986328, + "logps/rejected": -257.9002685546875, + "loss": 2017.2086, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.12239034473896027, + "rewards/margins": 0.0815977230668068, + "rewards/rejected": -0.20398807525634766, + "step": 2370 + }, + { + "epoch": 1.25, + "learning_rate": 1.8695171122236443e-06, + "logits/chosen": 0.49118170142173767, + "logits/rejected": 0.5506534576416016, + "logps/chosen": -268.86822509765625, + "logps/rejected": -274.64202880859375, + "loss": 1977.4205, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.11895406246185303, + "rewards/margins": 0.10293842852115631, + "rewards/rejected": -0.22189247608184814, + "step": 2380 + }, + { + "epoch": 1.25, + "learning_rate": 1.84743752742695e-06, + "logits/chosen": 0.6215322613716125, + "logits/rejected": 0.6151038408279419, + "logps/chosen": -265.5515441894531, + "logps/rejected": -278.13177490234375, + "loss": 1889.5199, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.10073964297771454, + "rewards/margins": 0.11166242510080338, + "rewards/rejected": -0.2124020755290985, + "step": 2390 + }, + { + "epoch": 1.26, + "learning_rate": 1.8254124314089225e-06, + "logits/chosen": 0.6138418912887573, + "logits/rejected": 0.6189366579055786, + "logps/chosen": -263.2386169433594, + "logps/rejected": -239.09963989257812, + "loss": 1871.641, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.12337759882211685, + "rewards/margins": 0.09867466986179352, + "rewards/rejected": -0.22205229103565216, + "step": 2400 + }, + { + "epoch": 1.26, + "eval_logits/chosen": 0.5482152104377747, + "eval_logits/rejected": 0.6045916676521301, + "eval_logps/chosen": -268.51605224609375, + "eval_logps/rejected": -254.28311157226562, + "eval_loss": 2036.3372802734375, + "eval_rewards/accuracies": 0.6894999742507935, + "eval_rewards/chosen": -0.11898616701364517, + "eval_rewards/margins": 0.0883559137582779, + "eval_rewards/rejected": -0.20734207332134247, + "eval_runtime": 416.6781, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 1.2, + "step": 2400 + }, + { + "epoch": 1.26, + "learning_rate": 1.8034436632587394e-06, + "logits/chosen": 0.5728852152824402, + "logits/rejected": 0.6265703439712524, + "logps/chosen": -237.0697784423828, + "logps/rejected": -242.29367065429688, + "loss": 1969.1203, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.10528033971786499, + "rewards/margins": 0.0880698561668396, + "rewards/rejected": -0.1933501809835434, + "step": 2410 + }, + { + "epoch": 1.27, + "learning_rate": 1.781533057362221e-06, + "logits/chosen": 0.5749053359031677, + "logits/rejected": 0.6042163372039795, + "logps/chosen": -278.8114013671875, + "logps/rejected": -279.18695068359375, + "loss": 1906.6891, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1059660091996193, + "rewards/margins": 0.10437663644552231, + "rewards/rejected": -0.210342675447464, + "step": 2420 + }, + { + "epoch": 1.27, + "learning_rate": 1.7596824432486537e-06, + "logits/chosen": 0.5984959602355957, + "logits/rejected": 0.6386197209358215, + "logps/chosen": -292.53143310546875, + "logps/rejected": -256.42620849609375, + "loss": 2003.0641, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.10370471328496933, + "rewards/margins": 0.09288345277309418, + "rewards/rejected": -0.1965881586074829, + "step": 2430 + }, + { + "epoch": 1.28, + "learning_rate": 1.7378936454380277e-06, + "logits/chosen": 0.5537322163581848, + "logits/rejected": 0.5942158102989197, + "logps/chosen": -246.1141815185547, + "logps/rejected": -253.85617065429688, + "loss": 2137.652, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13521219789981842, + "rewards/margins": 0.0802813172340393, + "rewards/rejected": -0.21549351513385773, + "step": 2440 + }, + { + "epoch": 1.28, + "learning_rate": 1.7161684832886893e-06, + "logits/chosen": 0.5406220555305481, + "logits/rejected": 0.540827751159668, + "logps/chosen": -242.9103546142578, + "logps/rejected": -247.41921997070312, + "loss": 2005.6266, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12078257650136948, + "rewards/margins": 0.09046939015388489, + "rewards/rejected": -0.21125197410583496, + "step": 2450 + }, + { + "epoch": 1.29, + "learning_rate": 1.6945087708454273e-06, + "logits/chosen": 0.5730911493301392, + "logits/rejected": 0.5966663956642151, + "logps/chosen": -276.0887145996094, + "logps/rejected": -264.39910888671875, + "loss": 2211.1135, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.12395117431879044, + "rewards/margins": 0.07060922682285309, + "rewards/rejected": -0.19456037878990173, + "step": 2460 + }, + { + "epoch": 1.29, + "learning_rate": 1.6729163166879964e-06, + "logits/chosen": 0.5936635136604309, + "logits/rejected": 0.6355383396148682, + "logps/chosen": -258.3261413574219, + "logps/rejected": -233.99075317382812, + "loss": 1757.0482, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1076178103685379, + "rewards/margins": 0.11206640303134918, + "rewards/rejected": -0.21968421339988708, + "step": 2470 + }, + { + "epoch": 1.3, + "learning_rate": 1.651392923780105e-06, + "logits/chosen": 0.6025252342224121, + "logits/rejected": 0.6687902808189392, + "logps/chosen": -254.2070770263672, + "logps/rejected": -237.63967895507812, + "loss": 2055.3113, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1099240779876709, + "rewards/margins": 0.09202177077531815, + "rewards/rejected": -0.20194585621356964, + "step": 2480 + }, + { + "epoch": 1.3, + "learning_rate": 1.629940389318867e-06, + "logits/chosen": 0.5291022062301636, + "logits/rejected": 0.616036593914032, + "logps/chosen": -294.7336730957031, + "logps/rejected": -240.91796875, + "loss": 1902.4217, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.10780209302902222, + "rewards/margins": 0.10337891429662704, + "rewards/rejected": -0.21118099987506866, + "step": 2490 + }, + { + "epoch": 1.31, + "learning_rate": 1.608560504584737e-06, + "logits/chosen": 0.5608310103416443, + "logits/rejected": 0.6271076798439026, + "logps/chosen": -256.45770263671875, + "logps/rejected": -253.8623809814453, + "loss": 1907.3463, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.10689397901296616, + "rewards/margins": 0.10536620765924454, + "rewards/rejected": -0.2122601717710495, + "step": 2500 + }, + { + "epoch": 1.31, + "eval_logits/chosen": 0.5460030436515808, + "eval_logits/rejected": 0.6022311449050903, + "eval_logps/chosen": -268.7764587402344, + "eval_logps/rejected": -254.62974548339844, + "eval_loss": 2034.7010498046875, + "eval_rewards/accuracies": 0.6880000233650208, + "eval_rewards/chosen": -0.12159038335084915, + "eval_rewards/margins": 0.08921793848276138, + "eval_rewards/rejected": -0.21080833673477173, + "eval_runtime": 416.6626, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 1.2, + "step": 2500 + }, + { + "epoch": 1.31, + "learning_rate": 1.587255054791937e-06, + "logits/chosen": 0.5321905016899109, + "logits/rejected": 0.589474081993103, + "logps/chosen": -281.2105407714844, + "logps/rejected": -264.56298828125, + "loss": 2016.9854, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.10593251138925552, + "rewards/margins": 0.08659417182207108, + "rewards/rejected": -0.1925266534090042, + "step": 2510 + }, + { + "epoch": 1.32, + "learning_rate": 1.5660258189393945e-06, + "logits/chosen": 0.5880864262580872, + "logits/rejected": 0.6149991750717163, + "logps/chosen": -251.75973510742188, + "logps/rejected": -262.3134765625, + "loss": 2130.8975, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.12219718843698502, + "rewards/margins": 0.08161304891109467, + "rewards/rejected": -0.2038102149963379, + "step": 2520 + }, + { + "epoch": 1.32, + "learning_rate": 1.5448745696621915e-06, + "logits/chosen": 0.5654980540275574, + "logits/rejected": 0.6478559970855713, + "logps/chosen": -272.79864501953125, + "logps/rejected": -258.56402587890625, + "loss": 2114.0654, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.11762849986553192, + "rewards/margins": 0.08065593242645264, + "rewards/rejected": -0.19828443229198456, + "step": 2530 + }, + { + "epoch": 1.33, + "learning_rate": 1.5238030730835578e-06, + "logits/chosen": 0.5662246942520142, + "logits/rejected": 0.6270356178283691, + "logps/chosen": -272.17449951171875, + "logps/rejected": -237.2474365234375, + "loss": 2106.9717, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11701546609401703, + "rewards/margins": 0.0775529146194458, + "rewards/rejected": -0.19456836581230164, + "step": 2540 + }, + { + "epoch": 1.33, + "learning_rate": 1.5028130886673936e-06, + "logits/chosen": 0.5928006172180176, + "logits/rejected": 0.641442060470581, + "logps/chosen": -263.0660705566406, + "logps/rejected": -252.6539306640625, + "loss": 2000.1971, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.111760713160038, + "rewards/margins": 0.0842764601111412, + "rewards/rejected": -0.1960371732711792, + "step": 2550 + }, + { + "epoch": 1.34, + "learning_rate": 1.4819063690713565e-06, + "logits/chosen": 0.5778559446334839, + "logits/rejected": 0.6045337915420532, + "logps/chosen": -284.07061767578125, + "logps/rejected": -270.4360656738281, + "loss": 1938.5168, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.12265495210886002, + "rewards/margins": 0.09922391176223755, + "rewards/rejected": -0.22187885642051697, + "step": 2560 + }, + { + "epoch": 1.35, + "learning_rate": 1.4610846600005164e-06, + "logits/chosen": 0.6385133862495422, + "logits/rejected": 0.6164069175720215, + "logps/chosen": -291.48590087890625, + "logps/rejected": -241.2799072265625, + "loss": 2040.5221, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.11285148561000824, + "rewards/margins": 0.08863021433353424, + "rewards/rejected": -0.20148172974586487, + "step": 2570 + }, + { + "epoch": 1.35, + "learning_rate": 1.4403497000615885e-06, + "logits/chosen": 0.6022018194198608, + "logits/rejected": 0.6375949382781982, + "logps/chosen": -250.8367156982422, + "logps/rejected": -242.37881469726562, + "loss": 2072.085, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.11340691894292831, + "rewards/margins": 0.07300657033920288, + "rewards/rejected": -0.1864134818315506, + "step": 2580 + }, + { + "epoch": 1.36, + "learning_rate": 1.4197032206177618e-06, + "logits/chosen": 0.6561594605445862, + "logits/rejected": 0.7297431230545044, + "logps/chosen": -249.74887084960938, + "logps/rejected": -237.26779174804688, + "loss": 2067.224, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.12197699397802353, + "rewards/margins": 0.08562152087688446, + "rewards/rejected": -0.2075985223054886, + "step": 2590 + }, + { + "epoch": 1.36, + "learning_rate": 1.3991469456441273e-06, + "logits/chosen": 0.6001744270324707, + "logits/rejected": 0.6553865075111389, + "logps/chosen": -252.2941436767578, + "logps/rejected": -243.48025512695312, + "loss": 1884.6086, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.11671899259090424, + "rewards/margins": 0.10958864539861679, + "rewards/rejected": -0.22630766034126282, + "step": 2600 + }, + { + "epoch": 1.36, + "eval_logits/chosen": 0.545119047164917, + "eval_logits/rejected": 0.601308286190033, + "eval_logps/chosen": -268.77081298828125, + "eval_logps/rejected": -254.6013946533203, + "eval_loss": 2033.7977294921875, + "eval_rewards/accuracies": 0.6909999847412109, + "eval_rewards/chosen": -0.12153391540050507, + "eval_rewards/margins": 0.08899100124835968, + "eval_rewards/rejected": -0.21052493155002594, + "eval_runtime": 416.4661, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 1.201, + "step": 2600 + }, + { + "epoch": 1.37, + "learning_rate": 1.3786825915837299e-06, + "logits/chosen": 0.6044927835464478, + "logits/rejected": 0.608493447303772, + "logps/chosen": -268.0179138183594, + "logps/rejected": -251.26168823242188, + "loss": 1763.491, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.08965936303138733, + "rewards/margins": 0.12351739406585693, + "rewards/rejected": -0.21317675709724426, + "step": 2610 + }, + { + "epoch": 1.37, + "learning_rate": 1.3583118672042441e-06, + "logits/chosen": 0.5879210233688354, + "logits/rejected": 0.6254302263259888, + "logps/chosen": -268.9471740722656, + "logps/rejected": -246.9387664794922, + "loss": 1806.3043, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.09920786321163177, + "rewards/margins": 0.1189170852303505, + "rewards/rejected": -0.21812494099140167, + "step": 2620 + }, + { + "epoch": 1.38, + "learning_rate": 1.3380364734552935e-06, + "logits/chosen": 0.6040158867835999, + "logits/rejected": 0.6454821825027466, + "logps/chosen": -239.55313110351562, + "logps/rejected": -252.41641235351562, + "loss": 1881.491, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11133377254009247, + "rewards/margins": 0.10438641160726547, + "rewards/rejected": -0.21572017669677734, + "step": 2630 + }, + { + "epoch": 1.38, + "learning_rate": 1.3178581033264218e-06, + "logits/chosen": 0.5422452688217163, + "logits/rejected": 0.5576962232589722, + "logps/chosen": -267.02020263671875, + "logps/rejected": -233.59628295898438, + "loss": 1922.1236, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.1054287701845169, + "rewards/margins": 0.10195982456207275, + "rewards/rejected": -0.20738859474658966, + "step": 2640 + }, + { + "epoch": 1.39, + "learning_rate": 1.2977784417057262e-06, + "logits/chosen": 0.5648713111877441, + "logits/rejected": 0.5970919132232666, + "logps/chosen": -266.724365234375, + "logps/rejected": -252.4540557861328, + "loss": 1788.8666, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.11531468480825424, + "rewards/margins": 0.11205389350652695, + "rewards/rejected": -0.22736859321594238, + "step": 2650 + }, + { + "epoch": 1.39, + "learning_rate": 1.2777991652391757e-06, + "logits/chosen": 0.5807424783706665, + "logits/rejected": 0.64664626121521, + "logps/chosen": -253.5209197998047, + "logps/rejected": -251.5238800048828, + "loss": 2035.5062, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.12570129334926605, + "rewards/margins": 0.09053059667348862, + "rewards/rejected": -0.21623189747333527, + "step": 2660 + }, + { + "epoch": 1.4, + "learning_rate": 1.2579219421906049e-06, + "logits/chosen": 0.612740159034729, + "logits/rejected": 0.6295909285545349, + "logps/chosen": -275.30938720703125, + "logps/rejected": -246.85986328125, + "loss": 1918.4975, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.11256787925958633, + "rewards/margins": 0.1016065701842308, + "rewards/rejected": -0.21417441964149475, + "step": 2670 + }, + { + "epoch": 1.4, + "learning_rate": 1.2381484323024178e-06, + "logits/chosen": 0.5338586568832397, + "logits/rejected": 0.614523708820343, + "logps/chosen": -248.32406616210938, + "logps/rejected": -234.80862426757812, + "loss": 2030.9229, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12608769536018372, + "rewards/margins": 0.09196772426366806, + "rewards/rejected": -0.21805541217327118, + "step": 2680 + }, + { + "epoch": 1.41, + "learning_rate": 1.2184802866569991e-06, + "logits/chosen": 0.5740771889686584, + "logits/rejected": 0.5626708269119263, + "logps/chosen": -256.43524169921875, + "logps/rejected": -254.8041534423828, + "loss": 1881.1102, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.12152848392724991, + "rewards/margins": 0.1093037948012352, + "rewards/rejected": -0.2308322638273239, + "step": 2690 + }, + { + "epoch": 1.41, + "learning_rate": 1.1989191475388518e-06, + "logits/chosen": 0.5784533023834229, + "logits/rejected": 0.5800845623016357, + "logps/chosen": -261.47900390625, + "logps/rejected": -261.27618408203125, + "loss": 2034.9129, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.12217319011688232, + "rewards/margins": 0.08460931479930878, + "rewards/rejected": -0.2067825049161911, + "step": 2700 + }, + { + "epoch": 1.41, + "eval_logits/chosen": 0.5425635576248169, + "eval_logits/rejected": 0.5986801385879517, + "eval_logps/chosen": -268.96331787109375, + "eval_logps/rejected": -254.94712829589844, + "eval_loss": 2032.544677734375, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": -0.12345867604017258, + "eval_rewards/margins": 0.0905236080288887, + "eval_rewards/rejected": -0.21398229897022247, + "eval_runtime": 416.8138, + "eval_samples_per_second": 4.798, + "eval_steps_per_second": 1.2, + "step": 2700 + }, + { + "epoch": 1.42, + "learning_rate": 1.1794666482974617e-06, + "logits/chosen": 0.5704789161682129, + "logits/rejected": 0.6782268285751343, + "logps/chosen": -282.65875244140625, + "logps/rejected": -257.2977600097656, + "loss": 1989.7289, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.11434787511825562, + "rewards/margins": 0.0926680713891983, + "rewards/rejected": -0.20701594650745392, + "step": 2710 + }, + { + "epoch": 1.42, + "learning_rate": 1.160124413210918e-06, + "logits/chosen": 0.5337072014808655, + "logits/rejected": 0.5367878675460815, + "logps/chosen": -264.4056701660156, + "logps/rejected": -245.18301391601562, + "loss": 1913.7168, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.10953982174396515, + "rewards/margins": 0.10163428634405136, + "rewards/rejected": -0.2111741304397583, + "step": 2720 + }, + { + "epoch": 1.43, + "learning_rate": 1.1408940573502838e-06, + "logits/chosen": 0.5485426783561707, + "logits/rejected": 0.6499109864234924, + "logps/chosen": -264.18505859375, + "logps/rejected": -238.2395477294922, + "loss": 1907.4213, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12582936882972717, + "rewards/margins": 0.09968879073858261, + "rewards/rejected": -0.22551818192005157, + "step": 2730 + }, + { + "epoch": 1.43, + "learning_rate": 1.1217771864447396e-06, + "logits/chosen": 0.5939881205558777, + "logits/rejected": 0.6111994981765747, + "logps/chosen": -261.0160827636719, + "logps/rejected": -244.2744140625, + "loss": 2013.2748, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.12591084837913513, + "rewards/margins": 0.08752218633890152, + "rewards/rejected": -0.21343302726745605, + "step": 2740 + }, + { + "epoch": 1.44, + "learning_rate": 1.1027753967475046e-06, + "logits/chosen": 0.5890164375305176, + "logits/rejected": 0.6029259562492371, + "logps/chosen": -259.98382568359375, + "logps/rejected": -254.6918487548828, + "loss": 1947.2754, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.1243789941072464, + "rewards/margins": 0.09525910019874573, + "rewards/rejected": -0.21963807940483093, + "step": 2750 + }, + { + "epoch": 1.44, + "learning_rate": 1.08389027490255e-06, + "logits/chosen": 0.5902668833732605, + "logits/rejected": 0.6080381274223328, + "logps/chosen": -248.3684844970703, + "logps/rejected": -263.76776123046875, + "loss": 2085.1875, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.13539178669452667, + "rewards/margins": 0.07983563095331192, + "rewards/rejected": -0.215227410197258, + "step": 2760 + }, + { + "epoch": 1.45, + "learning_rate": 1.0651233978121145e-06, + "logits/chosen": 0.5521366596221924, + "logits/rejected": 0.5906900763511658, + "logps/chosen": -300.05230712890625, + "logps/rejected": -272.1240234375, + "loss": 1883.4229, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.11396439373493195, + "rewards/margins": 0.09765832126140594, + "rewards/rejected": -0.2116227149963379, + "step": 2770 + }, + { + "epoch": 1.46, + "learning_rate": 1.046476332505036e-06, + "logits/chosen": 0.6347781419754028, + "logits/rejected": 0.6657929420471191, + "logps/chosen": -250.8879852294922, + "logps/rejected": -223.33255004882812, + "loss": 1819.1854, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11131460964679718, + "rewards/margins": 0.11518070846796036, + "rewards/rejected": -0.22649531066417694, + "step": 2780 + }, + { + "epoch": 1.46, + "learning_rate": 1.0279506360059005e-06, + "logits/chosen": 0.5551185011863708, + "logits/rejected": 0.5792326331138611, + "logps/chosen": -262.4249572753906, + "logps/rejected": -267.63763427734375, + "loss": 2209.3523, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.12959793210029602, + "rewards/margins": 0.06600113213062286, + "rewards/rejected": -0.19559906423091888, + "step": 2790 + }, + { + "epoch": 1.47, + "learning_rate": 1.0095478552050348e-06, + "logits/chosen": 0.6019959449768066, + "logits/rejected": 0.6074908971786499, + "logps/chosen": -273.5536193847656, + "logps/rejected": -267.5385437011719, + "loss": 2068.2822, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.13250732421875, + "rewards/margins": 0.09504042565822601, + "rewards/rejected": -0.2275477647781372, + "step": 2800 + }, + { + "epoch": 1.47, + "eval_logits/chosen": 0.5382584929466248, + "eval_logits/rejected": 0.5942660570144653, + "eval_logps/chosen": -269.1269836425781, + "eval_logps/rejected": -255.16705322265625, + "eval_loss": 2030.8697509765625, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": -0.1250954419374466, + "eval_rewards/margins": 0.09108588099479675, + "eval_rewards/rejected": -0.21618132293224335, + "eval_runtime": 416.6389, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 1.2, + "step": 2800 + }, + { + "epoch": 1.47, + "learning_rate": 9.912695267293383e-07, + "logits/chosen": 0.5214653015136719, + "logits/rejected": 0.5876752734184265, + "logps/chosen": -265.47882080078125, + "logps/rejected": -239.1663360595703, + "loss": 1950.4564, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.10924456268548965, + "rewards/margins": 0.09817437827587128, + "rewards/rejected": -0.20741891860961914, + "step": 2810 + }, + { + "epoch": 1.48, + "learning_rate": 9.731171768139808e-07, + "logits/chosen": 0.6136573553085327, + "logits/rejected": 0.6188865900039673, + "logps/chosen": -284.9826965332031, + "logps/rejected": -261.3671875, + "loss": 2210.9309, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.12047699838876724, + "rewards/margins": 0.06873573362827301, + "rewards/rejected": -0.18921272456645966, + "step": 2820 + }, + { + "epoch": 1.48, + "learning_rate": 9.550923211749557e-07, + "logits/chosen": 0.5326896905899048, + "logits/rejected": 0.5845073461532593, + "logps/chosen": -260.52069091796875, + "logps/rejected": -268.2504577636719, + "loss": 2028.4584, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.11890892684459686, + "rewards/margins": 0.08525559306144714, + "rewards/rejected": -0.2041645348072052, + "step": 2830 + }, + { + "epoch": 1.49, + "learning_rate": 9.371964648825221e-07, + "logits/chosen": 0.6162235736846924, + "logits/rejected": 0.5728213787078857, + "logps/chosen": -272.336181640625, + "logps/rejected": -252.33346557617188, + "loss": 1941.8066, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.10710117965936661, + "rewards/margins": 0.10685823112726212, + "rewards/rejected": -0.21395941078662872, + "step": 2840 + }, + { + "epoch": 1.49, + "learning_rate": 9.194311022355279e-07, + "logits/chosen": 0.5015624761581421, + "logits/rejected": 0.5448901057243347, + "logps/chosen": -276.95538330078125, + "logps/rejected": -250.8726348876953, + "loss": 1832.3256, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.10919035971164703, + "rewards/margins": 0.11640901863574982, + "rewards/rejected": -0.22559937834739685, + "step": 2850 + }, + { + "epoch": 1.5, + "learning_rate": 9.017977166366445e-07, + "logits/chosen": 0.5708821415901184, + "logits/rejected": 0.558485209941864, + "logps/chosen": -258.96807861328125, + "logps/rejected": -263.4228210449219, + "loss": 1947.8791, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11477123200893402, + "rewards/margins": 0.09229175001382828, + "rewards/rejected": -0.2070629894733429, + "step": 2860 + }, + { + "epoch": 1.5, + "learning_rate": 8.842977804684938e-07, + "logits/chosen": 0.5845485925674438, + "logits/rejected": 0.6778086423873901, + "logps/chosen": -245.46102905273438, + "logps/rejected": -233.6326141357422, + "loss": 2066.3828, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.12042129039764404, + "rewards/margins": 0.08121231943368912, + "rewards/rejected": -0.20163361728191376, + "step": 2870 + }, + { + "epoch": 1.51, + "learning_rate": 8.669327549707096e-07, + "logits/chosen": 0.5501264929771423, + "logits/rejected": 0.6287878751754761, + "logps/chosen": -281.37371826171875, + "logps/rejected": -252.8254852294922, + "loss": 1868.0959, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1148761659860611, + "rewards/margins": 0.1036820039153099, + "rewards/rejected": -0.2185581624507904, + "step": 2880 + }, + { + "epoch": 1.51, + "learning_rate": 8.497040901179232e-07, + "logits/chosen": 0.5025564432144165, + "logits/rejected": 0.5421415567398071, + "logps/chosen": -276.861572265625, + "logps/rejected": -267.47723388671875, + "loss": 1753.2416, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.10830195993185043, + "rewards/margins": 0.12366169691085815, + "rewards/rejected": -0.2319636344909668, + "step": 2890 + }, + { + "epoch": 1.52, + "learning_rate": 8.326132244986932e-07, + "logits/chosen": 0.6039875745773315, + "logits/rejected": 0.6563787460327148, + "logps/chosen": -282.04266357421875, + "logps/rejected": -257.56622314453125, + "loss": 1977.4029, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.12028930336236954, + "rewards/margins": 0.09056351333856583, + "rewards/rejected": -0.21085281670093536, + "step": 2900 + }, + { + "epoch": 1.52, + "eval_logits/chosen": 0.5381389260292053, + "eval_logits/rejected": 0.5940784811973572, + "eval_logps/chosen": -269.125244140625, + "eval_logps/rejected": -255.16897583007812, + "eval_loss": 2030.603271484375, + "eval_rewards/accuracies": 0.6894999742507935, + "eval_rewards/chosen": -0.12507818639278412, + "eval_rewards/margins": 0.09112255275249481, + "eval_rewards/rejected": -0.21620073914527893, + "eval_runtime": 416.5425, + "eval_samples_per_second": 4.801, + "eval_steps_per_second": 1.2, + "step": 2900 + }, + { + "epoch": 1.52, + "learning_rate": 8.156615851953798e-07, + "logits/chosen": 0.559486448764801, + "logits/rejected": 0.5794366598129272, + "logps/chosen": -256.5633239746094, + "logps/rejected": -259.57696533203125, + "loss": 1948.7941, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.10746540874242783, + "rewards/margins": 0.10354423522949219, + "rewards/rejected": -0.2110096514225006, + "step": 2910 + }, + { + "epoch": 1.53, + "learning_rate": 7.988505876649863e-07, + "logits/chosen": 0.6247807741165161, + "logits/rejected": 0.6010321974754333, + "logps/chosen": -271.8721923828125, + "logps/rejected": -256.86480712890625, + "loss": 2099.2482, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.12578730285167694, + "rewards/margins": 0.084382563829422, + "rewards/rejected": -0.21016988158226013, + "step": 2920 + }, + { + "epoch": 1.53, + "learning_rate": 7.821816356209677e-07, + "logits/chosen": 0.5775936841964722, + "logits/rejected": 0.6070097088813782, + "logps/chosen": -272.50653076171875, + "logps/rejected": -251.46243286132812, + "loss": 2020.2645, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.10367625951766968, + "rewards/margins": 0.08927679061889648, + "rewards/rejected": -0.19295303523540497, + "step": 2930 + }, + { + "epoch": 1.54, + "learning_rate": 7.656561209160248e-07, + "logits/chosen": 0.521769642829895, + "logits/rejected": 0.5275167226791382, + "logps/chosen": -289.0915222167969, + "logps/rejected": -263.7314453125, + "loss": 1947.6189, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.12388887256383896, + "rewards/margins": 0.1033380776643753, + "rewards/rejected": -0.22722692787647247, + "step": 2940 + }, + { + "epoch": 1.54, + "learning_rate": 7.492754234258794e-07, + "logits/chosen": 0.5926128625869751, + "logits/rejected": 0.6193209886550903, + "logps/chosen": -241.3407440185547, + "logps/rejected": -225.17724609375, + "loss": 1876.9375, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.10547138750553131, + "rewards/margins": 0.10478665679693222, + "rewards/rejected": -0.21025805175304413, + "step": 2950 + }, + { + "epoch": 1.55, + "learning_rate": 7.330409109340563e-07, + "logits/chosen": 0.5721119046211243, + "logits/rejected": 0.5718821287155151, + "logps/chosen": -267.39471435546875, + "logps/rejected": -244.5928192138672, + "loss": 2126.3432, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.1287916600704193, + "rewards/margins": 0.07899868488311768, + "rewards/rejected": -0.2077903300523758, + "step": 2960 + }, + { + "epoch": 1.55, + "learning_rate": 7.169539390176769e-07, + "logits/chosen": 0.5741583704948425, + "logits/rejected": 0.5660156011581421, + "logps/chosen": -219.59640502929688, + "logps/rejected": -233.7797088623047, + "loss": 1845.3854, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.12960752844810486, + "rewards/margins": 0.10263533890247345, + "rewards/rejected": -0.23224285244941711, + "step": 2970 + }, + { + "epoch": 1.56, + "learning_rate": 7.010158509342682e-07, + "logits/chosen": 0.5922077298164368, + "logits/rejected": 0.6388793587684631, + "logps/chosen": -258.38946533203125, + "logps/rejected": -236.8799285888672, + "loss": 1677.0656, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.11209128051996231, + "rewards/margins": 0.13626167178153992, + "rewards/rejected": -0.24835292994976044, + "step": 2980 + }, + { + "epoch": 1.57, + "learning_rate": 6.852279775095976e-07, + "logits/chosen": 0.6180992722511292, + "logits/rejected": 0.6189014911651611, + "logps/chosen": -272.6584167480469, + "logps/rejected": -247.75033569335938, + "loss": 1925.8682, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.11567474901676178, + "rewards/margins": 0.09763548523187637, + "rewards/rejected": -0.21331021189689636, + "step": 2990 + }, + { + "epoch": 1.57, + "learning_rate": 6.695916370265529e-07, + "logits/chosen": 0.6014515161514282, + "logits/rejected": 0.5875986814498901, + "logps/chosen": -265.0668029785156, + "logps/rejected": -241.8825225830078, + "loss": 2110.2887, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.1249975711107254, + "rewards/margins": 0.07782919704914093, + "rewards/rejected": -0.20282676815986633, + "step": 3000 + }, + { + "epoch": 1.57, + "eval_logits/chosen": 0.5348395109176636, + "eval_logits/rejected": 0.5908406972885132, + "eval_logps/chosen": -269.2049865722656, + "eval_logps/rejected": -255.2820587158203, + "eval_loss": 2030.5706787109375, + "eval_rewards/accuracies": 0.690500020980835, + "eval_rewards/chosen": -0.12587547302246094, + "eval_rewards/margins": 0.09145611524581909, + "eval_rewards/rejected": -0.21733158826828003, + "eval_runtime": 416.6652, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 1.2, + "step": 3000 + }, + { + "epoch": 1.58, + "learning_rate": 6.541081351150638e-07, + "logits/chosen": 0.5409640669822693, + "logits/rejected": 0.5331202149391174, + "logps/chosen": -279.83941650390625, + "logps/rejected": -291.9646301269531, + "loss": 2035.416, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.10843896865844727, + "rewards/margins": 0.09788022935390472, + "rewards/rejected": -0.2063191831111908, + "step": 3010 + }, + { + "epoch": 1.58, + "learning_rate": 6.387787646430854e-07, + "logits/chosen": 0.5321037769317627, + "logits/rejected": 0.5582699775695801, + "logps/chosen": -267.9813232421875, + "logps/rejected": -264.12567138671875, + "loss": 2006.4391, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.10174393653869629, + "rewards/margins": 0.08532971143722534, + "rewards/rejected": -0.18707364797592163, + "step": 3020 + }, + { + "epoch": 1.59, + "learning_rate": 6.2360480560864e-07, + "logits/chosen": 0.5698617696762085, + "logits/rejected": 0.5839768648147583, + "logps/chosen": -251.5703125, + "logps/rejected": -235.4780731201172, + "loss": 1821.7498, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.10450971126556396, + "rewards/margins": 0.10978861898183823, + "rewards/rejected": -0.2142982929944992, + "step": 3030 + }, + { + "epoch": 1.59, + "learning_rate": 6.085875250329401e-07, + "logits/chosen": 0.5382856726646423, + "logits/rejected": 0.6018794178962708, + "logps/chosen": -304.603271484375, + "logps/rejected": -263.90570068359375, + "loss": 1820.3352, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.10189330577850342, + "rewards/margins": 0.12546256184577942, + "rewards/rejected": -0.22735583782196045, + "step": 3040 + }, + { + "epoch": 1.6, + "learning_rate": 5.937281768545919e-07, + "logits/chosen": 0.600039005279541, + "logits/rejected": 0.5895189046859741, + "logps/chosen": -288.302734375, + "logps/rejected": -266.3108215332031, + "loss": 2142.2947, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1374974250793457, + "rewards/margins": 0.08518020063638687, + "rewards/rejected": -0.22267761826515198, + "step": 3050 + }, + { + "epoch": 1.6, + "learning_rate": 5.79028001824894e-07, + "logits/chosen": 0.577072024345398, + "logits/rejected": 0.5782276391983032, + "logps/chosen": -258.71734619140625, + "logps/rejected": -252.88906860351562, + "loss": 2083.8391, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.13044390082359314, + "rewards/margins": 0.09016549587249756, + "rewards/rejected": -0.22060942649841309, + "step": 3060 + }, + { + "epoch": 1.61, + "learning_rate": 5.644882274042285e-07, + "logits/chosen": 0.5784581303596497, + "logits/rejected": 0.5805580019950867, + "logps/chosen": -286.2455139160156, + "logps/rejected": -253.4708251953125, + "loss": 1979.6775, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11061519384384155, + "rewards/margins": 0.10562906414270401, + "rewards/rejected": -0.21624425053596497, + "step": 3070 + }, + { + "epoch": 1.61, + "learning_rate": 5.501100676595761e-07, + "logits/chosen": 0.5630078911781311, + "logits/rejected": 0.5682691335678101, + "logps/chosen": -267.6597900390625, + "logps/rejected": -250.3827667236328, + "loss": 1964.8426, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1262790709733963, + "rewards/margins": 0.09482350945472717, + "rewards/rejected": -0.22110256552696228, + "step": 3080 + }, + { + "epoch": 1.62, + "learning_rate": 5.358947231631375e-07, + "logits/chosen": 0.534908652305603, + "logits/rejected": 0.5746644139289856, + "logps/chosen": -283.8402404785156, + "logps/rejected": -272.68670654296875, + "loss": 1792.1418, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.0998634472489357, + "rewards/margins": 0.1285194605588913, + "rewards/rejected": -0.2283829152584076, + "step": 3090 + }, + { + "epoch": 1.62, + "learning_rate": 5.218433808920884e-07, + "logits/chosen": 0.5141295194625854, + "logits/rejected": 0.5280352234840393, + "logps/chosen": -262.8772888183594, + "logps/rejected": -246.84671020507812, + "loss": 2068.2863, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.10809364169836044, + "rewards/margins": 0.08412571996450424, + "rewards/rejected": -0.19221936166286469, + "step": 3100 + }, + { + "epoch": 1.62, + "eval_logits/chosen": 0.5356869697570801, + "eval_logits/rejected": 0.5913118720054626, + "eval_logps/chosen": -269.03900146484375, + "eval_logps/rejected": -255.10865783691406, + "eval_loss": 2029.4173583984375, + "eval_rewards/accuracies": 0.6934999823570251, + "eval_rewards/chosen": -0.12421557307243347, + "eval_rewards/margins": 0.09138190746307373, + "eval_rewards/rejected": -0.2155974805355072, + "eval_runtime": 416.645, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 1.2, + "step": 3100 + }, + { + "epoch": 1.63, + "learning_rate": 5.07957214129464e-07, + "logits/chosen": 0.6343733072280884, + "logits/rejected": 0.6377061605453491, + "logps/chosen": -230.1392059326172, + "logps/rejected": -217.2322540283203, + "loss": 2110.5152, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.14483976364135742, + "rewards/margins": 0.082811638712883, + "rewards/rejected": -0.22765140235424042, + "step": 3110 + }, + { + "epoch": 1.63, + "learning_rate": 4.942373823661928e-07, + "logits/chosen": 0.5317670702934265, + "logits/rejected": 0.5477628707885742, + "logps/chosen": -253.5054931640625, + "logps/rejected": -295.16021728515625, + "loss": 2379.184, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.13306215405464172, + "rewards/margins": 0.05652584508061409, + "rewards/rejected": -0.18958799540996552, + "step": 3120 + }, + { + "epoch": 1.64, + "learning_rate": 4.806850312042782e-07, + "logits/chosen": 0.6451593637466431, + "logits/rejected": 0.5899637937545776, + "logps/chosen": -289.49151611328125, + "logps/rejected": -257.98443603515625, + "loss": 1992.5604, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.12486691772937775, + "rewards/margins": 0.09148009121417999, + "rewards/rejected": -0.21634697914123535, + "step": 3130 + }, + { + "epoch": 1.64, + "learning_rate": 4.6730129226114363e-07, + "logits/chosen": 0.5886529684066772, + "logits/rejected": 0.5409609079360962, + "logps/chosen": -258.1368103027344, + "logps/rejected": -249.2908935546875, + "loss": 1995.9428, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11228666454553604, + "rewards/margins": 0.08399216085672379, + "rewards/rejected": -0.19627881050109863, + "step": 3140 + }, + { + "epoch": 1.65, + "learning_rate": 4.540872830751386e-07, + "logits/chosen": 0.5374349355697632, + "logits/rejected": 0.5601732134819031, + "logps/chosen": -266.9260559082031, + "logps/rejected": -266.8581237792969, + "loss": 2206.0141, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.12373526394367218, + "rewards/margins": 0.060657333582639694, + "rewards/rejected": -0.18439260125160217, + "step": 3150 + }, + { + "epoch": 1.65, + "learning_rate": 4.4104410701222703e-07, + "logits/chosen": 0.5198964476585388, + "logits/rejected": 0.5673514008522034, + "logps/chosen": -250.6959228515625, + "logps/rejected": -233.34774780273438, + "loss": 1759.9059, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11838851869106293, + "rewards/margins": 0.12481342256069183, + "rewards/rejected": -0.24320194125175476, + "step": 3160 + }, + { + "epoch": 1.66, + "learning_rate": 4.281728531738563e-07, + "logits/chosen": 0.597510814666748, + "logits/rejected": 0.6312834620475769, + "logps/chosen": -268.5950012207031, + "logps/rejected": -250.58059692382812, + "loss": 1960.2506, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11448697745800018, + "rewards/margins": 0.09015806764364243, + "rewards/rejected": -0.204645037651062, + "step": 3170 + }, + { + "epoch": 1.66, + "learning_rate": 4.154745963060197e-07, + "logits/chosen": 0.507027268409729, + "logits/rejected": 0.5732488632202148, + "logps/chosen": -280.98382568359375, + "logps/rejected": -286.677734375, + "loss": 1965.5633, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12583956122398376, + "rewards/margins": 0.10458560287952423, + "rewards/rejected": -0.230425164103508, + "step": 3180 + }, + { + "epoch": 1.67, + "learning_rate": 4.029503967095097e-07, + "logits/chosen": 0.4729984402656555, + "logits/rejected": 0.5827825665473938, + "logps/chosen": -268.3514709472656, + "logps/rejected": -247.7762451171875, + "loss": 1878.176, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.1029181256890297, + "rewards/margins": 0.09399056434631348, + "rewards/rejected": -0.19690869748592377, + "step": 3190 + }, + { + "epoch": 1.67, + "learning_rate": 3.9060130015138863e-07, + "logits/chosen": 0.5795052647590637, + "logits/rejected": 0.6329609155654907, + "logps/chosen": -262.9614562988281, + "logps/rejected": -240.7771453857422, + "loss": 1977.8852, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1271854192018509, + "rewards/margins": 0.08980287611484528, + "rewards/rejected": -0.21698825061321259, + "step": 3200 + }, + { + "epoch": 1.67, + "eval_logits/chosen": 0.5363709926605225, + "eval_logits/rejected": 0.5920352935791016, + "eval_logps/chosen": -269.10711669921875, + "eval_logps/rejected": -255.2016143798828, + "eval_loss": 2026.12890625, + "eval_rewards/accuracies": 0.6959999799728394, + "eval_rewards/chosen": -0.1248970478773117, + "eval_rewards/margins": 0.09163003414869308, + "eval_rewards/rejected": -0.21652711927890778, + "eval_runtime": 416.4729, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 1.201, + "step": 3200 + }, + { + "epoch": 1.68, + "learning_rate": 3.784283377776651e-07, + "logits/chosen": 0.6236351728439331, + "logits/rejected": 0.630204975605011, + "logps/chosen": -267.3162536621094, + "logps/rejected": -241.71484375, + "loss": 2151.366, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.13855421543121338, + "rewards/margins": 0.07831588387489319, + "rewards/rejected": -0.21687009930610657, + "step": 3210 + }, + { + "epoch": 1.69, + "learning_rate": 3.664325260271953e-07, + "logits/chosen": 0.586463451385498, + "logits/rejected": 0.6219819784164429, + "logps/chosen": -240.7449493408203, + "logps/rejected": -260.012451171875, + "loss": 2145.4057, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.12818796932697296, + "rewards/margins": 0.07563059777021408, + "rewards/rejected": -0.20381855964660645, + "step": 3220 + }, + { + "epoch": 1.69, + "learning_rate": 3.5461486654680746e-07, + "logits/chosen": 0.5574949979782104, + "logits/rejected": 0.6360602378845215, + "logps/chosen": -262.3418884277344, + "logps/rejected": -256.82366943359375, + "loss": 2092.1422, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.10938684642314911, + "rewards/margins": 0.08223484456539154, + "rewards/rejected": -0.19162169098854065, + "step": 3230 + }, + { + "epoch": 1.7, + "learning_rate": 3.429763461076677e-07, + "logits/chosen": 0.5418500304222107, + "logits/rejected": 0.5625206828117371, + "logps/chosen": -271.43792724609375, + "logps/rejected": -255.0489501953125, + "loss": 2035.8221, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.13326099514961243, + "rewards/margins": 0.08890596777200699, + "rewards/rejected": -0.2221669703722, + "step": 3240 + }, + { + "epoch": 1.7, + "learning_rate": 3.315179365228824e-07, + "logits/chosen": 0.5612285733222961, + "logits/rejected": 0.5996168851852417, + "logps/chosen": -284.46612548828125, + "logps/rejected": -261.4803771972656, + "loss": 1981.793, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.12592165172100067, + "rewards/margins": 0.0976746454834938, + "rewards/rejected": -0.22359630465507507, + "step": 3250 + }, + { + "epoch": 1.71, + "learning_rate": 3.202405945663556e-07, + "logits/chosen": 0.5869094729423523, + "logits/rejected": 0.5811904668807983, + "logps/chosen": -273.99249267578125, + "logps/rejected": -274.64410400390625, + "loss": 2093.5598, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11422860622406006, + "rewards/margins": 0.09214137494564056, + "rewards/rejected": -0.20636996626853943, + "step": 3260 + }, + { + "epoch": 1.71, + "learning_rate": 3.09145261892895e-07, + "logits/chosen": 0.5232716798782349, + "logits/rejected": 0.6522939801216125, + "logps/chosen": -266.1856384277344, + "logps/rejected": -255.4128875732422, + "loss": 1884.3191, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.12048964202404022, + "rewards/margins": 0.1096540093421936, + "rewards/rejected": -0.23014366626739502, + "step": 3270 + }, + { + "epoch": 1.72, + "learning_rate": 2.982328649595856e-07, + "logits/chosen": 0.5434025526046753, + "logits/rejected": 0.5516559481620789, + "logps/chosen": -261.6842346191406, + "logps/rejected": -268.0997619628906, + "loss": 2020.6832, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.12252092361450195, + "rewards/margins": 0.08777900040149689, + "rewards/rejected": -0.21029992401599884, + "step": 3280 + }, + { + "epoch": 1.72, + "learning_rate": 2.8750431494843076e-07, + "logits/chosen": 0.5793955326080322, + "logits/rejected": 0.5846759676933289, + "logps/chosen": -256.3296813964844, + "logps/rejected": -258.81512451171875, + "loss": 2103.4049, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.13012662529945374, + "rewards/margins": 0.07379056513309479, + "rewards/rejected": -0.20391719043254852, + "step": 3290 + }, + { + "epoch": 1.73, + "learning_rate": 2.7696050769026954e-07, + "logits/chosen": 0.5790996551513672, + "logits/rejected": 0.5938167572021484, + "logps/chosen": -229.91567993164062, + "logps/rejected": -236.54916381835938, + "loss": 2123.3787, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.12688665091991425, + "rewards/margins": 0.07247930765151978, + "rewards/rejected": -0.19936595857143402, + "step": 3300 + }, + { + "epoch": 1.73, + "eval_logits/chosen": 0.5370410680770874, + "eval_logits/rejected": 0.5926198363304138, + "eval_logps/chosen": -269.0932922363281, + "eval_logps/rejected": -255.16659545898438, + "eval_loss": 2027.355224609375, + "eval_rewards/accuracies": 0.6930000185966492, + "eval_rewards/chosen": -0.12475859373807907, + "eval_rewards/margins": 0.09141821414232254, + "eval_rewards/rejected": -0.2161768227815628, + "eval_runtime": 416.6319, + "eval_samples_per_second": 4.8, + "eval_steps_per_second": 1.2, + "step": 3300 + }, + { + "epoch": 1.73, + "learning_rate": 2.666023235899734e-07, + "logits/chosen": 0.5439051389694214, + "logits/rejected": 0.638985276222229, + "logps/chosen": -249.70217895507812, + "logps/rejected": -246.07040405273438, + "loss": 1936.2746, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.13094648718833923, + "rewards/margins": 0.10491780191659927, + "rewards/rejected": -0.23586425185203552, + "step": 3310 + }, + { + "epoch": 1.74, + "learning_rate": 2.564306275529341e-07, + "logits/chosen": 0.5696260929107666, + "logits/rejected": 0.6271142959594727, + "logps/chosen": -288.08721923828125, + "logps/rejected": -263.6356201171875, + "loss": 1974.6082, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.11936695873737335, + "rewards/margins": 0.10857198387384415, + "rewards/rejected": -0.2279389351606369, + "step": 3320 + }, + { + "epoch": 1.74, + "learning_rate": 2.4644626891284243e-07, + "logits/chosen": 0.5715283155441284, + "logits/rejected": 0.6530539393424988, + "logps/chosen": -245.0167236328125, + "logps/rejected": -238.81460571289062, + "loss": 2066.4543, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12561528384685516, + "rewards/margins": 0.08124671876430511, + "rewards/rejected": -0.20686200261116028, + "step": 3330 + }, + { + "epoch": 1.75, + "learning_rate": 2.3665008136077332e-07, + "logits/chosen": 0.5698996186256409, + "logits/rejected": 0.6029760837554932, + "logps/chosen": -264.3577575683594, + "logps/rejected": -271.068115234375, + "loss": 2090.4994, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1360047161579132, + "rewards/margins": 0.08008682727813721, + "rewards/rejected": -0.2160915583372116, + "step": 3340 + }, + { + "epoch": 1.75, + "learning_rate": 2.2704288287556718e-07, + "logits/chosen": 0.5687640309333801, + "logits/rejected": 0.5940347909927368, + "logps/chosen": -257.6128845214844, + "logps/rejected": -248.91015625, + "loss": 2136.7539, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.12395147979259491, + "rewards/margins": 0.0794539600610733, + "rewards/rejected": -0.2034054547548294, + "step": 3350 + }, + { + "epoch": 1.76, + "learning_rate": 2.1762547565553293e-07, + "logits/chosen": 0.5388206839561462, + "logits/rejected": 0.5561047792434692, + "logps/chosen": -261.9334716796875, + "logps/rejected": -260.14556884765625, + "loss": 1989.5404, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1216350793838501, + "rewards/margins": 0.09191958606243134, + "rewards/rejected": -0.21355466544628143, + "step": 3360 + }, + { + "epoch": 1.76, + "learning_rate": 2.083986460514631e-07, + "logits/chosen": 0.5701113343238831, + "logits/rejected": 0.6196510195732117, + "logps/chosen": -251.4109344482422, + "logps/rejected": -252.1616668701172, + "loss": 1820.4641, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.12642905116081238, + "rewards/margins": 0.10934920608997345, + "rewards/rejected": -0.23577824234962463, + "step": 3370 + }, + { + "epoch": 1.77, + "learning_rate": 1.993631645009747e-07, + "logits/chosen": 0.5514119863510132, + "logits/rejected": 0.5474542379379272, + "logps/chosen": -256.45831298828125, + "logps/rejected": -227.55514526367188, + "loss": 1845.2707, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.10441195964813232, + "rewards/margins": 0.1090712919831276, + "rewards/rejected": -0.21348324418067932, + "step": 3380 + }, + { + "epoch": 1.77, + "learning_rate": 1.9051978546417715e-07, + "logits/chosen": 0.5202070474624634, + "logits/rejected": 0.5690991282463074, + "logps/chosen": -260.46600341796875, + "logps/rejected": -261.4082946777344, + "loss": 1912.7502, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.10940267145633698, + "rewards/margins": 0.10180971771478653, + "rewards/rejected": -0.2112123966217041, + "step": 3390 + }, + { + "epoch": 1.78, + "learning_rate": 1.818692473606748e-07, + "logits/chosen": 0.5479332208633423, + "logits/rejected": 0.5715588331222534, + "logps/chosen": -258.5849609375, + "logps/rejected": -264.29388427734375, + "loss": 1945.4934, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.12341777980327606, + "rewards/margins": 0.09188680350780487, + "rewards/rejected": -0.21530456840991974, + "step": 3400 + }, + { + "epoch": 1.78, + "eval_logits/chosen": 0.5352820754051208, + "eval_logits/rejected": 0.5908908247947693, + "eval_logps/chosen": -269.1009826660156, + "eval_logps/rejected": -255.1898651123047, + "eval_loss": 2025.7803955078125, + "eval_rewards/accuracies": 0.6934999823570251, + "eval_rewards/chosen": -0.12483509629964828, + "eval_rewards/margins": 0.09157437831163406, + "eval_rewards/rejected": -0.21640948951244354, + "eval_runtime": 416.572, + "eval_samples_per_second": 4.801, + "eval_steps_per_second": 1.2, + "step": 3400 + }, + { + "epoch": 1.78, + "learning_rate": 1.7341227250790989e-07, + "logits/chosen": 0.5836583375930786, + "logits/rejected": 0.632857084274292, + "logps/chosen": -245.8205108642578, + "logps/rejected": -252.48471069335938, + "loss": 1828.1664, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.10635235160589218, + "rewards/margins": 0.11655166000127792, + "rewards/rejected": -0.2229039967060089, + "step": 3410 + }, + { + "epoch": 1.79, + "learning_rate": 1.6514956706084885e-07, + "logits/chosen": 0.6221760511398315, + "logits/rejected": 0.5567342042922974, + "logps/chosen": -266.02239990234375, + "logps/rejected": -246.35385131835938, + "loss": 1826.1057, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09994658827781677, + "rewards/margins": 0.1130019798874855, + "rewards/rejected": -0.21294856071472168, + "step": 3420 + }, + { + "epoch": 1.8, + "learning_rate": 1.5708182095301867e-07, + "logits/chosen": 0.6005284190177917, + "logits/rejected": 0.6083909869194031, + "logps/chosen": -280.53741455078125, + "logps/rejected": -261.88201904296875, + "loss": 1851.2512, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.11887475103139877, + "rewards/margins": 0.10133900493383408, + "rewards/rejected": -0.22021374106407166, + "step": 3430 + }, + { + "epoch": 1.8, + "learning_rate": 1.4920970783889737e-07, + "logits/chosen": 0.5680890083312988, + "logits/rejected": 0.5507141351699829, + "logps/chosen": -271.96990966796875, + "logps/rejected": -241.1654815673828, + "loss": 2041.0072, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.1112411841750145, + "rewards/margins": 0.08822907507419586, + "rewards/rejected": -0.19947026669979095, + "step": 3440 + }, + { + "epoch": 1.81, + "learning_rate": 1.4153388503766492e-07, + "logits/chosen": 0.5438860654830933, + "logits/rejected": 0.5644111633300781, + "logps/chosen": -279.3092346191406, + "logps/rejected": -239.37167358398438, + "loss": 1966.0102, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.1369594931602478, + "rewards/margins": 0.09016064554452896, + "rewards/rejected": -0.22712013125419617, + "step": 3450 + }, + { + "epoch": 1.81, + "learning_rate": 1.340549934783164e-07, + "logits/chosen": 0.6110261082649231, + "logits/rejected": 0.6002285480499268, + "logps/chosen": -255.5424346923828, + "logps/rejected": -258.6153259277344, + "loss": 1778.4668, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.10746339708566666, + "rewards/margins": 0.12720224261283875, + "rewards/rejected": -0.23466560244560242, + "step": 3460 + }, + { + "epoch": 1.82, + "learning_rate": 1.2677365764614452e-07, + "logits/chosen": 0.6116484999656677, + "logits/rejected": 0.6142521500587463, + "logps/chosen": -251.9376983642578, + "logps/rejected": -247.8961639404297, + "loss": 1943.7922, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.12416845560073853, + "rewards/margins": 0.09600269794464111, + "rewards/rejected": -0.22017112374305725, + "step": 3470 + }, + { + "epoch": 1.82, + "learning_rate": 1.196904855305961e-07, + "logits/chosen": 0.5488280057907104, + "logits/rejected": 0.6329927444458008, + "logps/chosen": -261.445068359375, + "logps/rejected": -255.37142944335938, + "loss": 2087.1572, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11578680574893951, + "rewards/margins": 0.08292602747678757, + "rewards/rejected": -0.19871282577514648, + "step": 3480 + }, + { + "epoch": 1.83, + "learning_rate": 1.1280606857450387e-07, + "logits/chosen": 0.5712449550628662, + "logits/rejected": 0.6291993856430054, + "logps/chosen": -243.8418731689453, + "logps/rejected": -233.1319122314453, + "loss": 1809.2799, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.11564090102910995, + "rewards/margins": 0.11533119529485703, + "rewards/rejected": -0.23097209632396698, + "step": 3490 + }, + { + "epoch": 1.83, + "learning_rate": 1.0612098162470302e-07, + "logits/chosen": 0.5486131906509399, + "logits/rejected": 0.6034047603607178, + "logps/chosen": -253.94577026367188, + "logps/rejected": -243.55593872070312, + "loss": 1937.2627, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.10992947965860367, + "rewards/margins": 0.09977956861257553, + "rewards/rejected": -0.2097090482711792, + "step": 3500 + }, + { + "epoch": 1.83, + "eval_logits/chosen": 0.5346845984458923, + "eval_logits/rejected": 0.5903106927871704, + "eval_logps/chosen": -269.0877990722656, + "eval_logps/rejected": -255.17501831054688, + "eval_loss": 2027.823974609375, + "eval_rewards/accuracies": 0.6930000185966492, + "eval_rewards/chosen": -0.12470405548810959, + "eval_rewards/margins": 0.09155706316232681, + "eval_rewards/rejected": -0.2162611186504364, + "eval_runtime": 416.489, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 1.201, + "step": 3500 + }, + { + "epoch": 1.84, + "learning_rate": 9.96357828840297e-08, + "logits/chosen": 0.5791751742362976, + "logits/rejected": 0.6535072326660156, + "logps/chosen": -262.40301513671875, + "logps/rejected": -260.7125549316406, + "loss": 1964.6746, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.1026170626282692, + "rewards/margins": 0.08860354125499725, + "rewards/rejected": -0.19122058153152466, + "step": 3510 + }, + { + "epoch": 1.84, + "learning_rate": 9.335101386471285e-08, + "logits/chosen": 0.5727615356445312, + "logits/rejected": 0.5819220542907715, + "logps/chosen": -284.75616455078125, + "logps/rejected": -250.07083129882812, + "loss": 2092.7152, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.12308394908905029, + "rewards/margins": 0.08286546170711517, + "rewards/rejected": -0.20594939589500427, + "step": 3520 + }, + { + "epoch": 1.85, + "learning_rate": 8.726719934315648e-08, + "logits/chosen": 0.5491209626197815, + "logits/rejected": 0.5870348811149597, + "logps/chosen": -249.99295043945312, + "logps/rejected": -249.68679809570312, + "loss": 1912.009, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.10659299790859222, + "rewards/margins": 0.09919796884059906, + "rewards/rejected": -0.20579096674919128, + "step": 3530 + }, + { + "epoch": 1.85, + "learning_rate": 8.138484731612273e-08, + "logits/chosen": 0.6029896140098572, + "logits/rejected": 0.6406581997871399, + "logps/chosen": -256.2204284667969, + "logps/rejected": -229.89990234375, + "loss": 1913.6865, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10959267616271973, + "rewards/margins": 0.1080545037984848, + "rewards/rejected": -0.21764719486236572, + "step": 3540 + }, + { + "epoch": 1.86, + "learning_rate": 7.57044489583128e-08, + "logits/chosen": 0.5283448100090027, + "logits/rejected": 0.5756082534790039, + "logps/chosen": -266.2628479003906, + "logps/rejected": -251.70962524414062, + "loss": 2214.9162, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.12659896910190582, + "rewards/margins": 0.06956067681312561, + "rewards/rejected": -0.19615966081619263, + "step": 3550 + }, + { + "epoch": 1.86, + "learning_rate": 7.022647858135501e-08, + "logits/chosen": 0.5648905038833618, + "logits/rejected": 0.5851987600326538, + "logps/chosen": -255.75704956054688, + "logps/rejected": -235.11666870117188, + "loss": 1911.8838, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.10966980457305908, + "rewards/margins": 0.09904567152261734, + "rewards/rejected": -0.20871546864509583, + "step": 3560 + }, + { + "epoch": 1.87, + "learning_rate": 6.495139359419922e-08, + "logits/chosen": 0.5362564921379089, + "logits/rejected": 0.629612922668457, + "logps/chosen": -303.055419921875, + "logps/rejected": -271.39288330078125, + "loss": 1875.824, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.1098160371184349, + "rewards/margins": 0.11708301305770874, + "rewards/rejected": -0.22689905762672424, + "step": 3570 + }, + { + "epoch": 1.87, + "learning_rate": 5.987963446492384e-08, + "logits/chosen": 0.5597736239433289, + "logits/rejected": 0.5716227889060974, + "logps/chosen": -262.5528564453125, + "logps/rejected": -251.4736785888672, + "loss": 1887.5355, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.10405333340167999, + "rewards/margins": 0.11675725132226944, + "rewards/rejected": -0.22081057727336884, + "step": 3580 + }, + { + "epoch": 1.88, + "learning_rate": 5.501162468395688e-08, + "logits/chosen": 0.5817372798919678, + "logits/rejected": 0.5784239768981934, + "logps/chosen": -251.4989013671875, + "logps/rejected": -250.26522827148438, + "loss": 1920.6814, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.12673336267471313, + "rewards/margins": 0.11053230613470078, + "rewards/rejected": -0.2372656762599945, + "step": 3590 + }, + { + "epoch": 1.88, + "learning_rate": 5.034777072871394e-08, + "logits/chosen": 0.5656172037124634, + "logits/rejected": 0.6273232102394104, + "logps/chosen": -250.90109252929688, + "logps/rejected": -256.27178955078125, + "loss": 2007.2062, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.1188703402876854, + "rewards/margins": 0.09515853226184845, + "rewards/rejected": -0.21402888000011444, + "step": 3600 + }, + { + "epoch": 1.88, + "eval_logits/chosen": 0.5352125763893127, + "eval_logits/rejected": 0.5910032391548157, + "eval_logps/chosen": -269.0622863769531, + "eval_logps/rejected": -255.18426513671875, + "eval_loss": 2025.32275390625, + "eval_rewards/accuracies": 0.6894999742507935, + "eval_rewards/chosen": -0.12444862723350525, + "eval_rewards/margins": 0.0919048860669136, + "eval_rewards/rejected": -0.21635350584983826, + "eval_runtime": 416.4513, + "eval_samples_per_second": 4.802, + "eval_steps_per_second": 1.201, + "step": 3600 + }, + { + "epoch": 1.89, + "learning_rate": 4.5888462029658186e-08, + "logits/chosen": 0.5575802326202393, + "logits/rejected": 0.5975883603096008, + "logps/chosen": -251.73623657226562, + "logps/rejected": -250.5563201904297, + "loss": 1952.7262, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.12859514355659485, + "rewards/margins": 0.0945589691400528, + "rewards/rejected": -0.22315411269664764, + "step": 3610 + }, + { + "epoch": 1.89, + "learning_rate": 4.163407093778243e-08, + "logits/chosen": 0.5034095048904419, + "logits/rejected": 0.5554597973823547, + "logps/chosen": -264.1334533691406, + "logps/rejected": -260.69805908203125, + "loss": 2102.8385, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1385246217250824, + "rewards/margins": 0.08275660127401352, + "rewards/rejected": -0.22128121554851532, + "step": 3620 + }, + { + "epoch": 1.9, + "learning_rate": 3.7584952693519025e-08, + "logits/chosen": 0.5984662175178528, + "logits/rejected": 0.5975054502487183, + "logps/chosen": -270.71661376953125, + "logps/rejected": -260.6340026855469, + "loss": 1962.2367, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11859796196222305, + "rewards/margins": 0.0891033262014389, + "rewards/rejected": -0.20770128071308136, + "step": 3630 + }, + { + "epoch": 1.91, + "learning_rate": 3.37414453970758e-08, + "logits/chosen": 0.5739267468452454, + "logits/rejected": 0.577027440071106, + "logps/chosen": -249.9311065673828, + "logps/rejected": -226.3584442138672, + "loss": 2134.9982, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.13262119889259338, + "rewards/margins": 0.08364128321409225, + "rewards/rejected": -0.21626248955726624, + "step": 3640 + }, + { + "epoch": 1.91, + "learning_rate": 3.0103869980206145e-08, + "logits/chosen": 0.6304140090942383, + "logits/rejected": 0.63347989320755, + "logps/chosen": -239.28915405273438, + "logps/rejected": -258.83660888671875, + "loss": 2049.8613, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.13652729988098145, + "rewards/margins": 0.08251725137233734, + "rewards/rejected": -0.2190445363521576, + "step": 3650 + }, + { + "epoch": 1.92, + "learning_rate": 2.6672530179410183e-08, + "logits/chosen": 0.5943226218223572, + "logits/rejected": 0.6503596305847168, + "logps/chosen": -264.0231018066406, + "logps/rejected": -245.2039031982422, + "loss": 2003.8398, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.12203504145145416, + "rewards/margins": 0.09293092787265778, + "rewards/rejected": -0.21496596932411194, + "step": 3660 + }, + { + "epoch": 1.92, + "learning_rate": 2.3447712510573928e-08, + "logits/chosen": 0.6132981777191162, + "logits/rejected": 0.6643080115318298, + "logps/chosen": -258.1444091796875, + "logps/rejected": -241.70986938476562, + "loss": 1827.9928, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12529726326465607, + "rewards/margins": 0.11168257147073746, + "rewards/rejected": -0.23697984218597412, + "step": 3670 + }, + { + "epoch": 1.93, + "learning_rate": 2.04296862450451e-08, + "logits/chosen": 0.5302231907844543, + "logits/rejected": 0.5352843999862671, + "logps/chosen": -270.13079833984375, + "logps/rejected": -240.57931518554688, + "loss": 2135.2258, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.10986328125, + "rewards/margins": 0.07277282327413559, + "rewards/rejected": -0.182636097073555, + "step": 3680 + }, + { + "epoch": 1.93, + "learning_rate": 1.7618703387147495e-08, + "logits/chosen": 0.5543524622917175, + "logits/rejected": 0.5519607663154602, + "logps/chosen": -281.1496276855469, + "logps/rejected": -274.3504943847656, + "loss": 1951.3572, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.10969813168048859, + "rewards/margins": 0.09588075429201126, + "rewards/rejected": -0.20557889342308044, + "step": 3690 + }, + { + "epoch": 1.94, + "learning_rate": 1.501499865314171e-08, + "logits/chosen": 0.5777779817581177, + "logits/rejected": 0.6056709289550781, + "logps/chosen": -258.98333740234375, + "logps/rejected": -245.7554473876953, + "loss": 2076.715, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.11231188476085663, + "rewards/margins": 0.09381435066461563, + "rewards/rejected": -0.20612624287605286, + "step": 3700 + }, + { + "epoch": 1.94, + "eval_logits/chosen": 0.5358251333236694, + "eval_logits/rejected": 0.5913307666778564, + "eval_logps/chosen": -269.0487365722656, + "eval_logps/rejected": -255.13833618164062, + "eval_loss": 2027.4857177734375, + "eval_rewards/accuracies": 0.6919999718666077, + "eval_rewards/chosen": -0.12431324273347855, + "eval_rewards/margins": 0.09158134460449219, + "eval_rewards/rejected": -0.21589456498622894, + "eval_runtime": 416.7132, + "eval_samples_per_second": 4.799, + "eval_steps_per_second": 1.2, + "step": 3700 + }, + { + "epoch": 1.94, + "learning_rate": 1.2618789451623314e-08, + "logits/chosen": 0.5645478963851929, + "logits/rejected": 0.6109569072723389, + "logps/chosen": -224.93197631835938, + "logps/rejected": -236.52059936523438, + "loss": 2014.5852, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.12038487195968628, + "rewards/margins": 0.08836686611175537, + "rewards/rejected": -0.20875172317028046, + "step": 3710 + }, + { + "epoch": 1.95, + "learning_rate": 1.0430275865371265e-08, + "logits/chosen": 0.5508732795715332, + "logits/rejected": 0.6115376353263855, + "logps/chosen": -280.9700927734375, + "logps/rejected": -276.9327087402344, + "loss": 2099.5396, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12365047633647919, + "rewards/margins": 0.08925069868564606, + "rewards/rejected": -0.21290118992328644, + "step": 3720 + }, + { + "epoch": 1.95, + "learning_rate": 8.449640634639878e-09, + "logits/chosen": 0.5355272889137268, + "logits/rejected": 0.5811390280723572, + "logps/chosen": -234.78927612304688, + "logps/rejected": -228.4997100830078, + "loss": 2043.5014, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1230451837182045, + "rewards/margins": 0.08070604503154755, + "rewards/rejected": -0.20375123620033264, + "step": 3730 + }, + { + "epoch": 1.96, + "learning_rate": 6.677049141901315e-09, + "logits/chosen": 0.5882354974746704, + "logits/rejected": 0.572884738445282, + "logps/chosen": -238.9161834716797, + "logps/rejected": -247.5298309326172, + "loss": 2155.4453, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.13735826313495636, + "rewards/margins": 0.06890521943569183, + "rewards/rejected": -0.2062634974718094, + "step": 3740 + }, + { + "epoch": 1.96, + "learning_rate": 5.112649398034686e-09, + "logits/chosen": 0.6161108016967773, + "logits/rejected": 0.6904915571212769, + "logps/chosen": -284.59112548828125, + "logps/rejected": -254.5317840576172, + "loss": 2025.8148, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.125542551279068, + "rewards/margins": 0.1034855991601944, + "rewards/rejected": -0.2290281355381012, + "step": 3750 + }, + { + "epoch": 1.97, + "learning_rate": 3.756572029968708e-09, + "logits/chosen": 0.5819270610809326, + "logits/rejected": 0.552914023399353, + "logps/chosen": -255.23886108398438, + "logps/rejected": -249.76144409179688, + "loss": 1779.8414, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11097989976406097, + "rewards/margins": 0.1095919981598854, + "rewards/rejected": -0.22057190537452698, + "step": 3760 + }, + { + "epoch": 1.97, + "learning_rate": 2.6089302697732133e-09, + "logits/chosen": 0.5825963020324707, + "logits/rejected": 0.5435997843742371, + "logps/chosen": -250.9562225341797, + "logps/rejected": -227.83010864257812, + "loss": 1853.0563, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.11256451904773712, + "rewards/margins": 0.10355798900127411, + "rewards/rejected": -0.21612253785133362, + "step": 3770 + }, + { + "epoch": 1.98, + "learning_rate": 1.6698199452053199e-09, + "logits/chosen": 0.6074197292327881, + "logits/rejected": 0.6451854705810547, + "logps/chosen": -269.6044616699219, + "logps/rejected": -231.6178436279297, + "loss": 1905.3814, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.10596567392349243, + "rewards/margins": 0.099820576608181, + "rewards/rejected": -0.20578625798225403, + "step": 3780 + }, + { + "epoch": 1.98, + "learning_rate": 9.393194717061127e-10, + "logits/chosen": 0.5966477394104004, + "logits/rejected": 0.57940673828125, + "logps/chosen": -261.906982421875, + "logps/rejected": -243.7214813232422, + "loss": 2099.1896, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12468767166137695, + "rewards/margins": 0.0848483294248581, + "rewards/rejected": -0.20953598618507385, + "step": 3790 + }, + { + "epoch": 1.99, + "learning_rate": 4.1748984585560094e-10, + "logits/chosen": 0.5209355354309082, + "logits/rejected": 0.6011817455291748, + "logps/chosen": -257.47882080078125, + "logps/rejected": -253.18017578125, + "loss": 2055.2201, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.12113461643457413, + "rewards/margins": 0.09108567237854004, + "rewards/rejected": -0.21222028136253357, + "step": 3800 + }, + { + "epoch": 1.99, + "eval_logits/chosen": 0.5346859693527222, + "eval_logits/rejected": 0.5902337431907654, + "eval_logps/chosen": -269.0542907714844, + "eval_logps/rejected": -255.1454620361328, + "eval_loss": 2027.8082275390625, + "eval_rewards/accuracies": 0.6919999718666077, + "eval_rewards/chosen": -0.12436838448047638, + "eval_rewards/margins": 0.09159712493419647, + "eval_rewards/rejected": -0.21596547961235046, + "eval_runtime": 416.5485, + "eval_samples_per_second": 4.801, + "eval_steps_per_second": 1.2, + "step": 3800 + }, + { + "epoch": 1.99, + "learning_rate": 1.0437464027707179e-10, + "logits/chosen": 0.5776988863945007, + "logits/rejected": 0.6123504638671875, + "logps/chosen": -265.8362121582031, + "logps/rejected": -237.8904571533203, + "loss": 2055.609, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.1267283707857132, + "rewards/margins": 0.08532574027776718, + "rewards/rejected": -0.21205410361289978, + "step": 3810 + }, + { + "epoch": 2.0, + "learning_rate": 0.0, + "logits/chosen": 0.5118510127067566, + "logits/rejected": 0.5845987200737, + "logps/chosen": -274.1031188964844, + "logps/rejected": -256.99688720703125, + "loss": 2139.6068, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.13095693290233612, + "rewards/margins": 0.07750894129276276, + "rewards/rejected": -0.20846585929393768, + "step": 3820 + }, + { + "epoch": 2.0, + "step": 3820, + "total_flos": 0.0, + "train_loss": 2099.8451463309884, + "train_runtime": 42790.3459, + "train_samples_per_second": 1.429, + "train_steps_per_second": 0.089 + } + ], + "logging_steps": 10, + "max_steps": 3820, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}