diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11956 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998854993048172, + "eval_steps": 100, + "global_step": 7641, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 6.535947712418301e-09, + "logits/chosen": -2.7937374114990234, + "logits/rejected": -2.696331262588501, + "logps/chosen": -219.9345245361328, + "logps/rejected": -238.54010009765625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 6.535947712418302e-08, + "logits/chosen": -2.686509370803833, + "logits/rejected": -2.617267370223999, + "logps/chosen": -272.2289123535156, + "logps/rejected": -247.8722381591797, + "loss": 0.6929, + "rewards/accuracies": 0.2222222238779068, + "rewards/chosen": 0.00035665329778566957, + "rewards/margins": 0.0005560062127187848, + "rewards/rejected": -0.00019935290038120002, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 1.3071895424836603e-07, + "logits/chosen": -2.6644845008850098, + "logits/rejected": -2.6715245246887207, + "logps/chosen": -264.3231201171875, + "logps/rejected": -269.95965576171875, + "loss": 0.6934, + "rewards/accuracies": 0.4375, + "rewards/chosen": 2.1979305529384874e-05, + "rewards/margins": -0.0005006279679946601, + "rewards/rejected": 0.0005226072971709073, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 1.9607843137254904e-07, + "logits/chosen": -2.709402084350586, + "logits/rejected": -2.5456981658935547, + "logps/chosen": -235.95193481445312, + "logps/rejected": -211.9298095703125, + "loss": 0.6932, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 9.268704889109358e-05, + "rewards/margins": -0.00016139161016326398, + "rewards/rejected": 0.0002540787390898913, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 2.6143790849673207e-07, + "logits/chosen": -2.7728488445281982, + "logits/rejected": -2.588787078857422, + "logps/chosen": -265.30828857421875, + "logps/rejected": -244.3712615966797, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.00027978085563518107, + "rewards/margins": -3.424427268328145e-05, + "rewards/rejected": -0.0002455365320201963, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 3.267973856209151e-07, + "logits/chosen": -2.763617515563965, + "logits/rejected": -2.7047462463378906, + "logps/chosen": -226.91055297851562, + "logps/rejected": -226.02536010742188, + "loss": 0.6932, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -5.5276301281992346e-05, + "rewards/margins": -6.809332262491807e-05, + "rewards/rejected": 1.2817062270187307e-05, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 3.921568627450981e-07, + "logits/chosen": -2.7146997451782227, + "logits/rejected": -2.6671783924102783, + "logps/chosen": -267.3989562988281, + "logps/rejected": -241.76480102539062, + "loss": 0.6935, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.0004219438415020704, + "rewards/margins": -0.0006447834894061089, + "rewards/rejected": 0.00022283961880020797, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 4.5751633986928105e-07, + "logits/chosen": -2.7232730388641357, + "logits/rejected": -2.6747944355010986, + "logps/chosen": -245.4066619873047, + "logps/rejected": -204.679443359375, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.000368702836567536, + "rewards/margins": 0.0002510659396648407, + "rewards/rejected": 0.00011763688962673768, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 5.228758169934641e-07, + "logits/chosen": -2.68009614944458, + "logits/rejected": -2.608882427215576, + "logps/chosen": -289.00653076171875, + "logps/rejected": -281.58251953125, + "loss": 0.6927, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0005034831119701266, + "rewards/margins": 0.0008431966416537762, + "rewards/rejected": -0.00033971358789131045, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 5.882352941176471e-07, + "logits/chosen": -2.799229145050049, + "logits/rejected": -2.685251474380493, + "logps/chosen": -254.7516632080078, + "logps/rejected": -224.9294891357422, + "loss": 0.693, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.00010115172335645184, + "rewards/margins": 0.0002790264261420816, + "rewards/rejected": -0.000380178214982152, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 6.535947712418302e-07, + "logits/chosen": -2.6838011741638184, + "logits/rejected": -2.681504726409912, + "logps/chosen": -228.4124298095703, + "logps/rejected": -235.77685546875, + "loss": 0.6934, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.0002754587621893734, + "rewards/margins": -0.0004044932429678738, + "rewards/rejected": 0.00012903442257083952, + "step": 100 + }, + { + "epoch": 0.01, + "eval_logits/chosen": -2.526097059249878, + "eval_logits/rejected": -2.438264846801758, + "eval_logps/chosen": -268.4692077636719, + "eval_logps/rejected": -248.5731201171875, + "eval_loss": 0.6930866241455078, + "eval_rewards/accuracies": 0.5105000138282776, + "eval_rewards/chosen": 0.00020805322856176645, + "eval_rewards/margins": 0.00012786558363586664, + "eval_rewards/rejected": 8.018761582206935e-05, + "eval_runtime": 1145.3814, + "eval_samples_per_second": 1.746, + "eval_steps_per_second": 0.873, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 7.189542483660131e-07, + "logits/chosen": -2.694509506225586, + "logits/rejected": -2.6128382682800293, + "logps/chosen": -269.1925048828125, + "logps/rejected": -250.8614044189453, + "loss": 0.693, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0004071092698723078, + "rewards/margins": 0.00034786976175382733, + "rewards/rejected": 5.923947173869237e-05, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 7.843137254901962e-07, + "logits/chosen": -2.7306885719299316, + "logits/rejected": -2.6021299362182617, + "logps/chosen": -287.8159484863281, + "logps/rejected": -229.77554321289062, + "loss": 0.6927, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0005465588765218854, + "rewards/margins": 0.0008666679495945573, + "rewards/rejected": -0.0003201091312803328, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 8.496732026143792e-07, + "logits/chosen": -2.693432569503784, + "logits/rejected": -2.6983115673065186, + "logps/chosen": -251.46865844726562, + "logps/rejected": -224.0875244140625, + "loss": 0.6926, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0007204865687526762, + "rewards/margins": 0.001154972007498145, + "rewards/rejected": -0.0004344852641224861, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 9.150326797385621e-07, + "logits/chosen": -2.65818452835083, + "logits/rejected": -2.5886805057525635, + "logps/chosen": -259.5521545410156, + "logps/rejected": -231.7545928955078, + "loss": 0.6932, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.00015563381020911038, + "rewards/margins": -8.64746980369091e-05, + "rewards/rejected": 0.0002421085664536804, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": -2.792015314102173, + "logits/rejected": -2.6346828937530518, + "logps/chosen": -317.06451416015625, + "logps/rejected": -283.6150817871094, + "loss": 0.6927, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.000999806565232575, + "rewards/margins": 0.0009337253868579865, + "rewards/rejected": 6.60812875139527e-05, + "step": 150 + }, + { + "epoch": 0.02, + "learning_rate": 1.0457516339869283e-06, + "logits/chosen": -2.8084640502929688, + "logits/rejected": -2.663980007171631, + "logps/chosen": -247.31594848632812, + "logps/rejected": -224.99697875976562, + "loss": 0.6936, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0001547201827634126, + "rewards/margins": -0.0008144931052811444, + "rewards/rejected": 0.0006597728352062404, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 1.111111111111111e-06, + "logits/chosen": -2.684959650039673, + "logits/rejected": -2.617687940597534, + "logps/chosen": -228.97103881835938, + "logps/rejected": -213.688232421875, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0012499226722866297, + "rewards/margins": 0.0002835305640473962, + "rewards/rejected": 0.0009663921082392335, + "step": 170 + }, + { + "epoch": 0.02, + "learning_rate": 1.1764705882352942e-06, + "logits/chosen": -2.8306772708892822, + "logits/rejected": -2.568779468536377, + "logps/chosen": -329.3360900878906, + "logps/rejected": -258.120361328125, + "loss": 0.6928, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.0013785558985546231, + "rewards/margins": 0.0007068994455039501, + "rewards/rejected": 0.000671656453050673, + "step": 180 + }, + { + "epoch": 0.02, + "learning_rate": 1.2418300653594772e-06, + "logits/chosen": -2.6136114597320557, + "logits/rejected": -2.5621001720428467, + "logps/chosen": -249.232421875, + "logps/rejected": -210.09970092773438, + "loss": 0.6922, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.002042675856500864, + "rewards/margins": 0.0018692022422328591, + "rewards/rejected": 0.00017347374523524195, + "step": 190 + }, + { + "epoch": 0.03, + "learning_rate": 1.3071895424836604e-06, + "logits/chosen": -2.7648887634277344, + "logits/rejected": -2.6305103302001953, + "logps/chosen": -245.13864135742188, + "logps/rejected": -249.7960968017578, + "loss": 0.6924, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0012468935456126928, + "rewards/margins": 0.0015361621044576168, + "rewards/rejected": -0.0002892684715334326, + "step": 200 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -2.5247368812561035, + "eval_logits/rejected": -2.436842918395996, + "eval_logps/chosen": -268.3450927734375, + "eval_logps/rejected": -248.5510711669922, + "eval_loss": 0.6925778388977051, + "eval_rewards/accuracies": 0.5605000257492065, + "eval_rewards/chosen": 0.0014493772760033607, + "eval_rewards/margins": 0.001148571027442813, + "eval_rewards/rejected": 0.000300806452287361, + "eval_runtime": 1160.8989, + "eval_samples_per_second": 1.723, + "eval_steps_per_second": 0.861, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 1.3725490196078434e-06, + "logits/chosen": -2.7637641429901123, + "logits/rejected": -2.705857515335083, + "logps/chosen": -267.3808898925781, + "logps/rejected": -244.5528564453125, + "loss": 0.6921, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.00269283726811409, + "rewards/margins": 0.0021984183695167303, + "rewards/rejected": 0.0004944190150126815, + "step": 210 + }, + { + "epoch": 0.03, + "learning_rate": 1.4379084967320261e-06, + "logits/chosen": -2.667719602584839, + "logits/rejected": -2.578129291534424, + "logps/chosen": -267.173095703125, + "logps/rejected": -245.6712646484375, + "loss": 0.692, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0021157357841730118, + "rewards/margins": 0.0022508525289595127, + "rewards/rejected": -0.0001351167302345857, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 1.5032679738562091e-06, + "logits/chosen": -2.6373825073242188, + "logits/rejected": -2.600693941116333, + "logps/chosen": -218.6166534423828, + "logps/rejected": -262.50567626953125, + "loss": 0.6921, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0025270141195505857, + "rewards/margins": 0.0020475280471146107, + "rewards/rejected": 0.0004794862470589578, + "step": 230 + }, + { + "epoch": 0.03, + "learning_rate": 1.5686274509803923e-06, + "logits/chosen": -2.6723687648773193, + "logits/rejected": -2.6017861366271973, + "logps/chosen": -234.77505493164062, + "logps/rejected": -253.36123657226562, + "loss": 0.692, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.002189940307289362, + "rewards/margins": 0.0022566465195268393, + "rewards/rejected": -6.670653237961233e-05, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 1.6339869281045753e-06, + "logits/chosen": -2.7224369049072266, + "logits/rejected": -2.5702614784240723, + "logps/chosen": -301.28271484375, + "logps/rejected": -258.6485900878906, + "loss": 0.6917, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.003571895882487297, + "rewards/margins": 0.0029362873174250126, + "rewards/rejected": 0.0006356079829856753, + "step": 250 + }, + { + "epoch": 0.03, + "learning_rate": 1.6993464052287585e-06, + "logits/chosen": -2.75251841545105, + "logits/rejected": -2.6484122276306152, + "logps/chosen": -279.2475280761719, + "logps/rejected": -247.06002807617188, + "loss": 0.6915, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.004341104533523321, + "rewards/margins": 0.0033817340154200792, + "rewards/rejected": 0.0009593702852725983, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 1.7647058823529414e-06, + "logits/chosen": -2.6575112342834473, + "logits/rejected": -2.5978779792785645, + "logps/chosen": -235.12271118164062, + "logps/rejected": -213.41455078125, + "loss": 0.6907, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.004421571735292673, + "rewards/margins": 0.00482561532407999, + "rewards/rejected": -0.00040404353057965636, + "step": 270 + }, + { + "epoch": 0.04, + "learning_rate": 1.8300653594771242e-06, + "logits/chosen": -2.789543867111206, + "logits/rejected": -2.6499438285827637, + "logps/chosen": -304.09271240234375, + "logps/rejected": -272.1944274902344, + "loss": 0.6908, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.006846042815595865, + "rewards/margins": 0.004689323715865612, + "rewards/rejected": 0.0021567184012383223, + "step": 280 + }, + { + "epoch": 0.04, + "learning_rate": 1.8954248366013072e-06, + "logits/chosen": -2.6990113258361816, + "logits/rejected": -2.6313693523406982, + "logps/chosen": -274.0718688964844, + "logps/rejected": -256.3681335449219, + "loss": 0.6911, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.008100202307105064, + "rewards/margins": 0.0040775686502456665, + "rewards/rejected": 0.004022633656859398, + "step": 290 + }, + { + "epoch": 0.04, + "learning_rate": 1.96078431372549e-06, + "logits/chosen": -2.8378288745880127, + "logits/rejected": -2.6756560802459717, + "logps/chosen": -286.62457275390625, + "logps/rejected": -235.5787811279297, + "loss": 0.691, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.007958942092955112, + "rewards/margins": 0.004316599573940039, + "rewards/rejected": 0.00364234228618443, + "step": 300 + }, + { + "epoch": 0.04, + "eval_logits/chosen": -2.5252809524536133, + "eval_logits/rejected": -2.4377505779266357, + "eval_logps/chosen": -267.5838928222656, + "eval_logps/rejected": -248.17530822753906, + "eval_loss": 0.6906724572181702, + "eval_rewards/accuracies": 0.6439999938011169, + "eval_rewards/chosen": 0.009060990996658802, + "eval_rewards/margins": 0.005002738442271948, + "eval_rewards/rejected": 0.004058253485709429, + "eval_runtime": 1152.1824, + "eval_samples_per_second": 1.736, + "eval_steps_per_second": 0.868, + "step": 300 + }, + { + "epoch": 0.04, + "learning_rate": 2.0261437908496734e-06, + "logits/chosen": -2.657287120819092, + "logits/rejected": -2.624457836151123, + "logps/chosen": -276.63470458984375, + "logps/rejected": -271.01629638671875, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.008446435444056988, + "rewards/margins": 0.004816326312720776, + "rewards/rejected": 0.003630108432844281, + "step": 310 + }, + { + "epoch": 0.04, + "learning_rate": 2.0915032679738565e-06, + "logits/chosen": -2.745816707611084, + "logits/rejected": -2.5970892906188965, + "logps/chosen": -250.56851196289062, + "logps/rejected": -241.47689819335938, + "loss": 0.6909, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.009846082888543606, + "rewards/margins": 0.004527992103248835, + "rewards/rejected": 0.005318091716617346, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 2.1568627450980393e-06, + "logits/chosen": -2.8040456771850586, + "logits/rejected": -2.6397745609283447, + "logps/chosen": -284.6964416503906, + "logps/rejected": -238.743408203125, + "loss": 0.6906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01216509472578764, + "rewards/margins": 0.005259203724563122, + "rewards/rejected": 0.006905891001224518, + "step": 330 + }, + { + "epoch": 0.04, + "learning_rate": 2.222222222222222e-06, + "logits/chosen": -2.731226682662964, + "logits/rejected": -2.5713067054748535, + "logps/chosen": -256.9880676269531, + "logps/rejected": -214.75692749023438, + "loss": 0.6895, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.013420146889984608, + "rewards/margins": 0.007420173846185207, + "rewards/rejected": 0.005999973509460688, + "step": 340 + }, + { + "epoch": 0.05, + "learning_rate": 2.2875816993464053e-06, + "logits/chosen": -2.7683897018432617, + "logits/rejected": -2.619065761566162, + "logps/chosen": -305.935546875, + "logps/rejected": -253.32894897460938, + "loss": 0.688, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.018984589725732803, + "rewards/margins": 0.010493551380932331, + "rewards/rejected": 0.008491038344800472, + "step": 350 + }, + { + "epoch": 0.05, + "learning_rate": 2.3529411764705885e-06, + "logits/chosen": -2.7598016262054443, + "logits/rejected": -2.6604552268981934, + "logps/chosen": -247.54931640625, + "logps/rejected": -245.3474578857422, + "loss": 0.6894, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0182194784283638, + "rewards/margins": 0.0077258930541574955, + "rewards/rejected": 0.010493585839867592, + "step": 360 + }, + { + "epoch": 0.05, + "learning_rate": 2.4183006535947716e-06, + "logits/chosen": -2.6800496578216553, + "logits/rejected": -2.684868812561035, + "logps/chosen": -250.73703002929688, + "logps/rejected": -229.0603485107422, + "loss": 0.6881, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.021059587597846985, + "rewards/margins": 0.01036703772842884, + "rewards/rejected": 0.010692549869418144, + "step": 370 + }, + { + "epoch": 0.05, + "learning_rate": 2.4836601307189544e-06, + "logits/chosen": -2.7361929416656494, + "logits/rejected": -2.643367290496826, + "logps/chosen": -272.9651794433594, + "logps/rejected": -239.2753143310547, + "loss": 0.6851, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.025732994079589844, + "rewards/margins": 0.01641467772424221, + "rewards/rejected": 0.009318319149315357, + "step": 380 + }, + { + "epoch": 0.05, + "learning_rate": 2.549019607843137e-06, + "logits/chosen": -2.7081408500671387, + "logits/rejected": -2.5116286277770996, + "logps/chosen": -286.8047790527344, + "logps/rejected": -231.5174560546875, + "loss": 0.6823, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.03527464345097542, + "rewards/margins": 0.02219114825129509, + "rewards/rejected": 0.013083499856293201, + "step": 390 + }, + { + "epoch": 0.05, + "learning_rate": 2.6143790849673208e-06, + "logits/chosen": -2.7479846477508545, + "logits/rejected": -2.771252155303955, + "logps/chosen": -272.4174499511719, + "logps/rejected": -283.92108154296875, + "loss": 0.6876, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03528792783617973, + "rewards/margins": 0.011549949645996094, + "rewards/rejected": 0.02373797819018364, + "step": 400 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.522955894470215, + "eval_logits/rejected": -2.4351353645324707, + "eval_logps/chosen": -264.435302734375, + "eval_logps/rejected": -246.30886840820312, + "eval_loss": 0.6845206022262573, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": 0.04054699465632439, + "eval_rewards/margins": 0.017824340611696243, + "eval_rewards/rejected": 0.022722657769918442, + "eval_runtime": 1160.5462, + "eval_samples_per_second": 1.723, + "eval_steps_per_second": 0.862, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 2.6797385620915036e-06, + "logits/chosen": -2.6627039909362793, + "logits/rejected": -2.5389530658721924, + "logps/chosen": -244.06640625, + "logps/rejected": -203.2464599609375, + "loss": 0.6848, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.040567509829998016, + "rewards/margins": 0.01748758926987648, + "rewards/rejected": 0.023079920560121536, + "step": 410 + }, + { + "epoch": 0.05, + "learning_rate": 2.7450980392156867e-06, + "logits/chosen": -2.6784210205078125, + "logits/rejected": -2.6046371459960938, + "logps/chosen": -261.45135498046875, + "logps/rejected": -262.4620666503906, + "loss": 0.6811, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.049992240965366364, + "rewards/margins": 0.024862922728061676, + "rewards/rejected": 0.025129318237304688, + "step": 420 + }, + { + "epoch": 0.06, + "learning_rate": 2.8104575163398695e-06, + "logits/chosen": -2.731562852859497, + "logits/rejected": -2.607314109802246, + "logps/chosen": -264.93365478515625, + "logps/rejected": -252.49789428710938, + "loss": 0.6815, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04952297732234001, + "rewards/margins": 0.02442570962011814, + "rewards/rejected": 0.02509726583957672, + "step": 430 + }, + { + "epoch": 0.06, + "learning_rate": 2.8758169934640523e-06, + "logits/chosen": -2.7449309825897217, + "logits/rejected": -2.7042744159698486, + "logps/chosen": -255.37319946289062, + "logps/rejected": -230.885009765625, + "loss": 0.6847, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.05002352595329285, + "rewards/margins": 0.017672471702098846, + "rewards/rejected": 0.032351054251194, + "step": 440 + }, + { + "epoch": 0.06, + "learning_rate": 2.9411764705882355e-06, + "logits/chosen": -2.7228033542633057, + "logits/rejected": -2.7313358783721924, + "logps/chosen": -272.48297119140625, + "logps/rejected": -288.769287109375, + "loss": 0.684, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.05112846940755844, + "rewards/margins": 0.019452670589089394, + "rewards/rejected": 0.031675804406404495, + "step": 450 + }, + { + "epoch": 0.06, + "learning_rate": 3.0065359477124182e-06, + "logits/chosen": -2.6720528602600098, + "logits/rejected": -2.554985761642456, + "logps/chosen": -238.60134887695312, + "logps/rejected": -231.50741577148438, + "loss": 0.6795, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.06515996903181076, + "rewards/margins": 0.028682807460427284, + "rewards/rejected": 0.036477167159318924, + "step": 460 + }, + { + "epoch": 0.06, + "learning_rate": 3.071895424836602e-06, + "logits/chosen": -2.6872806549072266, + "logits/rejected": -2.650778293609619, + "logps/chosen": -258.9559020996094, + "logps/rejected": -241.73696899414062, + "loss": 0.6749, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0744255781173706, + "rewards/margins": 0.038194023072719574, + "rewards/rejected": 0.03623156249523163, + "step": 470 + }, + { + "epoch": 0.06, + "learning_rate": 3.1372549019607846e-06, + "logits/chosen": -2.6812539100646973, + "logits/rejected": -2.6287622451782227, + "logps/chosen": -267.0392761230469, + "logps/rejected": -225.1342010498047, + "loss": 0.6688, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.07361556589603424, + "rewards/margins": 0.051149915903806686, + "rewards/rejected": 0.022465649992227554, + "step": 480 + }, + { + "epoch": 0.06, + "learning_rate": 3.2026143790849674e-06, + "logits/chosen": -2.7315773963928223, + "logits/rejected": -2.561474561691284, + "logps/chosen": -257.4845886230469, + "logps/rejected": -213.83639526367188, + "loss": 0.6674, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.05385025218129158, + "rewards/margins": 0.05505413934588432, + "rewards/rejected": -0.0012038892600685358, + "step": 490 + }, + { + "epoch": 0.07, + "learning_rate": 3.2679738562091506e-06, + "logits/chosen": -2.6467950344085693, + "logits/rejected": -2.5683364868164062, + "logps/chosen": -237.36563110351562, + "logps/rejected": -221.3732452392578, + "loss": 0.6799, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.02083980292081833, + "rewards/margins": 0.029936185106635094, + "rewards/rejected": -0.009096382185816765, + "step": 500 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -2.4660205841064453, + "eval_logits/rejected": -2.375483989715576, + "eval_logps/chosen": -264.949462890625, + "eval_logps/rejected": -249.92762756347656, + "eval_loss": 0.6707118153572083, + "eval_rewards/accuracies": 0.6815000176429749, + "eval_rewards/chosen": 0.035405587404966354, + "eval_rewards/margins": 0.04887029901146889, + "eval_rewards/rejected": -0.013464723713696003, + "eval_runtime": 1161.9283, + "eval_samples_per_second": 1.721, + "eval_steps_per_second": 0.861, + "step": 500 + }, + { + "epoch": 0.07, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -2.557415008544922, + "logits/rejected": -2.587461471557617, + "logps/chosen": -248.0208740234375, + "logps/rejected": -248.16250610351562, + "loss": 0.676, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.02980933152139187, + "rewards/margins": 0.03878789395093918, + "rewards/rejected": -0.008978564292192459, + "step": 510 + }, + { + "epoch": 0.07, + "learning_rate": 3.398692810457517e-06, + "logits/chosen": -2.5861740112304688, + "logits/rejected": -2.4508297443389893, + "logps/chosen": -242.4407196044922, + "logps/rejected": -246.52182006835938, + "loss": 0.6693, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.01556556485593319, + "rewards/margins": 0.05277082324028015, + "rewards/rejected": -0.03720525652170181, + "step": 520 + }, + { + "epoch": 0.07, + "learning_rate": 3.4640522875816997e-06, + "logits/chosen": -2.627671957015991, + "logits/rejected": -2.5770435333251953, + "logps/chosen": -247.16738891601562, + "logps/rejected": -231.6279754638672, + "loss": 0.6649, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.014550316147506237, + "rewards/margins": 0.06174767017364502, + "rewards/rejected": -0.047197360545396805, + "step": 530 + }, + { + "epoch": 0.07, + "learning_rate": 3.529411764705883e-06, + "logits/chosen": -2.699380874633789, + "logits/rejected": -2.5035440921783447, + "logps/chosen": -272.3833923339844, + "logps/rejected": -255.84744262695312, + "loss": 0.6618, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.0020259805023670197, + "rewards/margins": 0.06897115707397461, + "rewards/rejected": -0.06694517284631729, + "step": 540 + }, + { + "epoch": 0.07, + "learning_rate": 3.5947712418300657e-06, + "logits/chosen": -2.6517231464385986, + "logits/rejected": -2.578498125076294, + "logps/chosen": -242.3655548095703, + "logps/rejected": -233.637939453125, + "loss": 0.6476, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.020458657294511795, + "rewards/margins": 0.10107719898223877, + "rewards/rejected": -0.08061854541301727, + "step": 550 + }, + { + "epoch": 0.07, + "learning_rate": 3.6601307189542484e-06, + "logits/chosen": -2.6894688606262207, + "logits/rejected": -2.506308078765869, + "logps/chosen": -277.38946533203125, + "logps/rejected": -297.1171569824219, + "loss": 0.6589, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.031552791595458984, + "rewards/margins": 0.07832719385623932, + "rewards/rejected": -0.10987997055053711, + "step": 560 + }, + { + "epoch": 0.07, + "learning_rate": 3.7254901960784316e-06, + "logits/chosen": -2.7173991203308105, + "logits/rejected": -2.6116385459899902, + "logps/chosen": -262.6662902832031, + "logps/rejected": -255.9048309326172, + "loss": 0.6584, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.06762684136629105, + "rewards/margins": 0.077293761074543, + "rewards/rejected": -0.14492060244083405, + "step": 570 + }, + { + "epoch": 0.08, + "learning_rate": 3.7908496732026144e-06, + "logits/chosen": -2.681260347366333, + "logits/rejected": -2.4548838138580322, + "logps/chosen": -302.3983154296875, + "logps/rejected": -283.51959228515625, + "loss": 0.6385, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.025161724537611008, + "rewards/margins": 0.125971257686615, + "rewards/rejected": -0.1511329710483551, + "step": 580 + }, + { + "epoch": 0.08, + "learning_rate": 3.856209150326798e-06, + "logits/chosen": -2.656480312347412, + "logits/rejected": -2.4974989891052246, + "logps/chosen": -286.38043212890625, + "logps/rejected": -257.6932678222656, + "loss": 0.6432, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05199650675058365, + "rewards/margins": 0.11849125474691391, + "rewards/rejected": -0.17048776149749756, + "step": 590 + }, + { + "epoch": 0.08, + "learning_rate": 3.92156862745098e-06, + "logits/chosen": -2.5832934379577637, + "logits/rejected": -2.4160356521606445, + "logps/chosen": -245.601806640625, + "logps/rejected": -255.2142333984375, + "loss": 0.6577, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1271064281463623, + "rewards/margins": 0.09216101467609406, + "rewards/rejected": -0.21926744282245636, + "step": 600 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -2.360114097595215, + "eval_logits/rejected": -2.2541239261627197, + "eval_logps/chosen": -280.78851318359375, + "eval_logps/rejected": -272.36041259765625, + "eval_loss": 0.6461644172668457, + "eval_rewards/accuracies": 0.675000011920929, + "eval_rewards/chosen": -0.12298478186130524, + "eval_rewards/margins": 0.11480776965618134, + "eval_rewards/rejected": -0.23779255151748657, + "eval_runtime": 1158.9282, + "eval_samples_per_second": 1.726, + "eval_steps_per_second": 0.863, + "step": 600 + }, + { + "epoch": 0.08, + "learning_rate": 3.986928104575164e-06, + "logits/chosen": -2.62365460395813, + "logits/rejected": -2.392054557800293, + "logps/chosen": -231.8965606689453, + "logps/rejected": -213.38131713867188, + "loss": 0.6325, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05727694183588028, + "rewards/margins": 0.13814638555049896, + "rewards/rejected": -0.19542333483695984, + "step": 610 + }, + { + "epoch": 0.08, + "learning_rate": 4.052287581699347e-06, + "logits/chosen": -2.64388108253479, + "logits/rejected": -2.4387760162353516, + "logps/chosen": -285.2777404785156, + "logps/rejected": -278.3298645019531, + "loss": 0.6246, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07192676514387131, + "rewards/margins": 0.16072218120098114, + "rewards/rejected": -0.23264892399311066, + "step": 620 + }, + { + "epoch": 0.08, + "learning_rate": 4.11764705882353e-06, + "logits/chosen": -2.4769465923309326, + "logits/rejected": -2.419826030731201, + "logps/chosen": -278.97564697265625, + "logps/rejected": -272.57269287109375, + "loss": 0.6501, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.16124705970287323, + "rewards/margins": 0.11454138904809952, + "rewards/rejected": -0.27578845620155334, + "step": 630 + }, + { + "epoch": 0.08, + "learning_rate": 4.183006535947713e-06, + "logits/chosen": -2.6546480655670166, + "logits/rejected": -2.4742162227630615, + "logps/chosen": -272.10870361328125, + "logps/rejected": -273.37518310546875, + "loss": 0.6526, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.15491154789924622, + "rewards/margins": 0.1191616877913475, + "rewards/rejected": -0.27407321333885193, + "step": 640 + }, + { + "epoch": 0.09, + "learning_rate": 4.2483660130718954e-06, + "logits/chosen": -2.5434234142303467, + "logits/rejected": -2.544323682785034, + "logps/chosen": -271.45587158203125, + "logps/rejected": -271.0767517089844, + "loss": 0.6503, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1807040274143219, + "rewards/margins": 0.1250898838043213, + "rewards/rejected": -0.3057938814163208, + "step": 650 + }, + { + "epoch": 0.09, + "learning_rate": 4.313725490196079e-06, + "logits/chosen": -2.596226215362549, + "logits/rejected": -2.439690113067627, + "logps/chosen": -318.5782775878906, + "logps/rejected": -278.7407531738281, + "loss": 0.6247, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.22762902081012726, + "rewards/margins": 0.16828063130378723, + "rewards/rejected": -0.3959096074104309, + "step": 660 + }, + { + "epoch": 0.09, + "learning_rate": 4.379084967320262e-06, + "logits/chosen": -2.6381888389587402, + "logits/rejected": -2.522258758544922, + "logps/chosen": -300.07244873046875, + "logps/rejected": -334.3465270996094, + "loss": 0.6205, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.18682761490345, + "rewards/margins": 0.18621641397476196, + "rewards/rejected": -0.3730439841747284, + "step": 670 + }, + { + "epoch": 0.09, + "learning_rate": 4.444444444444444e-06, + "logits/chosen": -2.663062572479248, + "logits/rejected": -2.5052947998046875, + "logps/chosen": -281.8200988769531, + "logps/rejected": -277.8845520019531, + "loss": 0.6352, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.13653087615966797, + "rewards/margins": 0.1511278599500656, + "rewards/rejected": -0.2876587510108948, + "step": 680 + }, + { + "epoch": 0.09, + "learning_rate": 4.509803921568628e-06, + "logits/chosen": -2.664280652999878, + "logits/rejected": -2.524028778076172, + "logps/chosen": -309.06658935546875, + "logps/rejected": -282.0691223144531, + "loss": 0.6352, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.15293274819850922, + "rewards/margins": 0.15575169026851654, + "rewards/rejected": -0.30868446826934814, + "step": 690 + }, + { + "epoch": 0.09, + "learning_rate": 4.5751633986928105e-06, + "logits/chosen": -2.6197104454040527, + "logits/rejected": -2.5259227752685547, + "logps/chosen": -279.31195068359375, + "logps/rejected": -282.20111083984375, + "loss": 0.6365, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11969868838787079, + "rewards/margins": 0.15021947026252747, + "rewards/rejected": -0.26991817355155945, + "step": 700 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -2.31355619430542, + "eval_logits/rejected": -2.2012522220611572, + "eval_logps/chosen": -277.0453186035156, + "eval_logps/rejected": -272.20367431640625, + "eval_loss": 0.6344681978225708, + "eval_rewards/accuracies": 0.6859999895095825, + "eval_rewards/chosen": -0.08555291593074799, + "eval_rewards/margins": 0.15067268908023834, + "eval_rewards/rejected": -0.23622561991214752, + "eval_runtime": 1159.7861, + "eval_samples_per_second": 1.724, + "eval_steps_per_second": 0.862, + "step": 700 + }, + { + "epoch": 0.09, + "learning_rate": 4.640522875816994e-06, + "logits/chosen": -2.5975005626678467, + "logits/rejected": -2.508746862411499, + "logps/chosen": -281.31512451171875, + "logps/rejected": -281.1041259765625, + "loss": 0.6322, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.062341589480638504, + "rewards/margins": 0.1610972285270691, + "rewards/rejected": -0.2234388291835785, + "step": 710 + }, + { + "epoch": 0.09, + "learning_rate": 4.705882352941177e-06, + "logits/chosen": -2.6123766899108887, + "logits/rejected": -2.5293030738830566, + "logps/chosen": -323.938720703125, + "logps/rejected": -315.38092041015625, + "loss": 0.624, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.09480178356170654, + "rewards/margins": 0.18506909906864166, + "rewards/rejected": -0.2798708975315094, + "step": 720 + }, + { + "epoch": 0.1, + "learning_rate": 4.77124183006536e-06, + "logits/chosen": -2.605785369873047, + "logits/rejected": -2.518705129623413, + "logps/chosen": -289.82965087890625, + "logps/rejected": -295.834228515625, + "loss": 0.6195, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.12233264744281769, + "rewards/margins": 0.18506425619125366, + "rewards/rejected": -0.30739688873291016, + "step": 730 + }, + { + "epoch": 0.1, + "learning_rate": 4.836601307189543e-06, + "logits/chosen": -2.632423162460327, + "logits/rejected": -2.4382903575897217, + "logps/chosen": -304.190185546875, + "logps/rejected": -279.628662109375, + "loss": 0.6155, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10397078841924667, + "rewards/margins": 0.20101885497570038, + "rewards/rejected": -0.30498963594436646, + "step": 740 + }, + { + "epoch": 0.1, + "learning_rate": 4.901960784313726e-06, + "logits/chosen": -2.450904369354248, + "logits/rejected": -2.3369948863983154, + "logps/chosen": -257.42974853515625, + "logps/rejected": -265.8070373535156, + "loss": 0.6048, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1934954673051834, + "rewards/margins": 0.22951745986938477, + "rewards/rejected": -0.423012912273407, + "step": 750 + }, + { + "epoch": 0.1, + "learning_rate": 4.967320261437909e-06, + "logits/chosen": -2.5839266777038574, + "logits/rejected": -2.357056140899658, + "logps/chosen": -303.64654541015625, + "logps/rejected": -279.87774658203125, + "loss": 0.6236, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.32853788137435913, + "rewards/margins": 0.19128181040287018, + "rewards/rejected": -0.5198196172714233, + "step": 760 + }, + { + "epoch": 0.1, + "learning_rate": 4.999993476542427e-06, + "logits/chosen": -2.515817165374756, + "logits/rejected": -2.486161708831787, + "logps/chosen": -327.0534973144531, + "logps/rejected": -329.8384704589844, + "loss": 0.6198, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4592045247554779, + "rewards/margins": 0.23688983917236328, + "rewards/rejected": -0.6960943937301636, + "step": 770 + }, + { + "epoch": 0.1, + "learning_rate": 4.999941289086112e-06, + "logits/chosen": -2.5871434211730957, + "logits/rejected": -2.2986748218536377, + "logps/chosen": -352.54925537109375, + "logps/rejected": -345.39727783203125, + "loss": 0.602, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.615364134311676, + "rewards/margins": 0.27003955841064453, + "rewards/rejected": -0.8854037523269653, + "step": 780 + }, + { + "epoch": 0.1, + "learning_rate": 4.999836915262896e-06, + "logits/chosen": -2.4022507667541504, + "logits/rejected": -2.423859119415283, + "logps/chosen": -341.8403625488281, + "logps/rejected": -363.97869873046875, + "loss": 0.6102, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6583345532417297, + "rewards/margins": 0.2388831377029419, + "rewards/rejected": -0.8972176313400269, + "step": 790 + }, + { + "epoch": 0.1, + "learning_rate": 4.999680357251587e-06, + "logits/chosen": -2.2201342582702637, + "logits/rejected": -2.259474277496338, + "logps/chosen": -311.77374267578125, + "logps/rejected": -336.84295654296875, + "loss": 0.6519, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.613008975982666, + "rewards/margins": 0.15706071257591248, + "rewards/rejected": -0.7700697183609009, + "step": 800 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.183530330657959, + "eval_logits/rejected": -2.04819917678833, + "eval_logps/chosen": -317.9223327636719, + "eval_logps/rejected": -320.8871765136719, + "eval_loss": 0.6239581108093262, + "eval_rewards/accuracies": 0.6629999876022339, + "eval_rewards/chosen": -0.49432334303855896, + "eval_rewards/margins": 0.22873705625534058, + "eval_rewards/rejected": -0.7230603098869324, + "eval_runtime": 1149.5976, + "eval_samples_per_second": 1.74, + "eval_steps_per_second": 0.87, + "step": 800 + }, + { + "epoch": 0.11, + "learning_rate": 4.999471618320339e-06, + "logits/chosen": -2.5523288249969482, + "logits/rejected": -2.320265054702759, + "logps/chosen": -328.2205810546875, + "logps/rejected": -322.53076171875, + "loss": 0.6069, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4844594895839691, + "rewards/margins": 0.2521725296974182, + "rewards/rejected": -0.7366319894790649, + "step": 810 + }, + { + "epoch": 0.11, + "learning_rate": 4.999210702826586e-06, + "logits/chosen": -2.6358237266540527, + "logits/rejected": -2.4474587440490723, + "logps/chosen": -346.6995849609375, + "logps/rejected": -324.92242431640625, + "loss": 0.6221, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4281977117061615, + "rewards/margins": 0.23045018315315247, + "rewards/rejected": -0.658647894859314, + "step": 820 + }, + { + "epoch": 0.11, + "learning_rate": 4.998897616216947e-06, + "logits/chosen": -2.4511475563049316, + "logits/rejected": -2.5670619010925293, + "logps/chosen": -258.81573486328125, + "logps/rejected": -311.01043701171875, + "loss": 0.6131, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3054274618625641, + "rewards/margins": 0.24654905498027802, + "rewards/rejected": -0.5519765019416809, + "step": 830 + }, + { + "epoch": 0.11, + "learning_rate": 4.998532365027117e-06, + "logits/chosen": -2.4918437004089355, + "logits/rejected": -2.2926580905914307, + "logps/chosen": -311.8231201171875, + "logps/rejected": -280.79620361328125, + "loss": 0.5902, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2812344431877136, + "rewards/margins": 0.2974868416786194, + "rewards/rejected": -0.5787213444709778, + "step": 840 + }, + { + "epoch": 0.11, + "learning_rate": 4.9981149568817275e-06, + "logits/chosen": -2.498668909072876, + "logits/rejected": -2.4396538734436035, + "logps/chosen": -313.8962707519531, + "logps/rejected": -356.46392822265625, + "loss": 0.6126, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4118213653564453, + "rewards/margins": 0.26523926854133606, + "rewards/rejected": -0.677060604095459, + "step": 850 + }, + { + "epoch": 0.11, + "learning_rate": 4.997645400494192e-06, + "logits/chosen": -2.544689178466797, + "logits/rejected": -2.427544116973877, + "logps/chosen": -297.4951477050781, + "logps/rejected": -325.6196594238281, + "loss": 0.6211, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5930116176605225, + "rewards/margins": 0.29512810707092285, + "rewards/rejected": -0.8881398439407349, + "step": 860 + }, + { + "epoch": 0.11, + "learning_rate": 4.997123705666514e-06, + "logits/chosen": -2.5834832191467285, + "logits/rejected": -2.3967490196228027, + "logps/chosen": -340.9512939453125, + "logps/rejected": -351.071044921875, + "loss": 0.6576, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6529733538627625, + "rewards/margins": 0.1915055215358734, + "rewards/rejected": -0.8444789052009583, + "step": 870 + }, + { + "epoch": 0.12, + "learning_rate": 4.996549883289093e-06, + "logits/chosen": -2.5025086402893066, + "logits/rejected": -2.4052650928497314, + "logps/chosen": -308.8294677734375, + "logps/rejected": -350.6068115234375, + "loss": 0.6265, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.593645453453064, + "rewards/margins": 0.27240580320358276, + "rewards/rejected": -0.866051197052002, + "step": 880 + }, + { + "epoch": 0.12, + "learning_rate": 4.995923945340495e-06, + "logits/chosen": -2.5428309440612793, + "logits/rejected": -2.4630656242370605, + "logps/chosen": -305.7501525878906, + "logps/rejected": -334.24639892578125, + "loss": 0.6473, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.537492036819458, + "rewards/margins": 0.21050593256950378, + "rewards/rejected": -0.7479979395866394, + "step": 890 + }, + { + "epoch": 0.12, + "learning_rate": 4.995245904887195e-06, + "logits/chosen": -2.57853364944458, + "logits/rejected": -2.3580322265625, + "logps/chosen": -310.814208984375, + "logps/rejected": -298.79962158203125, + "loss": 0.6547, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7046155333518982, + "rewards/margins": 0.197780042886734, + "rewards/rejected": -0.9023955464363098, + "step": 900 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -2.2183713912963867, + "eval_logits/rejected": -2.0783114433288574, + "eval_logps/chosen": -325.8177490234375, + "eval_logps/rejected": -331.4542236328125, + "eval_loss": 0.6203334927558899, + "eval_rewards/accuracies": 0.6694999933242798, + "eval_rewards/chosen": -0.5732770562171936, + "eval_rewards/margins": 0.25545385479927063, + "eval_rewards/rejected": -0.8287308216094971, + "eval_runtime": 1154.0508, + "eval_samples_per_second": 1.733, + "eval_steps_per_second": 0.867, + "step": 900 + }, + { + "epoch": 0.12, + "learning_rate": 4.994515776083313e-06, + "logits/chosen": -2.4050710201263428, + "logits/rejected": -2.4928653240203857, + "logps/chosen": -315.0536193847656, + "logps/rejected": -373.6515197753906, + "loss": 0.5972, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.47445574402809143, + "rewards/margins": 0.32963109016418457, + "rewards/rejected": -0.8040868639945984, + "step": 910 + }, + { + "epoch": 0.12, + "learning_rate": 4.993733574170316e-06, + "logits/chosen": -2.5724964141845703, + "logits/rejected": -2.402193546295166, + "logps/chosen": -261.9773864746094, + "logps/rejected": -292.7881774902344, + "loss": 0.5911, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3413178622722626, + "rewards/margins": 0.32790833711624146, + "rewards/rejected": -0.6692262291908264, + "step": 920 + }, + { + "epoch": 0.12, + "learning_rate": 4.992899315476696e-06, + "logits/chosen": -2.6308352947235107, + "logits/rejected": -2.504096508026123, + "logps/chosen": -331.2294616699219, + "logps/rejected": -336.93280029296875, + "loss": 0.5964, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.303416907787323, + "rewards/margins": 0.3124091327190399, + "rewards/rejected": -0.6158260703086853, + "step": 930 + }, + { + "epoch": 0.12, + "learning_rate": 4.9920130174176354e-06, + "logits/chosen": -2.5981762409210205, + "logits/rejected": -2.4487905502319336, + "logps/chosen": -319.9151916503906, + "logps/rejected": -333.0742492675781, + "loss": 0.5814, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42610105872154236, + "rewards/margins": 0.30205437541007996, + "rewards/rejected": -0.7281554341316223, + "step": 940 + }, + { + "epoch": 0.12, + "learning_rate": 4.991074698494638e-06, + "logits/chosen": -2.6599678993225098, + "logits/rejected": -2.3881497383117676, + "logps/chosen": -321.3490905761719, + "logps/rejected": -312.3178405761719, + "loss": 0.6153, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4955690801143646, + "rewards/margins": 0.26488804817199707, + "rewards/rejected": -0.7604571580886841, + "step": 950 + }, + { + "epoch": 0.13, + "learning_rate": 4.990084378295148e-06, + "logits/chosen": -2.6287171840667725, + "logits/rejected": -2.499150514602661, + "logps/chosen": -286.4674987792969, + "logps/rejected": -286.6602478027344, + "loss": 0.6153, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.46585598587989807, + "rewards/margins": 0.2714241147041321, + "rewards/rejected": -0.737280011177063, + "step": 960 + }, + { + "epoch": 0.13, + "learning_rate": 4.989042077492135e-06, + "logits/chosen": -2.6049046516418457, + "logits/rejected": -2.512279987335205, + "logps/chosen": -312.2615051269531, + "logps/rejected": -334.3189392089844, + "loss": 0.548, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.38576096296310425, + "rewards/margins": 0.3811526596546173, + "rewards/rejected": -0.7669135928153992, + "step": 970 + }, + { + "epoch": 0.13, + "learning_rate": 4.987947817843665e-06, + "logits/chosen": -2.4653379917144775, + "logits/rejected": -2.4543960094451904, + "logps/chosen": -293.2818908691406, + "logps/rejected": -294.23089599609375, + "loss": 0.6096, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.46745139360427856, + "rewards/margins": 0.312784880399704, + "rewards/rejected": -0.7802362442016602, + "step": 980 + }, + { + "epoch": 0.13, + "learning_rate": 4.986801622192453e-06, + "logits/chosen": -2.585569381713867, + "logits/rejected": -2.398247241973877, + "logps/chosen": -264.38580322265625, + "logps/rejected": -286.49078369140625, + "loss": 0.5645, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.41495591402053833, + "rewards/margins": 0.42753225564956665, + "rewards/rejected": -0.842488169670105, + "step": 990 + }, + { + "epoch": 0.13, + "learning_rate": 4.985603514465372e-06, + "logits/chosen": -2.496156692504883, + "logits/rejected": -2.562666416168213, + "logps/chosen": -313.02593994140625, + "logps/rejected": -339.96746826171875, + "loss": 0.5841, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.47666803002357483, + "rewards/margins": 0.3657141327857971, + "rewards/rejected": -0.8423821330070496, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_logits/chosen": -2.2086312770843506, + "eval_logits/rejected": -2.068852186203003, + "eval_logps/chosen": -322.0998229980469, + "eval_logps/rejected": -334.5815734863281, + "eval_loss": 0.6070671677589417, + "eval_rewards/accuracies": 0.6819999814033508, + "eval_rewards/chosen": -0.5360978245735168, + "eval_rewards/margins": 0.32390668988227844, + "eval_rewards/rejected": -0.8600045442581177, + "eval_runtime": 1162.8161, + "eval_samples_per_second": 1.72, + "eval_steps_per_second": 0.86, + "step": 1000 + }, + { + "epoch": 0.13, + "learning_rate": 4.984353519672966e-06, + "logits/chosen": -2.5102970600128174, + "logits/rejected": -2.296638011932373, + "logps/chosen": -322.3448486328125, + "logps/rejected": -314.18927001953125, + "loss": 0.652, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6231977343559265, + "rewards/margins": 0.2020292580127716, + "rewards/rejected": -0.8252270817756653, + "step": 1010 + }, + { + "epoch": 0.13, + "learning_rate": 4.9830516639089226e-06, + "logits/chosen": -2.5087199211120605, + "logits/rejected": -2.2799477577209473, + "logps/chosen": -359.6461486816406, + "logps/rejected": -333.55712890625, + "loss": 0.5308, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.509143590927124, + "rewards/margins": 0.46995463967323303, + "rewards/rejected": -0.9790982007980347, + "step": 1020 + }, + { + "epoch": 0.13, + "learning_rate": 4.9816979743495296e-06, + "logits/chosen": -2.437312602996826, + "logits/rejected": -2.312748670578003, + "logps/chosen": -381.3270263671875, + "logps/rejected": -398.35748291015625, + "loss": 0.5985, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8338835835456848, + "rewards/margins": 0.41778063774108887, + "rewards/rejected": -1.251664400100708, + "step": 1030 + }, + { + "epoch": 0.14, + "learning_rate": 4.980292479253105e-06, + "logits/chosen": -2.523524761199951, + "logits/rejected": -2.1683216094970703, + "logps/chosen": -413.48956298828125, + "logps/rejected": -424.48797607421875, + "loss": 0.5374, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.087397575378418, + "rewards/margins": 0.5586522817611694, + "rewards/rejected": -1.6460498571395874, + "step": 1040 + }, + { + "epoch": 0.14, + "learning_rate": 4.978835207959414e-06, + "logits/chosen": -2.291563034057617, + "logits/rejected": -2.2155869007110596, + "logps/chosen": -365.6241760253906, + "logps/rejected": -386.0607604980469, + "loss": 0.5994, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0963432788848877, + "rewards/margins": 0.34998494386672974, + "rewards/rejected": -1.4463282823562622, + "step": 1050 + }, + { + "epoch": 0.14, + "learning_rate": 4.977326190889046e-06, + "logits/chosen": -2.4078869819641113, + "logits/rejected": -1.7181726694107056, + "logps/chosen": -359.93756103515625, + "logps/rejected": -351.11859130859375, + "loss": 0.5669, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9058128595352173, + "rewards/margins": 0.4577213227748871, + "rewards/rejected": -1.3635342121124268, + "step": 1060 + }, + { + "epoch": 0.14, + "learning_rate": 4.975765459542788e-06, + "logits/chosen": -2.2240052223205566, + "logits/rejected": -2.059451103210449, + "logps/chosen": -315.9155578613281, + "logps/rejected": -355.5037841796875, + "loss": 0.528, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5928197503089905, + "rewards/margins": 0.5360963344573975, + "rewards/rejected": -1.1289160251617432, + "step": 1070 + }, + { + "epoch": 0.14, + "learning_rate": 4.9741530465009665e-06, + "logits/chosen": -2.186079502105713, + "logits/rejected": -1.9260094165802002, + "logps/chosen": -312.0130920410156, + "logps/rejected": -334.43389892578125, + "loss": 0.5806, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6744036078453064, + "rewards/margins": 0.40451207756996155, + "rewards/rejected": -1.0789155960083008, + "step": 1080 + }, + { + "epoch": 0.14, + "learning_rate": 4.972488985422763e-06, + "logits/chosen": -2.0618720054626465, + "logits/rejected": -1.9399276971817017, + "logps/chosen": -329.8536376953125, + "logps/rejected": -354.6712341308594, + "loss": 0.5418, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8148940205574036, + "rewards/margins": 0.6003459692001343, + "rewards/rejected": -1.4152400493621826, + "step": 1090 + }, + { + "epoch": 0.14, + "learning_rate": 4.970773311045514e-06, + "logits/chosen": -2.120664358139038, + "logits/rejected": -1.6714054346084595, + "logps/chosen": -371.4166564941406, + "logps/rejected": -393.62786865234375, + "loss": 0.5877, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0566858053207397, + "rewards/margins": 0.43407002091407776, + "rewards/rejected": -1.4907559156417847, + "step": 1100 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -1.3836040496826172, + "eval_logits/rejected": -1.1052757501602173, + "eval_logps/chosen": -383.43798828125, + "eval_logps/rejected": -410.8677673339844, + "eval_loss": 0.5946979522705078, + "eval_rewards/accuracies": 0.6855000257492065, + "eval_rewards/chosen": -1.149479866027832, + "eval_rewards/margins": 0.4733865261077881, + "eval_rewards/rejected": -1.6228665113449097, + "eval_runtime": 1178.8626, + "eval_samples_per_second": 1.697, + "eval_steps_per_second": 0.848, + "step": 1100 + }, + { + "epoch": 0.15, + "learning_rate": 4.969006059183984e-06, + "logits/chosen": -2.1172611713409424, + "logits/rejected": -1.6654752492904663, + "logps/chosen": -387.62664794921875, + "logps/rejected": -408.0655212402344, + "loss": 0.6262, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2082122564315796, + "rewards/margins": 0.43365636467933655, + "rewards/rejected": -1.6418688297271729, + "step": 1110 + }, + { + "epoch": 0.15, + "learning_rate": 4.967187266729623e-06, + "logits/chosen": -2.193946123123169, + "logits/rejected": -1.850895643234253, + "logps/chosen": -398.3614501953125, + "logps/rejected": -430.51373291015625, + "loss": 0.5971, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2287832498550415, + "rewards/margins": 0.43036943674087524, + "rewards/rejected": -1.6591527462005615, + "step": 1120 + }, + { + "epoch": 0.15, + "learning_rate": 4.965316971649791e-06, + "logits/chosen": -2.100188732147217, + "logits/rejected": -1.686700463294983, + "logps/chosen": -396.20623779296875, + "logps/rejected": -422.5747985839844, + "loss": 0.4954, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.049180269241333, + "rewards/margins": 0.7083006501197815, + "rewards/rejected": -1.7574809789657593, + "step": 1130 + }, + { + "epoch": 0.15, + "learning_rate": 4.963395212986964e-06, + "logits/chosen": -2.071812152862549, + "logits/rejected": -1.535829782485962, + "logps/chosen": -342.7199401855469, + "logps/rejected": -373.0933837890625, + "loss": 0.526, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0833193063735962, + "rewards/margins": 0.5862609148025513, + "rewards/rejected": -1.6695802211761475, + "step": 1140 + }, + { + "epoch": 0.15, + "learning_rate": 4.9614220308579285e-06, + "logits/chosen": -1.8345826864242554, + "logits/rejected": -2.1796462535858154, + "logps/chosen": -351.6676330566406, + "logps/rejected": -381.9559326171875, + "loss": 0.5963, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7989309430122375, + "rewards/margins": 0.41886311769485474, + "rewards/rejected": -1.2177939414978027, + "step": 1150 + }, + { + "epoch": 0.15, + "learning_rate": 4.9593974664529325e-06, + "logits/chosen": -2.1840920448303223, + "logits/rejected": -1.7762985229492188, + "logps/chosen": -331.3350830078125, + "logps/rejected": -385.296630859375, + "loss": 0.5442, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6507974863052368, + "rewards/margins": 0.5434342622756958, + "rewards/rejected": -1.1942317485809326, + "step": 1160 + }, + { + "epoch": 0.15, + "learning_rate": 4.957321562034833e-06, + "logits/chosen": -2.0527548789978027, + "logits/rejected": -1.8421388864517212, + "logps/chosen": -366.1250305175781, + "logps/rejected": -394.0170593261719, + "loss": 0.559, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8194394111633301, + "rewards/margins": 0.5880136489868164, + "rewards/rejected": -1.4074528217315674, + "step": 1170 + }, + { + "epoch": 0.15, + "learning_rate": 4.955194360938214e-06, + "logits/chosen": -1.9533300399780273, + "logits/rejected": -1.7631326913833618, + "logps/chosen": -371.4923400878906, + "logps/rejected": -387.41253662109375, + "loss": 0.5852, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1551687717437744, + "rewards/margins": 0.4826745092868805, + "rewards/rejected": -1.637843370437622, + "step": 1180 + }, + { + "epoch": 0.16, + "learning_rate": 4.9530159075684735e-06, + "logits/chosen": -1.4516584873199463, + "logits/rejected": -1.5901073217391968, + "logps/chosen": -388.7632141113281, + "logps/rejected": -484.33258056640625, + "loss": 0.6893, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5982259511947632, + "rewards/margins": 0.268177330493927, + "rewards/rejected": -1.8664032220840454, + "step": 1190 + }, + { + "epoch": 0.16, + "learning_rate": 4.950786247400908e-06, + "logits/chosen": -1.6140592098236084, + "logits/rejected": -1.6555970907211304, + "logps/chosen": -390.38330078125, + "logps/rejected": -435.25604248046875, + "loss": 0.5552, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4886068105697632, + "rewards/margins": 0.4878392219543457, + "rewards/rejected": -1.9764461517333984, + "step": 1200 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -0.7372294068336487, + "eval_logits/rejected": -0.36139780282974243, + "eval_logps/chosen": -411.0458679199219, + "eval_logps/rejected": -437.91998291015625, + "eval_loss": 0.5908603072166443, + "eval_rewards/accuracies": 0.6880000233650208, + "eval_rewards/chosen": -1.4255588054656982, + "eval_rewards/margins": 0.4678295850753784, + "eval_rewards/rejected": -1.8933883905410767, + "eval_runtime": 1176.6814, + "eval_samples_per_second": 1.7, + "eval_steps_per_second": 0.85, + "step": 1200 + }, + { + "epoch": 0.16, + "learning_rate": 4.948505426979756e-06, + "logits/chosen": -1.8322169780731201, + "logits/rejected": -1.5653588771820068, + "logps/chosen": -406.3680114746094, + "logps/rejected": -444.8448791503906, + "loss": 0.5533, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4452760219573975, + "rewards/margins": 0.5475454926490784, + "rewards/rejected": -1.992821455001831, + "step": 1210 + }, + { + "epoch": 0.16, + "learning_rate": 4.946173493917228e-06, + "logits/chosen": -1.7810437679290771, + "logits/rejected": -1.2556498050689697, + "logps/chosen": -417.5589294433594, + "logps/rejected": -397.54437255859375, + "loss": 0.733, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.5235958099365234, + "rewards/margins": 0.1696973741054535, + "rewards/rejected": -1.6932932138442993, + "step": 1220 + }, + { + "epoch": 0.16, + "learning_rate": 4.943790496892513e-06, + "logits/chosen": -1.9522157907485962, + "logits/rejected": -1.3978416919708252, + "logps/chosen": -345.17913818359375, + "logps/rejected": -371.7323303222656, + "loss": 0.5339, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8955880999565125, + "rewards/margins": 0.6181024312973022, + "rewards/rejected": -1.5136905908584595, + "step": 1230 + }, + { + "epoch": 0.16, + "learning_rate": 4.941356485650762e-06, + "logits/chosen": -2.062680721282959, + "logits/rejected": -1.6688048839569092, + "logps/chosen": -390.939208984375, + "logps/rejected": -419.36334228515625, + "loss": 0.5703, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8187043070793152, + "rewards/margins": 0.4967420995235443, + "rewards/rejected": -1.3154462575912476, + "step": 1240 + }, + { + "epoch": 0.16, + "learning_rate": 4.93887151100205e-06, + "logits/chosen": -2.130277156829834, + "logits/rejected": -1.685664176940918, + "logps/chosen": -385.4061279296875, + "logps/rejected": -389.6874084472656, + "loss": 0.6048, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6924458146095276, + "rewards/margins": 0.38014930486679077, + "rewards/rejected": -1.0725951194763184, + "step": 1250 + }, + { + "epoch": 0.16, + "learning_rate": 4.936335624820313e-06, + "logits/chosen": -2.035191774368286, + "logits/rejected": -1.712632417678833, + "logps/chosen": -348.1141052246094, + "logps/rejected": -357.98541259765625, + "loss": 0.5586, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8647063970565796, + "rewards/margins": 0.4742640554904938, + "rewards/rejected": -1.3389705419540405, + "step": 1260 + }, + { + "epoch": 0.17, + "learning_rate": 4.933748880042271e-06, + "logits/chosen": -2.0088629722595215, + "logits/rejected": -1.5719774961471558, + "logps/chosen": -360.0817565917969, + "logps/rejected": -390.73211669921875, + "loss": 0.5454, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9902151226997375, + "rewards/margins": 0.534164309501648, + "rewards/rejected": -1.5243794918060303, + "step": 1270 + }, + { + "epoch": 0.17, + "learning_rate": 4.931111330666317e-06, + "logits/chosen": -2.0022754669189453, + "logits/rejected": -1.2308666706085205, + "logps/chosen": -358.9193420410156, + "logps/rejected": -354.67352294921875, + "loss": 0.6109, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0700308084487915, + "rewards/margins": 0.3465597927570343, + "rewards/rejected": -1.4165904521942139, + "step": 1280 + }, + { + "epoch": 0.17, + "learning_rate": 4.9284230317513906e-06, + "logits/chosen": -1.9100980758666992, + "logits/rejected": -1.4506380558013916, + "logps/chosen": -428.6927795410156, + "logps/rejected": -433.57489013671875, + "loss": 0.5906, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2611325979232788, + "rewards/margins": 0.5384291410446167, + "rewards/rejected": -1.7995617389678955, + "step": 1290 + }, + { + "epoch": 0.17, + "learning_rate": 4.9256840394158325e-06, + "logits/chosen": -1.5153193473815918, + "logits/rejected": -1.5525444746017456, + "logps/chosen": -408.9667053222656, + "logps/rejected": -496.4200134277344, + "loss": 0.5492, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3559167385101318, + "rewards/margins": 0.5765284895896912, + "rewards/rejected": -1.9324451684951782, + "step": 1300 + }, + { + "epoch": 0.17, + "eval_logits/chosen": -0.5949398279190063, + "eval_logits/rejected": -0.19331219792366028, + "eval_logps/chosen": -414.6322937011719, + "eval_logps/rejected": -446.29095458984375, + "eval_loss": 0.5791042447090149, + "eval_rewards/accuracies": 0.6934999823570251, + "eval_rewards/chosen": -1.461422085762024, + "eval_rewards/margins": 0.5156759023666382, + "eval_rewards/rejected": -1.977097988128662, + "eval_runtime": 1198.5552, + "eval_samples_per_second": 1.669, + "eval_steps_per_second": 0.834, + "step": 1300 + }, + { + "epoch": 0.17, + "learning_rate": 4.922894410836207e-06, + "logits/chosen": -1.9784332513809204, + "logits/rejected": -1.0070569515228271, + "logps/chosen": -438.13262939453125, + "logps/rejected": -431.6548767089844, + "loss": 0.5808, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5133016109466553, + "rewards/margins": 0.5359851121902466, + "rewards/rejected": -2.0492866039276123, + "step": 1310 + }, + { + "epoch": 0.17, + "learning_rate": 4.920054204246116e-06, + "logits/chosen": -1.8840601444244385, + "logits/rejected": -1.0693460702896118, + "logps/chosen": -434.94189453125, + "logps/rejected": -427.7396545410156, + "loss": 0.6386, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5475298166275024, + "rewards/margins": 0.38071927428245544, + "rewards/rejected": -1.9282491207122803, + "step": 1320 + }, + { + "epoch": 0.17, + "learning_rate": 4.9171634789349744e-06, + "logits/chosen": -1.7856547832489014, + "logits/rejected": -1.456695795059204, + "logps/chosen": -393.8877868652344, + "logps/rejected": -460.5874938964844, + "loss": 0.5115, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.295278787612915, + "rewards/margins": 0.6688205003738403, + "rewards/rejected": -1.9640991687774658, + "step": 1330 + }, + { + "epoch": 0.18, + "learning_rate": 4.914222295246782e-06, + "logits/chosen": -1.7937772274017334, + "logits/rejected": -1.691471815109253, + "logps/chosen": -362.4700927734375, + "logps/rejected": -393.37750244140625, + "loss": 0.6389, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0026386976242065, + "rewards/margins": 0.29950928688049316, + "rewards/rejected": -1.3021478652954102, + "step": 1340 + }, + { + "epoch": 0.18, + "learning_rate": 4.911230714578858e-06, + "logits/chosen": -1.6912847757339478, + "logits/rejected": -1.8542919158935547, + "logps/chosen": -274.45208740234375, + "logps/rejected": -340.21807861328125, + "loss": 0.5671, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5728007555007935, + "rewards/margins": 0.4466250538825989, + "rewards/rejected": -1.0194257497787476, + "step": 1350 + }, + { + "epoch": 0.18, + "learning_rate": 4.908188799380558e-06, + "logits/chosen": -2.217525005340576, + "logits/rejected": -1.9035638570785522, + "logps/chosen": -292.1073303222656, + "logps/rejected": -301.85491943359375, + "loss": 0.5692, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.46799778938293457, + "rewards/margins": 0.4237847924232483, + "rewards/rejected": -0.8917825818061829, + "step": 1360 + }, + { + "epoch": 0.18, + "learning_rate": 4.905096613151975e-06, + "logits/chosen": -2.0345702171325684, + "logits/rejected": -1.6463664770126343, + "logps/chosen": -375.4377746582031, + "logps/rejected": -376.78997802734375, + "loss": 0.629, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7877501845359802, + "rewards/margins": 0.28059062361717224, + "rewards/rejected": -1.06834077835083, + "step": 1370 + }, + { + "epoch": 0.18, + "learning_rate": 4.90195422044261e-06, + "logits/chosen": -1.9617021083831787, + "logits/rejected": -1.6719785928726196, + "logps/chosen": -360.3094787597656, + "logps/rejected": -401.42926025390625, + "loss": 0.4957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7087548971176147, + "rewards/margins": 0.6915256381034851, + "rewards/rejected": -1.4002805948257446, + "step": 1380 + }, + { + "epoch": 0.18, + "learning_rate": 4.898761686850028e-06, + "logits/chosen": -1.6123435497283936, + "logits/rejected": -1.2256313562393188, + "logps/chosen": -361.95721435546875, + "logps/rejected": -406.7974548339844, + "loss": 0.5884, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.009804129600525, + "rewards/margins": 0.534344494342804, + "rewards/rejected": -1.544148564338684, + "step": 1390 + }, + { + "epoch": 0.18, + "learning_rate": 4.895519079018485e-06, + "logits/chosen": -1.6615076065063477, + "logits/rejected": -0.7398694753646851, + "logps/chosen": -338.4859619140625, + "logps/rejected": -375.15704345703125, + "loss": 0.5789, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8177839517593384, + "rewards/margins": 0.6118384599685669, + "rewards/rejected": -1.4296224117279053, + "step": 1400 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -0.5845944285392761, + "eval_logits/rejected": -0.1907668113708496, + "eval_logps/chosen": -356.483154296875, + "eval_logps/rejected": -384.9108581542969, + "eval_loss": 0.5771133303642273, + "eval_rewards/accuracies": 0.703499972820282, + "eval_rewards/chosen": -0.8799312114715576, + "eval_rewards/margins": 0.4833654463291168, + "eval_rewards/rejected": -1.3632965087890625, + "eval_runtime": 1177.7993, + "eval_samples_per_second": 1.698, + "eval_steps_per_second": 0.849, + "step": 1400 + }, + { + "epoch": 0.18, + "learning_rate": 4.89222646463754e-06, + "logits/chosen": -1.5245163440704346, + "logits/rejected": -1.3327850103378296, + "logps/chosen": -351.33154296875, + "logps/rejected": -393.2879333496094, + "loss": 0.5975, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9374910593032837, + "rewards/margins": 0.4914394021034241, + "rewards/rejected": -1.4289302825927734, + "step": 1410 + }, + { + "epoch": 0.19, + "learning_rate": 4.888883912440642e-06, + "logits/chosen": -1.7364609241485596, + "logits/rejected": -1.2762303352355957, + "logps/chosen": -401.25, + "logps/rejected": -442.7745666503906, + "loss": 0.5555, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9093993306159973, + "rewards/margins": 0.5804132223129272, + "rewards/rejected": -1.4898124933242798, + "step": 1420 + }, + { + "epoch": 0.19, + "learning_rate": 4.885491492203688e-06, + "logits/chosen": -1.50997793674469, + "logits/rejected": -1.1342096328735352, + "logps/chosen": -360.4520568847656, + "logps/rejected": -378.17291259765625, + "loss": 0.5518, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8876248598098755, + "rewards/margins": 0.491793155670166, + "rewards/rejected": -1.3794180154800415, + "step": 1430 + }, + { + "epoch": 0.19, + "learning_rate": 4.882049274743578e-06, + "logits/chosen": -1.7279183864593506, + "logits/rejected": -1.3486309051513672, + "logps/chosen": -405.9081115722656, + "logps/rejected": -435.51708984375, + "loss": 0.5283, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9554494619369507, + "rewards/margins": 0.64899742603302, + "rewards/rejected": -1.6044470071792603, + "step": 1440 + }, + { + "epoch": 0.19, + "learning_rate": 4.878557331916729e-06, + "logits/chosen": -1.4362828731536865, + "logits/rejected": -1.2300403118133545, + "logps/chosen": -356.7294006347656, + "logps/rejected": -382.6611633300781, + "loss": 0.5344, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9905643463134766, + "rewards/margins": 0.5484082698822021, + "rewards/rejected": -1.5389726161956787, + "step": 1450 + }, + { + "epoch": 0.19, + "learning_rate": 4.875015736617576e-06, + "logits/chosen": -1.6814028024673462, + "logits/rejected": -1.2631704807281494, + "logps/chosen": -434.5621643066406, + "logps/rejected": -433.96038818359375, + "loss": 0.5972, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0096790790557861, + "rewards/margins": 0.5042110681533813, + "rewards/rejected": -1.5138901472091675, + "step": 1460 + }, + { + "epoch": 0.19, + "learning_rate": 4.8714245627770515e-06, + "logits/chosen": -1.7851835489273071, + "logits/rejected": -1.0083516836166382, + "logps/chosen": -348.3280334472656, + "logps/rejected": -359.25750732421875, + "loss": 0.6018, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0166282653808594, + "rewards/margins": 0.45403042435646057, + "rewards/rejected": -1.470658540725708, + "step": 1470 + }, + { + "epoch": 0.19, + "learning_rate": 4.8677838853610445e-06, + "logits/chosen": -1.6132957935333252, + "logits/rejected": -0.8410366177558899, + "logps/chosen": -358.55548095703125, + "logps/rejected": -369.1767883300781, + "loss": 0.5801, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9697539210319519, + "rewards/margins": 0.5656896829605103, + "rewards/rejected": -1.535443663597107, + "step": 1480 + }, + { + "epoch": 0.19, + "learning_rate": 4.864093780368828e-06, + "logits/chosen": -1.8534510135650635, + "logits/rejected": -1.0167924165725708, + "logps/chosen": -379.2197265625, + "logps/rejected": -396.7680358886719, + "loss": 0.4763, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8626648783683777, + "rewards/margins": 0.758463978767395, + "rewards/rejected": -1.621128797531128, + "step": 1490 + }, + { + "epoch": 0.2, + "learning_rate": 4.860354324831482e-06, + "logits/chosen": -1.2768698930740356, + "logits/rejected": -1.3293505907058716, + "logps/chosen": -382.0423583984375, + "logps/rejected": -450.2859802246094, + "loss": 0.5456, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1258668899536133, + "rewards/margins": 0.6569615602493286, + "rewards/rejected": -1.7828283309936523, + "step": 1500 + }, + { + "epoch": 0.2, + "eval_logits/chosen": -0.15739573538303375, + "eval_logits/rejected": 0.3097783625125885, + "eval_logps/chosen": -386.943603515625, + "eval_logps/rejected": -427.7158203125, + "eval_loss": 0.5645538568496704, + "eval_rewards/accuracies": 0.703499972820282, + "eval_rewards/chosen": -1.1845359802246094, + "eval_rewards/margins": 0.606810986995697, + "eval_rewards/rejected": -1.7913470268249512, + "eval_runtime": 1155.3202, + "eval_samples_per_second": 1.731, + "eval_steps_per_second": 0.866, + "step": 1500 + }, + { + "epoch": 0.2, + "learning_rate": 4.856565596810279e-06, + "logits/chosen": -1.3375440835952759, + "logits/rejected": -0.9983604550361633, + "logps/chosen": -339.1542053222656, + "logps/rejected": -410.56072998046875, + "loss": 0.5896, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2330477237701416, + "rewards/margins": 0.5682455897331238, + "rewards/rejected": -1.8012933731079102, + "step": 1510 + }, + { + "epoch": 0.2, + "learning_rate": 4.852727675395056e-06, + "logits/chosen": -1.191486120223999, + "logits/rejected": -0.736042857170105, + "logps/chosen": -383.180419921875, + "logps/rejected": -435.84185791015625, + "loss": 0.4452, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2295153141021729, + "rewards/margins": 0.8655103445053101, + "rewards/rejected": -2.0950255393981934, + "step": 1520 + }, + { + "epoch": 0.2, + "learning_rate": 4.848840640702565e-06, + "logits/chosen": -1.4446022510528564, + "logits/rejected": -1.0689058303833008, + "logps/chosen": -384.16009521484375, + "logps/rejected": -390.2742614746094, + "loss": 0.6654, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3684113025665283, + "rewards/margins": 0.37485355138778687, + "rewards/rejected": -1.74326491355896, + "step": 1530 + }, + { + "epoch": 0.2, + "learning_rate": 4.844904573874798e-06, + "logits/chosen": -1.4517594575881958, + "logits/rejected": -0.9329123497009277, + "logps/chosen": -404.48828125, + "logps/rejected": -426.7264709472656, + "loss": 0.5109, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2724699974060059, + "rewards/margins": 0.7288385629653931, + "rewards/rejected": -2.0013086795806885, + "step": 1540 + }, + { + "epoch": 0.2, + "learning_rate": 4.840919557077297e-06, + "logits/chosen": -1.6456127166748047, + "logits/rejected": -0.6936973333358765, + "logps/chosen": -370.9361267089844, + "logps/rejected": -393.5981140136719, + "loss": 0.5758, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0427272319793701, + "rewards/margins": 0.5629565119743347, + "rewards/rejected": -1.60568368434906, + "step": 1550 + }, + { + "epoch": 0.2, + "learning_rate": 4.836885673497435e-06, + "logits/chosen": -1.382566213607788, + "logits/rejected": -0.757800817489624, + "logps/chosen": -366.60137939453125, + "logps/rejected": -412.43865966796875, + "loss": 0.501, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9314566850662231, + "rewards/margins": 0.710411548614502, + "rewards/rejected": -1.6418683528900146, + "step": 1560 + }, + { + "epoch": 0.21, + "learning_rate": 4.832803007342679e-06, + "logits/chosen": -0.9425197839736938, + "logits/rejected": -0.9502668380737305, + "logps/chosen": -353.66204833984375, + "logps/rejected": -422.31640625, + "loss": 0.5845, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1626675128936768, + "rewards/margins": 0.5805758237838745, + "rewards/rejected": -1.7432434558868408, + "step": 1570 + }, + { + "epoch": 0.21, + "learning_rate": 4.828671643838839e-06, + "logits/chosen": -1.275815725326538, + "logits/rejected": -0.9463005065917969, + "logps/chosen": -385.7954406738281, + "logps/rejected": -382.41021728515625, + "loss": 0.6096, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1785976886749268, + "rewards/margins": 0.4413912892341614, + "rewards/rejected": -1.619989037513733, + "step": 1580 + }, + { + "epoch": 0.21, + "learning_rate": 4.824491669228279e-06, + "logits/chosen": -1.289780616760254, + "logits/rejected": -0.7804897427558899, + "logps/chosen": -371.3612060546875, + "logps/rejected": -400.07611083984375, + "loss": 0.5764, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.173409104347229, + "rewards/margins": 0.5362740159034729, + "rewards/rejected": -1.7096830606460571, + "step": 1590 + }, + { + "epoch": 0.21, + "learning_rate": 4.8202631707681245e-06, + "logits/chosen": -1.3760565519332886, + "logits/rejected": -0.5771588683128357, + "logps/chosen": -360.80352783203125, + "logps/rejected": -426.24688720703125, + "loss": 0.4722, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.24233078956604, + "rewards/margins": 0.8159643411636353, + "rewards/rejected": -2.0582950115203857, + "step": 1600 + }, + { + "epoch": 0.21, + "eval_logits/chosen": 0.03460278362035751, + "eval_logits/rejected": 0.539508044719696, + "eval_logps/chosen": -400.91131591796875, + "eval_logps/rejected": -442.8173828125, + "eval_loss": 0.5598156452178955, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -1.3242131471633911, + "eval_rewards/margins": 0.6181491613388062, + "eval_rewards/rejected": -1.9423623085021973, + "eval_runtime": 1154.908, + "eval_samples_per_second": 1.732, + "eval_steps_per_second": 0.866, + "step": 1600 + }, + { + "epoch": 0.21, + "learning_rate": 4.815986236728437e-06, + "logits/chosen": -1.1099474430084229, + "logits/rejected": -0.6507538557052612, + "logps/chosen": -378.3443298339844, + "logps/rejected": -439.36651611328125, + "loss": 0.5462, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2413318157196045, + "rewards/margins": 0.6197353601455688, + "rewards/rejected": -1.8610671758651733, + "step": 1610 + }, + { + "epoch": 0.21, + "learning_rate": 4.811660956390372e-06, + "logits/chosen": -1.168228030204773, + "logits/rejected": -0.9048255085945129, + "logps/chosen": -420.23974609375, + "logps/rejected": -447.28790283203125, + "loss": 0.5796, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1689625978469849, + "rewards/margins": 0.5838147401809692, + "rewards/rejected": -1.7527774572372437, + "step": 1620 + }, + { + "epoch": 0.21, + "learning_rate": 4.807287420044319e-06, + "logits/chosen": -1.4433248043060303, + "logits/rejected": -0.7626504302024841, + "logps/chosen": -344.18035888671875, + "logps/rejected": -398.12451171875, + "loss": 0.5509, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1426336765289307, + "rewards/margins": 0.6661030650138855, + "rewards/rejected": -1.8087365627288818, + "step": 1630 + }, + { + "epoch": 0.21, + "learning_rate": 4.802865718988008e-06, + "logits/chosen": -1.0445847511291504, + "logits/rejected": -1.0025596618652344, + "logps/chosen": -345.99066162109375, + "logps/rejected": -438.6319885253906, + "loss": 0.5689, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2196751832962036, + "rewards/margins": 0.5887807607650757, + "rewards/rejected": -1.8084558248519897, + "step": 1640 + }, + { + "epoch": 0.22, + "learning_rate": 4.798395945524615e-06, + "logits/chosen": -1.2105060815811157, + "logits/rejected": -0.5044958591461182, + "logps/chosen": -390.4164123535156, + "logps/rejected": -438.38916015625, + "loss": 0.5252, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3014953136444092, + "rewards/margins": 0.7168537974357605, + "rewards/rejected": -2.0183491706848145, + "step": 1650 + }, + { + "epoch": 0.22, + "learning_rate": 4.793878192960823e-06, + "logits/chosen": -1.5575644969940186, + "logits/rejected": -0.6008479595184326, + "logps/chosen": -447.37384033203125, + "logps/rejected": -512.7503051757812, + "loss": 0.5289, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3483020067214966, + "rewards/margins": 0.7725515961647034, + "rewards/rejected": -2.1208536624908447, + "step": 1660 + }, + { + "epoch": 0.22, + "learning_rate": 4.789312555604887e-06, + "logits/chosen": -1.3201217651367188, + "logits/rejected": -0.7254796028137207, + "logps/chosen": -357.54486083984375, + "logps/rejected": -404.8153381347656, + "loss": 0.5135, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0712493658065796, + "rewards/margins": 0.709019660949707, + "rewards/rejected": -1.7802692651748657, + "step": 1670 + }, + { + "epoch": 0.22, + "learning_rate": 4.784699128764654e-06, + "logits/chosen": -1.4174318313598633, + "logits/rejected": -0.4926614761352539, + "logps/chosen": -362.864013671875, + "logps/rejected": -409.38238525390625, + "loss": 0.5552, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1419918537139893, + "rewards/margins": 0.6905233263969421, + "rewards/rejected": -1.8325151205062866, + "step": 1680 + }, + { + "epoch": 0.22, + "learning_rate": 4.780038008745581e-06, + "logits/chosen": -1.2989250421524048, + "logits/rejected": -0.561491072177887, + "logps/chosen": -437.3382263183594, + "logps/rejected": -468.6178283691406, + "loss": 0.5451, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4737846851348877, + "rewards/margins": 0.7077856063842773, + "rewards/rejected": -2.181570291519165, + "step": 1690 + }, + { + "epoch": 0.22, + "learning_rate": 4.775329292848721e-06, + "logits/chosen": -0.8505582809448242, + "logits/rejected": -0.4519156515598297, + "logps/chosen": -432.04736328125, + "logps/rejected": -503.6212463378906, + "loss": 0.5072, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4707634449005127, + "rewards/margins": 0.8545511960983276, + "rewards/rejected": -2.32531476020813, + "step": 1700 + }, + { + "epoch": 0.22, + "eval_logits/chosen": 0.46571579575538635, + "eval_logits/rejected": 1.0410724878311157, + "eval_logps/chosen": -418.8860168457031, + "eval_logps/rejected": -465.253662109375, + "eval_loss": 0.5574353933334351, + "eval_rewards/accuracies": 0.7059999704360962, + "eval_rewards/chosen": -1.503960132598877, + "eval_rewards/margins": 0.6627644896507263, + "eval_rewards/rejected": -2.166724681854248, + "eval_runtime": 1172.1157, + "eval_samples_per_second": 1.706, + "eval_steps_per_second": 0.853, + "step": 1700 + }, + { + "epoch": 0.22, + "learning_rate": 4.770573079368691e-06, + "logits/chosen": -0.927274227142334, + "logits/rejected": -0.7411154508590698, + "logps/chosen": -412.53338623046875, + "logps/rejected": -437.78875732421875, + "loss": 0.6122, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4526969194412231, + "rewards/margins": 0.5428808927536011, + "rewards/rejected": -1.9955780506134033, + "step": 1710 + }, + { + "epoch": 0.23, + "learning_rate": 4.765769467591626e-06, + "logits/chosen": -1.1742867231369019, + "logits/rejected": -0.6260591745376587, + "logps/chosen": -435.9891052246094, + "logps/rejected": -470.205810546875, + "loss": 0.5436, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4501960277557373, + "rewards/margins": 0.603704035282135, + "rewards/rejected": -2.0539000034332275, + "step": 1720 + }, + { + "epoch": 0.23, + "learning_rate": 4.760918557793096e-06, + "logits/chosen": -1.0430524349212646, + "logits/rejected": -0.6410683989524841, + "logps/chosen": -408.3353271484375, + "logps/rejected": -473.285888671875, + "loss": 0.5752, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.56908118724823, + "rewards/margins": 0.5391335487365723, + "rewards/rejected": -2.108214855194092, + "step": 1730 + }, + { + "epoch": 0.23, + "learning_rate": 4.756020451236025e-06, + "logits/chosen": -1.3290060758590698, + "logits/rejected": -0.5599908232688904, + "logps/chosen": -448.45428466796875, + "logps/rejected": -473.66387939453125, + "loss": 0.5918, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4412941932678223, + "rewards/margins": 0.5439326167106628, + "rewards/rejected": -1.9852268695831299, + "step": 1740 + }, + { + "epoch": 0.23, + "learning_rate": 4.751075250168569e-06, + "logits/chosen": -1.4968469142913818, + "logits/rejected": 0.1215222105383873, + "logps/chosen": -433.72222900390625, + "logps/rejected": -473.5547790527344, + "loss": 0.539, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6900333166122437, + "rewards/margins": 0.7276581525802612, + "rewards/rejected": -2.417691707611084, + "step": 1750 + }, + { + "epoch": 0.23, + "learning_rate": 4.746083057821981e-06, + "logits/chosen": -1.1097086668014526, + "logits/rejected": -0.21016299724578857, + "logps/chosen": -395.05413818359375, + "logps/rejected": -453.3968811035156, + "loss": 0.5255, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.454843282699585, + "rewards/margins": 0.9116643667221069, + "rewards/rejected": -2.3665075302124023, + "step": 1760 + }, + { + "epoch": 0.23, + "learning_rate": 4.741043978408463e-06, + "logits/chosen": -0.9275191426277161, + "logits/rejected": -0.33824652433395386, + "logps/chosen": -380.26177978515625, + "logps/rejected": -466.066650390625, + "loss": 0.509, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2040696144104004, + "rewards/margins": 0.8978725671768188, + "rewards/rejected": -2.1019420623779297, + "step": 1770 + }, + { + "epoch": 0.23, + "learning_rate": 4.735958117118983e-06, + "logits/chosen": -1.4339239597320557, + "logits/rejected": -0.27769067883491516, + "logps/chosen": -403.5133056640625, + "logps/rejected": -451.6259765625, + "loss": 0.5387, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0971940755844116, + "rewards/margins": 0.7503548860549927, + "rewards/rejected": -1.8475488424301147, + "step": 1780 + }, + { + "epoch": 0.23, + "learning_rate": 4.730825580121084e-06, + "logits/chosen": -1.1844618320465088, + "logits/rejected": -0.35769081115722656, + "logps/chosen": -366.8836669921875, + "logps/rejected": -446.3826599121094, + "loss": 0.5439, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2104787826538086, + "rewards/margins": 0.7347462177276611, + "rewards/rejected": -1.9452251195907593, + "step": 1790 + }, + { + "epoch": 0.24, + "learning_rate": 4.725646474556666e-06, + "logits/chosen": -0.8105975985527039, + "logits/rejected": -0.6895971894264221, + "logps/chosen": -352.57110595703125, + "logps/rejected": -445.0667419433594, + "loss": 0.5284, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.290921926498413, + "rewards/margins": 0.7879074811935425, + "rewards/rejected": -2.078829526901245, + "step": 1800 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 0.6528418064117432, + "eval_logits/rejected": 1.2403807640075684, + "eval_logps/chosen": -423.3541564941406, + "eval_logps/rejected": -469.1292724609375, + "eval_loss": 0.5534334182739258, + "eval_rewards/accuracies": 0.7070000171661377, + "eval_rewards/chosen": -1.5486416816711426, + "eval_rewards/margins": 0.6568393111228943, + "eval_rewards/rejected": -2.2054810523986816, + "eval_runtime": 1170.7626, + "eval_samples_per_second": 1.708, + "eval_steps_per_second": 0.854, + "step": 1800 + }, + { + "epoch": 0.24, + "learning_rate": 4.720420908539748e-06, + "logits/chosen": -1.189819574356079, + "logits/rejected": -0.6282674074172974, + "logps/chosen": -418.718505859375, + "logps/rejected": -480.90216064453125, + "loss": 0.6174, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7022778987884521, + "rewards/margins": 0.5764120817184448, + "rewards/rejected": -2.2786898612976074, + "step": 1810 + }, + { + "epoch": 0.24, + "learning_rate": 4.715148991154216e-06, + "logits/chosen": -1.222535252571106, + "logits/rejected": -0.8831266164779663, + "logps/chosen": -516.7198486328125, + "logps/rejected": -564.3641967773438, + "loss": 0.586, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7563714981079102, + "rewards/margins": 0.5409680604934692, + "rewards/rejected": -2.297339677810669, + "step": 1820 + }, + { + "epoch": 0.24, + "learning_rate": 4.709830832451538e-06, + "logits/chosen": -0.8930926322937012, + "logits/rejected": -0.4406977593898773, + "logps/chosen": -486.24578857421875, + "logps/rejected": -537.7103881835938, + "loss": 0.5226, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8711732625961304, + "rewards/margins": 0.6819201111793518, + "rewards/rejected": -2.553093433380127, + "step": 1830 + }, + { + "epoch": 0.24, + "learning_rate": 4.704466543448477e-06, + "logits/chosen": -1.0724461078643799, + "logits/rejected": -0.03855453059077263, + "logps/chosen": -519.872802734375, + "logps/rejected": -553.6289672851562, + "loss": 0.4735, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7257235050201416, + "rewards/margins": 0.9186381101608276, + "rewards/rejected": -2.644361972808838, + "step": 1840 + }, + { + "epoch": 0.24, + "learning_rate": 4.699056236124762e-06, + "logits/chosen": -1.1924560070037842, + "logits/rejected": -0.6589155197143555, + "logps/chosen": -442.31707763671875, + "logps/rejected": -505.4425354003906, + "loss": 0.6075, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8461265563964844, + "rewards/margins": 0.616051197052002, + "rewards/rejected": -2.4621777534484863, + "step": 1850 + }, + { + "epoch": 0.24, + "learning_rate": 4.693600023420758e-06, + "logits/chosen": -1.2352359294891357, + "logits/rejected": -0.2970736026763916, + "logps/chosen": -436.0951232910156, + "logps/rejected": -452.52923583984375, + "loss": 0.4406, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.391472339630127, + "rewards/margins": 0.9484335780143738, + "rewards/rejected": -2.3399059772491455, + "step": 1860 + }, + { + "epoch": 0.24, + "learning_rate": 4.688098019235108e-06, + "logits/chosen": -1.2450083494186401, + "logits/rejected": -0.5829068422317505, + "logps/chosen": -445.16143798828125, + "logps/rejected": -517.4144287109375, + "loss": 0.5586, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5391108989715576, + "rewards/margins": 0.8366822004318237, + "rewards/rejected": -2.375793218612671, + "step": 1870 + }, + { + "epoch": 0.25, + "learning_rate": 4.682550338422353e-06, + "logits/chosen": -1.3568122386932373, + "logits/rejected": -0.5020118355751038, + "logps/chosen": -410.96826171875, + "logps/rejected": -465.6785583496094, + "loss": 0.4733, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4551056623458862, + "rewards/margins": 0.9018338322639465, + "rewards/rejected": -2.3569395542144775, + "step": 1880 + }, + { + "epoch": 0.25, + "learning_rate": 4.676957096790536e-06, + "logits/chosen": -1.0935922861099243, + "logits/rejected": -0.5090438723564148, + "logps/chosen": -427.3106994628906, + "logps/rejected": -443.31231689453125, + "loss": 0.6429, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6104503870010376, + "rewards/margins": 0.5206896066665649, + "rewards/rejected": -2.1311399936676025, + "step": 1890 + }, + { + "epoch": 0.25, + "learning_rate": 4.671318411098782e-06, + "logits/chosen": -0.7024677991867065, + "logits/rejected": -0.8284416198730469, + "logps/chosen": -458.29608154296875, + "logps/rejected": -553.300537109375, + "loss": 0.5623, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7181098461151123, + "rewards/margins": 0.9186338186264038, + "rewards/rejected": -2.6367437839508057, + "step": 1900 + }, + { + "epoch": 0.25, + "eval_logits/chosen": 0.30577704310417175, + "eval_logits/rejected": 0.7807530760765076, + "eval_logps/chosen": -439.5539245605469, + "eval_logps/rejected": -491.0526123046875, + "eval_loss": 0.5624514222145081, + "eval_rewards/accuracies": 0.7055000066757202, + "eval_rewards/chosen": -1.7106391191482544, + "eval_rewards/margins": 0.7140753269195557, + "eval_rewards/rejected": -2.4247145652770996, + "eval_runtime": 1175.0312, + "eval_samples_per_second": 1.702, + "eval_steps_per_second": 0.851, + "step": 1900 + }, + { + "epoch": 0.25, + "learning_rate": 4.665634399054864e-06, + "logits/chosen": -0.725782036781311, + "logits/rejected": -0.6188144683837891, + "logps/chosen": -409.5749206542969, + "logps/rejected": -466.3291931152344, + "loss": 0.663, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.785436987876892, + "rewards/margins": 0.5989683270454407, + "rewards/rejected": -2.3844053745269775, + "step": 1910 + }, + { + "epoch": 0.25, + "learning_rate": 4.659905179312743e-06, + "logits/chosen": -1.4557887315750122, + "logits/rejected": -0.6816984415054321, + "logps/chosen": -442.3401794433594, + "logps/rejected": -428.6465759277344, + "loss": 0.6225, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4092276096343994, + "rewards/margins": 0.5179110765457153, + "rewards/rejected": -1.9271386861801147, + "step": 1920 + }, + { + "epoch": 0.25, + "learning_rate": 4.654130871470093e-06, + "logits/chosen": -1.4024882316589355, + "logits/rejected": -0.5982595682144165, + "logps/chosen": -372.2467346191406, + "logps/rejected": -375.06610107421875, + "loss": 0.6068, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1297688484191895, + "rewards/margins": 0.4118806719779968, + "rewards/rejected": -1.541649580001831, + "step": 1930 + }, + { + "epoch": 0.25, + "learning_rate": 4.6483115960658045e-06, + "logits/chosen": -1.4664485454559326, + "logits/rejected": -0.3487986624240875, + "logps/chosen": -396.49334716796875, + "logps/rejected": -393.38616943359375, + "loss": 0.5078, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2406772375106812, + "rewards/margins": 0.653410792350769, + "rewards/rejected": -1.8940880298614502, + "step": 1940 + }, + { + "epoch": 0.26, + "learning_rate": 4.642447474577466e-06, + "logits/chosen": -1.0883629322052002, + "logits/rejected": -1.072458028793335, + "logps/chosen": -371.15093994140625, + "logps/rejected": -431.68426513671875, + "loss": 0.5129, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.395630121231079, + "rewards/margins": 0.68388831615448, + "rewards/rejected": -2.0795183181762695, + "step": 1950 + }, + { + "epoch": 0.26, + "learning_rate": 4.636538629418832e-06, + "logits/chosen": -1.2723660469055176, + "logits/rejected": -0.904948353767395, + "logps/chosen": -401.46112060546875, + "logps/rejected": -466.59393310546875, + "loss": 0.4758, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1891543865203857, + "rewards/margins": 0.8602797389030457, + "rewards/rejected": -2.049434185028076, + "step": 1960 + }, + { + "epoch": 0.26, + "learning_rate": 4.630585183937263e-06, + "logits/chosen": -1.2972795963287354, + "logits/rejected": -0.6552487015724182, + "logps/chosen": -409.4198913574219, + "logps/rejected": -429.36248779296875, + "loss": 0.5831, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.228527307510376, + "rewards/margins": 0.5128410458564758, + "rewards/rejected": -1.741368055343628, + "step": 1970 + }, + { + "epoch": 0.26, + "learning_rate": 4.6245872624111535e-06, + "logits/chosen": -1.1344765424728394, + "logits/rejected": -0.9775272607803345, + "logps/chosen": -344.75238037109375, + "logps/rejected": -382.8861389160156, + "loss": 0.615, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2352514266967773, + "rewards/margins": 0.49147820472717285, + "rewards/rejected": -1.7267297506332397, + "step": 1980 + }, + { + "epoch": 0.26, + "learning_rate": 4.618544990047336e-06, + "logits/chosen": -1.1246018409729004, + "logits/rejected": -0.5179897546768188, + "logps/chosen": -425.73406982421875, + "logps/rejected": -470.3760681152344, + "loss": 0.5957, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2693254947662354, + "rewards/margins": 0.6230798959732056, + "rewards/rejected": -1.8924052715301514, + "step": 1990 + }, + { + "epoch": 0.26, + "learning_rate": 4.612458492978473e-06, + "logits/chosen": -1.3398029804229736, + "logits/rejected": -0.8593130111694336, + "logps/chosen": -380.1425476074219, + "logps/rejected": -439.82891845703125, + "loss": 0.6092, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3958600759506226, + "rewards/margins": 0.515204131603241, + "rewards/rejected": -1.9110643863677979, + "step": 2000 + }, + { + "epoch": 0.26, + "eval_logits/chosen": 0.007945289835333824, + "eval_logits/rejected": 0.519872784614563, + "eval_logps/chosen": -370.0728454589844, + "eval_logps/rejected": -413.70892333984375, + "eval_loss": 0.550082266330719, + "eval_rewards/accuracies": 0.7085000276565552, + "eval_rewards/chosen": -1.0158281326293945, + "eval_rewards/margins": 0.6354495882987976, + "eval_rewards/rejected": -1.6512778997421265, + "eval_runtime": 1182.5268, + "eval_samples_per_second": 1.691, + "eval_steps_per_second": 0.846, + "step": 2000 + }, + { + "epoch": 0.26, + "learning_rate": 4.606327898260413e-06, + "logits/chosen": -1.1447194814682007, + "logits/rejected": -0.6674858331680298, + "logps/chosen": -386.634765625, + "logps/rejected": -428.674072265625, + "loss": 0.5888, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0479754209518433, + "rewards/margins": 0.6669696569442749, + "rewards/rejected": -1.7149450778961182, + "step": 2010 + }, + { + "epoch": 0.26, + "learning_rate": 4.600153333869549e-06, + "logits/chosen": -1.5867061614990234, + "logits/rejected": -0.8068370819091797, + "logps/chosen": -372.733154296875, + "logps/rejected": -399.0820007324219, + "loss": 0.5192, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9379202127456665, + "rewards/margins": 0.6650333404541016, + "rewards/rejected": -1.602953553199768, + "step": 2020 + }, + { + "epoch": 0.27, + "learning_rate": 4.593934928700141e-06, + "logits/chosen": -1.5054073333740234, + "logits/rejected": -0.331088125705719, + "logps/chosen": -396.2590026855469, + "logps/rejected": -440.35308837890625, + "loss": 0.516, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2215917110443115, + "rewards/margins": 0.7985371947288513, + "rewards/rejected": -2.0201287269592285, + "step": 2030 + }, + { + "epoch": 0.27, + "learning_rate": 4.587672812561626e-06, + "logits/chosen": -1.0228203535079956, + "logits/rejected": -0.6623596549034119, + "logps/chosen": -359.9720458984375, + "logps/rejected": -467.29461669921875, + "loss": 0.5241, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2281525135040283, + "rewards/margins": 0.7495523691177368, + "rewards/rejected": -1.9777047634124756, + "step": 2040 + }, + { + "epoch": 0.27, + "learning_rate": 4.581367116175911e-06, + "logits/chosen": -0.9328869581222534, + "logits/rejected": -0.2950093150138855, + "logps/chosen": -431.26873779296875, + "logps/rejected": -445.1465759277344, + "loss": 0.5993, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3680379390716553, + "rewards/margins": 0.5561319589614868, + "rewards/rejected": -1.9241701364517212, + "step": 2050 + }, + { + "epoch": 0.27, + "learning_rate": 4.5750179711746416e-06, + "logits/chosen": -0.9565197229385376, + "logits/rejected": -0.35139861702919006, + "logps/chosen": -391.14947509765625, + "logps/rejected": -434.99713134765625, + "loss": 0.5767, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3655421733856201, + "rewards/margins": 0.5043060183525085, + "rewards/rejected": -1.8698484897613525, + "step": 2060 + }, + { + "epoch": 0.27, + "learning_rate": 4.5686255100964535e-06, + "logits/chosen": -1.3030929565429688, + "logits/rejected": -0.6504136323928833, + "logps/chosen": -408.542236328125, + "logps/rejected": -433.14068603515625, + "loss": 0.5608, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.455055832862854, + "rewards/margins": 0.6146378517150879, + "rewards/rejected": -2.0696938037872314, + "step": 2070 + }, + { + "epoch": 0.27, + "learning_rate": 4.562189866384209e-06, + "logits/chosen": -0.8565002679824829, + "logits/rejected": -0.5893241763114929, + "logps/chosen": -393.67462158203125, + "logps/rejected": -500.3961486816406, + "loss": 0.5081, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5734975337982178, + "rewards/margins": 0.8559523820877075, + "rewards/rejected": -2.429450035095215, + "step": 2080 + }, + { + "epoch": 0.27, + "learning_rate": 4.555711174382209e-06, + "logits/chosen": -1.1275274753570557, + "logits/rejected": -0.4757865369319916, + "logps/chosen": -406.5447692871094, + "logps/rejected": -458.0465393066406, + "loss": 0.5529, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7623049020767212, + "rewards/margins": 0.6915754079818726, + "rewards/rejected": -2.4538803100585938, + "step": 2090 + }, + { + "epoch": 0.27, + "learning_rate": 4.549189569333387e-06, + "logits/chosen": -1.1001176834106445, + "logits/rejected": -0.4274144172668457, + "logps/chosen": -383.9236755371094, + "logps/rejected": -410.41162109375, + "loss": 0.5726, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4652644395828247, + "rewards/margins": 0.5952554941177368, + "rewards/rejected": -2.0605199337005615, + "step": 2100 + }, + { + "epoch": 0.27, + "eval_logits/chosen": 0.4405115842819214, + "eval_logits/rejected": 0.9980767965316772, + "eval_logps/chosen": -415.4569396972656, + "eval_logps/rejected": -464.3841857910156, + "eval_loss": 0.5433006882667542, + "eval_rewards/accuracies": 0.7149999737739563, + "eval_rewards/chosen": -1.4696691036224365, + "eval_rewards/margins": 0.6883615255355835, + "eval_rewards/rejected": -2.1580307483673096, + "eval_runtime": 1178.0191, + "eval_samples_per_second": 1.698, + "eval_steps_per_second": 0.849, + "step": 2100 + }, + { + "epoch": 0.28, + "learning_rate": 4.542625187376491e-06, + "logits/chosen": -1.145003318786621, + "logits/rejected": 0.003861379576846957, + "logps/chosen": -429.87255859375, + "logps/rejected": -454.26239013671875, + "loss": 0.5476, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3411312103271484, + "rewards/margins": 0.6698344349861145, + "rewards/rejected": -2.0109658241271973, + "step": 2110 + }, + { + "epoch": 0.28, + "learning_rate": 4.536018165543239e-06, + "logits/chosen": -0.990827739238739, + "logits/rejected": -0.4457016885280609, + "logps/chosen": -453.3412170410156, + "logps/rejected": -506.03741455078125, + "loss": 0.5634, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5011839866638184, + "rewards/margins": 0.6426320672035217, + "rewards/rejected": -2.1438162326812744, + "step": 2120 + }, + { + "epoch": 0.28, + "learning_rate": 4.529368641755453e-06, + "logits/chosen": -0.9683168530464172, + "logits/rejected": -0.37535277009010315, + "logps/chosen": -391.17059326171875, + "logps/rejected": -449.50714111328125, + "loss": 0.6002, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6766704320907593, + "rewards/margins": 0.6462152600288391, + "rewards/rejected": -2.322885513305664, + "step": 2130 + }, + { + "epoch": 0.28, + "learning_rate": 4.522676754822189e-06, + "logits/chosen": -1.1304986476898193, + "logits/rejected": -0.19757482409477234, + "logps/chosen": -427.12188720703125, + "logps/rejected": -439.59716796875, + "loss": 0.4706, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5254873037338257, + "rewards/margins": 0.839495837688446, + "rewards/rejected": -2.364983081817627, + "step": 2140 + }, + { + "epoch": 0.28, + "learning_rate": 4.515942644436836e-06, + "logits/chosen": -1.0416542291641235, + "logits/rejected": -0.08233954012393951, + "logps/chosen": -426.8787536621094, + "logps/rejected": -478.9959411621094, + "loss": 0.5463, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4865224361419678, + "rewards/margins": 0.7738333940505981, + "rewards/rejected": -2.2603557109832764, + "step": 2150 + }, + { + "epoch": 0.28, + "learning_rate": 4.509166451174194e-06, + "logits/chosen": -1.0556268692016602, + "logits/rejected": -0.6550186276435852, + "logps/chosen": -412.85565185546875, + "logps/rejected": -469.2373962402344, + "loss": 0.5133, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0766072273254395, + "rewards/margins": 0.8188613057136536, + "rewards/rejected": -1.8954684734344482, + "step": 2160 + }, + { + "epoch": 0.28, + "learning_rate": 4.502348316487552e-06, + "logits/chosen": -1.5349935293197632, + "logits/rejected": -0.5842410922050476, + "logps/chosen": -392.6039123535156, + "logps/rejected": -432.892578125, + "loss": 0.5352, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1731538772583008, + "rewards/margins": 0.7138124704360962, + "rewards/rejected": -1.886966347694397, + "step": 2170 + }, + { + "epoch": 0.29, + "learning_rate": 4.495488382705722e-06, + "logits/chosen": -1.4572819471359253, + "logits/rejected": -0.1267738789319992, + "logps/chosen": -435.73748779296875, + "logps/rejected": -435.91864013671875, + "loss": 0.4479, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9534079432487488, + "rewards/margins": 0.9101373553276062, + "rewards/rejected": -1.8635451793670654, + "step": 2180 + }, + { + "epoch": 0.29, + "learning_rate": 4.488586793030075e-06, + "logits/chosen": -1.0830357074737549, + "logits/rejected": -0.5002374053001404, + "logps/chosen": -330.9154052734375, + "logps/rejected": -439.44158935546875, + "loss": 0.4391, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0812445878982544, + "rewards/margins": 0.9342634081840515, + "rewards/rejected": -2.015507936477661, + "step": 2190 + }, + { + "epoch": 0.29, + "learning_rate": 4.481643691531551e-06, + "logits/chosen": -1.0210946798324585, + "logits/rejected": -0.11824611574411392, + "logps/chosen": -396.3450622558594, + "logps/rejected": -435.77734375, + "loss": 0.5323, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2495872974395752, + "rewards/margins": 0.7661415934562683, + "rewards/rejected": -2.015728712081909, + "step": 2200 + }, + { + "epoch": 0.29, + "eval_logits/chosen": 0.7444968819618225, + "eval_logits/rejected": 1.3532707691192627, + "eval_logps/chosen": -400.2243957519531, + "eval_logps/rejected": -457.445068359375, + "eval_loss": 0.5483363270759583, + "eval_rewards/accuracies": 0.7149999737739563, + "eval_rewards/chosen": -1.3173435926437378, + "eval_rewards/margins": 0.7712955474853516, + "eval_rewards/rejected": -2.088639259338379, + "eval_runtime": 1176.3943, + "eval_samples_per_second": 1.7, + "eval_steps_per_second": 0.85, + "step": 2200 + }, + { + "epoch": 0.29, + "learning_rate": 4.474659223147652e-06, + "logits/chosen": -0.6220839619636536, + "logits/rejected": -0.34505695104599, + "logps/chosen": -400.8376770019531, + "logps/rejected": -455.2310485839844, + "loss": 0.5745, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3072404861450195, + "rewards/margins": 0.7971282005310059, + "rewards/rejected": -2.1043686866760254, + "step": 2210 + }, + { + "epoch": 0.29, + "learning_rate": 4.4676335336794125e-06, + "logits/chosen": -0.9507938623428345, + "logits/rejected": -0.2239619791507721, + "logps/chosen": -421.60479736328125, + "logps/rejected": -466.75341796875, + "loss": 0.5746, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1340086460113525, + "rewards/margins": 0.7016123533248901, + "rewards/rejected": -1.8356211185455322, + "step": 2220 + }, + { + "epoch": 0.29, + "learning_rate": 4.46056676978836e-06, + "logits/chosen": -0.8311563730239868, + "logits/rejected": -0.7106046676635742, + "logps/chosen": -368.59271240234375, + "logps/rejected": -457.5370178222656, + "loss": 0.5991, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.149174451828003, + "rewards/margins": 0.5227919816970825, + "rewards/rejected": -1.671966314315796, + "step": 2230 + }, + { + "epoch": 0.29, + "learning_rate": 4.453459078993453e-06, + "logits/chosen": -0.7870109677314758, + "logits/rejected": -0.6436534523963928, + "logps/chosen": -353.3741760253906, + "logps/rejected": -449.02093505859375, + "loss": 0.4118, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9842308163642883, + "rewards/margins": 1.0269984006881714, + "rewards/rejected": -2.0112290382385254, + "step": 2240 + }, + { + "epoch": 0.29, + "learning_rate": 4.446310609668001e-06, + "logits/chosen": -0.685552716255188, + "logits/rejected": -0.2860308289527893, + "logps/chosen": -392.5391540527344, + "logps/rejected": -509.0357360839844, + "loss": 0.5367, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.616072654724121, + "rewards/margins": 0.8003999590873718, + "rewards/rejected": -2.4164726734161377, + "step": 2250 + }, + { + "epoch": 0.3, + "learning_rate": 4.439121511036562e-06, + "logits/chosen": -0.8480124473571777, + "logits/rejected": 0.001265025115571916, + "logps/chosen": -474.6297912597656, + "logps/rejected": -509.7176818847656, + "loss": 0.5317, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8826634883880615, + "rewards/margins": 0.7242499589920044, + "rewards/rejected": -2.6069135665893555, + "step": 2260 + }, + { + "epoch": 0.3, + "learning_rate": 4.431891933171839e-06, + "logits/chosen": -1.1753742694854736, + "logits/rejected": -0.3657965064048767, + "logps/chosen": -403.2115783691406, + "logps/rejected": -454.543212890625, + "loss": 0.6065, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4112242460250854, + "rewards/margins": 0.6532294154167175, + "rewards/rejected": -2.064453601837158, + "step": 2270 + }, + { + "epoch": 0.3, + "learning_rate": 4.424622026991536e-06, + "logits/chosen": -1.0060756206512451, + "logits/rejected": -0.37251150608062744, + "logps/chosen": -391.8915100097656, + "logps/rejected": -437.4137268066406, + "loss": 0.5717, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2653111219406128, + "rewards/margins": 0.6680868864059448, + "rewards/rejected": -1.9333980083465576, + "step": 2280 + }, + { + "epoch": 0.3, + "learning_rate": 4.417311944255215e-06, + "logits/chosen": -0.7842764258384705, + "logits/rejected": -0.8166142702102661, + "logps/chosen": -354.4258728027344, + "logps/rejected": -414.52362060546875, + "loss": 0.6835, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1270352602005005, + "rewards/margins": 0.41290101408958435, + "rewards/rejected": -1.5399363040924072, + "step": 2290 + }, + { + "epoch": 0.3, + "learning_rate": 4.409961837561122e-06, + "logits/chosen": -0.6530027985572815, + "logits/rejected": -0.5591267943382263, + "logps/chosen": -439.7125549316406, + "logps/rejected": -520.6340942382812, + "loss": 0.5148, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2820568084716797, + "rewards/margins": 0.7924367785453796, + "rewards/rejected": -2.074493408203125, + "step": 2300 + }, + { + "epoch": 0.3, + "eval_logits/chosen": 0.5106939077377319, + "eval_logits/rejected": 1.1454416513442993, + "eval_logps/chosen": -400.4307861328125, + "eval_logps/rejected": -450.4645690917969, + "eval_loss": 0.5387266278266907, + "eval_rewards/accuracies": 0.7275000214576721, + "eval_rewards/chosen": -1.3194077014923096, + "eval_rewards/margins": 0.6994263529777527, + "eval_rewards/rejected": -2.018834114074707, + "eval_runtime": 1973.7027, + "eval_samples_per_second": 1.013, + "eval_steps_per_second": 0.507, + "step": 2300 + }, + { + "epoch": 0.3, + "learning_rate": 4.402571860343006e-06, + "logits/chosen": -1.4138994216918945, + "logits/rejected": 0.06842954456806183, + "logps/chosen": -422.7046813964844, + "logps/rejected": -416.2068786621094, + "loss": 0.5745, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4042686223983765, + "rewards/margins": 0.6060650944709778, + "rewards/rejected": -2.01033353805542, + "step": 2310 + }, + { + "epoch": 0.3, + "learning_rate": 4.3951421668669165e-06, + "logits/chosen": -0.9971601366996765, + "logits/rejected": -0.3194156587123871, + "logps/chosen": -423.9295349121094, + "logps/rejected": -490.6680603027344, + "loss": 0.5077, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4917466640472412, + "rewards/margins": 0.8139832615852356, + "rewards/rejected": -2.305730104446411, + "step": 2320 + }, + { + "epoch": 0.3, + "learning_rate": 4.3876729122279784e-06, + "logits/chosen": -0.9761942028999329, + "logits/rejected": -0.7307599782943726, + "logps/chosen": -324.6410827636719, + "logps/rejected": -422.3362731933594, + "loss": 0.4951, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2676632404327393, + "rewards/margins": 0.9440364837646484, + "rewards/rejected": -2.2116997241973877, + "step": 2330 + }, + { + "epoch": 0.31, + "learning_rate": 4.3801642523471585e-06, + "logits/chosen": -1.422680139541626, + "logits/rejected": 0.22055339813232422, + "logps/chosen": -427.3160705566406, + "logps/rejected": -468.10980224609375, + "loss": 0.4991, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5216461420059204, + "rewards/margins": 0.8185620307922363, + "rewards/rejected": -2.3402082920074463, + "step": 2340 + }, + { + "epoch": 0.31, + "learning_rate": 4.37261634396801e-06, + "logits/chosen": -0.7539848685264587, + "logits/rejected": -0.37924566864967346, + "logps/chosen": -433.6114807128906, + "logps/rejected": -493.5924377441406, + "loss": 0.5182, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7175815105438232, + "rewards/margins": 0.7918481230735779, + "rewards/rejected": -2.509429693222046, + "step": 2350 + }, + { + "epoch": 0.31, + "learning_rate": 4.365029344653401e-06, + "logits/chosen": -1.1154688596725464, + "logits/rejected": -0.17494897544384003, + "logps/chosen": -479.53936767578125, + "logps/rejected": -493.6295471191406, + "loss": 0.5254, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4410346746444702, + "rewards/margins": 0.9099518656730652, + "rewards/rejected": -2.3509867191314697, + "step": 2360 + }, + { + "epoch": 0.31, + "learning_rate": 4.35740341278222e-06, + "logits/chosen": -1.2674121856689453, + "logits/rejected": -0.6544126272201538, + "logps/chosen": -448.0379943847656, + "logps/rejected": -505.9681701660156, + "loss": 0.5445, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2304657697677612, + "rewards/margins": 0.7070282697677612, + "rewards/rejected": -1.9374940395355225, + "step": 2370 + }, + { + "epoch": 0.31, + "learning_rate": 4.349738707546079e-06, + "logits/chosen": -0.8608369827270508, + "logits/rejected": -0.4290972650051117, + "logps/chosen": -400.2996826171875, + "logps/rejected": -436.1752014160156, + "loss": 0.5362, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3461062908172607, + "rewards/margins": 0.770219624042511, + "rewards/rejected": -2.116325855255127, + "step": 2380 + }, + { + "epoch": 0.31, + "learning_rate": 4.3420353889459835e-06, + "logits/chosen": -1.3208414316177368, + "logits/rejected": -0.2934853434562683, + "logps/chosen": -462.20465087890625, + "logps/rejected": -489.04156494140625, + "loss": 0.4917, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4787876605987549, + "rewards/margins": 0.8698205947875977, + "rewards/rejected": -2.3486080169677734, + "step": 2390 + }, + { + "epoch": 0.31, + "learning_rate": 4.334293617788992e-06, + "logits/chosen": -1.3569272756576538, + "logits/rejected": 0.16524724662303925, + "logps/chosen": -401.202880859375, + "logps/rejected": -444.6561584472656, + "loss": 0.4112, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5012043714523315, + "rewards/margins": 1.1782524585723877, + "rewards/rejected": -2.679456949234009, + "step": 2400 + }, + { + "epoch": 0.31, + "eval_logits/chosen": 0.6648316383361816, + "eval_logits/rejected": 1.2865513563156128, + "eval_logps/chosen": -430.5039978027344, + "eval_logps/rejected": -490.7723388671875, + "eval_loss": 0.5400993824005127, + "eval_rewards/accuracies": 0.7200000286102295, + "eval_rewards/chosen": -1.6201398372650146, + "eval_rewards/margins": 0.8017721772193909, + "eval_rewards/rejected": -2.4219119548797607, + "eval_runtime": 2327.0644, + "eval_samples_per_second": 0.859, + "eval_steps_per_second": 0.43, + "step": 2400 + }, + { + "epoch": 0.32, + "learning_rate": 4.326513555684867e-06, + "logits/chosen": -1.254010558128357, + "logits/rejected": 0.03536475822329521, + "logps/chosen": -450.07354736328125, + "logps/rejected": -438.840576171875, + "loss": 0.6208, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5773955583572388, + "rewards/margins": 0.5903611779212952, + "rewards/rejected": -2.1677565574645996, + "step": 2410 + }, + { + "epoch": 0.32, + "learning_rate": 4.31869536504269e-06, + "logits/chosen": -0.6679142117500305, + "logits/rejected": -0.44401970505714417, + "logps/chosen": -402.5331115722656, + "logps/rejected": -454.895263671875, + "loss": 0.5753, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4515701532363892, + "rewards/margins": 0.6856138110160828, + "rewards/rejected": -2.1371841430664062, + "step": 2420 + }, + { + "epoch": 0.32, + "learning_rate": 4.310839209067482e-06, + "logits/chosen": -1.3650258779525757, + "logits/rejected": 0.0447537787258625, + "logps/chosen": -433.0634765625, + "logps/rejected": -456.85467529296875, + "loss": 0.5657, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5849604606628418, + "rewards/margins": 0.5240734815597534, + "rewards/rejected": -2.1090340614318848, + "step": 2430 + }, + { + "epoch": 0.32, + "learning_rate": 4.302945251756788e-06, + "logits/chosen": -0.8697819709777832, + "logits/rejected": -0.31318679451942444, + "logps/chosen": -399.88421630859375, + "logps/rejected": -466.7974548339844, + "loss": 0.4323, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4232676029205322, + "rewards/margins": 0.9615268707275391, + "rewards/rejected": -2.3847947120666504, + "step": 2440 + }, + { + "epoch": 0.32, + "learning_rate": 4.29501365789726e-06, + "logits/chosen": -0.6540710926055908, + "logits/rejected": -0.13195696473121643, + "logps/chosen": -375.75726318359375, + "logps/rejected": -441.58135986328125, + "loss": 0.5364, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5193512439727783, + "rewards/margins": 0.8794568181037903, + "rewards/rejected": -2.398808002471924, + "step": 2450 + }, + { + "epoch": 0.32, + "learning_rate": 4.2870445930612135e-06, + "logits/chosen": -0.6666491627693176, + "logits/rejected": -0.20681039988994598, + "logps/chosen": -443.6128845214844, + "logps/rejected": -499.73370361328125, + "loss": 0.4717, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2950096130371094, + "rewards/margins": 1.0014543533325195, + "rewards/rejected": -2.296464204788208, + "step": 2460 + }, + { + "epoch": 0.32, + "learning_rate": 4.279038223603171e-06, + "logits/chosen": -0.9633728265762329, + "logits/rejected": 0.12687508761882782, + "logps/chosen": -392.9165344238281, + "logps/rejected": -445.36907958984375, + "loss": 0.5094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3179303407669067, + "rewards/margins": 0.8730724453926086, + "rewards/rejected": -2.19100284576416, + "step": 2470 + }, + { + "epoch": 0.32, + "learning_rate": 4.2709947166563906e-06, + "logits/chosen": -0.3830980658531189, + "logits/rejected": -0.04551839455962181, + "logps/chosen": -437.1377868652344, + "logps/rejected": -540.1201171875, + "loss": 0.5012, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7204186916351318, + "rewards/margins": 0.9347385168075562, + "rewards/rejected": -2.6551575660705566, + "step": 2480 + }, + { + "epoch": 0.33, + "learning_rate": 4.262914240129379e-06, + "logits/chosen": -0.7016464471817017, + "logits/rejected": 0.48445090651512146, + "logps/chosen": -457.5303649902344, + "logps/rejected": -510.51959228515625, + "loss": 0.5521, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.668168067932129, + "rewards/margins": 0.9587725400924683, + "rewards/rejected": -2.6269407272338867, + "step": 2490 + }, + { + "epoch": 0.33, + "learning_rate": 4.254796962702382e-06, + "logits/chosen": -0.9013687968254089, + "logits/rejected": -0.2077399045228958, + "logps/chosen": -482.5912170410156, + "logps/rejected": -530.4383544921875, + "loss": 0.5246, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9096496105194092, + "rewards/margins": 0.7448057532310486, + "rewards/rejected": -2.6544556617736816, + "step": 2500 + }, + { + "epoch": 0.33, + "eval_logits/chosen": 1.0913811922073364, + "eval_logits/rejected": 1.7388339042663574, + "eval_logps/chosen": -481.2728576660156, + "eval_logps/rejected": -538.2222290039062, + "eval_loss": 0.5413315296173096, + "eval_rewards/accuracies": 0.722000002861023, + "eval_rewards/chosen": -2.1278281211853027, + "eval_rewards/margins": 0.7685829997062683, + "eval_rewards/rejected": -2.896411180496216, + "eval_runtime": 2343.5019, + "eval_samples_per_second": 0.853, + "eval_steps_per_second": 0.427, + "step": 2500 + }, + { + "epoch": 0.33, + "learning_rate": 4.246643053823864e-06, + "logits/chosen": -0.9218589663505554, + "logits/rejected": -0.20635871589183807, + "logps/chosen": -394.0392761230469, + "logps/rejected": -518.8317260742188, + "loss": 0.4924, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8538624048233032, + "rewards/margins": 0.9797351956367493, + "rewards/rejected": -2.8335976600646973, + "step": 2510 + }, + { + "epoch": 0.33, + "learning_rate": 4.238452683706979e-06, + "logits/chosen": -0.845422089099884, + "logits/rejected": -0.2671593129634857, + "logps/chosen": -385.7846984863281, + "logps/rejected": -433.94219970703125, + "loss": 0.52, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.65887451171875, + "rewards/margins": 0.8339220285415649, + "rewards/rejected": -2.4927964210510254, + "step": 2520 + }, + { + "epoch": 0.33, + "learning_rate": 4.2302260233260025e-06, + "logits/chosen": -0.5951265096664429, + "logits/rejected": -0.30148234963417053, + "logps/chosen": -463.2898864746094, + "logps/rejected": -552.0485229492188, + "loss": 0.5021, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.882061243057251, + "rewards/margins": 0.9961883425712585, + "rewards/rejected": -2.8782496452331543, + "step": 2530 + }, + { + "epoch": 0.33, + "learning_rate": 4.2219632444127766e-06, + "logits/chosen": -0.3708893358707428, + "logits/rejected": 0.35125669836997986, + "logps/chosen": -490.02197265625, + "logps/rejected": -541.1585083007812, + "loss": 0.577, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2178242206573486, + "rewards/margins": 0.617692232131958, + "rewards/rejected": -2.8355164527893066, + "step": 2540 + }, + { + "epoch": 0.33, + "learning_rate": 4.213664519453115e-06, + "logits/chosen": -0.8187354803085327, + "logits/rejected": -0.09626396000385284, + "logps/chosen": -438.7705993652344, + "logps/rejected": -519.1937866210938, + "loss": 0.5085, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.062185287475586, + "rewards/margins": 0.7500772476196289, + "rewards/rejected": -2.8122622966766357, + "step": 2550 + }, + { + "epoch": 0.33, + "learning_rate": 4.205330021683208e-06, + "logits/chosen": -0.38596871495246887, + "logits/rejected": -0.07203111797571182, + "logps/chosen": -407.65740966796875, + "logps/rejected": -462.069580078125, + "loss": 0.5697, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9310047626495361, + "rewards/margins": 0.5898799300193787, + "rewards/rejected": -2.5208847522735596, + "step": 2560 + }, + { + "epoch": 0.34, + "learning_rate": 4.196959925086008e-06, + "logits/chosen": -0.22275564074516296, + "logits/rejected": -0.3874654173851013, + "logps/chosen": -478.38409423828125, + "logps/rejected": -548.2817993164062, + "loss": 0.6185, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1326379776000977, + "rewards/margins": 0.47789937257766724, + "rewards/rejected": -2.6105377674102783, + "step": 2570 + }, + { + "epoch": 0.34, + "learning_rate": 4.188554404387588e-06, + "logits/chosen": -1.0716884136199951, + "logits/rejected": -0.1740306168794632, + "logps/chosen": -451.58734130859375, + "logps/rejected": -492.9994201660156, + "loss": 0.5536, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7293052673339844, + "rewards/margins": 0.6498473286628723, + "rewards/rejected": -2.379152536392212, + "step": 2580 + }, + { + "epoch": 0.34, + "learning_rate": 4.180113635053504e-06, + "logits/chosen": -0.3191063404083252, + "logits/rejected": -0.5359519720077515, + "logps/chosen": -417.9990234375, + "logps/rejected": -500.7484436035156, + "loss": 0.5768, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6265789270401, + "rewards/margins": 0.6983104944229126, + "rewards/rejected": -2.3248894214630127, + "step": 2590 + }, + { + "epoch": 0.34, + "learning_rate": 4.17163779328513e-06, + "logits/chosen": -0.8110774755477905, + "logits/rejected": -0.09047582000494003, + "logps/chosen": -430.71197509765625, + "logps/rejected": -488.92913818359375, + "loss": 0.5657, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.597516417503357, + "rewards/margins": 0.8392965197563171, + "rewards/rejected": -2.4368128776550293, + "step": 2600 + }, + { + "epoch": 0.34, + "eval_logits/chosen": 0.9886474609375, + "eval_logits/rejected": 1.657106876373291, + "eval_logps/chosen": -437.1171875, + "eval_logps/rejected": -495.00030517578125, + "eval_loss": 0.5373482704162598, + "eval_rewards/accuracies": 0.7200000286102295, + "eval_rewards/chosen": -1.6862711906433105, + "eval_rewards/margins": 0.7779201865196228, + "eval_rewards/rejected": -2.4641919136047363, + "eval_runtime": 2326.8602, + "eval_samples_per_second": 0.86, + "eval_steps_per_second": 0.43, + "step": 2600 + }, + { + "epoch": 0.34, + "learning_rate": 4.163127056015975e-06, + "logits/chosen": -0.8432804346084595, + "logits/rejected": 0.2679641544818878, + "logps/chosen": -450.518798828125, + "logps/rejected": -520.7162475585938, + "loss": 0.5617, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.684958815574646, + "rewards/margins": 0.8352130651473999, + "rewards/rejected": -2.520171642303467, + "step": 2610 + }, + { + "epoch": 0.34, + "learning_rate": 4.154581600907994e-06, + "logits/chosen": -0.8970314264297485, + "logits/rejected": -0.025129878893494606, + "logps/chosen": -394.2820739746094, + "logps/rejected": -448.968505859375, + "loss": 0.4855, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.448757529258728, + "rewards/margins": 0.8607563972473145, + "rewards/rejected": -2.309514284133911, + "step": 2620 + }, + { + "epoch": 0.34, + "learning_rate": 4.14600160634788e-06, + "logits/chosen": -0.5631152391433716, + "logits/rejected": 0.03483830764889717, + "logps/chosen": -381.072998046875, + "logps/rejected": -490.3233947753906, + "loss": 0.4763, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4302902221679688, + "rewards/margins": 1.0292136669158936, + "rewards/rejected": -2.459503650665283, + "step": 2630 + }, + { + "epoch": 0.35, + "learning_rate": 4.137387251443335e-06, + "logits/chosen": -1.161237120628357, + "logits/rejected": 0.12660464644432068, + "logps/chosen": -383.56878662109375, + "logps/rejected": -413.07550048828125, + "loss": 0.5769, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3406836986541748, + "rewards/margins": 0.6809174418449402, + "rewards/rejected": -2.0216009616851807, + "step": 2640 + }, + { + "epoch": 0.35, + "learning_rate": 4.128738716019338e-06, + "logits/chosen": -0.9206587672233582, + "logits/rejected": 0.036871038377285004, + "logps/chosen": -433.82080078125, + "logps/rejected": -477.78619384765625, + "loss": 0.538, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3911445140838623, + "rewards/margins": 0.7216758728027344, + "rewards/rejected": -2.1128203868865967, + "step": 2650 + }, + { + "epoch": 0.35, + "learning_rate": 4.120056180614386e-06, + "logits/chosen": -0.46158695220947266, + "logits/rejected": 0.09717199206352234, + "logps/chosen": -426.7666015625, + "logps/rejected": -509.9091796875, + "loss": 0.5761, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.841655969619751, + "rewards/margins": 0.7033026814460754, + "rewards/rejected": -2.5449583530426025, + "step": 2660 + }, + { + "epoch": 0.35, + "learning_rate": 4.111339826476725e-06, + "logits/chosen": -0.36191526055336, + "logits/rejected": 0.032123737037181854, + "logps/chosen": -385.82342529296875, + "logps/rejected": -472.90203857421875, + "loss": 0.5479, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5523761510849, + "rewards/margins": 0.8306114077568054, + "rewards/rejected": -2.3829877376556396, + "step": 2670 + }, + { + "epoch": 0.35, + "learning_rate": 4.102589835560572e-06, + "logits/chosen": -0.8637989163398743, + "logits/rejected": 0.42278608679771423, + "logps/chosen": -448.33477783203125, + "logps/rejected": -472.2781677246094, + "loss": 0.5531, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3624870777130127, + "rewards/margins": 0.7671451568603516, + "rewards/rejected": -2.1296322345733643, + "step": 2680 + }, + { + "epoch": 0.35, + "learning_rate": 4.09380639052231e-06, + "logits/chosen": -0.7388796210289001, + "logits/rejected": -0.16724643111228943, + "logps/chosen": -423.61370849609375, + "logps/rejected": -544.0777587890625, + "loss": 0.4631, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.392254114151001, + "rewards/margins": 1.0100640058517456, + "rewards/rejected": -2.402318239212036, + "step": 2690 + }, + { + "epoch": 0.35, + "learning_rate": 4.084989674716679e-06, + "logits/chosen": -0.7967337369918823, + "logits/rejected": -0.30105361342430115, + "logps/chosen": -473.30743408203125, + "logps/rejected": -546.0299072265625, + "loss": 0.5216, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9343668222427368, + "rewards/margins": 0.7889801859855652, + "rewards/rejected": -2.7233471870422363, + "step": 2700 + }, + { + "epoch": 0.35, + "eval_logits/chosen": 1.1290334463119507, + "eval_logits/rejected": 1.7935971021652222, + "eval_logps/chosen": -467.43646240234375, + "eval_logps/rejected": -522.52783203125, + "eval_loss": 0.5356955528259277, + "eval_rewards/accuracies": 0.7260000109672546, + "eval_rewards/chosen": -1.9894644021987915, + "eval_rewards/margins": 0.7500025629997253, + "eval_rewards/rejected": -2.7394673824310303, + "eval_runtime": 2319.9728, + "eval_samples_per_second": 0.862, + "eval_steps_per_second": 0.431, + "step": 2700 + }, + { + "epoch": 0.35, + "learning_rate": 4.076139872192949e-06, + "logits/chosen": -0.7494713068008423, + "logits/rejected": 0.4128738045692444, + "logps/chosen": -523.4064331054688, + "logps/rejected": -553.7559204101562, + "loss": 0.5189, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.168044090270996, + "rewards/margins": 0.7787975668907166, + "rewards/rejected": -2.9468414783477783, + "step": 2710 + }, + { + "epoch": 0.36, + "learning_rate": 4.067257167691074e-06, + "logits/chosen": -0.3000775873661041, + "logits/rejected": -0.04052892327308655, + "logps/chosen": -495.1270446777344, + "logps/rejected": -570.2074584960938, + "loss": 0.5173, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9619982242584229, + "rewards/margins": 0.9272588491439819, + "rewards/rejected": -2.8892569541931152, + "step": 2720 + }, + { + "epoch": 0.36, + "learning_rate": 4.05834174663784e-06, + "logits/chosen": -0.5102322697639465, + "logits/rejected": -0.13270506262779236, + "logps/chosen": -456.03424072265625, + "logps/rejected": -465.28240966796875, + "loss": 0.6489, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.8180183172225952, + "rewards/margins": 0.48946094512939453, + "rewards/rejected": -2.307478904724121, + "step": 2730 + }, + { + "epoch": 0.36, + "learning_rate": 4.0493937951429895e-06, + "logits/chosen": -1.0210916996002197, + "logits/rejected": -0.37836208939552307, + "logps/chosen": -429.5328674316406, + "logps/rejected": -458.05877685546875, + "loss": 0.5128, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6570367813110352, + "rewards/margins": 0.6776058673858643, + "rewards/rejected": -2.3346428871154785, + "step": 2740 + }, + { + "epoch": 0.36, + "learning_rate": 4.040413499995343e-06, + "logits/chosen": -0.7698782086372375, + "logits/rejected": -0.08163869380950928, + "logps/chosen": -478.63848876953125, + "logps/rejected": -546.338623046875, + "loss": 0.513, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.83940851688385, + "rewards/margins": 0.8054366111755371, + "rewards/rejected": -2.6448452472686768, + "step": 2750 + }, + { + "epoch": 0.36, + "learning_rate": 4.031401048658892e-06, + "logits/chosen": -0.7496566772460938, + "logits/rejected": 0.024263203144073486, + "logps/chosen": -451.26336669921875, + "logps/rejected": -517.8736572265625, + "loss": 0.5583, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7768245935440063, + "rewards/margins": 0.8688470721244812, + "rewards/rejected": -2.6456716060638428, + "step": 2760 + }, + { + "epoch": 0.36, + "learning_rate": 4.022356629268894e-06, + "logits/chosen": -0.7827082276344299, + "logits/rejected": 0.21052603423595428, + "logps/chosen": -487.554443359375, + "logps/rejected": -506.66192626953125, + "loss": 0.6025, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0982871055603027, + "rewards/margins": 0.5698203444480896, + "rewards/rejected": -2.668107509613037, + "step": 2770 + }, + { + "epoch": 0.36, + "learning_rate": 4.013280430627936e-06, + "logits/chosen": -0.36198630928993225, + "logits/rejected": 0.06339599192142487, + "logps/chosen": -443.41961669921875, + "logps/rejected": -474.7898864746094, + "loss": 0.6083, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9905750751495361, + "rewards/margins": 0.5316168665885925, + "rewards/rejected": -2.5221920013427734, + "step": 2780 + }, + { + "epoch": 0.37, + "learning_rate": 4.004172642202002e-06, + "logits/chosen": -0.9610457420349121, + "logits/rejected": 0.30055952072143555, + "logps/chosen": -452.96343994140625, + "logps/rejected": -512.3440551757812, + "loss": 0.4909, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0993430614471436, + "rewards/margins": 0.8472961187362671, + "rewards/rejected": -2.9466395378112793, + "step": 2790 + }, + { + "epoch": 0.37, + "learning_rate": 3.995033454116512e-06, + "logits/chosen": -1.0286301374435425, + "logits/rejected": 0.016544628888368607, + "logps/chosen": -508.1040954589844, + "logps/rejected": -527.2821044921875, + "loss": 0.5865, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1054210662841797, + "rewards/margins": 0.5004376173019409, + "rewards/rejected": -2.605858564376831, + "step": 2800 + }, + { + "epoch": 0.37, + "eval_logits/chosen": 1.101927399635315, + "eval_logits/rejected": 1.7565044164657593, + "eval_logps/chosen": -478.56048583984375, + "eval_logps/rejected": -529.6148681640625, + "eval_loss": 0.5350669622421265, + "eval_rewards/accuracies": 0.7260000109672546, + "eval_rewards/chosen": -2.1007049083709717, + "eval_rewards/margins": 0.709632158279419, + "eval_rewards/rejected": -2.8103370666503906, + "eval_runtime": 2335.1815, + "eval_samples_per_second": 0.856, + "eval_steps_per_second": 0.428, + "step": 2800 + }, + { + "epoch": 0.37, + "learning_rate": 3.985863057152355e-06, + "logits/chosen": -0.4937034547328949, + "logits/rejected": -0.2508159577846527, + "logps/chosen": -504.6494140625, + "logps/rejected": -560.7249755859375, + "loss": 0.5349, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.059462785720825, + "rewards/margins": 0.8392888307571411, + "rewards/rejected": -2.898751735687256, + "step": 2810 + }, + { + "epoch": 0.37, + "learning_rate": 3.976661642741908e-06, + "logits/chosen": -0.27275025844573975, + "logits/rejected": -0.03731584548950195, + "logps/chosen": -484.62506103515625, + "logps/rejected": -580.8031005859375, + "loss": 0.466, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.229395627975464, + "rewards/margins": 0.9002262353897095, + "rewards/rejected": -3.129621982574463, + "step": 2820 + }, + { + "epoch": 0.37, + "learning_rate": 3.967429402965035e-06, + "logits/chosen": -0.3462119400501251, + "logits/rejected": 0.061771608889102936, + "logps/chosen": -553.1046142578125, + "logps/rejected": -620.2770385742188, + "loss": 0.5486, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.695598602294922, + "rewards/margins": 0.7296158671379089, + "rewards/rejected": -3.4252142906188965, + "step": 2830 + }, + { + "epoch": 0.37, + "learning_rate": 3.958166530545085e-06, + "logits/chosen": -0.5360496044158936, + "logits/rejected": -0.30475491285324097, + "logps/chosen": -516.7517700195312, + "logps/rejected": -609.4689331054688, + "loss": 0.4753, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5592031478881836, + "rewards/margins": 0.9313445091247559, + "rewards/rejected": -3.4905476570129395, + "step": 2840 + }, + { + "epoch": 0.37, + "learning_rate": 3.948873218844863e-06, + "logits/chosen": 0.14160022139549255, + "logits/rejected": -0.38983091711997986, + "logps/chosen": -468.97314453125, + "logps/rejected": -537.5845947265625, + "loss": 0.6692, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.569286584854126, + "rewards/margins": 0.41470590233802795, + "rewards/rejected": -2.983992338180542, + "step": 2850 + }, + { + "epoch": 0.37, + "learning_rate": 3.939549661862592e-06, + "logits/chosen": -0.5928257703781128, + "logits/rejected": 0.027815770357847214, + "logps/chosen": -490.8857421875, + "logps/rejected": -555.0617065429688, + "loss": 0.5306, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.194727659225464, + "rewards/margins": 0.8603199124336243, + "rewards/rejected": -3.0550477504730225, + "step": 2860 + }, + { + "epoch": 0.38, + "learning_rate": 3.930196054227871e-06, + "logits/chosen": -0.7506051063537598, + "logits/rejected": 0.29753103852272034, + "logps/chosen": -445.02667236328125, + "logps/rejected": -515.992431640625, + "loss": 0.5096, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.015972375869751, + "rewards/margins": 0.8588827848434448, + "rewards/rejected": -2.8748552799224854, + "step": 2870 + }, + { + "epoch": 0.38, + "learning_rate": 3.920812591197604e-06, + "logits/chosen": -1.051764726638794, + "logits/rejected": 0.04196419566869736, + "logps/chosen": -435.1283264160156, + "logps/rejected": -485.16290283203125, + "loss": 0.4647, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.730743646621704, + "rewards/margins": 0.8607555627822876, + "rewards/rejected": -2.591498851776123, + "step": 2880 + }, + { + "epoch": 0.38, + "learning_rate": 3.9113994686519305e-06, + "logits/chosen": -1.111640214920044, + "logits/rejected": 0.03947020322084427, + "logps/chosen": -432.510009765625, + "logps/rejected": -520.13134765625, + "loss": 0.4645, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6205533742904663, + "rewards/margins": 0.9038299322128296, + "rewards/rejected": -2.524383306503296, + "step": 2890 + }, + { + "epoch": 0.38, + "learning_rate": 3.90195688309013e-06, + "logits/chosen": -0.9940659403800964, + "logits/rejected": -0.16780364513397217, + "logps/chosen": -408.54425048828125, + "logps/rejected": -470.9107360839844, + "loss": 0.5252, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5781054496765137, + "rewards/margins": 0.9151102304458618, + "rewards/rejected": -2.493215560913086, + "step": 2900 + }, + { + "epoch": 0.38, + "eval_logits/chosen": 0.9107775688171387, + "eval_logits/rejected": 1.5685968399047852, + "eval_logps/chosen": -426.64959716796875, + "eval_logps/rejected": -492.73974609375, + "eval_loss": 0.537619948387146, + "eval_rewards/accuracies": 0.7204999923706055, + "eval_rewards/chosen": -1.581595540046692, + "eval_rewards/margins": 0.8599900603294373, + "eval_rewards/rejected": -2.4415855407714844, + "eval_runtime": 2334.4828, + "eval_samples_per_second": 0.857, + "eval_steps_per_second": 0.428, + "step": 2900 + }, + { + "epoch": 0.38, + "learning_rate": 3.892485031626527e-06, + "logits/chosen": -1.0271751880645752, + "logits/rejected": -0.31663069128990173, + "logps/chosen": -413.0658264160156, + "logps/rejected": -492.5518493652344, + "loss": 0.5214, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5365521907806396, + "rewards/margins": 0.9228584170341492, + "rewards/rejected": -2.4594106674194336, + "step": 2910 + }, + { + "epoch": 0.38, + "learning_rate": 3.882984111986371e-06, + "logits/chosen": -0.7470510005950928, + "logits/rejected": 0.05860505253076553, + "logps/chosen": -435.0110778808594, + "logps/rejected": -458.22137451171875, + "loss": 0.5542, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5745357275009155, + "rewards/margins": 0.6277254223823547, + "rewards/rejected": -2.202260971069336, + "step": 2920 + }, + { + "epoch": 0.38, + "learning_rate": 3.873454322501711e-06, + "logits/chosen": -1.084609866142273, + "logits/rejected": -0.0922069326043129, + "logps/chosen": -392.8829650878906, + "logps/rejected": -459.1026306152344, + "loss": 0.5197, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1698970794677734, + "rewards/margins": 0.9355286359786987, + "rewards/rejected": -2.1054255962371826, + "step": 2930 + }, + { + "epoch": 0.38, + "learning_rate": 3.863895862107255e-06, + "logits/chosen": -1.0651941299438477, + "logits/rejected": -0.298669695854187, + "logps/chosen": -362.4504089355469, + "logps/rejected": -488.83001708984375, + "loss": 0.4332, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0489132404327393, + "rewards/margins": 1.0638964176177979, + "rewards/rejected": -2.112809658050537, + "step": 2940 + }, + { + "epoch": 0.39, + "learning_rate": 3.854308930336216e-06, + "logits/chosen": -1.077282190322876, + "logits/rejected": 0.2055933028459549, + "logps/chosen": -443.42486572265625, + "logps/rejected": -475.886474609375, + "loss": 0.555, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3301843404769897, + "rewards/margins": 0.7858977317810059, + "rewards/rejected": -2.116081953048706, + "step": 2950 + }, + { + "epoch": 0.39, + "learning_rate": 3.844693727316151e-06, + "logits/chosen": -1.1035977602005005, + "logits/rejected": -0.3125559389591217, + "logps/chosen": -408.77783203125, + "logps/rejected": -454.4917907714844, + "loss": 0.4932, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2791414260864258, + "rewards/margins": 0.9063733220100403, + "rewards/rejected": -2.185514450073242, + "step": 2960 + }, + { + "epoch": 0.39, + "learning_rate": 3.835050453764779e-06, + "logits/chosen": -0.4838363230228424, + "logits/rejected": -0.24740548431873322, + "logps/chosen": -358.820556640625, + "logps/rejected": -441.27001953125, + "loss": 0.4782, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1172313690185547, + "rewards/margins": 1.0152322053909302, + "rewards/rejected": -2.1324634552001953, + "step": 2970 + }, + { + "epoch": 0.39, + "learning_rate": 3.825379310985792e-06, + "logits/chosen": -0.7724876999855042, + "logits/rejected": -0.538825273513794, + "logps/chosen": -370.72052001953125, + "logps/rejected": -447.32745361328125, + "loss": 0.5206, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1402572393417358, + "rewards/margins": 0.8123918771743774, + "rewards/rejected": -1.9526491165161133, + "step": 2980 + }, + { + "epoch": 0.39, + "learning_rate": 3.815680500864651e-06, + "logits/chosen": -0.9769965410232544, + "logits/rejected": -0.2623330056667328, + "logps/chosen": -419.5028381347656, + "logps/rejected": -455.3551330566406, + "loss": 0.5107, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1941817998886108, + "rewards/margins": 0.7795951962471008, + "rewards/rejected": -1.9737770557403564, + "step": 2990 + }, + { + "epoch": 0.39, + "learning_rate": 3.80595422586438e-06, + "logits/chosen": -1.0406408309936523, + "logits/rejected": -0.20213142037391663, + "logps/chosen": -462.68487548828125, + "logps/rejected": -471.08123779296875, + "loss": 0.5381, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3869965076446533, + "rewards/margins": 0.8180079460144043, + "rewards/rejected": -2.2050046920776367, + "step": 3000 + }, + { + "epoch": 0.39, + "eval_logits/chosen": 1.0232552289962769, + "eval_logits/rejected": 1.7206393480300903, + "eval_logps/chosen": -422.64849853515625, + "eval_logps/rejected": -485.7741394042969, + "eval_loss": 0.5305867791175842, + "eval_rewards/accuracies": 0.7229999899864197, + "eval_rewards/chosen": -1.5415852069854736, + "eval_rewards/margins": 0.8303446769714355, + "eval_rewards/rejected": -2.37192964553833, + "eval_runtime": 2335.0533, + "eval_samples_per_second": 0.857, + "eval_steps_per_second": 0.428, + "step": 3000 + }, + { + "epoch": 0.39, + "learning_rate": 3.7962006890213266e-06, + "logits/chosen": -0.1788121610879898, + "logits/rejected": 0.016005922108888626, + "logps/chosen": -406.82928466796875, + "logps/rejected": -460.6402282714844, + "loss": 0.606, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7624022960662842, + "rewards/margins": 0.5616916418075562, + "rewards/rejected": -2.324093818664551, + "step": 3010 + }, + { + "epoch": 0.4, + "learning_rate": 3.7864200939409336e-06, + "logits/chosen": -1.0381033420562744, + "logits/rejected": 0.2535991072654724, + "logps/chosen": -414.68536376953125, + "logps/rejected": -450.0343322753906, + "loss": 0.5872, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4555238485336304, + "rewards/margins": 0.6130931973457336, + "rewards/rejected": -2.068617105484009, + "step": 3020 + }, + { + "epoch": 0.4, + "learning_rate": 3.7766126447934857e-06, + "logits/chosen": -1.1287479400634766, + "logits/rejected": -0.4950384199619293, + "logps/chosen": -377.2622985839844, + "logps/rejected": -427.8341369628906, + "loss": 0.5285, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.304473638534546, + "rewards/margins": 0.7007145881652832, + "rewards/rejected": -2.005188465118408, + "step": 3030 + }, + { + "epoch": 0.4, + "learning_rate": 3.766778546309847e-06, + "logits/chosen": -0.7997163534164429, + "logits/rejected": 0.35035890340805054, + "logps/chosen": -444.89190673828125, + "logps/rejected": -414.5162048339844, + "loss": 0.6038, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4366438388824463, + "rewards/margins": 0.5985267758369446, + "rewards/rejected": -2.035170793533325, + "step": 3040 + }, + { + "epoch": 0.4, + "learning_rate": 3.7569180037771868e-06, + "logits/chosen": -0.36123308539390564, + "logits/rejected": -0.1611182987689972, + "logps/chosen": -421.418701171875, + "logps/rejected": -478.39300537109375, + "loss": 0.596, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.594327688217163, + "rewards/margins": 0.6372068524360657, + "rewards/rejected": -2.231534481048584, + "step": 3050 + }, + { + "epoch": 0.4, + "learning_rate": 3.7470312230346955e-06, + "logits/chosen": -0.7029015421867371, + "logits/rejected": 0.3699846863746643, + "logps/chosen": -454.07025146484375, + "logps/rejected": -483.5018615722656, + "loss": 0.4709, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4222877025604248, + "rewards/margins": 0.9277554750442505, + "rewards/rejected": -2.3500430583953857, + "step": 3060 + }, + { + "epoch": 0.4, + "learning_rate": 3.7371184104692857e-06, + "logits/chosen": -1.1917140483856201, + "logits/rejected": -0.15570615231990814, + "logps/chosen": -475.3992614746094, + "logps/rejected": -481.27301025390625, + "loss": 0.5378, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.428065299987793, + "rewards/margins": 0.7733792066574097, + "rewards/rejected": -2.201444387435913, + "step": 3070 + }, + { + "epoch": 0.4, + "learning_rate": 3.727179773011289e-06, + "logits/chosen": -0.5196498036384583, + "logits/rejected": -0.2388072907924652, + "logps/chosen": -431.4600524902344, + "logps/rejected": -482.206787109375, + "loss": 0.5579, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5728769302368164, + "rewards/margins": 0.6688525080680847, + "rewards/rejected": -2.241729259490967, + "step": 3080 + }, + { + "epoch": 0.4, + "learning_rate": 3.717215518130127e-06, + "logits/chosen": -0.6362151503562927, + "logits/rejected": 0.041982658207416534, + "logps/chosen": -410.38189697265625, + "logps/rejected": -457.52093505859375, + "loss": 0.5947, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5848448276519775, + "rewards/margins": 0.5786422491073608, + "rewards/rejected": -2.163486957550049, + "step": 3090 + }, + { + "epoch": 0.41, + "learning_rate": 3.7072258538299923e-06, + "logits/chosen": -1.2260688543319702, + "logits/rejected": 0.28254860639572144, + "logps/chosen": -483.26568603515625, + "logps/rejected": -465.8003845214844, + "loss": 0.4587, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3665094375610352, + "rewards/margins": 0.8110405206680298, + "rewards/rejected": -2.1775500774383545, + "step": 3100 + }, + { + "epoch": 0.41, + "eval_logits/chosen": 1.1220719814300537, + "eval_logits/rejected": 1.844536542892456, + "eval_logps/chosen": -413.6004943847656, + "eval_logps/rejected": -467.0777587890625, + "eval_loss": 0.522217869758606, + "eval_rewards/accuracies": 0.7260000109672546, + "eval_rewards/chosen": -1.4511048793792725, + "eval_rewards/margins": 0.7338610291481018, + "eval_rewards/rejected": -2.1849660873413086, + "eval_runtime": 2334.682, + "eval_samples_per_second": 0.857, + "eval_steps_per_second": 0.428, + "step": 3100 + }, + { + "epoch": 0.41, + "learning_rate": 3.6972109886454933e-06, + "logits/chosen": -0.25699383020401, + "logits/rejected": -0.20727062225341797, + "logps/chosen": -435.0664978027344, + "logps/rejected": -490.909912109375, + "loss": 0.4972, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.762265920639038, + "rewards/margins": 0.8372318148612976, + "rewards/rejected": -2.5994980335235596, + "step": 3110 + }, + { + "epoch": 0.41, + "learning_rate": 3.687171131637314e-06, + "logits/chosen": -0.7965458035469055, + "logits/rejected": 0.2634788155555725, + "logps/chosen": -453.2422790527344, + "logps/rejected": -499.4004821777344, + "loss": 0.5227, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7745777368545532, + "rewards/margins": 0.7375408411026001, + "rewards/rejected": -2.5121185779571533, + "step": 3120 + }, + { + "epoch": 0.41, + "learning_rate": 3.677106492387839e-06, + "logits/chosen": -0.7059360146522522, + "logits/rejected": 0.452791303396225, + "logps/chosen": -459.8872985839844, + "logps/rejected": -473.69781494140625, + "loss": 0.5509, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7924385070800781, + "rewards/margins": 0.7473533153533936, + "rewards/rejected": -2.5397915840148926, + "step": 3130 + }, + { + "epoch": 0.41, + "learning_rate": 3.6670172809967865e-06, + "logits/chosen": -0.48607778549194336, + "logits/rejected": 0.2857428193092346, + "logps/chosen": -408.91790771484375, + "logps/rejected": -458.8837890625, + "loss": 0.5115, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9582751989364624, + "rewards/margins": 0.6871889233589172, + "rewards/rejected": -2.6454639434814453, + "step": 3140 + }, + { + "epoch": 0.41, + "learning_rate": 3.6569037080768153e-06, + "logits/chosen": -0.8563801646232605, + "logits/rejected": -0.1331445276737213, + "logps/chosen": -419.33544921875, + "logps/rejected": -523.2894287109375, + "loss": 0.5028, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7469348907470703, + "rewards/margins": 0.928310215473175, + "rewards/rejected": -2.6752450466156006, + "step": 3150 + }, + { + "epoch": 0.41, + "learning_rate": 3.646765984749137e-06, + "logits/chosen": -0.42074188590049744, + "logits/rejected": -0.15325963497161865, + "logps/chosen": -437.05352783203125, + "logps/rejected": -532.8778076171875, + "loss": 0.4926, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6952593326568604, + "rewards/margins": 0.9555587768554688, + "rewards/rejected": -2.650818109512329, + "step": 3160 + }, + { + "epoch": 0.41, + "learning_rate": 3.6366043226391e-06, + "logits/chosen": -0.8013358116149902, + "logits/rejected": 0.14026977121829987, + "logps/chosen": -455.299560546875, + "logps/rejected": -499.14892578125, + "loss": 0.505, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.768510103225708, + "rewards/margins": 0.9221822023391724, + "rewards/rejected": -2.690692186355591, + "step": 3170 + }, + { + "epoch": 0.42, + "learning_rate": 3.6264189338717766e-06, + "logits/chosen": -1.1270676851272583, + "logits/rejected": 0.0055741192772984505, + "logps/chosen": -444.71502685546875, + "logps/rejected": -478.0035705566406, + "loss": 0.5603, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6736608743667603, + "rewards/margins": 0.6656028032302856, + "rewards/rejected": -2.339263439178467, + "step": 3180 + }, + { + "epoch": 0.42, + "learning_rate": 3.6162100310675334e-06, + "logits/chosen": -0.5859667062759399, + "logits/rejected": -0.5138736963272095, + "logps/chosen": -400.99151611328125, + "logps/rejected": -445.34747314453125, + "loss": 0.65, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3842413425445557, + "rewards/margins": 0.5574811697006226, + "rewards/rejected": -1.9417225122451782, + "step": 3190 + }, + { + "epoch": 0.42, + "learning_rate": 3.605977827337596e-06, + "logits/chosen": -0.5511821508407593, + "logits/rejected": -0.18979701399803162, + "logps/chosen": -392.2962341308594, + "logps/rejected": -460.0108947753906, + "loss": 0.5173, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.33115553855896, + "rewards/margins": 0.8485208749771118, + "rewards/rejected": -2.1796765327453613, + "step": 3200 + }, + { + "epoch": 0.42, + "eval_logits/chosen": 0.8981449604034424, + "eval_logits/rejected": 1.6186491250991821, + "eval_logps/chosen": -403.9989318847656, + "eval_logps/rejected": -462.40948486328125, + "eval_loss": 0.5276510715484619, + "eval_rewards/accuracies": 0.7260000109672546, + "eval_rewards/chosen": -1.3550893068313599, + "eval_rewards/margins": 0.7831941843032837, + "eval_rewards/rejected": -2.1382837295532227, + "eval_runtime": 2082.617, + "eval_samples_per_second": 0.96, + "eval_steps_per_second": 0.48, + "step": 3200 + }, + { + "epoch": 0.42, + "learning_rate": 3.595722536279595e-06, + "logits/chosen": -1.2897363901138306, + "logits/rejected": 0.7435730695724487, + "logps/chosen": -463.46563720703125, + "logps/rejected": -484.604736328125, + "loss": 0.4551, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.345106840133667, + "rewards/margins": 0.9554675221443176, + "rewards/rejected": -2.300574541091919, + "step": 3210 + }, + { + "epoch": 0.42, + "learning_rate": 3.58544437197311e-06, + "logits/chosen": -0.5996850728988647, + "logits/rejected": 0.24509985744953156, + "logps/chosen": -428.47125244140625, + "logps/rejected": -500.66168212890625, + "loss": 0.5034, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.491923213005066, + "rewards/margins": 0.9786638021469116, + "rewards/rejected": -2.4705870151519775, + "step": 3220 + }, + { + "epoch": 0.42, + "learning_rate": 3.5751435489752025e-06, + "logits/chosen": -0.4525715410709381, + "logits/rejected": 0.17086896300315857, + "logps/chosen": -396.47723388671875, + "logps/rejected": -455.05706787109375, + "loss": 0.4999, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4165663719177246, + "rewards/margins": 0.9099749326705933, + "rewards/rejected": -2.3265414237976074, + "step": 3230 + }, + { + "epoch": 0.42, + "learning_rate": 3.5648202823159317e-06, + "logits/chosen": -0.16280296444892883, + "logits/rejected": 0.15849009156227112, + "logps/chosen": -411.54742431640625, + "logps/rejected": -548.3161010742188, + "loss": 0.4644, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7535407543182373, + "rewards/margins": 1.087467908859253, + "rewards/rejected": -2.8410089015960693, + "step": 3240 + }, + { + "epoch": 0.43, + "learning_rate": 3.554474787493873e-06, + "logits/chosen": 0.05241694301366806, + "logits/rejected": 0.6363898515701294, + "logps/chosen": -502.8150329589844, + "logps/rejected": -597.0306396484375, + "loss": 0.4855, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0513508319854736, + "rewards/margins": 1.1114223003387451, + "rewards/rejected": -3.1627731323242188, + "step": 3250 + }, + { + "epoch": 0.43, + "learning_rate": 3.5441072804716125e-06, + "logits/chosen": 0.01650853082537651, + "logits/rejected": 0.25886648893356323, + "logps/chosen": -555.9203491210938, + "logps/rejected": -650.7965087890625, + "loss": 0.5809, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.538379192352295, + "rewards/margins": 0.8736263513565063, + "rewards/rejected": -3.412005662918091, + "step": 3260 + }, + { + "epoch": 0.43, + "learning_rate": 3.5337179776712427e-06, + "logits/chosen": -0.008978593163192272, + "logits/rejected": 0.6548370718955994, + "logps/chosen": -530.6326904296875, + "logps/rejected": -644.48876953125, + "loss": 0.5841, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7416481971740723, + "rewards/margins": 1.1851587295532227, + "rewards/rejected": -3.926807403564453, + "step": 3270 + }, + { + "epoch": 0.43, + "learning_rate": 3.5233070959698445e-06, + "logits/chosen": -0.5847116708755493, + "logits/rejected": 0.3420669138431549, + "logps/chosen": -555.023681640625, + "logps/rejected": -576.3981323242188, + "loss": 0.6021, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.538581371307373, + "rewards/margins": 0.596244752407074, + "rewards/rejected": -3.1348259449005127, + "step": 3280 + }, + { + "epoch": 0.43, + "learning_rate": 3.512874852694959e-06, + "logits/chosen": -0.6708934903144836, + "logits/rejected": 0.5892789959907532, + "logps/chosen": -480.2303161621094, + "logps/rejected": -554.13671875, + "loss": 0.4727, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.084829807281494, + "rewards/margins": 0.9657213091850281, + "rewards/rejected": -3.050551414489746, + "step": 3290 + }, + { + "epoch": 0.43, + "learning_rate": 3.5024214656200497e-06, + "logits/chosen": -0.9602855443954468, + "logits/rejected": 0.5978536009788513, + "logps/chosen": -469.49737548828125, + "logps/rejected": -488.523681640625, + "loss": 0.5851, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8592431545257568, + "rewards/margins": 0.7517732381820679, + "rewards/rejected": -2.6110165119171143, + "step": 3300 + }, + { + "epoch": 0.43, + "eval_logits/chosen": 1.2859539985656738, + "eval_logits/rejected": 2.034395933151245, + "eval_logps/chosen": -437.1257629394531, + "eval_logps/rejected": -498.6931457519531, + "eval_loss": 0.5180677771568298, + "eval_rewards/accuracies": 0.7325000166893005, + "eval_rewards/chosen": -1.6863574981689453, + "eval_rewards/margins": 0.8147625923156738, + "eval_rewards/rejected": -2.50111985206604, + "eval_runtime": 1144.5777, + "eval_samples_per_second": 1.747, + "eval_steps_per_second": 0.874, + "step": 3300 + }, + { + "epoch": 0.43, + "learning_rate": 3.491947152959958e-06, + "logits/chosen": -0.5393772125244141, + "logits/rejected": 0.08978531509637833, + "logps/chosen": -471.2378845214844, + "logps/rejected": -527.0679321289062, + "loss": 0.5259, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7623119354248047, + "rewards/margins": 0.7789031863212585, + "rewards/rejected": -2.541214942932129, + "step": 3310 + }, + { + "epoch": 0.43, + "learning_rate": 3.4814521333663497e-06, + "logits/chosen": -0.8797744512557983, + "logits/rejected": 0.28487175703048706, + "logps/chosen": -507.973388671875, + "logps/rejected": -497.87677001953125, + "loss": 0.5358, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7715896368026733, + "rewards/margins": 0.7174810767173767, + "rewards/rejected": -2.489070415496826, + "step": 3320 + }, + { + "epoch": 0.44, + "learning_rate": 3.4709366259231468e-06, + "logits/chosen": -0.5718088150024414, + "logits/rejected": 0.6245108842849731, + "logps/chosen": -451.671142578125, + "logps/rejected": -492.45111083984375, + "loss": 0.5618, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.632046103477478, + "rewards/margins": 0.802625834941864, + "rewards/rejected": -2.4346723556518555, + "step": 3330 + }, + { + "epoch": 0.44, + "learning_rate": 3.460400850141956e-06, + "logits/chosen": -0.8436506390571594, + "logits/rejected": 0.6500002145767212, + "logps/chosen": -418.16571044921875, + "logps/rejected": -476.41375732421875, + "loss": 0.5184, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8505115509033203, + "rewards/margins": 0.7895227670669556, + "rewards/rejected": -2.6400341987609863, + "step": 3340 + }, + { + "epoch": 0.44, + "learning_rate": 3.4498450259574858e-06, + "logits/chosen": -0.3697187304496765, + "logits/rejected": 0.12063203006982803, + "logps/chosen": -465.37066650390625, + "logps/rejected": -506.58319091796875, + "loss": 0.5763, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.957269310951233, + "rewards/margins": 0.574730634689331, + "rewards/rejected": -2.5320000648498535, + "step": 3350 + }, + { + "epoch": 0.44, + "learning_rate": 3.439269373722957e-06, + "logits/chosen": -0.4114798605442047, + "logits/rejected": 0.28758490085601807, + "logps/chosen": -450.16741943359375, + "logps/rejected": -514.9403076171875, + "loss": 0.542, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.925722360610962, + "rewards/margins": 0.8905431628227234, + "rewards/rejected": -2.81626558303833, + "step": 3360 + }, + { + "epoch": 0.44, + "learning_rate": 3.4286741142055014e-06, + "logits/chosen": -0.9048460721969604, + "logits/rejected": -0.23542626202106476, + "logps/chosen": -464.1852111816406, + "logps/rejected": -529.8837890625, + "loss": 0.4867, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.771493673324585, + "rewards/margins": 0.8255535364151001, + "rewards/rejected": -2.5970473289489746, + "step": 3370 + }, + { + "epoch": 0.44, + "learning_rate": 3.4180594685815536e-06, + "logits/chosen": -0.6555899381637573, + "logits/rejected": 0.23162047564983368, + "logps/chosen": -387.56414794921875, + "logps/rejected": -469.15020751953125, + "loss": 0.5231, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6515061855316162, + "rewards/margins": 0.8171674609184265, + "rewards/rejected": -2.4686737060546875, + "step": 3380 + }, + { + "epoch": 0.44, + "learning_rate": 3.4074256584322336e-06, + "logits/chosen": -0.6201430559158325, + "logits/rejected": 0.13968434929847717, + "logps/chosen": -395.79229736328125, + "logps/rejected": -470.4158630371094, + "loss": 0.521, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4998222589492798, + "rewards/margins": 0.9599917531013489, + "rewards/rejected": -2.4598140716552734, + "step": 3390 + }, + { + "epoch": 0.44, + "learning_rate": 3.3967729057387213e-06, + "logits/chosen": -0.7700475454330444, + "logits/rejected": 0.4305594563484192, + "logps/chosen": -439.98504638671875, + "logps/rejected": -465.68408203125, + "loss": 0.5811, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.470069169998169, + "rewards/margins": 0.6720464825630188, + "rewards/rejected": -2.142115831375122, + "step": 3400 + }, + { + "epoch": 0.44, + "eval_logits/chosen": 1.0161676406860352, + "eval_logits/rejected": 1.7237727642059326, + "eval_logps/chosen": -428.5589904785156, + "eval_logps/rejected": -492.4408264160156, + "eval_loss": 0.5165792107582092, + "eval_rewards/accuracies": 0.7335000038146973, + "eval_rewards/chosen": -1.6006896495819092, + "eval_rewards/margins": 0.8379069566726685, + "eval_rewards/rejected": -2.4385969638824463, + "eval_runtime": 1168.883, + "eval_samples_per_second": 1.711, + "eval_steps_per_second": 0.856, + "step": 3400 + }, + { + "epoch": 0.45, + "learning_rate": 3.386101432877624e-06, + "logits/chosen": -0.7892847657203674, + "logits/rejected": -0.024471605196595192, + "logps/chosen": -428.40667724609375, + "logps/rejected": -461.5455627441406, + "loss": 0.5302, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6095516681671143, + "rewards/margins": 0.7331417798995972, + "rewards/rejected": -2.342693328857422, + "step": 3410 + }, + { + "epoch": 0.45, + "learning_rate": 3.375411462616332e-06, + "logits/chosen": -1.0004537105560303, + "logits/rejected": 0.13648569583892822, + "logps/chosen": -466.4956970214844, + "logps/rejected": -552.7140502929688, + "loss": 0.5037, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7703803777694702, + "rewards/margins": 0.8303622007369995, + "rewards/rejected": -2.600742816925049, + "step": 3420 + }, + { + "epoch": 0.45, + "learning_rate": 3.3647032181083696e-06, + "logits/chosen": -0.7642697095870972, + "logits/rejected": -0.002010262105613947, + "logps/chosen": -486.2002868652344, + "logps/rejected": -559.04736328125, + "loss": 0.4941, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.827528715133667, + "rewards/margins": 0.9090808629989624, + "rewards/rejected": -2.73660945892334, + "step": 3430 + }, + { + "epoch": 0.45, + "learning_rate": 3.3539769228887382e-06, + "logits/chosen": -0.9899004697799683, + "logits/rejected": 0.285744845867157, + "logps/chosen": -455.26788330078125, + "logps/rejected": -538.8175048828125, + "loss": 0.4733, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.473595142364502, + "rewards/margins": 0.9281530380249023, + "rewards/rejected": -2.4017484188079834, + "step": 3440 + }, + { + "epoch": 0.45, + "learning_rate": 3.343232800869247e-06, + "logits/chosen": -0.8719793558120728, + "logits/rejected": 0.3464857041835785, + "logps/chosen": -392.4674072265625, + "logps/rejected": -419.7901306152344, + "loss": 0.518, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.578559160232544, + "rewards/margins": 0.7874252796173096, + "rewards/rejected": -2.3659844398498535, + "step": 3450 + }, + { + "epoch": 0.45, + "learning_rate": 3.33247107633384e-06, + "logits/chosen": -0.6414632201194763, + "logits/rejected": -0.19829490780830383, + "logps/chosen": -397.5970764160156, + "logps/rejected": -498.804931640625, + "loss": 0.4297, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.311751365661621, + "rewards/margins": 1.1057935953140259, + "rewards/rejected": -2.4175448417663574, + "step": 3460 + }, + { + "epoch": 0.45, + "learning_rate": 3.3216919739339155e-06, + "logits/chosen": -0.8113977313041687, + "logits/rejected": 0.24353833496570587, + "logps/chosen": -451.53900146484375, + "logps/rejected": -521.3798217773438, + "loss": 0.4028, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5756160020828247, + "rewards/margins": 1.230313777923584, + "rewards/rejected": -2.805929660797119, + "step": 3470 + }, + { + "epoch": 0.46, + "learning_rate": 3.310895718683635e-06, + "logits/chosen": -0.5552123188972473, + "logits/rejected": 0.25212961435317993, + "logps/chosen": -468.75030517578125, + "logps/rejected": -510.51214599609375, + "loss": 0.6551, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7728900909423828, + "rewards/margins": 0.6345969438552856, + "rewards/rejected": -2.4074866771698, + "step": 3480 + }, + { + "epoch": 0.46, + "learning_rate": 3.3000825359552256e-06, + "logits/chosen": -0.29439646005630493, + "logits/rejected": 0.14724037051200867, + "logps/chosen": -405.78594970703125, + "logps/rejected": -501.95465087890625, + "loss": 0.497, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3883360624313354, + "rewards/margins": 0.9358784556388855, + "rewards/rejected": -2.324214458465576, + "step": 3490 + }, + { + "epoch": 0.46, + "learning_rate": 3.2892526514742778e-06, + "logits/chosen": -0.5424224734306335, + "logits/rejected": 0.3168255388736725, + "logps/chosen": -418.60272216796875, + "logps/rejected": -471.0479431152344, + "loss": 0.4892, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4777576923370361, + "rewards/margins": 0.914044201374054, + "rewards/rejected": -2.391801595687866, + "step": 3500 + }, + { + "epoch": 0.46, + "eval_logits/chosen": 1.3014029264450073, + "eval_logits/rejected": 2.0708816051483154, + "eval_logps/chosen": -415.6103820800781, + "eval_logps/rejected": -480.9519348144531, + "eval_loss": 0.5256860256195068, + "eval_rewards/accuracies": 0.7279999852180481, + "eval_rewards/chosen": -1.471203327178955, + "eval_rewards/margins": 0.8525046110153198, + "eval_rewards/rejected": -2.3237080574035645, + "eval_runtime": 1172.9577, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 3500 + }, + { + "epoch": 0.46, + "learning_rate": 3.27840629131503e-06, + "logits/chosen": -0.525278627872467, + "logits/rejected": 0.5383912324905396, + "logps/chosen": -441.439208984375, + "logps/rejected": -500.04010009765625, + "loss": 0.5312, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6438672542572021, + "rewards/margins": 0.845868706703186, + "rewards/rejected": -2.4897360801696777, + "step": 3510 + }, + { + "epoch": 0.46, + "learning_rate": 3.2675436818956522e-06, + "logits/chosen": -0.6424092054367065, + "logits/rejected": 0.2780497372150421, + "logps/chosen": -387.6494445800781, + "logps/rejected": -470.696533203125, + "loss": 0.5343, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4292685985565186, + "rewards/margins": 0.742842972278595, + "rewards/rejected": -2.1721115112304688, + "step": 3520 + }, + { + "epoch": 0.46, + "learning_rate": 3.2566650499735185e-06, + "logits/chosen": -0.35371822118759155, + "logits/rejected": 0.6347888112068176, + "logps/chosen": -437.46600341796875, + "logps/rejected": -535.9605712890625, + "loss": 0.4477, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4554798603057861, + "rewards/margins": 1.2568625211715698, + "rewards/rejected": -2.7123425006866455, + "step": 3530 + }, + { + "epoch": 0.46, + "learning_rate": 3.2457706226404715e-06, + "logits/chosen": -0.41827550530433655, + "logits/rejected": -0.005315917544066906, + "logps/chosen": -431.5044860839844, + "logps/rejected": -449.8809509277344, + "loss": 0.6296, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6185357570648193, + "rewards/margins": 0.5960680246353149, + "rewards/rejected": -2.214603900909424, + "step": 3540 + }, + { + "epoch": 0.46, + "learning_rate": 3.2348606273180847e-06, + "logits/chosen": -1.0620290040969849, + "logits/rejected": 0.8680380582809448, + "logps/chosen": -446.9424743652344, + "logps/rejected": -452.28485107421875, + "loss": 0.4631, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3773703575134277, + "rewards/margins": 0.9029923677444458, + "rewards/rejected": -2.280362844467163, + "step": 3550 + }, + { + "epoch": 0.47, + "learning_rate": 3.2239352917529165e-06, + "logits/chosen": -0.7291229367256165, + "logits/rejected": 0.35278087854385376, + "logps/chosen": -489.33551025390625, + "logps/rejected": -550.3321533203125, + "loss": 0.519, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8307793140411377, + "rewards/margins": 0.7998561859130859, + "rewards/rejected": -2.6306357383728027, + "step": 3560 + }, + { + "epoch": 0.47, + "learning_rate": 3.2129948440117487e-06, + "logits/chosen": -0.4160127639770508, + "logits/rejected": 0.06812303513288498, + "logps/chosen": -480.4317932128906, + "logps/rejected": -528.1541748046875, + "loss": 0.5639, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1663801670074463, + "rewards/margins": 0.6297258734703064, + "rewards/rejected": -2.7961058616638184, + "step": 3570 + }, + { + "epoch": 0.47, + "learning_rate": 3.202039512476833e-06, + "logits/chosen": -0.47289180755615234, + "logits/rejected": 0.3967745304107666, + "logps/chosen": -413.58563232421875, + "logps/rejected": -521.4444580078125, + "loss": 0.4684, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8022072315216064, + "rewards/margins": 1.0440325736999512, + "rewards/rejected": -2.8462398052215576, + "step": 3580 + }, + { + "epoch": 0.47, + "learning_rate": 3.1910695258411216e-06, + "logits/chosen": -0.7860490679740906, + "logits/rejected": 0.8082360029220581, + "logps/chosen": -440.5712890625, + "logps/rejected": -458.45526123046875, + "loss": 0.5925, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7098718881607056, + "rewards/margins": 0.7629823684692383, + "rewards/rejected": -2.4728541374206543, + "step": 3590 + }, + { + "epoch": 0.47, + "learning_rate": 3.1800851131034904e-06, + "logits/chosen": -0.5506579279899597, + "logits/rejected": 0.5083298087120056, + "logps/chosen": -437.36016845703125, + "logps/rejected": -497.79742431640625, + "loss": 0.5438, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7282190322875977, + "rewards/margins": 0.9188164472579956, + "rewards/rejected": -2.647035598754883, + "step": 3600 + }, + { + "epoch": 0.47, + "eval_logits/chosen": 1.4150185585021973, + "eval_logits/rejected": 2.2019591331481934, + "eval_logps/chosen": -428.1592102050781, + "eval_logps/rejected": -493.0663757324219, + "eval_loss": 0.5251966714859009, + "eval_rewards/accuracies": 0.7275000214576721, + "eval_rewards/chosen": -1.5966919660568237, + "eval_rewards/margins": 0.8481603860855103, + "eval_rewards/rejected": -2.444852113723755, + "eval_runtime": 1146.1005, + "eval_samples_per_second": 1.745, + "eval_steps_per_second": 0.873, + "step": 3600 + }, + { + "epoch": 0.47, + "learning_rate": 3.169086503563962e-06, + "logits/chosen": -0.6776447296142578, + "logits/rejected": -0.23693008720874786, + "logps/chosen": -416.4723205566406, + "logps/rejected": -498.85479736328125, + "loss": 0.6094, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5892466306686401, + "rewards/margins": 0.6511684656143188, + "rewards/rejected": -2.240415096282959, + "step": 3610 + }, + { + "epoch": 0.47, + "learning_rate": 3.1580739268189165e-06, + "logits/chosen": -0.5585545897483826, + "logits/rejected": 0.5914020538330078, + "logps/chosen": -437.05572509765625, + "logps/rejected": -491.81671142578125, + "loss": 0.505, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.594426155090332, + "rewards/margins": 0.934490978717804, + "rewards/rejected": -2.528916835784912, + "step": 3620 + }, + { + "epoch": 0.48, + "learning_rate": 3.147047612756302e-06, + "logits/chosen": -0.24671559035778046, + "logits/rejected": 0.12607893347740173, + "logps/chosen": -449.9618225097656, + "logps/rejected": -528.7197265625, + "loss": 0.4698, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4847663640975952, + "rewards/margins": 0.9548713564872742, + "rewards/rejected": -2.4396376609802246, + "step": 3630 + }, + { + "epoch": 0.48, + "learning_rate": 3.136007791550833e-06, + "logits/chosen": -0.7207472324371338, + "logits/rejected": 0.6024399995803833, + "logps/chosen": -402.71319580078125, + "logps/rejected": -450.76373291015625, + "loss": 0.5079, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6094977855682373, + "rewards/margins": 0.8133002519607544, + "rewards/rejected": -2.4227981567382812, + "step": 3640 + }, + { + "epoch": 0.48, + "learning_rate": 3.1249546936591848e-06, + "logits/chosen": -0.587475597858429, + "logits/rejected": 0.040339358150959015, + "logps/chosen": -386.5211181640625, + "logps/rejected": -455.6336364746094, + "loss": 0.5345, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4895503520965576, + "rewards/margins": 0.7380887866020203, + "rewards/rejected": -2.2276394367218018, + "step": 3650 + }, + { + "epoch": 0.48, + "learning_rate": 3.1138885498151843e-06, + "logits/chosen": -0.3841520845890045, + "logits/rejected": 0.43626269698143005, + "logps/chosen": -448.64080810546875, + "logps/rejected": -513.5535888671875, + "loss": 0.4928, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7264331579208374, + "rewards/margins": 1.076446771621704, + "rewards/rejected": -2.802879810333252, + "step": 3660 + }, + { + "epoch": 0.48, + "learning_rate": 3.1028095910249937e-06, + "logits/chosen": -1.0335357189178467, + "logits/rejected": 0.6055603623390198, + "logps/chosen": -452.31829833984375, + "logps/rejected": -487.44921875, + "loss": 0.4924, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7537025213241577, + "rewards/margins": 0.8785789608955383, + "rewards/rejected": -2.632281541824341, + "step": 3670 + }, + { + "epoch": 0.48, + "learning_rate": 3.0917180485622895e-06, + "logits/chosen": -0.4549393653869629, + "logits/rejected": 1.1361083984375, + "logps/chosen": -451.30145263671875, + "logps/rejected": -507.1758728027344, + "loss": 0.4922, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7780840396881104, + "rewards/margins": 1.0487792491912842, + "rewards/rejected": -2.8268632888793945, + "step": 3680 + }, + { + "epoch": 0.48, + "learning_rate": 3.0806141539634294e-06, + "logits/chosen": -0.6923087239265442, + "logits/rejected": 0.8231356739997864, + "logps/chosen": -423.26531982421875, + "logps/rejected": -463.19921875, + "loss": 0.5082, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7002700567245483, + "rewards/margins": 0.9019443392753601, + "rewards/rejected": -2.6022145748138428, + "step": 3690 + }, + { + "epoch": 0.48, + "learning_rate": 3.069498139022624e-06, + "logits/chosen": -0.8322190046310425, + "logits/rejected": 0.6718935966491699, + "logps/chosen": -491.6170349121094, + "logps/rejected": -499.7591247558594, + "loss": 0.5677, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0136921405792236, + "rewards/margins": 0.6361916661262512, + "rewards/rejected": -2.64988374710083, + "step": 3700 + }, + { + "epoch": 0.48, + "eval_logits/chosen": 1.6843464374542236, + "eval_logits/rejected": 2.467794895172119, + "eval_logps/chosen": -465.7503662109375, + "eval_logps/rejected": -529.8629760742188, + "eval_loss": 0.5152121782302856, + "eval_rewards/accuracies": 0.7275000214576721, + "eval_rewards/chosen": -1.972603678703308, + "eval_rewards/margins": 0.8402146697044373, + "eval_rewards/rejected": -2.8128180503845215, + "eval_runtime": 1146.5714, + "eval_samples_per_second": 1.744, + "eval_steps_per_second": 0.872, + "step": 3700 + }, + { + "epoch": 0.49, + "learning_rate": 3.0583702357870964e-06, + "logits/chosen": -0.48846787214279175, + "logits/rejected": -0.0028875707648694515, + "logps/chosen": -522.038818359375, + "logps/rejected": -585.753662109375, + "loss": 0.6047, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.117889881134033, + "rewards/margins": 0.6571815609931946, + "rewards/rejected": -2.775071144104004, + "step": 3710 + }, + { + "epoch": 0.49, + "learning_rate": 3.0472306765522393e-06, + "logits/chosen": -0.9246646761894226, + "logits/rejected": 1.0386006832122803, + "logps/chosen": -434.03704833984375, + "logps/rejected": -522.7357177734375, + "loss": 0.4683, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8714182376861572, + "rewards/margins": 1.1093872785568237, + "rewards/rejected": -2.9808051586151123, + "step": 3720 + }, + { + "epoch": 0.49, + "learning_rate": 3.0360796938567628e-06, + "logits/chosen": -0.7488586902618408, + "logits/rejected": 0.5929322838783264, + "logps/chosen": -465.7920837402344, + "logps/rejected": -497.0887145996094, + "loss": 0.5484, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9402759075164795, + "rewards/margins": 0.7168923616409302, + "rewards/rejected": -2.65716814994812, + "step": 3730 + }, + { + "epoch": 0.49, + "learning_rate": 3.0249175204778435e-06, + "logits/chosen": -0.12813475728034973, + "logits/rejected": 0.1722133457660675, + "logps/chosen": -443.7284240722656, + "logps/rejected": -510.58660888671875, + "loss": 0.513, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8336127996444702, + "rewards/margins": 0.8592530488967896, + "rewards/rejected": -2.6928658485412598, + "step": 3740 + }, + { + "epoch": 0.49, + "learning_rate": 3.0137443894262634e-06, + "logits/chosen": -0.1264442503452301, + "logits/rejected": 1.0170029401779175, + "logps/chosen": -455.2422790527344, + "logps/rejected": -530.8844604492188, + "loss": 0.3586, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7252286672592163, + "rewards/margins": 1.3205913305282593, + "rewards/rejected": -3.0458197593688965, + "step": 3750 + }, + { + "epoch": 0.49, + "learning_rate": 3.0025605339415476e-06, + "logits/chosen": -0.2637806832790375, + "logits/rejected": 0.7203923463821411, + "logps/chosen": -443.43988037109375, + "logps/rejected": -518.1168823242188, + "loss": 0.4633, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7762037515640259, + "rewards/margins": 1.030763030052185, + "rewards/rejected": -2.8069663047790527, + "step": 3760 + }, + { + "epoch": 0.49, + "learning_rate": 2.9913661874870923e-06, + "logits/chosen": -0.07495728135108948, + "logits/rejected": 0.4468969702720642, + "logps/chosen": -466.39678955078125, + "logps/rejected": -520.4652709960938, + "loss": 0.5005, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0380637645721436, + "rewards/margins": 0.8805074691772461, + "rewards/rejected": -2.9185712337493896, + "step": 3770 + }, + { + "epoch": 0.49, + "learning_rate": 2.980161583745294e-06, + "logits/chosen": -0.31729915738105774, + "logits/rejected": 0.5375093221664429, + "logps/chosen": -503.6490783691406, + "logps/rejected": -559.5162963867188, + "loss": 0.4487, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9461323022842407, + "rewards/margins": 1.0691391229629517, + "rewards/rejected": -3.0152714252471924, + "step": 3780 + }, + { + "epoch": 0.5, + "learning_rate": 2.96894695661267e-06, + "logits/chosen": -0.8055630922317505, + "logits/rejected": 0.4190472662448883, + "logps/chosen": -503.9349670410156, + "logps/rejected": -536.4412841796875, + "loss": 0.5495, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.072908878326416, + "rewards/margins": 0.787026584148407, + "rewards/rejected": -2.8599355220794678, + "step": 3790 + }, + { + "epoch": 0.5, + "learning_rate": 2.9577225401949773e-06, + "logits/chosen": -0.2834416925907135, + "logits/rejected": -0.048535846173763275, + "logps/chosen": -445.82147216796875, + "logps/rejected": -526.7498779296875, + "loss": 0.5471, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1524107456207275, + "rewards/margins": 0.9361416101455688, + "rewards/rejected": -3.0885519981384277, + "step": 3800 + }, + { + "epoch": 0.5, + "eval_logits/chosen": 1.4351999759674072, + "eval_logits/rejected": 2.202155113220215, + "eval_logps/chosen": -475.79779052734375, + "eval_logps/rejected": -551.583251953125, + "eval_loss": 0.5240182876586914, + "eval_rewards/accuracies": 0.7254999876022339, + "eval_rewards/chosen": -2.07307767868042, + "eval_rewards/margins": 0.956943154335022, + "eval_rewards/rejected": -3.0300209522247314, + "eval_runtime": 1140.6996, + "eval_samples_per_second": 1.753, + "eval_steps_per_second": 0.877, + "step": 3800 + }, + { + "epoch": 0.5, + "learning_rate": 2.946488568802324e-06, + "logits/chosen": -0.331181138753891, + "logits/rejected": 0.5133270025253296, + "logps/chosen": -477.48321533203125, + "logps/rejected": -547.2113647460938, + "loss": 0.507, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1447014808654785, + "rewards/margins": 0.8755722045898438, + "rewards/rejected": -3.0202736854553223, + "step": 3810 + }, + { + "epoch": 0.5, + "learning_rate": 2.935245276944278e-06, + "logits/chosen": -0.2571033537387848, + "logits/rejected": 0.4771188199520111, + "logps/chosen": -497.06866455078125, + "logps/rejected": -538.19091796875, + "loss": 0.5672, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0118210315704346, + "rewards/margins": 0.7694553136825562, + "rewards/rejected": -2.781276226043701, + "step": 3820 + }, + { + "epoch": 0.5, + "learning_rate": 2.9239928993249723e-06, + "logits/chosen": -0.2940518260002136, + "logits/rejected": 0.16310206055641174, + "logps/chosen": -479.2884216308594, + "logps/rejected": -555.8187255859375, + "loss": 0.4874, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.98129141330719, + "rewards/margins": 1.1137723922729492, + "rewards/rejected": -3.0950639247894287, + "step": 3830 + }, + { + "epoch": 0.5, + "learning_rate": 2.912731670838207e-06, + "logits/chosen": -0.561539888381958, + "logits/rejected": 0.4365290701389313, + "logps/chosen": -458.83734130859375, + "logps/rejected": -545.0903930664062, + "loss": 0.5649, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.022507667541504, + "rewards/margins": 0.8180673718452454, + "rewards/rejected": -2.8405749797821045, + "step": 3840 + }, + { + "epoch": 0.5, + "learning_rate": 2.901461826562543e-06, + "logits/chosen": -0.57462078332901, + "logits/rejected": 0.3850022852420807, + "logps/chosen": -398.58343505859375, + "logps/rejected": -475.1954040527344, + "loss": 0.4639, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.729619026184082, + "rewards/margins": 0.9505112767219543, + "rewards/rejected": -2.6801302433013916, + "step": 3850 + }, + { + "epoch": 0.51, + "learning_rate": 2.8901836017563966e-06, + "logits/chosen": -0.42263826727867126, + "logits/rejected": 0.20525923371315002, + "logps/chosen": -463.990478515625, + "logps/rejected": -499.493408203125, + "loss": 0.562, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.830883264541626, + "rewards/margins": 0.7106839418411255, + "rewards/rejected": -2.541567325592041, + "step": 3860 + }, + { + "epoch": 0.51, + "learning_rate": 2.8788972318531272e-06, + "logits/chosen": -0.6156303882598877, + "logits/rejected": 0.3582186996936798, + "logps/chosen": -427.96002197265625, + "logps/rejected": -484.9053649902344, + "loss": 0.5611, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7909084558486938, + "rewards/margins": 0.6238812208175659, + "rewards/rejected": -2.4147896766662598, + "step": 3870 + }, + { + "epoch": 0.51, + "learning_rate": 2.8676029524561255e-06, + "logits/chosen": -0.09342961758375168, + "logits/rejected": 0.25026214122772217, + "logps/chosen": -468.81072998046875, + "logps/rejected": -550.8553466796875, + "loss": 0.5211, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7681678533554077, + "rewards/margins": 0.9555566906929016, + "rewards/rejected": -2.723724603652954, + "step": 3880 + }, + { + "epoch": 0.51, + "learning_rate": 2.8563009993338906e-06, + "logits/chosen": -0.24521970748901367, + "logits/rejected": 0.4679892659187317, + "logps/chosen": -440.3994140625, + "logps/rejected": -543.8873291015625, + "loss": 0.4792, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9353783130645752, + "rewards/margins": 1.108598232269287, + "rewards/rejected": -3.043976306915283, + "step": 3890 + }, + { + "epoch": 0.51, + "learning_rate": 2.844991608415113e-06, + "logits/chosen": -0.3554648458957672, + "logits/rejected": -0.03292546421289444, + "logps/chosen": -480.20037841796875, + "logps/rejected": -566.8743896484375, + "loss": 0.5193, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.146230936050415, + "rewards/margins": 0.9551491737365723, + "rewards/rejected": -3.1013801097869873, + "step": 3900 + }, + { + "epoch": 0.51, + "eval_logits/chosen": 1.3990401029586792, + "eval_logits/rejected": 2.1469078063964844, + "eval_logps/chosen": -485.6194152832031, + "eval_logps/rejected": -559.7595825195312, + "eval_loss": 0.5184990167617798, + "eval_rewards/accuracies": 0.734000027179718, + "eval_rewards/chosen": -2.1712939739227295, + "eval_rewards/margins": 0.940490186214447, + "eval_rewards/rejected": -3.111783981323242, + "eval_runtime": 1152.1911, + "eval_samples_per_second": 1.736, + "eval_steps_per_second": 0.868, + "step": 3900 + }, + { + "epoch": 0.51, + "learning_rate": 2.833675015783746e-06, + "logits/chosen": -0.07204243540763855, + "logits/rejected": 0.11534376442432404, + "logps/chosen": -458.18798828125, + "logps/rejected": -554.1547241210938, + "loss": 0.5293, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2303760051727295, + "rewards/margins": 0.9195043444633484, + "rewards/rejected": -3.1498801708221436, + "step": 3910 + }, + { + "epoch": 0.51, + "learning_rate": 2.8223514576740784e-06, + "logits/chosen": 0.09607383608818054, + "logits/rejected": 0.03933734819293022, + "logps/chosen": -431.48175048828125, + "logps/rejected": -560.3643798828125, + "loss": 0.4971, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0321695804595947, + "rewards/margins": 0.9992935061454773, + "rewards/rejected": -3.0314629077911377, + "step": 3920 + }, + { + "epoch": 0.51, + "learning_rate": 2.8110211704658073e-06, + "logits/chosen": -0.698123037815094, + "logits/rejected": 0.5209210515022278, + "logps/chosen": -530.5778198242188, + "logps/rejected": -592.2348022460938, + "loss": 0.4693, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.271674156188965, + "rewards/margins": 0.9807974696159363, + "rewards/rejected": -3.252471446990967, + "step": 3930 + }, + { + "epoch": 0.52, + "learning_rate": 2.7996843906790955e-06, + "logits/chosen": -0.11168187856674194, + "logits/rejected": 0.6565873026847839, + "logps/chosen": -470.85064697265625, + "logps/rejected": -538.1116943359375, + "loss": 0.6032, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2667603492736816, + "rewards/margins": 0.637987494468689, + "rewards/rejected": -2.9047482013702393, + "step": 3940 + }, + { + "epoch": 0.52, + "learning_rate": 2.7883413549696396e-06, + "logits/chosen": -0.47097617387771606, + "logits/rejected": 0.9360452890396118, + "logps/chosen": -505.99658203125, + "logps/rejected": -607.1832885742188, + "loss": 0.3929, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.171560764312744, + "rewards/margins": 1.2839739322662354, + "rewards/rejected": -3.4555351734161377, + "step": 3950 + }, + { + "epoch": 0.52, + "learning_rate": 2.776992300123732e-06, + "logits/chosen": -0.3683644235134125, + "logits/rejected": 0.48894062638282776, + "logps/chosen": -468.18316650390625, + "logps/rejected": -581.08154296875, + "loss": 0.5471, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.261422872543335, + "rewards/margins": 1.1004810333251953, + "rewards/rejected": -3.3619041442871094, + "step": 3960 + }, + { + "epoch": 0.52, + "learning_rate": 2.7656374630533113e-06, + "logits/chosen": -0.399179607629776, + "logits/rejected": -0.14117364585399628, + "logps/chosen": -440.6139221191406, + "logps/rejected": -555.7425537109375, + "loss": 0.4605, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0904624462127686, + "rewards/margins": 1.1303551197052002, + "rewards/rejected": -3.2208175659179688, + "step": 3970 + }, + { + "epoch": 0.52, + "learning_rate": 2.754277080791021e-06, + "logits/chosen": -0.6631547212600708, + "logits/rejected": -0.17448690533638, + "logps/chosen": -481.0576171875, + "logps/rejected": -556.90380859375, + "loss": 0.6197, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1500160694122314, + "rewards/margins": 0.8200462460517883, + "rewards/rejected": -2.970062255859375, + "step": 3980 + }, + { + "epoch": 0.52, + "learning_rate": 2.742911390485262e-06, + "logits/chosen": -0.10709667205810547, + "logits/rejected": 0.19126668572425842, + "logps/chosen": -419.6375427246094, + "logps/rejected": -470.33880615234375, + "loss": 0.6197, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0977683067321777, + "rewards/margins": 0.590727686882019, + "rewards/rejected": -2.6884961128234863, + "step": 3990 + }, + { + "epoch": 0.52, + "learning_rate": 2.731540629395239e-06, + "logits/chosen": -0.5709615349769592, + "logits/rejected": 0.3959980607032776, + "logps/chosen": -460.33544921875, + "logps/rejected": -499.603759765625, + "loss": 0.5764, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.8743162155151367, + "rewards/margins": 0.6547318696975708, + "rewards/rejected": -2.529047966003418, + "step": 4000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": 1.1191996335983276, + "eval_logits/rejected": 1.8653334379196167, + "eval_logps/chosen": -469.05755615234375, + "eval_logps/rejected": -545.9298095703125, + "eval_loss": 0.5176907777786255, + "eval_rewards/accuracies": 0.7310000061988831, + "eval_rewards/chosen": -2.0056753158569336, + "eval_rewards/margins": 0.9678111672401428, + "eval_rewards/rejected": -2.9734864234924316, + "eval_runtime": 1155.1218, + "eval_samples_per_second": 1.731, + "eval_steps_per_second": 0.866, + "step": 4000 + }, + { + "epoch": 0.52, + "learning_rate": 2.7201650348860115e-06, + "logits/chosen": -0.7722757458686829, + "logits/rejected": 0.36960023641586304, + "logps/chosen": -442.9876403808594, + "logps/rejected": -502.1902770996094, + "loss": 0.4848, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.029517412185669, + "rewards/margins": 1.0260822772979736, + "rewards/rejected": -3.0555994510650635, + "step": 4010 + }, + { + "epoch": 0.53, + "learning_rate": 2.7087848444235354e-06, + "logits/chosen": -0.8600364923477173, + "logits/rejected": 0.4562684893608093, + "logps/chosen": -454.643310546875, + "logps/rejected": -560.4906005859375, + "loss": 0.4478, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.75933837890625, + "rewards/margins": 1.259271264076233, + "rewards/rejected": -3.0186095237731934, + "step": 4020 + }, + { + "epoch": 0.53, + "learning_rate": 2.697400295569707e-06, + "logits/chosen": -0.6417149901390076, + "logits/rejected": -0.5029559135437012, + "logps/chosen": -410.6507873535156, + "logps/rejected": -496.6881408691406, + "loss": 0.5825, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6875232458114624, + "rewards/margins": 0.8735893964767456, + "rewards/rejected": -2.561112880706787, + "step": 4030 + }, + { + "epoch": 0.53, + "learning_rate": 2.6860116259774065e-06, + "logits/chosen": -0.604103684425354, + "logits/rejected": 0.4802830219268799, + "logps/chosen": -457.4012145996094, + "logps/rejected": -566.4182739257812, + "loss": 0.4229, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7043393850326538, + "rewards/margins": 1.2353878021240234, + "rewards/rejected": -2.939727306365967, + "step": 4040 + }, + { + "epoch": 0.53, + "learning_rate": 2.674619073385531e-06, + "logits/chosen": -0.3527432382106781, + "logits/rejected": 0.4966762959957123, + "logps/chosen": -414.24102783203125, + "logps/rejected": -521.317626953125, + "loss": 0.5281, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7188953161239624, + "rewards/margins": 1.0378594398498535, + "rewards/rejected": -2.7567548751831055, + "step": 4050 + }, + { + "epoch": 0.53, + "learning_rate": 2.663222875614038e-06, + "logits/chosen": -0.4794388711452484, + "logits/rejected": 0.13224102556705475, + "logps/chosen": -445.65087890625, + "logps/rejected": -518.9547729492188, + "loss": 0.6135, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.054771661758423, + "rewards/margins": 0.5872553586959839, + "rewards/rejected": -2.6420271396636963, + "step": 4060 + }, + { + "epoch": 0.53, + "learning_rate": 2.6518232705589775e-06, + "logits/chosen": -0.4580906331539154, + "logits/rejected": 0.23300707340240479, + "logps/chosen": -428.31707763671875, + "logps/rejected": -548.2869262695312, + "loss": 0.4599, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6607511043548584, + "rewards/margins": 1.2533749341964722, + "rewards/rejected": -2.91412615776062, + "step": 4070 + }, + { + "epoch": 0.53, + "learning_rate": 2.640420496187528e-06, + "logits/chosen": -0.6754065752029419, + "logits/rejected": 0.6897384524345398, + "logps/chosen": -462.10235595703125, + "logps/rejected": -518.758544921875, + "loss": 0.4402, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6582934856414795, + "rewards/margins": 1.126479983329773, + "rewards/rejected": -2.784773588180542, + "step": 4080 + }, + { + "epoch": 0.54, + "learning_rate": 2.629014790533025e-06, + "logits/chosen": -0.8516901135444641, + "logits/rejected": 0.13358630239963531, + "logps/chosen": -460.47509765625, + "logps/rejected": -500.4847106933594, + "loss": 0.4939, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6765422821044922, + "rewards/margins": 1.0363404750823975, + "rewards/rejected": -2.7128829956054688, + "step": 4090 + }, + { + "epoch": 0.54, + "learning_rate": 2.617606391689996e-06, + "logits/chosen": -0.5585889220237732, + "logits/rejected": 0.3247618079185486, + "logps/chosen": -427.287841796875, + "logps/rejected": -505.0663146972656, + "loss": 0.504, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6491613388061523, + "rewards/margins": 0.9884650111198425, + "rewards/rejected": -2.6376264095306396, + "step": 4100 + }, + { + "epoch": 0.54, + "eval_logits/chosen": 1.0344183444976807, + "eval_logits/rejected": 1.7948068380355835, + "eval_logps/chosen": -450.85650634765625, + "eval_logps/rejected": -523.1134643554688, + "eval_loss": 0.5179835557937622, + "eval_rewards/accuracies": 0.7269999980926514, + "eval_rewards/chosen": -1.8236651420593262, + "eval_rewards/margins": 0.9216578006744385, + "eval_rewards/rejected": -2.7453227043151855, + "eval_runtime": 1151.937, + "eval_samples_per_second": 1.736, + "eval_steps_per_second": 0.868, + "step": 4100 + }, + { + "epoch": 0.54, + "learning_rate": 2.6061955378091896e-06, + "logits/chosen": -0.4794127345085144, + "logits/rejected": 0.3910555839538574, + "logps/chosen": -423.6997985839844, + "logps/rejected": -548.6406860351562, + "loss": 0.4475, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8037827014923096, + "rewards/margins": 1.150130271911621, + "rewards/rejected": -2.9539127349853516, + "step": 4110 + }, + { + "epoch": 0.54, + "learning_rate": 2.5947824670926025e-06, + "logits/chosen": -0.4441138803958893, + "logits/rejected": -0.20847466588020325, + "logps/chosen": -416.5233459472656, + "logps/rejected": -513.5562744140625, + "loss": 0.5321, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7394109964370728, + "rewards/margins": 0.8989079594612122, + "rewards/rejected": -2.6383190155029297, + "step": 4120 + }, + { + "epoch": 0.54, + "learning_rate": 2.583367417788508e-06, + "logits/chosen": -0.3594195246696472, + "logits/rejected": 0.763751745223999, + "logps/chosen": -463.782470703125, + "logps/rejected": -538.66748046875, + "loss": 0.5754, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.1264193058013916, + "rewards/margins": 0.9109905958175659, + "rewards/rejected": -3.037409543991089, + "step": 4130 + }, + { + "epoch": 0.54, + "learning_rate": 2.5719506281864838e-06, + "logits/chosen": -0.767003059387207, + "logits/rejected": -0.14061614871025085, + "logps/chosen": -479.3724670410156, + "logps/rejected": -501.5732421875, + "loss": 0.5855, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9783029556274414, + "rewards/margins": 0.8176881670951843, + "rewards/rejected": -2.7959909439086914, + "step": 4140 + }, + { + "epoch": 0.54, + "learning_rate": 2.5605323366124335e-06, + "logits/chosen": -0.4945451617240906, + "logits/rejected": 0.5269497036933899, + "logps/chosen": -454.4396057128906, + "logps/rejected": -526.3812866210938, + "loss": 0.513, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9973022937774658, + "rewards/margins": 0.7900811433792114, + "rewards/rejected": -2.787383556365967, + "step": 4150 + }, + { + "epoch": 0.54, + "learning_rate": 2.5491127814236172e-06, + "logits/chosen": -0.19666481018066406, + "logits/rejected": -0.4852234423160553, + "logps/chosen": -372.50628662109375, + "logps/rejected": -494.17144775390625, + "loss": 0.51, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6012966632843018, + "rewards/margins": 0.775696337223053, + "rewards/rejected": -2.37699294090271, + "step": 4160 + }, + { + "epoch": 0.55, + "learning_rate": 2.537692201003671e-06, + "logits/chosen": -0.26586753129959106, + "logits/rejected": 0.2930835485458374, + "logps/chosen": -467.44317626953125, + "logps/rejected": -552.4528198242188, + "loss": 0.5688, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0270495414733887, + "rewards/margins": 0.9594928622245789, + "rewards/rejected": -2.9865424633026123, + "step": 4170 + }, + { + "epoch": 0.55, + "learning_rate": 2.526270833757635e-06, + "logits/chosen": -0.6537747383117676, + "logits/rejected": 0.6223368048667908, + "logps/chosen": -447.3667907714844, + "logps/rejected": -499.9156188964844, + "loss": 0.5537, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9054014682769775, + "rewards/margins": 0.7102811932563782, + "rewards/rejected": -2.615682601928711, + "step": 4180 + }, + { + "epoch": 0.55, + "learning_rate": 2.514848918106971e-06, + "logits/chosen": -0.553719162940979, + "logits/rejected": 0.4931580424308777, + "logps/chosen": -447.16204833984375, + "logps/rejected": -520.3040771484375, + "loss": 0.4678, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8360874652862549, + "rewards/margins": 1.1568195819854736, + "rewards/rejected": -2.9929070472717285, + "step": 4190 + }, + { + "epoch": 0.55, + "learning_rate": 2.503426692484594e-06, + "logits/chosen": -0.3490946888923645, + "logits/rejected": 0.03165990114212036, + "logps/chosen": -444.204345703125, + "logps/rejected": -547.2092895507812, + "loss": 0.4846, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9349002838134766, + "rewards/margins": 0.9464532732963562, + "rewards/rejected": -2.8813538551330566, + "step": 4200 + }, + { + "epoch": 0.55, + "eval_logits/chosen": 1.3328680992126465, + "eval_logits/rejected": 2.1064412593841553, + "eval_logps/chosen": -480.6316833496094, + "eval_logps/rejected": -553.0634765625, + "eval_loss": 0.5167676210403442, + "eval_rewards/accuracies": 0.7260000109672546, + "eval_rewards/chosen": -2.1214165687561035, + "eval_rewards/margins": 0.9234069585800171, + "eval_rewards/rejected": -3.044823169708252, + "eval_runtime": 1156.9425, + "eval_samples_per_second": 1.729, + "eval_steps_per_second": 0.864, + "step": 4200 + }, + { + "epoch": 0.55, + "learning_rate": 2.492004395329883e-06, + "logits/chosen": -0.47201377153396606, + "logits/rejected": 0.1662154495716095, + "logps/chosen": -442.28265380859375, + "logps/rejected": -555.5616455078125, + "loss": 0.424, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9666084051132202, + "rewards/margins": 1.2794740200042725, + "rewards/rejected": -3.246082305908203, + "step": 4210 + }, + { + "epoch": 0.55, + "learning_rate": 2.4805822650837165e-06, + "logits/chosen": -0.05408313125371933, + "logits/rejected": 0.7589906454086304, + "logps/chosen": -454.1454162597656, + "logps/rejected": -595.169189453125, + "loss": 0.4313, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1684341430664062, + "rewards/margins": 1.3142242431640625, + "rewards/rejected": -3.4826583862304688, + "step": 4220 + }, + { + "epoch": 0.55, + "learning_rate": 2.4691605401834843e-06, + "logits/chosen": -0.5830433964729309, + "logits/rejected": 0.2723081707954407, + "logps/chosen": -507.00323486328125, + "logps/rejected": -582.15283203125, + "loss": 0.5321, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2436351776123047, + "rewards/margins": 0.8307396173477173, + "rewards/rejected": -3.0743746757507324, + "step": 4230 + }, + { + "epoch": 0.55, + "learning_rate": 2.457739459058117e-06, + "logits/chosen": -0.5049376487731934, + "logits/rejected": 0.14484481513500214, + "logps/chosen": -547.284423828125, + "logps/rejected": -602.6152954101562, + "loss": 0.4511, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2237820625305176, + "rewards/margins": 1.0236866474151611, + "rewards/rejected": -3.2474684715270996, + "step": 4240 + }, + { + "epoch": 0.56, + "learning_rate": 2.4463192601231054e-06, + "logits/chosen": -0.28555774688720703, + "logits/rejected": 0.8731438517570496, + "logps/chosen": -530.2666015625, + "logps/rejected": -578.294677734375, + "loss": 0.4807, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.338956117630005, + "rewards/margins": 1.1115846633911133, + "rewards/rejected": -3.450540542602539, + "step": 4250 + }, + { + "epoch": 0.56, + "learning_rate": 2.434900181775524e-06, + "logits/chosen": -0.7155624628067017, + "logits/rejected": 0.29977577924728394, + "logps/chosen": -479.2982482910156, + "logps/rejected": -573.4529418945312, + "loss": 0.4766, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0659286975860596, + "rewards/margins": 1.073961615562439, + "rewards/rejected": -3.139890193939209, + "step": 4260 + }, + { + "epoch": 0.56, + "learning_rate": 2.4234824623890578e-06, + "logits/chosen": -0.7157832980155945, + "logits/rejected": 0.18825335800647736, + "logps/chosen": -466.107177734375, + "logps/rejected": -545.6228637695312, + "loss": 0.4646, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0345892906188965, + "rewards/margins": 0.9853957295417786, + "rewards/rejected": -3.0199851989746094, + "step": 4270 + }, + { + "epoch": 0.56, + "learning_rate": 2.4120663403090193e-06, + "logits/chosen": -0.6923838257789612, + "logits/rejected": 0.20935389399528503, + "logps/chosen": -482.73724365234375, + "logps/rejected": -583.6288452148438, + "loss": 0.5585, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0791468620300293, + "rewards/margins": 0.9277111291885376, + "rewards/rejected": -3.0068578720092773, + "step": 4280 + }, + { + "epoch": 0.56, + "learning_rate": 2.40065205384738e-06, + "logits/chosen": -0.7260463833808899, + "logits/rejected": 0.5240879058837891, + "logps/chosen": -472.66845703125, + "logps/rejected": -497.08575439453125, + "loss": 0.6256, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3160202503204346, + "rewards/margins": 0.5535823106765747, + "rewards/rejected": -2.8696022033691406, + "step": 4290 + }, + { + "epoch": 0.56, + "learning_rate": 2.389239841277793e-06, + "logits/chosen": -0.25377124547958374, + "logits/rejected": 0.46026620268821716, + "logps/chosen": -445.01019287109375, + "logps/rejected": -527.544677734375, + "loss": 0.426, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9602826833724976, + "rewards/margins": 1.1500470638275146, + "rewards/rejected": -3.1103296279907227, + "step": 4300 + }, + { + "epoch": 0.56, + "eval_logits/chosen": 1.2899590730667114, + "eval_logits/rejected": 2.0376882553100586, + "eval_logps/chosen": -469.9074401855469, + "eval_logps/rejected": -543.4855346679688, + "eval_loss": 0.5095502138137817, + "eval_rewards/accuracies": 0.7325000166893005, + "eval_rewards/chosen": -2.014173746109009, + "eval_rewards/margins": 0.9348700642585754, + "eval_rewards/rejected": -2.9490435123443604, + "eval_runtime": 1156.5138, + "eval_samples_per_second": 1.729, + "eval_steps_per_second": 0.865, + "step": 4300 + }, + { + "epoch": 0.56, + "learning_rate": 2.3778299408306167e-06, + "logits/chosen": -0.4929943084716797, + "logits/rejected": 0.48326101899147034, + "logps/chosen": -455.02838134765625, + "logps/rejected": -525.9819946289062, + "loss": 0.5103, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0632247924804688, + "rewards/margins": 0.8879944682121277, + "rewards/rejected": -2.951219081878662, + "step": 4310 + }, + { + "epoch": 0.57, + "learning_rate": 2.3664225906879452e-06, + "logits/chosen": -0.42962485551834106, + "logits/rejected": 0.17619100213050842, + "logps/chosen": -452.4551696777344, + "logps/rejected": -521.5052490234375, + "loss": 0.4912, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.080214023590088, + "rewards/margins": 0.9484179615974426, + "rewards/rejected": -3.0286319255828857, + "step": 4320 + }, + { + "epoch": 0.57, + "learning_rate": 2.3550180289786357e-06, + "logits/chosen": -0.8907386064529419, + "logits/rejected": 0.4001527726650238, + "logps/chosen": -469.4991760253906, + "logps/rejected": -503.5176696777344, + "loss": 0.5744, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0900187492370605, + "rewards/margins": 0.7551154494285583, + "rewards/rejected": -2.8451342582702637, + "step": 4330 + }, + { + "epoch": 0.57, + "learning_rate": 2.343616493773335e-06, + "logits/chosen": -0.3753073513507843, + "logits/rejected": -0.04642016813158989, + "logps/chosen": -479.0115661621094, + "logps/rejected": -580.8361206054688, + "loss": 0.5041, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.154541492462158, + "rewards/margins": 1.0236194133758545, + "rewards/rejected": -3.1781609058380127, + "step": 4340 + }, + { + "epoch": 0.57, + "learning_rate": 2.3322182230795127e-06, + "logits/chosen": -0.14095106720924377, + "logits/rejected": -0.2498963177204132, + "logps/chosen": -418.10491943359375, + "logps/rejected": -577.77783203125, + "loss": 0.4198, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8447338342666626, + "rewards/margins": 1.3301194906234741, + "rewards/rejected": -3.1748533248901367, + "step": 4350 + }, + { + "epoch": 0.57, + "learning_rate": 2.320823454836491e-06, + "logits/chosen": -0.9373834729194641, + "logits/rejected": 0.3088436722755432, + "logps/chosen": -433.88226318359375, + "logps/rejected": -530.290283203125, + "loss": 0.401, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7864519357681274, + "rewards/margins": 1.156484603881836, + "rewards/rejected": -2.942936658859253, + "step": 4360 + }, + { + "epoch": 0.57, + "learning_rate": 2.309432426910478e-06, + "logits/chosen": -0.6841565370559692, + "logits/rejected": 0.7033779621124268, + "logps/chosen": -494.7813415527344, + "logps/rejected": -543.1317138671875, + "loss": 0.4644, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9815502166748047, + "rewards/margins": 1.0772145986557007, + "rewards/rejected": -3.058764934539795, + "step": 4370 + }, + { + "epoch": 0.57, + "learning_rate": 2.298045377089604e-06, + "logits/chosen": -0.8424911499023438, + "logits/rejected": 0.32612770795822144, + "logps/chosen": -466.86895751953125, + "logps/rejected": -569.2764892578125, + "loss": 0.4831, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1209797859191895, + "rewards/margins": 1.2162678241729736, + "rewards/rejected": -3.337247371673584, + "step": 4380 + }, + { + "epoch": 0.57, + "learning_rate": 2.286662543078955e-06, + "logits/chosen": -0.4193035066127777, + "logits/rejected": 0.4950118958950043, + "logps/chosen": -491.828857421875, + "logps/rejected": -533.1112060546875, + "loss": 0.5354, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.155092239379883, + "rewards/margins": 0.7993155121803284, + "rewards/rejected": -2.9544079303741455, + "step": 4390 + }, + { + "epoch": 0.58, + "learning_rate": 2.2752841624956125e-06, + "logits/chosen": -0.604637622833252, + "logits/rejected": 0.18484275043010712, + "logps/chosen": -507.84112548828125, + "logps/rejected": -585.2672119140625, + "loss": 0.5289, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1505980491638184, + "rewards/margins": 0.9751760363578796, + "rewards/rejected": -3.1257741451263428, + "step": 4400 + }, + { + "epoch": 0.58, + "eval_logits/chosen": 1.0285941362380981, + "eval_logits/rejected": 1.7669211626052856, + "eval_logps/chosen": -464.733154296875, + "eval_logps/rejected": -542.265869140625, + "eval_loss": 0.5142533779144287, + "eval_rewards/accuracies": 0.7260000109672546, + "eval_rewards/chosen": -1.9624314308166504, + "eval_rewards/margins": 0.9744157195091248, + "eval_rewards/rejected": -2.93684720993042, + "eval_runtime": 1341.7741, + "eval_samples_per_second": 1.491, + "eval_steps_per_second": 0.745, + "step": 4400 + }, + { + "epoch": 0.58, + "learning_rate": 2.2639104728636915e-06, + "logits/chosen": -0.3302585482597351, + "logits/rejected": -0.0848245620727539, + "logps/chosen": -438.95501708984375, + "logps/rejected": -518.0885009765625, + "loss": 0.5619, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7836341857910156, + "rewards/margins": 0.8088982701301575, + "rewards/rejected": -2.5925326347351074, + "step": 4410 + }, + { + "epoch": 0.58, + "learning_rate": 2.252541711609384e-06, + "logits/chosen": -0.4086834788322449, + "logits/rejected": 0.6533911824226379, + "logps/chosen": -442.6697692871094, + "logps/rejected": -500.9053649902344, + "loss": 0.5224, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.846428632736206, + "rewards/margins": 0.9241348505020142, + "rewards/rejected": -2.7705636024475098, + "step": 4420 + }, + { + "epoch": 0.58, + "learning_rate": 2.241178116056002e-06, + "logits/chosen": -0.7984479665756226, + "logits/rejected": 0.013322305865585804, + "logps/chosen": -431.56884765625, + "logps/rejected": -503.01080322265625, + "loss": 0.4753, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7908424139022827, + "rewards/margins": 0.9538822174072266, + "rewards/rejected": -2.744724750518799, + "step": 4430 + }, + { + "epoch": 0.58, + "learning_rate": 2.2298199234190236e-06, + "logits/chosen": -0.3780064582824707, + "logits/rejected": 0.18104317784309387, + "logps/chosen": -475.06024169921875, + "logps/rejected": -548.1165771484375, + "loss": 0.4519, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.868931770324707, + "rewards/margins": 1.0790822505950928, + "rewards/rejected": -2.9480140209198, + "step": 4440 + }, + { + "epoch": 0.58, + "learning_rate": 2.218467370801138e-06, + "logits/chosen": -0.6153514981269836, + "logits/rejected": 0.3399887979030609, + "logps/chosen": -482.9944763183594, + "logps/rejected": -523.6447143554688, + "loss": 0.6105, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1587672233581543, + "rewards/margins": 0.7031978368759155, + "rewards/rejected": -2.8619649410247803, + "step": 4450 + }, + { + "epoch": 0.58, + "learning_rate": 2.207120695187304e-06, + "logits/chosen": -0.7532152533531189, + "logits/rejected": 0.8358518481254578, + "logps/chosen": -490.2608337402344, + "logps/rejected": -566.9043579101562, + "loss": 0.4384, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.088639736175537, + "rewards/margins": 1.1486093997955322, + "rewards/rejected": -3.2372488975524902, + "step": 4460 + }, + { + "epoch": 0.58, + "learning_rate": 2.195780133439794e-06, + "logits/chosen": -0.20792141556739807, + "logits/rejected": 0.016714613884687424, + "logps/chosen": -480.798583984375, + "logps/rejected": -579.108642578125, + "loss": 0.5193, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.015219211578369, + "rewards/margins": 0.9281864166259766, + "rewards/rejected": -2.9434056282043457, + "step": 4470 + }, + { + "epoch": 0.59, + "learning_rate": 2.1844459222932535e-06, + "logits/chosen": -0.6897789835929871, + "logits/rejected": 0.44204026460647583, + "logps/chosen": -500.740478515625, + "logps/rejected": -560.5718383789062, + "loss": 0.4913, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2337794303894043, + "rewards/margins": 0.9274255633354187, + "rewards/rejected": -3.161205768585205, + "step": 4480 + }, + { + "epoch": 0.59, + "learning_rate": 2.17311829834976e-06, + "logits/chosen": -1.0006176233291626, + "logits/rejected": -0.1672603338956833, + "logps/chosen": -442.28802490234375, + "logps/rejected": -546.0424194335938, + "loss": 0.4748, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.864127516746521, + "rewards/margins": 1.0716313123703003, + "rewards/rejected": -2.9357590675354004, + "step": 4490 + }, + { + "epoch": 0.59, + "learning_rate": 2.1617974980738814e-06, + "logits/chosen": -0.7605545520782471, + "logits/rejected": 0.6477273106575012, + "logps/chosen": -446.9376525878906, + "logps/rejected": -515.064453125, + "loss": 0.4542, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9530082941055298, + "rewards/margins": 0.973568320274353, + "rewards/rejected": -2.926576852798462, + "step": 4500 + }, + { + "epoch": 0.59, + "eval_logits/chosen": 1.1394939422607422, + "eval_logits/rejected": 1.8774791955947876, + "eval_logps/chosen": -464.9223327636719, + "eval_logps/rejected": -541.3861083984375, + "eval_loss": 0.5101990699768066, + "eval_rewards/accuracies": 0.7335000038146973, + "eval_rewards/chosen": -1.9643235206604004, + "eval_rewards/margins": 0.963726282119751, + "eval_rewards/rejected": -2.9280498027801514, + "eval_runtime": 1305.9412, + "eval_samples_per_second": 1.531, + "eval_steps_per_second": 0.766, + "step": 4500 + }, + { + "epoch": 0.59, + "learning_rate": 2.150483757787744e-06, + "logits/chosen": -0.9358876943588257, + "logits/rejected": 0.6876541972160339, + "logps/chosen": -459.2244567871094, + "logps/rejected": -489.6673278808594, + "loss": 0.5597, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.086225986480713, + "rewards/margins": 0.8154786229133606, + "rewards/rejected": -2.9017043113708496, + "step": 4510 + }, + { + "epoch": 0.59, + "learning_rate": 2.139177313666093e-06, + "logits/chosen": -0.5859389901161194, + "logits/rejected": -0.10606422275304794, + "logps/chosen": -503.25933837890625, + "logps/rejected": -543.6451416015625, + "loss": 0.533, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0373759269714355, + "rewards/margins": 0.933161735534668, + "rewards/rejected": -2.9705376625061035, + "step": 4520 + }, + { + "epoch": 0.59, + "learning_rate": 2.1278784017313688e-06, + "logits/chosen": -0.2547047734260559, + "logits/rejected": -0.3063717484474182, + "logps/chosen": -468.4463806152344, + "logps/rejected": -556.9096069335938, + "loss": 0.5121, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8143895864486694, + "rewards/margins": 0.8284521102905273, + "rewards/rejected": -2.6428415775299072, + "step": 4530 + }, + { + "epoch": 0.59, + "learning_rate": 2.116587257848776e-06, + "logits/chosen": -0.6765443086624146, + "logits/rejected": -0.6112786531448364, + "logps/chosen": -419.4952697753906, + "logps/rejected": -512.7720947265625, + "loss": 0.587, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.773984670639038, + "rewards/margins": 0.6838240623474121, + "rewards/rejected": -2.4578089714050293, + "step": 4540 + }, + { + "epoch": 0.6, + "learning_rate": 2.105304117721361e-06, + "logits/chosen": -0.5224363803863525, + "logits/rejected": 0.21402570605278015, + "logps/chosen": -394.37713623046875, + "logps/rejected": -444.3313903808594, + "loss": 0.595, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7446057796478271, + "rewards/margins": 0.7791751027107239, + "rewards/rejected": -2.5237812995910645, + "step": 4550 + }, + { + "epoch": 0.6, + "learning_rate": 2.0940292168850913e-06, + "logits/chosen": -0.5463732481002808, + "logits/rejected": 0.1663360893726349, + "logps/chosen": -414.0874938964844, + "logps/rejected": -459.6375427246094, + "loss": 0.5231, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5902382135391235, + "rewards/margins": 0.7520595788955688, + "rewards/rejected": -2.3422977924346924, + "step": 4560 + }, + { + "epoch": 0.6, + "learning_rate": 2.082762790703939e-06, + "logits/chosen": -0.802535891532898, + "logits/rejected": 0.28110548853874207, + "logps/chosen": -429.9853515625, + "logps/rejected": -507.537841796875, + "loss": 0.5194, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6454626321792603, + "rewards/margins": 0.8513976335525513, + "rewards/rejected": -2.4968602657318115, + "step": 4570 + }, + { + "epoch": 0.6, + "learning_rate": 2.0715050743649674e-06, + "logits/chosen": -0.736199676990509, + "logits/rejected": 0.028460631147027016, + "logps/chosen": -406.27423095703125, + "logps/rejected": -543.2493286132812, + "loss": 0.4913, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7144979238510132, + "rewards/margins": 0.9728431701660156, + "rewards/rejected": -2.6873412132263184, + "step": 4580 + }, + { + "epoch": 0.6, + "learning_rate": 2.060256302873421e-06, + "logits/chosen": -0.4676334857940674, + "logits/rejected": -0.17427358031272888, + "logps/chosen": -442.82806396484375, + "logps/rejected": -547.2783203125, + "loss": 0.4903, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9163872003555298, + "rewards/margins": 0.9291712641716003, + "rewards/rejected": -2.8455584049224854, + "step": 4590 + }, + { + "epoch": 0.6, + "learning_rate": 2.049016711047822e-06, + "logits/chosen": -0.7500999569892883, + "logits/rejected": 0.41036081314086914, + "logps/chosen": -462.0934143066406, + "logps/rejected": -542.0261840820312, + "loss": 0.4839, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.009445905685425, + "rewards/margins": 1.0364575386047363, + "rewards/rejected": -3.045903444290161, + "step": 4600 + }, + { + "epoch": 0.6, + "eval_logits/chosen": 1.1471757888793945, + "eval_logits/rejected": 1.885801911354065, + "eval_logps/chosen": -468.8564147949219, + "eval_logps/rejected": -546.4149780273438, + "eval_loss": 0.5094394087791443, + "eval_rewards/accuracies": 0.7304999828338623, + "eval_rewards/chosen": -2.003664016723633, + "eval_rewards/margins": 0.9746747016906738, + "eval_rewards/rejected": -2.9783387184143066, + "eval_runtime": 1172.0125, + "eval_samples_per_second": 1.706, + "eval_steps_per_second": 0.853, + "step": 4600 + }, + { + "epoch": 0.6, + "learning_rate": 2.037786533515064e-06, + "logits/chosen": -0.33482661843299866, + "logits/rejected": 0.0835098996758461, + "logps/chosen": -504.9192810058594, + "logps/rejected": -563.8491821289062, + "loss": 0.5111, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.06898832321167, + "rewards/margins": 0.8450971841812134, + "rewards/rejected": -2.914085626602173, + "step": 4610 + }, + { + "epoch": 0.6, + "learning_rate": 2.02656600470552e-06, + "logits/chosen": -0.6622999906539917, + "logits/rejected": 0.010640504769980907, + "logps/chosen": -441.935546875, + "logps/rejected": -526.4178466796875, + "loss": 0.4816, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8346233367919922, + "rewards/margins": 1.0247472524642944, + "rewards/rejected": -2.859370708465576, + "step": 4620 + }, + { + "epoch": 0.61, + "learning_rate": 2.015355358848144e-06, + "logits/chosen": -0.013636887073516846, + "logits/rejected": -0.028725851327180862, + "logps/chosen": -397.56976318359375, + "logps/rejected": -514.6229248046875, + "loss": 0.5147, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8054370880126953, + "rewards/margins": 0.9198155403137207, + "rewards/rejected": -2.725252866744995, + "step": 4630 + }, + { + "epoch": 0.61, + "learning_rate": 2.004154829965582e-06, + "logits/chosen": -0.6800112128257751, + "logits/rejected": 0.04380284622311592, + "logps/chosen": -467.8673400878906, + "logps/rejected": -557.4893798828125, + "loss": 0.5006, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9010484218597412, + "rewards/margins": 1.0638830661773682, + "rewards/rejected": -2.9649314880371094, + "step": 4640 + }, + { + "epoch": 0.61, + "learning_rate": 1.99296465186929e-06, + "logits/chosen": -0.8370498418807983, + "logits/rejected": 0.4450223445892334, + "logps/chosen": -462.6966247558594, + "logps/rejected": -493.6236877441406, + "loss": 0.514, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8274908065795898, + "rewards/margins": 0.9097514152526855, + "rewards/rejected": -2.7372422218322754, + "step": 4650 + }, + { + "epoch": 0.61, + "learning_rate": 1.9817850581546488e-06, + "logits/chosen": -0.24383525550365448, + "logits/rejected": 0.3440066874027252, + "logps/chosen": -495.785400390625, + "logps/rejected": -580.841796875, + "loss": 0.5974, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.167267084121704, + "rewards/margins": 0.7816963791847229, + "rewards/rejected": -2.9489636421203613, + "step": 4660 + }, + { + "epoch": 0.61, + "learning_rate": 1.970616282196091e-06, + "logits/chosen": -0.5986989140510559, + "logits/rejected": 0.10017760097980499, + "logps/chosen": -437.3359375, + "logps/rejected": -534.2577514648438, + "loss": 0.4824, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8744062185287476, + "rewards/margins": 1.0383882522583008, + "rewards/rejected": -2.912794589996338, + "step": 4670 + }, + { + "epoch": 0.61, + "learning_rate": 1.959458557142228e-06, + "logits/chosen": -0.3310568928718567, + "logits/rejected": -0.00010063648369396105, + "logps/chosen": -468.37921142578125, + "logps/rejected": -538.8629150390625, + "loss": 0.7126, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.1639952659606934, + "rewards/margins": 0.5907930135726929, + "rewards/rejected": -2.754788398742676, + "step": 4680 + }, + { + "epoch": 0.61, + "learning_rate": 1.948312115910982e-06, + "logits/chosen": -0.44464436173439026, + "logits/rejected": 0.15500059723854065, + "logps/chosen": -484.95843505859375, + "logps/rejected": -547.2438354492188, + "loss": 0.4925, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0217325687408447, + "rewards/margins": 1.021672010421753, + "rewards/rejected": -3.0434045791625977, + "step": 4690 + }, + { + "epoch": 0.62, + "learning_rate": 1.937177191184729e-06, + "logits/chosen": -0.2106432020664215, + "logits/rejected": -0.05869419500231743, + "logps/chosen": -432.0470275878906, + "logps/rejected": -516.4940185546875, + "loss": 0.5562, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9173109531402588, + "rewards/margins": 0.8195775747299194, + "rewards/rejected": -2.7368886470794678, + "step": 4700 + }, + { + "epoch": 0.62, + "eval_logits/chosen": 1.1998752355575562, + "eval_logits/rejected": 1.9384359121322632, + "eval_logps/chosen": -471.0873107910156, + "eval_logps/rejected": -546.7677001953125, + "eval_loss": 0.5075832009315491, + "eval_rewards/accuracies": 0.734000027179718, + "eval_rewards/chosen": -2.025972843170166, + "eval_rewards/margins": 0.9558923244476318, + "eval_rewards/rejected": -2.9818649291992188, + "eval_runtime": 1220.8498, + "eval_samples_per_second": 1.638, + "eval_steps_per_second": 0.819, + "step": 4700 + }, + { + "epoch": 0.62, + "learning_rate": 1.9260540154054317e-06, + "logits/chosen": -0.3351259231567383, + "logits/rejected": 0.35000115633010864, + "logps/chosen": -423.49017333984375, + "logps/rejected": -554.1179809570312, + "loss": 0.3844, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.8695085048675537, + "rewards/margins": 1.3363901376724243, + "rewards/rejected": -3.2058990001678467, + "step": 4710 + }, + { + "epoch": 0.62, + "learning_rate": 1.9149428207697983e-06, + "logits/chosen": -0.33344680070877075, + "logits/rejected": -0.03340202569961548, + "logps/chosen": -488.39239501953125, + "logps/rejected": -548.125244140625, + "loss": 0.6408, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3214313983917236, + "rewards/margins": 0.5961470603942871, + "rewards/rejected": -2.91757869720459, + "step": 4720 + }, + { + "epoch": 0.62, + "learning_rate": 1.9038438392244262e-06, + "logits/chosen": -0.3149067759513855, + "logits/rejected": -0.3603346645832062, + "logps/chosen": -476.91766357421875, + "logps/rejected": -549.4235229492188, + "loss": 0.4499, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9402230978012085, + "rewards/margins": 1.0026373863220215, + "rewards/rejected": -2.9428603649139404, + "step": 4730 + }, + { + "epoch": 0.62, + "learning_rate": 1.8927573024609666e-06, + "logits/chosen": -0.267407089471817, + "logits/rejected": 0.3591880202293396, + "logps/chosen": -424.043212890625, + "logps/rejected": -519.4016723632812, + "loss": 0.4639, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.020537853240967, + "rewards/margins": 1.063114047050476, + "rewards/rejected": -3.0836517810821533, + "step": 4740 + }, + { + "epoch": 0.62, + "learning_rate": 1.8816834419112845e-06, + "logits/chosen": -0.276968777179718, + "logits/rejected": 0.6711007356643677, + "logps/chosen": -467.92401123046875, + "logps/rejected": -534.5933837890625, + "loss": 0.5329, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1998984813690186, + "rewards/margins": 1.0478665828704834, + "rewards/rejected": -3.247765302658081, + "step": 4750 + }, + { + "epoch": 0.62, + "learning_rate": 1.8706224887426283e-06, + "logits/chosen": -0.017868299037218094, + "logits/rejected": 0.1659470647573471, + "logps/chosen": -508.35198974609375, + "logps/rejected": -592.7130126953125, + "loss": 0.5827, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.4431605339050293, + "rewards/margins": 0.7835296392440796, + "rewards/rejected": -3.2266902923583984, + "step": 4760 + }, + { + "epoch": 0.62, + "learning_rate": 1.8595746738528045e-06, + "logits/chosen": -0.05624357610940933, + "logits/rejected": -0.07322041690349579, + "logps/chosen": -468.21783447265625, + "logps/rejected": -583.8419799804688, + "loss": 0.5432, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.211426258087158, + "rewards/margins": 0.9063574075698853, + "rewards/rejected": -3.117783546447754, + "step": 4770 + }, + { + "epoch": 0.63, + "learning_rate": 1.8485402278653584e-06, + "logits/chosen": -0.331932932138443, + "logits/rejected": 0.19877803325653076, + "logps/chosen": -465.69122314453125, + "logps/rejected": -539.7578125, + "loss": 0.4492, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2374541759490967, + "rewards/margins": 0.9785755276679993, + "rewards/rejected": -3.216029644012451, + "step": 4780 + }, + { + "epoch": 0.63, + "learning_rate": 1.8375193811247577e-06, + "logits/chosen": -0.3239549398422241, + "logits/rejected": 0.33996933698654175, + "logps/chosen": -472.1409606933594, + "logps/rejected": -542.0296020507812, + "loss": 0.5019, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.24760365486145, + "rewards/margins": 0.8752537965774536, + "rewards/rejected": -3.1228575706481934, + "step": 4790 + }, + { + "epoch": 0.63, + "learning_rate": 1.826512363691586e-06, + "logits/chosen": -0.8161247372627258, + "logits/rejected": -0.2031267136335373, + "logps/chosen": -500.17401123046875, + "logps/rejected": -567.0885009765625, + "loss": 0.4964, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2298948764801025, + "rewards/margins": 0.9767163991928101, + "rewards/rejected": -3.206610918045044, + "step": 4800 + }, + { + "epoch": 0.63, + "eval_logits/chosen": 1.3968160152435303, + "eval_logits/rejected": 2.1537721157073975, + "eval_logps/chosen": -485.73052978515625, + "eval_logps/rejected": -561.4290161132812, + "eval_loss": 0.5078465938568115, + "eval_rewards/accuracies": 0.7335000038146973, + "eval_rewards/chosen": -2.172405481338501, + "eval_rewards/margins": 0.956072986125946, + "eval_rewards/rejected": -3.1284782886505127, + "eval_runtime": 1205.6317, + "eval_samples_per_second": 1.659, + "eval_steps_per_second": 0.829, + "step": 4800 + }, + { + "epoch": 0.63, + "learning_rate": 1.8155194053377391e-06, + "logits/chosen": -0.4702053964138031, + "logits/rejected": 0.691516101360321, + "logps/chosen": -462.8230895996094, + "logps/rejected": -549.562744140625, + "loss": 0.478, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0018343925476074, + "rewards/margins": 1.2428886890411377, + "rewards/rejected": -3.244723081588745, + "step": 4810 + }, + { + "epoch": 0.63, + "learning_rate": 1.80454073554163e-06, + "logits/chosen": -0.25434356927871704, + "logits/rejected": 0.4281914234161377, + "logps/chosen": -443.32781982421875, + "logps/rejected": -518.9435424804688, + "loss": 0.5968, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2195210456848145, + "rewards/margins": 0.8934110403060913, + "rewards/rejected": -3.1129322052001953, + "step": 4820 + }, + { + "epoch": 0.63, + "learning_rate": 1.7935765834833966e-06, + "logits/chosen": -0.17489385604858398, + "logits/rejected": 0.3458006978034973, + "logps/chosen": -487.3931579589844, + "logps/rejected": -616.27587890625, + "loss": 0.4374, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.1858935356140137, + "rewards/margins": 1.282659888267517, + "rewards/rejected": -3.468553066253662, + "step": 4830 + }, + { + "epoch": 0.63, + "learning_rate": 1.7826271780401182e-06, + "logits/chosen": -0.34163016080856323, + "logits/rejected": 0.6715383529663086, + "logps/chosen": -477.7323303222656, + "logps/rejected": -576.5382690429688, + "loss": 0.4413, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3642547130584717, + "rewards/margins": 1.1457160711288452, + "rewards/rejected": -3.5099709033966064, + "step": 4840 + }, + { + "epoch": 0.63, + "learning_rate": 1.7716927477810389e-06, + "logits/chosen": -0.3121749758720398, + "logits/rejected": 0.46080583333969116, + "logps/chosen": -460.3019104003906, + "logps/rejected": -576.9380493164062, + "loss": 0.4645, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.031574249267578, + "rewards/margins": 1.1349817514419556, + "rewards/rejected": -3.166555881500244, + "step": 4850 + }, + { + "epoch": 0.64, + "learning_rate": 1.7607735209627953e-06, + "logits/chosen": -0.23650212585926056, + "logits/rejected": 0.6278365850448608, + "logps/chosen": -487.94305419921875, + "logps/rejected": -562.5647583007812, + "loss": 0.4918, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.249572515487671, + "rewards/margins": 1.1094893217086792, + "rewards/rejected": -3.3590621948242188, + "step": 4860 + }, + { + "epoch": 0.64, + "learning_rate": 1.749869725524651e-06, + "logits/chosen": -0.2613787055015564, + "logits/rejected": 0.479464054107666, + "logps/chosen": -465.05743408203125, + "logps/rejected": -545.4020385742188, + "loss": 0.4567, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.889392614364624, + "rewards/margins": 1.1381362676620483, + "rewards/rejected": -3.027529001235962, + "step": 4870 + }, + { + "epoch": 0.64, + "learning_rate": 1.7389815890837392e-06, + "logits/chosen": 0.03331397473812103, + "logits/rejected": -0.37613219022750854, + "logps/chosen": -484.1585998535156, + "logps/rejected": -606.52978515625, + "loss": 0.4609, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.071956157684326, + "rewards/margins": 1.0469639301300049, + "rewards/rejected": -3.118920087814331, + "step": 4880 + }, + { + "epoch": 0.64, + "learning_rate": 1.7281093389303105e-06, + "logits/chosen": -0.5912939310073853, + "logits/rejected": -0.04338790848851204, + "logps/chosen": -431.63238525390625, + "logps/rejected": -512.7877197265625, + "loss": 0.4884, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8607794046401978, + "rewards/margins": 0.9917821884155273, + "rewards/rejected": -2.8525617122650146, + "step": 4890 + }, + { + "epoch": 0.64, + "learning_rate": 1.7172532020229899e-06, + "logits/chosen": -0.8086379766464233, + "logits/rejected": 0.37625136971473694, + "logps/chosen": -488.94122314453125, + "logps/rejected": -558.6923828125, + "loss": 0.4879, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.099210739135742, + "rewards/margins": 1.0362012386322021, + "rewards/rejected": -3.1354117393493652, + "step": 4900 + }, + { + "epoch": 0.64, + "eval_logits/chosen": 1.3801579475402832, + "eval_logits/rejected": 2.132445812225342, + "eval_logps/chosen": -489.562255859375, + "eval_logps/rejected": -571.5598754882812, + "eval_loss": 0.5124561190605164, + "eval_rewards/accuracies": 0.7310000061988831, + "eval_rewards/chosen": -2.210721731185913, + "eval_rewards/margins": 1.0190653800964355, + "eval_rewards/rejected": -3.2297873497009277, + "eval_runtime": 1238.0078, + "eval_samples_per_second": 1.615, + "eval_steps_per_second": 0.808, + "step": 4900 + }, + { + "epoch": 0.64, + "learning_rate": 1.7064134049840359e-06, + "logits/chosen": -0.2153107225894928, + "logits/rejected": 0.029362941160798073, + "logps/chosen": -449.80029296875, + "logps/rejected": -585.2144775390625, + "loss": 0.4087, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9706871509552002, + "rewards/margins": 1.3460237979888916, + "rewards/rejected": -3.31671142578125, + "step": 4910 + }, + { + "epoch": 0.64, + "learning_rate": 1.6955901740946136e-06, + "logits/chosen": -0.38802531361579895, + "logits/rejected": 0.2870200276374817, + "logps/chosen": -543.64306640625, + "logps/rejected": -657.2401123046875, + "loss": 0.483, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.5144500732421875, + "rewards/margins": 1.142197847366333, + "rewards/rejected": -3.6566474437713623, + "step": 4920 + }, + { + "epoch": 0.65, + "learning_rate": 1.684783735290067e-06, + "logits/chosen": -0.6487330198287964, + "logits/rejected": 0.3835352659225464, + "logps/chosen": -452.5752868652344, + "logps/rejected": -591.7291870117188, + "loss": 0.3777, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9951683282852173, + "rewards/margins": 1.4564054012298584, + "rewards/rejected": -3.451573610305786, + "step": 4930 + }, + { + "epoch": 0.65, + "learning_rate": 1.6739943141552079e-06, + "logits/chosen": -0.6588994860649109, + "logits/rejected": 0.3580857515335083, + "logps/chosen": -522.0641479492188, + "logps/rejected": -571.3804931640625, + "loss": 0.5866, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.2700130939483643, + "rewards/margins": 1.0017244815826416, + "rewards/rejected": -3.271738052368164, + "step": 4940 + }, + { + "epoch": 0.65, + "learning_rate": 1.663222135919601e-06, + "logits/chosen": -0.5175567865371704, + "logits/rejected": 0.5185331106185913, + "logps/chosen": -509.4527282714844, + "logps/rejected": -577.5142822265625, + "loss": 0.4903, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.127331256866455, + "rewards/margins": 0.9136832356452942, + "rewards/rejected": -3.0410146713256836, + "step": 4950 + }, + { + "epoch": 0.65, + "learning_rate": 1.652467425452865e-06, + "logits/chosen": -0.43478575348854065, + "logits/rejected": -0.07904218137264252, + "logps/chosen": -461.2347717285156, + "logps/rejected": -556.1910400390625, + "loss": 0.4535, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1336779594421387, + "rewards/margins": 1.1299854516983032, + "rewards/rejected": -3.2636635303497314, + "step": 4960 + }, + { + "epoch": 0.65, + "learning_rate": 1.6417304072599787e-06, + "logits/chosen": -0.6369231939315796, + "logits/rejected": 0.5573645234107971, + "logps/chosen": -503.250732421875, + "logps/rejected": -592.5829467773438, + "loss": 0.531, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3848960399627686, + "rewards/margins": 0.9315534830093384, + "rewards/rejected": -3.3164494037628174, + "step": 4970 + }, + { + "epoch": 0.65, + "learning_rate": 1.6310113054765947e-06, + "logits/chosen": -0.6183720231056213, + "logits/rejected": 0.6410430073738098, + "logps/chosen": -529.9524536132812, + "logps/rejected": -604.1205444335938, + "loss": 0.5296, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5189731121063232, + "rewards/margins": 1.104549765586853, + "rewards/rejected": -3.623522996902466, + "step": 4980 + }, + { + "epoch": 0.65, + "learning_rate": 1.6203103438643591e-06, + "logits/chosen": -0.5253661870956421, + "logits/rejected": 0.08035139739513397, + "logps/chosen": -489.2454528808594, + "logps/rejected": -587.1968994140625, + "loss": 0.4981, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3722403049468994, + "rewards/margins": 0.9075069427490234, + "rewards/rejected": -3.279747486114502, + "step": 4990 + }, + { + "epoch": 0.65, + "learning_rate": 1.6096277458062417e-06, + "logits/chosen": 0.17417654395103455, + "logits/rejected": 0.0011325478553771973, + "logps/chosen": -417.90740966796875, + "logps/rejected": -536.4146118164062, + "loss": 0.4916, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.131730556488037, + "rewards/margins": 1.022687554359436, + "rewards/rejected": -3.1544182300567627, + "step": 5000 + }, + { + "epoch": 0.65, + "eval_logits/chosen": 1.3780100345611572, + "eval_logits/rejected": 2.116060256958008, + "eval_logps/chosen": -478.1451110839844, + "eval_logps/rejected": -558.6430053710938, + "eval_loss": 0.5087121725082397, + "eval_rewards/accuracies": 0.7300000190734863, + "eval_rewards/chosen": -2.096550941467285, + "eval_rewards/margins": 1.0040674209594727, + "eval_rewards/rejected": -3.100618362426758, + "eval_runtime": 1192.3166, + "eval_samples_per_second": 1.677, + "eval_steps_per_second": 0.839, + "step": 5000 + }, + { + "epoch": 0.66, + "learning_rate": 1.5989637343018705e-06, + "logits/chosen": -0.31024664640426636, + "logits/rejected": 0.315248966217041, + "logps/chosen": -463.248046875, + "logps/rejected": -594.2037963867188, + "loss": 0.4, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.942427635192871, + "rewards/margins": 1.2658753395080566, + "rewards/rejected": -3.2083029747009277, + "step": 5010 + }, + { + "epoch": 0.66, + "learning_rate": 1.5883185319628824e-06, + "logits/chosen": -0.7604548931121826, + "logits/rejected": 1.1575146913528442, + "logps/chosen": -514.0056762695312, + "logps/rejected": -576.3931884765625, + "loss": 0.4749, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3210787773132324, + "rewards/margins": 1.0514533519744873, + "rewards/rejected": -3.372532367706299, + "step": 5020 + }, + { + "epoch": 0.66, + "learning_rate": 1.5776923610082695e-06, + "logits/chosen": -0.40637367963790894, + "logits/rejected": 0.5249187350273132, + "logps/chosen": -471.66650390625, + "logps/rejected": -582.8086547851562, + "loss": 0.4789, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1091582775115967, + "rewards/margins": 1.3221811056137085, + "rewards/rejected": -3.431339979171753, + "step": 5030 + }, + { + "epoch": 0.66, + "learning_rate": 1.5670854432597433e-06, + "logits/chosen": -0.29735487699508667, + "logits/rejected": 0.3211146891117096, + "logps/chosen": -496.14306640625, + "logps/rejected": -521.6513061523438, + "loss": 0.5256, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9637601375579834, + "rewards/margins": 0.7890554666519165, + "rewards/rejected": -2.7528157234191895, + "step": 5040 + }, + { + "epoch": 0.66, + "learning_rate": 1.556498000137104e-06, + "logits/chosen": -0.15755796432495117, + "logits/rejected": 0.6389847993850708, + "logps/chosen": -417.54052734375, + "logps/rejected": -540.6213989257812, + "loss": 0.3919, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8478233814239502, + "rewards/margins": 1.3877325057983398, + "rewards/rejected": -3.235556125640869, + "step": 5050 + }, + { + "epoch": 0.66, + "learning_rate": 1.5459302526536188e-06, + "logits/chosen": -0.493274986743927, + "logits/rejected": -0.03875049203634262, + "logps/chosen": -449.4249572753906, + "logps/rejected": -511.86492919921875, + "loss": 0.5453, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8050098419189453, + "rewards/margins": 0.8924520611763, + "rewards/rejected": -2.697462320327759, + "step": 5060 + }, + { + "epoch": 0.66, + "learning_rate": 1.5353824214114075e-06, + "logits/chosen": -0.6162487268447876, + "logits/rejected": 0.14038828015327454, + "logps/chosen": -444.60736083984375, + "logps/rejected": -520.192138671875, + "loss": 0.5187, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8896453380584717, + "rewards/margins": 0.8513407707214355, + "rewards/rejected": -2.7409861087799072, + "step": 5070 + }, + { + "epoch": 0.66, + "learning_rate": 1.5248547265968373e-06, + "logits/chosen": -0.5640738606452942, + "logits/rejected": -0.3062174618244171, + "logps/chosen": -428.0648498535156, + "logps/rejected": -505.9144592285156, + "loss": 0.5312, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8717830181121826, + "rewards/margins": 0.8837350606918335, + "rewards/rejected": -2.7555181980133057, + "step": 5080 + }, + { + "epoch": 0.67, + "learning_rate": 1.5143473879759265e-06, + "logits/chosen": -0.9101265668869019, + "logits/rejected": 0.7031489610671997, + "logps/chosen": -447.75201416015625, + "logps/rejected": -512.3680419921875, + "loss": 0.5284, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0770153999328613, + "rewards/margins": 1.0809861421585083, + "rewards/rejected": -3.158001184463501, + "step": 5090 + }, + { + "epoch": 0.67, + "learning_rate": 1.5038606248897586e-06, + "logits/chosen": -0.36409762501716614, + "logits/rejected": 0.16153445839881897, + "logps/chosen": -540.3499755859375, + "logps/rejected": -589.29150390625, + "loss": 0.5806, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.5957887172698975, + "rewards/margins": 0.6685230135917664, + "rewards/rejected": -3.2643120288848877, + "step": 5100 + }, + { + "epoch": 0.67, + "eval_logits/chosen": 1.3594954013824463, + "eval_logits/rejected": 2.0896875858306885, + "eval_logps/chosen": -491.28375244140625, + "eval_logps/rejected": -572.3603515625, + "eval_loss": 0.5089035034179688, + "eval_rewards/accuracies": 0.7304999828338623, + "eval_rewards/chosen": -2.2279369831085205, + "eval_rewards/margins": 1.0098555088043213, + "eval_rewards/rejected": -3.237792491912842, + "eval_runtime": 1197.34, + "eval_samples_per_second": 1.67, + "eval_steps_per_second": 0.835, + "step": 5100 + }, + { + "epoch": 0.67, + "learning_rate": 1.4933946562499008e-06, + "logits/chosen": -0.3227179944515228, + "logits/rejected": 0.6649230718612671, + "logps/chosen": -481.3219299316406, + "logps/rejected": -565.8980712890625, + "loss": 0.5032, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.204648733139038, + "rewards/margins": 1.1594634056091309, + "rewards/rejected": -3.364112377166748, + "step": 5110 + }, + { + "epoch": 0.67, + "learning_rate": 1.482949700533835e-06, + "logits/chosen": -0.20165733993053436, + "logits/rejected": 0.22539961338043213, + "logps/chosen": -431.33514404296875, + "logps/rejected": -507.3214416503906, + "loss": 0.5052, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.136174440383911, + "rewards/margins": 0.8829553723335266, + "rewards/rejected": -3.019129753112793, + "step": 5120 + }, + { + "epoch": 0.67, + "learning_rate": 1.4725259757803983e-06, + "logits/chosen": -0.3899513781070709, + "logits/rejected": 0.26980915665626526, + "logps/chosen": -543.8787841796875, + "logps/rejected": -611.9580688476562, + "loss": 0.5236, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3227076530456543, + "rewards/margins": 1.0733088254928589, + "rewards/rejected": -3.3960158824920654, + "step": 5130 + }, + { + "epoch": 0.67, + "learning_rate": 1.4621236995852314e-06, + "logits/chosen": -0.9624841809272766, + "logits/rejected": 0.41740506887435913, + "logps/chosen": -478.22625732421875, + "logps/rejected": -570.9786987304688, + "loss": 0.4712, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1282503604888916, + "rewards/margins": 1.1468855142593384, + "rewards/rejected": -3.2751357555389404, + "step": 5140 + }, + { + "epoch": 0.67, + "learning_rate": 1.4517430890962337e-06, + "logits/chosen": -1.003804087638855, + "logits/rejected": 0.8981930613517761, + "logps/chosen": -504.6312561035156, + "logps/rejected": -505.14471435546875, + "loss": 0.5248, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.165656328201294, + "rewards/margins": 0.9904571771621704, + "rewards/rejected": -3.1561131477355957, + "step": 5150 + }, + { + "epoch": 0.68, + "learning_rate": 1.4413843610090342e-06, + "logits/chosen": -1.1295411586761475, + "logits/rejected": 0.7629106044769287, + "logps/chosen": -517.0686645507812, + "logps/rejected": -583.6041870117188, + "loss": 0.5029, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2716588973999023, + "rewards/margins": 0.9855988621711731, + "rewards/rejected": -3.2572574615478516, + "step": 5160 + }, + { + "epoch": 0.68, + "learning_rate": 1.4310477315624637e-06, + "logits/chosen": -0.5515908598899841, + "logits/rejected": 0.39060476422309875, + "logps/chosen": -463.7108459472656, + "logps/rejected": -558.4481201171875, + "loss": 0.603, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.238138198852539, + "rewards/margins": 0.9213913679122925, + "rewards/rejected": -3.1595299243927, + "step": 5170 + }, + { + "epoch": 0.68, + "learning_rate": 1.420733416534045e-06, + "logits/chosen": 0.11524833738803864, + "logits/rejected": 0.43751248717308044, + "logps/chosen": -463.92352294921875, + "logps/rejected": -566.92578125, + "loss": 0.5296, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1641011238098145, + "rewards/margins": 0.9625070691108704, + "rewards/rejected": -3.126608371734619, + "step": 5180 + }, + { + "epoch": 0.68, + "learning_rate": 1.410441631235487e-06, + "logits/chosen": -0.6287524700164795, + "logits/rejected": -0.010029402561485767, + "logps/chosen": -475.24285888671875, + "logps/rejected": -560.3567504882812, + "loss": 0.4824, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9610084295272827, + "rewards/margins": 0.988863468170166, + "rewards/rejected": -2.949871778488159, + "step": 5190 + }, + { + "epoch": 0.68, + "learning_rate": 1.4001725905081868e-06, + "logits/chosen": -0.594923198223114, + "logits/rejected": 0.47926950454711914, + "logps/chosen": -443.66815185546875, + "logps/rejected": -482.48907470703125, + "loss": 0.5027, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1409497261047363, + "rewards/margins": 0.8297210931777954, + "rewards/rejected": -2.9706709384918213, + "step": 5200 + }, + { + "epoch": 0.68, + "eval_logits/chosen": 1.071418285369873, + "eval_logits/rejected": 1.8013691902160645, + "eval_logps/chosen": -458.1094970703125, + "eval_logps/rejected": -531.8433837890625, + "eval_loss": 0.5037881135940552, + "eval_rewards/accuracies": 0.737500011920929, + "eval_rewards/chosen": -1.8961946964263916, + "eval_rewards/margins": 0.936427116394043, + "eval_rewards/rejected": -2.8326218128204346, + "eval_runtime": 1205.2329, + "eval_samples_per_second": 1.659, + "eval_steps_per_second": 0.83, + "step": 5200 + }, + { + "epoch": 0.68, + "learning_rate": 1.3899265087187507e-06, + "logits/chosen": -0.2585764527320862, + "logits/rejected": 0.0724058523774147, + "logps/chosen": -413.6856994628906, + "logps/rejected": -479.63323974609375, + "loss": 0.494, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8029801845550537, + "rewards/margins": 0.9064146876335144, + "rewards/rejected": -2.709394931793213, + "step": 5210 + }, + { + "epoch": 0.68, + "learning_rate": 1.3797035997545144e-06, + "logits/chosen": -0.7779779434204102, + "logits/rejected": -0.05586876720190048, + "logps/chosen": -463.6990661621094, + "logps/rejected": -529.9464721679688, + "loss": 0.4447, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.761859655380249, + "rewards/margins": 1.0219517946243286, + "rewards/rejected": -2.7838118076324463, + "step": 5220 + }, + { + "epoch": 0.68, + "learning_rate": 1.3695040770190816e-06, + "logits/chosen": -0.9211093187332153, + "logits/rejected": 0.09622125327587128, + "logps/chosen": -420.81951904296875, + "logps/rejected": -507.4986267089844, + "loss": 0.5245, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7863895893096924, + "rewards/margins": 0.9270517230033875, + "rewards/rejected": -2.7134411334991455, + "step": 5230 + }, + { + "epoch": 0.69, + "learning_rate": 1.3593281534278651e-06, + "logits/chosen": -0.5812297463417053, + "logits/rejected": -0.16451668739318848, + "logps/chosen": -413.26971435546875, + "logps/rejected": -532.844482421875, + "loss": 0.4485, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8284438848495483, + "rewards/margins": 1.1127674579620361, + "rewards/rejected": -2.941211223602295, + "step": 5240 + }, + { + "epoch": 0.69, + "learning_rate": 1.3491760414036478e-06, + "logits/chosen": -0.8652269244194031, + "logits/rejected": 0.4699738919734955, + "logps/chosen": -487.8330993652344, + "logps/rejected": -502.11395263671875, + "loss": 0.6038, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9750216007232666, + "rewards/margins": 0.6716020703315735, + "rewards/rejected": -2.6466238498687744, + "step": 5250 + }, + { + "epoch": 0.69, + "learning_rate": 1.3390479528721444e-06, + "logits/chosen": -0.4403937757015228, + "logits/rejected": 0.20110268890857697, + "logps/chosen": -475.3594665527344, + "logps/rejected": -581.604248046875, + "loss": 0.4959, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.161839723587036, + "rewards/margins": 1.0040169954299927, + "rewards/rejected": -3.1658565998077393, + "step": 5260 + }, + { + "epoch": 0.69, + "learning_rate": 1.3289440992575756e-06, + "logits/chosen": -0.5245973467826843, + "logits/rejected": 0.14496035873889923, + "logps/chosen": -487.95831298828125, + "logps/rejected": -544.9544677734375, + "loss": 0.4847, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9425252676010132, + "rewards/margins": 0.839037299156189, + "rewards/rejected": -2.781562566757202, + "step": 5270 + }, + { + "epoch": 0.69, + "learning_rate": 1.3188646914782616e-06, + "logits/chosen": -1.1343739032745361, + "logits/rejected": 0.7802155613899231, + "logps/chosen": -519.5667724609375, + "logps/rejected": -522.45166015625, + "loss": 0.4612, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.960554838180542, + "rewards/margins": 0.9846093058586121, + "rewards/rejected": -2.945164203643799, + "step": 5280 + }, + { + "epoch": 0.69, + "learning_rate": 1.3088099399422109e-06, + "logits/chosen": -0.3516455292701721, + "logits/rejected": -0.15082845091819763, + "logps/chosen": -515.0338745117188, + "logps/rejected": -570.6123657226562, + "loss": 0.5449, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2784736156463623, + "rewards/margins": 0.777103066444397, + "rewards/rejected": -3.0555763244628906, + "step": 5290 + }, + { + "epoch": 0.69, + "learning_rate": 1.2987800545427353e-06, + "logits/chosen": -0.7941345572471619, + "logits/rejected": 0.2526698708534241, + "logps/chosen": -451.58319091796875, + "logps/rejected": -526.3946533203125, + "loss": 0.4554, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7788721323013306, + "rewards/margins": 1.0764901638031006, + "rewards/rejected": -2.8553624153137207, + "step": 5300 + }, + { + "epoch": 0.69, + "eval_logits/chosen": 1.155483365058899, + "eval_logits/rejected": 1.890524983406067, + "eval_logps/chosen": -463.9870300292969, + "eval_logps/rejected": -540.6599731445312, + "eval_loss": 0.505189836025238, + "eval_rewards/accuracies": 0.7329999804496765, + "eval_rewards/chosen": -1.9549702405929565, + "eval_rewards/margins": 0.9658184051513672, + "eval_rewards/rejected": -2.920788526535034, + "eval_runtime": 1180.1738, + "eval_samples_per_second": 1.695, + "eval_steps_per_second": 0.847, + "step": 5300 + }, + { + "epoch": 0.69, + "learning_rate": 1.288775244654062e-06, + "logits/chosen": -0.3898960053920746, + "logits/rejected": -0.007041037082672119, + "logps/chosen": -516.4864501953125, + "logps/rejected": -554.4014282226562, + "loss": 0.5911, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0645229816436768, + "rewards/margins": 0.8199642896652222, + "rewards/rejected": -2.8844873905181885, + "step": 5310 + }, + { + "epoch": 0.7, + "learning_rate": 1.2787957191269696e-06, + "logits/chosen": -0.3055559992790222, + "logits/rejected": 0.39844006299972534, + "logps/chosen": -468.44097900390625, + "logps/rejected": -557.8650512695312, + "loss": 0.5665, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0604255199432373, + "rewards/margins": 0.7497578859329224, + "rewards/rejected": -2.81018328666687, + "step": 5320 + }, + { + "epoch": 0.7, + "learning_rate": 1.2688416862844193e-06, + "logits/chosen": -0.3095241189002991, + "logits/rejected": 0.23874731361865997, + "logps/chosen": -409.28021240234375, + "logps/rejected": -558.3905029296875, + "loss": 0.4496, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7695989608764648, + "rewards/margins": 1.3319097757339478, + "rewards/rejected": -3.101508378982544, + "step": 5330 + }, + { + "epoch": 0.7, + "learning_rate": 1.2589133539172193e-06, + "logits/chosen": -0.9595244526863098, + "logits/rejected": 0.1507810652256012, + "logps/chosen": -486.26629638671875, + "logps/rejected": -545.3350830078125, + "loss": 0.432, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7377183437347412, + "rewards/margins": 1.0882227420806885, + "rewards/rejected": -2.8259410858154297, + "step": 5340 + }, + { + "epoch": 0.7, + "learning_rate": 1.249010929279672e-06, + "logits/chosen": -0.8987747430801392, + "logits/rejected": -0.06229814141988754, + "logps/chosen": -492.7890625, + "logps/rejected": -579.2696533203125, + "loss": 0.5204, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0684757232666016, + "rewards/margins": 0.983476996421814, + "rewards/rejected": -3.051952600479126, + "step": 5350 + }, + { + "epoch": 0.7, + "learning_rate": 1.2391346190852603e-06, + "logits/chosen": -0.8224785923957825, + "logits/rejected": 0.3233809769153595, + "logps/chosen": -468.3160095214844, + "logps/rejected": -521.6168823242188, + "loss": 0.6492, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1017160415649414, + "rewards/margins": 0.6927504539489746, + "rewards/rejected": -2.794466495513916, + "step": 5360 + }, + { + "epoch": 0.7, + "learning_rate": 1.2292846295023222e-06, + "logits/chosen": -0.5089510679244995, + "logits/rejected": 0.2342415750026703, + "logps/chosen": -515.0882568359375, + "logps/rejected": -560.9594116210938, + "loss": 0.5775, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1873621940612793, + "rewards/margins": 0.6423591375350952, + "rewards/rejected": -2.829720973968506, + "step": 5370 + }, + { + "epoch": 0.7, + "learning_rate": 1.2194611661497576e-06, + "logits/chosen": -0.40925416350364685, + "logits/rejected": 0.3876747190952301, + "logps/chosen": -480.6631774902344, + "logps/rejected": -569.6943969726562, + "loss": 0.4608, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.132833242416382, + "rewards/margins": 1.046400547027588, + "rewards/rejected": -3.1792335510253906, + "step": 5380 + }, + { + "epoch": 0.71, + "learning_rate": 1.2096644340927247e-06, + "logits/chosen": -0.4503072202205658, + "logits/rejected": -0.0540265329182148, + "logps/chosen": -478.57281494140625, + "logps/rejected": -541.1819458007812, + "loss": 0.5218, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.94533371925354, + "rewards/margins": 0.8256410360336304, + "rewards/rejected": -2.770974636077881, + "step": 5390 + }, + { + "epoch": 0.71, + "learning_rate": 1.19989463783837e-06, + "logits/chosen": -1.0149070024490356, + "logits/rejected": 0.19897064566612244, + "logps/chosen": -478.80670166015625, + "logps/rejected": -576.4320068359375, + "loss": 0.4521, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8361622095108032, + "rewards/margins": 1.1377449035644531, + "rewards/rejected": -2.973906993865967, + "step": 5400 + }, + { + "epoch": 0.71, + "eval_logits/chosen": 1.1075793504714966, + "eval_logits/rejected": 1.8436723947525024, + "eval_logps/chosen": -467.6123962402344, + "eval_logps/rejected": -543.2982177734375, + "eval_loss": 0.5038532018661499, + "eval_rewards/accuracies": 0.7369999885559082, + "eval_rewards/chosen": -1.9912235736846924, + "eval_rewards/margins": 0.9559467434883118, + "eval_rewards/rejected": -2.9471704959869385, + "eval_runtime": 1197.7441, + "eval_samples_per_second": 1.67, + "eval_steps_per_second": 0.835, + "step": 5400 + }, + { + "epoch": 0.71, + "learning_rate": 1.1901519813315495e-06, + "logits/chosen": -0.39034947752952576, + "logits/rejected": 0.27100151777267456, + "logps/chosen": -449.6952209472656, + "logps/rejected": -528.5842895507812, + "loss": 0.4638, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9907958507537842, + "rewards/margins": 1.0050820112228394, + "rewards/rejected": -2.995877742767334, + "step": 5410 + }, + { + "epoch": 0.71, + "learning_rate": 1.1804366679505798e-06, + "logits/chosen": -0.8501359820365906, + "logits/rejected": 0.513168215751648, + "logps/chosen": -513.7134399414062, + "logps/rejected": -545.6766967773438, + "loss": 0.5751, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.186096668243408, + "rewards/margins": 0.8673110008239746, + "rewards/rejected": -3.053407907485962, + "step": 5420 + }, + { + "epoch": 0.71, + "learning_rate": 1.1707489005029877e-06, + "logits/chosen": -0.03637596219778061, + "logits/rejected": 0.2914156913757324, + "logps/chosen": -467.412353515625, + "logps/rejected": -572.0277099609375, + "loss": 0.4408, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0735740661621094, + "rewards/margins": 1.2480766773223877, + "rewards/rejected": -3.321650743484497, + "step": 5430 + }, + { + "epoch": 0.71, + "learning_rate": 1.1610888812212749e-06, + "logits/chosen": -0.569449245929718, + "logits/rejected": 0.4190862774848938, + "logps/chosen": -465.05218505859375, + "logps/rejected": -561.7803955078125, + "loss": 0.4018, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.994209885597229, + "rewards/margins": 1.1598801612854004, + "rewards/rejected": -3.1540896892547607, + "step": 5440 + }, + { + "epoch": 0.71, + "learning_rate": 1.1514568117587035e-06, + "logits/chosen": -0.14006878435611725, + "logits/rejected": -0.16314168274402618, + "logps/chosen": -490.553955078125, + "logps/rejected": -563.6068115234375, + "loss": 0.5097, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2165277004241943, + "rewards/margins": 0.8935906291007996, + "rewards/rejected": -3.1101181507110596, + "step": 5450 + }, + { + "epoch": 0.71, + "learning_rate": 1.1418528931850781e-06, + "logits/chosen": -0.2969356179237366, + "logits/rejected": 0.7354004979133606, + "logps/chosen": -481.17120361328125, + "logps/rejected": -559.3109130859375, + "loss": 0.4667, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0389533042907715, + "rewards/margins": 1.261207103729248, + "rewards/rejected": -3.3001601696014404, + "step": 5460 + }, + { + "epoch": 0.72, + "learning_rate": 1.1322773259825563e-06, + "logits/chosen": -0.7932473421096802, + "logits/rejected": 0.7015919089317322, + "logps/chosen": -505.8838806152344, + "logps/rejected": -538.8965454101562, + "loss": 0.4926, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2662813663482666, + "rewards/margins": 1.0218729972839355, + "rewards/rejected": -3.288154125213623, + "step": 5470 + }, + { + "epoch": 0.72, + "learning_rate": 1.1227303100414552e-06, + "logits/chosen": -0.3481768071651459, + "logits/rejected": 0.19559481739997864, + "logps/chosen": -453.82781982421875, + "logps/rejected": -582.0299682617188, + "loss": 0.4953, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.3037056922912598, + "rewards/margins": 1.0858337879180908, + "rewards/rejected": -3.3895392417907715, + "step": 5480 + }, + { + "epoch": 0.72, + "learning_rate": 1.113212044656087e-06, + "logits/chosen": -0.41049233078956604, + "logits/rejected": 0.1941242218017578, + "logps/chosen": -451.33038330078125, + "logps/rejected": -546.5862426757812, + "loss": 0.542, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.117616653442383, + "rewards/margins": 0.8589200973510742, + "rewards/rejected": -2.976536750793457, + "step": 5490 + }, + { + "epoch": 0.72, + "learning_rate": 1.1037227285205951e-06, + "logits/chosen": 0.08584056794643402, + "logits/rejected": -0.11161471903324127, + "logps/chosen": -493.79400634765625, + "logps/rejected": -588.3775634765625, + "loss": 0.5869, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.3220810890197754, + "rewards/margins": 0.9301006197929382, + "rewards/rejected": -3.2521815299987793, + "step": 5500 + }, + { + "epoch": 0.72, + "eval_logits/chosen": 1.1574229001998901, + "eval_logits/rejected": 1.886451244354248, + "eval_logps/chosen": -485.5281066894531, + "eval_logps/rejected": -564.9520874023438, + "eval_loss": 0.5054470896720886, + "eval_rewards/accuracies": 0.7360000014305115, + "eval_rewards/chosen": -2.1703803539276123, + "eval_rewards/margins": 0.9933285117149353, + "eval_rewards/rejected": -3.1637091636657715, + "eval_runtime": 1194.4996, + "eval_samples_per_second": 1.674, + "eval_steps_per_second": 0.837, + "step": 5500 + }, + { + "epoch": 0.72, + "learning_rate": 1.0942625597248028e-06, + "logits/chosen": -0.8230582475662231, + "logits/rejected": 0.5852819681167603, + "logps/chosen": -475.0633239746094, + "logps/rejected": -559.2849731445312, + "loss": 0.5008, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2042269706726074, + "rewards/margins": 1.2232129573822021, + "rewards/rejected": -3.4274401664733887, + "step": 5510 + }, + { + "epoch": 0.72, + "learning_rate": 1.0848317357500854e-06, + "logits/chosen": -0.4844978451728821, + "logits/rejected": 0.25016266107559204, + "logps/chosen": -517.09814453125, + "logps/rejected": -525.1922607421875, + "loss": 0.5246, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.2888691425323486, + "rewards/margins": 0.7657750248908997, + "rewards/rejected": -3.0546441078186035, + "step": 5520 + }, + { + "epoch": 0.72, + "learning_rate": 1.0754304534652404e-06, + "logits/chosen": -0.05556102842092514, + "logits/rejected": -0.6705011129379272, + "logps/chosen": -480.46502685546875, + "logps/rejected": -580.5036010742188, + "loss": 0.5881, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.2701926231384277, + "rewards/margins": 0.6363264322280884, + "rewards/rejected": -2.9065189361572266, + "step": 5530 + }, + { + "epoch": 0.72, + "learning_rate": 1.0660589091223854e-06, + "logits/chosen": -0.6355140209197998, + "logits/rejected": -0.15657804906368256, + "logps/chosen": -415.057373046875, + "logps/rejected": -514.7366943359375, + "loss": 0.4432, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.001784563064575, + "rewards/margins": 1.1080721616744995, + "rewards/rejected": -3.1098568439483643, + "step": 5540 + }, + { + "epoch": 0.73, + "learning_rate": 1.0567172983528534e-06, + "logits/chosen": -0.4708939492702484, + "logits/rejected": 0.3893592357635498, + "logps/chosen": -413.32177734375, + "logps/rejected": -507.1968688964844, + "loss": 0.4412, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0215418338775635, + "rewards/margins": 0.9789319038391113, + "rewards/rejected": -3.000473976135254, + "step": 5550 + }, + { + "epoch": 0.73, + "learning_rate": 1.0474058161631168e-06, + "logits/chosen": -0.2970888018608093, + "logits/rejected": -0.11767357587814331, + "logps/chosen": -529.619140625, + "logps/rejected": -596.6094970703125, + "loss": 0.5635, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1467816829681396, + "rewards/margins": 0.8911802172660828, + "rewards/rejected": -3.037961721420288, + "step": 5560 + }, + { + "epoch": 0.73, + "learning_rate": 1.0381246569307077e-06, + "logits/chosen": -0.535216212272644, + "logits/rejected": 0.25071626901626587, + "logps/chosen": -521.4472045898438, + "logps/rejected": -551.7024536132812, + "loss": 0.6022, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2295970916748047, + "rewards/margins": 0.6778484582901001, + "rewards/rejected": -2.9074459075927734, + "step": 5570 + }, + { + "epoch": 0.73, + "learning_rate": 1.0288740144001722e-06, + "logits/chosen": -1.0160603523254395, + "logits/rejected": 0.10044028609991074, + "logps/chosen": -462.79730224609375, + "logps/rejected": -512.1188354492188, + "loss": 0.57, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9899721145629883, + "rewards/margins": 0.9382216334342957, + "rewards/rejected": -2.928194046020508, + "step": 5580 + }, + { + "epoch": 0.73, + "learning_rate": 1.0196540816790127e-06, + "logits/chosen": -0.79753178358078, + "logits/rejected": 0.11815383285284042, + "logps/chosen": -427.5205078125, + "logps/rejected": -464.82562255859375, + "loss": 0.5304, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.933569312095642, + "rewards/margins": 0.8004889488220215, + "rewards/rejected": -2.7340588569641113, + "step": 5590 + }, + { + "epoch": 0.73, + "learning_rate": 1.0104650512336679e-06, + "logits/chosen": -0.9227102994918823, + "logits/rejected": 0.0573890320956707, + "logps/chosen": -458.4912109375, + "logps/rejected": -493.012939453125, + "loss": 0.5924, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8620121479034424, + "rewards/margins": 0.8347771763801575, + "rewards/rejected": -2.696789264678955, + "step": 5600 + }, + { + "epoch": 0.73, + "eval_logits/chosen": 0.8215131759643555, + "eval_logits/rejected": 1.5324963331222534, + "eval_logps/chosen": -450.29345703125, + "eval_logps/rejected": -527.0138549804688, + "eval_loss": 0.5064495801925659, + "eval_rewards/accuracies": 0.7319999933242798, + "eval_rewards/chosen": -1.8180345296859741, + "eval_rewards/margins": 0.9662933349609375, + "eval_rewards/rejected": -2.784327745437622, + "eval_runtime": 1192.2692, + "eval_samples_per_second": 1.677, + "eval_steps_per_second": 0.839, + "step": 5600 + }, + { + "epoch": 0.73, + "learning_rate": 1.0013071148854861e-06, + "logits/chosen": -0.46502774953842163, + "logits/rejected": -0.10884647071361542, + "logps/chosen": -406.987060546875, + "logps/rejected": -543.9107055664062, + "loss": 0.4149, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.725001573562622, + "rewards/margins": 1.3317546844482422, + "rewards/rejected": -3.0567562580108643, + "step": 5610 + }, + { + "epoch": 0.74, + "learning_rate": 9.921804638067292e-07, + "logits/chosen": -0.9193164110183716, + "logits/rejected": 0.3256033957004547, + "logps/chosen": -473.353759765625, + "logps/rejected": -545.8274536132812, + "loss": 0.4718, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0329298973083496, + "rewards/margins": 1.113075852394104, + "rewards/rejected": -3.146005868911743, + "step": 5620 + }, + { + "epoch": 0.74, + "learning_rate": 9.830852885165749e-07, + "logits/chosen": -0.1020597368478775, + "logits/rejected": -0.6451666355133057, + "logps/chosen": -406.77081298828125, + "logps/rejected": -512.661865234375, + "loss": 0.5619, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9078305959701538, + "rewards/margins": 0.6681450605392456, + "rewards/rejected": -2.5759758949279785, + "step": 5630 + }, + { + "epoch": 0.74, + "learning_rate": 9.740217788771453e-07, + "logits/chosen": -0.8085994720458984, + "logits/rejected": 0.037676215171813965, + "logps/chosen": -455.20208740234375, + "logps/rejected": -509.16680908203125, + "loss": 0.4926, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7756178379058838, + "rewards/margins": 0.8752728700637817, + "rewards/rejected": -2.650890588760376, + "step": 5640 + }, + { + "epoch": 0.74, + "learning_rate": 9.649901240895374e-07, + "logits/chosen": -0.11386583000421524, + "logits/rejected": 0.1979323923587799, + "logps/chosen": -440.43438720703125, + "logps/rejected": -523.9313354492188, + "loss": 0.5482, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9702411890029907, + "rewards/margins": 0.900561511516571, + "rewards/rejected": -2.870802640914917, + "step": 5650 + }, + { + "epoch": 0.74, + "learning_rate": 9.559905126898803e-07, + "logits/chosen": -1.057208776473999, + "logits/rejected": 0.41706523299217224, + "logps/chosen": -467.60906982421875, + "logps/rejected": -559.2456665039062, + "loss": 0.3784, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.945220947265625, + "rewards/margins": 1.2823781967163086, + "rewards/rejected": -3.2275986671447754, + "step": 5660 + }, + { + "epoch": 0.74, + "learning_rate": 9.470231325453958e-07, + "logits/chosen": -0.6564953923225403, + "logits/rejected": 0.41940754652023315, + "logps/chosen": -467.0869140625, + "logps/rejected": -520.7767944335938, + "loss": 0.5324, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.068354368209839, + "rewards/margins": 0.8983513116836548, + "rewards/rejected": -2.966705799102783, + "step": 5670 + }, + { + "epoch": 0.74, + "learning_rate": 9.380881708504741e-07, + "logits/chosen": -0.29646626114845276, + "logits/rejected": 0.6381570100784302, + "logps/chosen": -402.63690185546875, + "logps/rejected": -467.7684020996094, + "loss": 0.4885, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7983324527740479, + "rewards/margins": 0.9997227787971497, + "rewards/rejected": -2.798055410385132, + "step": 5680 + }, + { + "epoch": 0.74, + "learning_rate": 9.291858141227733e-07, + "logits/chosen": -0.41397857666015625, + "logits/rejected": -0.3768884539604187, + "logps/chosen": -433.42816162109375, + "logps/rejected": -586.0889892578125, + "loss": 0.4259, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8530666828155518, + "rewards/margins": 1.3527576923370361, + "rewards/rejected": -3.205824375152588, + "step": 5690 + }, + { + "epoch": 0.75, + "learning_rate": 9.203162481993175e-07, + "logits/chosen": -0.9961411356925964, + "logits/rejected": -0.5173764228820801, + "logps/chosen": -484.4798278808594, + "logps/rejected": -600.5260009765625, + "loss": 0.4275, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.856370210647583, + "rewards/margins": 1.2594194412231445, + "rewards/rejected": -3.1157894134521484, + "step": 5700 + }, + { + "epoch": 0.75, + "eval_logits/chosen": 0.995962917804718, + "eval_logits/rejected": 1.7229074239730835, + "eval_logps/chosen": -469.19317626953125, + "eval_logps/rejected": -549.8818969726562, + "eval_loss": 0.5055263042449951, + "eval_rewards/accuracies": 0.734000027179718, + "eval_rewards/chosen": -2.0070316791534424, + "eval_rewards/margins": 1.0059758424758911, + "eval_rewards/rejected": -3.013007402420044, + "eval_runtime": 1227.9162, + "eval_samples_per_second": 1.629, + "eval_steps_per_second": 0.814, + "step": 5700 + }, + { + "epoch": 0.75, + "learning_rate": 9.114796582326255e-07, + "logits/chosen": -1.0405280590057373, + "logits/rejected": 0.4770817756652832, + "logps/chosen": -472.912109375, + "logps/rejected": -529.1992797851562, + "loss": 0.5751, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.202134609222412, + "rewards/margins": 0.855462908744812, + "rewards/rejected": -3.0575973987579346, + "step": 5710 + }, + { + "epoch": 0.75, + "learning_rate": 9.026762286868373e-07, + "logits/chosen": -0.797855794429779, + "logits/rejected": -0.14750805497169495, + "logps/chosen": -454.8798828125, + "logps/rejected": -599.2950439453125, + "loss": 0.418, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9340975284576416, + "rewards/margins": 1.3620237112045288, + "rewards/rejected": -3.2961208820343018, + "step": 5720 + }, + { + "epoch": 0.75, + "learning_rate": 8.939061433338722e-07, + "logits/chosen": -0.7406786680221558, + "logits/rejected": -0.19209416210651398, + "logps/chosen": -473.05560302734375, + "logps/rejected": -562.7134399414062, + "loss": 0.5286, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0807507038116455, + "rewards/margins": 0.8903197050094604, + "rewards/rejected": -2.9710705280303955, + "step": 5730 + }, + { + "epoch": 0.75, + "learning_rate": 8.851695852495867e-07, + "logits/chosen": -0.4927915632724762, + "logits/rejected": -0.5158900618553162, + "logps/chosen": -431.3685607910156, + "logps/rejected": -555.056884765625, + "loss": 0.5171, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0437633991241455, + "rewards/margins": 1.1984370946884155, + "rewards/rejected": -3.2422003746032715, + "step": 5740 + }, + { + "epoch": 0.75, + "learning_rate": 8.764667368099525e-07, + "logits/chosen": -0.2646760046482086, + "logits/rejected": 0.0725071132183075, + "logps/chosen": -458.35369873046875, + "logps/rejected": -541.1822509765625, + "loss": 0.5088, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1217246055603027, + "rewards/margins": 1.0628745555877686, + "rewards/rejected": -3.184598922729492, + "step": 5750 + }, + { + "epoch": 0.75, + "learning_rate": 8.677977796872541e-07, + "logits/chosen": -0.7068579792976379, + "logits/rejected": 0.5747413039207458, + "logps/chosen": -504.84881591796875, + "logps/rejected": -543.8521728515625, + "loss": 0.5338, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.211315870285034, + "rewards/margins": 1.0215401649475098, + "rewards/rejected": -3.232856273651123, + "step": 5760 + }, + { + "epoch": 0.76, + "learning_rate": 8.591628948462913e-07, + "logits/chosen": -0.13534298539161682, + "logits/rejected": 0.21535761654376984, + "logps/chosen": -481.2478942871094, + "logps/rejected": -595.2032470703125, + "loss": 0.4979, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.09712553024292, + "rewards/margins": 1.1030064821243286, + "rewards/rejected": -3.200131893157959, + "step": 5770 + }, + { + "epoch": 0.76, + "learning_rate": 8.505622625406054e-07, + "logits/chosen": -0.13635878264904022, + "logits/rejected": -0.033559300005435944, + "logps/chosen": -466.84637451171875, + "logps/rejected": -594.5919189453125, + "loss": 0.4774, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.1044180393218994, + "rewards/margins": 1.287061095237732, + "rewards/rejected": -3.391479015350342, + "step": 5780 + }, + { + "epoch": 0.76, + "learning_rate": 8.419960623087129e-07, + "logits/chosen": 0.0363604798913002, + "logits/rejected": 0.037777043879032135, + "logps/chosen": -419.9930114746094, + "logps/rejected": -560.1504516601562, + "loss": 0.5144, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1091957092285156, + "rewards/margins": 1.1116769313812256, + "rewards/rejected": -3.220872402191162, + "step": 5790 + }, + { + "epoch": 0.76, + "learning_rate": 8.334644729703617e-07, + "logits/chosen": -0.32428237795829773, + "logits/rejected": -0.13345181941986084, + "logps/chosen": -442.7701721191406, + "logps/rejected": -567.7952270507812, + "loss": 0.4746, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.127009391784668, + "rewards/margins": 1.1997586488723755, + "rewards/rejected": -3.326767683029175, + "step": 5800 + }, + { + "epoch": 0.76, + "eval_logits/chosen": 1.1167664527893066, + "eval_logits/rejected": 1.8506929874420166, + "eval_logps/chosen": -489.1824951171875, + "eval_logps/rejected": -573.2806396484375, + "eval_loss": 0.5072213411331177, + "eval_rewards/accuracies": 0.7300000190734863, + "eval_rewards/chosen": -2.2069249153137207, + "eval_rewards/margins": 1.0400696992874146, + "eval_rewards/rejected": -3.2469944953918457, + "eval_runtime": 1256.1768, + "eval_samples_per_second": 1.592, + "eval_steps_per_second": 0.796, + "step": 5800 + }, + { + "epoch": 0.76, + "learning_rate": 8.249676726227931e-07, + "logits/chosen": -0.2723647356033325, + "logits/rejected": 0.3790988326072693, + "logps/chosen": -546.9454345703125, + "logps/rejected": -591.0533447265625, + "loss": 0.5515, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.456743001937866, + "rewards/margins": 0.8524373173713684, + "rewards/rejected": -3.3091800212860107, + "step": 5810 + }, + { + "epoch": 0.76, + "learning_rate": 8.165058386370314e-07, + "logits/chosen": -0.2161664515733719, + "logits/rejected": 0.19322898983955383, + "logps/chosen": -473.7935485839844, + "logps/rejected": -592.0934448242188, + "loss": 0.488, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.100954055786133, + "rewards/margins": 1.0307704210281372, + "rewards/rejected": -3.1317245960235596, + "step": 5820 + }, + { + "epoch": 0.76, + "learning_rate": 8.080791476541721e-07, + "logits/chosen": -0.2463117390871048, + "logits/rejected": 0.10424462705850601, + "logps/chosen": -435.74786376953125, + "logps/rejected": -540.4898681640625, + "loss": 0.4526, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9524482488632202, + "rewards/margins": 1.2612793445587158, + "rewards/rejected": -3.2137274742126465, + "step": 5830 + }, + { + "epoch": 0.76, + "learning_rate": 7.996877755817026e-07, + "logits/chosen": -0.7921704053878784, + "logits/rejected": 0.26169005036354065, + "logps/chosen": -465.552490234375, + "logps/rejected": -492.36956787109375, + "loss": 0.5762, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.133111000061035, + "rewards/margins": 0.5900529623031616, + "rewards/rejected": -2.7231638431549072, + "step": 5840 + }, + { + "epoch": 0.77, + "learning_rate": 7.913318975898238e-07, + "logits/chosen": -0.8258234858512878, + "logits/rejected": 0.3698134422302246, + "logps/chosen": -555.0818481445312, + "logps/rejected": -597.4561767578125, + "loss": 0.5686, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.23042631149292, + "rewards/margins": 1.0009479522705078, + "rewards/rejected": -3.2313742637634277, + "step": 5850 + }, + { + "epoch": 0.77, + "learning_rate": 7.830116881077992e-07, + "logits/chosen": -0.42663684487342834, + "logits/rejected": 0.7298521399497986, + "logps/chosen": -476.6568298339844, + "logps/rejected": -559.68310546875, + "loss": 0.4522, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9448516368865967, + "rewards/margins": 1.070634126663208, + "rewards/rejected": -3.015486001968384, + "step": 5860 + }, + { + "epoch": 0.77, + "learning_rate": 7.747273208203096e-07, + "logits/chosen": -0.327279657125473, + "logits/rejected": 0.10556366294622421, + "logps/chosen": -494.0747985839844, + "logps/rejected": -612.4318237304688, + "loss": 0.498, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2703728675842285, + "rewards/margins": 1.0807685852050781, + "rewards/rejected": -3.3511414527893066, + "step": 5870 + }, + { + "epoch": 0.77, + "learning_rate": 7.664789686638272e-07, + "logits/chosen": -0.753741443157196, + "logits/rejected": 0.26916906237602234, + "logps/chosen": -444.783447265625, + "logps/rejected": -573.6693115234375, + "loss": 0.4559, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.905149221420288, + "rewards/margins": 1.2002140283584595, + "rewards/rejected": -3.105363130569458, + "step": 5880 + }, + { + "epoch": 0.77, + "learning_rate": 7.582668038230089e-07, + "logits/chosen": -0.6280697584152222, + "logits/rejected": 0.23410716652870178, + "logps/chosen": -483.71844482421875, + "logps/rejected": -548.9215087890625, + "loss": 0.6371, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.051710367202759, + "rewards/margins": 0.8779205083847046, + "rewards/rejected": -2.929630756378174, + "step": 5890 + }, + { + "epoch": 0.77, + "learning_rate": 7.500909977271007e-07, + "logits/chosen": -0.55290687084198, + "logits/rejected": -0.04884564131498337, + "logps/chosen": -483.20465087890625, + "logps/rejected": -569.9694213867188, + "loss": 0.5033, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9960193634033203, + "rewards/margins": 1.1004588603973389, + "rewards/rejected": -3.096478223800659, + "step": 5900 + }, + { + "epoch": 0.77, + "eval_logits/chosen": 0.9675183892250061, + "eval_logits/rejected": 1.7070553302764893, + "eval_logps/chosen": -458.106201171875, + "eval_logps/rejected": -536.0161743164062, + "eval_loss": 0.5060694217681885, + "eval_rewards/accuracies": 0.7275000214576721, + "eval_rewards/chosen": -1.896161675453186, + "eval_rewards/margins": 0.9781885147094727, + "eval_rewards/rejected": -2.874350070953369, + "eval_runtime": 1176.3994, + "eval_samples_per_second": 1.7, + "eval_steps_per_second": 0.85, + "step": 5900 + }, + { + "epoch": 0.77, + "learning_rate": 7.41951721046357e-07, + "logits/chosen": -0.6494289040565491, + "logits/rejected": 0.32405179738998413, + "logps/chosen": -428.957275390625, + "logps/rejected": -514.9093017578125, + "loss": 0.4849, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.763302206993103, + "rewards/margins": 0.9204890131950378, + "rewards/rejected": -2.683791399002075, + "step": 5910 + }, + { + "epoch": 0.77, + "learning_rate": 7.338491436884787e-07, + "logits/chosen": -0.2216249257326126, + "logits/rejected": -0.0639793649315834, + "logps/chosen": -431.1812438964844, + "logps/rejected": -554.4732666015625, + "loss": 0.4382, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.016435146331787, + "rewards/margins": 1.169579267501831, + "rewards/rejected": -3.1860146522521973, + "step": 5920 + }, + { + "epoch": 0.78, + "learning_rate": 7.257834347950693e-07, + "logits/chosen": -0.5539714097976685, + "logits/rejected": 0.5835639238357544, + "logps/chosen": -446.4956970214844, + "logps/rejected": -492.2032165527344, + "loss": 0.5289, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8876628875732422, + "rewards/margins": 0.7628009915351868, + "rewards/rejected": -2.650463819503784, + "step": 5930 + }, + { + "epoch": 0.78, + "learning_rate": 7.177547627380987e-07, + "logits/chosen": -0.4449450373649597, + "logits/rejected": -0.001817166805267334, + "logps/chosen": -462.5804748535156, + "logps/rejected": -559.3094482421875, + "loss": 0.4137, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7668288946151733, + "rewards/margins": 1.125380277633667, + "rewards/rejected": -2.89220929145813, + "step": 5940 + }, + { + "epoch": 0.78, + "learning_rate": 7.097632951163949e-07, + "logits/chosen": -0.49515801668167114, + "logits/rejected": 0.028278637677431107, + "logps/chosen": -472.94647216796875, + "logps/rejected": -543.9949340820312, + "loss": 0.5109, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7570574283599854, + "rewards/margins": 1.0789839029312134, + "rewards/rejected": -2.8360414505004883, + "step": 5950 + }, + { + "epoch": 0.78, + "learning_rate": 7.018091987521386e-07, + "logits/chosen": -0.8645265698432922, + "logits/rejected": 0.3468485176563263, + "logps/chosen": -471.080810546875, + "logps/rejected": -537.1405029296875, + "loss": 0.5746, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.022472381591797, + "rewards/margins": 0.8672344088554382, + "rewards/rejected": -2.889706611633301, + "step": 5960 + }, + { + "epoch": 0.78, + "learning_rate": 6.93892639687386e-07, + "logits/chosen": -0.655937671661377, + "logits/rejected": -0.05244150012731552, + "logps/chosen": -465.4962463378906, + "logps/rejected": -507.5498046875, + "loss": 0.5212, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7037566900253296, + "rewards/margins": 0.9643377065658569, + "rewards/rejected": -2.6680946350097656, + "step": 5970 + }, + { + "epoch": 0.78, + "learning_rate": 6.860137831806018e-07, + "logits/chosen": -0.5556502938270569, + "logits/rejected": -0.3047857880592346, + "logps/chosen": -465.64697265625, + "logps/rejected": -526.3079223632812, + "loss": 0.5657, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8184341192245483, + "rewards/margins": 0.9852198362350464, + "rewards/rejected": -2.8036537170410156, + "step": 5980 + }, + { + "epoch": 0.78, + "learning_rate": 6.781727937032054e-07, + "logits/chosen": -0.3701472878456116, + "logits/rejected": 0.17485050857067108, + "logps/chosen": -410.296630859375, + "logps/rejected": -538.2269897460938, + "loss": 0.3979, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5407458543777466, + "rewards/margins": 1.3232797384262085, + "rewards/rejected": -2.864025354385376, + "step": 5990 + }, + { + "epoch": 0.79, + "learning_rate": 6.703698349361437e-07, + "logits/chosen": -0.6748332977294922, + "logits/rejected": 0.417569637298584, + "logps/chosen": -427.6536560058594, + "logps/rejected": -490.7718200683594, + "loss": 0.4517, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7474530935287476, + "rewards/margins": 0.9970604777336121, + "rewards/rejected": -2.744513988494873, + "step": 6000 + }, + { + "epoch": 0.79, + "eval_logits/chosen": 0.8156449198722839, + "eval_logits/rejected": 1.5612982511520386, + "eval_logps/chosen": -441.7278747558594, + "eval_logps/rejected": -516.7131958007812, + "eval_loss": 0.5105239152908325, + "eval_rewards/accuracies": 0.7264999747276306, + "eval_rewards/chosen": -1.7323784828186035, + "eval_rewards/margins": 0.9489423632621765, + "eval_rewards/rejected": -2.6813206672668457, + "eval_runtime": 1173.428, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 6000 + }, + { + "epoch": 0.79, + "learning_rate": 6.626050697664682e-07, + "logits/chosen": -0.5815194249153137, + "logits/rejected": 0.18539538979530334, + "logps/chosen": -448.04583740234375, + "logps/rejected": -505.4095153808594, + "loss": 0.4312, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7882267236709595, + "rewards/margins": 1.0168880224227905, + "rewards/rejected": -2.80511474609375, + "step": 6010 + }, + { + "epoch": 0.79, + "learning_rate": 6.548786602839404e-07, + "logits/chosen": -0.5067921876907349, + "logits/rejected": 0.06481216102838516, + "logps/chosen": -400.51556396484375, + "logps/rejected": -506.417236328125, + "loss": 0.4459, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6396009922027588, + "rewards/margins": 1.3462693691253662, + "rewards/rejected": -2.985870361328125, + "step": 6020 + }, + { + "epoch": 0.79, + "learning_rate": 6.471907677776426e-07, + "logits/chosen": -0.8723716735839844, + "logits/rejected": 0.1587451696395874, + "logps/chosen": -461.92376708984375, + "logps/rejected": -495.30029296875, + "loss": 0.5527, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7867259979248047, + "rewards/margins": 0.7085933685302734, + "rewards/rejected": -2.495319366455078, + "step": 6030 + }, + { + "epoch": 0.79, + "learning_rate": 6.39541552732617e-07, + "logits/chosen": -0.41061049699783325, + "logits/rejected": -0.26837730407714844, + "logps/chosen": -455.27044677734375, + "logps/rejected": -573.69091796875, + "loss": 0.5308, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.901277780532837, + "rewards/margins": 0.9423490762710571, + "rewards/rejected": -2.8436269760131836, + "step": 6040 + }, + { + "epoch": 0.79, + "learning_rate": 6.319311748265086e-07, + "logits/chosen": -0.3957150876522064, + "logits/rejected": 0.8088628053665161, + "logps/chosen": -577.0421752929688, + "logps/rejected": -617.0074462890625, + "loss": 0.5536, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2539684772491455, + "rewards/margins": 1.012986660003662, + "rewards/rejected": -3.2669551372528076, + "step": 6050 + }, + { + "epoch": 0.79, + "learning_rate": 6.243597929262404e-07, + "logits/chosen": -0.17666508257389069, + "logits/rejected": 0.17739292979240417, + "logps/chosen": -379.2519836425781, + "logps/rejected": -559.8756103515625, + "loss": 0.492, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7480781078338623, + "rewards/margins": 1.3037656545639038, + "rewards/rejected": -3.0518438816070557, + "step": 6060 + }, + { + "epoch": 0.79, + "learning_rate": 6.168275650846875e-07, + "logits/chosen": -0.7394101619720459, + "logits/rejected": -0.0025915740989148617, + "logps/chosen": -452.69830322265625, + "logps/rejected": -493.15216064453125, + "loss": 0.5109, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5727591514587402, + "rewards/margins": 0.9079796671867371, + "rewards/rejected": -2.480738878250122, + "step": 6070 + }, + { + "epoch": 0.8, + "learning_rate": 6.093346485373863e-07, + "logits/chosen": -0.5971829295158386, + "logits/rejected": 0.7547621130943298, + "logps/chosen": -476.29693603515625, + "logps/rejected": -533.7948608398438, + "loss": 0.5113, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9149093627929688, + "rewards/margins": 0.8861688375473022, + "rewards/rejected": -2.8010783195495605, + "step": 6080 + }, + { + "epoch": 0.8, + "learning_rate": 6.018811996992455e-07, + "logits/chosen": -0.6492999792098999, + "logits/rejected": 0.5133501291275024, + "logps/chosen": -446.62933349609375, + "logps/rejected": -528.8438110351562, + "loss": 0.3894, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6136525869369507, + "rewards/margins": 1.323101282119751, + "rewards/rejected": -2.936753749847412, + "step": 6090 + }, + { + "epoch": 0.8, + "learning_rate": 5.944673741612866e-07, + "logits/chosen": -0.5191225409507751, + "logits/rejected": -0.2703114449977875, + "logps/chosen": -488.6419372558594, + "logps/rejected": -571.1328125, + "loss": 0.5071, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.037747859954834, + "rewards/margins": 0.8893720507621765, + "rewards/rejected": -2.9271202087402344, + "step": 6100 + }, + { + "epoch": 0.8, + "eval_logits/chosen": 0.9369565844535828, + "eval_logits/rejected": 1.6894502639770508, + "eval_logps/chosen": -454.8271789550781, + "eval_logps/rejected": -534.7506103515625, + "eval_loss": 0.5116304159164429, + "eval_rewards/accuracies": 0.7275000214576721, + "eval_rewards/chosen": -1.8633716106414795, + "eval_rewards/margins": 0.9983232617378235, + "eval_rewards/rejected": -2.861694574356079, + "eval_runtime": 1179.4032, + "eval_samples_per_second": 1.696, + "eval_steps_per_second": 0.848, + "step": 6100 + }, + { + "epoch": 0.8, + "learning_rate": 5.870933266873916e-07, + "logits/chosen": -0.28973495960235596, + "logits/rejected": 0.016529571264982224, + "logps/chosen": -409.6407775878906, + "logps/rejected": -498.581787109375, + "loss": 0.5488, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7589409351348877, + "rewards/margins": 0.8432859182357788, + "rewards/rejected": -2.602226734161377, + "step": 6110 + }, + { + "epoch": 0.8, + "learning_rate": 5.797592112110734e-07, + "logits/chosen": -0.19786319136619568, + "logits/rejected": 0.2796550989151001, + "logps/chosen": -398.9677734375, + "logps/rejected": -460.76220703125, + "loss": 0.5853, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8883498907089233, + "rewards/margins": 0.7973678708076477, + "rewards/rejected": -2.685717821121216, + "step": 6120 + }, + { + "epoch": 0.8, + "learning_rate": 5.724651808322645e-07, + "logits/chosen": -0.3768353760242462, + "logits/rejected": 0.02244797721505165, + "logps/chosen": -426.26446533203125, + "logps/rejected": -552.0750122070312, + "loss": 0.4562, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7751553058624268, + "rewards/margins": 1.0461262464523315, + "rewards/rejected": -2.821281671524048, + "step": 6130 + }, + { + "epoch": 0.8, + "learning_rate": 5.652113878141194e-07, + "logits/chosen": -0.6332122087478638, + "logits/rejected": 0.4572317600250244, + "logps/chosen": -385.82476806640625, + "logps/rejected": -482.82281494140625, + "loss": 0.5261, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.835436463356018, + "rewards/margins": 0.9859844446182251, + "rewards/rejected": -2.821420669555664, + "step": 6140 + }, + { + "epoch": 0.8, + "learning_rate": 5.579979835798361e-07, + "logits/chosen": -0.4648275375366211, + "logits/rejected": 0.25945791602134705, + "logps/chosen": -436.0567321777344, + "logps/rejected": -553.8167114257812, + "loss": 0.4439, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9036706686019897, + "rewards/margins": 1.2614033222198486, + "rewards/rejected": -3.165074110031128, + "step": 6150 + }, + { + "epoch": 0.81, + "learning_rate": 5.508251187094932e-07, + "logits/chosen": -0.7267307043075562, + "logits/rejected": 0.3853886127471924, + "logps/chosen": -506.763427734375, + "logps/rejected": -537.0343017578125, + "loss": 0.6427, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1635117530822754, + "rewards/margins": 0.7928264737129211, + "rewards/rejected": -2.9563381671905518, + "step": 6160 + }, + { + "epoch": 0.81, + "learning_rate": 5.436929429369122e-07, + "logits/chosen": -0.4642343521118164, + "logits/rejected": 0.12719163298606873, + "logps/chosen": -440.3096618652344, + "logps/rejected": -517.3118286132812, + "loss": 0.5682, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9598861932754517, + "rewards/margins": 0.9272395968437195, + "rewards/rejected": -2.8871257305145264, + "step": 6170 + }, + { + "epoch": 0.81, + "learning_rate": 5.366016051465245e-07, + "logits/chosen": -0.5060332417488098, + "logits/rejected": 0.35478395223617554, + "logps/chosen": -435.00872802734375, + "logps/rejected": -564.3742065429688, + "loss": 0.4017, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.828107476234436, + "rewards/margins": 1.3373243808746338, + "rewards/rejected": -3.1654322147369385, + "step": 6180 + }, + { + "epoch": 0.81, + "learning_rate": 5.295512533702701e-07, + "logits/chosen": -0.12531735002994537, + "logits/rejected": 0.5097149014472961, + "logps/chosen": -410.29974365234375, + "logps/rejected": -509.9228515625, + "loss": 0.532, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8042405843734741, + "rewards/margins": 1.0363777875900269, + "rewards/rejected": -2.840618371963501, + "step": 6190 + }, + { + "epoch": 0.81, + "learning_rate": 5.225420347845023e-07, + "logits/chosen": -0.7511438131332397, + "logits/rejected": 0.019868087023496628, + "logps/chosen": -490.9335021972656, + "logps/rejected": -552.3634033203125, + "loss": 0.6455, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.0654990673065186, + "rewards/margins": 0.7708456516265869, + "rewards/rejected": -2.8363449573516846, + "step": 6200 + }, + { + "epoch": 0.81, + "eval_logits/chosen": 0.9542487263679504, + "eval_logits/rejected": 1.7120394706726074, + "eval_logps/chosen": -456.4508056640625, + "eval_logps/rejected": -536.0125732421875, + "eval_loss": 0.5110178589820862, + "eval_rewards/accuracies": 0.7250000238418579, + "eval_rewards/chosen": -1.8796085119247437, + "eval_rewards/margins": 0.994705319404602, + "eval_rewards/rejected": -2.8743135929107666, + "eval_runtime": 1204.1855, + "eval_samples_per_second": 1.661, + "eval_steps_per_second": 0.83, + "step": 6200 + }, + { + "epoch": 0.81, + "learning_rate": 5.155740957069186e-07, + "logits/chosen": -0.6087840795516968, + "logits/rejected": 0.2238648384809494, + "logps/chosen": -457.75323486328125, + "logps/rejected": -550.1719970703125, + "loss": 0.4296, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8891584873199463, + "rewards/margins": 1.2166895866394043, + "rewards/rejected": -3.1058480739593506, + "step": 6210 + }, + { + "epoch": 0.81, + "learning_rate": 5.08647581593506e-07, + "logits/chosen": -0.24337653815746307, + "logits/rejected": 0.19924645125865936, + "logps/chosen": -430.85003662109375, + "logps/rejected": -523.4949340820312, + "loss": 0.4476, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.682952880859375, + "rewards/margins": 1.0686895847320557, + "rewards/rejected": -2.7516424655914307, + "step": 6220 + }, + { + "epoch": 0.82, + "learning_rate": 5.017626370355014e-07, + "logits/chosen": -0.8117885589599609, + "logits/rejected": 0.5791851878166199, + "logps/chosen": -433.12237548828125, + "logps/rejected": -513.7730712890625, + "loss": 0.4269, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7210042476654053, + "rewards/margins": 1.2424871921539307, + "rewards/rejected": -2.963491678237915, + "step": 6230 + }, + { + "epoch": 0.82, + "learning_rate": 4.949194057563783e-07, + "logits/chosen": -0.7460896372795105, + "logits/rejected": 0.34127911925315857, + "logps/chosen": -460.87713623046875, + "logps/rejected": -520.8004150390625, + "loss": 0.4998, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9422998428344727, + "rewards/margins": 1.0103784799575806, + "rewards/rejected": -2.9526784420013428, + "step": 6240 + }, + { + "epoch": 0.82, + "learning_rate": 4.881180306088418e-07, + "logits/chosen": -0.7913476228713989, + "logits/rejected": 0.48180437088012695, + "logps/chosen": -464.13397216796875, + "logps/rejected": -523.867919921875, + "loss": 0.4899, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8360408544540405, + "rewards/margins": 1.0949831008911133, + "rewards/rejected": -2.9310240745544434, + "step": 6250 + }, + { + "epoch": 0.82, + "learning_rate": 4.813586535718512e-07, + "logits/chosen": -0.7295901775360107, + "logits/rejected": 1.1600463390350342, + "logps/chosen": -491.99139404296875, + "logps/rejected": -539.6505126953125, + "loss": 0.472, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8388845920562744, + "rewards/margins": 1.2725069522857666, + "rewards/rejected": -3.111391544342041, + "step": 6260 + }, + { + "epoch": 0.82, + "learning_rate": 4.746414157476506e-07, + "logits/chosen": -1.1962846517562866, + "logits/rejected": 0.240956112742424, + "logps/chosen": -408.2303771972656, + "logps/rejected": -522.8681640625, + "loss": 0.4429, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7451229095458984, + "rewards/margins": 1.4104244709014893, + "rewards/rejected": -3.1555473804473877, + "step": 6270 + }, + { + "epoch": 0.82, + "learning_rate": 4.679664573588294e-07, + "logits/chosen": -0.38419827818870544, + "logits/rejected": 0.4673822522163391, + "logps/chosen": -407.20782470703125, + "logps/rejected": -497.8232421875, + "loss": 0.4743, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7693662643432617, + "rewards/margins": 1.090094804763794, + "rewards/rejected": -2.8594608306884766, + "step": 6280 + }, + { + "epoch": 0.82, + "learning_rate": 4.6133391774538903e-07, + "logits/chosen": -0.8859823942184448, + "logits/rejected": 0.3551492989063263, + "logps/chosen": -481.630615234375, + "logps/rejected": -541.5256958007812, + "loss": 0.5493, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8950306177139282, + "rewards/margins": 1.0568093061447144, + "rewards/rejected": -2.9518399238586426, + "step": 6290 + }, + { + "epoch": 0.82, + "learning_rate": 4.5474393536184214e-07, + "logits/chosen": -0.8629859685897827, + "logits/rejected": 0.4089199900627136, + "logps/chosen": -455.08062744140625, + "logps/rejected": -525.6192626953125, + "loss": 0.4796, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9132658243179321, + "rewards/margins": 1.0315920114517212, + "rewards/rejected": -2.9448578357696533, + "step": 6300 + }, + { + "epoch": 0.82, + "eval_logits/chosen": 1.020259976387024, + "eval_logits/rejected": 1.7784451246261597, + "eval_logps/chosen": -460.98785400390625, + "eval_logps/rejected": -543.0519409179688, + "eval_loss": 0.5111602544784546, + "eval_rewards/accuracies": 0.7260000109672546, + "eval_rewards/chosen": -1.924978494644165, + "eval_rewards/margins": 1.019729495048523, + "eval_rewards/rejected": -2.9447081089019775, + "eval_runtime": 1194.9348, + "eval_samples_per_second": 1.674, + "eval_steps_per_second": 0.837, + "step": 6300 + }, + { + "epoch": 0.83, + "learning_rate": 4.4819664777431243e-07, + "logits/chosen": -0.18434950709342957, + "logits/rejected": 0.1741354763507843, + "logps/chosen": -412.4391174316406, + "logps/rejected": -475.8851013183594, + "loss": 0.5578, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9152721166610718, + "rewards/margins": 0.8140000104904175, + "rewards/rejected": -2.72927188873291, + "step": 6310 + }, + { + "epoch": 0.83, + "learning_rate": 4.416921916576722e-07, + "logits/chosen": -0.6540490388870239, + "logits/rejected": 0.48029765486717224, + "logps/chosen": -492.7520446777344, + "logps/rejected": -564.4111328125, + "loss": 0.5657, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.991019606590271, + "rewards/margins": 0.8219622373580933, + "rewards/rejected": -2.8129818439483643, + "step": 6320 + }, + { + "epoch": 0.83, + "learning_rate": 4.352307027926828e-07, + "logits/chosen": -0.6802606582641602, + "logits/rejected": 0.12222106754779816, + "logps/chosen": -441.7935485839844, + "logps/rejected": -548.2308349609375, + "loss": 0.4019, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7764034271240234, + "rewards/margins": 1.371129035949707, + "rewards/rejected": -3.1475327014923096, + "step": 6330 + }, + { + "epoch": 0.83, + "learning_rate": 4.288123160631624e-07, + "logits/chosen": 0.02034485712647438, + "logits/rejected": 0.16564354300498962, + "logps/chosen": -435.69488525390625, + "logps/rejected": -519.12060546875, + "loss": 0.5531, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9254554510116577, + "rewards/margins": 0.9142245054244995, + "rewards/rejected": -2.839679718017578, + "step": 6340 + }, + { + "epoch": 0.83, + "learning_rate": 4.224371654531731e-07, + "logits/chosen": -0.42781931161880493, + "logits/rejected": 0.401696115732193, + "logps/chosen": -449.87701416015625, + "logps/rejected": -493.78302001953125, + "loss": 0.5935, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9719626903533936, + "rewards/margins": 0.7792240381240845, + "rewards/rejected": -2.7511868476867676, + "step": 6350 + }, + { + "epoch": 0.83, + "learning_rate": 4.1610538404421837e-07, + "logits/chosen": -0.12257415056228638, + "logits/rejected": -0.3362657427787781, + "logps/chosen": -416.6104431152344, + "logps/rejected": -562.0303344726562, + "loss": 0.4317, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.695469617843628, + "rewards/margins": 1.2062984704971313, + "rewards/rejected": -2.901768207550049, + "step": 6360 + }, + { + "epoch": 0.83, + "learning_rate": 4.098171040124699e-07, + "logits/chosen": -0.9566270112991333, + "logits/rejected": 0.6268806457519531, + "logps/chosen": -514.7496337890625, + "logps/rejected": -540.041748046875, + "loss": 0.6099, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.057263135910034, + "rewards/margins": 0.8319833874702454, + "rewards/rejected": -2.889246702194214, + "step": 6370 + }, + { + "epoch": 0.83, + "learning_rate": 4.03572456626006e-07, + "logits/chosen": -0.10828417539596558, + "logits/rejected": -0.13383126258850098, + "logps/chosen": -456.984130859375, + "logps/rejected": -528.4546508789062, + "loss": 0.4909, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.877745270729065, + "rewards/margins": 0.9030615091323853, + "rewards/rejected": -2.7808070182800293, + "step": 6380 + }, + { + "epoch": 0.84, + "learning_rate": 3.9737157224207265e-07, + "logits/chosen": -0.623379647731781, + "logits/rejected": 0.0637415200471878, + "logps/chosen": -429.89349365234375, + "logps/rejected": -523.8753051757812, + "loss": 0.5763, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.002406597137451, + "rewards/margins": 0.9444215893745422, + "rewards/rejected": -2.9468283653259277, + "step": 6390 + }, + { + "epoch": 0.84, + "learning_rate": 3.912145803043596e-07, + "logits/chosen": -0.6023394465446472, + "logits/rejected": 0.09874238818883896, + "logps/chosen": -470.8539123535156, + "logps/rejected": -520.394287109375, + "loss": 0.5568, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9863427877426147, + "rewards/margins": 0.7905007600784302, + "rewards/rejected": -2.776843309402466, + "step": 6400 + }, + { + "epoch": 0.84, + "eval_logits/chosen": 1.115229845046997, + "eval_logits/rejected": 1.8763548135757446, + "eval_logps/chosen": -463.8809509277344, + "eval_logps/rejected": -545.5327758789062, + "eval_loss": 0.5085515379905701, + "eval_rewards/accuracies": 0.7275000214576721, + "eval_rewards/chosen": -1.953909158706665, + "eval_rewards/margins": 1.0156067609786987, + "eval_rewards/rejected": -2.969515800476074, + "eval_runtime": 1193.0934, + "eval_samples_per_second": 1.676, + "eval_steps_per_second": 0.838, + "step": 6400 + }, + { + "epoch": 0.84, + "learning_rate": 3.851016093403023e-07, + "logits/chosen": -0.14243006706237793, + "logits/rejected": 0.18332967162132263, + "logps/chosen": -435.7933044433594, + "logps/rejected": -534.3563842773438, + "loss": 0.5032, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0880608558654785, + "rewards/margins": 1.0540637969970703, + "rewards/rejected": -3.142124652862549, + "step": 6410 + }, + { + "epoch": 0.84, + "learning_rate": 3.7903278695839456e-07, + "logits/chosen": -0.13955774903297424, + "logits/rejected": -0.20281191170215607, + "logps/chosen": -465.5072326660156, + "logps/rejected": -532.3043212890625, + "loss": 0.5635, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9887361526489258, + "rewards/margins": 0.9076420068740845, + "rewards/rejected": -2.8963780403137207, + "step": 6420 + }, + { + "epoch": 0.84, + "learning_rate": 3.7300823984552983e-07, + "logits/chosen": -0.5505388975143433, + "logits/rejected": -0.24104097485542297, + "logps/chosen": -418.98516845703125, + "logps/rejected": -541.846923828125, + "loss": 0.4913, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.963343620300293, + "rewards/margins": 1.0439479351043701, + "rewards/rejected": -3.007291793823242, + "step": 6430 + }, + { + "epoch": 0.84, + "learning_rate": 3.670280937643503e-07, + "logits/chosen": -0.5132700204849243, + "logits/rejected": 0.6271919012069702, + "logps/chosen": -444.8404846191406, + "logps/rejected": -527.2811279296875, + "loss": 0.4893, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.886971116065979, + "rewards/margins": 1.1429336071014404, + "rewards/rejected": -3.029904842376709, + "step": 6440 + }, + { + "epoch": 0.84, + "learning_rate": 3.610924735506274e-07, + "logits/chosen": -0.8992152214050293, + "logits/rejected": 0.8895284533500671, + "logps/chosen": -502.09051513671875, + "logps/rejected": -532.7025146484375, + "loss": 0.5641, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.00895094871521, + "rewards/margins": 0.9643281102180481, + "rewards/rejected": -2.9732792377471924, + "step": 6450 + }, + { + "epoch": 0.85, + "learning_rate": 3.5520150311065316e-07, + "logits/chosen": -0.26017525792121887, + "logits/rejected": 0.5120875239372253, + "logps/chosen": -459.91436767578125, + "logps/rejected": -549.1633911132812, + "loss": 0.4416, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8477706909179688, + "rewards/margins": 1.1013177633285522, + "rewards/rejected": -2.9490883350372314, + "step": 6460 + }, + { + "epoch": 0.85, + "learning_rate": 3.493553054186527e-07, + "logits/chosen": -0.6854721307754517, + "logits/rejected": 0.3193429112434387, + "logps/chosen": -469.05230712890625, + "logps/rejected": -552.3963623046875, + "loss": 0.5335, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.117194652557373, + "rewards/margins": 0.8773033022880554, + "rewards/rejected": -2.994497776031494, + "step": 6470 + }, + { + "epoch": 0.85, + "learning_rate": 3.4355400251421977e-07, + "logits/chosen": -0.15283913910388947, + "logits/rejected": 0.5103310346603394, + "logps/chosen": -456.74993896484375, + "logps/rejected": -499.0576171875, + "loss": 0.6538, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.07171368598938, + "rewards/margins": 0.6756519079208374, + "rewards/rejected": -2.747365713119507, + "step": 6480 + }, + { + "epoch": 0.85, + "learning_rate": 3.3779771549976637e-07, + "logits/chosen": -0.47592344880104065, + "logits/rejected": 0.43167513608932495, + "logps/chosen": -438.9248962402344, + "logps/rejected": -505.8505859375, + "loss": 0.5395, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9432265758514404, + "rewards/margins": 0.8998059034347534, + "rewards/rejected": -2.8430323600769043, + "step": 6490 + }, + { + "epoch": 0.85, + "learning_rate": 3.3208656453799783e-07, + "logits/chosen": -0.7136383056640625, + "logits/rejected": 0.3548789918422699, + "logps/chosen": -434.717529296875, + "logps/rejected": -514.3789672851562, + "loss": 0.4335, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8354284763336182, + "rewards/margins": 1.087013840675354, + "rewards/rejected": -2.9224421977996826, + "step": 6500 + }, + { + "epoch": 0.85, + "eval_logits/chosen": 1.1822218894958496, + "eval_logits/rejected": 1.9424830675125122, + "eval_logps/chosen": -468.96807861328125, + "eval_logps/rejected": -550.4982299804688, + "eval_loss": 0.5067179203033447, + "eval_rewards/accuracies": 0.7294999957084656, + "eval_rewards/chosen": -2.0047807693481445, + "eval_rewards/margins": 1.014390230178833, + "eval_rewards/rejected": -3.0191712379455566, + "eval_runtime": 1181.0567, + "eval_samples_per_second": 1.693, + "eval_steps_per_second": 0.847, + "step": 6500 + }, + { + "epoch": 0.85, + "learning_rate": 3.2642066884940064e-07, + "logits/chosen": -0.5502122640609741, + "logits/rejected": 0.032435137778520584, + "logps/chosen": -474.4972229003906, + "logps/rejected": -589.9862670898438, + "loss": 0.594, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.066648006439209, + "rewards/margins": 1.129963755607605, + "rewards/rejected": -3.1966123580932617, + "step": 6510 + }, + { + "epoch": 0.85, + "learning_rate": 3.2080014670975825e-07, + "logits/chosen": -0.5147031545639038, + "logits/rejected": -0.046147845685482025, + "logps/chosen": -433.7167053222656, + "logps/rejected": -496.712646484375, + "loss": 0.5683, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8814866542816162, + "rewards/margins": 0.8404488563537598, + "rewards/rejected": -2.721935510635376, + "step": 6520 + }, + { + "epoch": 0.85, + "learning_rate": 3.152251154476765e-07, + "logits/chosen": -0.5666553378105164, + "logits/rejected": 0.2234136164188385, + "logps/chosen": -429.125732421875, + "logps/rejected": -543.9380493164062, + "loss": 0.4476, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.934643030166626, + "rewards/margins": 1.1717562675476074, + "rewards/rejected": -3.1063990592956543, + "step": 6530 + }, + { + "epoch": 0.86, + "learning_rate": 3.0969569144214147e-07, + "logits/chosen": -0.9073036313056946, + "logits/rejected": 0.5169941186904907, + "logps/chosen": -476.5718688964844, + "logps/rejected": -547.4609985351562, + "loss": 0.4676, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0425972938537598, + "rewards/margins": 0.9983150362968445, + "rewards/rejected": -3.040912628173828, + "step": 6540 + }, + { + "epoch": 0.86, + "learning_rate": 3.042119901200824e-07, + "logits/chosen": -0.1191103607416153, + "logits/rejected": 0.11132284253835678, + "logps/chosen": -444.5992736816406, + "logps/rejected": -555.7691650390625, + "loss": 0.5972, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1316585540771484, + "rewards/margins": 0.7841309309005737, + "rewards/rejected": -2.915789842605591, + "step": 6550 + }, + { + "epoch": 0.86, + "learning_rate": 2.9877412595396726e-07, + "logits/chosen": -0.8357526063919067, + "logits/rejected": -0.019801050424575806, + "logps/chosen": -504.7652893066406, + "logps/rejected": -582.4708251953125, + "loss": 0.4626, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9639135599136353, + "rewards/margins": 1.1901118755340576, + "rewards/rejected": -3.1540255546569824, + "step": 6560 + }, + { + "epoch": 0.86, + "learning_rate": 2.933822124594124e-07, + "logits/chosen": -0.2868785560131073, + "logits/rejected": 0.4834575057029724, + "logps/chosen": -471.68603515625, + "logps/rejected": -518.7160034179688, + "loss": 0.5799, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.0963294506073, + "rewards/margins": 0.8177496194839478, + "rewards/rejected": -2.914079189300537, + "step": 6570 + }, + { + "epoch": 0.86, + "learning_rate": 2.880363621928106e-07, + "logits/chosen": -0.4423336982727051, + "logits/rejected": 0.6005432605743408, + "logps/chosen": -483.3235778808594, + "logps/rejected": -539.4915771484375, + "loss": 0.5047, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.052248239517212, + "rewards/margins": 0.9304612874984741, + "rewards/rejected": -2.9827091693878174, + "step": 6580 + }, + { + "epoch": 0.86, + "learning_rate": 2.82736686748985e-07, + "logits/chosen": -0.6261542439460754, + "logits/rejected": 0.38417965173721313, + "logps/chosen": -470.73077392578125, + "logps/rejected": -533.3613891601562, + "loss": 0.4692, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9056422710418701, + "rewards/margins": 1.2279062271118164, + "rewards/rejected": -3.1335487365722656, + "step": 6590 + }, + { + "epoch": 0.86, + "learning_rate": 2.774832967588556e-07, + "logits/chosen": -0.7487810850143433, + "logits/rejected": 0.46375662088394165, + "logps/chosen": -487.33544921875, + "logps/rejected": -587.102783203125, + "loss": 0.5263, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.039247512817383, + "rewards/margins": 1.248397707939148, + "rewards/rejected": -3.2876453399658203, + "step": 6600 + }, + { + "epoch": 0.86, + "eval_logits/chosen": 1.180580735206604, + "eval_logits/rejected": 1.9389982223510742, + "eval_logps/chosen": -465.3099060058594, + "eval_logps/rejected": -546.2759399414062, + "eval_loss": 0.5066229104995728, + "eval_rewards/accuracies": 0.7310000061988831, + "eval_rewards/chosen": -1.9681991338729858, + "eval_rewards/margins": 1.0087487697601318, + "eval_rewards/rejected": -2.976947784423828, + "eval_runtime": 1275.913, + "eval_samples_per_second": 1.568, + "eval_steps_per_second": 0.784, + "step": 6600 + }, + { + "epoch": 0.86, + "learning_rate": 2.7227630188713326e-07, + "logits/chosen": -0.889278769493103, + "logits/rejected": 0.699964702129364, + "logps/chosen": -502.218017578125, + "logps/rejected": -545.4527587890625, + "loss": 0.504, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0107197761535645, + "rewards/margins": 1.0104124546051025, + "rewards/rejected": -3.021131992340088, + "step": 6610 + }, + { + "epoch": 0.87, + "learning_rate": 2.671158108300284e-07, + "logits/chosen": -0.6139100790023804, + "logits/rejected": -0.24325446784496307, + "logps/chosen": -461.9278259277344, + "logps/rejected": -540.6260986328125, + "loss": 0.5506, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.021791934967041, + "rewards/margins": 0.7798693776130676, + "rewards/rejected": -2.801661252975464, + "step": 6620 + }, + { + "epoch": 0.87, + "learning_rate": 2.6200193131298376e-07, + "logits/chosen": -0.6526001691818237, + "logits/rejected": -0.10106471925973892, + "logps/chosen": -480.90618896484375, + "logps/rejected": -579.097900390625, + "loss": 0.4102, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9294904470443726, + "rewards/margins": 1.2845728397369385, + "rewards/rejected": -3.2140636444091797, + "step": 6630 + }, + { + "epoch": 0.87, + "learning_rate": 2.569347700884217e-07, + "logits/chosen": -0.69781893491745, + "logits/rejected": 0.686926543712616, + "logps/chosen": -466.5616149902344, + "logps/rejected": -538.9299926757812, + "loss": 0.473, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9398361444473267, + "rewards/margins": 1.1922200918197632, + "rewards/rejected": -3.13205623626709, + "step": 6640 + }, + { + "epoch": 0.87, + "learning_rate": 2.5191443293352186e-07, + "logits/chosen": -0.3311184346675873, + "logits/rejected": 0.31631073355674744, + "logps/chosen": -480.02655029296875, + "logps/rejected": -576.1092529296875, + "loss": 0.5448, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.015752077102661, + "rewards/margins": 0.9452459216117859, + "rewards/rejected": -2.960998058319092, + "step": 6650 + }, + { + "epoch": 0.87, + "learning_rate": 2.469410246480067e-07, + "logits/chosen": -0.21244564652442932, + "logits/rejected": 0.6931090950965881, + "logps/chosen": -439.51812744140625, + "logps/rejected": -541.0764770507812, + "loss": 0.4631, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.069284439086914, + "rewards/margins": 1.1991032361984253, + "rewards/rejected": -3.26838755607605, + "step": 6660 + }, + { + "epoch": 0.87, + "learning_rate": 2.4201464905195955e-07, + "logits/chosen": -0.42047929763793945, + "logits/rejected": -0.40434974431991577, + "logps/chosen": -454.3592224121094, + "logps/rejected": -524.0152587890625, + "loss": 0.5885, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9843008518218994, + "rewards/margins": 0.6817992925643921, + "rewards/rejected": -2.666100025177002, + "step": 6670 + }, + { + "epoch": 0.87, + "learning_rate": 2.3713540898365196e-07, + "logits/chosen": -0.5671381950378418, + "logits/rejected": 0.32992106676101685, + "logps/chosen": -449.9239196777344, + "logps/rejected": -534.2097778320312, + "loss": 0.4748, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7972313165664673, + "rewards/margins": 1.1687263250350952, + "rewards/rejected": -2.9659574031829834, + "step": 6680 + }, + { + "epoch": 0.88, + "learning_rate": 2.3230340629740166e-07, + "logits/chosen": -0.6218796968460083, + "logits/rejected": -0.1357102394104004, + "logps/chosen": -465.1273498535156, + "logps/rejected": -495.3822326660156, + "loss": 0.6258, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9846065044403076, + "rewards/margins": 0.5649687647819519, + "rewards/rejected": -2.5495753288269043, + "step": 6690 + }, + { + "epoch": 0.88, + "learning_rate": 2.2751874186144357e-07, + "logits/chosen": -1.0036194324493408, + "logits/rejected": 0.0544402189552784, + "logps/chosen": -472.434326171875, + "logps/rejected": -515.3161010742188, + "loss": 0.5263, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8698005676269531, + "rewards/margins": 0.915500819683075, + "rewards/rejected": -2.7853012084960938, + "step": 6700 + }, + { + "epoch": 0.88, + "eval_logits/chosen": 1.1793560981750488, + "eval_logits/rejected": 1.9366413354873657, + "eval_logps/chosen": -465.6783752441406, + "eval_logps/rejected": -546.6119384765625, + "eval_loss": 0.5065844058990479, + "eval_rewards/accuracies": 0.7319999933242798, + "eval_rewards/chosen": -1.9718834161758423, + "eval_rewards/margins": 1.0084247589111328, + "eval_rewards/rejected": -2.9803082942962646, + "eval_runtime": 1228.3254, + "eval_samples_per_second": 1.628, + "eval_steps_per_second": 0.814, + "step": 6700 + }, + { + "epoch": 0.88, + "learning_rate": 2.227815155558241e-07, + "logits/chosen": -0.715009331703186, + "logits/rejected": 0.11820618808269501, + "logps/chosen": -483.3104553222656, + "logps/rejected": -584.90478515625, + "loss": 0.5043, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.07963228225708, + "rewards/margins": 1.144766092300415, + "rewards/rejected": -3.224398136138916, + "step": 6710 + }, + { + "epoch": 0.88, + "learning_rate": 2.1809182627031883e-07, + "logits/chosen": -0.9190937280654907, + "logits/rejected": 0.36419257521629333, + "logps/chosen": -471.60467529296875, + "logps/rejected": -543.3637084960938, + "loss": 0.5222, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8738210201263428, + "rewards/margins": 0.989733099937439, + "rewards/rejected": -2.8635544776916504, + "step": 6720 + }, + { + "epoch": 0.88, + "learning_rate": 2.1344977190236372e-07, + "logits/chosen": 0.27573806047439575, + "logits/rejected": 0.18469476699829102, + "logps/chosen": -439.60107421875, + "logps/rejected": -560.4583740234375, + "loss": 0.4557, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9545615911483765, + "rewards/margins": 1.2339239120483398, + "rewards/rejected": -3.188485622406006, + "step": 6730 + }, + { + "epoch": 0.88, + "learning_rate": 2.0885544935501656e-07, + "logits/chosen": -0.5300592184066772, + "logits/rejected": -0.35172906517982483, + "logps/chosen": -451.72027587890625, + "logps/rejected": -578.4183349609375, + "loss": 0.4219, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9533073902130127, + "rewards/margins": 1.2389609813690186, + "rewards/rejected": -3.1922686100006104, + "step": 6740 + }, + { + "epoch": 0.88, + "learning_rate": 2.0430895453492944e-07, + "logits/chosen": -0.5228718519210815, + "logits/rejected": -0.06577786058187485, + "logps/chosen": -495.9405212402344, + "logps/rejected": -537.6466674804688, + "loss": 0.5773, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0149998664855957, + "rewards/margins": 0.7745741605758667, + "rewards/rejected": -2.789574384689331, + "step": 6750 + }, + { + "epoch": 0.88, + "learning_rate": 1.9981038235035111e-07, + "logits/chosen": -0.12807337939739227, + "logits/rejected": -0.08245428651571274, + "logps/chosen": -441.65008544921875, + "logps/rejected": -548.439697265625, + "loss": 0.3802, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.779802680015564, + "rewards/margins": 1.3215081691741943, + "rewards/rejected": -3.101310968399048, + "step": 6760 + }, + { + "epoch": 0.89, + "learning_rate": 1.9535982670914112e-07, + "logits/chosen": -0.55552738904953, + "logits/rejected": 0.4736878275871277, + "logps/chosen": -500.26171875, + "logps/rejected": -571.901611328125, + "loss": 0.5424, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.030512571334839, + "rewards/margins": 0.9621774554252625, + "rewards/rejected": -2.9926905632019043, + "step": 6770 + }, + { + "epoch": 0.89, + "learning_rate": 1.9095738051681412e-07, + "logits/chosen": -0.19979307055473328, + "logits/rejected": 0.008223796263337135, + "logps/chosen": -459.77935791015625, + "logps/rejected": -547.7391357421875, + "loss": 0.5411, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.210814952850342, + "rewards/margins": 0.9136184453964233, + "rewards/rejected": -3.1244330406188965, + "step": 6780 + }, + { + "epoch": 0.89, + "learning_rate": 1.8660313567459703e-07, + "logits/chosen": -0.09574131667613983, + "logits/rejected": -0.14765064418315887, + "logps/chosen": -426.57159423828125, + "logps/rejected": -542.1318359375, + "loss": 0.5432, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9452426433563232, + "rewards/margins": 1.1776217222213745, + "rewards/rejected": -3.1228644847869873, + "step": 6790 + }, + { + "epoch": 0.89, + "learning_rate": 1.8229718307751165e-07, + "logits/chosen": -0.45521458983421326, + "logits/rejected": 0.7231898903846741, + "logps/chosen": -494.8271484375, + "logps/rejected": -564.8556518554688, + "loss": 0.4939, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0975873470306396, + "rewards/margins": 1.2642689943313599, + "rewards/rejected": -3.361856460571289, + "step": 6800 + }, + { + "epoch": 0.89, + "eval_logits/chosen": 1.2238198518753052, + "eval_logits/rejected": 1.979459524154663, + "eval_logps/chosen": -470.53741455078125, + "eval_logps/rejected": -551.8628540039062, + "eval_loss": 0.5062793493270874, + "eval_rewards/accuracies": 0.7325000166893005, + "eval_rewards/chosen": -2.0204737186431885, + "eval_rewards/margins": 1.0123436450958252, + "eval_rewards/rejected": -3.0328176021575928, + "eval_runtime": 1180.1566, + "eval_samples_per_second": 1.695, + "eval_steps_per_second": 0.847, + "step": 6800 + }, + { + "epoch": 0.89, + "learning_rate": 1.7803961261247864e-07, + "logits/chosen": -0.2857457995414734, + "logits/rejected": 0.201734259724617, + "logps/chosen": -467.9620666503906, + "logps/rejected": -601.2489013671875, + "loss": 0.4212, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9273033142089844, + "rewards/margins": 1.3749698400497437, + "rewards/rejected": -3.3022735118865967, + "step": 6810 + }, + { + "epoch": 0.89, + "learning_rate": 1.7383051315643772e-07, + "logits/chosen": -0.7285705804824829, + "logits/rejected": 0.5277493000030518, + "logps/chosen": -507.9571228027344, + "logps/rejected": -551.97607421875, + "loss": 0.58, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2432968616485596, + "rewards/margins": 0.8464199304580688, + "rewards/rejected": -3.0897164344787598, + "step": 6820 + }, + { + "epoch": 0.89, + "learning_rate": 1.6966997257449685e-07, + "logits/chosen": -0.5658080577850342, + "logits/rejected": 0.22322329878807068, + "logps/chosen": -476.03692626953125, + "logps/rejected": -543.3971557617188, + "loss": 0.526, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0512733459472656, + "rewards/margins": 0.8895019292831421, + "rewards/rejected": -2.9407753944396973, + "step": 6830 + }, + { + "epoch": 0.9, + "learning_rate": 1.6555807771809375e-07, + "logits/chosen": -0.6819769144058228, + "logits/rejected": 0.3543395698070526, + "logps/chosen": -454.736328125, + "logps/rejected": -513.91650390625, + "loss": 0.4701, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9653873443603516, + "rewards/margins": 1.1591241359710693, + "rewards/rejected": -3.124511241912842, + "step": 6840 + }, + { + "epoch": 0.9, + "learning_rate": 1.6149491442318617e-07, + "logits/chosen": -0.37542515993118286, + "logits/rejected": -0.02834094688296318, + "logps/chosen": -455.55657958984375, + "logps/rejected": -529.7952880859375, + "loss": 0.5826, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.912413239479065, + "rewards/margins": 0.8809484243392944, + "rewards/rejected": -2.7933619022369385, + "step": 6850 + }, + { + "epoch": 0.9, + "learning_rate": 1.5748056750845786e-07, + "logits/chosen": -0.7669013738632202, + "logits/rejected": 0.44335660338401794, + "logps/chosen": -499.58685302734375, + "logps/rejected": -512.37353515625, + "loss": 0.5071, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.2117278575897217, + "rewards/margins": 0.8127554655075073, + "rewards/rejected": -3.0244839191436768, + "step": 6860 + }, + { + "epoch": 0.9, + "learning_rate": 1.5351512077355024e-07, + "logits/chosen": -0.5717117786407471, + "logits/rejected": 0.31644728779792786, + "logps/chosen": -493.24652099609375, + "logps/rejected": -646.7489624023438, + "loss": 0.4034, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9442113637924194, + "rewards/margins": 1.276631236076355, + "rewards/rejected": -3.2208428382873535, + "step": 6870 + }, + { + "epoch": 0.9, + "learning_rate": 1.4959865699730902e-07, + "logits/chosen": -0.39092275500297546, + "logits/rejected": 0.6129297614097595, + "logps/chosen": -439.7764587402344, + "logps/rejected": -524.3287353515625, + "loss": 0.4971, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0712218284606934, + "rewards/margins": 1.1431468725204468, + "rewards/rejected": -3.214369297027588, + "step": 6880 + }, + { + "epoch": 0.9, + "learning_rate": 1.4573125793606202e-07, + "logits/chosen": -0.4366453289985657, + "logits/rejected": 0.452403724193573, + "logps/chosen": -431.4530334472656, + "logps/rejected": -524.9496459960938, + "loss": 0.5224, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.097776174545288, + "rewards/margins": 1.0594654083251953, + "rewards/rejected": -3.1572415828704834, + "step": 6890 + }, + { + "epoch": 0.9, + "learning_rate": 1.4191300432190634e-07, + "logits/chosen": -0.45784568786621094, + "logits/rejected": 0.9001695513725281, + "logps/chosen": -504.549072265625, + "logps/rejected": -562.7247314453125, + "loss": 0.5763, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.3490102291107178, + "rewards/margins": 0.8059114217758179, + "rewards/rejected": -3.154921770095825, + "step": 6900 + }, + { + "epoch": 0.9, + "eval_logits/chosen": 1.202736258506775, + "eval_logits/rejected": 1.9579252004623413, + "eval_logps/chosen": -469.47125244140625, + "eval_logps/rejected": -550.4862670898438, + "eval_loss": 0.5060390830039978, + "eval_rewards/accuracies": 0.7329999804496765, + "eval_rewards/chosen": -2.009812116622925, + "eval_rewards/margins": 1.009238600730896, + "eval_rewards/rejected": -3.0190508365631104, + "eval_runtime": 1171.5664, + "eval_samples_per_second": 1.707, + "eval_steps_per_second": 0.854, + "step": 6900 + }, + { + "epoch": 0.9, + "learning_rate": 1.381439758610284e-07, + "logits/chosen": -0.6321390271186829, + "logits/rejected": 0.20315834879875183, + "logps/chosen": -462.337158203125, + "logps/rejected": -521.3242797851562, + "loss": 0.5273, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.02864408493042, + "rewards/margins": 0.7589557766914368, + "rewards/rejected": -2.787600040435791, + "step": 6910 + }, + { + "epoch": 0.91, + "learning_rate": 1.3442425123203596e-07, + "logits/chosen": -0.7073062658309937, + "logits/rejected": 0.02772808074951172, + "logps/chosen": -461.3926696777344, + "logps/rejected": -560.2494506835938, + "loss": 0.5085, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.046745538711548, + "rewards/margins": 1.014560580253601, + "rewards/rejected": -3.0613059997558594, + "step": 6920 + }, + { + "epoch": 0.91, + "learning_rate": 1.3075390808431897e-07, + "logits/chosen": -0.37788501381874084, + "logits/rejected": 0.6019797325134277, + "logps/chosen": -435.75726318359375, + "logps/rejected": -511.6985778808594, + "loss": 0.4686, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9663892984390259, + "rewards/margins": 1.0337203741073608, + "rewards/rejected": -3.0001096725463867, + "step": 6930 + }, + { + "epoch": 0.91, + "learning_rate": 1.271330230364262e-07, + "logits/chosen": -0.14386829733848572, + "logits/rejected": 0.21935054659843445, + "logps/chosen": -460.65582275390625, + "logps/rejected": -620.8492431640625, + "loss": 0.4858, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.145711898803711, + "rewards/margins": 1.1036556959152222, + "rewards/rejected": -3.2493674755096436, + "step": 6940 + }, + { + "epoch": 0.91, + "learning_rate": 1.2356167167446698e-07, + "logits/chosen": -0.04140012338757515, + "logits/rejected": 0.4257062077522278, + "logps/chosen": -467.8836975097656, + "logps/rejected": -582.3963012695312, + "loss": 0.5375, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.3314528465270996, + "rewards/margins": 1.0109634399414062, + "rewards/rejected": -3.342416286468506, + "step": 6950 + }, + { + "epoch": 0.91, + "learning_rate": 1.2003992855053326e-07, + "logits/chosen": -0.09045709669589996, + "logits/rejected": 0.434389591217041, + "logps/chosen": -432.42889404296875, + "logps/rejected": -556.8145751953125, + "loss": 0.5105, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9629653692245483, + "rewards/margins": 1.2737910747528076, + "rewards/rejected": -3.2367560863494873, + "step": 6960 + }, + { + "epoch": 0.91, + "learning_rate": 1.1656786718114239e-07, + "logits/chosen": -0.14372889697551727, + "logits/rejected": 0.17852702736854553, + "logps/chosen": -464.07659912109375, + "logps/rejected": -544.1351318359375, + "loss": 0.5259, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1138415336608887, + "rewards/margins": 0.9006859064102173, + "rewards/rejected": -3.0145275592803955, + "step": 6970 + }, + { + "epoch": 0.91, + "learning_rate": 1.1314556004570487e-07, + "logits/chosen": -0.35672527551651, + "logits/rejected": -0.19655278325080872, + "logps/chosen": -405.0440979003906, + "logps/rejected": -522.1256713867188, + "loss": 0.5576, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9034448862075806, + "rewards/margins": 0.9194092750549316, + "rewards/rejected": -2.8228540420532227, + "step": 6980 + }, + { + "epoch": 0.91, + "learning_rate": 1.0977307858500818e-07, + "logits/chosen": -0.7549588084220886, + "logits/rejected": 0.3601745367050171, + "logps/chosen": -438.78955078125, + "logps/rejected": -511.4344787597656, + "loss": 0.4611, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8593467473983765, + "rewards/margins": 1.0313808917999268, + "rewards/rejected": -2.8907275199890137, + "step": 6990 + }, + { + "epoch": 0.92, + "learning_rate": 1.0645049319972789e-07, + "logits/chosen": -0.11657045036554337, + "logits/rejected": 0.3493362367153168, + "logps/chosen": -482.05938720703125, + "logps/rejected": -546.6885375976562, + "loss": 0.5062, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1746017932891846, + "rewards/margins": 1.1042107343673706, + "rewards/rejected": -3.278812885284424, + "step": 7000 + }, + { + "epoch": 0.92, + "eval_logits/chosen": 1.2018110752105713, + "eval_logits/rejected": 1.957441806793213, + "eval_logps/chosen": -468.7945556640625, + "eval_logps/rejected": -549.6513671875, + "eval_loss": 0.5059376955032349, + "eval_rewards/accuracies": 0.7319999933242798, + "eval_rewards/chosen": -2.0030453205108643, + "eval_rewards/margins": 1.0076566934585571, + "eval_rewards/rejected": -3.010701894760132, + "eval_runtime": 1176.4736, + "eval_samples_per_second": 1.7, + "eval_steps_per_second": 0.85, + "step": 7000 + }, + { + "epoch": 0.92, + "learning_rate": 1.0317787324895634e-07, + "logits/chosen": -0.35694876313209534, + "logits/rejected": 0.610944926738739, + "logps/chosen": -489.6722717285156, + "logps/rejected": -578.5293579101562, + "loss": 0.3914, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.016587495803833, + "rewards/margins": 1.268135666847229, + "rewards/rejected": -3.2847228050231934, + "step": 7010 + }, + { + "epoch": 0.92, + "learning_rate": 9.995528704875635e-08, + "logits/chosen": -0.07247890532016754, + "logits/rejected": -0.2989567816257477, + "logps/chosen": -438.46728515625, + "logps/rejected": -547.0955810546875, + "loss": 0.5225, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.047827959060669, + "rewards/margins": 0.896246075630188, + "rewards/rejected": -2.9440741539001465, + "step": 7020 + }, + { + "epoch": 0.92, + "learning_rate": 9.678280187073452e-08, + "logits/chosen": -0.27913758158683777, + "logits/rejected": 0.5736603140830994, + "logps/chosen": -463.31243896484375, + "logps/rejected": -556.4307861328125, + "loss": 0.3972, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7716821432113647, + "rewards/margins": 1.4218103885650635, + "rewards/rejected": -3.1934924125671387, + "step": 7030 + }, + { + "epoch": 0.92, + "learning_rate": 9.366048394063549e-08, + "logits/chosen": -0.41250452399253845, + "logits/rejected": -0.0662340372800827, + "logps/chosen": -456.4169006347656, + "logps/rejected": -561.72021484375, + "loss": 0.5114, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.850750207901001, + "rewards/margins": 0.9853949546813965, + "rewards/rejected": -2.8361451625823975, + "step": 7040 + }, + { + "epoch": 0.92, + "learning_rate": 9.058839843696237e-08, + "logits/chosen": -0.4489797055721283, + "logits/rejected": 0.25591760873794556, + "logps/chosen": -483.81707763671875, + "logps/rejected": -559.30908203125, + "loss": 0.4706, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0449726581573486, + "rewards/margins": 1.0600306987762451, + "rewards/rejected": -3.1050033569335938, + "step": 7050 + }, + { + "epoch": 0.92, + "learning_rate": 8.756660948961299e-08, + "logits/chosen": -0.4514777660369873, + "logits/rejected": -0.20608548820018768, + "logps/chosen": -442.100341796875, + "logps/rejected": -545.4368286132812, + "loss": 0.5293, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.016775608062744, + "rewards/margins": 0.8498009443283081, + "rewards/rejected": -2.866576671600342, + "step": 7060 + }, + { + "epoch": 0.93, + "learning_rate": 8.459518017854412e-08, + "logits/chosen": -0.6188634634017944, + "logits/rejected": 0.048554904758930206, + "logps/chosen": -459.34503173828125, + "logps/rejected": -503.36541748046875, + "loss": 0.5722, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.9700435400009155, + "rewards/margins": 0.6844844818115234, + "rewards/rejected": -2.6545281410217285, + "step": 7070 + }, + { + "epoch": 0.93, + "learning_rate": 8.167417253245213e-08, + "logits/chosen": -0.7391661405563354, + "logits/rejected": 0.666763424873352, + "logps/chosen": -460.7491760253906, + "logps/rejected": -533.9959716796875, + "loss": 0.5173, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.060835123062134, + "rewards/margins": 0.9698716998100281, + "rewards/rejected": -3.0307066440582275, + "step": 7080 + }, + { + "epoch": 0.93, + "learning_rate": 7.880364752747948e-08, + "logits/chosen": -0.42400145530700684, + "logits/rejected": -0.08642569929361343, + "logps/chosen": -453.1158142089844, + "logps/rejected": -535.9979248046875, + "loss": 0.5474, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1991636753082275, + "rewards/margins": 0.8412439227104187, + "rewards/rejected": -3.04040789604187, + "step": 7090 + }, + { + "epoch": 0.93, + "learning_rate": 7.598366508594245e-08, + "logits/chosen": -0.19622935354709625, + "logits/rejected": -0.13577821850776672, + "logps/chosen": -504.1654357910156, + "logps/rejected": -597.3096313476562, + "loss": 0.4432, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.007800340652466, + "rewards/margins": 1.198746919631958, + "rewards/rejected": -3.2065467834472656, + "step": 7100 + }, + { + "epoch": 0.93, + "eval_logits/chosen": 1.2115496397018433, + "eval_logits/rejected": 1.9674791097640991, + "eval_logps/chosen": -469.8140869140625, + "eval_logps/rejected": -550.7593994140625, + "eval_loss": 0.5058996081352234, + "eval_rewards/accuracies": 0.7329999804496765, + "eval_rewards/chosen": -2.0132408142089844, + "eval_rewards/margins": 1.0085415840148926, + "eval_rewards/rejected": -3.021782398223877, + "eval_runtime": 1186.4135, + "eval_samples_per_second": 1.686, + "eval_steps_per_second": 0.843, + "step": 7100 + }, + { + "epoch": 0.93, + "learning_rate": 7.32142840750788e-08, + "logits/chosen": -0.5224785804748535, + "logits/rejected": 0.3103681802749634, + "logps/chosen": -489.0027770996094, + "logps/rejected": -570.6793212890625, + "loss": 0.4273, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8569695949554443, + "rewards/margins": 1.2805449962615967, + "rewards/rejected": -3.137514591217041, + "step": 7110 + }, + { + "epoch": 0.93, + "learning_rate": 7.049556230581872e-08, + "logits/chosen": -0.19431808590888977, + "logits/rejected": 0.6167198419570923, + "logps/chosen": -450.33013916015625, + "logps/rejected": -534.5645751953125, + "loss": 0.5363, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.142000675201416, + "rewards/margins": 1.0167112350463867, + "rewards/rejected": -3.1587119102478027, + "step": 7120 + }, + { + "epoch": 0.93, + "learning_rate": 6.782755653158085e-08, + "logits/chosen": -0.4115025997161865, + "logits/rejected": 0.14585857093334198, + "logps/chosen": -475.48480224609375, + "logps/rejected": -537.6339111328125, + "loss": 0.517, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.024827480316162, + "rewards/margins": 0.8768397569656372, + "rewards/rejected": -2.901667356491089, + "step": 7130 + }, + { + "epoch": 0.93, + "learning_rate": 6.521032244708375e-08, + "logits/chosen": -0.2687646448612213, + "logits/rejected": 0.2106965035200119, + "logps/chosen": -456.90948486328125, + "logps/rejected": -554.6942138671875, + "loss": 0.5336, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.990630865097046, + "rewards/margins": 0.9876958727836609, + "rewards/rejected": -2.9783265590667725, + "step": 7140 + }, + { + "epoch": 0.94, + "learning_rate": 6.264391468718628e-08, + "logits/chosen": -0.7374650239944458, + "logits/rejected": 0.049775220453739166, + "logps/chosen": -459.86181640625, + "logps/rejected": -549.9932250976562, + "loss": 0.4637, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8901011943817139, + "rewards/margins": 1.0680241584777832, + "rewards/rejected": -2.958125591278076, + "step": 7150 + }, + { + "epoch": 0.94, + "learning_rate": 6.012838682574462e-08, + "logits/chosen": -0.6180638074874878, + "logits/rejected": 0.6540043950080872, + "logps/chosen": -475.832763671875, + "logps/rejected": -497.9046325683594, + "loss": 0.5072, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0510411262512207, + "rewards/margins": 0.8002778887748718, + "rewards/rejected": -2.851318836212158, + "step": 7160 + }, + { + "epoch": 0.94, + "learning_rate": 5.766379137449624e-08, + "logits/chosen": -0.36336010694503784, + "logits/rejected": 0.0748773068189621, + "logps/chosen": -423.7010192871094, + "logps/rejected": -554.7673950195312, + "loss": 0.4622, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9527654647827148, + "rewards/margins": 1.0933210849761963, + "rewards/rejected": -3.0460867881774902, + "step": 7170 + }, + { + "epoch": 0.94, + "learning_rate": 5.525017978196295e-08, + "logits/chosen": -0.5020288825035095, + "logits/rejected": 0.6828367114067078, + "logps/chosen": -488.1974182128906, + "logps/rejected": -566.2771606445312, + "loss": 0.4981, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0206267833709717, + "rewards/margins": 1.190063714981079, + "rewards/rejected": -3.21069073677063, + "step": 7180 + }, + { + "epoch": 0.94, + "learning_rate": 5.288760243237545e-08, + "logits/chosen": -0.7169966697692871, + "logits/rejected": 0.34568727016448975, + "logps/chosen": -526.5534057617188, + "logps/rejected": -571.4498291015625, + "loss": 0.5102, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1718175411224365, + "rewards/margins": 0.957715630531311, + "rewards/rejected": -3.129533052444458, + "step": 7190 + }, + { + "epoch": 0.94, + "learning_rate": 5.0576108644623536e-08, + "logits/chosen": -0.5569435954093933, + "logits/rejected": 0.4495469927787781, + "logps/chosen": -521.062255859375, + "logps/rejected": -563.0753173828125, + "loss": 0.5294, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2122786045074463, + "rewards/margins": 1.0275580883026123, + "rewards/rejected": -3.2398364543914795, + "step": 7200 + }, + { + "epoch": 0.94, + "eval_logits/chosen": 1.2122623920440674, + "eval_logits/rejected": 1.967880129814148, + "eval_logps/chosen": -469.9013671875, + "eval_logps/rejected": -550.8819580078125, + "eval_loss": 0.5059316158294678, + "eval_rewards/accuracies": 0.7315000295639038, + "eval_rewards/chosen": -2.014113426208496, + "eval_rewards/margins": 1.0088937282562256, + "eval_rewards/rejected": -3.0230071544647217, + "eval_runtime": 1186.2494, + "eval_samples_per_second": 1.686, + "eval_steps_per_second": 0.843, + "step": 7200 + }, + { + "epoch": 0.94, + "learning_rate": 4.8315746671225296e-08, + "logits/chosen": -0.5705938339233398, + "logits/rejected": 0.5357488989830017, + "logps/chosen": -488.63177490234375, + "logps/rejected": -576.7957763671875, + "loss": 0.4553, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8494971990585327, + "rewards/margins": 1.1614658832550049, + "rewards/rejected": -3.0109634399414062, + "step": 7210 + }, + { + "epoch": 0.94, + "learning_rate": 4.6106563697320695e-08, + "logits/chosen": -0.6351941823959351, + "logits/rejected": 0.735435962677002, + "logps/chosen": -432.3277282714844, + "logps/rejected": -521.2852783203125, + "loss": 0.4985, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9638311862945557, + "rewards/margins": 1.1676876544952393, + "rewards/rejected": -3.131518840789795, + "step": 7220 + }, + { + "epoch": 0.95, + "learning_rate": 4.394860583968624e-08, + "logits/chosen": 0.21465528011322021, + "logits/rejected": -0.21870703995227814, + "logps/chosen": -392.111328125, + "logps/rejected": -517.2110595703125, + "loss": 0.5165, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8918077945709229, + "rewards/margins": 1.0091055631637573, + "rewards/rejected": -2.9009132385253906, + "step": 7230 + }, + { + "epoch": 0.95, + "learning_rate": 4.1841918145771874e-08, + "logits/chosen": -0.24777980148792267, + "logits/rejected": 0.0036219656467437744, + "logps/chosen": -458.74249267578125, + "logps/rejected": -550.5509033203125, + "loss": 0.4325, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8298200368881226, + "rewards/margins": 1.063430905342102, + "rewards/rejected": -2.8932509422302246, + "step": 7240 + }, + { + "epoch": 0.95, + "learning_rate": 3.978654459276088e-08, + "logits/chosen": -0.7501617670059204, + "logits/rejected": 0.33591216802597046, + "logps/chosen": -512.9222412109375, + "logps/rejected": -573.7879028320312, + "loss": 0.4945, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9614841938018799, + "rewards/margins": 1.2372934818267822, + "rewards/rejected": -3.198777437210083, + "step": 7250 + }, + { + "epoch": 0.95, + "learning_rate": 3.778252808665284e-08, + "logits/chosen": -0.8246575593948364, + "logits/rejected": 0.21241407096385956, + "logps/chosen": -523.8837890625, + "logps/rejected": -519.209716796875, + "loss": 0.5693, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0833640098571777, + "rewards/margins": 0.7381225824356079, + "rewards/rejected": -2.821486711502075, + "step": 7260 + }, + { + "epoch": 0.95, + "learning_rate": 3.5829910461366023e-08, + "logits/chosen": 0.01369396410882473, + "logits/rejected": 0.08776617050170898, + "logps/chosen": -458.39581298828125, + "logps/rejected": -524.14013671875, + "loss": 0.6843, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1614108085632324, + "rewards/margins": 0.7248591780662537, + "rewards/rejected": -2.8862698078155518, + "step": 7270 + }, + { + "epoch": 0.95, + "learning_rate": 3.39287324778656e-08, + "logits/chosen": -0.7124800086021423, + "logits/rejected": 0.3487294912338257, + "logps/chosen": -543.2281494140625, + "logps/rejected": -586.5289916992188, + "loss": 0.6138, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2240188121795654, + "rewards/margins": 0.852017879486084, + "rewards/rejected": -3.0760366916656494, + "step": 7280 + }, + { + "epoch": 0.95, + "learning_rate": 3.207903382331262e-08, + "logits/chosen": -0.7476862072944641, + "logits/rejected": 0.375562846660614, + "logps/chosen": -478.54052734375, + "logps/rejected": -548.2950439453125, + "loss": 0.4735, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7987592220306396, + "rewards/margins": 1.1323232650756836, + "rewards/rejected": -2.9310824871063232, + "step": 7290 + }, + { + "epoch": 0.96, + "learning_rate": 3.028085311023443e-08, + "logits/chosen": -0.5295546054840088, + "logits/rejected": 0.3997403085231781, + "logps/chosen": -470.9478454589844, + "logps/rejected": -556.5192260742188, + "loss": 0.4488, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9478727579116821, + "rewards/margins": 1.1733306646347046, + "rewards/rejected": -3.1212034225463867, + "step": 7300 + }, + { + "epoch": 0.96, + "eval_logits/chosen": 1.2130078077316284, + "eval_logits/rejected": 1.9687855243682861, + "eval_logps/chosen": -469.9289245605469, + "eval_logps/rejected": -550.9682006835938, + "eval_loss": 0.5058298707008362, + "eval_rewards/accuracies": 0.7319999933242798, + "eval_rewards/chosen": -2.0143890380859375, + "eval_rewards/margins": 1.00948166847229, + "eval_rewards/rejected": -3.0238709449768066, + "eval_runtime": 1188.4057, + "eval_samples_per_second": 1.683, + "eval_steps_per_second": 0.841, + "step": 7300 + }, + { + "epoch": 0.96, + "learning_rate": 2.8534227875720576e-08, + "logits/chosen": -0.3639126420021057, + "logits/rejected": -0.1453981250524521, + "logps/chosen": -465.8466796875, + "logps/rejected": -564.0140991210938, + "loss": 0.5208, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.014688014984131, + "rewards/margins": 1.0618035793304443, + "rewards/rejected": -3.0764918327331543, + "step": 7310 + }, + { + "epoch": 0.96, + "learning_rate": 2.683919458063705e-08, + "logits/chosen": -0.6341809630393982, + "logits/rejected": 0.7493041753768921, + "logps/chosen": -391.2576599121094, + "logps/rejected": -444.69024658203125, + "loss": 0.5043, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8518688678741455, + "rewards/margins": 0.9767493009567261, + "rewards/rejected": -2.828618288040161, + "step": 7320 + }, + { + "epoch": 0.96, + "learning_rate": 2.5195788608866345e-08, + "logits/chosen": -0.28447026014328003, + "logits/rejected": 0.6122376322746277, + "logps/chosen": -529.1468505859375, + "logps/rejected": -554.7265625, + "loss": 0.5484, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.015683889389038, + "rewards/margins": 0.9786075353622437, + "rewards/rejected": -2.994291305541992, + "step": 7330 + }, + { + "epoch": 0.96, + "learning_rate": 2.3604044266569426e-08, + "logits/chosen": -0.8208459615707397, + "logits/rejected": 0.9798334836959839, + "logps/chosen": -484.0049743652344, + "logps/rejected": -558.9443969726562, + "loss": 0.4936, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.064974546432495, + "rewards/margins": 1.0787862539291382, + "rewards/rejected": -3.1437606811523438, + "step": 7340 + }, + { + "epoch": 0.96, + "learning_rate": 2.2063994781468256e-08, + "logits/chosen": -0.2842608094215393, + "logits/rejected": 0.27055567502975464, + "logps/chosen": -452.8561096191406, + "logps/rejected": -518.8817749023438, + "loss": 0.5172, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8908157348632812, + "rewards/margins": 1.0217288732528687, + "rewards/rejected": -2.9125447273254395, + "step": 7350 + }, + { + "epoch": 0.96, + "learning_rate": 2.057567230215246e-08, + "logits/chosen": -0.3838343322277069, + "logits/rejected": -0.392334908246994, + "logps/chosen": -468.7618103027344, + "logps/rejected": -558.9362182617188, + "loss": 0.5131, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0254180431365967, + "rewards/margins": 0.84428870677948, + "rewards/rejected": -2.869706630706787, + "step": 7360 + }, + { + "epoch": 0.96, + "learning_rate": 1.9139107897409303e-08, + "logits/chosen": -0.3951881229877472, + "logits/rejected": 0.7453802824020386, + "logps/chosen": -481.59222412109375, + "logps/rejected": -544.9617919921875, + "loss": 0.4279, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9218416213989258, + "rewards/margins": 1.2567031383514404, + "rewards/rejected": -3.178544521331787, + "step": 7370 + }, + { + "epoch": 0.97, + "learning_rate": 1.7754331555573656e-08, + "logits/chosen": -0.5559927821159363, + "logits/rejected": -0.07539238035678864, + "logps/chosen": -494.11053466796875, + "logps/rejected": -615.8850708007812, + "loss": 0.4835, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.090358018875122, + "rewards/margins": 0.9464312791824341, + "rewards/rejected": -3.0367894172668457, + "step": 7380 + }, + { + "epoch": 0.97, + "learning_rate": 1.642137218390294e-08, + "logits/chosen": -0.41712522506713867, + "logits/rejected": 0.7344776391983032, + "logps/chosen": -474.60247802734375, + "logps/rejected": -519.6890869140625, + "loss": 0.5292, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9687159061431885, + "rewards/margins": 0.937299370765686, + "rewards/rejected": -2.906015396118164, + "step": 7390 + }, + { + "epoch": 0.97, + "learning_rate": 1.514025760797344e-08, + "logits/chosen": -0.8390815854072571, + "logits/rejected": 0.6691681742668152, + "logps/chosen": -529.1636962890625, + "logps/rejected": -570.968017578125, + "loss": 0.4747, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.024813175201416, + "rewards/margins": 1.0949513912200928, + "rewards/rejected": -3.119764804840088, + "step": 7400 + }, + { + "epoch": 0.97, + "eval_logits/chosen": 1.2122125625610352, + "eval_logits/rejected": 1.9678871631622314, + "eval_logps/chosen": -469.9052429199219, + "eval_logps/rejected": -550.9177856445312, + "eval_loss": 0.5057068467140198, + "eval_rewards/accuracies": 0.7325000166893005, + "eval_rewards/chosen": -2.0141522884368896, + "eval_rewards/margins": 1.0092144012451172, + "eval_rewards/rejected": -3.023366689682007, + "eval_runtime": 1179.1327, + "eval_samples_per_second": 1.696, + "eval_steps_per_second": 0.848, + "step": 7400 + }, + { + "epoch": 0.97, + "learning_rate": 1.3911014571098835e-08, + "logits/chosen": -0.4136602282524109, + "logits/rejected": -0.011446630582213402, + "logps/chosen": -441.872314453125, + "logps/rejected": -560.9342041015625, + "loss": 0.4565, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9357706308364868, + "rewards/margins": 1.1018954515457153, + "rewards/rejected": -3.037665843963623, + "step": 7410 + }, + { + "epoch": 0.97, + "learning_rate": 1.2733668733773685e-08, + "logits/chosen": -0.6516170501708984, + "logits/rejected": 0.346381813287735, + "logps/chosen": -461.1044006347656, + "logps/rejected": -534.1959228515625, + "loss": 0.4793, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.893280029296875, + "rewards/margins": 1.116193175315857, + "rewards/rejected": -3.0094730854034424, + "step": 7420 + }, + { + "epoch": 0.97, + "learning_rate": 1.160824467313526e-08, + "logits/chosen": -0.6387825608253479, + "logits/rejected": 0.31129929423332214, + "logps/chosen": -525.2669067382812, + "logps/rejected": -617.6837158203125, + "loss": 0.4894, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.132877826690674, + "rewards/margins": 1.1318700313568115, + "rewards/rejected": -3.2647480964660645, + "step": 7430 + }, + { + "epoch": 0.97, + "learning_rate": 1.0534765882453113e-08, + "logits/chosen": -0.9205204248428345, + "logits/rejected": 0.434356689453125, + "logps/chosen": -449.462890625, + "logps/rejected": -524.9171752929688, + "loss": 0.5114, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.9196815490722656, + "rewards/margins": 0.9198969602584839, + "rewards/rejected": -2.83957839012146, + "step": 7440 + }, + { + "epoch": 0.97, + "learning_rate": 9.513254770636138e-09, + "logits/chosen": -0.3660666048526764, + "logits/rejected": 0.5724458694458008, + "logps/chosen": -530.5885009765625, + "logps/rejected": -592.3338623046875, + "loss": 0.5655, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.310126543045044, + "rewards/margins": 0.7468729019165039, + "rewards/rejected": -3.056999444961548, + "step": 7450 + }, + { + "epoch": 0.98, + "learning_rate": 8.543732661767113e-09, + "logits/chosen": -0.14984729886054993, + "logits/rejected": 0.1948297917842865, + "logps/chosen": -473.29302978515625, + "logps/rejected": -562.1162109375, + "loss": 0.5471, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9029672145843506, + "rewards/margins": 0.876223087310791, + "rewards/rejected": -2.7791905403137207, + "step": 7460 + }, + { + "epoch": 0.98, + "learning_rate": 7.626219794655553e-09, + "logits/chosen": -0.7110680937767029, + "logits/rejected": 0.13748976588249207, + "logps/chosen": -441.283203125, + "logps/rejected": -585.7564086914062, + "loss": 0.4372, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9341871738433838, + "rewards/margins": 1.3025288581848145, + "rewards/rejected": -3.236715793609619, + "step": 7470 + }, + { + "epoch": 0.98, + "learning_rate": 6.7607353224163896e-09, + "logits/chosen": -0.5482068657875061, + "logits/rejected": 0.41012755036354065, + "logps/chosen": -485.97540283203125, + "logps/rejected": -520.5703125, + "loss": 0.5626, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1319339275360107, + "rewards/margins": 0.7751830220222473, + "rewards/rejected": -2.907116413116455, + "step": 7480 + }, + { + "epoch": 0.98, + "learning_rate": 5.947297312070554e-09, + "logits/chosen": -0.6903416514396667, + "logits/rejected": 0.9713506698608398, + "logps/chosen": -506.7986755371094, + "logps/rejected": -553.6888427734375, + "loss": 0.4687, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.060804843902588, + "rewards/margins": 1.1317999362945557, + "rewards/rejected": -3.1926045417785645, + "step": 7490 + }, + { + "epoch": 0.98, + "learning_rate": 5.185922744166128e-09, + "logits/chosen": -0.4292621612548828, + "logits/rejected": 0.4434017539024353, + "logps/chosen": -476.03125, + "logps/rejected": -570.34375, + "loss": 0.4494, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9747473001480103, + "rewards/margins": 1.1069953441619873, + "rewards/rejected": -3.081742525100708, + "step": 7500 + }, + { + "epoch": 0.98, + "eval_logits/chosen": 1.212050437927246, + "eval_logits/rejected": 1.967947244644165, + "eval_logps/chosen": -469.93450927734375, + "eval_logps/rejected": -550.9584350585938, + "eval_loss": 0.5057631134986877, + "eval_rewards/accuracies": 0.7350000143051147, + "eval_rewards/chosen": -2.0144448280334473, + "eval_rewards/margins": 1.0093281269073486, + "eval_rewards/rejected": -3.023772954940796, + "eval_runtime": 1173.1475, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 7500 + }, + { + "epoch": 0.98, + "learning_rate": 4.476627512425558e-09, + "logits/chosen": -0.2599974274635315, + "logits/rejected": -0.058055657893419266, + "logps/chosen": -464.0921936035156, + "logps/rejected": -562.7515869140625, + "loss": 0.4728, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9272934198379517, + "rewards/margins": 1.0648444890975952, + "rewards/rejected": -2.992137908935547, + "step": 7510 + }, + { + "epoch": 0.98, + "learning_rate": 3.819426423412875e-09, + "logits/chosen": -0.6782014966011047, + "logits/rejected": 0.14074628055095673, + "logps/chosen": -494.22979736328125, + "logps/rejected": -555.0218505859375, + "loss": 0.6022, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.09846830368042, + "rewards/margins": 0.9207183122634888, + "rewards/rejected": -3.0191867351531982, + "step": 7520 + }, + { + "epoch": 0.99, + "learning_rate": 3.2143331962256053e-09, + "logits/chosen": -0.24794983863830566, + "logits/rejected": 0.10213349014520645, + "logps/chosen": -464.79620361328125, + "logps/rejected": -570.9036865234375, + "loss": 0.4865, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8833863735198975, + "rewards/margins": 1.067216157913208, + "rewards/rejected": -2.9506025314331055, + "step": 7530 + }, + { + "epoch": 0.99, + "learning_rate": 2.6613604622066635e-09, + "logits/chosen": -0.35508936643600464, + "logits/rejected": -0.39137864112854004, + "logps/chosen": -444.24786376953125, + "logps/rejected": -548.8897705078125, + "loss": 0.5311, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8555755615234375, + "rewards/margins": 0.892242431640625, + "rewards/rejected": -2.7478179931640625, + "step": 7540 + }, + { + "epoch": 0.99, + "learning_rate": 2.1605197646826228e-09, + "logits/chosen": -0.4565364718437195, + "logits/rejected": 0.6792299151420593, + "logps/chosen": -427.64044189453125, + "logps/rejected": -537.2177734375, + "loss": 0.4213, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.849523901939392, + "rewards/margins": 1.3361475467681885, + "rewards/rejected": -3.185671806335449, + "step": 7550 + }, + { + "epoch": 0.99, + "learning_rate": 1.711821558721405e-09, + "logits/chosen": -0.9418695569038391, + "logits/rejected": 0.49443039298057556, + "logps/chosen": -510.28033447265625, + "logps/rejected": -537.0841674804688, + "loss": 0.4911, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0567002296447754, + "rewards/margins": 0.8696410059928894, + "rewards/rejected": -2.9263408184051514, + "step": 7560 + }, + { + "epoch": 0.99, + "learning_rate": 1.3152752109149569e-09, + "logits/chosen": -0.4310362935066223, + "logits/rejected": 0.2431652992963791, + "logps/chosen": -481.66925048828125, + "logps/rejected": -550.9615478515625, + "loss": 0.5462, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9845142364501953, + "rewards/margins": 0.914382815361023, + "rewards/rejected": -2.898897171020508, + "step": 7570 + }, + { + "epoch": 0.99, + "learning_rate": 9.708889991830173e-10, + "logits/chosen": -0.8171736001968384, + "logits/rejected": 0.5985888242721558, + "logps/chosen": -484.5865783691406, + "logps/rejected": -540.4600219726562, + "loss": 0.4686, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.115016460418701, + "rewards/margins": 1.2058589458465576, + "rewards/rejected": -3.3208751678466797, + "step": 7580 + }, + { + "epoch": 0.99, + "learning_rate": 6.786701125999218e-10, + "logits/chosen": 0.09515878558158875, + "logits/rejected": 0.33796870708465576, + "logps/chosen": -481.95538330078125, + "logps/rejected": -555.1029052734375, + "loss": 0.572, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.285205364227295, + "rewards/margins": 0.8154329061508179, + "rewards/rejected": -3.100637912750244, + "step": 7590 + }, + { + "epoch": 0.99, + "learning_rate": 4.3862465124638873e-10, + "logits/chosen": -0.09601716697216034, + "logits/rejected": -0.01322057843208313, + "logps/chosen": -466.84674072265625, + "logps/rejected": -536.1382446289062, + "loss": 0.5319, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0459887981414795, + "rewards/margins": 0.8657206296920776, + "rewards/rejected": -2.9117093086242676, + "step": 7600 + }, + { + "epoch": 0.99, + "eval_logits/chosen": 1.212050437927246, + "eval_logits/rejected": 1.967947244644165, + "eval_logps/chosen": -469.93450927734375, + "eval_logps/rejected": -550.9584350585938, + "eval_loss": 0.5057631134986877, + "eval_rewards/accuracies": 0.7350000143051147, + "eval_rewards/chosen": -2.0144448280334473, + "eval_rewards/margins": 1.0093281269073486, + "eval_rewards/rejected": -3.023772954940796, + "eval_runtime": 1169.363, + "eval_samples_per_second": 1.71, + "eval_steps_per_second": 0.855, + "step": 7600 + }, + { + "epoch": 1.0, + "learning_rate": 2.507576260799005e-10, + "logits/chosen": -0.826134979724884, + "logits/rejected": 0.014235076494514942, + "logps/chosen": -475.70306396484375, + "logps/rejected": -575.2286376953125, + "loss": 0.4462, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.839888334274292, + "rewards/margins": 1.1129884719848633, + "rewards/rejected": -2.952876567840576, + "step": 7610 + }, + { + "epoch": 1.0, + "learning_rate": 1.1507295883145253e-10, + "logits/chosen": -0.5861895680427551, + "logits/rejected": 0.36244645714759827, + "logps/chosen": -472.48651123046875, + "logps/rejected": -567.5523681640625, + "loss": 0.5132, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.945277214050293, + "rewards/margins": 0.905789852142334, + "rewards/rejected": -2.851067543029785, + "step": 7620 + }, + { + "epoch": 1.0, + "learning_rate": 3.1573481923952156e-11, + "logits/chosen": -0.40739935636520386, + "logits/rejected": 0.2489284723997116, + "logps/chosen": -516.2659912109375, + "logps/rejected": -590.5760498046875, + "loss": 0.5475, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.035932779312134, + "rewards/margins": 1.092670202255249, + "rewards/rejected": -3.1286027431488037, + "step": 7630 + }, + { + "epoch": 1.0, + "learning_rate": 2.609384119889313e-13, + "logits/chosen": -0.07999588549137115, + "logits/rejected": 0.025267338380217552, + "logps/chosen": -458.60418701171875, + "logps/rejected": -581.3901977539062, + "loss": 0.4471, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0217761993408203, + "rewards/margins": 1.1485462188720703, + "rewards/rejected": -3.1703224182128906, + "step": 7640 + }, + { + "epoch": 1.0, + "step": 7641, + "total_flos": 0.0, + "train_loss": 0.539378755131, + "train_runtime": 172496.8646, + "train_samples_per_second": 0.354, + "train_steps_per_second": 0.044 + } + ], + "logging_steps": 10, + "max_steps": 7641, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}