diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10772 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 3000, + "global_step": 7642, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 6.535947712418301e-09, + "logits/chosen": -3.067896842956543, + "logits/rejected": -2.665156364440918, + "logps/chosen": -369.56707763671875, + "logps/rejected": -245.21652221679688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 6.535947712418302e-08, + "logits/chosen": -2.7294201850891113, + "logits/rejected": -2.721268892288208, + "logps/chosen": -305.72943115234375, + "logps/rejected": -252.87887573242188, + "loss": 0.6923, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00022612199245486408, + "rewards/margins": 0.0007171333418227732, + "rewards/rejected": -0.0009432554943487048, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 1.3071895424836603e-07, + "logits/chosen": -2.6429858207702637, + "logits/rejected": -2.6347553730010986, + "logps/chosen": -268.5677795410156, + "logps/rejected": -286.9141540527344, + "loss": 0.6933, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00045462269918061793, + "rewards/margins": 0.0008564515737816691, + "rewards/rejected": -0.00040182872908189893, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 1.9607843137254904e-07, + "logits/chosen": -2.811220169067383, + "logits/rejected": -2.7868704795837402, + "logps/chosen": -259.4601135253906, + "logps/rejected": -241.2826690673828, + "loss": 0.693, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.00037817476550117135, + "rewards/margins": -0.0003882342134602368, + "rewards/rejected": 1.0059355190605856e-05, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 2.6143790849673207e-07, + "logits/chosen": -2.758310317993164, + "logits/rejected": -2.7663474082946777, + "logps/chosen": -265.38970947265625, + "logps/rejected": -253.40817260742188, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0002627837238833308, + "rewards/margins": 0.00041533843614161015, + "rewards/rejected": -0.00015255471225827932, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 3.267973856209151e-07, + "logits/chosen": -2.758348226547241, + "logits/rejected": -2.8130736351013184, + "logps/chosen": -314.16876220703125, + "logps/rejected": -267.50469970703125, + "loss": 0.6925, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0012757123913615942, + "rewards/margins": 0.001312709879130125, + "rewards/rejected": -3.6997324059484527e-05, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 3.921568627450981e-07, + "logits/chosen": -2.8405418395996094, + "logits/rejected": -2.814239025115967, + "logps/chosen": -344.8076171875, + "logps/rejected": -274.74407958984375, + "loss": 0.6935, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0006552209961228073, + "rewards/margins": -0.0008447232539765537, + "rewards/rejected": 0.0014999441336840391, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 4.5751633986928105e-07, + "logits/chosen": -2.7691597938537598, + "logits/rejected": -2.734368085861206, + "logps/chosen": -303.10333251953125, + "logps/rejected": -304.51019287109375, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0004421949270181358, + "rewards/margins": 0.0008764710510149598, + "rewards/rejected": -0.000434276123996824, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 5.228758169934641e-07, + "logits/chosen": -2.76478910446167, + "logits/rejected": -2.7242846488952637, + "logps/chosen": -303.20611572265625, + "logps/rejected": -266.6237487792969, + "loss": 0.6933, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.0013080921489745378, + "rewards/margins": -0.0003038823197130114, + "rewards/rejected": -0.0010042100911960006, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 5.882352941176471e-07, + "logits/chosen": -2.7607009410858154, + "logits/rejected": -2.77255916595459, + "logps/chosen": -259.4180603027344, + "logps/rejected": -227.8728790283203, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0019916724413633347, + "rewards/margins": 0.0010361919412389398, + "rewards/rejected": 0.0009554806165397167, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 6.535947712418302e-07, + "logits/chosen": -2.7348523139953613, + "logits/rejected": -2.6490190029144287, + "logps/chosen": -281.78057861328125, + "logps/rejected": -282.218017578125, + "loss": 0.6933, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.0007766232592985034, + "rewards/margins": -0.0009761027176864445, + "rewards/rejected": 0.0017527260351926088, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 7.189542483660131e-07, + "logits/chosen": -2.7022414207458496, + "logits/rejected": -2.7490687370300293, + "logps/chosen": -308.96270751953125, + "logps/rejected": -325.3237609863281, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0012512827524915338, + "rewards/margins": 0.0013003626372665167, + "rewards/rejected": -4.907987386104651e-05, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 7.843137254901962e-07, + "logits/chosen": -2.6600308418273926, + "logits/rejected": -2.6658639907836914, + "logps/chosen": -240.9485626220703, + "logps/rejected": -240.7635498046875, + "loss": 0.6927, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0016876354347914457, + "rewards/margins": 0.0012553991982713342, + "rewards/rejected": 0.00043223617831245065, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 8.496732026143792e-07, + "logits/chosen": -2.559237003326416, + "logits/rejected": -2.610476016998291, + "logps/chosen": -285.5517578125, + "logps/rejected": -235.6249237060547, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0017439329531043768, + "rewards/margins": 0.0025381764862686396, + "rewards/rejected": -0.0007942432421259582, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 9.150326797385621e-07, + "logits/chosen": -2.687140464782715, + "logits/rejected": -2.704700231552124, + "logps/chosen": -245.53781127929688, + "logps/rejected": -239.60733032226562, + "loss": 0.6934, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.0007982795941643417, + "rewards/margins": 0.00011384822573745623, + "rewards/rejected": 0.0006844315794296563, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": -2.648986339569092, + "logits/rejected": -2.6699306964874268, + "logps/chosen": -251.0587615966797, + "logps/rejected": -298.28509521484375, + "loss": 0.6929, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0021498159039765596, + "rewards/margins": 0.000584556080866605, + "rewards/rejected": 0.0015652598813176155, + "step": 150 + }, + { + "epoch": 0.02, + "learning_rate": 1.0457516339869283e-06, + "logits/chosen": -2.6839661598205566, + "logits/rejected": -2.6052136421203613, + "logps/chosen": -253.9778289794922, + "logps/rejected": -262.3018493652344, + "loss": 0.6929, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.0009039879078045487, + "rewards/margins": -0.0014957765815779567, + "rewards/rejected": 0.0023997644893825054, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 1.111111111111111e-06, + "logits/chosen": -2.8574955463409424, + "logits/rejected": -2.737119197845459, + "logps/chosen": -278.136474609375, + "logps/rejected": -230.5788116455078, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.002106861211359501, + "rewards/margins": 0.000668777443934232, + "rewards/rejected": 0.0014380835928022861, + "step": 170 + }, + { + "epoch": 0.02, + "learning_rate": 1.1764705882352942e-06, + "logits/chosen": -2.7418787479400635, + "logits/rejected": -2.746934413909912, + "logps/chosen": -203.57164001464844, + "logps/rejected": -214.1345977783203, + "loss": 0.6925, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0030259876511991024, + "rewards/margins": 0.0011640565935522318, + "rewards/rejected": 0.001861930824816227, + "step": 180 + }, + { + "epoch": 0.02, + "learning_rate": 1.2418300653594772e-06, + "logits/chosen": -2.8132755756378174, + "logits/rejected": -2.8657147884368896, + "logps/chosen": -304.7922058105469, + "logps/rejected": -245.5707550048828, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0030305986292660236, + "rewards/margins": 0.0001166601650766097, + "rewards/rejected": 0.0029139386024326086, + "step": 190 + }, + { + "epoch": 0.03, + "learning_rate": 1.3071895424836604e-06, + "logits/chosen": -2.75506329536438, + "logits/rejected": -2.6703126430511475, + "logps/chosen": -259.27557373046875, + "logps/rejected": -226.0971221923828, + "loss": 0.692, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.005612267181277275, + "rewards/margins": 0.0017307508969679475, + "rewards/rejected": 0.0038815164007246494, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 1.3725490196078434e-06, + "logits/chosen": -2.7886624336242676, + "logits/rejected": -2.743159770965576, + "logps/chosen": -252.05056762695312, + "logps/rejected": -276.8826904296875, + "loss": 0.6927, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.005716738756746054, + "rewards/margins": 0.0016259342664852738, + "rewards/rejected": 0.004090805072337389, + "step": 210 + }, + { + "epoch": 0.03, + "learning_rate": 1.4379084967320261e-06, + "logits/chosen": -2.747499942779541, + "logits/rejected": -2.7848358154296875, + "logps/chosen": -246.60159301757812, + "logps/rejected": -238.0751953125, + "loss": 0.6936, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.005415156949311495, + "rewards/margins": -0.0007758921710774302, + "rewards/rejected": 0.006191048305481672, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 1.5032679738562091e-06, + "logits/chosen": -2.7876715660095215, + "logits/rejected": -2.7275068759918213, + "logps/chosen": -267.59906005859375, + "logps/rejected": -239.3266143798828, + "loss": 0.6924, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.007181273307651281, + "rewards/margins": 0.0002540668356232345, + "rewards/rejected": 0.006927207112312317, + "step": 230 + }, + { + "epoch": 0.03, + "learning_rate": 1.5686274509803923e-06, + "logits/chosen": -2.800996780395508, + "logits/rejected": -2.7003908157348633, + "logps/chosen": -287.8214111328125, + "logps/rejected": -314.76995849609375, + "loss": 0.6921, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.00708800321444869, + "rewards/margins": 0.00336349755525589, + "rewards/rejected": 0.0037245056591928005, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 1.6339869281045753e-06, + "logits/chosen": -2.6484315395355225, + "logits/rejected": -2.700706958770752, + "logps/chosen": -278.3447265625, + "logps/rejected": -267.60650634765625, + "loss": 0.6925, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.006393952760845423, + "rewards/margins": -6.773813220206648e-05, + "rewards/rejected": 0.006461690180003643, + "step": 250 + }, + { + "epoch": 0.03, + "learning_rate": 1.6993464052287585e-06, + "logits/chosen": -2.7541725635528564, + "logits/rejected": -2.7509753704071045, + "logps/chosen": -346.51800537109375, + "logps/rejected": -270.1097106933594, + "loss": 0.693, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.008134925737977028, + "rewards/margins": 0.0005340513889677823, + "rewards/rejected": 0.0076008751057088375, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 1.7647058823529414e-06, + "logits/chosen": -2.8576409816741943, + "logits/rejected": -2.8007590770721436, + "logps/chosen": -306.0218505859375, + "logps/rejected": -275.7831115722656, + "loss": 0.6921, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.011459826491773129, + "rewards/margins": 0.004005993716418743, + "rewards/rejected": 0.00745383370667696, + "step": 270 + }, + { + "epoch": 0.04, + "learning_rate": 1.8300653594771242e-06, + "logits/chosen": -2.70776104927063, + "logits/rejected": -2.7481980323791504, + "logps/chosen": -308.8506774902344, + "logps/rejected": -249.86709594726562, + "loss": 0.691, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.009326718747615814, + "rewards/margins": 0.0025166200939565897, + "rewards/rejected": 0.006810098886489868, + "step": 280 + }, + { + "epoch": 0.04, + "learning_rate": 1.8954248366013072e-06, + "logits/chosen": -2.757758617401123, + "logits/rejected": -2.6980502605438232, + "logps/chosen": -263.22271728515625, + "logps/rejected": -279.9680480957031, + "loss": 0.692, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.011617752723395824, + "rewards/margins": 0.0024096411652863026, + "rewards/rejected": 0.009208111092448235, + "step": 290 + }, + { + "epoch": 0.04, + "learning_rate": 1.96078431372549e-06, + "logits/chosen": -2.820605993270874, + "logits/rejected": -2.8428916931152344, + "logps/chosen": -276.6488952636719, + "logps/rejected": -283.6178283691406, + "loss": 0.6914, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.013900157995522022, + "rewards/margins": 0.0006721949321217835, + "rewards/rejected": 0.013227961957454681, + "step": 300 + }, + { + "epoch": 0.04, + "learning_rate": 2.0261437908496734e-06, + "logits/chosen": -2.6398987770080566, + "logits/rejected": -2.6090402603149414, + "logps/chosen": -302.3424377441406, + "logps/rejected": -250.61837768554688, + "loss": 0.6916, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.015470264479517937, + "rewards/margins": 0.005023065954446793, + "rewards/rejected": 0.010447200387716293, + "step": 310 + }, + { + "epoch": 0.04, + "learning_rate": 2.0915032679738565e-06, + "logits/chosen": -2.735560894012451, + "logits/rejected": -2.798781633377075, + "logps/chosen": -213.24282836914062, + "logps/rejected": -214.4536895751953, + "loss": 0.6904, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.018553482368588448, + "rewards/margins": 0.00482649402692914, + "rewards/rejected": 0.013726988807320595, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 2.1568627450980393e-06, + "logits/chosen": -2.8908467292785645, + "logits/rejected": -2.8173251152038574, + "logps/chosen": -330.4458923339844, + "logps/rejected": -269.921142578125, + "loss": 0.6914, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.022704200819134712, + "rewards/margins": 0.0067848386242985725, + "rewards/rejected": 0.015919361263513565, + "step": 330 + }, + { + "epoch": 0.04, + "learning_rate": 2.222222222222222e-06, + "logits/chosen": -2.707131862640381, + "logits/rejected": -2.7113561630249023, + "logps/chosen": -330.91375732421875, + "logps/rejected": -228.343505859375, + "loss": 0.6895, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.023857425898313522, + "rewards/margins": 0.009241563268005848, + "rewards/rejected": 0.014615865424275398, + "step": 340 + }, + { + "epoch": 0.05, + "learning_rate": 2.2875816993464053e-06, + "logits/chosen": -2.722703218460083, + "logits/rejected": -2.715259075164795, + "logps/chosen": -241.8896484375, + "logps/rejected": -241.798828125, + "loss": 0.6902, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.021002648398280144, + "rewards/margins": 0.003937164321541786, + "rewards/rejected": 0.017065484076738358, + "step": 350 + }, + { + "epoch": 0.05, + "learning_rate": 2.3529411764705885e-06, + "logits/chosen": -2.7021656036376953, + "logits/rejected": -2.629462242126465, + "logps/chosen": -280.72113037109375, + "logps/rejected": -227.78271484375, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02864675782620907, + "rewards/margins": 0.006986622698605061, + "rewards/rejected": 0.021660136058926582, + "step": 360 + }, + { + "epoch": 0.05, + "learning_rate": 2.4183006535947716e-06, + "logits/chosen": -2.811124563217163, + "logits/rejected": -2.8341519832611084, + "logps/chosen": -246.65103149414062, + "logps/rejected": -397.9068908691406, + "loss": 0.6899, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.023113662376999855, + "rewards/margins": 0.002985612954944372, + "rewards/rejected": 0.020128050819039345, + "step": 370 + }, + { + "epoch": 0.05, + "learning_rate": 2.4836601307189544e-06, + "logits/chosen": -2.734302043914795, + "logits/rejected": -2.693673610687256, + "logps/chosen": -230.55703735351562, + "logps/rejected": -209.6863555908203, + "loss": 0.6903, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.025653624907135963, + "rewards/margins": 0.005581668112426996, + "rewards/rejected": 0.020071957260370255, + "step": 380 + }, + { + "epoch": 0.05, + "learning_rate": 2.549019607843137e-06, + "logits/chosen": -2.738853931427002, + "logits/rejected": -2.6872971057891846, + "logps/chosen": -309.9087829589844, + "logps/rejected": -272.66424560546875, + "loss": 0.6884, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.030913596972823143, + "rewards/margins": 0.009090432897210121, + "rewards/rejected": 0.021823160350322723, + "step": 390 + }, + { + "epoch": 0.05, + "learning_rate": 2.6143790849673208e-06, + "logits/chosen": -2.793600082397461, + "logits/rejected": -2.8919758796691895, + "logps/chosen": -297.2984313964844, + "logps/rejected": -287.2091369628906, + "loss": 0.6905, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03484376519918442, + "rewards/margins": 0.005045943893492222, + "rewards/rejected": 0.02979782596230507, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 2.6797385620915036e-06, + "logits/chosen": -2.8387460708618164, + "logits/rejected": -2.809161901473999, + "logps/chosen": -258.84844970703125, + "logps/rejected": -240.54306030273438, + "loss": 0.6903, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.033333465456962585, + "rewards/margins": 0.009452047757804394, + "rewards/rejected": 0.023881418630480766, + "step": 410 + }, + { + "epoch": 0.05, + "learning_rate": 2.7450980392156867e-06, + "logits/chosen": -2.7520031929016113, + "logits/rejected": -2.7394309043884277, + "logps/chosen": -402.2895812988281, + "logps/rejected": -370.82952880859375, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03959353268146515, + "rewards/margins": 0.009067794308066368, + "rewards/rejected": 0.03052573837339878, + "step": 420 + }, + { + "epoch": 0.06, + "learning_rate": 2.8104575163398695e-06, + "logits/chosen": -2.693946361541748, + "logits/rejected": -2.6700122356414795, + "logps/chosen": -288.62689208984375, + "logps/rejected": -290.95660400390625, + "loss": 0.6871, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.037793610244989395, + "rewards/margins": 0.011007636785507202, + "rewards/rejected": 0.026785975322127342, + "step": 430 + }, + { + "epoch": 0.06, + "learning_rate": 2.8758169934640523e-06, + "logits/chosen": -2.797002077102661, + "logits/rejected": -2.8516902923583984, + "logps/chosen": -294.93109130859375, + "logps/rejected": -271.66290283203125, + "loss": 0.6883, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.04043292626738548, + "rewards/margins": 0.009669439867138863, + "rewards/rejected": 0.03076348826289177, + "step": 440 + }, + { + "epoch": 0.06, + "learning_rate": 2.9411764705882355e-06, + "logits/chosen": -2.694373607635498, + "logits/rejected": -2.6590065956115723, + "logps/chosen": -239.82382202148438, + "logps/rejected": -249.83810424804688, + "loss": 0.6886, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0385117270052433, + "rewards/margins": 0.011470241472125053, + "rewards/rejected": 0.0270414836704731, + "step": 450 + }, + { + "epoch": 0.06, + "learning_rate": 3.0065359477124182e-06, + "logits/chosen": -2.7474026679992676, + "logits/rejected": -2.676415205001831, + "logps/chosen": -298.39715576171875, + "logps/rejected": -192.79092407226562, + "loss": 0.6844, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.05105415731668472, + "rewards/margins": 0.02257906273007393, + "rewards/rejected": 0.028475087136030197, + "step": 460 + }, + { + "epoch": 0.06, + "learning_rate": 3.071895424836602e-06, + "logits/chosen": -2.775913715362549, + "logits/rejected": -2.8217592239379883, + "logps/chosen": -265.2435607910156, + "logps/rejected": -244.2783660888672, + "loss": 0.6872, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0432317815721035, + "rewards/margins": 0.012395241297781467, + "rewards/rejected": 0.03083653748035431, + "step": 470 + }, + { + "epoch": 0.06, + "learning_rate": 3.1372549019607846e-06, + "logits/chosen": -2.7269763946533203, + "logits/rejected": -2.740833044052124, + "logps/chosen": -289.32708740234375, + "logps/rejected": -312.49078369140625, + "loss": 0.6899, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.04812690615653992, + "rewards/margins": 0.004453951492905617, + "rewards/rejected": 0.04367295280098915, + "step": 480 + }, + { + "epoch": 0.06, + "learning_rate": 3.2026143790849674e-06, + "logits/chosen": -2.8748435974121094, + "logits/rejected": -2.7451539039611816, + "logps/chosen": -275.0404968261719, + "logps/rejected": -244.89865112304688, + "loss": 0.6844, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04758143052458763, + "rewards/margins": 0.021307654678821564, + "rewards/rejected": 0.026273775845766068, + "step": 490 + }, + { + "epoch": 0.07, + "learning_rate": 3.2679738562091506e-06, + "logits/chosen": -2.796952486038208, + "logits/rejected": -2.8082685470581055, + "logps/chosen": -236.18270874023438, + "logps/rejected": -240.5693359375, + "loss": 0.6859, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.047277629375457764, + "rewards/margins": 0.020680280402302742, + "rewards/rejected": 0.02659735083580017, + "step": 500 + }, + { + "epoch": 0.07, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -2.839818239212036, + "logits/rejected": -2.8162848949432373, + "logps/chosen": -358.53143310546875, + "logps/rejected": -258.11370849609375, + "loss": 0.6802, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.06839190423488617, + "rewards/margins": 0.03216848522424698, + "rewards/rejected": 0.036223411560058594, + "step": 510 + }, + { + "epoch": 0.07, + "learning_rate": 3.398692810457517e-06, + "logits/chosen": -2.818481922149658, + "logits/rejected": -2.7945566177368164, + "logps/chosen": -306.904296875, + "logps/rejected": -250.4951934814453, + "loss": 0.6842, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.05793602019548416, + "rewards/margins": 0.01893232949078083, + "rewards/rejected": 0.03900368884205818, + "step": 520 + }, + { + "epoch": 0.07, + "learning_rate": 3.4640522875816997e-06, + "logits/chosen": -2.7927000522613525, + "logits/rejected": -2.7819037437438965, + "logps/chosen": -333.9432067871094, + "logps/rejected": -279.5553894042969, + "loss": 0.685, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.06334703415632248, + "rewards/margins": 0.021390151232481003, + "rewards/rejected": 0.041956886649131775, + "step": 530 + }, + { + "epoch": 0.07, + "learning_rate": 3.529411764705883e-06, + "logits/chosen": -2.8078837394714355, + "logits/rejected": -2.8248987197875977, + "logps/chosen": -310.5298156738281, + "logps/rejected": -290.93121337890625, + "loss": 0.6846, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.06569667905569077, + "rewards/margins": 0.014290650375187397, + "rewards/rejected": 0.051406025886535645, + "step": 540 + }, + { + "epoch": 0.07, + "learning_rate": 3.5947712418300657e-06, + "logits/chosen": -2.673478603363037, + "logits/rejected": -2.5919220447540283, + "logps/chosen": -280.1520080566406, + "logps/rejected": -226.1075439453125, + "loss": 0.6784, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.067818284034729, + "rewards/margins": 0.03553170710802078, + "rewards/rejected": 0.03228657692670822, + "step": 550 + }, + { + "epoch": 0.07, + "learning_rate": 3.6601307189542484e-06, + "logits/chosen": -2.7392077445983887, + "logits/rejected": -2.809436321258545, + "logps/chosen": -273.9886474609375, + "logps/rejected": -258.1772155761719, + "loss": 0.6818, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05769692733883858, + "rewards/margins": 0.02829897403717041, + "rewards/rejected": 0.029397953301668167, + "step": 560 + }, + { + "epoch": 0.07, + "learning_rate": 3.7254901960784316e-06, + "logits/chosen": -2.6483209133148193, + "logits/rejected": -2.743899345397949, + "logps/chosen": -318.599609375, + "logps/rejected": -243.72738647460938, + "loss": 0.6791, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.06688439100980759, + "rewards/margins": 0.040116921067237854, + "rewards/rejected": 0.026767458766698837, + "step": 570 + }, + { + "epoch": 0.08, + "learning_rate": 3.7908496732026144e-06, + "logits/chosen": -2.718994617462158, + "logits/rejected": -2.6787586212158203, + "logps/chosen": -329.4825744628906, + "logps/rejected": -241.14859008789062, + "loss": 0.677, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.06385556608438492, + "rewards/margins": 0.0331583209335804, + "rewards/rejected": 0.030697250738739967, + "step": 580 + }, + { + "epoch": 0.08, + "learning_rate": 3.856209150326798e-06, + "logits/chosen": -2.7615089416503906, + "logits/rejected": -2.7959094047546387, + "logps/chosen": -241.9472198486328, + "logps/rejected": -234.4981689453125, + "loss": 0.6809, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.056456904858350754, + "rewards/margins": 0.02373141422867775, + "rewards/rejected": 0.032725490629673004, + "step": 590 + }, + { + "epoch": 0.08, + "learning_rate": 3.92156862745098e-06, + "logits/chosen": -2.746553421020508, + "logits/rejected": -2.781179904937744, + "logps/chosen": -279.4906921386719, + "logps/rejected": -272.57916259765625, + "loss": 0.6777, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.05568607896566391, + "rewards/margins": 0.036982376128435135, + "rewards/rejected": 0.018703702837228775, + "step": 600 + }, + { + "epoch": 0.08, + "learning_rate": 3.986928104575164e-06, + "logits/chosen": -2.6201729774475098, + "logits/rejected": -2.673488140106201, + "logps/chosen": -332.6716613769531, + "logps/rejected": -283.5208740234375, + "loss": 0.6796, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04745926707983017, + "rewards/margins": 0.037932686507701874, + "rewards/rejected": 0.009526585228741169, + "step": 610 + }, + { + "epoch": 0.08, + "learning_rate": 4.052287581699347e-06, + "logits/chosen": -2.7669689655303955, + "logits/rejected": -2.750363349914551, + "logps/chosen": -279.34197998046875, + "logps/rejected": -259.713134765625, + "loss": 0.674, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05433487892150879, + "rewards/margins": 0.026942182332277298, + "rewards/rejected": 0.027392691001296043, + "step": 620 + }, + { + "epoch": 0.08, + "learning_rate": 4.11764705882353e-06, + "logits/chosen": -2.6598455905914307, + "logits/rejected": -2.7436347007751465, + "logps/chosen": -227.24819946289062, + "logps/rejected": -211.1447296142578, + "loss": 0.6771, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03060081973671913, + "rewards/margins": 0.025880303233861923, + "rewards/rejected": 0.004720507655292749, + "step": 630 + }, + { + "epoch": 0.08, + "learning_rate": 4.183006535947713e-06, + "logits/chosen": -2.730870008468628, + "logits/rejected": -2.763122081756592, + "logps/chosen": -295.49224853515625, + "logps/rejected": -313.200927734375, + "loss": 0.6708, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.056504249572753906, + "rewards/margins": 0.04328325390815735, + "rewards/rejected": 0.013220993801951408, + "step": 640 + }, + { + "epoch": 0.09, + "learning_rate": 4.2483660130718954e-06, + "logits/chosen": -2.687933921813965, + "logits/rejected": -2.7529239654541016, + "logps/chosen": -317.00494384765625, + "logps/rejected": -285.4657287597656, + "loss": 0.6827, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0442182794213295, + "rewards/margins": 0.01487693376839161, + "rewards/rejected": 0.02934134379029274, + "step": 650 + }, + { + "epoch": 0.09, + "learning_rate": 4.313725490196079e-06, + "logits/chosen": -2.7617669105529785, + "logits/rejected": -2.7793593406677246, + "logps/chosen": -241.38919067382812, + "logps/rejected": -202.88455200195312, + "loss": 0.6762, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04204503446817398, + "rewards/margins": 0.034246526658535004, + "rewards/rejected": 0.0077985054813325405, + "step": 660 + }, + { + "epoch": 0.09, + "learning_rate": 4.379084967320262e-06, + "logits/chosen": -2.7490274906158447, + "logits/rejected": -2.7165563106536865, + "logps/chosen": -260.8857727050781, + "logps/rejected": -264.39337158203125, + "loss": 0.676, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.05145852640271187, + "rewards/margins": 0.0430302657186985, + "rewards/rejected": 0.008428258821368217, + "step": 670 + }, + { + "epoch": 0.09, + "learning_rate": 4.444444444444444e-06, + "logits/chosen": -2.563361167907715, + "logits/rejected": -2.665355920791626, + "logps/chosen": -223.7394561767578, + "logps/rejected": -199.46377563476562, + "loss": 0.6705, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03764995187520981, + "rewards/margins": 0.04758134484291077, + "rewards/rejected": -0.009931390173733234, + "step": 680 + }, + { + "epoch": 0.09, + "learning_rate": 4.509803921568628e-06, + "logits/chosen": -2.6653246879577637, + "logits/rejected": -2.666923761367798, + "logps/chosen": -230.67587280273438, + "logps/rejected": -215.2616729736328, + "loss": 0.6711, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.012365647591650486, + "rewards/margins": 0.053419359028339386, + "rewards/rejected": -0.041053708642721176, + "step": 690 + }, + { + "epoch": 0.09, + "learning_rate": 4.5751633986928105e-06, + "logits/chosen": -2.7226383686065674, + "logits/rejected": -2.613816022872925, + "logps/chosen": -315.6794128417969, + "logps/rejected": -273.4043884277344, + "loss": 0.6735, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0013524688547477126, + "rewards/margins": 0.03125739097595215, + "rewards/rejected": -0.032609861344099045, + "step": 700 + }, + { + "epoch": 0.09, + "learning_rate": 4.640522875816994e-06, + "logits/chosen": -2.6579039096832275, + "logits/rejected": -2.666107654571533, + "logps/chosen": -329.4698791503906, + "logps/rejected": -275.6382751464844, + "loss": 0.6634, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.03630353882908821, + "rewards/margins": 0.08792858570814133, + "rewards/rejected": -0.05162503570318222, + "step": 710 + }, + { + "epoch": 0.09, + "learning_rate": 4.705882352941177e-06, + "logits/chosen": -2.734612226486206, + "logits/rejected": -2.704101085662842, + "logps/chosen": -247.69021606445312, + "logps/rejected": -269.6416931152344, + "loss": 0.6621, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03198099881410599, + "rewards/margins": 0.05677540972828865, + "rewards/rejected": -0.02479441836476326, + "step": 720 + }, + { + "epoch": 0.1, + "learning_rate": 4.77124183006536e-06, + "logits/chosen": -2.828014612197876, + "logits/rejected": -2.7556374073028564, + "logps/chosen": -292.7957763671875, + "logps/rejected": -267.17950439453125, + "loss": 0.6651, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.022923732176423073, + "rewards/margins": 0.062076129019260406, + "rewards/rejected": -0.039152394980192184, + "step": 730 + }, + { + "epoch": 0.1, + "learning_rate": 4.836601307189543e-06, + "logits/chosen": -2.6186816692352295, + "logits/rejected": -2.6504456996917725, + "logps/chosen": -295.7213134765625, + "logps/rejected": -278.32220458984375, + "loss": 0.6666, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.009617263451218605, + "rewards/margins": 0.08382589370012283, + "rewards/rejected": -0.07420863211154938, + "step": 740 + }, + { + "epoch": 0.1, + "learning_rate": 4.901960784313726e-06, + "logits/chosen": -2.5760602951049805, + "logits/rejected": -2.6215083599090576, + "logps/chosen": -306.38812255859375, + "logps/rejected": -266.0890197753906, + "loss": 0.6678, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0065828850492835045, + "rewards/margins": 0.06601814180612564, + "rewards/rejected": -0.07260102778673172, + "step": 750 + }, + { + "epoch": 0.1, + "learning_rate": 4.967320261437909e-06, + "logits/chosen": -2.6420629024505615, + "logits/rejected": -2.581559658050537, + "logps/chosen": -244.5784149169922, + "logps/rejected": -190.7187042236328, + "loss": 0.671, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.02735377475619316, + "rewards/margins": 0.06146436184644699, + "rewards/rejected": -0.08881814777851105, + "step": 760 + }, + { + "epoch": 0.1, + "learning_rate": 4.99999347843947e-06, + "logits/chosen": -2.762193202972412, + "logits/rejected": -2.695753335952759, + "logps/chosen": -264.07281494140625, + "logps/rejected": -236.6415252685547, + "loss": 0.6679, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0426148995757103, + "rewards/margins": 0.04322836920619011, + "rewards/rejected": -0.0858432799577713, + "step": 770 + }, + { + "epoch": 0.1, + "learning_rate": 4.999941306159375e-06, + "logits/chosen": -2.6757876873016357, + "logits/rejected": -2.732694625854492, + "logps/chosen": -310.57177734375, + "logps/rejected": -302.04022216796875, + "loss": 0.6674, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06698893010616302, + "rewards/margins": 0.057061631232500076, + "rewards/rejected": -0.124050572514534, + "step": 780 + }, + { + "epoch": 0.1, + "learning_rate": 4.999836962687967e-06, + "logits/chosen": -2.5134544372558594, + "logits/rejected": -2.511914014816284, + "logps/chosen": -270.5887756347656, + "logps/rejected": -303.32989501953125, + "loss": 0.6746, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04029134660959244, + "rewards/margins": 0.0430169552564621, + "rewards/rejected": -0.08330829441547394, + "step": 790 + }, + { + "epoch": 0.1, + "learning_rate": 4.999680450202786e-06, + "logits/chosen": -2.5781846046447754, + "logits/rejected": -2.5485153198242188, + "logps/chosen": -348.46002197265625, + "logps/rejected": -310.71502685546875, + "loss": 0.6561, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.00026362837525084615, + "rewards/margins": 0.12150361388921738, + "rewards/rejected": -0.12176723778247833, + "step": 800 + }, + { + "epoch": 0.11, + "learning_rate": 4.999471771970087e-06, + "logits/chosen": -2.750290870666504, + "logits/rejected": -2.80876088142395, + "logps/chosen": -239.05850219726562, + "logps/rejected": -216.97216796875, + "loss": 0.6682, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10349904000759125, + "rewards/margins": 0.0836646780371666, + "rewards/rejected": -0.18716372549533844, + "step": 810 + }, + { + "epoch": 0.11, + "learning_rate": 4.999210932344767e-06, + "logits/chosen": -2.7766480445861816, + "logits/rejected": -2.6801209449768066, + "logps/chosen": -314.65838623046875, + "logps/rejected": -296.15484619140625, + "loss": 0.6541, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07962436974048615, + "rewards/margins": 0.10081206262111664, + "rewards/rejected": -0.18043644726276398, + "step": 820 + }, + { + "epoch": 0.11, + "learning_rate": 4.998897936770281e-06, + "logits/chosen": -2.572793483734131, + "logits/rejected": -2.4554505348205566, + "logps/chosen": -254.23965454101562, + "logps/rejected": -241.06307983398438, + "loss": 0.645, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11069433391094208, + "rewards/margins": 0.054053209722042084, + "rewards/rejected": -0.16474755108356476, + "step": 830 + }, + { + "epoch": 0.11, + "learning_rate": 4.998532791778521e-06, + "logits/chosen": -2.6647446155548096, + "logits/rejected": -2.647819995880127, + "logps/chosen": -298.02001953125, + "logps/rejected": -238.2508544921875, + "loss": 0.6581, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0700971931219101, + "rewards/margins": 0.06591956317424774, + "rewards/rejected": -0.13601675629615784, + "step": 840 + }, + { + "epoch": 0.11, + "learning_rate": 4.9981155049896885e-06, + "logits/chosen": -2.6755619049072266, + "logits/rejected": -2.667574644088745, + "logps/chosen": -298.78466796875, + "logps/rejected": -265.919189453125, + "loss": 0.6451, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03017922304570675, + "rewards/margins": 0.11484507471323013, + "rewards/rejected": -0.14502426981925964, + "step": 850 + }, + { + "epoch": 0.11, + "learning_rate": 4.997646085112126e-06, + "logits/chosen": -2.736441135406494, + "logits/rejected": -2.7532594203948975, + "logps/chosen": -291.0871887207031, + "logps/rejected": -251.4500732421875, + "loss": 0.6575, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.028814245015382767, + "rewards/margins": 0.08863598853349686, + "rewards/rejected": -0.11745022237300873, + "step": 860 + }, + { + "epoch": 0.11, + "learning_rate": 4.997124541942141e-06, + "logits/chosen": -2.5254862308502197, + "logits/rejected": -2.607194185256958, + "logps/chosen": -274.9963073730469, + "logps/rejected": -305.16888427734375, + "loss": 0.6421, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03656459599733353, + "rewards/margins": 0.08495207875967026, + "rewards/rejected": -0.04838749021291733, + "step": 870 + }, + { + "epoch": 0.12, + "learning_rate": 4.996550886363801e-06, + "logits/chosen": -2.733109951019287, + "logits/rejected": -2.7724852561950684, + "logps/chosen": -326.680908203125, + "logps/rejected": -314.22906494140625, + "loss": 0.6622, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04826091602444649, + "rewards/margins": 0.06839245557785034, + "rewards/rejected": -0.020131543278694153, + "step": 880 + }, + { + "epoch": 0.12, + "learning_rate": 4.995925130348706e-06, + "logits/chosen": -2.6151180267333984, + "logits/rejected": -2.574157238006592, + "logps/chosen": -343.59271240234375, + "logps/rejected": -288.0701904296875, + "loss": 0.6672, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.024858742952346802, + "rewards/margins": 0.08349689841270447, + "rewards/rejected": -0.10835564136505127, + "step": 890 + }, + { + "epoch": 0.12, + "learning_rate": 4.995247286955734e-06, + "logits/chosen": -2.652498960494995, + "logits/rejected": -2.627878189086914, + "logps/chosen": -291.9495544433594, + "logps/rejected": -290.9588928222656, + "loss": 0.6473, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.004280113615095615, + "rewards/margins": 0.1024482250213623, + "rewards/rejected": -0.10672833770513535, + "step": 900 + }, + { + "epoch": 0.12, + "learning_rate": 4.994517370330779e-06, + "logits/chosen": -2.63486909866333, + "logits/rejected": -2.623075485229492, + "logps/chosen": -272.341064453125, + "logps/rejected": -288.5429382324219, + "loss": 0.6401, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.023534759879112244, + "rewards/margins": 0.1517385095357895, + "rewards/rejected": -0.12820376455783844, + "step": 910 + }, + { + "epoch": 0.12, + "learning_rate": 4.993735395706446e-06, + "logits/chosen": -2.6047418117523193, + "logits/rejected": -2.560774564743042, + "logps/chosen": -416.06964111328125, + "logps/rejected": -440.61358642578125, + "loss": 0.6594, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.15959739685058594, + "rewards/margins": 0.06939327716827393, + "rewards/rejected": -0.22899067401885986, + "step": 920 + }, + { + "epoch": 0.12, + "learning_rate": 4.992901379401737e-06, + "logits/chosen": -2.6060338020324707, + "logits/rejected": -2.5110888481140137, + "logps/chosen": -231.749267578125, + "logps/rejected": -263.95318603515625, + "loss": 0.6327, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.02359617128968239, + "rewards/margins": 0.15627136826515198, + "rewards/rejected": -0.17986753582954407, + "step": 930 + }, + { + "epoch": 0.12, + "learning_rate": 4.992015338821711e-06, + "logits/chosen": -2.643939256668091, + "logits/rejected": -2.598416328430176, + "logps/chosen": -279.15447998046875, + "logps/rejected": -222.61776733398438, + "loss": 0.6508, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05902577564120293, + "rewards/margins": 0.08327849209308624, + "rewards/rejected": -0.14230427145957947, + "step": 940 + }, + { + "epoch": 0.12, + "learning_rate": 4.991077292457117e-06, + "logits/chosen": -2.641256809234619, + "logits/rejected": -2.6436831951141357, + "logps/chosen": -197.23062133789062, + "logps/rejected": -190.83807373046875, + "loss": 0.6261, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.035204172134399414, + "rewards/margins": 0.17812921106815338, + "rewards/rejected": -0.14292503893375397, + "step": 950 + }, + { + "epoch": 0.13, + "learning_rate": 4.990087259884016e-06, + "logits/chosen": -2.672856092453003, + "logits/rejected": -2.6397016048431396, + "logps/chosen": -304.98699951171875, + "logps/rejected": -304.4772033691406, + "loss": 0.654, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1001698225736618, + "rewards/margins": 0.13911756873130798, + "rewards/rejected": -0.23928740620613098, + "step": 960 + }, + { + "epoch": 0.13, + "learning_rate": 4.989045261763362e-06, + "logits/chosen": -2.631251811981201, + "logits/rejected": -2.607426881790161, + "logps/chosen": -281.02325439453125, + "logps/rejected": -283.25994873046875, + "loss": 0.6402, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09959861636161804, + "rewards/margins": 0.12125315517187119, + "rewards/rejected": -0.22085174918174744, + "step": 970 + }, + { + "epoch": 0.13, + "learning_rate": 4.98795131984058e-06, + "logits/chosen": -2.448883056640625, + "logits/rejected": -2.471648693084717, + "logps/chosen": -316.79083251953125, + "logps/rejected": -263.4494323730469, + "loss": 0.6458, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10418669134378433, + "rewards/margins": 0.1183033436536789, + "rewards/rejected": -0.22249004244804382, + "step": 980 + }, + { + "epoch": 0.13, + "learning_rate": 4.986805456945107e-06, + "logits/chosen": -2.5442581176757812, + "logits/rejected": -2.6185083389282227, + "logps/chosen": -321.2454528808594, + "logps/rejected": -309.71356201171875, + "loss": 0.6323, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12658333778381348, + "rewards/margins": 0.2301892787218094, + "rewards/rejected": -0.3567725718021393, + "step": 990 + }, + { + "epoch": 0.13, + "learning_rate": 4.985607696989919e-06, + "logits/chosen": -2.618556022644043, + "logits/rejected": -2.543536901473999, + "logps/chosen": -314.64532470703125, + "logps/rejected": -308.508056640625, + "loss": 0.6642, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07881616055965424, + "rewards/margins": 0.12552544474601746, + "rewards/rejected": -0.2043416053056717, + "step": 1000 + }, + { + "epoch": 0.13, + "learning_rate": 4.984358064971026e-06, + "logits/chosen": -2.667376756668091, + "logits/rejected": -2.666207790374756, + "logps/chosen": -309.4914245605469, + "logps/rejected": -300.64337158203125, + "loss": 0.6223, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.0031271413899958134, + "rewards/margins": 0.1839439570903778, + "rewards/rejected": -0.1870710849761963, + "step": 1010 + }, + { + "epoch": 0.13, + "learning_rate": 4.983056586966958e-06, + "logits/chosen": -2.538663625717163, + "logits/rejected": -2.5320019721984863, + "logps/chosen": -302.5948486328125, + "logps/rejected": -304.9889221191406, + "loss": 0.6103, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.012503793463110924, + "rewards/margins": 0.1832227259874344, + "rewards/rejected": -0.1707189381122589, + "step": 1020 + }, + { + "epoch": 0.13, + "learning_rate": 4.981703290138215e-06, + "logits/chosen": -2.651843547821045, + "logits/rejected": -2.646949291229248, + "logps/chosen": -334.045654296875, + "logps/rejected": -309.9171142578125, + "loss": 0.644, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.08129630982875824, + "rewards/margins": 0.06906045973300934, + "rewards/rejected": -0.15035676956176758, + "step": 1030 + }, + { + "epoch": 0.14, + "learning_rate": 4.980298202726706e-06, + "logits/chosen": -2.632660150527954, + "logits/rejected": -2.6133508682250977, + "logps/chosen": -311.20172119140625, + "logps/rejected": -317.9689636230469, + "loss": 0.6527, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0033264122903347015, + "rewards/margins": 0.09684255719184875, + "rewards/rejected": -0.10016896575689316, + "step": 1040 + }, + { + "epoch": 0.14, + "learning_rate": 4.978841354055148e-06, + "logits/chosen": -2.588775873184204, + "logits/rejected": -2.5144200325012207, + "logps/chosen": -241.2130126953125, + "logps/rejected": -248.9693603515625, + "loss": 0.6092, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.08562804758548737, + "rewards/margins": 0.17738834023475647, + "rewards/rejected": -0.26301640272140503, + "step": 1050 + }, + { + "epoch": 0.14, + "learning_rate": 4.977332774526471e-06, + "logits/chosen": -2.4832615852355957, + "logits/rejected": -2.5900285243988037, + "logps/chosen": -255.9525146484375, + "logps/rejected": -286.47174072265625, + "loss": 0.6421, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10017786920070648, + "rewards/margins": 0.14203806221485138, + "rewards/rejected": -0.24221591651439667, + "step": 1060 + }, + { + "epoch": 0.14, + "learning_rate": 4.97577249562317e-06, + "logits/chosen": -2.6250522136688232, + "logits/rejected": -2.5602431297302246, + "logps/chosen": -274.66192626953125, + "logps/rejected": -315.9530029296875, + "loss": 0.6168, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.17169487476348877, + "rewards/margins": 0.13329385221004486, + "rewards/rejected": -0.3049887418746948, + "step": 1070 + }, + { + "epoch": 0.14, + "learning_rate": 4.974160549906652e-06, + "logits/chosen": -2.683316946029663, + "logits/rejected": -2.7179713249206543, + "logps/chosen": -345.38873291015625, + "logps/rejected": -312.57989501953125, + "loss": 0.615, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.037032462656497955, + "rewards/margins": 0.15039598941802979, + "rewards/rejected": -0.11336354166269302, + "step": 1080 + }, + { + "epoch": 0.14, + "learning_rate": 4.972496971016559e-06, + "logits/chosen": -2.566483736038208, + "logits/rejected": -2.5443625450134277, + "logps/chosen": -267.92767333984375, + "logps/rejected": -248.58706665039062, + "loss": 0.6186, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.06917119026184082, + "rewards/margins": 0.1513262540102005, + "rewards/rejected": -0.08215506374835968, + "step": 1090 + }, + { + "epoch": 0.14, + "learning_rate": 4.9707817936700635e-06, + "logits/chosen": -2.62837815284729, + "logits/rejected": -2.5987088680267334, + "logps/chosen": -284.10528564453125, + "logps/rejected": -254.57406616210938, + "loss": 0.6723, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.10585768520832062, + "rewards/margins": 0.04544953256845474, + "rewards/rejected": -0.15130721032619476, + "step": 1100 + }, + { + "epoch": 0.15, + "learning_rate": 4.969015053661142e-06, + "logits/chosen": -2.5701441764831543, + "logits/rejected": -2.596679210662842, + "logps/chosen": -312.38568115234375, + "logps/rejected": -296.02557373046875, + "loss": 0.6286, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07000347226858139, + "rewards/margins": 0.2007255256175995, + "rewards/rejected": -0.2707290053367615, + "step": 1110 + }, + { + "epoch": 0.15, + "learning_rate": 4.967196787859835e-06, + "logits/chosen": -2.5174918174743652, + "logits/rejected": -2.529386043548584, + "logps/chosen": -353.6432800292969, + "logps/rejected": -331.7977294921875, + "loss": 0.6598, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.05754435062408447, + "rewards/margins": 0.031603433191776276, + "rewards/rejected": -0.08914779126644135, + "step": 1120 + }, + { + "epoch": 0.15, + "learning_rate": 4.965327034211469e-06, + "logits/chosen": -2.541290283203125, + "logits/rejected": -2.5290510654449463, + "logps/chosen": -274.7214660644531, + "logps/rejected": -217.9346923828125, + "loss": 0.6394, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.025605330243706703, + "rewards/margins": 0.15105733275413513, + "rewards/rejected": -0.12545201182365417, + "step": 1130 + }, + { + "epoch": 0.15, + "learning_rate": 4.96340583173587e-06, + "logits/chosen": -2.6864686012268066, + "logits/rejected": -2.6411499977111816, + "logps/chosen": -335.83465576171875, + "logps/rejected": -310.6993713378906, + "loss": 0.619, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.03760487958788872, + "rewards/margins": 0.19487908482551575, + "rewards/rejected": -0.15727417171001434, + "step": 1140 + }, + { + "epoch": 0.15, + "learning_rate": 4.96143322052655e-06, + "logits/chosen": -2.5475759506225586, + "logits/rejected": -2.5627975463867188, + "logps/chosen": -257.0076599121094, + "logps/rejected": -291.53070068359375, + "loss": 0.6458, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.017704404890537262, + "rewards/margins": 0.10867403447628021, + "rewards/rejected": -0.09096963703632355, + "step": 1150 + }, + { + "epoch": 0.15, + "learning_rate": 4.959409241749864e-06, + "logits/chosen": -2.4454431533813477, + "logits/rejected": -2.4386324882507324, + "logps/chosen": -281.3999938964844, + "logps/rejected": -298.0575866699219, + "loss": 0.6432, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06157253310084343, + "rewards/margins": 0.19012895226478577, + "rewards/rejected": -0.2517014741897583, + "step": 1160 + }, + { + "epoch": 0.15, + "learning_rate": 4.957333937644159e-06, + "logits/chosen": -2.6096065044403076, + "logits/rejected": -2.5484375953674316, + "logps/chosen": -228.22763061523438, + "logps/rejected": -247.7223663330078, + "loss": 0.6063, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07591332495212555, + "rewards/margins": 0.18348181247711182, + "rewards/rejected": -0.2593950927257538, + "step": 1170 + }, + { + "epoch": 0.15, + "learning_rate": 4.955207351518885e-06, + "logits/chosen": -2.6323444843292236, + "logits/rejected": -2.6869893074035645, + "logps/chosen": -321.0739440917969, + "logps/rejected": -287.7506408691406, + "loss": 0.6641, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09519028663635254, + "rewards/margins": 0.14201217889785767, + "rewards/rejected": -0.2372024804353714, + "step": 1180 + }, + { + "epoch": 0.16, + "learning_rate": 4.953029527753699e-06, + "logits/chosen": -2.4826338291168213, + "logits/rejected": -2.433850049972534, + "logps/chosen": -317.3799133300781, + "logps/rejected": -308.22442626953125, + "loss": 0.6561, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.07348557561635971, + "rewards/margins": 0.1411493420600891, + "rewards/rejected": -0.21463492512702942, + "step": 1190 + }, + { + "epoch": 0.16, + "learning_rate": 4.95080051179753e-06, + "logits/chosen": -2.6370556354522705, + "logits/rejected": -2.61586332321167, + "logps/chosen": -254.8235321044922, + "logps/rejected": -270.93841552734375, + "loss": 0.6526, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08541082590818405, + "rewards/margins": 0.1679578721523285, + "rewards/rejected": -0.25336867570877075, + "step": 1200 + }, + { + "epoch": 0.16, + "learning_rate": 4.948520350167637e-06, + "logits/chosen": -2.561717987060547, + "logits/rejected": -2.457118511199951, + "logps/chosen": -358.45977783203125, + "logps/rejected": -340.25433349609375, + "loss": 0.6127, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.048429448157548904, + "rewards/margins": 0.22879111766815186, + "rewards/rejected": -0.27722054719924927, + "step": 1210 + }, + { + "epoch": 0.16, + "learning_rate": 4.946189090448639e-06, + "logits/chosen": -2.707820415496826, + "logits/rejected": -2.5927631855010986, + "logps/chosen": -349.81634521484375, + "logps/rejected": -329.79595947265625, + "loss": 0.5952, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.047972869127988815, + "rewards/margins": 0.2178984433412552, + "rewards/rejected": -0.2658712863922119, + "step": 1220 + }, + { + "epoch": 0.16, + "learning_rate": 4.943806781291515e-06, + "logits/chosen": -2.5137152671813965, + "logits/rejected": -2.490893840789795, + "logps/chosen": -308.74652099609375, + "logps/rejected": -360.9945373535156, + "loss": 0.6155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09396428614854813, + "rewards/margins": 0.15484566986560822, + "rewards/rejected": -0.24880993366241455, + "step": 1230 + }, + { + "epoch": 0.16, + "learning_rate": 4.941373472412595e-06, + "logits/chosen": -2.5708816051483154, + "logits/rejected": -2.5408287048339844, + "logps/chosen": -327.78631591796875, + "logps/rejected": -271.6731262207031, + "loss": 0.6241, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15845860540866852, + "rewards/margins": 0.23015668988227844, + "rewards/rejected": -0.38861528038978577, + "step": 1240 + }, + { + "epoch": 0.16, + "learning_rate": 4.938889214592521e-06, + "logits/chosen": -2.446613073348999, + "logits/rejected": -2.5086395740509033, + "logps/chosen": -229.57431030273438, + "logps/rejected": -296.53814697265625, + "loss": 0.6214, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20037560164928436, + "rewards/margins": 0.16084669530391693, + "rewards/rejected": -0.3612222373485565, + "step": 1250 + }, + { + "epoch": 0.16, + "learning_rate": 4.936354059675186e-06, + "logits/chosen": -2.6176249980926514, + "logits/rejected": -2.5214638710021973, + "logps/chosen": -281.8207092285156, + "logps/rejected": -298.4183654785156, + "loss": 0.5722, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04399660974740982, + "rewards/margins": 0.25542718172073364, + "rewards/rejected": -0.29942384362220764, + "step": 1260 + }, + { + "epoch": 0.17, + "learning_rate": 4.933768060566654e-06, + "logits/chosen": -2.58945369720459, + "logits/rejected": -2.4401657581329346, + "logps/chosen": -297.75054931640625, + "logps/rejected": -274.86334228515625, + "loss": 0.6132, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09477253258228302, + "rewards/margins": 0.21978351473808289, + "rewards/rejected": -0.3145560622215271, + "step": 1270 + }, + { + "epoch": 0.17, + "learning_rate": 4.931131271234052e-06, + "logits/chosen": -2.5565812587738037, + "logits/rejected": -2.5460667610168457, + "logps/chosen": -314.35723876953125, + "logps/rejected": -337.5452880859375, + "loss": 0.6582, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.12781710922718048, + "rewards/margins": 0.16964153945446014, + "rewards/rejected": -0.2974586486816406, + "step": 1280 + }, + { + "epoch": 0.17, + "learning_rate": 4.928443746704448e-06, + "logits/chosen": -2.6169776916503906, + "logits/rejected": -2.4687631130218506, + "logps/chosen": -281.2894287109375, + "logps/rejected": -275.56512451171875, + "loss": 0.6008, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.01575014926493168, + "rewards/margins": 0.291829913854599, + "rewards/rejected": -0.30758005380630493, + "step": 1290 + }, + { + "epoch": 0.17, + "learning_rate": 4.925705543063703e-06, + "logits/chosen": -2.548710823059082, + "logits/rejected": -2.551642656326294, + "logps/chosen": -290.65301513671875, + "logps/rejected": -353.34429931640625, + "loss": 0.6286, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02048138529062271, + "rewards/margins": 0.1404663622379303, + "rewards/rejected": -0.1609477400779724, + "step": 1300 + }, + { + "epoch": 0.17, + "learning_rate": 4.922916717455297e-06, + "logits/chosen": -2.4327712059020996, + "logits/rejected": -2.4818997383117676, + "logps/chosen": -256.9817810058594, + "logps/rejected": -271.67059326171875, + "loss": 0.6512, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.020475735887885094, + "rewards/margins": 0.08691316843032837, + "rewards/rejected": -0.06643743813037872, + "step": 1310 + }, + { + "epoch": 0.17, + "learning_rate": 4.920077328079136e-06, + "logits/chosen": -2.5746166706085205, + "logits/rejected": -2.464660882949829, + "logps/chosen": -304.72174072265625, + "logps/rejected": -293.9650573730469, + "loss": 0.5806, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.010029902681708336, + "rewards/margins": 0.3328434228897095, + "rewards/rejected": -0.3228135108947754, + "step": 1320 + }, + { + "epoch": 0.17, + "learning_rate": 4.9171874341903445e-06, + "logits/chosen": -2.4323649406433105, + "logits/rejected": -2.4603872299194336, + "logps/chosen": -282.19989013671875, + "logps/rejected": -272.5907287597656, + "loss": 0.5928, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.25110486149787903, + "rewards/margins": 0.2193199098110199, + "rewards/rejected": -0.47042474150657654, + "step": 1330 + }, + { + "epoch": 0.18, + "learning_rate": 4.914247096098019e-06, + "logits/chosen": -2.4408984184265137, + "logits/rejected": -2.4234704971313477, + "logps/chosen": -275.26556396484375, + "logps/rejected": -274.07000732421875, + "loss": 0.6134, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1627645045518875, + "rewards/margins": 0.23349003493785858, + "rewards/rejected": -0.3962545394897461, + "step": 1340 + }, + { + "epoch": 0.18, + "learning_rate": 4.911256375163977e-06, + "logits/chosen": -2.5912342071533203, + "logits/rejected": -2.5624070167541504, + "logps/chosen": -340.3487243652344, + "logps/rejected": -292.11212158203125, + "loss": 0.5945, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19813594222068787, + "rewards/margins": 0.21375887095928192, + "rewards/rejected": -0.4118947386741638, + "step": 1350 + }, + { + "epoch": 0.18, + "learning_rate": 4.908215333801474e-06, + "logits/chosen": -2.755558967590332, + "logits/rejected": -2.719031572341919, + "logps/chosen": -341.6617126464844, + "logps/rejected": -289.6936950683594, + "loss": 0.6524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.16030514240264893, + "rewards/margins": 0.1857156753540039, + "rewards/rejected": -0.34602075815200806, + "step": 1360 + }, + { + "epoch": 0.18, + "learning_rate": 4.9051240354739004e-06, + "logits/chosen": -2.539360523223877, + "logits/rejected": -2.513627052307129, + "logps/chosen": -324.1982421875, + "logps/rejected": -328.8523864746094, + "loss": 0.6199, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18575456738471985, + "rewards/margins": 0.18963254988193512, + "rewards/rejected": -0.37538713216781616, + "step": 1370 + }, + { + "epoch": 0.18, + "learning_rate": 4.901982544693457e-06, + "logits/chosen": -2.4364771842956543, + "logits/rejected": -2.3941617012023926, + "logps/chosen": -189.21115112304688, + "logps/rejected": -262.70672607421875, + "loss": 0.6067, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16636282205581665, + "rewards/margins": 0.18926694989204407, + "rewards/rejected": -0.3556297719478607, + "step": 1380 + }, + { + "epoch": 0.18, + "learning_rate": 4.898790927019809e-06, + "logits/chosen": -2.5843005180358887, + "logits/rejected": -2.581282377243042, + "logps/chosen": -284.1488037109375, + "logps/rejected": -293.54083251953125, + "loss": 0.6024, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06523891538381577, + "rewards/margins": 0.16263197362422943, + "rewards/rejected": -0.2278708964586258, + "step": 1390 + }, + { + "epoch": 0.18, + "learning_rate": 4.895549249058718e-06, + "logits/chosen": -2.5506742000579834, + "logits/rejected": -2.4969406127929688, + "logps/chosen": -329.4296875, + "logps/rejected": -288.1108093261719, + "loss": 0.5862, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.030632415786385536, + "rewards/margins": 0.2558412253856659, + "rewards/rejected": -0.28647366166114807, + "step": 1400 + }, + { + "epoch": 0.18, + "learning_rate": 4.892257578460656e-06, + "logits/chosen": -2.4754862785339355, + "logits/rejected": -2.4533698558807373, + "logps/chosen": -230.84561157226562, + "logps/rejected": -242.7822723388672, + "loss": 0.6531, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22271236777305603, + "rewards/margins": 0.1977839320898056, + "rewards/rejected": -0.4204963147640228, + "step": 1410 + }, + { + "epoch": 0.19, + "learning_rate": 4.888915983919383e-06, + "logits/chosen": -2.4821248054504395, + "logits/rejected": -2.4885854721069336, + "logps/chosen": -250.3030242919922, + "logps/rejected": -285.76123046875, + "loss": 0.5895, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24887505173683167, + "rewards/margins": 0.19912755489349365, + "rewards/rejected": -0.4480026364326477, + "step": 1420 + }, + { + "epoch": 0.19, + "learning_rate": 4.885524535170525e-06, + "logits/chosen": -2.39884614944458, + "logits/rejected": -2.2909188270568848, + "logps/chosen": -236.4246826171875, + "logps/rejected": -241.6230926513672, + "loss": 0.5828, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.17980875074863434, + "rewards/margins": 0.22901611030101776, + "rewards/rejected": -0.4088248312473297, + "step": 1430 + }, + { + "epoch": 0.19, + "learning_rate": 4.882083302990113e-06, + "logits/chosen": -2.523052453994751, + "logits/rejected": -2.4976887702941895, + "logps/chosen": -273.3792419433594, + "logps/rejected": -275.5514221191406, + "loss": 0.6476, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09809037297964096, + "rewards/margins": 0.07924286276102066, + "rewards/rejected": -0.17733325064182281, + "step": 1440 + }, + { + "epoch": 0.19, + "learning_rate": 4.878592359193104e-06, + "logits/chosen": -2.5098214149475098, + "logits/rejected": -2.4154574871063232, + "logps/chosen": -258.97833251953125, + "logps/rejected": -232.565673828125, + "loss": 0.5954, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.05890699476003647, + "rewards/margins": 0.2493654489517212, + "rewards/rejected": -0.30827242136001587, + "step": 1450 + }, + { + "epoch": 0.19, + "learning_rate": 4.875051776631888e-06, + "logits/chosen": -2.4685325622558594, + "logits/rejected": -2.6131837368011475, + "logps/chosen": -291.41033935546875, + "logps/rejected": -363.9025573730469, + "loss": 0.6213, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.18342065811157227, + "rewards/margins": 0.16329218447208405, + "rewards/rejected": -0.34671279788017273, + "step": 1460 + }, + { + "epoch": 0.19, + "learning_rate": 4.871461629194764e-06, + "logits/chosen": -2.602108955383301, + "logits/rejected": -2.4977211952209473, + "logps/chosen": -344.0504455566406, + "logps/rejected": -329.0046081542969, + "loss": 0.6106, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09750501066446304, + "rewards/margins": 0.32457083463668823, + "rewards/rejected": -0.4220758378505707, + "step": 1470 + }, + { + "epoch": 0.19, + "learning_rate": 4.8678219918043984e-06, + "logits/chosen": -2.5800747871398926, + "logits/rejected": -2.5869297981262207, + "logps/chosen": -303.756103515625, + "logps/rejected": -348.797119140625, + "loss": 0.6402, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.287535160779953, + "rewards/margins": 0.15229448676109314, + "rewards/rejected": -0.43982967734336853, + "step": 1480 + }, + { + "epoch": 0.19, + "learning_rate": 4.864132940416262e-06, + "logits/chosen": -2.5854430198669434, + "logits/rejected": -2.554171085357666, + "logps/chosen": -252.5200958251953, + "logps/rejected": -258.8785705566406, + "loss": 0.6339, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3456532955169678, + "rewards/margins": 0.13309410214424133, + "rewards/rejected": -0.4787473678588867, + "step": 1490 + }, + { + "epoch": 0.2, + "learning_rate": 4.860394552017044e-06, + "logits/chosen": -2.7142224311828613, + "logits/rejected": -2.6043894290924072, + "logps/chosen": -320.2101135253906, + "logps/rejected": -314.76214599609375, + "loss": 0.5984, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.052597224712371826, + "rewards/margins": 0.19962158799171448, + "rewards/rejected": -0.2522187829017639, + "step": 1500 + }, + { + "epoch": 0.2, + "learning_rate": 4.856606904623047e-06, + "logits/chosen": -2.7056479454040527, + "logits/rejected": -2.6384434700012207, + "logps/chosen": -297.3497619628906, + "logps/rejected": -289.73602294921875, + "loss": 0.5747, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0032710283994674683, + "rewards/margins": 0.26606133580207825, + "rewards/rejected": -0.2693323493003845, + "step": 1510 + }, + { + "epoch": 0.2, + "learning_rate": 4.852770077278557e-06, + "logits/chosen": -2.5575883388519287, + "logits/rejected": -2.443173408508301, + "logps/chosen": -303.90509033203125, + "logps/rejected": -284.10504150390625, + "loss": 0.6025, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08063594996929169, + "rewards/margins": 0.25129538774490356, + "rewards/rejected": -0.33193135261535645, + "step": 1520 + }, + { + "epoch": 0.2, + "learning_rate": 4.848884150054196e-06, + "logits/chosen": -2.4108645915985107, + "logits/rejected": -2.3747196197509766, + "logps/chosen": -315.03680419921875, + "logps/rejected": -360.4490661621094, + "loss": 0.6506, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20810608565807343, + "rewards/margins": 0.1870233714580536, + "rewards/rejected": -0.3951294720172882, + "step": 1530 + }, + { + "epoch": 0.2, + "learning_rate": 4.8449492040452495e-06, + "logits/chosen": -2.6083035469055176, + "logits/rejected": -2.471031904220581, + "logps/chosen": -330.26470947265625, + "logps/rejected": -314.167236328125, + "loss": 0.6286, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3471115231513977, + "rewards/margins": 0.17778223752975464, + "rewards/rejected": -0.5248937606811523, + "step": 1540 + }, + { + "epoch": 0.2, + "learning_rate": 4.840965321369973e-06, + "logits/chosen": -2.5670390129089355, + "logits/rejected": -2.6726107597351074, + "logps/chosen": -345.9797058105469, + "logps/rejected": -378.4753723144531, + "loss": 0.5965, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22988872230052948, + "rewards/margins": 0.276926726102829, + "rewards/rejected": -0.506815493106842, + "step": 1550 + }, + { + "epoch": 0.2, + "learning_rate": 4.8369325851678795e-06, + "logits/chosen": -2.5260396003723145, + "logits/rejected": -2.533756971359253, + "logps/chosen": -304.7558898925781, + "logps/rejected": -270.6090393066406, + "loss": 0.5744, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.03183753415942192, + "rewards/margins": 0.36377862095832825, + "rewards/rejected": -0.3956161439418793, + "step": 1560 + }, + { + "epoch": 0.21, + "learning_rate": 4.832851079598007e-06, + "logits/chosen": -2.5340375900268555, + "logits/rejected": -2.5676798820495605, + "logps/chosen": -301.473876953125, + "logps/rejected": -305.3074951171875, + "loss": 0.6623, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.27487364411354065, + "rewards/margins": 0.14021675288677216, + "rewards/rejected": -0.415090411901474, + "step": 1570 + }, + { + "epoch": 0.21, + "learning_rate": 4.828720889837158e-06, + "logits/chosen": -2.674614906311035, + "logits/rejected": -2.655691623687744, + "logps/chosen": -351.2834167480469, + "logps/rejected": -329.86968994140625, + "loss": 0.6952, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.26195287704467773, + "rewards/margins": 0.040315210819244385, + "rewards/rejected": -0.3022680878639221, + "step": 1580 + }, + { + "epoch": 0.21, + "learning_rate": 4.824542102078125e-06, + "logits/chosen": -2.516094446182251, + "logits/rejected": -2.523566484451294, + "logps/chosen": -315.708984375, + "logps/rejected": -295.6442565917969, + "loss": 0.552, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.04934043809771538, + "rewards/margins": 0.37400636076927185, + "rewards/rejected": -0.4233468174934387, + "step": 1590 + }, + { + "epoch": 0.21, + "learning_rate": 4.820314803527888e-06, + "logits/chosen": -2.516603708267212, + "logits/rejected": -2.4437460899353027, + "logps/chosen": -270.81573486328125, + "logps/rejected": -300.3429870605469, + "loss": 0.6041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2289186716079712, + "rewards/margins": 0.22313785552978516, + "rewards/rejected": -0.4520565867424011, + "step": 1600 + }, + { + "epoch": 0.21, + "learning_rate": 4.816039082405799e-06, + "logits/chosen": -2.434842824935913, + "logits/rejected": -2.5143871307373047, + "logps/chosen": -283.5533142089844, + "logps/rejected": -298.64959716796875, + "loss": 0.6409, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20882566273212433, + "rewards/margins": 0.1115816980600357, + "rewards/rejected": -0.3204073905944824, + "step": 1610 + }, + { + "epoch": 0.21, + "learning_rate": 4.81171502794174e-06, + "logits/chosen": -2.482771158218384, + "logits/rejected": -2.4701130390167236, + "logps/chosen": -289.22552490234375, + "logps/rejected": -291.7224426269531, + "loss": 0.6423, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2668714225292206, + "rewards/margins": 0.15823452174663544, + "rewards/rejected": -0.4251059591770172, + "step": 1620 + }, + { + "epoch": 0.21, + "learning_rate": 4.8073427303742584e-06, + "logits/chosen": -2.544376850128174, + "logits/rejected": -2.498720645904541, + "logps/chosen": -304.0487976074219, + "logps/rejected": -242.37948608398438, + "loss": 0.5992, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1958625316619873, + "rewards/margins": 0.23352602124214172, + "rewards/rejected": -0.4293885827064514, + "step": 1630 + }, + { + "epoch": 0.21, + "learning_rate": 4.802922280948685e-06, + "logits/chosen": -2.498978614807129, + "logits/rejected": -2.4339146614074707, + "logps/chosen": -280.9106140136719, + "logps/rejected": -322.80499267578125, + "loss": 0.6118, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0869927778840065, + "rewards/margins": 0.22293932735919952, + "rewards/rejected": -0.3099321126937866, + "step": 1640 + }, + { + "epoch": 0.22, + "learning_rate": 4.798453771915231e-06, + "logits/chosen": -2.5638351440429688, + "logits/rejected": -2.4881110191345215, + "logps/chosen": -286.447021484375, + "logps/rejected": -279.38836669921875, + "loss": 0.5954, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.019666481763124466, + "rewards/margins": 0.29864734411239624, + "rewards/rejected": -0.2789808213710785, + "step": 1650 + }, + { + "epoch": 0.22, + "learning_rate": 4.793937296527062e-06, + "logits/chosen": -2.6339659690856934, + "logits/rejected": -2.498025417327881, + "logps/chosen": -333.6770935058594, + "logps/rejected": -319.0213928222656, + "loss": 0.6179, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10026480257511139, + "rewards/margins": 0.2683647572994232, + "rewards/rejected": -0.368629515171051, + "step": 1660 + }, + { + "epoch": 0.22, + "learning_rate": 4.78937294903835e-06, + "logits/chosen": -2.647279977798462, + "logits/rejected": -2.5249857902526855, + "logps/chosen": -362.973876953125, + "logps/rejected": -305.00872802734375, + "loss": 0.6298, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.154966339468956, + "rewards/margins": 0.13557985424995422, + "rewards/rejected": -0.2905462086200714, + "step": 1670 + }, + { + "epoch": 0.22, + "learning_rate": 4.78476082470231e-06, + "logits/chosen": -2.631761312484741, + "logits/rejected": -2.6130969524383545, + "logps/chosen": -307.74114990234375, + "logps/rejected": -327.1836242675781, + "loss": 0.5767, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.0334634855389595, + "rewards/margins": 0.3227209448814392, + "rewards/rejected": -0.3561844527721405, + "step": 1680 + }, + { + "epoch": 0.22, + "learning_rate": 4.780101019769212e-06, + "logits/chosen": -2.544905185699463, + "logits/rejected": -2.5424928665161133, + "logps/chosen": -305.4649353027344, + "logps/rejected": -340.08831787109375, + "loss": 0.613, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13283556699752808, + "rewards/margins": 0.190877765417099, + "rewards/rejected": -0.3237133324146271, + "step": 1690 + }, + { + "epoch": 0.22, + "learning_rate": 4.775393631484368e-06, + "logits/chosen": -2.6364634037017822, + "logits/rejected": -2.596670150756836, + "logps/chosen": -393.5442199707031, + "logps/rejected": -418.76849365234375, + "loss": 0.5832, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.16526606678962708, + "rewards/margins": 0.33243757486343384, + "rewards/rejected": -0.49770355224609375, + "step": 1700 + }, + { + "epoch": 0.22, + "learning_rate": 4.770638758086105e-06, + "logits/chosen": -2.6779677867889404, + "logits/rejected": -2.6232552528381348, + "logps/chosen": -328.4881286621094, + "logps/rejected": -362.8997497558594, + "loss": 0.6343, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.15946415066719055, + "rewards/margins": 0.30744999647140503, + "rewards/rejected": -0.46691417694091797, + "step": 1710 + }, + { + "epoch": 0.23, + "learning_rate": 4.7658364988037184e-06, + "logits/chosen": -2.3870747089385986, + "logits/rejected": -2.4414222240448, + "logps/chosen": -262.0770263671875, + "logps/rejected": -299.34344482421875, + "loss": 0.5918, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16681644320487976, + "rewards/margins": 0.2157113254070282, + "rewards/rejected": -0.38252782821655273, + "step": 1720 + }, + { + "epoch": 0.23, + "learning_rate": 4.760986953855395e-06, + "logits/chosen": -2.4868204593658447, + "logits/rejected": -2.4730277061462402, + "logps/chosen": -346.83184814453125, + "logps/rejected": -306.1211242675781, + "loss": 0.6128, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1638791561126709, + "rewards/margins": 0.2095731943845749, + "rewards/rejected": -0.373452365398407, + "step": 1730 + }, + { + "epoch": 0.23, + "learning_rate": 4.756090224446127e-06, + "logits/chosen": -2.4956960678100586, + "logits/rejected": -2.4144070148468018, + "logps/chosen": -335.1727600097656, + "logps/rejected": -318.25750732421875, + "loss": 0.5973, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11654730141162872, + "rewards/margins": 0.3332977890968323, + "rewards/rejected": -0.4498451352119446, + "step": 1740 + }, + { + "epoch": 0.23, + "learning_rate": 4.7511464127655945e-06, + "logits/chosen": -2.3792405128479004, + "logits/rejected": -2.3864805698394775, + "logps/chosen": -269.36474609375, + "logps/rejected": -310.34942626953125, + "loss": 0.6409, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.37235167622566223, + "rewards/margins": 0.12245126068592072, + "rewards/rejected": -0.494802862405777, + "step": 1750 + }, + { + "epoch": 0.23, + "learning_rate": 4.74615562198604e-06, + "logits/chosen": -2.4615726470947266, + "logits/rejected": -2.4444642066955566, + "logps/chosen": -280.3018493652344, + "logps/rejected": -267.9478759765625, + "loss": 0.6459, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.302560418844223, + "rewards/margins": 0.19724300503730774, + "rewards/rejected": -0.49980348348617554, + "step": 1760 + }, + { + "epoch": 0.23, + "learning_rate": 4.741117956260107e-06, + "logits/chosen": -2.624507188796997, + "logits/rejected": -2.5798962116241455, + "logps/chosen": -367.918212890625, + "logps/rejected": -354.9175720214844, + "loss": 0.6237, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24024216830730438, + "rewards/margins": 0.21005964279174805, + "rewards/rejected": -0.45030179619789124, + "step": 1770 + }, + { + "epoch": 0.23, + "learning_rate": 4.736033520718672e-06, + "logits/chosen": -2.5508697032928467, + "logits/rejected": -2.447824001312256, + "logps/chosen": -263.49700927734375, + "logps/rejected": -244.37313842773438, + "loss": 0.6098, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3110814690589905, + "rewards/margins": 0.24499061703681946, + "rewards/rejected": -0.5560721158981323, + "step": 1780 + }, + { + "epoch": 0.23, + "learning_rate": 4.730902421468652e-06, + "logits/chosen": -2.577620506286621, + "logits/rejected": -2.4969711303710938, + "logps/chosen": -296.9185791015625, + "logps/rejected": -268.3824462890625, + "loss": 0.6011, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09890172630548477, + "rewards/margins": 0.26050621271133423, + "rewards/rejected": -0.35940787196159363, + "step": 1790 + }, + { + "epoch": 0.24, + "learning_rate": 4.7257247655907854e-06, + "logits/chosen": -2.535594940185547, + "logits/rejected": -2.380336284637451, + "logps/chosen": -304.6521911621094, + "logps/rejected": -283.36541748046875, + "loss": 0.5972, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.06494356691837311, + "rewards/margins": 0.28849363327026367, + "rewards/rejected": -0.3534371554851532, + "step": 1800 + }, + { + "epoch": 0.24, + "learning_rate": 4.720500661137397e-06, + "logits/chosen": -2.4605793952941895, + "logits/rejected": -2.4445199966430664, + "logps/chosen": -212.75570678710938, + "logps/rejected": -278.8281555175781, + "loss": 0.5815, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1056346446275711, + "rewards/margins": 0.31375908851623535, + "rewards/rejected": -0.41939371824264526, + "step": 1810 + }, + { + "epoch": 0.24, + "learning_rate": 4.71523021713015e-06, + "logits/chosen": -2.6024506092071533, + "logits/rejected": -2.516287326812744, + "logps/chosen": -322.142822265625, + "logps/rejected": -284.1523742675781, + "loss": 0.5865, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.031340669840574265, + "rewards/margins": 0.19477489590644836, + "rewards/rejected": -0.22611558437347412, + "step": 1820 + }, + { + "epoch": 0.24, + "learning_rate": 4.709913543557761e-06, + "logits/chosen": -2.551557779312134, + "logits/rejected": -2.579463481903076, + "logps/chosen": -347.54254150390625, + "logps/rejected": -326.8415832519531, + "loss": 0.6201, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.14780651032924652, + "rewards/margins": 0.2015647143125534, + "rewards/rejected": -0.3493712246417999, + "step": 1830 + }, + { + "epoch": 0.24, + "learning_rate": 4.704550751373715e-06, + "logits/chosen": -2.5001020431518555, + "logits/rejected": -2.5061745643615723, + "logps/chosen": -304.66876220703125, + "logps/rejected": -329.971435546875, + "loss": 0.6323, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21862399578094482, + "rewards/margins": 0.04476120322942734, + "rewards/rejected": -0.26338517665863037, + "step": 1840 + }, + { + "epoch": 0.24, + "learning_rate": 4.699141952493941e-06, + "logits/chosen": -2.5286591053009033, + "logits/rejected": -2.480250835418701, + "logps/chosen": -298.2022399902344, + "logps/rejected": -271.6536560058594, + "loss": 0.5698, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03638296201825142, + "rewards/margins": 0.4161553382873535, + "rewards/rejected": -0.45253825187683105, + "step": 1850 + }, + { + "epoch": 0.24, + "learning_rate": 4.6936872597944814e-06, + "logits/chosen": -2.422044277191162, + "logits/rejected": -2.362180709838867, + "logps/chosen": -279.4925231933594, + "logps/rejected": -305.65264892578125, + "loss": 0.6056, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05379510670900345, + "rewards/margins": 0.29282423853874207, + "rewards/rejected": -0.3466193675994873, + "step": 1860 + }, + { + "epoch": 0.24, + "learning_rate": 4.688186787109136e-06, + "logits/chosen": -2.5348618030548096, + "logits/rejected": -2.5085978507995605, + "logps/chosen": -250.30996704101562, + "logps/rejected": -251.60397338867188, + "loss": 0.6169, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.03564034774899483, + "rewards/margins": 0.25885969400405884, + "rewards/rejected": -0.2232193499803543, + "step": 1870 + }, + { + "epoch": 0.25, + "learning_rate": 4.682640649227085e-06, + "logits/chosen": -2.373028039932251, + "logits/rejected": -2.3929853439331055, + "logps/chosen": -289.9034423828125, + "logps/rejected": -279.6601257324219, + "loss": 0.6165, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.018260175362229347, + "rewards/margins": 0.19186533987522125, + "rewards/rejected": -0.17360517382621765, + "step": 1880 + }, + { + "epoch": 0.25, + "learning_rate": 4.677048961890492e-06, + "logits/chosen": -2.3592562675476074, + "logits/rejected": -2.388822078704834, + "logps/chosen": -292.2175598144531, + "logps/rejected": -288.35736083984375, + "loss": 0.5945, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.002486780984327197, + "rewards/margins": 0.29614076018333435, + "rewards/rejected": -0.2986275851726532, + "step": 1890 + }, + { + "epoch": 0.25, + "learning_rate": 4.671411841792096e-06, + "logits/chosen": -2.577094316482544, + "logits/rejected": -2.5577306747436523, + "logps/chosen": -366.50579833984375, + "logps/rejected": -297.91558837890625, + "loss": 0.6126, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10249079763889313, + "rewards/margins": 0.2659778296947479, + "rewards/rejected": -0.36846858263015747, + "step": 1900 + }, + { + "epoch": 0.25, + "learning_rate": 4.665729406572764e-06, + "logits/chosen": -2.455371618270874, + "logits/rejected": -2.4111618995666504, + "logps/chosen": -220.4912109375, + "logps/rejected": -242.9920196533203, + "loss": 0.6166, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0025220424868166447, + "rewards/margins": 0.19942674040794373, + "rewards/rejected": -0.196904718875885, + "step": 1910 + }, + { + "epoch": 0.25, + "learning_rate": 4.660001774819048e-06, + "logits/chosen": -2.40692400932312, + "logits/rejected": -2.412745237350464, + "logps/chosen": -191.8985595703125, + "logps/rejected": -250.158935546875, + "loss": 0.6297, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04563095420598984, + "rewards/margins": 0.0915987491607666, + "rewards/rejected": -0.13722969591617584, + "step": 1920 + }, + { + "epoch": 0.25, + "learning_rate": 4.654229066060702e-06, + "logits/chosen": -2.5632548332214355, + "logits/rejected": -2.538203239440918, + "logps/chosen": -286.740478515625, + "logps/rejected": -419.6044006347656, + "loss": 0.6227, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.09640201926231384, + "rewards/margins": 0.2534371316432953, + "rewards/rejected": -0.34983915090560913, + "step": 1930 + }, + { + "epoch": 0.25, + "learning_rate": 4.648411400768193e-06, + "logits/chosen": -2.434919595718384, + "logits/rejected": -2.5149827003479004, + "logps/chosen": -275.26763916015625, + "logps/rejected": -312.817626953125, + "loss": 0.6375, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.029093902558088303, + "rewards/margins": 0.17853209376335144, + "rewards/rejected": -0.20762601494789124, + "step": 1940 + }, + { + "epoch": 0.26, + "learning_rate": 4.642548900350182e-06, + "logits/chosen": -2.4125964641571045, + "logits/rejected": -2.4099197387695312, + "logps/chosen": -309.1310729980469, + "logps/rejected": -319.1329345703125, + "loss": 0.5814, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.028103992342948914, + "rewards/margins": 0.3758625090122223, + "rewards/rejected": -0.40396642684936523, + "step": 1950 + }, + { + "epoch": 0.26, + "learning_rate": 4.636641687150994e-06, + "logits/chosen": -2.5086982250213623, + "logits/rejected": -2.484741449356079, + "logps/chosen": -273.00469970703125, + "logps/rejected": -269.30426025390625, + "loss": 0.6309, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0481497123837471, + "rewards/margins": 0.16629788279533386, + "rewards/rejected": -0.21444758772850037, + "step": 1960 + }, + { + "epoch": 0.26, + "learning_rate": 4.6306898844480615e-06, + "logits/chosen": -2.4550163745880127, + "logits/rejected": -2.4166951179504395, + "logps/chosen": -247.8860321044922, + "logps/rejected": -301.16436767578125, + "loss": 0.5935, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.009688841179013252, + "rewards/margins": 0.24411949515342712, + "rewards/rejected": -0.25380831956863403, + "step": 1970 + }, + { + "epoch": 0.26, + "learning_rate": 4.624693616449358e-06, + "logits/chosen": -2.439455986022949, + "logits/rejected": -2.3766419887542725, + "logps/chosen": -309.07818603515625, + "logps/rejected": -271.0009460449219, + "loss": 0.6181, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.036980949342250824, + "rewards/margins": 0.20554514229297638, + "rewards/rejected": -0.16856420040130615, + "step": 1980 + }, + { + "epoch": 0.26, + "learning_rate": 4.6186530082908e-06, + "logits/chosen": -2.413626194000244, + "logits/rejected": -2.458937168121338, + "logps/chosen": -279.32843017578125, + "logps/rejected": -308.2882995605469, + "loss": 0.6058, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.029264086857438087, + "rewards/margins": 0.13824932277202606, + "rewards/rejected": -0.10898523032665253, + "step": 1990 + }, + { + "epoch": 0.26, + "learning_rate": 4.612568186033633e-06, + "logits/chosen": -2.235609769821167, + "logits/rejected": -2.3205313682556152, + "logps/chosen": -254.9501953125, + "logps/rejected": -210.4433135986328, + "loss": 0.5872, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.039027273654937744, + "rewards/margins": 0.29768261313438416, + "rewards/rejected": -0.2586553692817688, + "step": 2000 + }, + { + "epoch": 0.26, + "learning_rate": 4.6064392766618125e-06, + "logits/chosen": -2.3810200691223145, + "logits/rejected": -2.409494400024414, + "logps/chosen": -279.5122375488281, + "logps/rejected": -291.50555419921875, + "loss": 0.615, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.15733623504638672, + "rewards/margins": 0.3040218651294708, + "rewards/rejected": -0.46135807037353516, + "step": 2010 + }, + { + "epoch": 0.26, + "learning_rate": 4.60026640807934e-06, + "logits/chosen": -2.4634006023406982, + "logits/rejected": -2.517077922821045, + "logps/chosen": -322.32708740234375, + "logps/rejected": -374.2076721191406, + "loss": 0.6036, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.018660439178347588, + "rewards/margins": 0.2876133918762207, + "rewards/rejected": -0.2689529359340668, + "step": 2020 + }, + { + "epoch": 0.27, + "learning_rate": 4.594049709107604e-06, + "logits/chosen": -2.4302146434783936, + "logits/rejected": -2.469205141067505, + "logps/chosen": -288.62115478515625, + "logps/rejected": -306.14959716796875, + "loss": 0.6046, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12552428245544434, + "rewards/margins": 0.21927396953105927, + "rewards/rejected": -0.3447982668876648, + "step": 2030 + }, + { + "epoch": 0.27, + "learning_rate": 4.587789309482687e-06, + "logits/chosen": -2.4714980125427246, + "logits/rejected": -2.4519617557525635, + "logps/chosen": -245.44546508789062, + "logps/rejected": -303.46527099609375, + "loss": 0.6004, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1674383580684662, + "rewards/margins": 0.3390670418739319, + "rewards/rejected": -0.5065053701400757, + "step": 2040 + }, + { + "epoch": 0.27, + "learning_rate": 4.581485339852659e-06, + "logits/chosen": -2.4701762199401855, + "logits/rejected": -2.353682041168213, + "logps/chosen": -278.525146484375, + "logps/rejected": -242.499267578125, + "loss": 0.6316, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.249502494931221, + "rewards/margins": 0.147563636302948, + "rewards/rejected": -0.3970661461353302, + "step": 2050 + }, + { + "epoch": 0.27, + "learning_rate": 4.5751379317748514e-06, + "logits/chosen": -2.452324628829956, + "logits/rejected": -2.42596697807312, + "logps/chosen": -365.1957092285156, + "logps/rejected": -309.4463195800781, + "loss": 0.5917, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.16269655525684357, + "rewards/margins": 0.20064584910869598, + "rewards/rejected": -0.36334243416786194, + "step": 2060 + }, + { + "epoch": 0.27, + "learning_rate": 4.56874721771311e-06, + "logits/chosen": -2.502645969390869, + "logits/rejected": -2.4329185485839844, + "logps/chosen": -368.23529052734375, + "logps/rejected": -348.7842712402344, + "loss": 0.5815, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08776414394378662, + "rewards/margins": 0.3524087071418762, + "rewards/rejected": -0.44017285108566284, + "step": 2070 + }, + { + "epoch": 0.27, + "learning_rate": 4.562313331035032e-06, + "logits/chosen": -2.3490259647369385, + "logits/rejected": -2.3561103343963623, + "logps/chosen": -255.2849578857422, + "logps/rejected": -269.08917236328125, + "loss": 0.6629, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2621305286884308, + "rewards/margins": 0.14026693999767303, + "rewards/rejected": -0.4023974537849426, + "step": 2080 + }, + { + "epoch": 0.27, + "learning_rate": 4.555836406009183e-06, + "logits/chosen": -2.411449432373047, + "logits/rejected": -2.432992458343506, + "logps/chosen": -312.03765869140625, + "logps/rejected": -280.2890319824219, + "loss": 0.5594, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09314192831516266, + "rewards/margins": 0.2997645437717438, + "rewards/rejected": -0.3929064869880676, + "step": 2090 + }, + { + "epoch": 0.27, + "learning_rate": 4.5493165778022945e-06, + "logits/chosen": -2.4259262084960938, + "logits/rejected": -2.3749966621398926, + "logps/chosen": -263.62762451171875, + "logps/rejected": -296.1581115722656, + "loss": 0.6284, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.015141060575842857, + "rewards/margins": 0.25695693492889404, + "rewards/rejected": -0.27209797501564026, + "step": 2100 + }, + { + "epoch": 0.28, + "learning_rate": 4.542753982476443e-06, + "logits/chosen": -2.3411784172058105, + "logits/rejected": -2.3107755184173584, + "logps/chosen": -191.20492553710938, + "logps/rejected": -254.7215118408203, + "loss": 0.5545, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07754083722829819, + "rewards/margins": 0.36685508489608765, + "rewards/rejected": -0.28931424021720886, + "step": 2110 + }, + { + "epoch": 0.28, + "learning_rate": 4.53614875698621e-06, + "logits/chosen": -2.485403537750244, + "logits/rejected": -2.4413208961486816, + "logps/chosen": -254.0594940185547, + "logps/rejected": -298.60418701171875, + "loss": 0.5989, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.019393663853406906, + "rewards/margins": 0.262022465467453, + "rewards/rejected": -0.242628812789917, + "step": 2120 + }, + { + "epoch": 0.28, + "learning_rate": 4.529501039175824e-06, + "logits/chosen": -2.6065239906311035, + "logits/rejected": -2.481513261795044, + "logps/chosen": -338.58074951171875, + "logps/rejected": -289.3169860839844, + "loss": 0.5989, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.02020101621747017, + "rewards/margins": 0.29916948080062866, + "rewards/rejected": -0.31937047839164734, + "step": 2130 + }, + { + "epoch": 0.28, + "learning_rate": 4.522810967776287e-06, + "logits/chosen": -2.3868212699890137, + "logits/rejected": -2.4291112422943115, + "logps/chosen": -278.4410400390625, + "logps/rejected": -277.8754577636719, + "loss": 0.6006, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.13445501029491425, + "rewards/margins": 0.2142070233821869, + "rewards/rejected": -0.34866204857826233, + "step": 2140 + }, + { + "epoch": 0.28, + "learning_rate": 4.516078682402473e-06, + "logits/chosen": -2.4170455932617188, + "logits/rejected": -2.5293774604797363, + "logps/chosen": -355.03814697265625, + "logps/rejected": -353.56427001953125, + "loss": 0.6163, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.16386206448078156, + "rewards/margins": 0.2695190906524658, + "rewards/rejected": -0.43338117003440857, + "step": 2150 + }, + { + "epoch": 0.28, + "learning_rate": 4.509304323550221e-06, + "logits/chosen": -2.5881645679473877, + "logits/rejected": -2.538785696029663, + "logps/chosen": -283.84417724609375, + "logps/rejected": -299.88818359375, + "loss": 0.5677, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1053847074508667, + "rewards/margins": 0.24635562300682068, + "rewards/rejected": -0.3517403304576874, + "step": 2160 + }, + { + "epoch": 0.28, + "learning_rate": 4.502488032593398e-06, + "logits/chosen": -2.4397566318511963, + "logits/rejected": -2.4842915534973145, + "logps/chosen": -282.6026611328125, + "logps/rejected": -286.7880859375, + "loss": 0.6088, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.17665888369083405, + "rewards/margins": 0.30781909823417664, + "rewards/rejected": -0.4844779968261719, + "step": 2170 + }, + { + "epoch": 0.29, + "learning_rate": 4.495629951780951e-06, + "logits/chosen": -2.429715633392334, + "logits/rejected": -2.4719252586364746, + "logps/chosen": -274.6560363769531, + "logps/rejected": -319.56451416015625, + "loss": 0.6209, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3811848759651184, + "rewards/margins": 0.21358831226825714, + "rewards/rejected": -0.5947731733322144, + "step": 2180 + }, + { + "epoch": 0.29, + "learning_rate": 4.488730224233941e-06, + "logits/chosen": -2.339588165283203, + "logits/rejected": -2.283986806869507, + "logps/chosen": -270.29217529296875, + "logps/rejected": -296.27691650390625, + "loss": 0.6223, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4109855592250824, + "rewards/margins": 0.18327102065086365, + "rewards/rejected": -0.5942565202713013, + "step": 2190 + }, + { + "epoch": 0.29, + "learning_rate": 4.481788993942547e-06, + "logits/chosen": -2.4507622718811035, + "logits/rejected": -2.387429714202881, + "logps/chosen": -280.0044250488281, + "logps/rejected": -307.25750732421875, + "loss": 0.5846, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17895767092704773, + "rewards/margins": 0.3076832890510559, + "rewards/rejected": -0.4866410791873932, + "step": 2200 + }, + { + "epoch": 0.29, + "learning_rate": 4.474806405763076e-06, + "logits/chosen": -2.4795234203338623, + "logits/rejected": -2.5023980140686035, + "logps/chosen": -345.68109130859375, + "logps/rejected": -403.64459228515625, + "loss": 0.6175, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.23371641337871552, + "rewards/margins": 0.31080514192581177, + "rewards/rejected": -0.5445215106010437, + "step": 2210 + }, + { + "epoch": 0.29, + "learning_rate": 4.4677826054149235e-06, + "logits/chosen": -2.584589719772339, + "logits/rejected": -2.5288052558898926, + "logps/chosen": -308.86737060546875, + "logps/rejected": -322.84124755859375, + "loss": 0.6263, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.11034766584634781, + "rewards/margins": 0.21714715659618378, + "rewards/rejected": -0.32749485969543457, + "step": 2220 + }, + { + "epoch": 0.29, + "learning_rate": 4.460717739477543e-06, + "logits/chosen": -2.5494511127471924, + "logits/rejected": -2.4685215950012207, + "logps/chosen": -291.8902893066406, + "logps/rejected": -279.8962707519531, + "loss": 0.6018, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.061232179403305054, + "rewards/margins": 0.2693517208099365, + "rewards/rejected": -0.3305839002132416, + "step": 2230 + }, + { + "epoch": 0.29, + "learning_rate": 4.4536119553873866e-06, + "logits/chosen": -2.4901301860809326, + "logits/rejected": -2.41926646232605, + "logps/chosen": -262.8622131347656, + "logps/rejected": -323.3758850097656, + "loss": 0.5967, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06539380550384521, + "rewards/margins": 0.27599868178367615, + "rewards/rejected": -0.34139248728752136, + "step": 2240 + }, + { + "epoch": 0.29, + "learning_rate": 4.446465401434824e-06, + "logits/chosen": -2.5936059951782227, + "logits/rejected": -2.663071632385254, + "logps/chosen": -304.9284973144531, + "logps/rejected": -320.9336853027344, + "loss": 0.6454, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07203792035579681, + "rewards/margins": 0.2686172127723694, + "rewards/rejected": -0.19657929241657257, + "step": 2250 + }, + { + "epoch": 0.3, + "learning_rate": 4.43927822676105e-06, + "logits/chosen": -2.4864799976348877, + "logits/rejected": -2.5649726390838623, + "logps/chosen": -296.66204833984375, + "logps/rejected": -289.03082275390625, + "loss": 0.5929, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.026843184605240822, + "rewards/margins": 0.32251793146133423, + "rewards/rejected": -0.3493611216545105, + "step": 2260 + }, + { + "epoch": 0.3, + "learning_rate": 4.432050581354972e-06, + "logits/chosen": -2.4548957347869873, + "logits/rejected": -2.4216175079345703, + "logps/chosen": -234.52139282226562, + "logps/rejected": -239.2303466796875, + "loss": 0.5865, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03519226610660553, + "rewards/margins": 0.3026849329471588, + "rewards/rejected": -0.33787721395492554, + "step": 2270 + }, + { + "epoch": 0.3, + "learning_rate": 4.424782616050078e-06, + "logits/chosen": -2.3073291778564453, + "logits/rejected": -2.3597683906555176, + "logps/chosen": -254.8343963623047, + "logps/rejected": -264.667236328125, + "loss": 0.6044, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.038565151393413544, + "rewards/margins": 0.2819172441959381, + "rewards/rejected": -0.24335213005542755, + "step": 2280 + }, + { + "epoch": 0.3, + "learning_rate": 4.4174744825212954e-06, + "logits/chosen": -2.5058090686798096, + "logits/rejected": -2.515031099319458, + "logps/chosen": -304.8393249511719, + "logps/rejected": -326.65972900390625, + "loss": 0.5692, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.11621864140033722, + "rewards/margins": 0.346867173910141, + "rewards/rejected": -0.23064854741096497, + "step": 2290 + }, + { + "epoch": 0.3, + "learning_rate": 4.410126333281815e-06, + "logits/chosen": -2.3070998191833496, + "logits/rejected": -2.4147284030914307, + "logps/chosen": -307.43597412109375, + "logps/rejected": -239.33474731445312, + "loss": 0.6102, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.037448983639478683, + "rewards/margins": 0.19774429500102997, + "rewards/rejected": -0.23519328236579895, + "step": 2300 + }, + { + "epoch": 0.3, + "learning_rate": 4.402738321679918e-06, + "logits/chosen": -2.3407058715820312, + "logits/rejected": -2.380464792251587, + "logps/chosen": -226.2201690673828, + "logps/rejected": -261.9936828613281, + "loss": 0.6317, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024725358933210373, + "rewards/margins": 0.2291100025177002, + "rewards/rejected": -0.25383538007736206, + "step": 2310 + }, + { + "epoch": 0.3, + "learning_rate": 4.395310601895772e-06, + "logits/chosen": -2.5363030433654785, + "logits/rejected": -2.567491292953491, + "logps/chosen": -347.8073425292969, + "logps/rejected": -317.74249267578125, + "loss": 0.59, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0013760656584054232, + "rewards/margins": 0.2783746123313904, + "rewards/rejected": -0.27699849009513855, + "step": 2320 + }, + { + "epoch": 0.3, + "learning_rate": 4.38784332893821e-06, + "logits/chosen": -2.430960178375244, + "logits/rejected": -2.45249605178833, + "logps/chosen": -306.5945129394531, + "logps/rejected": -279.98651123046875, + "loss": 0.6134, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.028177553787827492, + "rewards/margins": 0.19871966540813446, + "rewards/rejected": -0.17054212093353271, + "step": 2330 + }, + { + "epoch": 0.31, + "learning_rate": 4.380336658641503e-06, + "logits/chosen": -2.559107542037964, + "logits/rejected": -2.523157835006714, + "logps/chosen": -290.43804931640625, + "logps/rejected": -351.04046630859375, + "loss": 0.5473, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13771465420722961, + "rewards/margins": 0.31184014678001404, + "rewards/rejected": -0.44955483078956604, + "step": 2340 + }, + { + "epoch": 0.31, + "learning_rate": 4.372790747662101e-06, + "logits/chosen": -2.5139009952545166, + "logits/rejected": -2.417280673980713, + "logps/chosen": -286.5636291503906, + "logps/rejected": -324.59039306640625, + "loss": 0.5932, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11431699991226196, + "rewards/margins": 0.22261719405651093, + "rewards/rejected": -0.33693423867225647, + "step": 2350 + }, + { + "epoch": 0.31, + "learning_rate": 4.365205753475367e-06, + "logits/chosen": -2.4109270572662354, + "logits/rejected": -2.4186367988586426, + "logps/chosen": -315.2587890625, + "logps/rejected": -291.68255615234375, + "loss": 0.5672, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.07513483613729477, + "rewards/margins": 0.33876127004623413, + "rewards/rejected": -0.4138960838317871, + "step": 2360 + }, + { + "epoch": 0.31, + "learning_rate": 4.35758183437229e-06, + "logits/chosen": -2.6014819145202637, + "logits/rejected": -2.5077662467956543, + "logps/chosen": -351.85394287109375, + "logps/rejected": -269.0332336425781, + "loss": 0.5406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.004348271992057562, + "rewards/margins": 0.3022286891937256, + "rewards/rejected": -0.3065769374370575, + "step": 2370 + }, + { + "epoch": 0.31, + "learning_rate": 4.3499191494561835e-06, + "logits/chosen": -2.3322207927703857, + "logits/rejected": -2.3083035945892334, + "logps/chosen": -318.77581787109375, + "logps/rejected": -336.80670166015625, + "loss": 0.5912, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08362318575382233, + "rewards/margins": 0.3532470464706421, + "rewards/rejected": -0.4368702471256256, + "step": 2380 + }, + { + "epoch": 0.31, + "learning_rate": 4.3422178586393615e-06, + "logits/chosen": -2.443808078765869, + "logits/rejected": -2.4920902252197266, + "logps/chosen": -349.8179016113281, + "logps/rejected": -306.9833984375, + "loss": 0.5583, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08438996225595474, + "rewards/margins": 0.3546445965766907, + "rewards/rejected": -0.4390345513820648, + "step": 2390 + }, + { + "epoch": 0.31, + "learning_rate": 4.334478122639804e-06, + "logits/chosen": -2.3911237716674805, + "logits/rejected": -2.422091484069824, + "logps/chosen": -351.39886474609375, + "logps/rejected": -262.12237548828125, + "loss": 0.6123, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.13675999641418457, + "rewards/margins": 0.1890634149312973, + "rewards/rejected": -0.32582345604896545, + "step": 2400 + }, + { + "epoch": 0.32, + "learning_rate": 4.3267001029778015e-06, + "logits/chosen": -2.288379192352295, + "logits/rejected": -2.2975244522094727, + "logps/chosen": -308.05157470703125, + "logps/rejected": -284.1717224121094, + "loss": 0.5777, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03496421501040459, + "rewards/margins": 0.3698606491088867, + "rewards/rejected": -0.4048248827457428, + "step": 2410 + }, + { + "epoch": 0.32, + "learning_rate": 4.318883961972585e-06, + "logits/chosen": -2.536451578140259, + "logits/rejected": -2.510542631149292, + "logps/chosen": -263.849365234375, + "logps/rejected": -263.8736877441406, + "loss": 0.5814, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13201352953910828, + "rewards/margins": 0.20481924712657928, + "rewards/rejected": -0.33683282136917114, + "step": 2420 + }, + { + "epoch": 0.32, + "learning_rate": 4.311029862738942e-06, + "logits/chosen": -2.211146593093872, + "logits/rejected": -2.203566789627075, + "logps/chosen": -234.99008178710938, + "logps/rejected": -300.64654541015625, + "loss": 0.6125, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.10305074602365494, + "rewards/margins": 0.28637638688087463, + "rewards/rejected": -0.389427125453949, + "step": 2430 + }, + { + "epoch": 0.32, + "learning_rate": 4.303137969183804e-06, + "logits/chosen": -2.401642322540283, + "logits/rejected": -2.3310704231262207, + "logps/chosen": -300.87078857421875, + "logps/rejected": -345.95819091796875, + "loss": 0.5971, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06070847436785698, + "rewards/margins": 0.3207516670227051, + "rewards/rejected": -0.3814601004123688, + "step": 2440 + }, + { + "epoch": 0.32, + "learning_rate": 4.295208446002832e-06, + "logits/chosen": -2.3515610694885254, + "logits/rejected": -2.18513822555542, + "logps/chosen": -244.791259765625, + "logps/rejected": -240.6001434326172, + "loss": 0.6044, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2596897482872009, + "rewards/margins": 0.19080080091953278, + "rewards/rejected": -0.4504905641078949, + "step": 2450 + }, + { + "epoch": 0.32, + "learning_rate": 4.287241458676981e-06, + "logits/chosen": -2.294541120529175, + "logits/rejected": -2.297311782836914, + "logps/chosen": -300.506103515625, + "logps/rejected": -303.3978576660156, + "loss": 0.5231, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.0287349671125412, + "rewards/margins": 0.46231597661972046, + "rewards/rejected": -0.4910510182380676, + "step": 2460 + }, + { + "epoch": 0.32, + "learning_rate": 4.279237173469043e-06, + "logits/chosen": -2.259739875793457, + "logits/rejected": -2.238673686981201, + "logps/chosen": -324.48944091796875, + "logps/rejected": -331.08953857421875, + "loss": 0.6065, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11445991694927216, + "rewards/margins": 0.3600297272205353, + "rewards/rejected": -0.474489688873291, + "step": 2470 + }, + { + "epoch": 0.32, + "learning_rate": 4.271195757420177e-06, + "logits/chosen": -2.3108835220336914, + "logits/rejected": -2.259281873703003, + "logps/chosen": -289.0098571777344, + "logps/rejected": -295.3184814453125, + "loss": 0.6186, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.205244779586792, + "rewards/margins": 0.32750067114830017, + "rewards/rejected": -0.5327454805374146, + "step": 2480 + }, + { + "epoch": 0.33, + "learning_rate": 4.263117378346425e-06, + "logits/chosen": -2.441551685333252, + "logits/rejected": -2.3553037643432617, + "logps/chosen": -281.33172607421875, + "logps/rejected": -288.5588073730469, + "loss": 0.5818, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.11674769967794418, + "rewards/margins": 0.30570247769355774, + "rewards/rejected": -0.42245015501976013, + "step": 2490 + }, + { + "epoch": 0.33, + "learning_rate": 4.255002204835208e-06, + "logits/chosen": -2.217869281768799, + "logits/rejected": -2.1716344356536865, + "logps/chosen": -297.220947265625, + "logps/rejected": -282.67205810546875, + "loss": 0.5996, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.22235146164894104, + "rewards/margins": 0.3301992416381836, + "rewards/rejected": -0.5525506734848022, + "step": 2500 + }, + { + "epoch": 0.33, + "learning_rate": 4.246850406241812e-06, + "logits/chosen": -2.407466411590576, + "logits/rejected": -2.353933811187744, + "logps/chosen": -374.38525390625, + "logps/rejected": -407.6166076660156, + "loss": 0.5731, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.13405577838420868, + "rewards/margins": 0.39027923345565796, + "rewards/rejected": -0.5243349671363831, + "step": 2510 + }, + { + "epoch": 0.33, + "learning_rate": 4.2386621526858465e-06, + "logits/chosen": -2.3261351585388184, + "logits/rejected": -2.389404296875, + "logps/chosen": -294.97125244140625, + "logps/rejected": -308.83514404296875, + "loss": 0.5968, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1480991542339325, + "rewards/margins": 0.3081936240196228, + "rewards/rejected": -0.4562928080558777, + "step": 2520 + }, + { + "epoch": 0.33, + "learning_rate": 4.2304376150477015e-06, + "logits/chosen": -2.4777417182922363, + "logits/rejected": -2.249668598175049, + "logps/chosen": -302.7928466796875, + "logps/rejected": -316.9418640136719, + "loss": 0.6443, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.156307190656662, + "rewards/margins": 0.23777207732200623, + "rewards/rejected": -0.3940792679786682, + "step": 2530 + }, + { + "epoch": 0.33, + "learning_rate": 4.222176964964977e-06, + "logits/chosen": -2.408116579055786, + "logits/rejected": -2.2220921516418457, + "logps/chosen": -301.1642150878906, + "logps/rejected": -321.4419250488281, + "loss": 0.5478, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09838789701461792, + "rewards/margins": 0.46421924233436584, + "rewards/rejected": -0.5626071691513062, + "step": 2540 + }, + { + "epoch": 0.33, + "learning_rate": 4.213880374828903e-06, + "logits/chosen": -2.278613805770874, + "logits/rejected": -2.3370423316955566, + "logps/chosen": -308.1859130859375, + "logps/rejected": -302.2791442871094, + "loss": 0.5677, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1452012062072754, + "rewards/margins": 0.3158339560031891, + "rewards/rejected": -0.46103519201278687, + "step": 2550 + }, + { + "epoch": 0.33, + "learning_rate": 4.2055480177807406e-06, + "logits/chosen": -2.3317220211029053, + "logits/rejected": -2.309561252593994, + "logps/chosen": -309.7371520996094, + "logps/rejected": -312.92828369140625, + "loss": 0.5645, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09456729888916016, + "rewards/margins": 0.4201463758945465, + "rewards/rejected": -0.514713704586029, + "step": 2560 + }, + { + "epoch": 0.34, + "learning_rate": 4.1971800677081696e-06, + "logits/chosen": -2.4523494243621826, + "logits/rejected": -2.295958995819092, + "logps/chosen": -249.5877685546875, + "logps/rejected": -284.1394958496094, + "loss": 0.5876, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.09122565388679504, + "rewards/margins": 0.44344639778137207, + "rewards/rejected": -0.5346721410751343, + "step": 2570 + }, + { + "epoch": 0.34, + "learning_rate": 4.188776699241661e-06, + "logits/chosen": -2.2753987312316895, + "logits/rejected": -2.2520804405212402, + "logps/chosen": -249.6858367919922, + "logps/rejected": -302.20037841796875, + "loss": 0.6202, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2591573894023895, + "rewards/margins": 0.22782394289970398, + "rewards/rejected": -0.4869813323020935, + "step": 2580 + }, + { + "epoch": 0.34, + "learning_rate": 4.180338087750827e-06, + "logits/chosen": -2.2341039180755615, + "logits/rejected": -2.1473493576049805, + "logps/chosen": -330.99188232421875, + "logps/rejected": -321.80084228515625, + "loss": 0.6238, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2616772949695587, + "rewards/margins": 0.2900907099246979, + "rewards/rejected": -0.5517680048942566, + "step": 2590 + }, + { + "epoch": 0.34, + "learning_rate": 4.1718644093407704e-06, + "logits/chosen": -2.3818371295928955, + "logits/rejected": -2.323246479034424, + "logps/chosen": -277.61602783203125, + "logps/rejected": -312.0962219238281, + "loss": 0.6223, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.057144057005643845, + "rewards/margins": 0.2782011032104492, + "rewards/rejected": -0.3353451192378998, + "step": 2600 + }, + { + "epoch": 0.34, + "learning_rate": 4.163355840848401e-06, + "logits/chosen": -2.2862417697906494, + "logits/rejected": -2.326111316680908, + "logps/chosen": -293.950927734375, + "logps/rejected": -318.50738525390625, + "loss": 0.6024, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11978361755609512, + "rewards/margins": 0.316842645406723, + "rewards/rejected": -0.43662625551223755, + "step": 2610 + }, + { + "epoch": 0.34, + "learning_rate": 4.154812559838748e-06, + "logits/chosen": -2.3543944358825684, + "logits/rejected": -2.303706407546997, + "logps/chosen": -301.2679443359375, + "logps/rejected": -305.17083740234375, + "loss": 0.5571, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09670768678188324, + "rewards/margins": 0.47083932161331177, + "rewards/rejected": -0.5675469636917114, + "step": 2620 + }, + { + "epoch": 0.34, + "learning_rate": 4.146234744601259e-06, + "logits/chosen": -2.0804076194763184, + "logits/rejected": -2.0592329502105713, + "logps/chosen": -347.26690673828125, + "logps/rejected": -321.4139709472656, + "loss": 0.568, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3265538513660431, + "rewards/margins": 0.4145590662956238, + "rewards/rejected": -0.7411128878593445, + "step": 2630 + }, + { + "epoch": 0.35, + "learning_rate": 4.137622574146071e-06, + "logits/chosen": -2.3070764541625977, + "logits/rejected": -2.164135456085205, + "logps/chosen": -289.1574401855469, + "logps/rejected": -289.14593505859375, + "loss": 0.658, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3988410234451294, + "rewards/margins": 0.2665736973285675, + "rewards/rejected": -0.6654146909713745, + "step": 2640 + }, + { + "epoch": 0.35, + "learning_rate": 4.12897622820028e-06, + "logits/chosen": -2.147688388824463, + "logits/rejected": -2.30525279045105, + "logps/chosen": -363.5196838378906, + "logps/rejected": -376.8955993652344, + "loss": 0.5947, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.35801568627357483, + "rewards/margins": 0.3377968668937683, + "rewards/rejected": -0.6958125233650208, + "step": 2650 + }, + { + "epoch": 0.35, + "learning_rate": 4.120295887204191e-06, + "logits/chosen": -2.036778450012207, + "logits/rejected": -2.1223537921905518, + "logps/chosen": -347.158203125, + "logps/rejected": -303.8466796875, + "loss": 0.6315, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.31214606761932373, + "rewards/margins": 0.3557351529598236, + "rewards/rejected": -0.667881190776825, + "step": 2660 + }, + { + "epoch": 0.35, + "learning_rate": 4.111581732307548e-06, + "logits/chosen": -2.1609489917755127, + "logits/rejected": -2.2461633682250977, + "logps/chosen": -396.5936279296875, + "logps/rejected": -346.9856262207031, + "loss": 0.5981, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.29690462350845337, + "rewards/margins": 0.34992241859436035, + "rewards/rejected": -0.6468270421028137, + "step": 2670 + }, + { + "epoch": 0.35, + "learning_rate": 4.1028339453657595e-06, + "logits/chosen": -2.185354709625244, + "logits/rejected": -2.20464825630188, + "logps/chosen": -301.9336853027344, + "logps/rejected": -297.63848876953125, + "loss": 0.5928, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.12790650129318237, + "rewards/margins": 0.2755155861377716, + "rewards/rejected": -0.4034220278263092, + "step": 2680 + }, + { + "epoch": 0.35, + "learning_rate": 4.094052708936096e-06, + "logits/chosen": -2.303433656692505, + "logits/rejected": -2.291386842727661, + "logps/chosen": -304.7251892089844, + "logps/rejected": -271.1741027832031, + "loss": 0.5825, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18045970797538757, + "rewards/margins": 0.29294687509536743, + "rewards/rejected": -0.4734066128730774, + "step": 2690 + }, + { + "epoch": 0.35, + "learning_rate": 4.0852382062738874e-06, + "logits/chosen": -2.155461072921753, + "logits/rejected": -2.221578359603882, + "logps/chosen": -266.7882080078125, + "logps/rejected": -297.5181884765625, + "loss": 0.5807, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.25950485467910767, + "rewards/margins": 0.3401879668235779, + "rewards/rejected": -0.5996927618980408, + "step": 2700 + }, + { + "epoch": 0.35, + "learning_rate": 4.076390621328693e-06, + "logits/chosen": -2.242144823074341, + "logits/rejected": -2.1814465522766113, + "logps/chosen": -284.7609558105469, + "logps/rejected": -254.7688751220703, + "loss": 0.6326, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.22597642242908478, + "rewards/margins": 0.22978129982948303, + "rewards/rejected": -0.4557577073574066, + "step": 2710 + }, + { + "epoch": 0.36, + "learning_rate": 4.067510138740467e-06, + "logits/chosen": -2.0894429683685303, + "logits/rejected": -2.0228912830352783, + "logps/chosen": -306.20648193359375, + "logps/rejected": -333.9632873535156, + "loss": 0.648, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3702475428581238, + "rewards/margins": 0.3178880214691162, + "rewards/rejected": -0.68813556432724, + "step": 2720 + }, + { + "epoch": 0.36, + "learning_rate": 4.058596943835703e-06, + "logits/chosen": -2.2216053009033203, + "logits/rejected": -2.103013515472412, + "logps/chosen": -324.4733581542969, + "logps/rejected": -384.42218017578125, + "loss": 0.5703, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.33959847688674927, + "rewards/margins": 0.43944621086120605, + "rewards/rejected": -0.7790446281433105, + "step": 2730 + }, + { + "epoch": 0.36, + "learning_rate": 4.049651222623568e-06, + "logits/chosen": -2.0486068725585938, + "logits/rejected": -2.0759623050689697, + "logps/chosen": -340.746337890625, + "logps/rejected": -304.642822265625, + "loss": 0.6202, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3014008402824402, + "rewards/margins": 0.33375871181488037, + "rewards/rejected": -0.6351595520973206, + "step": 2740 + }, + { + "epoch": 0.36, + "learning_rate": 4.040673161792014e-06, + "logits/chosen": -2.0863208770751953, + "logits/rejected": -2.2182228565216064, + "logps/chosen": -263.3828430175781, + "logps/rejected": -306.2562561035156, + "loss": 0.5554, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.30262234807014465, + "rewards/margins": 0.5143641829490662, + "rewards/rejected": -0.8169865608215332, + "step": 2750 + }, + { + "epoch": 0.36, + "learning_rate": 4.031662948703896e-06, + "logits/chosen": -2.131186008453369, + "logits/rejected": -2.1568381786346436, + "logps/chosen": -355.12188720703125, + "logps/rejected": -363.9092102050781, + "loss": 0.5581, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.21906176209449768, + "rewards/margins": 0.4955914616584778, + "rewards/rejected": -0.7146531939506531, + "step": 2760 + }, + { + "epoch": 0.36, + "learning_rate": 4.022620771393047e-06, + "logits/chosen": -2.118340253829956, + "logits/rejected": -2.098379135131836, + "logps/chosen": -329.6128845214844, + "logps/rejected": -316.06634521484375, + "loss": 0.5547, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.32899558544158936, + "rewards/margins": 0.2673387825489044, + "rewards/rejected": -0.5963343977928162, + "step": 2770 + }, + { + "epoch": 0.36, + "learning_rate": 4.013546818560362e-06, + "logits/chosen": -2.2855918407440186, + "logits/rejected": -2.227619171142578, + "logps/chosen": -373.0777282714844, + "logps/rejected": -338.8295593261719, + "loss": 0.5698, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.30602166056632996, + "rewards/margins": 0.2875543236732483, + "rewards/rejected": -0.5935760140419006, + "step": 2780 + }, + { + "epoch": 0.37, + "learning_rate": 4.00444127956986e-06, + "logits/chosen": -2.0654170513153076, + "logits/rejected": -2.0763885974884033, + "logps/chosen": -335.36932373046875, + "logps/rejected": -368.7438049316406, + "loss": 0.5724, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24469080567359924, + "rewards/margins": 0.4334893226623535, + "rewards/rejected": -0.6781800985336304, + "step": 2790 + }, + { + "epoch": 0.37, + "learning_rate": 3.9953043444447255e-06, + "logits/chosen": -2.013458728790283, + "logits/rejected": -2.0963714122772217, + "logps/chosen": -339.3968200683594, + "logps/rejected": -383.424560546875, + "loss": 0.6198, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25088945031166077, + "rewards/margins": 0.28715524077415466, + "rewards/rejected": -0.5380446314811707, + "step": 2800 + }, + { + "epoch": 0.37, + "learning_rate": 3.986136203863355e-06, + "logits/chosen": -2.154468059539795, + "logits/rejected": -2.126218557357788, + "logps/chosen": -316.4076232910156, + "logps/rejected": -336.19805908203125, + "loss": 0.5697, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3153080642223358, + "rewards/margins": 0.3109374940395355, + "rewards/rejected": -0.6262456178665161, + "step": 2810 + }, + { + "epoch": 0.37, + "learning_rate": 3.976937049155365e-06, + "logits/chosen": -2.2662549018859863, + "logits/rejected": -2.0330440998077393, + "logps/chosen": -328.37689208984375, + "logps/rejected": -397.66192626953125, + "loss": 0.612, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.23124253749847412, + "rewards/margins": 0.23719322681427002, + "rewards/rejected": -0.46843576431274414, + "step": 2820 + }, + { + "epoch": 0.37, + "learning_rate": 3.967707072297608e-06, + "logits/chosen": -2.1351351737976074, + "logits/rejected": -2.0593247413635254, + "logps/chosen": -253.71023559570312, + "logps/rejected": -282.72906494140625, + "loss": 0.5432, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1287592053413391, + "rewards/margins": 0.40838032960891724, + "rewards/rejected": -0.5371395349502563, + "step": 2830 + }, + { + "epoch": 0.37, + "learning_rate": 3.958446465910159e-06, + "logits/chosen": -2.0801303386688232, + "logits/rejected": -2.1126909255981445, + "logps/chosen": -300.8744201660156, + "logps/rejected": -336.2434387207031, + "loss": 0.5857, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.22396187484264374, + "rewards/margins": 0.18644443154335022, + "rewards/rejected": -0.41040635108947754, + "step": 2840 + }, + { + "epoch": 0.37, + "learning_rate": 3.9491554232523066e-06, + "logits/chosen": -1.8716402053833008, + "logits/rejected": -1.930053949356079, + "logps/chosen": -285.86676025390625, + "logps/rejected": -308.9801330566406, + "loss": 0.6083, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27063798904418945, + "rewards/margins": 0.3528960645198822, + "rewards/rejected": -0.6235340237617493, + "step": 2850 + }, + { + "epoch": 0.37, + "learning_rate": 3.939834138218505e-06, + "logits/chosen": -1.9066463708877563, + "logits/rejected": -1.8312351703643799, + "logps/chosen": -260.8415222167969, + "logps/rejected": -307.47137451171875, + "loss": 0.5802, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.23134949803352356, + "rewards/margins": 0.34520596265792847, + "rewards/rejected": -0.5765554308891296, + "step": 2860 + }, + { + "epoch": 0.38, + "learning_rate": 3.930482805334339e-06, + "logits/chosen": -2.0641021728515625, + "logits/rejected": -1.9888460636138916, + "logps/chosen": -285.9426574707031, + "logps/rejected": -280.6845397949219, + "loss": 0.5867, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.33594805002212524, + "rewards/margins": 0.25887084007263184, + "rewards/rejected": -0.5948188900947571, + "step": 2870 + }, + { + "epoch": 0.38, + "learning_rate": 3.921101619752464e-06, + "logits/chosen": -2.114084005355835, + "logits/rejected": -2.1507246494293213, + "logps/chosen": -328.9808044433594, + "logps/rejected": -373.2530822753906, + "loss": 0.6008, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.25964587926864624, + "rewards/margins": 0.2907247841358185, + "rewards/rejected": -0.5503706336021423, + "step": 2880 + }, + { + "epoch": 0.38, + "learning_rate": 3.911690777248525e-06, + "logits/chosen": -2.107130765914917, + "logits/rejected": -2.0618035793304443, + "logps/chosen": -265.3777160644531, + "logps/rejected": -277.8905944824219, + "loss": 0.5935, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20159196853637695, + "rewards/margins": 0.19179949164390564, + "rewards/rejected": -0.3933914303779602, + "step": 2890 + }, + { + "epoch": 0.38, + "learning_rate": 3.902250474217079e-06, + "logits/chosen": -1.9314912557601929, + "logits/rejected": -1.9056018590927124, + "logps/chosen": -203.7743377685547, + "logps/rejected": -299.52294921875, + "loss": 0.5475, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11742253601551056, + "rewards/margins": 0.5690168142318726, + "rewards/rejected": -0.6864393353462219, + "step": 2900 + }, + { + "epoch": 0.38, + "learning_rate": 3.892780907667495e-06, + "logits/chosen": -2.3248629570007324, + "logits/rejected": -2.138824939727783, + "logps/chosen": -337.47442626953125, + "logps/rejected": -356.5024108886719, + "loss": 0.5704, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13495245575904846, + "rewards/margins": 0.3198757767677307, + "rewards/rejected": -0.45482826232910156, + "step": 2910 + }, + { + "epoch": 0.38, + "learning_rate": 3.883282275219837e-06, + "logits/chosen": -2.003467082977295, + "logits/rejected": -2.1252570152282715, + "logps/chosen": -291.63104248046875, + "logps/rejected": -323.35394287109375, + "loss": 0.5762, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.259773313999176, + "rewards/margins": 0.3523366451263428, + "rewards/rejected": -0.612109899520874, + "step": 2920 + }, + { + "epoch": 0.38, + "learning_rate": 3.873754775100751e-06, + "logits/chosen": -1.76229989528656, + "logits/rejected": -1.7705276012420654, + "logps/chosen": -297.5982971191406, + "logps/rejected": -361.0187072753906, + "loss": 0.58, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.45478639006614685, + "rewards/margins": 0.43320173025131226, + "rewards/rejected": -0.8879879713058472, + "step": 2930 + }, + { + "epoch": 0.38, + "learning_rate": 3.8641986061393145e-06, + "logits/chosen": -1.9891338348388672, + "logits/rejected": -1.7917144298553467, + "logps/chosen": -340.5142822265625, + "logps/rejected": -329.16650390625, + "loss": 0.5425, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.36488741636276245, + "rewards/margins": 0.5596657991409302, + "rewards/rejected": -0.9245532155036926, + "step": 2940 + }, + { + "epoch": 0.39, + "learning_rate": 3.854613967762898e-06, + "logits/chosen": -1.7319562435150146, + "logits/rejected": -1.7842572927474976, + "logps/chosen": -305.44927978515625, + "logps/rejected": -376.9012756347656, + "loss": 0.5102, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.29306459426879883, + "rewards/margins": 0.5575570464134216, + "rewards/rejected": -0.8506217002868652, + "step": 2950 + }, + { + "epoch": 0.39, + "learning_rate": 3.845001059992999e-06, + "logits/chosen": -1.970813512802124, + "logits/rejected": -1.7959372997283936, + "logps/chosen": -364.3366394042969, + "logps/rejected": -381.53057861328125, + "loss": 0.5737, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4650901257991791, + "rewards/margins": 0.40246671438217163, + "rewards/rejected": -0.8675567507743835, + "step": 2960 + }, + { + "epoch": 0.39, + "learning_rate": 3.835360083441067e-06, + "logits/chosen": -1.9703340530395508, + "logits/rejected": -1.9452049732208252, + "logps/chosen": -318.2285461425781, + "logps/rejected": -342.3816833496094, + "loss": 0.622, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7103244662284851, + "rewards/margins": 0.20334331691265106, + "rewards/rejected": -0.9136677980422974, + "step": 2970 + }, + { + "epoch": 0.39, + "learning_rate": 3.825691239304318e-06, + "logits/chosen": -1.7512576580047607, + "logits/rejected": -1.9350429773330688, + "logps/chosen": -323.9259338378906, + "logps/rejected": -361.47406005859375, + "loss": 0.5961, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4926982820034027, + "rewards/margins": 0.5273388028144836, + "rewards/rejected": -1.020037055015564, + "step": 2980 + }, + { + "epoch": 0.39, + "learning_rate": 3.8159947293615385e-06, + "logits/chosen": -1.9276981353759766, + "logits/rejected": -2.0005970001220703, + "logps/chosen": -374.3683166503906, + "logps/rejected": -364.6202392578125, + "loss": 0.6238, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6283657550811768, + "rewards/margins": 0.32263484597206116, + "rewards/rejected": -0.9510005712509155, + "step": 2990 + }, + { + "epoch": 0.39, + "learning_rate": 3.806270755968866e-06, + "logits/chosen": -1.8625282049179077, + "logits/rejected": -1.7391935586929321, + "logps/chosen": -239.41372680664062, + "logps/rejected": -336.10821533203125, + "loss": 0.5074, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.42174920439720154, + "rewards/margins": 0.5053009390830994, + "rewards/rejected": -0.9270502328872681, + "step": 3000 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -1.8925442695617676, + "eval_logits/rejected": -1.8683404922485352, + "eval_logps/chosen": -339.8664855957031, + "eval_logps/rejected": -345.8194274902344, + "eval_loss": 0.583726167678833, + "eval_rewards/accuracies": 0.6940000057220459, + "eval_rewards/chosen": -0.4892871081829071, + "eval_rewards/margins": 0.3993385136127472, + "eval_rewards/rejected": -0.8886256814002991, + "eval_runtime": 470.3396, + "eval_samples_per_second": 4.252, + "eval_steps_per_second": 1.063, + "step": 3000 + }, + { + "epoch": 0.39, + "learning_rate": 3.7965195220555784e-06, + "logits/chosen": -1.9942333698272705, + "logits/rejected": -2.0117554664611816, + "logps/chosen": -339.41314697265625, + "logps/rejected": -375.5736389160156, + "loss": 0.5657, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.36183083057403564, + "rewards/margins": 0.3870917558670044, + "rewards/rejected": -0.74892258644104, + "step": 3010 + }, + { + "epoch": 0.4, + "learning_rate": 3.786741231119847e-06, + "logits/chosen": -1.888830542564392, + "logits/rejected": -1.9655253887176514, + "logps/chosen": -299.9886779785156, + "logps/rejected": -364.9654235839844, + "loss": 0.4903, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.29870834946632385, + "rewards/margins": 0.60088050365448, + "rewards/rejected": -0.899588942527771, + "step": 3020 + }, + { + "epoch": 0.4, + "learning_rate": 3.7769360872244992e-06, + "logits/chosen": -2.050785779953003, + "logits/rejected": -1.9752006530761719, + "logps/chosen": -360.0720520019531, + "logps/rejected": -389.65277099609375, + "loss": 0.5904, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4514765739440918, + "rewards/margins": 0.3611275553703308, + "rewards/rejected": -0.8126041293144226, + "step": 3030 + }, + { + "epoch": 0.4, + "learning_rate": 3.767104294992754e-06, + "logits/chosen": -2.1255500316619873, + "logits/rejected": -1.9410721063613892, + "logps/chosen": -294.6708068847656, + "logps/rejected": -296.01959228515625, + "loss": 0.6338, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3220329284667969, + "rewards/margins": 0.24862143397331238, + "rewards/rejected": -0.5706543326377869, + "step": 3040 + }, + { + "epoch": 0.4, + "learning_rate": 3.7572460596039524e-06, + "logits/chosen": -2.0564870834350586, + "logits/rejected": -1.9291785955429077, + "logps/chosen": -273.62689208984375, + "logps/rejected": -306.6474304199219, + "loss": 0.639, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3566773533821106, + "rewards/margins": 0.30197399854660034, + "rewards/rejected": -0.6586513519287109, + "step": 3050 + }, + { + "epoch": 0.4, + "learning_rate": 3.74736158678928e-06, + "logits/chosen": -1.691098928451538, + "logits/rejected": -1.6534446477890015, + "logps/chosen": -250.8104705810547, + "logps/rejected": -287.73492431640625, + "loss": 0.5514, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26907315850257874, + "rewards/margins": 0.47001558542251587, + "rewards/rejected": -0.739088773727417, + "step": 3060 + }, + { + "epoch": 0.4, + "learning_rate": 3.7374510828274673e-06, + "logits/chosen": -2.013826370239258, + "logits/rejected": -1.925368309020996, + "logps/chosen": -291.58526611328125, + "logps/rejected": -373.1654052734375, + "loss": 0.5742, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2754715383052826, + "rewards/margins": 0.39501675963401794, + "rewards/rejected": -0.6704882383346558, + "step": 3070 + }, + { + "epoch": 0.4, + "learning_rate": 3.72751475454049e-06, + "logits/chosen": -1.9034068584442139, + "logits/rejected": -1.9186067581176758, + "logps/chosen": -330.48345947265625, + "logps/rejected": -342.3094787597656, + "loss": 0.5517, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2500624656677246, + "rewards/margins": 0.4651865065097809, + "rewards/rejected": -0.7152489423751831, + "step": 3080 + }, + { + "epoch": 0.4, + "learning_rate": 3.7175528092892503e-06, + "logits/chosen": -2.0310986042022705, + "logits/rejected": -1.9921343326568604, + "logps/chosen": -314.3505859375, + "logps/rejected": -308.22100830078125, + "loss": 0.6007, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3394377529621124, + "rewards/margins": 0.33183586597442627, + "rewards/rejected": -0.6712735891342163, + "step": 3090 + }, + { + "epoch": 0.41, + "learning_rate": 3.7075654549692498e-06, + "logits/chosen": -1.760263442993164, + "logits/rejected": -1.6774251461029053, + "logps/chosen": -305.39593505859375, + "logps/rejected": -309.4274597167969, + "loss": 0.6064, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.34780532121658325, + "rewards/margins": 0.34272414445877075, + "rewards/rejected": -0.6905295252799988, + "step": 3100 + }, + { + "epoch": 0.41, + "learning_rate": 3.697552900006249e-06, + "logits/chosen": -1.9550129175186157, + "logits/rejected": -1.979591727256775, + "logps/chosen": -328.69854736328125, + "logps/rejected": -389.3207092285156, + "loss": 0.5857, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47823166847229004, + "rewards/margins": 0.26306986808776855, + "rewards/rejected": -0.7413015365600586, + "step": 3110 + }, + { + "epoch": 0.41, + "learning_rate": 3.6875153533519244e-06, + "logits/chosen": -2.008551597595215, + "logits/rejected": -1.7621619701385498, + "logps/chosen": -330.7730712890625, + "logps/rejected": -330.113037109375, + "loss": 0.5356, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.34347599744796753, + "rewards/margins": 0.5221987962722778, + "rewards/rejected": -0.8656747937202454, + "step": 3120 + }, + { + "epoch": 0.41, + "learning_rate": 3.6774530244794992e-06, + "logits/chosen": -1.9466478824615479, + "logits/rejected": -1.9106762409210205, + "logps/chosen": -367.3138427734375, + "logps/rejected": -353.61578369140625, + "loss": 0.577, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.350848525762558, + "rewards/margins": 0.43803420662879944, + "rewards/rejected": -0.7888827919960022, + "step": 3130 + }, + { + "epoch": 0.41, + "learning_rate": 3.667366123379378e-06, + "logits/chosen": -1.7992089986801147, + "logits/rejected": -1.7090864181518555, + "logps/chosen": -320.92767333984375, + "logps/rejected": -382.07049560546875, + "loss": 0.5307, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.35857582092285156, + "rewards/margins": 0.4412190914154053, + "rewards/rejected": -0.7997949719429016, + "step": 3140 + }, + { + "epoch": 0.41, + "learning_rate": 3.6572548605547607e-06, + "logits/chosen": -1.9332202672958374, + "logits/rejected": -1.7783571481704712, + "logps/chosen": -344.77960205078125, + "logps/rejected": -333.2381286621094, + "loss": 0.5498, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.39050596952438354, + "rewards/margins": 0.3955034911632538, + "rewards/rejected": -0.7860094308853149, + "step": 3150 + }, + { + "epoch": 0.41, + "learning_rate": 3.6471194470172538e-06, + "logits/chosen": -2.1253113746643066, + "logits/rejected": -2.131592273712158, + "logps/chosen": -411.53717041015625, + "logps/rejected": -421.5502014160156, + "loss": 0.5551, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5085423588752747, + "rewards/margins": 0.4050801694393158, + "rewards/rejected": -0.9136225581169128, + "step": 3160 + }, + { + "epoch": 0.41, + "learning_rate": 3.636960094282461e-06, + "logits/chosen": -1.588428258895874, + "logits/rejected": -1.7615245580673218, + "logps/chosen": -284.22906494140625, + "logps/rejected": -344.3065490722656, + "loss": 0.513, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4755423069000244, + "rewards/margins": 0.45948463678359985, + "rewards/rejected": -0.9350269436836243, + "step": 3170 + }, + { + "epoch": 0.42, + "learning_rate": 3.6267770143655743e-06, + "logits/chosen": -1.9426956176757812, + "logits/rejected": -1.9251439571380615, + "logps/chosen": -309.91845703125, + "logps/rejected": -315.4515380859375, + "loss": 0.6187, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3190910220146179, + "rewards/margins": 0.2664428651332855, + "rewards/rejected": -0.5855339765548706, + "step": 3180 + }, + { + "epoch": 0.42, + "learning_rate": 3.6165704197769484e-06, + "logits/chosen": -1.8122657537460327, + "logits/rejected": -1.7266877889633179, + "logps/chosen": -283.2447204589844, + "logps/rejected": -322.1012268066406, + "loss": 0.5132, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3078143000602722, + "rewards/margins": 0.5648934841156006, + "rewards/rejected": -0.872707724571228, + "step": 3190 + }, + { + "epoch": 0.42, + "learning_rate": 3.606340523517663e-06, + "logits/chosen": -2.02577543258667, + "logits/rejected": -1.9819904565811157, + "logps/chosen": -345.08013916015625, + "logps/rejected": -388.6039733886719, + "loss": 0.5648, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3757336139678955, + "rewards/margins": 0.5439754128456116, + "rewards/rejected": -0.9197090268135071, + "step": 3200 + }, + { + "epoch": 0.42, + "learning_rate": 3.5960875390750793e-06, + "logits/chosen": -1.883567452430725, + "logits/rejected": -1.83354914188385, + "logps/chosen": -328.5043640136719, + "logps/rejected": -389.57147216796875, + "loss": 0.657, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5405290722846985, + "rewards/margins": 0.2401866465806961, + "rewards/rejected": -0.7807157039642334, + "step": 3210 + }, + { + "epoch": 0.42, + "learning_rate": 3.585811680418386e-06, + "logits/chosen": -1.8089916706085205, + "logits/rejected": -1.6490482091903687, + "logps/chosen": -290.0383605957031, + "logps/rejected": -280.5243225097656, + "loss": 0.5336, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.25913602113723755, + "rewards/margins": 0.5148822069168091, + "rewards/rejected": -0.7740182280540466, + "step": 3220 + }, + { + "epoch": 0.42, + "learning_rate": 3.5755131619941347e-06, + "logits/chosen": -1.8778012990951538, + "logits/rejected": -1.8739662170410156, + "logps/chosen": -371.23541259765625, + "logps/rejected": -398.765625, + "loss": 0.5434, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4466145634651184, + "rewards/margins": 0.5453084111213684, + "rewards/rejected": -0.9919229745864868, + "step": 3230 + }, + { + "epoch": 0.42, + "learning_rate": 3.565192198721759e-06, + "logits/chosen": -1.8352159261703491, + "logits/rejected": -1.6965032815933228, + "logps/chosen": -320.8368835449219, + "logps/rejected": -293.8391418457031, + "loss": 0.5918, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3606331944465637, + "rewards/margins": 0.49762916564941406, + "rewards/rejected": -0.8582623600959778, + "step": 3240 + }, + { + "epoch": 0.43, + "learning_rate": 3.5548490059890965e-06, + "logits/chosen": -1.9786624908447266, + "logits/rejected": -1.9229927062988281, + "logps/chosen": -354.0396728515625, + "logps/rejected": -359.8043518066406, + "loss": 0.6234, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3673448860645294, + "rewards/margins": 0.28263354301452637, + "rewards/rejected": -0.649978518486023, + "step": 3250 + }, + { + "epoch": 0.43, + "learning_rate": 3.5444837996478903e-06, + "logits/chosen": -1.675825834274292, + "logits/rejected": -1.7425578832626343, + "logps/chosen": -281.09356689453125, + "logps/rejected": -372.2138671875, + "loss": 0.5869, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.261254221200943, + "rewards/margins": 0.5568670034408569, + "rewards/rejected": -0.8181211352348328, + "step": 3260 + }, + { + "epoch": 0.43, + "learning_rate": 3.534096796009282e-06, + "logits/chosen": -2.0022459030151367, + "logits/rejected": -1.9982783794403076, + "logps/chosen": -276.1816101074219, + "logps/rejected": -320.88433837890625, + "loss": 0.608, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2739899158477783, + "rewards/margins": 0.2969050407409668, + "rewards/rejected": -0.5708949565887451, + "step": 3270 + }, + { + "epoch": 0.43, + "learning_rate": 3.5236882118393046e-06, + "logits/chosen": -1.9864498376846313, + "logits/rejected": -1.9585399627685547, + "logps/chosen": -314.32049560546875, + "logps/rejected": -324.8771667480469, + "loss": 0.5573, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.34241166710853577, + "rewards/margins": 0.3525606691837311, + "rewards/rejected": -0.6949723958969116, + "step": 3280 + }, + { + "epoch": 0.43, + "learning_rate": 3.5132582643543513e-06, + "logits/chosen": -1.88204824924469, + "logits/rejected": -1.6861655712127686, + "logps/chosen": -292.99005126953125, + "logps/rejected": -310.88739013671875, + "loss": 0.5405, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.11707563698291779, + "rewards/margins": 0.5362022519111633, + "rewards/rejected": -0.6532778143882751, + "step": 3290 + }, + { + "epoch": 0.43, + "learning_rate": 3.5028071712166456e-06, + "logits/chosen": -1.5832637548446655, + "logits/rejected": -1.6472686529159546, + "logps/chosen": -305.79010009765625, + "logps/rejected": -327.6679382324219, + "loss": 0.5269, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12145811319351196, + "rewards/margins": 0.6676918268203735, + "rewards/rejected": -0.7891498804092407, + "step": 3300 + }, + { + "epoch": 0.43, + "learning_rate": 3.4923351505297008e-06, + "logits/chosen": -1.6448742151260376, + "logits/rejected": -1.8500255346298218, + "logps/chosen": -330.3373107910156, + "logps/rejected": -285.66510009765625, + "loss": 0.6104, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3089975118637085, + "rewards/margins": 0.4476284980773926, + "rewards/rejected": -0.7566260099411011, + "step": 3310 + }, + { + "epoch": 0.43, + "learning_rate": 3.481842420833766e-06, + "logits/chosen": -2.031259059906006, + "logits/rejected": -1.9165537357330322, + "logps/chosen": -312.6297607421875, + "logps/rejected": -352.7905578613281, + "loss": 0.5613, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2148057520389557, + "rewards/margins": 0.5726595520973206, + "rewards/rejected": -0.7874653935432434, + "step": 3320 + }, + { + "epoch": 0.44, + "learning_rate": 3.4713292011012645e-06, + "logits/chosen": -1.7606697082519531, + "logits/rejected": -1.6835781335830688, + "logps/chosen": -278.14349365234375, + "logps/rejected": -313.0967712402344, + "loss": 0.5691, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12200482934713364, + "rewards/margins": 0.3358001410961151, + "rewards/rejected": -0.45780497789382935, + "step": 3330 + }, + { + "epoch": 0.44, + "learning_rate": 3.4607957107322277e-06, + "logits/chosen": -1.6394850015640259, + "logits/rejected": -1.7286498546600342, + "logps/chosen": -251.3562774658203, + "logps/rejected": -320.8995361328125, + "loss": 0.6055, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.28670233488082886, + "rewards/margins": 0.4773294925689697, + "rewards/rejected": -0.7640317678451538, + "step": 3340 + }, + { + "epoch": 0.44, + "learning_rate": 3.4502421695497112e-06, + "logits/chosen": -1.880221962928772, + "logits/rejected": -1.7732661962509155, + "logps/chosen": -324.29595947265625, + "logps/rejected": -331.45819091796875, + "loss": 0.6354, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.19243960082530975, + "rewards/margins": 0.5145636796951294, + "rewards/rejected": -0.7070032954216003, + "step": 3350 + }, + { + "epoch": 0.44, + "learning_rate": 3.4396687977952137e-06, + "logits/chosen": -1.9363040924072266, + "logits/rejected": -1.7693822383880615, + "logps/chosen": -263.6895446777344, + "logps/rejected": -296.7872009277344, + "loss": 0.5735, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2648986577987671, + "rewards/margins": 0.49490612745285034, + "rewards/rejected": -0.7598048448562622, + "step": 3360 + }, + { + "epoch": 0.44, + "learning_rate": 3.429075816124075e-06, + "logits/chosen": -2.0347540378570557, + "logits/rejected": -1.9576051235198975, + "logps/chosen": -416.261474609375, + "logps/rejected": -391.58319091796875, + "loss": 0.5312, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2854515314102173, + "rewards/margins": 0.46610012650489807, + "rewards/rejected": -0.7515517473220825, + "step": 3370 + }, + { + "epoch": 0.44, + "learning_rate": 3.418463445600874e-06, + "logits/chosen": -1.9797786474227905, + "logits/rejected": -1.8702456951141357, + "logps/chosen": -356.02569580078125, + "logps/rejected": -296.83013916015625, + "loss": 0.6574, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.35800525546073914, + "rewards/margins": 0.182878777384758, + "rewards/rejected": -0.5408840179443359, + "step": 3380 + }, + { + "epoch": 0.44, + "learning_rate": 3.4078319076948173e-06, + "logits/chosen": -1.814796805381775, + "logits/rejected": -1.6449832916259766, + "logps/chosen": -308.68695068359375, + "logps/rejected": -302.3656005859375, + "loss": 0.5907, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3078504204750061, + "rewards/margins": 0.2429850846529007, + "rewards/rejected": -0.5508354902267456, + "step": 3390 + }, + { + "epoch": 0.44, + "learning_rate": 3.3971814242751123e-06, + "logits/chosen": -1.67721688747406, + "logits/rejected": -1.7463533878326416, + "logps/chosen": -348.93829345703125, + "logps/rejected": -369.0666198730469, + "loss": 0.6087, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3005754351615906, + "rewards/margins": 0.43208903074264526, + "rewards/rejected": -0.7326644659042358, + "step": 3400 + }, + { + "epoch": 0.45, + "learning_rate": 3.386512217606339e-06, + "logits/chosen": -1.9113638401031494, + "logits/rejected": -1.596459150314331, + "logps/chosen": -315.2054748535156, + "logps/rejected": -343.07696533203125, + "loss": 0.5382, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31064683198928833, + "rewards/margins": 0.38677093386650085, + "rewards/rejected": -0.6974178552627563, + "step": 3410 + }, + { + "epoch": 0.45, + "learning_rate": 3.375824510343816e-06, + "logits/chosen": -1.7063785791397095, + "logits/rejected": -1.74051034450531, + "logps/chosen": -268.4913330078125, + "logps/rejected": -274.31854248046875, + "loss": 0.6151, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.368520051240921, + "rewards/margins": 0.4169144034385681, + "rewards/rejected": -0.7854345440864563, + "step": 3420 + }, + { + "epoch": 0.45, + "learning_rate": 3.3651185255289466e-06, + "logits/chosen": -1.973966360092163, + "logits/rejected": -1.9200187921524048, + "logps/chosen": -330.79681396484375, + "logps/rejected": -329.52337646484375, + "loss": 0.5539, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3142684996128082, + "rewards/margins": 0.3972964286804199, + "rewards/rejected": -0.7115648984909058, + "step": 3430 + }, + { + "epoch": 0.45, + "learning_rate": 3.354394486584568e-06, + "logits/chosen": -1.6630948781967163, + "logits/rejected": -1.6668355464935303, + "logps/chosen": -275.77325439453125, + "logps/rejected": -332.2265930175781, + "loss": 0.6233, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3652837872505188, + "rewards/margins": 0.24674764275550842, + "rewards/rejected": -0.6120314598083496, + "step": 3440 + }, + { + "epoch": 0.45, + "learning_rate": 3.3436526173102913e-06, + "logits/chosen": -1.854465126991272, + "logits/rejected": -1.8468387126922607, + "logps/chosen": -353.0055236816406, + "logps/rejected": -368.3038024902344, + "loss": 0.5685, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3907431960105896, + "rewards/margins": 0.42177945375442505, + "rewards/rejected": -0.8125227093696594, + "step": 3450 + }, + { + "epoch": 0.45, + "learning_rate": 3.3328931418778254e-06, + "logits/chosen": -1.7449018955230713, + "logits/rejected": -1.562340259552002, + "logps/chosen": -304.9818115234375, + "logps/rejected": -320.1778564453125, + "loss": 0.5605, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5152902603149414, + "rewards/margins": 0.5426300168037415, + "rewards/rejected": -1.057920217514038, + "step": 3460 + }, + { + "epoch": 0.45, + "learning_rate": 3.3221162848263028e-06, + "logits/chosen": -1.7783063650131226, + "logits/rejected": -1.69500732421875, + "logps/chosen": -343.2347412109375, + "logps/rejected": -319.19403076171875, + "loss": 0.5709, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6175016760826111, + "rewards/margins": 0.4418589472770691, + "rewards/rejected": -1.0593606233596802, + "step": 3470 + }, + { + "epoch": 0.46, + "learning_rate": 3.3113222710575914e-06, + "logits/chosen": -2.046802043914795, + "logits/rejected": -1.8008846044540405, + "logps/chosen": -374.3642883300781, + "logps/rejected": -387.9255065917969, + "loss": 0.5784, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.40988707542419434, + "rewards/margins": 0.49350953102111816, + "rewards/rejected": -0.903396725654602, + "step": 3480 + }, + { + "epoch": 0.46, + "learning_rate": 3.300511325831603e-06, + "logits/chosen": -1.892831563949585, + "logits/rejected": -1.7738029956817627, + "logps/chosen": -372.5390625, + "logps/rejected": -341.38909912109375, + "loss": 0.5703, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6792625188827515, + "rewards/margins": 0.27183204889297485, + "rewards/rejected": -0.9510945081710815, + "step": 3490 + }, + { + "epoch": 0.46, + "learning_rate": 3.289683674761592e-06, + "logits/chosen": -1.8223018646240234, + "logits/rejected": -1.9840595722198486, + "logps/chosen": -370.36962890625, + "logps/rejected": -365.89373779296875, + "loss": 0.5492, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.5442531704902649, + "rewards/margins": 0.49803152680397034, + "rewards/rejected": -1.0422847270965576, + "step": 3500 + }, + { + "epoch": 0.46, + "learning_rate": 3.2788395438094444e-06, + "logits/chosen": -1.542337417602539, + "logits/rejected": -1.7137134075164795, + "logps/chosen": -325.2630310058594, + "logps/rejected": -337.03997802734375, + "loss": 0.6226, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.37942296266555786, + "rewards/margins": 0.3259304463863373, + "rewards/rejected": -0.7053534388542175, + "step": 3510 + }, + { + "epoch": 0.46, + "learning_rate": 3.2679791592809653e-06, + "logits/chosen": -1.7539052963256836, + "logits/rejected": -2.0646724700927734, + "logps/chosen": -310.50018310546875, + "logps/rejected": -360.68682861328125, + "loss": 0.5787, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.45985502004623413, + "rewards/margins": 0.34542837738990784, + "rewards/rejected": -0.8052834272384644, + "step": 3520 + }, + { + "epoch": 0.46, + "learning_rate": 3.257102747821157e-06, + "logits/chosen": -1.7291984558105469, + "logits/rejected": -1.8702964782714844, + "logps/chosen": -337.31634521484375, + "logps/rejected": -346.04705810546875, + "loss": 0.624, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2818695902824402, + "rewards/margins": 0.463838666677475, + "rewards/rejected": -0.7457082271575928, + "step": 3530 + }, + { + "epoch": 0.46, + "learning_rate": 3.246210536409484e-06, + "logits/chosen": -1.748192548751831, + "logits/rejected": -1.7564138174057007, + "logps/chosen": -242.19003295898438, + "logps/rejected": -254.59774780273438, + "loss": 0.5977, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3837249279022217, + "rewards/margins": 0.3885956108570099, + "rewards/rejected": -0.772320568561554, + "step": 3540 + }, + { + "epoch": 0.46, + "learning_rate": 3.235302752355142e-06, + "logits/chosen": -1.7237571477890015, + "logits/rejected": -1.6881086826324463, + "logps/chosen": -315.25164794921875, + "logps/rejected": -336.93096923828125, + "loss": 0.5621, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3909755349159241, + "rewards/margins": 0.4756072461605072, + "rewards/rejected": -0.8665827512741089, + "step": 3550 + }, + { + "epoch": 0.47, + "learning_rate": 3.2243796232923097e-06, + "logits/chosen": -1.7582708597183228, + "logits/rejected": -1.8244333267211914, + "logps/chosen": -242.5635528564453, + "logps/rejected": -286.02545166015625, + "loss": 0.6247, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.282164990901947, + "rewards/margins": 0.3209809958934784, + "rewards/rejected": -0.603145956993103, + "step": 3560 + }, + { + "epoch": 0.47, + "learning_rate": 3.2134413771754037e-06, + "logits/chosen": -1.6943250894546509, + "logits/rejected": -1.6668803691864014, + "logps/chosen": -301.8404541015625, + "logps/rejected": -305.6943359375, + "loss": 0.5362, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2543103098869324, + "rewards/margins": 0.45604562759399414, + "rewards/rejected": -0.7103559374809265, + "step": 3570 + }, + { + "epoch": 0.47, + "learning_rate": 3.2024882422743118e-06, + "logits/chosen": -1.8849399089813232, + "logits/rejected": -1.7547498941421509, + "logps/chosen": -293.3818664550781, + "logps/rejected": -308.4000244140625, + "loss": 0.5684, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22510361671447754, + "rewards/margins": 0.47139206528663635, + "rewards/rejected": -0.6964956521987915, + "step": 3580 + }, + { + "epoch": 0.47, + "learning_rate": 3.1915204471696425e-06, + "logits/chosen": -1.7541160583496094, + "logits/rejected": -1.8730714321136475, + "logps/chosen": -322.0429382324219, + "logps/rejected": -334.17083740234375, + "loss": 0.5846, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.41813772916793823, + "rewards/margins": 0.4513067603111267, + "rewards/rejected": -0.8694444894790649, + "step": 3590 + }, + { + "epoch": 0.47, + "learning_rate": 3.180538220747943e-06, + "logits/chosen": -1.5943326950073242, + "logits/rejected": -1.7133232355117798, + "logps/chosen": -299.25262451171875, + "logps/rejected": -303.612060546875, + "loss": 0.589, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5019037127494812, + "rewards/margins": 0.29574936628341675, + "rewards/rejected": -0.797653079032898, + "step": 3600 + }, + { + "epoch": 0.47, + "learning_rate": 3.1695417921969287e-06, + "logits/chosen": -1.7009267807006836, + "logits/rejected": -1.8506752252578735, + "logps/chosen": -319.67822265625, + "logps/rejected": -321.08807373046875, + "loss": 0.5704, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7186891436576843, + "rewards/margins": 0.37807756662368774, + "rewards/rejected": -1.0967668294906616, + "step": 3610 + }, + { + "epoch": 0.47, + "learning_rate": 3.158531391000697e-06, + "logits/chosen": -1.8101762533187866, + "logits/rejected": -1.8018306493759155, + "logps/chosen": -379.90478515625, + "logps/rejected": -370.8863525390625, + "loss": 0.523, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20886072516441345, + "rewards/margins": 0.6331818103790283, + "rewards/rejected": -0.8420425653457642, + "step": 3620 + }, + { + "epoch": 0.48, + "learning_rate": 3.147507246934943e-06, + "logits/chosen": -1.7746751308441162, + "logits/rejected": -1.7157208919525146, + "logps/chosen": -350.02069091796875, + "logps/rejected": -349.77972412109375, + "loss": 0.5924, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.40597105026245117, + "rewards/margins": 0.4071740210056305, + "rewards/rejected": -0.8131451606750488, + "step": 3630 + }, + { + "epoch": 0.48, + "learning_rate": 3.136469590062158e-06, + "logits/chosen": -1.8396203517913818, + "logits/rejected": -1.6241391897201538, + "logps/chosen": -306.0809631347656, + "logps/rejected": -299.9479675292969, + "loss": 0.5619, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3231487274169922, + "rewards/margins": 0.5906227231025696, + "rewards/rejected": -0.9137715101242065, + "step": 3640 + }, + { + "epoch": 0.48, + "learning_rate": 3.1254186507268354e-06, + "logits/chosen": -1.9199928045272827, + "logits/rejected": -1.8068441152572632, + "logps/chosen": -368.0453186035156, + "logps/rejected": -351.4677734375, + "loss": 0.5735, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43998974561691284, + "rewards/margins": 0.34722068905830383, + "rewards/rejected": -0.7872104048728943, + "step": 3650 + }, + { + "epoch": 0.48, + "learning_rate": 3.114354659550656e-06, + "logits/chosen": -1.84914231300354, + "logits/rejected": -1.8388614654541016, + "logps/chosen": -318.1921081542969, + "logps/rejected": -399.82147216796875, + "loss": 0.5759, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43869534134864807, + "rewards/margins": 0.4454549252986908, + "rewards/rejected": -0.8841502070426941, + "step": 3660 + }, + { + "epoch": 0.48, + "learning_rate": 3.1032778474276816e-06, + "logits/chosen": -1.8377641439437866, + "logits/rejected": -1.6623141765594482, + "logps/chosen": -346.48883056640625, + "logps/rejected": -353.8504333496094, + "loss": 0.6397, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.564034640789032, + "rewards/margins": 0.3558120131492615, + "rewards/rejected": -0.9198466539382935, + "step": 3670 + }, + { + "epoch": 0.48, + "learning_rate": 3.092188445519532e-06, + "logits/chosen": -1.864964246749878, + "logits/rejected": -1.7114427089691162, + "logps/chosen": -324.5653381347656, + "logps/rejected": -325.3702087402344, + "loss": 0.5634, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.21466271579265594, + "rewards/margins": 0.4827520251274109, + "rewards/rejected": -0.697414755821228, + "step": 3680 + }, + { + "epoch": 0.48, + "learning_rate": 3.081086685250565e-06, + "logits/chosen": -2.0764949321746826, + "logits/rejected": -1.9646022319793701, + "logps/chosen": -360.2198791503906, + "logps/rejected": -351.66973876953125, + "loss": 0.5685, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2742803692817688, + "rewards/margins": 0.40870004892349243, + "rewards/rejected": -0.6829804182052612, + "step": 3690 + }, + { + "epoch": 0.48, + "learning_rate": 3.0699727983030434e-06, + "logits/chosen": -1.8122189044952393, + "logits/rejected": -1.7426958084106445, + "logps/chosen": -332.35296630859375, + "logps/rejected": -395.4708251953125, + "loss": 0.6108, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.26644280552864075, + "rewards/margins": 0.48443466424942017, + "rewards/rejected": -0.7508773803710938, + "step": 3700 + }, + { + "epoch": 0.49, + "learning_rate": 3.058847016612301e-06, + "logits/chosen": -1.9748141765594482, + "logits/rejected": -1.7744249105453491, + "logps/chosen": -404.2115783691406, + "logps/rejected": -404.9239807128906, + "loss": 0.5381, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.45408979058265686, + "rewards/margins": 0.509874165058136, + "rewards/rejected": -0.9639638662338257, + "step": 3710 + }, + { + "epoch": 0.49, + "learning_rate": 3.0477095723619034e-06, + "logits/chosen": -1.9092906713485718, + "logits/rejected": -1.6186670064926147, + "logps/chosen": -353.45654296875, + "logps/rejected": -408.7009582519531, + "loss": 0.5569, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5701279044151306, + "rewards/margins": 0.625190794467926, + "rewards/rejected": -1.195318579673767, + "step": 3720 + }, + { + "epoch": 0.49, + "learning_rate": 3.0365606979788003e-06, + "logits/chosen": -1.5970327854156494, + "logits/rejected": -1.643099069595337, + "logps/chosen": -273.62353515625, + "logps/rejected": -322.5993957519531, + "loss": 0.6026, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4693664014339447, + "rewards/margins": 0.3268064558506012, + "rewards/rejected": -0.7961727380752563, + "step": 3730 + }, + { + "epoch": 0.49, + "learning_rate": 3.0254006261284786e-06, + "logits/chosen": -1.9919275045394897, + "logits/rejected": -2.0272574424743652, + "logps/chosen": -335.11334228515625, + "logps/rejected": -353.7044372558594, + "loss": 0.5111, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27459877729415894, + "rewards/margins": 0.5165451765060425, + "rewards/rejected": -0.7911440134048462, + "step": 3740 + }, + { + "epoch": 0.49, + "learning_rate": 3.0142295897101032e-06, + "logits/chosen": -1.5142481327056885, + "logits/rejected": -1.665331482887268, + "logps/chosen": -327.7146301269531, + "logps/rejected": -352.46875, + "loss": 0.5518, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4227105975151062, + "rewards/margins": 0.700469434261322, + "rewards/rejected": -1.1231801509857178, + "step": 3750 + }, + { + "epoch": 0.49, + "learning_rate": 3.0030478218516578e-06, + "logits/chosen": -1.7529665231704712, + "logits/rejected": -1.5313271284103394, + "logps/chosen": -331.0423889160156, + "logps/rejected": -274.0474548339844, + "loss": 0.5559, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3719012141227722, + "rewards/margins": 0.6556496620178223, + "rewards/rejected": -1.0275509357452393, + "step": 3760 + }, + { + "epoch": 0.49, + "learning_rate": 2.9918555559050826e-06, + "logits/chosen": -1.8108575344085693, + "logits/rejected": -1.8639781475067139, + "logps/chosen": -331.29376220703125, + "logps/rejected": -400.363037109375, + "loss": 0.633, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4847791790962219, + "rewards/margins": 0.4975178837776184, + "rewards/rejected": -0.9822970628738403, + "step": 3770 + }, + { + "epoch": 0.49, + "learning_rate": 2.980653025441399e-06, + "logits/chosen": -1.8997920751571655, + "logits/rejected": -1.977393388748169, + "logps/chosen": -284.5909423828125, + "logps/rejected": -352.7862854003906, + "loss": 0.6112, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.43618661165237427, + "rewards/margins": 0.27137070894241333, + "rewards/rejected": -0.7075573205947876, + "step": 3780 + }, + { + "epoch": 0.5, + "learning_rate": 2.969440464245841e-06, + "logits/chosen": -1.5956467390060425, + "logits/rejected": -1.6918439865112305, + "logps/chosen": -280.7166442871094, + "logps/rejected": -307.34051513671875, + "loss": 0.6307, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4487435817718506, + "rewards/margins": 0.16629108786582947, + "rewards/rejected": -0.6150346994400024, + "step": 3790 + }, + { + "epoch": 0.5, + "learning_rate": 2.95821810631297e-06, + "logits/chosen": -1.8552080392837524, + "logits/rejected": -1.7493228912353516, + "logps/chosen": -359.79351806640625, + "logps/rejected": -417.745849609375, + "loss": 0.6301, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5076690316200256, + "rewards/margins": 0.33847576379776, + "rewards/rejected": -0.8461447954177856, + "step": 3800 + }, + { + "epoch": 0.5, + "learning_rate": 2.946986185841801e-06, + "logits/chosen": -1.6677796840667725, + "logits/rejected": -1.6328794956207275, + "logps/chosen": -316.1706237792969, + "logps/rejected": -331.98876953125, + "loss": 0.6439, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3543296754360199, + "rewards/margins": 0.3643968999385834, + "rewards/rejected": -0.7187266945838928, + "step": 3810 + }, + { + "epoch": 0.5, + "learning_rate": 2.935744937230903e-06, + "logits/chosen": -1.5863654613494873, + "logits/rejected": -1.8301881551742554, + "logps/chosen": -318.82098388671875, + "logps/rejected": -296.8606262207031, + "loss": 0.5676, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3077712655067444, + "rewards/margins": 0.3471468389034271, + "rewards/rejected": -0.6549181342124939, + "step": 3820 + }, + { + "epoch": 0.5, + "learning_rate": 2.924494595073517e-06, + "logits/chosen": -1.6902148723602295, + "logits/rejected": -1.7198559045791626, + "logps/chosen": -273.60247802734375, + "logps/rejected": -301.96588134765625, + "loss": 0.5616, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22179213166236877, + "rewards/margins": 0.5069515109062195, + "rewards/rejected": -0.7287436127662659, + "step": 3830 + }, + { + "epoch": 0.5, + "learning_rate": 2.9132353941526575e-06, + "logits/chosen": -1.7695382833480835, + "logits/rejected": -1.897695779800415, + "logps/chosen": -345.64617919921875, + "logps/rejected": -319.2537841796875, + "loss": 0.5701, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.28853678703308105, + "rewards/margins": 0.5146527290344238, + "rewards/rejected": -0.8031895756721497, + "step": 3840 + }, + { + "epoch": 0.5, + "learning_rate": 2.901967569436209e-06, + "logits/chosen": -1.7483577728271484, + "logits/rejected": -1.6690582036972046, + "logps/chosen": -296.76824951171875, + "logps/rejected": -345.4691467285156, + "loss": 0.6237, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.38306859135627747, + "rewards/margins": 0.41898447275161743, + "rewards/rejected": -0.8020529747009277, + "step": 3850 + }, + { + "epoch": 0.51, + "learning_rate": 2.89069135607203e-06, + "logits/chosen": -1.7685825824737549, + "logits/rejected": -1.6555359363555908, + "logps/chosen": -324.80633544921875, + "logps/rejected": -340.04376220703125, + "loss": 0.5247, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15670789778232574, + "rewards/margins": 0.6518503427505493, + "rewards/rejected": -0.8085581660270691, + "step": 3860 + }, + { + "epoch": 0.51, + "learning_rate": 2.8794069893830386e-06, + "logits/chosen": -1.5534040927886963, + "logits/rejected": -1.4398003816604614, + "logps/chosen": -343.73065185546875, + "logps/rejected": -413.04052734375, + "loss": 0.5715, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2923961877822876, + "rewards/margins": 0.43721580505371094, + "rewards/rejected": -0.7296119928359985, + "step": 3870 + }, + { + "epoch": 0.51, + "learning_rate": 2.8681147048623038e-06, + "logits/chosen": -1.9648174047470093, + "logits/rejected": -1.9548981189727783, + "logps/chosen": -353.8490905761719, + "logps/rejected": -318.52191162109375, + "loss": 0.5857, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.29026466608047485, + "rewards/margins": 0.29097938537597656, + "rewards/rejected": -0.5812441110610962, + "step": 3880 + }, + { + "epoch": 0.51, + "learning_rate": 2.8568147381681333e-06, + "logits/chosen": -1.8199918270111084, + "logits/rejected": -1.5809903144836426, + "logps/chosen": -350.03619384765625, + "logps/rejected": -360.7236633300781, + "loss": 0.5015, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2252894937992096, + "rewards/margins": 0.6579054594039917, + "rewards/rejected": -0.8831951022148132, + "step": 3890 + }, + { + "epoch": 0.51, + "learning_rate": 2.8455073251191533e-06, + "logits/chosen": -1.9472877979278564, + "logits/rejected": -1.6184675693511963, + "logps/chosen": -334.63641357421875, + "logps/rejected": -361.4620361328125, + "loss": 0.55, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.388920396566391, + "rewards/margins": 0.45513391494750977, + "rewards/rejected": -0.8440543413162231, + "step": 3900 + }, + { + "epoch": 0.51, + "learning_rate": 2.8341927016893887e-06, + "logits/chosen": -1.8738635778427124, + "logits/rejected": -1.7635606527328491, + "logps/chosen": -332.4501037597656, + "logps/rejected": -351.65289306640625, + "loss": 0.6336, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3494104743003845, + "rewards/margins": 0.44819337129592896, + "rewards/rejected": -0.7976039052009583, + "step": 3910 + }, + { + "epoch": 0.51, + "learning_rate": 2.822871104003335e-06, + "logits/chosen": -1.7811224460601807, + "logits/rejected": -1.7455886602401733, + "logps/chosen": -281.77069091796875, + "logps/rejected": -358.2641906738281, + "loss": 0.5678, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2726995348930359, + "rewards/margins": 0.5175293684005737, + "rewards/rejected": -0.7902289628982544, + "step": 3920 + }, + { + "epoch": 0.51, + "learning_rate": 2.8115427683310355e-06, + "logits/chosen": -1.8350861072540283, + "logits/rejected": -1.7046111822128296, + "logps/chosen": -290.176025390625, + "logps/rejected": -327.64385986328125, + "loss": 0.5456, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2753733694553375, + "rewards/margins": 0.4063855707645416, + "rewards/rejected": -0.6817589998245239, + "step": 3930 + }, + { + "epoch": 0.52, + "learning_rate": 2.8002079310831477e-06, + "logits/chosen": -1.7511241436004639, + "logits/rejected": -1.4732693433761597, + "logps/chosen": -314.26727294921875, + "logps/rejected": -302.6998596191406, + "loss": 0.5618, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2875606417655945, + "rewards/margins": 0.5203619599342346, + "rewards/rejected": -0.8079225420951843, + "step": 3940 + }, + { + "epoch": 0.52, + "learning_rate": 2.7888668288060095e-06, + "logits/chosen": -1.8107450008392334, + "logits/rejected": -1.7523075342178345, + "logps/chosen": -298.2847900390625, + "logps/rejected": -358.3739013671875, + "loss": 0.53, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.43408942222595215, + "rewards/margins": 0.5461779832839966, + "rewards/rejected": -0.9802674055099487, + "step": 3950 + }, + { + "epoch": 0.52, + "learning_rate": 2.7775196981767044e-06, + "logits/chosen": -1.8494669198989868, + "logits/rejected": -1.7733339071273804, + "logps/chosen": -284.2920837402344, + "logps/rejected": -315.0772705078125, + "loss": 0.5421, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34565073251724243, + "rewards/margins": 0.4883705675601959, + "rewards/rejected": -0.834021270275116, + "step": 3960 + }, + { + "epoch": 0.52, + "learning_rate": 2.7661667759981213e-06, + "logits/chosen": -1.4416885375976562, + "logits/rejected": -1.5648361444473267, + "logps/chosen": -250.1155242919922, + "logps/rejected": -267.9757995605469, + "loss": 0.5955, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.31566399335861206, + "rewards/margins": 0.2132129967212677, + "rewards/rejected": -0.5288770198822021, + "step": 3970 + }, + { + "epoch": 0.52, + "learning_rate": 2.7548082991940137e-06, + "logits/chosen": -1.8730270862579346, + "logits/rejected": -1.7931387424468994, + "logps/chosen": -390.61236572265625, + "logps/rejected": -356.39654541015625, + "loss": 0.5503, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3787888288497925, + "rewards/margins": 0.3946785032749176, + "rewards/rejected": -0.7734672427177429, + "step": 3980 + }, + { + "epoch": 0.52, + "learning_rate": 2.743444504804051e-06, + "logits/chosen": -1.871013879776001, + "logits/rejected": -1.7201436758041382, + "logps/chosen": -312.48876953125, + "logps/rejected": -275.2581481933594, + "loss": 0.6242, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.37757596373558044, + "rewards/margins": 0.3250061869621277, + "rewards/rejected": -0.7025822401046753, + "step": 3990 + }, + { + "epoch": 0.52, + "learning_rate": 2.7320756299788788e-06, + "logits/chosen": -1.5397943258285522, + "logits/rejected": -1.6421153545379639, + "logps/chosen": -301.97119140625, + "logps/rejected": -354.7770690917969, + "loss": 0.5911, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.39353466033935547, + "rewards/margins": 0.486635684967041, + "rewards/rejected": -0.8801703453063965, + "step": 4000 + }, + { + "epoch": 0.52, + "learning_rate": 2.7207019119751644e-06, + "logits/chosen": -1.5140125751495361, + "logits/rejected": -1.7460044622421265, + "logps/chosen": -339.53460693359375, + "logps/rejected": -347.5783386230469, + "loss": 0.5651, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.34493696689605713, + "rewards/margins": 0.554093599319458, + "rewards/rejected": -0.8990306854248047, + "step": 4010 + }, + { + "epoch": 0.53, + "learning_rate": 2.7093235881506474e-06, + "logits/chosen": -1.775244116783142, + "logits/rejected": -1.5799227952957153, + "logps/chosen": -350.29241943359375, + "logps/rejected": -346.9272766113281, + "loss": 0.5866, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5338143706321716, + "rewards/margins": 0.38223499059677124, + "rewards/rejected": -0.9160493016242981, + "step": 4020 + }, + { + "epoch": 0.53, + "learning_rate": 2.6979408959591863e-06, + "logits/chosen": -1.9225536584854126, + "logits/rejected": -1.6527570486068726, + "logps/chosen": -345.1325988769531, + "logps/rejected": -371.31390380859375, + "loss": 0.4755, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4339163303375244, + "rewards/margins": 0.7859948873519897, + "rewards/rejected": -1.2199113368988037, + "step": 4030 + }, + { + "epoch": 0.53, + "learning_rate": 2.6865540729458034e-06, + "logits/chosen": -1.7279541492462158, + "logits/rejected": -1.534313440322876, + "logps/chosen": -355.6867980957031, + "logps/rejected": -379.92425537109375, + "loss": 0.512, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3558761179447174, + "rewards/margins": 0.4911411702632904, + "rewards/rejected": -0.8470171689987183, + "step": 4040 + }, + { + "epoch": 0.53, + "learning_rate": 2.675163356741726e-06, + "logits/chosen": -1.6380796432495117, + "logits/rejected": -1.888669729232788, + "logps/chosen": -285.0138244628906, + "logps/rejected": -295.8802795410156, + "loss": 0.5055, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.142476424574852, + "rewards/margins": 0.6192440986633301, + "rewards/rejected": -0.7617205381393433, + "step": 4050 + }, + { + "epoch": 0.53, + "learning_rate": 2.6637689850594285e-06, + "logits/chosen": -1.3838313817977905, + "logits/rejected": -1.4706891775131226, + "logps/chosen": -374.72613525390625, + "logps/rejected": -429.5972595214844, + "loss": 0.5564, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4830802381038666, + "rewards/margins": 0.5818537473678589, + "rewards/rejected": -1.0649340152740479, + "step": 4060 + }, + { + "epoch": 0.53, + "learning_rate": 2.652371195687671e-06, + "logits/chosen": -1.6332095861434937, + "logits/rejected": -1.7725727558135986, + "logps/chosen": -351.6695556640625, + "logps/rejected": -421.2322692871094, + "loss": 0.546, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.40922918915748596, + "rewards/margins": 0.676921010017395, + "rewards/rejected": -1.0861501693725586, + "step": 4070 + }, + { + "epoch": 0.53, + "learning_rate": 2.64097022648654e-06, + "logits/chosen": -1.7817274332046509, + "logits/rejected": -1.6316791772842407, + "logps/chosen": -330.9732360839844, + "logps/rejected": -306.84197998046875, + "loss": 0.6591, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4978245794773102, + "rewards/margins": 0.21460232138633728, + "rewards/rejected": -0.7124269604682922, + "step": 4080 + }, + { + "epoch": 0.54, + "learning_rate": 2.6295663153824774e-06, + "logits/chosen": -1.8382799625396729, + "logits/rejected": -1.8394279479980469, + "logps/chosen": -370.68902587890625, + "logps/rejected": -345.5810241699219, + "loss": 0.5457, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4140840172767639, + "rewards/margins": 0.583431601524353, + "rewards/rejected": -0.9975157976150513, + "step": 4090 + }, + { + "epoch": 0.54, + "learning_rate": 2.6181597003633218e-06, + "logits/chosen": -1.8080418109893799, + "logits/rejected": -1.8286720514297485, + "logps/chosen": -256.10845947265625, + "logps/rejected": -288.5578918457031, + "loss": 0.5532, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2099798023700714, + "rewards/margins": 0.5048014521598816, + "rewards/rejected": -0.7147814035415649, + "step": 4100 + }, + { + "epoch": 0.54, + "learning_rate": 2.606750619473342e-06, + "logits/chosen": -1.678256630897522, + "logits/rejected": -1.6355950832366943, + "logps/chosen": -262.5571594238281, + "logps/rejected": -336.2910461425781, + "loss": 0.5456, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.17769090831279755, + "rewards/margins": 0.5116496682167053, + "rewards/rejected": -0.6893404722213745, + "step": 4110 + }, + { + "epoch": 0.54, + "learning_rate": 2.595339310808262e-06, + "logits/chosen": -1.7698469161987305, + "logits/rejected": -1.6262279748916626, + "logps/chosen": -348.2425231933594, + "logps/rejected": -330.33624267578125, + "loss": 0.5633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25929564237594604, + "rewards/margins": 0.558716356754303, + "rewards/rejected": -0.8180120587348938, + "step": 4120 + }, + { + "epoch": 0.54, + "learning_rate": 2.5839260125103004e-06, + "logits/chosen": -1.4036157131195068, + "logits/rejected": -1.2687772512435913, + "logps/chosen": -236.756591796875, + "logps/rejected": -296.260009765625, + "loss": 0.5412, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4874787926673889, + "rewards/margins": 0.45000410079956055, + "rewards/rejected": -0.9374828338623047, + "step": 4130 + }, + { + "epoch": 0.54, + "learning_rate": 2.5725109627631984e-06, + "logits/chosen": -1.7914928197860718, + "logits/rejected": -1.60581374168396, + "logps/chosen": -338.9268798828125, + "logps/rejected": -314.90966796875, + "loss": 0.6029, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4206452965736389, + "rewards/margins": 0.34881529211997986, + "rewards/rejected": -0.7694606184959412, + "step": 4140 + }, + { + "epoch": 0.54, + "learning_rate": 2.5610943997872443e-06, + "logits/chosen": -1.7513971328735352, + "logits/rejected": -1.5885504484176636, + "logps/chosen": -298.176513671875, + "logps/rejected": -353.7212829589844, + "loss": 0.5964, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5241976976394653, + "rewards/margins": 0.35407060384750366, + "rewards/rejected": -0.8782683610916138, + "step": 4150 + }, + { + "epoch": 0.54, + "learning_rate": 2.5496765618343096e-06, + "logits/chosen": -1.5626815557479858, + "logits/rejected": -1.6557413339614868, + "logps/chosen": -306.47454833984375, + "logps/rejected": -368.38519287109375, + "loss": 0.4256, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18705104291439056, + "rewards/margins": 0.9199911952018738, + "rewards/rejected": -1.1070421934127808, + "step": 4160 + }, + { + "epoch": 0.55, + "learning_rate": 2.538257687182871e-06, + "logits/chosen": -1.8515218496322632, + "logits/rejected": -1.516538381576538, + "logps/chosen": -306.88653564453125, + "logps/rejected": -323.8311462402344, + "loss": 0.5082, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3485359847545624, + "rewards/margins": 0.5290993452072144, + "rewards/rejected": -0.8776353001594543, + "step": 4170 + }, + { + "epoch": 0.55, + "learning_rate": 2.526838014133041e-06, + "logits/chosen": -1.710623025894165, + "logits/rejected": -1.5006887912750244, + "logps/chosen": -322.9178161621094, + "logps/rejected": -394.24176025390625, + "loss": 0.5394, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.35940316319465637, + "rewards/margins": 0.6309499740600586, + "rewards/rejected": -0.9903531074523926, + "step": 4180 + }, + { + "epoch": 0.55, + "learning_rate": 2.515417781001594e-06, + "logits/chosen": -1.6988499164581299, + "logits/rejected": -1.7259676456451416, + "logps/chosen": -351.405517578125, + "logps/rejected": -366.4336242675781, + "loss": 0.5986, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3532882630825043, + "rewards/margins": 0.4758473336696625, + "rewards/rejected": -0.829135537147522, + "step": 4190 + }, + { + "epoch": 0.55, + "learning_rate": 2.503997226116992e-06, + "logits/chosen": -1.646026849746704, + "logits/rejected": -1.6328113079071045, + "logps/chosen": -366.5597839355469, + "logps/rejected": -379.0836486816406, + "loss": 0.517, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4745301306247711, + "rewards/margins": 0.7557706832885742, + "rewards/rejected": -1.2303006649017334, + "step": 4200 + }, + { + "epoch": 0.55, + "learning_rate": 2.4925765878144115e-06, + "logits/chosen": -1.6957159042358398, + "logits/rejected": -1.42415452003479, + "logps/chosen": -301.3494567871094, + "logps/rejected": -413.85858154296875, + "loss": 0.5809, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5121637582778931, + "rewards/margins": 0.5079084038734436, + "rewards/rejected": -1.0200722217559814, + "step": 4210 + }, + { + "epoch": 0.55, + "learning_rate": 2.4811561044307727e-06, + "logits/chosen": -1.6290591955184937, + "logits/rejected": -1.4747549295425415, + "logps/chosen": -334.2276306152344, + "logps/rejected": -367.42169189453125, + "loss": 0.5605, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.29737338423728943, + "rewards/margins": 0.6729745864868164, + "rewards/rejected": -0.970348060131073, + "step": 4220 + }, + { + "epoch": 0.55, + "learning_rate": 2.469736014299758e-06, + "logits/chosen": -1.7984206676483154, + "logits/rejected": -1.7041215896606445, + "logps/chosen": -361.33160400390625, + "logps/rejected": -354.5500793457031, + "loss": 0.5659, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2924116849899292, + "rewards/margins": 0.6345264315605164, + "rewards/rejected": -0.9269381761550903, + "step": 4230 + }, + { + "epoch": 0.55, + "learning_rate": 2.458316555746846e-06, + "logits/chosen": -1.5120489597320557, + "logits/rejected": -1.4823814630508423, + "logps/chosen": -299.83624267578125, + "logps/rejected": -354.446533203125, + "loss": 0.5325, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19021925330162048, + "rewards/margins": 0.5186191201210022, + "rewards/rejected": -0.7088383436203003, + "step": 4240 + }, + { + "epoch": 0.56, + "learning_rate": 2.446897967084334e-06, + "logits/chosen": -1.7055912017822266, + "logits/rejected": -1.6096477508544922, + "logps/chosen": -326.3486633300781, + "logps/rejected": -358.1311340332031, + "loss": 0.6313, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2880330979824066, + "rewards/margins": 0.4673018455505371, + "rewards/rejected": -0.7553349733352661, + "step": 4250 + }, + { + "epoch": 0.56, + "learning_rate": 2.4354804866063684e-06, + "logits/chosen": -1.7015244960784912, + "logits/rejected": -1.7155224084854126, + "logps/chosen": -317.40081787109375, + "logps/rejected": -338.3325500488281, + "loss": 0.5924, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2812954783439636, + "rewards/margins": 0.41596174240112305, + "rewards/rejected": -0.6972572207450867, + "step": 4260 + }, + { + "epoch": 0.56, + "learning_rate": 2.424064352583964e-06, + "logits/chosen": -1.6986901760101318, + "logits/rejected": -1.456752061843872, + "logps/chosen": -349.41204833984375, + "logps/rejected": -349.1368713378906, + "loss": 0.5194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29286855459213257, + "rewards/margins": 0.5816076397895813, + "rewards/rejected": -0.8744761347770691, + "step": 4270 + }, + { + "epoch": 0.56, + "learning_rate": 2.4126498032600403e-06, + "logits/chosen": -1.386330485343933, + "logits/rejected": -1.6820504665374756, + "logps/chosen": -268.83148193359375, + "logps/rejected": -325.9708557128906, + "loss": 0.5389, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3986538350582123, + "rewards/margins": 0.49157992005348206, + "rewards/rejected": -0.8902336955070496, + "step": 4280 + }, + { + "epoch": 0.56, + "learning_rate": 2.401237076844445e-06, + "logits/chosen": -1.3567836284637451, + "logits/rejected": -1.6670172214508057, + "logps/chosen": -332.69195556640625, + "logps/rejected": -364.1340026855469, + "loss": 0.5509, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4180997908115387, + "rewards/margins": 0.4607284963130951, + "rewards/rejected": -0.878828227519989, + "step": 4290 + }, + { + "epoch": 0.56, + "learning_rate": 2.38982641150898e-06, + "logits/chosen": -1.6259790658950806, + "logits/rejected": -1.5032711029052734, + "logps/chosen": -338.6427001953125, + "logps/rejected": -362.85211181640625, + "loss": 0.4945, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3534035086631775, + "rewards/margins": 0.6605807542800903, + "rewards/rejected": -1.0139843225479126, + "step": 4300 + }, + { + "epoch": 0.56, + "learning_rate": 2.3784180453824414e-06, + "logits/chosen": -1.6517086029052734, + "logits/rejected": -1.7086608409881592, + "logps/chosen": -319.11883544921875, + "logps/rejected": -358.0398864746094, + "loss": 0.555, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.41850417852401733, + "rewards/margins": 0.5336800813674927, + "rewards/rejected": -0.9521842002868652, + "step": 4310 + }, + { + "epoch": 0.57, + "learning_rate": 2.367012216545638e-06, + "logits/chosen": -1.826944351196289, + "logits/rejected": -1.7591358423233032, + "logps/chosen": -345.77978515625, + "logps/rejected": -334.64312744140625, + "loss": 0.5663, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.27163487672805786, + "rewards/margins": 0.5245380401611328, + "rewards/rejected": -0.7961729168891907, + "step": 4320 + }, + { + "epoch": 0.57, + "learning_rate": 2.3556091630264294e-06, + "logits/chosen": -1.7030290365219116, + "logits/rejected": -1.624021291732788, + "logps/chosen": -349.04876708984375, + "logps/rejected": -404.719482421875, + "loss": 0.5169, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4447137713432312, + "rewards/margins": 0.5261680483818054, + "rewards/rejected": -0.9708817601203918, + "step": 4330 + }, + { + "epoch": 0.57, + "learning_rate": 2.344209122794757e-06, + "logits/chosen": -1.413947582244873, + "logits/rejected": -1.5500361919403076, + "logps/chosen": -350.61053466796875, + "logps/rejected": -375.8503723144531, + "loss": 0.6026, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5169695615768433, + "rewards/margins": 0.46729883551597595, + "rewards/rejected": -0.9842683672904968, + "step": 4340 + }, + { + "epoch": 0.57, + "learning_rate": 2.3328123337576787e-06, + "logits/chosen": -1.813246488571167, + "logits/rejected": -1.5280475616455078, + "logps/chosen": -327.3409729003906, + "logps/rejected": -397.8530578613281, + "loss": 0.5739, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4233461320400238, + "rewards/margins": 0.5506717562675476, + "rewards/rejected": -0.9740179777145386, + "step": 4350 + }, + { + "epoch": 0.57, + "learning_rate": 2.3214190337544017e-06, + "logits/chosen": -1.647658348083496, + "logits/rejected": -1.6999976634979248, + "logps/chosen": -265.35601806640625, + "logps/rejected": -313.14508056640625, + "loss": 0.5597, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.32170194387435913, + "rewards/margins": 0.516044020652771, + "rewards/rejected": -0.8377459645271301, + "step": 4360 + }, + { + "epoch": 0.57, + "learning_rate": 2.310029460551323e-06, + "logits/chosen": -1.8664038181304932, + "logits/rejected": -1.6384599208831787, + "logps/chosen": -305.837646484375, + "logps/rejected": -346.8574523925781, + "loss": 0.4917, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2016543447971344, + "rewards/margins": 0.5420511364936829, + "rewards/rejected": -0.7437055110931396, + "step": 4370 + }, + { + "epoch": 0.57, + "learning_rate": 2.2986438518370645e-06, + "logits/chosen": -1.6314566135406494, + "logits/rejected": -1.7745201587677002, + "logps/chosen": -291.3509216308594, + "logps/rejected": -331.6063537597656, + "loss": 0.5454, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.28928637504577637, + "rewards/margins": 0.5069013237953186, + "rewards/rejected": -0.796187698841095, + "step": 4380 + }, + { + "epoch": 0.57, + "learning_rate": 2.2872624452175123e-06, + "logits/chosen": -1.740395188331604, + "logits/rejected": -1.7595630884170532, + "logps/chosen": -310.6142883300781, + "logps/rejected": -332.3433532714844, + "loss": 0.6099, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27852678298950195, + "rewards/margins": 0.32075488567352295, + "rewards/rejected": -0.5992816090583801, + "step": 4390 + }, + { + "epoch": 0.58, + "learning_rate": 2.2758854782108584e-06, + "logits/chosen": -1.562260389328003, + "logits/rejected": -1.5046101808547974, + "logps/chosen": -294.5691223144531, + "logps/rejected": -349.96380615234375, + "loss": 0.5704, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.36657577753067017, + "rewards/margins": 0.488314151763916, + "rewards/rejected": -0.8548898696899414, + "step": 4400 + }, + { + "epoch": 0.58, + "learning_rate": 2.2645131882426458e-06, + "logits/chosen": -1.7031164169311523, + "logits/rejected": -1.7161766290664673, + "logps/chosen": -323.69866943359375, + "logps/rejected": -268.2398681640625, + "loss": 0.5857, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.20988936722278595, + "rewards/margins": 0.5731913447380066, + "rewards/rejected": -0.7830806374549866, + "step": 4410 + }, + { + "epoch": 0.58, + "learning_rate": 2.2531458126408154e-06, + "logits/chosen": -1.7290242910385132, + "logits/rejected": -1.6596912145614624, + "logps/chosen": -306.05615234375, + "logps/rejected": -331.30548095703125, + "loss": 0.5222, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3140954077243805, + "rewards/margins": 0.7424987554550171, + "rewards/rejected": -1.0565941333770752, + "step": 4420 + }, + { + "epoch": 0.58, + "learning_rate": 2.2417835886307452e-06, + "logits/chosen": -1.470320701599121, + "logits/rejected": -1.4352939128875732, + "logps/chosen": -307.79852294921875, + "logps/rejected": -332.3718566894531, + "loss": 0.505, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21418030560016632, + "rewards/margins": 0.7467679977416992, + "rewards/rejected": -0.9609482884407043, + "step": 4430 + }, + { + "epoch": 0.58, + "learning_rate": 2.2304267533303075e-06, + "logits/chosen": -1.8363187313079834, + "logits/rejected": -1.543057918548584, + "logps/chosen": -413.6319274902344, + "logps/rejected": -388.0902404785156, + "loss": 0.6349, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41396409273147583, + "rewards/margins": 0.4103490710258484, + "rewards/rejected": -0.8243130445480347, + "step": 4440 + }, + { + "epoch": 0.58, + "learning_rate": 2.219075543744918e-06, + "logits/chosen": -1.5793263912200928, + "logits/rejected": -1.5329477787017822, + "logps/chosen": -384.7776794433594, + "logps/rejected": -362.2215270996094, + "loss": 0.6027, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.28150027990341187, + "rewards/margins": 0.5145006775856018, + "rewards/rejected": -0.7960010170936584, + "step": 4450 + }, + { + "epoch": 0.58, + "learning_rate": 2.207730196762589e-06, + "logits/chosen": -1.4567307233810425, + "logits/rejected": -1.514478087425232, + "logps/chosen": -319.6458740234375, + "logps/rejected": -336.0107727050781, + "loss": 0.5785, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32227176427841187, + "rewards/margins": 0.5825371146202087, + "rewards/rejected": -0.9048089981079102, + "step": 4460 + }, + { + "epoch": 0.58, + "learning_rate": 2.1963909491489846e-06, + "logits/chosen": -1.6490923166275024, + "logits/rejected": -1.731658935546875, + "logps/chosen": -265.0019836425781, + "logps/rejected": -273.8904724121094, + "loss": 0.5855, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.40312257409095764, + "rewards/margins": 0.2967475354671478, + "rewards/rejected": -0.6998701095581055, + "step": 4470 + }, + { + "epoch": 0.59, + "learning_rate": 2.185058037542486e-06, + "logits/chosen": -1.5074516534805298, + "logits/rejected": -1.4897878170013428, + "logps/chosen": -362.76336669921875, + "logps/rejected": -379.93212890625, + "loss": 0.5009, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4273719787597656, + "rewards/margins": 0.7565814256668091, + "rewards/rejected": -1.1839535236358643, + "step": 4480 + }, + { + "epoch": 0.59, + "learning_rate": 2.173731698449244e-06, + "logits/chosen": -1.554918646812439, + "logits/rejected": -1.335268259048462, + "logps/chosen": -370.79559326171875, + "logps/rejected": -357.4678955078125, + "loss": 0.5521, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3732503354549408, + "rewards/margins": 0.4901328682899475, + "rewards/rejected": -0.8633831739425659, + "step": 4490 + }, + { + "epoch": 0.59, + "learning_rate": 2.1624121682382495e-06, + "logits/chosen": -1.6448684930801392, + "logits/rejected": -1.4385933876037598, + "logps/chosen": -302.4981994628906, + "logps/rejected": -381.69805908203125, + "loss": 0.4989, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.42858752608299255, + "rewards/margins": 0.5322087407112122, + "rewards/rejected": -0.9607963562011719, + "step": 4500 + }, + { + "epoch": 0.59, + "learning_rate": 2.1510996831363993e-06, + "logits/chosen": -1.7892612218856812, + "logits/rejected": -1.4913238286972046, + "logps/chosen": -370.29534912109375, + "logps/rejected": -422.10919189453125, + "loss": 0.6076, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.46051445603370667, + "rewards/margins": 0.3651946187019348, + "rewards/rejected": -0.8257088661193848, + "step": 4510 + }, + { + "epoch": 0.59, + "learning_rate": 2.139794479223565e-06, + "logits/chosen": -1.6129817962646484, + "logits/rejected": -1.557083249092102, + "logps/chosen": -328.95660400390625, + "logps/rejected": -356.33514404296875, + "loss": 0.5934, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4673916697502136, + "rewards/margins": 0.45461463928222656, + "rewards/rejected": -0.9220063090324402, + "step": 4520 + }, + { + "epoch": 0.59, + "learning_rate": 2.128496792427669e-06, + "logits/chosen": -1.7383235692977905, + "logits/rejected": -1.6196088790893555, + "logps/chosen": -327.993408203125, + "logps/rejected": -389.205810546875, + "loss": 0.5382, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.26879197359085083, + "rewards/margins": 0.5429565906524658, + "rewards/rejected": -0.8117486238479614, + "step": 4530 + }, + { + "epoch": 0.59, + "learning_rate": 2.117206858519758e-06, + "logits/chosen": -1.8113635778427124, + "logits/rejected": -1.6698815822601318, + "logps/chosen": -350.8209533691406, + "logps/rejected": -361.12127685546875, + "loss": 0.5899, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.28431791067123413, + "rewards/margins": 0.46632450819015503, + "rewards/rejected": -0.7506424784660339, + "step": 4540 + }, + { + "epoch": 0.6, + "learning_rate": 2.1059249131090844e-06, + "logits/chosen": -1.2963908910751343, + "logits/rejected": -1.2542517185211182, + "logps/chosen": -274.26885986328125, + "logps/rejected": -278.2977600097656, + "loss": 0.583, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3693769872188568, + "rewards/margins": 0.29938265681266785, + "rewards/rejected": -0.6687597036361694, + "step": 4550 + }, + { + "epoch": 0.6, + "learning_rate": 2.094651191638189e-06, + "logits/chosen": -1.7909877300262451, + "logits/rejected": -1.2843589782714844, + "logps/chosen": -335.26873779296875, + "logps/rejected": -335.92718505859375, + "loss": 0.5903, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3125837743282318, + "rewards/margins": 0.48868054151535034, + "rewards/rejected": -0.8012644052505493, + "step": 4560 + }, + { + "epoch": 0.6, + "learning_rate": 2.0833859293779867e-06, + "logits/chosen": -1.8204004764556885, + "logits/rejected": -1.6054102182388306, + "logps/chosen": -323.11077880859375, + "logps/rejected": -354.7457580566406, + "loss": 0.5355, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.25250402092933655, + "rewards/margins": 0.5258398652076721, + "rewards/rejected": -0.7783438563346863, + "step": 4570 + }, + { + "epoch": 0.6, + "learning_rate": 2.0721293614228568e-06, + "logits/chosen": -1.5242459774017334, + "logits/rejected": -1.5115171670913696, + "logps/chosen": -327.0439758300781, + "logps/rejected": -339.9987487792969, + "loss": 0.5865, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4852047860622406, + "rewards/margins": 0.2210419923067093, + "rewards/rejected": -0.7062466740608215, + "step": 4580 + }, + { + "epoch": 0.6, + "learning_rate": 2.060881722685742e-06, + "logits/chosen": -1.7310596704483032, + "logits/rejected": -1.3760565519332886, + "logps/chosen": -342.5307312011719, + "logps/rejected": -361.5404357910156, + "loss": 0.4875, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.27228546142578125, + "rewards/margins": 0.6848529577255249, + "rewards/rejected": -0.9571383595466614, + "step": 4590 + }, + { + "epoch": 0.6, + "learning_rate": 2.049643247893235e-06, + "logits/chosen": -1.863608717918396, + "logits/rejected": -1.6413013935089111, + "logps/chosen": -377.10870361328125, + "logps/rejected": -378.5328063964844, + "loss": 0.5728, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3542596399784088, + "rewards/margins": 0.5362662672996521, + "rewards/rejected": -0.8905259370803833, + "step": 4600 + }, + { + "epoch": 0.6, + "learning_rate": 2.0384141715806903e-06, + "logits/chosen": -1.5854460000991821, + "logits/rejected": -1.7480173110961914, + "logps/chosen": -291.39373779296875, + "logps/rejected": -334.1391906738281, + "loss": 0.6097, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.27462029457092285, + "rewards/margins": 0.34743940830230713, + "rewards/rejected": -0.62205970287323, + "step": 4610 + }, + { + "epoch": 0.6, + "learning_rate": 2.0271947280873255e-06, + "logits/chosen": -1.5624407529830933, + "logits/rejected": -1.4370942115783691, + "logps/chosen": -229.4803466796875, + "logps/rejected": -325.2397766113281, + "loss": 0.5514, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16404582560062408, + "rewards/margins": 0.7237406373023987, + "rewards/rejected": -0.8877862691879272, + "step": 4620 + }, + { + "epoch": 0.61, + "learning_rate": 2.0159851515513302e-06, + "logits/chosen": -1.4679185152053833, + "logits/rejected": -1.4515104293823242, + "logps/chosen": -294.8341369628906, + "logps/rejected": -328.0325622558594, + "loss": 0.5679, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.42117685079574585, + "rewards/margins": 0.4427080750465393, + "rewards/rejected": -0.8638850450515747, + "step": 4630 + }, + { + "epoch": 0.61, + "learning_rate": 2.004785675904982e-06, + "logits/chosen": -1.594603180885315, + "logits/rejected": -1.7265113592147827, + "logps/chosen": -317.6143493652344, + "logps/rejected": -342.934814453125, + "loss": 0.5743, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4390074610710144, + "rewards/margins": 0.5319045782089233, + "rewards/rejected": -0.9709121584892273, + "step": 4640 + }, + { + "epoch": 0.61, + "learning_rate": 1.9935965348697624e-06, + "logits/chosen": -1.6550309658050537, + "logits/rejected": -1.6105419397354126, + "logps/chosen": -303.5417175292969, + "logps/rejected": -363.78265380859375, + "loss": 0.5957, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3834788203239441, + "rewards/margins": 0.3922328054904938, + "rewards/rejected": -0.7757116556167603, + "step": 4650 + }, + { + "epoch": 0.61, + "learning_rate": 1.9824179619514807e-06, + "logits/chosen": -1.457901954650879, + "logits/rejected": -1.4973808526992798, + "logps/chosen": -197.23626708984375, + "logps/rejected": -258.09075927734375, + "loss": 0.5328, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14524070918560028, + "rewards/margins": 0.5147606134414673, + "rewards/rejected": -0.6600013375282288, + "step": 4660 + }, + { + "epoch": 0.61, + "learning_rate": 1.9712501904354004e-06, + "logits/chosen": -1.6303856372833252, + "logits/rejected": -1.5002949237823486, + "logps/chosen": -297.46893310546875, + "logps/rejected": -293.00640869140625, + "loss": 0.5891, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4327002167701721, + "rewards/margins": 0.4358891546726227, + "rewards/rejected": -0.8685892820358276, + "step": 4670 + }, + { + "epoch": 0.61, + "learning_rate": 1.960093453381369e-06, + "logits/chosen": -1.4768266677856445, + "logits/rejected": -1.5492140054702759, + "logps/chosen": -278.8161926269531, + "logps/rejected": -319.0284729003906, + "loss": 0.5879, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.32473501563072205, + "rewards/margins": 0.4120146632194519, + "rewards/rejected": -0.7367497086524963, + "step": 4680 + }, + { + "epoch": 0.61, + "learning_rate": 1.948947983618962e-06, + "logits/chosen": -1.6835148334503174, + "logits/rejected": -1.514500379562378, + "logps/chosen": -351.59271240234375, + "logps/rejected": -339.1379089355469, + "loss": 0.634, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.40584683418273926, + "rewards/margins": 0.36626747250556946, + "rewards/rejected": -0.7721143960952759, + "step": 4690 + }, + { + "epoch": 0.62, + "learning_rate": 1.937814013742611e-06, + "logits/chosen": -1.7746378183364868, + "logits/rejected": -1.6544166803359985, + "logps/chosen": -420.8392028808594, + "logps/rejected": -412.474609375, + "loss": 0.5898, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.45954641699790955, + "rewards/margins": 0.3412167429924011, + "rewards/rejected": -0.8007631301879883, + "step": 4700 + }, + { + "epoch": 0.62, + "learning_rate": 1.9266917761067617e-06, + "logits/chosen": -1.5112212896347046, + "logits/rejected": -1.5691413879394531, + "logps/chosen": -331.11920166015625, + "logps/rejected": -331.6216735839844, + "loss": 0.5273, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4008978307247162, + "rewards/margins": 0.6116882562637329, + "rewards/rejected": -1.012585997581482, + "step": 4710 + }, + { + "epoch": 0.62, + "learning_rate": 1.915581502821017e-06, + "logits/chosen": -1.5655747652053833, + "logits/rejected": -1.2899545431137085, + "logps/chosen": -328.1627502441406, + "logps/rejected": -435.8638610839844, + "loss": 0.5539, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.38041234016418457, + "rewards/margins": 0.4574778974056244, + "rewards/rejected": -0.8378902673721313, + "step": 4720 + }, + { + "epoch": 0.62, + "learning_rate": 1.9044834257452997e-06, + "logits/chosen": -1.718592643737793, + "logits/rejected": -1.5489732027053833, + "logps/chosen": -303.0252990722656, + "logps/rejected": -305.9423828125, + "loss": 0.5589, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1997176706790924, + "rewards/margins": 0.32135432958602905, + "rewards/rejected": -0.5210720300674438, + "step": 4730 + }, + { + "epoch": 0.62, + "learning_rate": 1.893397776485006e-06, + "logits/chosen": -1.7805715799331665, + "logits/rejected": -1.5652920007705688, + "logps/chosen": -317.63604736328125, + "logps/rejected": -367.9776916503906, + "loss": 0.5076, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23475825786590576, + "rewards/margins": 0.8650327920913696, + "rewards/rejected": -1.0997909307479858, + "step": 4740 + }, + { + "epoch": 0.62, + "learning_rate": 1.8823247863861804e-06, + "logits/chosen": -1.756079077720642, + "logits/rejected": -1.6868422031402588, + "logps/chosen": -333.47454833984375, + "logps/rejected": -362.52301025390625, + "loss": 0.5813, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3227579891681671, + "rewards/margins": 0.3966540992259979, + "rewards/rejected": -0.719412088394165, + "step": 4750 + }, + { + "epoch": 0.62, + "learning_rate": 1.8712646865306822e-06, + "logits/chosen": -1.7017524242401123, + "logits/rejected": -1.7233861684799194, + "logps/chosen": -388.99713134765625, + "logps/rejected": -377.5244445800781, + "loss": 0.5559, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3806911110877991, + "rewards/margins": 0.4828492999076843, + "rewards/rejected": -0.8635404706001282, + "step": 4760 + }, + { + "epoch": 0.62, + "learning_rate": 1.8602177077313631e-06, + "logits/chosen": -1.6587553024291992, + "logits/rejected": -1.628858208656311, + "logps/chosen": -312.86322021484375, + "logps/rejected": -389.04876708984375, + "loss": 0.5723, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3315592408180237, + "rewards/margins": 0.46664437651634216, + "rewards/rejected": -0.7982035875320435, + "step": 4770 + }, + { + "epoch": 0.63, + "learning_rate": 1.8491840805272546e-06, + "logits/chosen": -1.712566614151001, + "logits/rejected": -1.817307472229004, + "logps/chosen": -331.1411437988281, + "logps/rejected": -333.3837890625, + "loss": 0.5762, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.403207004070282, + "rewards/margins": 0.297088086605072, + "rewards/rejected": -0.700295090675354, + "step": 4780 + }, + { + "epoch": 0.63, + "learning_rate": 1.8381640351787516e-06, + "logits/chosen": -1.8101027011871338, + "logits/rejected": -1.7577232122421265, + "logps/chosen": -286.3332824707031, + "logps/rejected": -350.74945068359375, + "loss": 0.5363, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1596890687942505, + "rewards/margins": 0.5356277227401733, + "rewards/rejected": -0.6953168511390686, + "step": 4790 + }, + { + "epoch": 0.63, + "learning_rate": 1.8271578016628122e-06, + "logits/chosen": -1.4374070167541504, + "logits/rejected": -1.5717618465423584, + "logps/chosen": -291.6906433105469, + "logps/rejected": -314.6520080566406, + "loss": 0.5531, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3411603569984436, + "rewards/margins": 0.3061404824256897, + "rewards/rejected": -0.6473008394241333, + "step": 4800 + }, + { + "epoch": 0.63, + "learning_rate": 1.8161656096681546e-06, + "logits/chosen": -1.4271020889282227, + "logits/rejected": -1.5205187797546387, + "logps/chosen": -257.0478515625, + "logps/rejected": -357.82000732421875, + "loss": 0.4998, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21422939002513885, + "rewards/margins": 1.0215184688568115, + "rewards/rejected": -1.2357479333877563, + "step": 4810 + }, + { + "epoch": 0.63, + "learning_rate": 1.8051876885904645e-06, + "logits/chosen": -1.4621590375900269, + "logits/rejected": -1.4864403009414673, + "logps/chosen": -341.10906982421875, + "logps/rejected": -349.58282470703125, + "loss": 0.5562, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.33063384890556335, + "rewards/margins": 0.5747858285903931, + "rewards/rejected": -0.905419647693634, + "step": 4820 + }, + { + "epoch": 0.63, + "learning_rate": 1.7942242675276098e-06, + "logits/chosen": -1.618364691734314, + "logits/rejected": -1.614487886428833, + "logps/chosen": -240.9441375732422, + "logps/rejected": -324.74810791015625, + "loss": 0.5749, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4098549783229828, + "rewards/margins": 0.36594170331954956, + "rewards/rejected": -0.7757967114448547, + "step": 4830 + }, + { + "epoch": 0.63, + "learning_rate": 1.783275575274856e-06, + "logits/chosen": -1.7210979461669922, + "logits/rejected": -1.5938465595245361, + "logps/chosen": -361.4447937011719, + "logps/rejected": -329.34783935546875, + "loss": 0.5435, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15424402058124542, + "rewards/margins": 0.6647943258285522, + "rewards/rejected": -0.8190382719039917, + "step": 4840 + }, + { + "epoch": 0.63, + "learning_rate": 1.7723418403200943e-06, + "logits/chosen": -1.7758058309555054, + "logits/rejected": -1.466178297996521, + "logps/chosen": -376.75823974609375, + "logps/rejected": -353.8374938964844, + "loss": 0.5415, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.37667909264564514, + "rewards/margins": 0.5589955449104309, + "rewards/rejected": -0.9356746673583984, + "step": 4850 + }, + { + "epoch": 0.64, + "learning_rate": 1.7614232908390748e-06, + "logits/chosen": -1.6571295261383057, + "logits/rejected": -1.4460431337356567, + "logps/chosen": -337.8949279785156, + "logps/rejected": -369.03240966796875, + "loss": 0.6248, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4235663414001465, + "rewards/margins": 0.3070405125617981, + "rewards/rejected": -0.7306068539619446, + "step": 4860 + }, + { + "epoch": 0.64, + "learning_rate": 1.7505201546906398e-06, + "logits/chosen": -1.5515334606170654, + "logits/rejected": -1.6434730291366577, + "logps/chosen": -370.28326416015625, + "logps/rejected": -336.73388671875, + "loss": 0.5657, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.39485615491867065, + "rewards/margins": 0.45868173241615295, + "rewards/rejected": -0.8535378575325012, + "step": 4870 + }, + { + "epoch": 0.64, + "learning_rate": 1.7396326594119717e-06, + "logits/chosen": -1.5114092826843262, + "logits/rejected": -1.6485391855239868, + "logps/chosen": -300.55908203125, + "logps/rejected": -323.77227783203125, + "loss": 0.53, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.24799218773841858, + "rewards/margins": 0.6091141104698181, + "rewards/rejected": -0.8571063280105591, + "step": 4880 + }, + { + "epoch": 0.64, + "learning_rate": 1.7287610322138449e-06, + "logits/chosen": -1.537276029586792, + "logits/rejected": -1.5622166395187378, + "logps/chosen": -338.65802001953125, + "logps/rejected": -358.07080078125, + "loss": 0.5302, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3425188660621643, + "rewards/margins": 0.6254263520240784, + "rewards/rejected": -0.9679452776908875, + "step": 4890 + }, + { + "epoch": 0.64, + "learning_rate": 1.7179054999758817e-06, + "logits/chosen": -1.7274553775787354, + "logits/rejected": -1.430323839187622, + "logps/chosen": -307.8840637207031, + "logps/rejected": -332.33575439453125, + "loss": 0.5659, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.45394062995910645, + "rewards/margins": 0.42709535360336304, + "rewards/rejected": -0.8810359835624695, + "step": 4900 + }, + { + "epoch": 0.64, + "learning_rate": 1.7070662892418225e-06, + "logits/chosen": -1.635499358177185, + "logits/rejected": -1.5756409168243408, + "logps/chosen": -278.1394348144531, + "logps/rejected": -328.87139892578125, + "loss": 0.5335, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4298344552516937, + "rewards/margins": 0.578917384147644, + "rewards/rejected": -1.0087518692016602, + "step": 4910 + }, + { + "epoch": 0.64, + "learning_rate": 1.6962436262147913e-06, + "logits/chosen": -1.466153860092163, + "logits/rejected": -1.4338953495025635, + "logps/chosen": -262.88922119140625, + "logps/rejected": -332.0660095214844, + "loss": 0.5778, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.45713844895362854, + "rewards/margins": 0.5292654633522034, + "rewards/rejected": -0.98640376329422, + "step": 4920 + }, + { + "epoch": 0.65, + "learning_rate": 1.6854377367525814e-06, + "logits/chosen": -1.5041332244873047, + "logits/rejected": -1.5964603424072266, + "logps/chosen": -332.08544921875, + "logps/rejected": -343.4684143066406, + "loss": 0.5406, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21062858402729034, + "rewards/margins": 0.6219049096107483, + "rewards/rejected": -0.8325334787368774, + "step": 4930 + }, + { + "epoch": 0.65, + "learning_rate": 1.6746488463629362e-06, + "logits/chosen": -1.7638885974884033, + "logits/rejected": -1.8065506219863892, + "logps/chosen": -373.37567138671875, + "logps/rejected": -414.4461364746094, + "loss": 0.578, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4446972906589508, + "rewards/margins": 0.38834792375564575, + "rewards/rejected": -0.8330451846122742, + "step": 4940 + }, + { + "epoch": 0.65, + "learning_rate": 1.6638771801988483e-06, + "logits/chosen": -1.7115004062652588, + "logits/rejected": -1.4689561128616333, + "logps/chosen": -364.6258544921875, + "logps/rejected": -410.55279541015625, + "loss": 0.5907, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3528302013874054, + "rewards/margins": 0.6263970136642456, + "rewards/rejected": -0.9792273640632629, + "step": 4950 + }, + { + "epoch": 0.65, + "learning_rate": 1.653122963053857e-06, + "logits/chosen": -1.5834096670150757, + "logits/rejected": -1.3456847667694092, + "logps/chosen": -329.70440673828125, + "logps/rejected": -345.5718078613281, + "loss": 0.5312, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.30627477169036865, + "rewards/margins": 0.5374767184257507, + "rewards/rejected": -0.8437515497207642, + "step": 4960 + }, + { + "epoch": 0.65, + "learning_rate": 1.6423864193573606e-06, + "logits/chosen": -1.4466499090194702, + "logits/rejected": -1.2427787780761719, + "logps/chosen": -274.56280517578125, + "logps/rejected": -326.6078796386719, + "loss": 0.5214, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2284989058971405, + "rewards/margins": 0.5575379133224487, + "rewards/rejected": -0.7860368490219116, + "step": 4970 + }, + { + "epoch": 0.65, + "learning_rate": 1.6316677731699286e-06, + "logits/chosen": -1.740645170211792, + "logits/rejected": -1.6510359048843384, + "logps/chosen": -374.50927734375, + "logps/rejected": -393.22271728515625, + "loss": 0.5073, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.581656813621521, + "rewards/margins": 0.643889307975769, + "rewards/rejected": -1.22554612159729, + "step": 4980 + }, + { + "epoch": 0.65, + "learning_rate": 1.6209672481786302e-06, + "logits/chosen": -1.295310616493225, + "logits/rejected": -1.421493411064148, + "logps/chosen": -312.65093994140625, + "logps/rejected": -330.83306884765625, + "loss": 0.517, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2958032190799713, + "rewards/margins": 0.7064327001571655, + "rewards/rejected": -1.0022358894348145, + "step": 4990 + }, + { + "epoch": 0.65, + "learning_rate": 1.6102850676923616e-06, + "logits/chosen": -1.5604960918426514, + "logits/rejected": -1.6637929677963257, + "logps/chosen": -320.6743469238281, + "logps/rejected": -365.8443603515625, + "loss": 0.6087, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4940219521522522, + "rewards/margins": 0.6050677299499512, + "rewards/rejected": -1.0990897417068481, + "step": 5000 + }, + { + "epoch": 0.66, + "learning_rate": 1.5996214546371888e-06, + "logits/chosen": -1.789644479751587, + "logits/rejected": -1.610169768333435, + "logps/chosen": -294.51312255859375, + "logps/rejected": -302.45697021484375, + "loss": 0.528, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3318510055541992, + "rewards/margins": 0.5447564721107483, + "rewards/rejected": -0.8766075372695923, + "step": 5010 + }, + { + "epoch": 0.66, + "learning_rate": 1.588976631551697e-06, + "logits/chosen": -1.6871614456176758, + "logits/rejected": -1.4874083995819092, + "logps/chosen": -324.0235595703125, + "logps/rejected": -356.5283203125, + "loss": 0.5385, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.32501453161239624, + "rewards/margins": 0.5540294647216797, + "rewards/rejected": -0.8790439367294312, + "step": 5020 + }, + { + "epoch": 0.66, + "learning_rate": 1.5783508205823412e-06, + "logits/chosen": -1.552725076675415, + "logits/rejected": -1.5381156206130981, + "logps/chosen": -310.8211975097656, + "logps/rejected": -335.9022216796875, + "loss": 0.65, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4292033314704895, + "rewards/margins": 0.41316431760787964, + "rewards/rejected": -0.8423677682876587, + "step": 5030 + }, + { + "epoch": 0.66, + "learning_rate": 1.5677442434788143e-06, + "logits/chosen": -1.604744553565979, + "logits/rejected": -1.6676803827285767, + "logps/chosen": -335.8542175292969, + "logps/rejected": -381.3191223144531, + "loss": 0.5708, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26305484771728516, + "rewards/margins": 0.6882427334785461, + "rewards/rejected": -0.9512976408004761, + "step": 5040 + }, + { + "epoch": 0.66, + "learning_rate": 1.5571571215894181e-06, + "logits/chosen": -1.689901351928711, + "logits/rejected": -1.7966197729110718, + "logps/chosen": -328.8950500488281, + "logps/rejected": -409.5695495605469, + "loss": 0.5211, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.40656304359436035, + "rewards/margins": 0.5546118021011353, + "rewards/rejected": -0.9611749649047852, + "step": 5050 + }, + { + "epoch": 0.66, + "learning_rate": 1.5465896758564452e-06, + "logits/chosen": -1.7022901773452759, + "logits/rejected": -1.6684176921844482, + "logps/chosen": -342.58526611328125, + "logps/rejected": -390.37890625, + "loss": 0.594, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.315326988697052, + "rewards/margins": 0.4648142457008362, + "rewards/rejected": -0.7801412343978882, + "step": 5060 + }, + { + "epoch": 0.66, + "learning_rate": 1.5360421268115653e-06, + "logits/chosen": -1.7463855743408203, + "logits/rejected": -1.5876306295394897, + "logps/chosen": -359.6499938964844, + "logps/rejected": -359.08349609375, + "loss": 0.5139, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40452447533607483, + "rewards/margins": 0.6506131887435913, + "rewards/rejected": -1.0551376342773438, + "step": 5070 + }, + { + "epoch": 0.66, + "learning_rate": 1.5255146945712267e-06, + "logits/chosen": -1.7098106145858765, + "logits/rejected": -1.4700920581817627, + "logps/chosen": -349.40478515625, + "logps/rejected": -313.53009033203125, + "loss": 0.5862, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3787660598754883, + "rewards/margins": 0.4539892077445984, + "rewards/rejected": -0.8327552676200867, + "step": 5080 + }, + { + "epoch": 0.67, + "learning_rate": 1.5150075988320594e-06, + "logits/chosen": -1.4833920001983643, + "logits/rejected": -1.2993857860565186, + "logps/chosen": -284.3592834472656, + "logps/rejected": -327.693359375, + "loss": 0.5512, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4025176465511322, + "rewards/margins": 0.5441306829452515, + "rewards/rejected": -0.9466484189033508, + "step": 5090 + }, + { + "epoch": 0.67, + "learning_rate": 1.5045210588662929e-06, + "logits/chosen": -1.2175451517105103, + "logits/rejected": -1.4691922664642334, + "logps/chosen": -295.90118408203125, + "logps/rejected": -338.4582214355469, + "loss": 0.5619, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.28938689827919006, + "rewards/margins": 0.6416627168655396, + "rewards/rejected": -0.9310495257377625, + "step": 5100 + }, + { + "epoch": 0.67, + "learning_rate": 1.4940552935171781e-06, + "logits/chosen": -1.962206244468689, + "logits/rejected": -1.6976397037506104, + "logps/chosen": -349.7812805175781, + "logps/rejected": -397.66668701171875, + "loss": 0.5374, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4642232358455658, + "rewards/margins": 0.48757800459861755, + "rewards/rejected": -0.9518013000488281, + "step": 5110 + }, + { + "epoch": 0.67, + "learning_rate": 1.483610521194419e-06, + "logits/chosen": -1.5347378253936768, + "logits/rejected": -1.295058012008667, + "logps/chosen": -364.92901611328125, + "logps/rejected": -376.8507995605469, + "loss": 0.5222, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3261030614376068, + "rewards/margins": 0.9371359944343567, + "rewards/rejected": -1.2632390260696411, + "step": 5120 + }, + { + "epoch": 0.67, + "learning_rate": 1.4731869598696226e-06, + "logits/chosen": -1.2280769348144531, + "logits/rejected": -1.2927637100219727, + "logps/chosen": -286.06817626953125, + "logps/rejected": -360.7110595703125, + "loss": 0.5621, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4254229664802551, + "rewards/margins": 0.4437944293022156, + "rewards/rejected": -0.8692175149917603, + "step": 5130 + }, + { + "epoch": 0.67, + "learning_rate": 1.4627848270717387e-06, + "logits/chosen": -1.5197232961654663, + "logits/rejected": -1.2776516675949097, + "logps/chosen": -325.25506591796875, + "logps/rejected": -355.19366455078125, + "loss": 0.5274, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34112221002578735, + "rewards/margins": 0.7135742902755737, + "rewards/rejected": -1.0546965599060059, + "step": 5140 + }, + { + "epoch": 0.67, + "learning_rate": 1.4524043398825277e-06, + "logits/chosen": -1.5192543268203735, + "logits/rejected": -1.4652892351150513, + "logps/chosen": -365.44757080078125, + "logps/rejected": -332.209716796875, + "loss": 0.5801, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.34040552377700806, + "rewards/margins": 0.4744376242160797, + "rewards/rejected": -0.8148431777954102, + "step": 5150 + }, + { + "epoch": 0.68, + "learning_rate": 1.4420457149320299e-06, + "logits/chosen": -1.612372636795044, + "logits/rejected": -1.7272748947143555, + "logps/chosen": -302.9355163574219, + "logps/rejected": -357.7930603027344, + "loss": 0.6453, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3746944069862366, + "rewards/margins": 0.1994570940732956, + "rewards/rejected": -0.5741515159606934, + "step": 5160 + }, + { + "epoch": 0.68, + "learning_rate": 1.431709168394042e-06, + "logits/chosen": -1.7644230127334595, + "logits/rejected": -1.71954345703125, + "logps/chosen": -401.1996765136719, + "logps/rejected": -348.4986877441406, + "loss": 0.5293, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5327243208885193, + "rewards/margins": 0.2692771553993225, + "rewards/rejected": -0.8020015954971313, + "step": 5170 + }, + { + "epoch": 0.68, + "learning_rate": 1.4213949159816059e-06, + "logits/chosen": -1.4032543897628784, + "logits/rejected": -1.444990873336792, + "logps/chosen": -293.1512145996094, + "logps/rejected": -338.5201110839844, + "loss": 0.512, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3039286732673645, + "rewards/margins": 0.6951041221618652, + "rewards/rejected": -0.9990326762199402, + "step": 5180 + }, + { + "epoch": 0.68, + "learning_rate": 1.4111031729425103e-06, + "logits/chosen": -1.4505698680877686, + "logits/rejected": -1.5793302059173584, + "logps/chosen": -358.84869384765625, + "logps/rejected": -356.6041564941406, + "loss": 0.5548, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5110670328140259, + "rewards/margins": 0.5473508834838867, + "rewards/rejected": -1.0584180355072021, + "step": 5190 + }, + { + "epoch": 0.68, + "learning_rate": 1.4008341540547965e-06, + "logits/chosen": -1.4758559465408325, + "logits/rejected": -1.454754114151001, + "logps/chosen": -325.47332763671875, + "logps/rejected": -352.30865478515625, + "loss": 0.5901, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4455079436302185, + "rewards/margins": 0.30439290404319763, + "rewards/rejected": -0.7499008178710938, + "step": 5200 + }, + { + "epoch": 0.68, + "learning_rate": 1.3905880736222737e-06, + "logits/chosen": -1.52687406539917, + "logits/rejected": -1.5491580963134766, + "logps/chosen": -280.5191345214844, + "logps/rejected": -343.73291015625, + "loss": 0.5178, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.39644894003868103, + "rewards/margins": 0.41713958978652954, + "rewards/rejected": -0.8135885000228882, + "step": 5210 + }, + { + "epoch": 0.68, + "learning_rate": 1.3803651454700531e-06, + "logits/chosen": -1.4319944381713867, + "logits/rejected": -1.4349100589752197, + "logps/chosen": -320.1703796386719, + "logps/rejected": -327.7433166503906, + "loss": 0.5336, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.35609325766563416, + "rewards/margins": 0.5811716318130493, + "rewards/rejected": -0.9372648000717163, + "step": 5220 + }, + { + "epoch": 0.68, + "learning_rate": 1.3701655829400773e-06, + "logits/chosen": -1.823012113571167, + "logits/rejected": -1.6029945611953735, + "logps/chosen": -433.506591796875, + "logps/rejected": -381.86260986328125, + "loss": 0.5612, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5027961730957031, + "rewards/margins": 0.543420672416687, + "rewards/rejected": -1.0462168455123901, + "step": 5230 + }, + { + "epoch": 0.69, + "learning_rate": 1.3599895988866756e-06, + "logits/chosen": -1.73199462890625, + "logits/rejected": -1.6362035274505615, + "logps/chosen": -292.4515686035156, + "logps/rejected": -337.2004699707031, + "loss": 0.6311, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3816404938697815, + "rewards/margins": 0.35000211000442505, + "rewards/rejected": -0.7316426038742065, + "step": 5240 + }, + { + "epoch": 0.69, + "learning_rate": 1.3498374056721198e-06, + "logits/chosen": -1.9195945262908936, + "logits/rejected": -1.7692878246307373, + "logps/chosen": -303.76422119140625, + "logps/rejected": -428.4300231933594, + "loss": 0.5247, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.38987627625465393, + "rewards/margins": 0.5539663434028625, + "rewards/rejected": -0.9438425302505493, + "step": 5250 + }, + { + "epoch": 0.69, + "learning_rate": 1.3397092151621883e-06, + "logits/chosen": -1.4648250341415405, + "logits/rejected": -1.3597750663757324, + "logps/chosen": -320.05609130859375, + "logps/rejected": -320.7586364746094, + "loss": 0.6053, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5468131899833679, + "rewards/margins": 0.4295468330383301, + "rewards/rejected": -0.976360023021698, + "step": 5260 + }, + { + "epoch": 0.69, + "learning_rate": 1.3296052387217484e-06, + "logits/chosen": -1.266595721244812, + "logits/rejected": -1.5876617431640625, + "logps/chosen": -262.572998046875, + "logps/rejected": -279.0188903808594, + "loss": 0.5963, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.35105082392692566, + "rewards/margins": 0.5339232683181763, + "rewards/rejected": -0.8849741816520691, + "step": 5270 + }, + { + "epoch": 0.69, + "learning_rate": 1.3195256872103476e-06, + "logits/chosen": -1.7793792486190796, + "logits/rejected": -1.5645942687988281, + "logps/chosen": -343.90802001953125, + "logps/rejected": -412.1571350097656, + "loss": 0.5863, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3344947099685669, + "rewards/margins": 0.3342021703720093, + "rewards/rejected": -0.6686968803405762, + "step": 5280 + }, + { + "epoch": 0.69, + "learning_rate": 1.3094707709778068e-06, + "logits/chosen": -1.3882719278335571, + "logits/rejected": -1.420300841331482, + "logps/chosen": -282.89678955078125, + "logps/rejected": -305.47943115234375, + "loss": 0.5464, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.44044750928878784, + "rewards/margins": 0.4781956672668457, + "rewards/rejected": -0.9186431765556335, + "step": 5290 + }, + { + "epoch": 0.69, + "learning_rate": 1.2994406998598364e-06, + "logits/chosen": -1.6414172649383545, + "logits/rejected": -1.2757562398910522, + "logps/chosen": -231.0124969482422, + "logps/rejected": -307.9460754394531, + "loss": 0.5324, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3395569920539856, + "rewards/margins": 0.7885581254959106, + "rewards/rejected": -1.1281150579452515, + "step": 5300 + }, + { + "epoch": 0.69, + "learning_rate": 1.2894356831736558e-06, + "logits/chosen": -1.5317656993865967, + "logits/rejected": -1.4437824487686157, + "logps/chosen": -297.41827392578125, + "logps/rejected": -333.2902526855469, + "loss": 0.5268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4341201186180115, + "rewards/margins": 0.6302675008773804, + "rewards/rejected": -1.0643876791000366, + "step": 5310 + }, + { + "epoch": 0.7, + "learning_rate": 1.2794559297136203e-06, + "logits/chosen": -1.2370169162750244, + "logits/rejected": -1.4776989221572876, + "logps/chosen": -295.41314697265625, + "logps/rejected": -357.27960205078125, + "loss": 0.578, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.317495733499527, + "rewards/margins": 0.5164381861686707, + "rewards/rejected": -0.83393394947052, + "step": 5320 + }, + { + "epoch": 0.7, + "learning_rate": 1.2695016477468724e-06, + "logits/chosen": -1.6845111846923828, + "logits/rejected": -1.8207229375839233, + "logps/chosen": -347.0459899902344, + "logps/rejected": -327.3348388671875, + "loss": 0.6308, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.42949360609054565, + "rewards/margins": 0.3359355330467224, + "rewards/rejected": -0.7654291391372681, + "step": 5330 + }, + { + "epoch": 0.7, + "learning_rate": 1.2595730450089874e-06, + "logits/chosen": -1.727909803390503, + "logits/rejected": -1.69137442111969, + "logps/chosen": -339.36614990234375, + "logps/rejected": -373.46832275390625, + "loss": 0.5507, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3283812403678894, + "rewards/margins": 0.6603553891181946, + "rewards/rejected": -0.988736629486084, + "step": 5340 + }, + { + "epoch": 0.7, + "learning_rate": 1.2496703286996433e-06, + "logits/chosen": -1.7269397974014282, + "logits/rejected": -1.5372984409332275, + "logps/chosen": -397.3009338378906, + "logps/rejected": -387.60748291015625, + "loss": 0.5795, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4792284369468689, + "rewards/margins": 0.6034846305847168, + "rewards/rejected": -1.0827131271362305, + "step": 5350 + }, + { + "epoch": 0.7, + "learning_rate": 1.2397937054782961e-06, + "logits/chosen": -1.6359096765518188, + "logits/rejected": -1.597556710243225, + "logps/chosen": -375.6257019042969, + "logps/rejected": -382.2748107910156, + "loss": 0.522, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.457070529460907, + "rewards/margins": 0.5650699138641357, + "rewards/rejected": -1.022140383720398, + "step": 5360 + }, + { + "epoch": 0.7, + "learning_rate": 1.2299433814598635e-06, + "logits/chosen": -1.64962899684906, + "logits/rejected": -1.742227554321289, + "logps/chosen": -295.298583984375, + "logps/rejected": -331.330810546875, + "loss": 0.5722, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29296058416366577, + "rewards/margins": 0.35498765110969543, + "rewards/rejected": -0.6479482054710388, + "step": 5370 + }, + { + "epoch": 0.7, + "learning_rate": 1.2201195622104265e-06, + "logits/chosen": -1.556187391281128, + "logits/rejected": -1.52755868434906, + "logps/chosen": -302.2885437011719, + "logps/rejected": -332.32623291015625, + "loss": 0.5436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37561094760894775, + "rewards/margins": 0.7082595825195312, + "rewards/rejected": -1.083870530128479, + "step": 5380 + }, + { + "epoch": 0.71, + "learning_rate": 1.2103224527429417e-06, + "logits/chosen": -1.4576947689056396, + "logits/rejected": -1.4759795665740967, + "logps/chosen": -324.3181457519531, + "logps/rejected": -287.56085205078125, + "loss": 0.6561, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.27041420340538025, + "rewards/margins": 0.5179351568222046, + "rewards/rejected": -0.7883493900299072, + "step": 5390 + }, + { + "epoch": 0.71, + "learning_rate": 1.2005522575129559e-06, + "logits/chosen": -1.839748740196228, + "logits/rejected": -1.7827552556991577, + "logps/chosen": -417.31622314453125, + "logps/rejected": -338.0927429199219, + "loss": 0.6775, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.38853970170021057, + "rewards/margins": 0.4253416955471039, + "rewards/rejected": -0.813881516456604, + "step": 5400 + }, + { + "epoch": 0.71, + "learning_rate": 1.1908091804143469e-06, + "logits/chosen": -1.339806318283081, + "logits/rejected": -1.307738184928894, + "logps/chosen": -327.3746032714844, + "logps/rejected": -365.6716003417969, + "loss": 0.602, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.48991623520851135, + "rewards/margins": 0.4802149832248688, + "rewards/rejected": -0.9701312184333801, + "step": 5410 + }, + { + "epoch": 0.71, + "learning_rate": 1.1810934247750649e-06, + "logits/chosen": -1.4482930898666382, + "logits/rejected": -1.5761969089508057, + "logps/chosen": -376.31842041015625, + "logps/rejected": -355.594482421875, + "loss": 0.5399, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.263977587223053, + "rewards/margins": 0.7046165466308594, + "rewards/rejected": -0.9685942530632019, + "step": 5420 + }, + { + "epoch": 0.71, + "learning_rate": 1.1714051933528881e-06, + "logits/chosen": -1.6793420314788818, + "logits/rejected": -1.521519660949707, + "logps/chosen": -357.15814208984375, + "logps/rejected": -360.46435546875, + "loss": 0.5522, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.29110783338546753, + "rewards/margins": 0.5364557504653931, + "rewards/rejected": -0.8275636434555054, + "step": 5430 + }, + { + "epoch": 0.71, + "learning_rate": 1.161744688331192e-06, + "logits/chosen": -1.6688343286514282, + "logits/rejected": -1.417170763015747, + "logps/chosen": -323.34857177734375, + "logps/rejected": -380.068115234375, + "loss": 0.6106, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5008817315101624, + "rewards/margins": 0.2509423792362213, + "rewards/rejected": -0.7518240213394165, + "step": 5440 + }, + { + "epoch": 0.71, + "learning_rate": 1.152112111314733e-06, + "logits/chosen": -1.819838285446167, + "logits/rejected": -1.7019535303115845, + "logps/chosen": -308.28741455078125, + "logps/rejected": -319.5489807128906, + "loss": 0.616, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.30229973793029785, + "rewards/margins": 0.3651825189590454, + "rewards/rejected": -0.6674822568893433, + "step": 5450 + }, + { + "epoch": 0.71, + "learning_rate": 1.142507663325439e-06, + "logits/chosen": -1.4386231899261475, + "logits/rejected": -1.7365715503692627, + "logps/chosen": -356.2586975097656, + "logps/rejected": -387.3833923339844, + "loss": 0.5016, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3147957921028137, + "rewards/margins": 0.7030807733535767, + "rewards/rejected": -1.0178765058517456, + "step": 5460 + }, + { + "epoch": 0.72, + "learning_rate": 1.132931544798211e-06, + "logits/chosen": -1.2936347723007202, + "logits/rejected": -1.4511758089065552, + "logps/chosen": -328.20892333984375, + "logps/rejected": -363.4265441894531, + "loss": 0.4973, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.30207449197769165, + "rewards/margins": 0.6860274076461792, + "rewards/rejected": -0.9881019592285156, + "step": 5470 + }, + { + "epoch": 0.72, + "learning_rate": 1.1233839555767482e-06, + "logits/chosen": -1.3683011531829834, + "logits/rejected": -1.1233896017074585, + "logps/chosen": -311.6413269042969, + "logps/rejected": -299.0362548828125, + "loss": 0.5119, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.29436641931533813, + "rewards/margins": 0.7090286016464233, + "rewards/rejected": -1.0033949613571167, + "step": 5480 + }, + { + "epoch": 0.72, + "learning_rate": 1.1138650949093668e-06, + "logits/chosen": -1.5352392196655273, + "logits/rejected": -1.4760797023773193, + "logps/chosen": -279.9900207519531, + "logps/rejected": -261.8179626464844, + "loss": 0.6787, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.49985161423683167, + "rewards/margins": 0.33193325996398926, + "rewards/rejected": -0.8317849040031433, + "step": 5490 + }, + { + "epoch": 0.72, + "learning_rate": 1.1043751614448543e-06, + "logits/chosen": -1.58893620967865, + "logits/rejected": -1.2998321056365967, + "logps/chosen": -255.25009155273438, + "logps/rejected": -263.100341796875, + "loss": 0.5581, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2936599850654602, + "rewards/margins": 0.4776766300201416, + "rewards/rejected": -0.771336555480957, + "step": 5500 + }, + { + "epoch": 0.72, + "learning_rate": 1.0949143532283107e-06, + "logits/chosen": -1.5799996852874756, + "logits/rejected": -1.5469352006912231, + "logps/chosen": -347.7076721191406, + "logps/rejected": -389.3499450683594, + "loss": 0.5586, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.42106980085372925, + "rewards/margins": 0.5893166661262512, + "rewards/rejected": -1.01038658618927, + "step": 5510 + }, + { + "epoch": 0.72, + "learning_rate": 1.0854828676970275e-06, + "logits/chosen": -1.380615234375, + "logits/rejected": -1.6314111948013306, + "logps/chosen": -274.4620666503906, + "logps/rejected": -310.4410705566406, + "loss": 0.6007, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3287106156349182, + "rewards/margins": 0.42455625534057617, + "rewards/rejected": -0.7532669305801392, + "step": 5520 + }, + { + "epoch": 0.72, + "learning_rate": 1.076080901676361e-06, + "logits/chosen": -1.7739425897598267, + "logits/rejected": -1.8938610553741455, + "logps/chosen": -302.61431884765625, + "logps/rejected": -328.2605285644531, + "loss": 0.6037, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4099544584751129, + "rewards/margins": 0.4388144910335541, + "rewards/rejected": -0.8487690091133118, + "step": 5530 + }, + { + "epoch": 0.72, + "learning_rate": 1.0667086513756234e-06, + "logits/chosen": -1.6532869338989258, + "logits/rejected": -1.3598273992538452, + "logps/chosen": -311.32464599609375, + "logps/rejected": -344.3642883300781, + "loss": 0.5721, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.30887365341186523, + "rewards/margins": 0.5972136855125427, + "rewards/rejected": -0.9060872793197632, + "step": 5540 + }, + { + "epoch": 0.73, + "learning_rate": 1.0573663123839912e-06, + "logits/chosen": -1.452828049659729, + "logits/rejected": -1.3910818099975586, + "logps/chosen": -322.2199401855469, + "logps/rejected": -317.9525146484375, + "loss": 0.5284, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3112536072731018, + "rewards/margins": 0.7667531371116638, + "rewards/rejected": -1.078006625175476, + "step": 5550 + }, + { + "epoch": 0.73, + "learning_rate": 1.0480540796664251e-06, + "logits/chosen": -1.4911980628967285, + "logits/rejected": -1.4798692464828491, + "logps/chosen": -344.4809265136719, + "logps/rejected": -382.9551696777344, + "loss": 0.559, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4746447503566742, + "rewards/margins": 0.323065847158432, + "rewards/rejected": -0.7977105379104614, + "step": 5560 + }, + { + "epoch": 0.73, + "learning_rate": 1.0387721475595978e-06, + "logits/chosen": -1.4452338218688965, + "logits/rejected": -1.356520414352417, + "logps/chosen": -248.4287567138672, + "logps/rejected": -279.7262268066406, + "loss": 0.5951, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4623108506202698, + "rewards/margins": 0.42193174362182617, + "rewards/rejected": -0.8842425346374512, + "step": 5570 + }, + { + "epoch": 0.73, + "learning_rate": 1.0295207097678378e-06, + "logits/chosen": -1.4535021781921387, + "logits/rejected": -1.4040149450302124, + "logps/chosen": -244.7410430908203, + "logps/rejected": -375.2272644042969, + "loss": 0.4971, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.29036831855773926, + "rewards/margins": 0.7012807726860046, + "rewards/rejected": -0.9916491508483887, + "step": 5580 + }, + { + "epoch": 0.73, + "learning_rate": 1.0202999593590924e-06, + "logits/chosen": -1.5947284698486328, + "logits/rejected": -1.6042587757110596, + "logps/chosen": -288.73272705078125, + "logps/rejected": -367.5753479003906, + "loss": 0.4926, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.285076767206192, + "rewards/margins": 0.8736554384231567, + "rewards/rejected": -1.1587321758270264, + "step": 5590 + }, + { + "epoch": 0.73, + "learning_rate": 1.011110088760891e-06, + "logits/chosen": -1.4146513938903809, + "logits/rejected": -1.4851772785186768, + "logps/chosen": -274.80487060546875, + "logps/rejected": -335.02593994140625, + "loss": 0.5069, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.45673370361328125, + "rewards/margins": 0.6715277433395386, + "rewards/rejected": -1.1282615661621094, + "step": 5600 + }, + { + "epoch": 0.73, + "learning_rate": 1.0019512897563347e-06, + "logits/chosen": -1.582021951675415, + "logits/rejected": -1.5874388217926025, + "logps/chosen": -327.87652587890625, + "logps/rejected": -341.3058776855469, + "loss": 0.5873, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3459164500236511, + "rewards/margins": 0.5310074090957642, + "rewards/rejected": -0.8769239187240601, + "step": 5610 + }, + { + "epoch": 0.74, + "learning_rate": 9.928237534800935e-07, + "logits/chosen": -1.7872960567474365, + "logits/rejected": -1.6534982919692993, + "logps/chosen": -299.4911193847656, + "logps/rejected": -333.643310546875, + "loss": 0.5744, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2415216863155365, + "rewards/margins": 0.5792838335037231, + "rewards/rejected": -0.820805549621582, + "step": 5620 + }, + { + "epoch": 0.74, + "learning_rate": 9.837276704144174e-07, + "logits/chosen": -1.4801450967788696, + "logits/rejected": -1.3851696252822876, + "logps/chosen": -286.9519348144531, + "logps/rejected": -295.43756103515625, + "loss": 0.5757, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.358195036649704, + "rewards/margins": 0.430875688791275, + "rewards/rejected": -0.789070725440979, + "step": 5630 + }, + { + "epoch": 0.74, + "learning_rate": 9.746632303851569e-07, + "logits/chosen": -1.6603202819824219, + "logits/rejected": -1.523306131362915, + "logps/chosen": -330.683349609375, + "logps/rejected": -319.15509033203125, + "loss": 0.5444, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.36284035444259644, + "rewards/margins": 0.6048654317855835, + "rewards/rejected": -0.9677058458328247, + "step": 5640 + }, + { + "epoch": 0.74, + "learning_rate": 9.65630622557809e-07, + "logits/chosen": -1.6951334476470947, + "logits/rejected": -1.4775047302246094, + "logps/chosen": -380.4350891113281, + "logps/rejected": -383.6583557128906, + "loss": 0.603, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.41704779863357544, + "rewards/margins": 0.4197191298007965, + "rewards/rejected": -0.8367668986320496, + "step": 5650 + }, + { + "epoch": 0.74, + "learning_rate": 9.56630035433561e-07, + "logits/chosen": -1.7787210941314697, + "logits/rejected": -1.5558375120162964, + "logps/chosen": -351.1031188964844, + "logps/rejected": -325.9077453613281, + "loss": 0.5647, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32037654519081116, + "rewards/margins": 0.46739619970321655, + "rewards/rejected": -0.7877727746963501, + "step": 5660 + }, + { + "epoch": 0.74, + "learning_rate": 9.476616568453659e-07, + "logits/chosen": -1.5925695896148682, + "logits/rejected": -1.5889828205108643, + "logps/chosen": -340.7126159667969, + "logps/rejected": -331.0557861328125, + "loss": 0.5091, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3751803934574127, + "rewards/margins": 0.6463561654090881, + "rewards/rejected": -1.0215365886688232, + "step": 5670 + }, + { + "epoch": 0.74, + "learning_rate": 9.387256739540162e-07, + "logits/chosen": -1.7625625133514404, + "logits/rejected": -1.729768991470337, + "logps/chosen": -351.73541259765625, + "logps/rejected": -369.366455078125, + "loss": 0.5145, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2438022792339325, + "rewards/margins": 0.6517744064331055, + "rewards/rejected": -0.8955766558647156, + "step": 5680 + }, + { + "epoch": 0.74, + "learning_rate": 9.298222732442377e-07, + "logits/chosen": -1.940168023109436, + "logits/rejected": -1.7438640594482422, + "logps/chosen": -345.25994873046875, + "logps/rejected": -354.7479553222656, + "loss": 0.5763, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3408891260623932, + "rewards/margins": 0.7196322679519653, + "rewards/rejected": -1.0605213642120361, + "step": 5690 + }, + { + "epoch": 0.75, + "learning_rate": 9.20951640520803e-07, + "logits/chosen": -1.4479033946990967, + "logits/rejected": -1.3271280527114868, + "logps/chosen": -291.07427978515625, + "logps/rejected": -323.8932800292969, + "loss": 0.538, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.29643645882606506, + "rewards/margins": 0.5887986421585083, + "rewards/rejected": -0.8852350115776062, + "step": 5700 + }, + { + "epoch": 0.75, + "learning_rate": 9.121139609046484e-07, + "logits/chosen": -1.6300618648529053, + "logits/rejected": -1.4778292179107666, + "logps/chosen": -322.1125793457031, + "logps/rejected": -401.39019775390625, + "loss": 0.6377, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3357270061969757, + "rewards/margins": 0.5135652422904968, + "rewards/rejected": -0.8492921590805054, + "step": 5710 + }, + { + "epoch": 0.75, + "learning_rate": 9.033094188290121e-07, + "logits/chosen": -1.500940203666687, + "logits/rejected": -1.586003303527832, + "logps/chosen": -351.4118957519531, + "logps/rejected": -351.0624084472656, + "loss": 0.6192, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.497396856546402, + "rewards/margins": 0.5018718242645264, + "rewards/rejected": -0.9992687106132507, + "step": 5720 + }, + { + "epoch": 0.75, + "learning_rate": 8.945381980355889e-07, + "logits/chosen": -1.7967160940170288, + "logits/rejected": -1.583381175994873, + "logps/chosen": -333.7850646972656, + "logps/rejected": -392.7862243652344, + "loss": 0.5942, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4145778715610504, + "rewards/margins": 0.45821672677993774, + "rewards/rejected": -0.8727946281433105, + "step": 5730 + }, + { + "epoch": 0.75, + "learning_rate": 8.858004815706919e-07, + "logits/chosen": -1.4308854341506958, + "logits/rejected": -1.5166014432907104, + "logps/chosen": -299.985595703125, + "logps/rejected": -341.3712463378906, + "loss": 0.4804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3781455457210541, + "rewards/margins": 0.671671986579895, + "rewards/rejected": -1.049817442893982, + "step": 5740 + }, + { + "epoch": 0.75, + "learning_rate": 8.77096451781432e-07, + "logits/chosen": -1.6099936962127686, + "logits/rejected": -1.3069753646850586, + "logps/chosen": -281.0746154785156, + "logps/rejected": -323.6234130859375, + "loss": 0.59, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5059057474136353, + "rewards/margins": 0.4684232175350189, + "rewards/rejected": -0.9743289947509766, + "step": 5750 + }, + { + "epoch": 0.75, + "learning_rate": 8.684262903119165e-07, + "logits/chosen": -1.5114291906356812, + "logits/rejected": -1.5615962743759155, + "logps/chosen": -300.89691162109375, + "logps/rejected": -353.93011474609375, + "loss": 0.5749, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4185856878757477, + "rewards/margins": 0.3667107820510864, + "rewards/rejected": -0.7852964997291565, + "step": 5760 + }, + { + "epoch": 0.76, + "learning_rate": 8.597901780994525e-07, + "logits/chosen": -1.777143120765686, + "logits/rejected": -1.5190109014511108, + "logps/chosen": -303.24322509765625, + "logps/rejected": -289.6794738769531, + "loss": 0.5898, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39851003885269165, + "rewards/margins": 0.39009347558021545, + "rewards/rejected": -0.7886034846305847, + "step": 5770 + }, + { + "epoch": 0.76, + "learning_rate": 8.511882953707773e-07, + "logits/chosen": -1.565483808517456, + "logits/rejected": -1.655055046081543, + "logps/chosen": -366.4530334472656, + "logps/rejected": -372.0270080566406, + "loss": 0.6045, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.37845510244369507, + "rewards/margins": 0.4747762680053711, + "rewards/rejected": -0.8532315492630005, + "step": 5780 + }, + { + "epoch": 0.76, + "learning_rate": 8.426208216382944e-07, + "logits/chosen": -1.4742367267608643, + "logits/rejected": -1.4019474983215332, + "logps/chosen": -279.42779541015625, + "logps/rejected": -342.3641052246094, + "loss": 0.5287, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.390156626701355, + "rewards/margins": 0.5289020538330078, + "rewards/rejected": -0.9190587997436523, + "step": 5790 + }, + { + "epoch": 0.76, + "learning_rate": 8.340879356963245e-07, + "logits/chosen": -1.6543285846710205, + "logits/rejected": -1.397047996520996, + "logps/chosen": -315.0841979980469, + "logps/rejected": -342.9209899902344, + "loss": 0.5521, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3553606867790222, + "rewards/margins": 0.40659159421920776, + "rewards/rejected": -0.7619523406028748, + "step": 5800 + }, + { + "epoch": 0.76, + "learning_rate": 8.255898156173777e-07, + "logits/chosen": -1.6289129257202148, + "logits/rejected": -1.6223284006118774, + "logps/chosen": -275.2503356933594, + "logps/rejected": -314.73260498046875, + "loss": 0.5347, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3665020167827606, + "rewards/margins": 0.5788888931274414, + "rewards/rejected": -0.9453908801078796, + "step": 5810 + }, + { + "epoch": 0.76, + "learning_rate": 8.171266387484389e-07, + "logits/chosen": -1.56658136844635, + "logits/rejected": -1.5691983699798584, + "logps/chosen": -318.41705322265625, + "logps/rejected": -316.80133056640625, + "loss": 0.5786, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.34623074531555176, + "rewards/margins": 0.484809011220932, + "rewards/rejected": -0.8310397267341614, + "step": 5820 + }, + { + "epoch": 0.76, + "learning_rate": 8.086985817072604e-07, + "logits/chosen": -1.6257139444351196, + "logits/rejected": -1.4750010967254639, + "logps/chosen": -310.5641174316406, + "logps/rejected": -333.0919189453125, + "loss": 0.5571, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.3126750588417053, + "rewards/margins": 0.5423682332038879, + "rewards/rejected": -0.8550432324409485, + "step": 5830 + }, + { + "epoch": 0.76, + "learning_rate": 8.003058203786835e-07, + "logits/chosen": -1.4551976919174194, + "logits/rejected": -1.3579022884368896, + "logps/chosen": -357.9755554199219, + "logps/rejected": -346.6473083496094, + "loss": 0.5964, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4536215364933014, + "rewards/margins": 0.46021443605422974, + "rewards/rejected": -0.913835883140564, + "step": 5840 + }, + { + "epoch": 0.77, + "learning_rate": 7.91948529910963e-07, + "logits/chosen": -1.6042499542236328, + "logits/rejected": -1.6213362216949463, + "logps/chosen": -308.8471374511719, + "logps/rejected": -362.1397399902344, + "loss": 0.5249, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2874293625354767, + "rewards/margins": 0.7759624719619751, + "rewards/rejected": -1.063391923904419, + "step": 5850 + }, + { + "epoch": 0.77, + "learning_rate": 7.836268847121126e-07, + "logits/chosen": -1.564263939857483, + "logits/rejected": -1.5152868032455444, + "logps/chosen": -350.787841796875, + "logps/rejected": -381.8980712890625, + "loss": 0.5796, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.33957618474960327, + "rewards/margins": 0.4684734344482422, + "rewards/rejected": -0.8080496788024902, + "step": 5860 + }, + { + "epoch": 0.77, + "learning_rate": 7.753410584462681e-07, + "logits/chosen": -1.5913186073303223, + "logits/rejected": -1.4614098072052002, + "logps/chosen": -329.3492431640625, + "logps/rejected": -339.7242126464844, + "loss": 0.5393, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.27319130301475525, + "rewards/margins": 0.8046390414237976, + "rewards/rejected": -1.0778303146362305, + "step": 5870 + }, + { + "epoch": 0.77, + "learning_rate": 7.670912240300596e-07, + "logits/chosen": -1.4377472400665283, + "logits/rejected": -1.5927647352218628, + "logps/chosen": -336.43743896484375, + "logps/rejected": -346.5491943359375, + "loss": 0.5136, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21061411499977112, + "rewards/margins": 0.5287593007087708, + "rewards/rejected": -0.7393734455108643, + "step": 5880 + }, + { + "epoch": 0.77, + "learning_rate": 7.588775536290035e-07, + "logits/chosen": -1.6953150033950806, + "logits/rejected": -1.495138168334961, + "logps/chosen": -310.20794677734375, + "logps/rejected": -341.45367431640625, + "loss": 0.5065, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35188740491867065, + "rewards/margins": 0.7931149005889893, + "rewards/rejected": -1.1450023651123047, + "step": 5890 + }, + { + "epoch": 0.77, + "learning_rate": 7.507002186539147e-07, + "logits/chosen": -1.3210930824279785, + "logits/rejected": -1.4552217721939087, + "logps/chosen": -389.51373291015625, + "logps/rejected": -373.74774169921875, + "loss": 0.6282, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5041936635971069, + "rewards/margins": 0.24392476677894592, + "rewards/rejected": -0.7481184005737305, + "step": 5900 + }, + { + "epoch": 0.77, + "learning_rate": 7.425593897573216e-07, + "logits/chosen": -1.4796233177185059, + "logits/rejected": -1.3390194177627563, + "logps/chosen": -393.0478820800781, + "logps/rejected": -406.0719299316406, + "loss": 0.5566, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.44668784737586975, + "rewards/margins": 0.5181747078895569, + "rewards/rejected": -0.9648624658584595, + "step": 5910 + }, + { + "epoch": 0.77, + "learning_rate": 7.344552368299088e-07, + "logits/chosen": -1.5653443336486816, + "logits/rejected": -1.592137098312378, + "logps/chosen": -336.4530334472656, + "logps/rejected": -373.92034912109375, + "loss": 0.5546, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.42195168137550354, + "rewards/margins": 0.4567069113254547, + "rewards/rejected": -0.8786584734916687, + "step": 5920 + }, + { + "epoch": 0.78, + "learning_rate": 7.26387928996973e-07, + "logits/chosen": -1.2884600162506104, + "logits/rejected": -1.316709041595459, + "logps/chosen": -322.3119812011719, + "logps/rejected": -335.1204528808594, + "loss": 0.5147, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6697009205818176, + "rewards/margins": 0.5602057576179504, + "rewards/rejected": -1.229906678199768, + "step": 5930 + }, + { + "epoch": 0.78, + "learning_rate": 7.183576346148899e-07, + "logits/chosen": -1.6132042407989502, + "logits/rejected": -1.6713653802871704, + "logps/chosen": -307.05133056640625, + "logps/rejected": -364.9720764160156, + "loss": 0.5346, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.38153284788131714, + "rewards/margins": 0.6180301308631897, + "rewards/rejected": -0.9995629191398621, + "step": 5940 + }, + { + "epoch": 0.78, + "learning_rate": 7.103645212676044e-07, + "logits/chosen": -1.5463507175445557, + "logits/rejected": -1.4541925191879272, + "logps/chosen": -281.3020324707031, + "logps/rejected": -313.65850830078125, + "loss": 0.5199, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2693473696708679, + "rewards/margins": 0.6636615991592407, + "rewards/rejected": -0.9330089688301086, + "step": 5950 + }, + { + "epoch": 0.78, + "learning_rate": 7.024087557631318e-07, + "logits/chosen": -1.4065492153167725, + "logits/rejected": -1.3508937358856201, + "logps/chosen": -287.1341247558594, + "logps/rejected": -340.70208740234375, + "loss": 0.5927, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5008789300918579, + "rewards/margins": 0.35419294238090515, + "rewards/rejected": -0.8550717234611511, + "step": 5960 + }, + { + "epoch": 0.78, + "learning_rate": 6.944905041300739e-07, + "logits/chosen": -1.5280015468597412, + "logits/rejected": -1.376063585281372, + "logps/chosen": -364.8132629394531, + "logps/rejected": -352.842529296875, + "loss": 0.5015, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4553113579750061, + "rewards/margins": 0.6243511438369751, + "rewards/rejected": -1.079662561416626, + "step": 5970 + }, + { + "epoch": 0.78, + "learning_rate": 6.866099316141606e-07, + "logits/chosen": -1.702998399734497, + "logits/rejected": -1.6288244724273682, + "logps/chosen": -328.329345703125, + "logps/rejected": -348.07159423828125, + "loss": 0.5747, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5127135515213013, + "rewards/margins": 0.36080771684646606, + "rewards/rejected": -0.8735212087631226, + "step": 5980 + }, + { + "epoch": 0.78, + "learning_rate": 6.787672026747946e-07, + "logits/chosen": -1.4510295391082764, + "logits/rejected": -1.404740571975708, + "logps/chosen": -323.6086120605469, + "logps/rejected": -365.51031494140625, + "loss": 0.6068, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5067334771156311, + "rewards/margins": 0.2689245343208313, + "rewards/rejected": -0.7756580114364624, + "step": 5990 + }, + { + "epoch": 0.79, + "learning_rate": 6.709624809816223e-07, + "logits/chosen": -1.6505467891693115, + "logits/rejected": -1.6951344013214111, + "logps/chosen": -340.3673095703125, + "logps/rejected": -388.4654541015625, + "loss": 0.5332, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2794335186481476, + "rewards/margins": 0.6874333620071411, + "rewards/rejected": -0.9668668508529663, + "step": 6000 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -1.5626295804977417, + "eval_logits/rejected": -1.5155683755874634, + "eval_logps/chosen": -332.62847900390625, + "eval_logps/rejected": -349.35693359375, + "eval_loss": 0.5604212284088135, + "eval_rewards/accuracies": 0.7039999961853027, + "eval_rewards/chosen": -0.41690683364868164, + "eval_rewards/margins": 0.5070940256118774, + "eval_rewards/rejected": -0.9240008592605591, + "eval_runtime": 469.8971, + "eval_samples_per_second": 4.256, + "eval_steps_per_second": 1.064, + "step": 6000 + }, + { + "epoch": 0.79, + "learning_rate": 6.6319592941112e-07, + "logits/chosen": -1.775883674621582, + "logits/rejected": -1.8262300491333008, + "logps/chosen": -395.36834716796875, + "logps/rejected": -391.53692626953125, + "loss": 0.5023, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.28452056646347046, + "rewards/margins": 0.6215328574180603, + "rewards/rejected": -0.906053364276886, + "step": 6010 + }, + { + "epoch": 0.79, + "learning_rate": 6.554677100431927e-07, + "logits/chosen": -1.6648069620132446, + "logits/rejected": -1.3172204494476318, + "logps/chosen": -340.46240234375, + "logps/rejected": -350.62030029296875, + "loss": 0.5826, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4547346234321594, + "rewards/margins": 0.48622775077819824, + "rewards/rejected": -0.9409623146057129, + "step": 6020 + }, + { + "epoch": 0.79, + "learning_rate": 6.4777798415779e-07, + "logits/chosen": -1.6310913562774658, + "logits/rejected": -1.7965972423553467, + "logps/chosen": -328.1207580566406, + "logps/rejected": -352.4468078613281, + "loss": 0.5431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42136257886886597, + "rewards/margins": 0.5074335932731628, + "rewards/rejected": -0.9287961721420288, + "step": 6030 + }, + { + "epoch": 0.79, + "learning_rate": 6.401269122315451e-07, + "logits/chosen": -1.2970218658447266, + "logits/rejected": -1.2299518585205078, + "logps/chosen": -268.43890380859375, + "logps/rejected": -352.0443420410156, + "loss": 0.5323, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3852972686290741, + "rewards/margins": 0.6270851492881775, + "rewards/rejected": -1.0123823881149292, + "step": 6040 + }, + { + "epoch": 0.79, + "learning_rate": 6.325146539344196e-07, + "logits/chosen": -1.3449985980987549, + "logits/rejected": -1.341498851776123, + "logps/chosen": -402.92156982421875, + "logps/rejected": -396.3840637207031, + "loss": 0.5733, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7146986722946167, + "rewards/margins": 0.3242790102958679, + "rewards/rejected": -1.0389776229858398, + "step": 6050 + }, + { + "epoch": 0.79, + "learning_rate": 6.249413681263782e-07, + "logits/chosen": -1.8304609060287476, + "logits/rejected": -1.6912084817886353, + "logps/chosen": -282.3830261230469, + "logps/rejected": -316.68341064453125, + "loss": 0.5646, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3486265540122986, + "rewards/margins": 0.47468265891075134, + "rewards/rejected": -0.8233092427253723, + "step": 6060 + }, + { + "epoch": 0.79, + "learning_rate": 6.174072128540686e-07, + "logits/chosen": -1.5214486122131348, + "logits/rejected": -1.5950642824172974, + "logps/chosen": -316.99346923828125, + "logps/rejected": -345.9567565917969, + "loss": 0.5108, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3326464593410492, + "rewards/margins": 0.5941346883773804, + "rewards/rejected": -0.9267812967300415, + "step": 6070 + }, + { + "epoch": 0.8, + "learning_rate": 6.099123453475245e-07, + "logits/chosen": -1.399414300918579, + "logits/rejected": -1.435448408126831, + "logps/chosen": -332.62713623046875, + "logps/rejected": -331.2655944824219, + "loss": 0.5288, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.45856350660324097, + "rewards/margins": 0.47532668709754944, + "rewards/rejected": -0.933890163898468, + "step": 6080 + }, + { + "epoch": 0.8, + "learning_rate": 6.024569220168836e-07, + "logits/chosen": -1.5160197019577026, + "logits/rejected": -1.6110633611679077, + "logps/chosen": -323.06268310546875, + "logps/rejected": -326.9421691894531, + "loss": 0.5486, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.41864481568336487, + "rewards/margins": 0.45449838042259216, + "rewards/rejected": -0.873143196105957, + "step": 6090 + }, + { + "epoch": 0.8, + "learning_rate": 5.950410984491268e-07, + "logits/chosen": -1.79531729221344, + "logits/rejected": -1.7468538284301758, + "logps/chosen": -387.155029296875, + "logps/rejected": -378.2082214355469, + "loss": 0.5834, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3833608329296112, + "rewards/margins": 0.4535873532295227, + "rewards/rejected": -0.8369480967521667, + "step": 6100 + }, + { + "epoch": 0.8, + "learning_rate": 5.876650294048262e-07, + "logits/chosen": -1.4522327184677124, + "logits/rejected": -1.4680945873260498, + "logps/chosen": -324.3064270019531, + "logps/rejected": -394.5743713378906, + "loss": 0.577, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.48628348112106323, + "rewards/margins": 0.5022860169410706, + "rewards/rejected": -0.9885693788528442, + "step": 6110 + }, + { + "epoch": 0.8, + "learning_rate": 5.8032886881492e-07, + "logits/chosen": -1.867713212966919, + "logits/rejected": -1.4701869487762451, + "logps/chosen": -344.90875244140625, + "logps/rejected": -440.5440979003906, + "loss": 0.5894, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.43864792585372925, + "rewards/margins": 0.40541714429855347, + "rewards/rejected": -0.8440651893615723, + "step": 6120 + }, + { + "epoch": 0.8, + "learning_rate": 5.730327697774988e-07, + "logits/chosen": -1.5455173254013062, + "logits/rejected": -1.4788562059402466, + "logps/chosen": -336.3317565917969, + "logps/rejected": -361.5826721191406, + "loss": 0.6255, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.35634535551071167, + "rewards/margins": 0.331598699092865, + "rewards/rejected": -0.6879440546035767, + "step": 6130 + }, + { + "epoch": 0.8, + "learning_rate": 5.657768845546068e-07, + "logits/chosen": -1.5520979166030884, + "logits/rejected": -1.622862458229065, + "logps/chosen": -336.7666320800781, + "logps/rejected": -418.7025451660156, + "loss": 0.6066, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4384053349494934, + "rewards/margins": 0.20010459423065186, + "rewards/rejected": -0.6385098695755005, + "step": 6140 + }, + { + "epoch": 0.8, + "learning_rate": 5.585613645690713e-07, + "logits/chosen": -1.5879414081573486, + "logits/rejected": -1.4789133071899414, + "logps/chosen": -283.7325134277344, + "logps/rejected": -346.33563232421875, + "loss": 0.5154, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48231735825538635, + "rewards/margins": 0.6720245480537415, + "rewards/rejected": -1.1543418169021606, + "step": 6150 + }, + { + "epoch": 0.81, + "learning_rate": 5.513863604013355e-07, + "logits/chosen": -1.7573292255401611, + "logits/rejected": -1.4890235662460327, + "logps/chosen": -413.19775390625, + "logps/rejected": -373.853759765625, + "loss": 0.5623, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1855837106704712, + "rewards/margins": 0.755048394203186, + "rewards/rejected": -0.940632164478302, + "step": 6160 + }, + { + "epoch": 0.81, + "learning_rate": 5.442520217863215e-07, + "logits/chosen": -1.7494089603424072, + "logits/rejected": -1.5039844512939453, + "logps/chosen": -313.9460754394531, + "logps/rejected": -342.9593200683594, + "loss": 0.5304, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.464150607585907, + "rewards/margins": 0.4641142785549164, + "rewards/rejected": -0.9282649755477905, + "step": 6170 + }, + { + "epoch": 0.81, + "learning_rate": 5.371584976103034e-07, + "logits/chosen": -1.6324418783187866, + "logits/rejected": -1.3468782901763916, + "logps/chosen": -326.9073181152344, + "logps/rejected": -375.54876708984375, + "loss": 0.5438, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.29264509677886963, + "rewards/margins": 0.6713323593139648, + "rewards/rejected": -0.9639774560928345, + "step": 6180 + }, + { + "epoch": 0.81, + "learning_rate": 5.301059359077987e-07, + "logits/chosen": -1.4815248250961304, + "logits/rejected": -1.3908017873764038, + "logps/chosen": -317.67852783203125, + "logps/rejected": -344.7789001464844, + "loss": 0.5579, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3552420437335968, + "rewards/margins": 0.6432298421859741, + "rewards/rejected": -0.9984719157218933, + "step": 6190 + }, + { + "epoch": 0.81, + "learning_rate": 5.230944838584806e-07, + "logits/chosen": -1.5379875898361206, + "logits/rejected": -1.6145206689834595, + "logps/chosen": -345.13323974609375, + "logps/rejected": -391.8431091308594, + "loss": 0.5129, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4409099519252777, + "rewards/margins": 0.6926079988479614, + "rewards/rejected": -1.133517861366272, + "step": 6200 + }, + { + "epoch": 0.81, + "learning_rate": 5.161242877841083e-07, + "logits/chosen": -1.6072742938995361, + "logits/rejected": -1.4744181632995605, + "logps/chosen": -353.80206298828125, + "logps/rejected": -327.6481018066406, + "loss": 0.5608, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5244303345680237, + "rewards/margins": 0.2865012288093567, + "rewards/rejected": -0.8109315633773804, + "step": 6210 + }, + { + "epoch": 0.81, + "learning_rate": 5.091954931454682e-07, + "logits/chosen": -1.530173897743225, + "logits/rejected": -1.6602799892425537, + "logps/chosen": -318.1469421386719, + "logps/rejected": -327.0005798339844, + "loss": 0.6495, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3629034459590912, + "rewards/margins": 0.24327942728996277, + "rewards/rejected": -0.6061829328536987, + "step": 6220 + }, + { + "epoch": 0.82, + "learning_rate": 5.023082445393446e-07, + "logits/chosen": -1.5476694107055664, + "logits/rejected": -1.6355488300323486, + "logps/chosen": -291.3600769042969, + "logps/rejected": -315.0718994140625, + "loss": 0.5777, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3985288739204407, + "rewards/margins": 0.4143444001674652, + "rewards/rejected": -0.8128732442855835, + "step": 6230 + }, + { + "epoch": 0.82, + "learning_rate": 4.95462685695498e-07, + "logits/chosen": -1.7410736083984375, + "logits/rejected": -1.6319011449813843, + "logps/chosen": -341.6515197753906, + "logps/rejected": -377.9350891113281, + "loss": 0.5607, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3459092676639557, + "rewards/margins": 0.36869245767593384, + "rewards/rejected": -0.7146016955375671, + "step": 6240 + }, + { + "epoch": 0.82, + "learning_rate": 4.88658959473666e-07, + "logits/chosen": -1.5590840578079224, + "logits/rejected": -1.7366011142730713, + "logps/chosen": -334.7305908203125, + "logps/rejected": -396.6339111328125, + "loss": 0.5729, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3456559181213379, + "rewards/margins": 0.5287542343139648, + "rewards/rejected": -0.8744101524353027, + "step": 6250 + }, + { + "epoch": 0.82, + "learning_rate": 4.818972078605821e-07, + "logits/chosen": -1.391929268836975, + "logits/rejected": -1.4947407245635986, + "logps/chosen": -329.9248352050781, + "logps/rejected": -326.04339599609375, + "loss": 0.5992, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5620224475860596, + "rewards/margins": 0.3376283049583435, + "rewards/rejected": -0.8996507525444031, + "step": 6260 + }, + { + "epoch": 0.82, + "learning_rate": 4.7517757196701514e-07, + "logits/chosen": -1.4793212413787842, + "logits/rejected": -1.3556773662567139, + "logps/chosen": -276.32037353515625, + "logps/rejected": -335.99273681640625, + "loss": 0.5739, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6662789583206177, + "rewards/margins": 0.3883989453315735, + "rewards/rejected": -1.054677963256836, + "step": 6270 + }, + { + "epoch": 0.82, + "learning_rate": 4.6850019202482193e-07, + "logits/chosen": -1.5899637937545776, + "logits/rejected": -1.397985816001892, + "logps/chosen": -294.9336853027344, + "logps/rejected": -372.1830139160156, + "loss": 0.5692, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33499675989151, + "rewards/margins": 0.46375593543052673, + "rewards/rejected": -0.7987526655197144, + "step": 6280 + }, + { + "epoch": 0.82, + "learning_rate": 4.618652073840188e-07, + "logits/chosen": -1.619655966758728, + "logits/rejected": -1.6173222064971924, + "logps/chosen": -342.5983581542969, + "logps/rejected": -366.7782897949219, + "loss": 0.5178, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.378616601228714, + "rewards/margins": 0.5226536989212036, + "rewards/rejected": -0.9012703895568848, + "step": 6290 + }, + { + "epoch": 0.82, + "learning_rate": 4.5527275650987965e-07, + "logits/chosen": -1.4216251373291016, + "logits/rejected": -1.5026986598968506, + "logps/chosen": -286.1501159667969, + "logps/rejected": -369.66558837890625, + "loss": 0.5296, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.40463271737098694, + "rewards/margins": 0.7339469194412231, + "rewards/rejected": -1.1385796070098877, + "step": 6300 + }, + { + "epoch": 0.83, + "learning_rate": 4.487229769800394e-07, + "logits/chosen": -1.5018270015716553, + "logits/rejected": -1.370388388633728, + "logps/chosen": -320.88275146484375, + "logps/rejected": -359.34173583984375, + "loss": 0.5847, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4966265559196472, + "rewards/margins": 0.5151475667953491, + "rewards/rejected": -1.0117741823196411, + "step": 6310 + }, + { + "epoch": 0.83, + "learning_rate": 4.422160054816285e-07, + "logits/chosen": -1.5208396911621094, + "logits/rejected": -1.6811376810073853, + "logps/chosen": -318.75701904296875, + "logps/rejected": -349.56988525390625, + "loss": 0.5454, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3771464228630066, + "rewards/margins": 0.5639196634292603, + "rewards/rejected": -0.9410660862922668, + "step": 6320 + }, + { + "epoch": 0.83, + "learning_rate": 4.35751977808416e-07, + "logits/chosen": -1.4296435117721558, + "logits/rejected": -1.560853123664856, + "logps/chosen": -393.7187194824219, + "logps/rejected": -458.0047912597656, + "loss": 0.4658, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2888754606246948, + "rewards/margins": 0.8654838800430298, + "rewards/rejected": -1.154359221458435, + "step": 6330 + }, + { + "epoch": 0.83, + "learning_rate": 4.293310288579794e-07, + "logits/chosen": -1.6840941905975342, + "logits/rejected": -1.6663179397583008, + "logps/chosen": -292.8079833984375, + "logps/rejected": -314.40277099609375, + "loss": 0.5865, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.31099236011505127, + "rewards/margins": 0.45018497109413147, + "rewards/rejected": -0.7611774206161499, + "step": 6340 + }, + { + "epoch": 0.83, + "learning_rate": 4.2295329262888733e-07, + "logits/chosen": -1.3920434713363647, + "logits/rejected": -1.315793752670288, + "logps/chosen": -369.482421875, + "logps/rejected": -428.74810791015625, + "loss": 0.6081, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5108703374862671, + "rewards/margins": 0.4567830562591553, + "rewards/rejected": -0.9676534533500671, + "step": 6350 + }, + { + "epoch": 0.83, + "learning_rate": 4.1661890221790316e-07, + "logits/chosen": -1.660236120223999, + "logits/rejected": -1.6515461206436157, + "logps/chosen": -360.94879150390625, + "logps/rejected": -359.22100830078125, + "loss": 0.5426, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3676682114601135, + "rewards/margins": 0.6574610471725464, + "rewards/rejected": -1.0251293182373047, + "step": 6360 + }, + { + "epoch": 0.83, + "learning_rate": 4.103279898172072e-07, + "logits/chosen": -1.6529057025909424, + "logits/rejected": -1.5260584354400635, + "logps/chosen": -343.42083740234375, + "logps/rejected": -350.0111389160156, + "loss": 0.4777, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3618369698524475, + "rewards/margins": 0.6974278688430786, + "rewards/rejected": -1.0592647790908813, + "step": 6370 + }, + { + "epoch": 0.83, + "learning_rate": 4.040806867116401e-07, + "logits/chosen": -1.817495346069336, + "logits/rejected": -1.617215871810913, + "logps/chosen": -343.89990234375, + "logps/rejected": -386.9528503417969, + "loss": 0.5619, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3370291292667389, + "rewards/margins": 0.5390554666519165, + "rewards/rejected": -0.8760845065116882, + "step": 6380 + }, + { + "epoch": 0.84, + "learning_rate": 3.978771232759615e-07, + "logits/chosen": -1.541231393814087, + "logits/rejected": -1.794306755065918, + "logps/chosen": -278.22381591796875, + "logps/rejected": -310.72125244140625, + "loss": 0.539, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.28730207681655884, + "rewards/margins": 0.5556251406669617, + "rewards/rejected": -0.8429271578788757, + "step": 6390 + }, + { + "epoch": 0.84, + "learning_rate": 3.917174289721276e-07, + "logits/chosen": -1.2345460653305054, + "logits/rejected": -1.2370398044586182, + "logps/chosen": -311.5769348144531, + "logps/rejected": -355.7255554199219, + "loss": 0.4445, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.27520936727523804, + "rewards/margins": 0.9276800155639648, + "rewards/rejected": -1.202889323234558, + "step": 6400 + }, + { + "epoch": 0.84, + "learning_rate": 3.856017323465938e-07, + "logits/chosen": -1.5912706851959229, + "logits/rejected": -1.4776086807250977, + "logps/chosen": -320.1431579589844, + "logps/rejected": -322.048583984375, + "loss": 0.5458, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5533590316772461, + "rewards/margins": 0.4355868697166443, + "rewards/rejected": -0.9889459609985352, + "step": 6410 + }, + { + "epoch": 0.84, + "learning_rate": 3.7953016102762695e-07, + "logits/chosen": -1.640387773513794, + "logits/rejected": -1.593252182006836, + "logps/chosen": -345.67535400390625, + "logps/rejected": -325.80804443359375, + "loss": 0.4993, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.3275294899940491, + "rewards/margins": 0.675873875617981, + "rewards/rejected": -1.0034031867980957, + "step": 6420 + }, + { + "epoch": 0.84, + "learning_rate": 3.7350284172264493e-07, + "logits/chosen": -1.473418951034546, + "logits/rejected": -1.4413989782333374, + "logps/chosen": -326.16754150390625, + "logps/rejected": -352.9417724609375, + "loss": 0.5633, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4691588878631592, + "rewards/margins": 0.6636830568313599, + "rewards/rejected": -1.1328420639038086, + "step": 6430 + }, + { + "epoch": 0.84, + "learning_rate": 3.67519900215573e-07, + "logits/chosen": -1.804229497909546, + "logits/rejected": -1.7522436380386353, + "logps/chosen": -324.98553466796875, + "logps/rejected": -353.6246032714844, + "loss": 0.5268, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.31214380264282227, + "rewards/margins": 0.6218908429145813, + "rewards/rejected": -0.9340347051620483, + "step": 6440 + }, + { + "epoch": 0.84, + "learning_rate": 3.615814613642174e-07, + "logits/chosen": -1.376755714416504, + "logits/rejected": -1.472870945930481, + "logps/chosen": -345.22784423828125, + "logps/rejected": -363.56463623046875, + "loss": 0.636, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.49792808294296265, + "rewards/margins": 0.21797530353069305, + "rewards/rejected": -0.7159034013748169, + "step": 6450 + }, + { + "epoch": 0.85, + "learning_rate": 3.5568764909765795e-07, + "logits/chosen": -1.6656601428985596, + "logits/rejected": -1.6456711292266846, + "logps/chosen": -344.89520263671875, + "logps/rejected": -293.02520751953125, + "loss": 0.6463, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24582533538341522, + "rewards/margins": 0.3201192319393158, + "rewards/rejected": -0.5659445524215698, + "step": 6460 + }, + { + "epoch": 0.85, + "learning_rate": 3.498385864136672e-07, + "logits/chosen": -1.7305920124053955, + "logits/rejected": -1.7584028244018555, + "logps/chosen": -362.3868408203125, + "logps/rejected": -357.6371765136719, + "loss": 0.5724, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3776368200778961, + "rewards/margins": 0.5224421620368958, + "rewards/rejected": -0.9000789523124695, + "step": 6470 + }, + { + "epoch": 0.85, + "learning_rate": 3.440343953761363e-07, + "logits/chosen": -1.3341304063796997, + "logits/rejected": -1.3284238576889038, + "logps/chosen": -285.2416076660156, + "logps/rejected": -315.15570068359375, + "loss": 0.5295, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.20403487980365753, + "rewards/margins": 0.7653517127037048, + "rewards/rejected": -0.9693864583969116, + "step": 6480 + }, + { + "epoch": 0.85, + "learning_rate": 3.382751971125345e-07, + "logits/chosen": -1.6209923028945923, + "logits/rejected": -1.5280439853668213, + "logps/chosen": -342.360107421875, + "logps/rejected": -407.00128173828125, + "loss": 0.6213, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3022085428237915, + "rewards/margins": 0.3806192874908447, + "rewards/rejected": -0.6828278303146362, + "step": 6490 + }, + { + "epoch": 0.85, + "learning_rate": 3.3256111181137753e-07, + "logits/chosen": -1.5602829456329346, + "logits/rejected": -1.4552596807479858, + "logps/chosen": -300.94891357421875, + "logps/rejected": -370.371337890625, + "loss": 0.6089, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4285522997379303, + "rewards/margins": 0.3806338906288147, + "rewards/rejected": -0.8091861605644226, + "step": 6500 + }, + { + "epoch": 0.85, + "learning_rate": 3.2689225871971905e-07, + "logits/chosen": -1.6810089349746704, + "logits/rejected": -1.608888030052185, + "logps/chosen": -334.80377197265625, + "logps/rejected": -333.66815185546875, + "loss": 0.685, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5907071828842163, + "rewards/margins": 0.1491921991109848, + "rewards/rejected": -0.7398994565010071, + "step": 6510 + }, + { + "epoch": 0.85, + "learning_rate": 3.2126875614066523e-07, + "logits/chosen": -1.6989597082138062, + "logits/rejected": -1.694101333618164, + "logps/chosen": -351.87384033203125, + "logps/rejected": -392.55828857421875, + "loss": 0.4976, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3015606999397278, + "rewards/margins": 0.6863117814064026, + "rewards/rejected": -0.9878724813461304, + "step": 6520 + }, + { + "epoch": 0.85, + "learning_rate": 3.156907214309024e-07, + "logits/chosen": -1.3872318267822266, + "logits/rejected": -1.3859946727752686, + "logps/chosen": -289.54168701171875, + "logps/rejected": -292.6477355957031, + "loss": 0.6217, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.49027499556541443, + "rewards/margins": 0.14135076105594635, + "rewards/rejected": -0.631625771522522, + "step": 6530 + }, + { + "epoch": 0.86, + "learning_rate": 3.1015827099824923e-07, + "logits/chosen": -1.4722344875335693, + "logits/rejected": -1.4258567094802856, + "logps/chosen": -346.17010498046875, + "logps/rejected": -362.38934326171875, + "loss": 0.5806, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21185941994190216, + "rewards/margins": 0.45707884430885315, + "rewards/rejected": -0.6689382791519165, + "step": 6540 + }, + { + "epoch": 0.86, + "learning_rate": 3.0467152029922926e-07, + "logits/chosen": -1.409630298614502, + "logits/rejected": -1.5818369388580322, + "logps/chosen": -303.9262390136719, + "logps/rejected": -321.6373596191406, + "loss": 0.5235, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2497456967830658, + "rewards/margins": 0.5407729744911194, + "rewards/rejected": -0.7905186414718628, + "step": 6550 + }, + { + "epoch": 0.86, + "learning_rate": 2.992305838366591e-07, + "logits/chosen": -1.7107906341552734, + "logits/rejected": -1.4755480289459229, + "logps/chosen": -252.12515258789062, + "logps/rejected": -286.7061462402344, + "loss": 0.5072, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2764195203781128, + "rewards/margins": 0.537240743637085, + "rewards/rejected": -0.8136602640151978, + "step": 6560 + }, + { + "epoch": 0.86, + "learning_rate": 2.938355751572583e-07, + "logits/chosen": -1.6643329858779907, + "logits/rejected": -1.5856143236160278, + "logps/chosen": -265.94219970703125, + "logps/rejected": -321.1658020019531, + "loss": 0.5124, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.23726694285869598, + "rewards/margins": 0.6284645199775696, + "rewards/rejected": -0.8657315969467163, + "step": 6570 + }, + { + "epoch": 0.86, + "learning_rate": 2.8848660684928307e-07, + "logits/chosen": -1.4225232601165771, + "logits/rejected": -1.4376609325408936, + "logps/chosen": -290.9881286621094, + "logps/rejected": -390.09564208984375, + "loss": 0.5593, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4735269546508789, + "rewards/margins": 0.5094395279884338, + "rewards/rejected": -0.9829665422439575, + "step": 6580 + }, + { + "epoch": 0.86, + "learning_rate": 2.8318379054017383e-07, + "logits/chosen": -1.7295682430267334, + "logits/rejected": -1.3227838277816772, + "logps/chosen": -336.71478271484375, + "logps/rejected": -419.79132080078125, + "loss": 0.5244, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.39096230268478394, + "rewards/margins": 0.504561722278595, + "rewards/rejected": -0.8955240249633789, + "step": 6590 + }, + { + "epoch": 0.86, + "learning_rate": 2.779272368942246e-07, + "logits/chosen": -1.5011423826217651, + "logits/rejected": -1.3429138660430908, + "logps/chosen": -331.9137878417969, + "logps/rejected": -327.5262145996094, + "loss": 0.6185, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.47789159417152405, + "rewards/margins": 0.4170491695404053, + "rewards/rejected": -0.8949407339096069, + "step": 6600 + }, + { + "epoch": 0.86, + "learning_rate": 2.7271705561027986e-07, + "logits/chosen": -1.878503441810608, + "logits/rejected": -1.8325649499893188, + "logps/chosen": -358.32257080078125, + "logps/rejected": -444.4967346191406, + "loss": 0.4722, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22234638035297394, + "rewards/margins": 0.827469527721405, + "rewards/rejected": -1.0498158931732178, + "step": 6610 + }, + { + "epoch": 0.87, + "learning_rate": 2.6755335541943677e-07, + "logits/chosen": -1.441259741783142, + "logits/rejected": -1.462630033493042, + "logps/chosen": -317.05450439453125, + "logps/rejected": -337.45123291015625, + "loss": 0.5716, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5663586258888245, + "rewards/margins": 0.30441421270370483, + "rewards/rejected": -0.8707727193832397, + "step": 6620 + }, + { + "epoch": 0.87, + "learning_rate": 2.62436244082781e-07, + "logits/chosen": -1.5036875009536743, + "logits/rejected": -1.3889033794403076, + "logps/chosen": -322.90667724609375, + "logps/rejected": -317.14642333984375, + "loss": 0.638, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4977348744869232, + "rewards/margins": 0.19863374531269073, + "rewards/rejected": -0.6963686347007751, + "step": 6630 + }, + { + "epoch": 0.87, + "learning_rate": 2.5736582838913836e-07, + "logits/chosen": -1.6092548370361328, + "logits/rejected": -1.5731115341186523, + "logps/chosen": -298.9450988769531, + "logps/rejected": -325.5741271972656, + "loss": 0.552, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2477584332227707, + "rewards/margins": 0.515131950378418, + "rewards/rejected": -0.7628903388977051, + "step": 6640 + }, + { + "epoch": 0.87, + "learning_rate": 2.5234221415284363e-07, + "logits/chosen": -1.7040058374404907, + "logits/rejected": -1.6046968698501587, + "logps/chosen": -331.9007263183594, + "logps/rejected": -352.8539123535156, + "loss": 0.5817, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3842163681983948, + "rewards/margins": 0.518243670463562, + "rewards/rejected": -0.9024600982666016, + "step": 6650 + }, + { + "epoch": 0.87, + "learning_rate": 2.4736550621153375e-07, + "logits/chosen": -1.6045563220977783, + "logits/rejected": -1.5152432918548584, + "logps/chosen": -376.0851135253906, + "logps/rejected": -390.2940673828125, + "loss": 0.6006, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.41698041558265686, + "rewards/margins": 0.5112712383270264, + "rewards/rejected": -0.9282516241073608, + "step": 6660 + }, + { + "epoch": 0.87, + "learning_rate": 2.424358084239609e-07, + "logits/chosen": -1.692214012145996, + "logits/rejected": -1.7570644617080688, + "logps/chosen": -434.9942932128906, + "logps/rejected": -373.55938720703125, + "loss": 0.5963, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.37081220746040344, + "rewards/margins": 0.37873297929763794, + "rewards/rejected": -0.7495452165603638, + "step": 6670 + }, + { + "epoch": 0.87, + "learning_rate": 2.3755322366782158e-07, + "logits/chosen": -1.359822392463684, + "logits/rejected": -1.5544034242630005, + "logps/chosen": -370.5433654785156, + "logps/rejected": -360.10614013671875, + "loss": 0.5764, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5535110235214233, + "rewards/margins": 0.36962199211120605, + "rewards/rejected": -0.9231330752372742, + "step": 6680 + }, + { + "epoch": 0.88, + "learning_rate": 2.3271785383761431e-07, + "logits/chosen": -1.6887356042861938, + "logits/rejected": -1.5238536596298218, + "logps/chosen": -398.4386291503906, + "logps/rejected": -401.5643615722656, + "loss": 0.6446, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.398753821849823, + "rewards/margins": 0.44112950563430786, + "rewards/rejected": -0.8398832082748413, + "step": 6690 + }, + { + "epoch": 0.88, + "learning_rate": 2.2792979984250978e-07, + "logits/chosen": -1.5033657550811768, + "logits/rejected": -1.4840075969696045, + "logps/chosen": -334.920166015625, + "logps/rejected": -337.3426818847656, + "loss": 0.5733, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3466881513595581, + "rewards/margins": 0.5142420530319214, + "rewards/rejected": -0.8609301447868347, + "step": 6700 + }, + { + "epoch": 0.88, + "learning_rate": 2.231891616042453e-07, + "logits/chosen": -1.5953342914581299, + "logits/rejected": -1.6823656558990479, + "logps/chosen": -302.6649169921875, + "logps/rejected": -401.78607177734375, + "loss": 0.6493, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.38987496495246887, + "rewards/margins": 0.4690842628479004, + "rewards/rejected": -0.8589591979980469, + "step": 6710 + }, + { + "epoch": 0.88, + "learning_rate": 2.1849603805504328e-07, + "logits/chosen": -1.6013914346694946, + "logits/rejected": -1.4907716512680054, + "logps/chosen": -355.11505126953125, + "logps/rejected": -369.0408630371094, + "loss": 0.5422, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4737178385257721, + "rewards/margins": 0.3923259377479553, + "rewards/rejected": -0.866043746471405, + "step": 6720 + }, + { + "epoch": 0.88, + "learning_rate": 2.1385052713554066e-07, + "logits/chosen": -1.6961807012557983, + "logits/rejected": -1.527479887008667, + "logps/chosen": -268.41705322265625, + "logps/rejected": -334.3481750488281, + "loss": 0.5633, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5642033815383911, + "rewards/margins": 0.4049958288669586, + "rewards/rejected": -0.9691991806030273, + "step": 6730 + }, + { + "epoch": 0.88, + "learning_rate": 2.0925272579274873e-07, + "logits/chosen": -1.8364747762680054, + "logits/rejected": -1.6992851495742798, + "logps/chosen": -363.69915771484375, + "logps/rejected": -387.8775329589844, + "loss": 0.5576, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3651832640171051, + "rewards/margins": 0.5085464715957642, + "rewards/rejected": -0.8737298250198364, + "step": 6740 + }, + { + "epoch": 0.88, + "learning_rate": 2.047027299780302e-07, + "logits/chosen": -1.6250356435775757, + "logits/rejected": -1.7488059997558594, + "logps/chosen": -366.2649230957031, + "logps/rejected": -401.81597900390625, + "loss": 0.5439, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.45524168014526367, + "rewards/margins": 0.5325301289558411, + "rewards/rejected": -0.9877718687057495, + "step": 6750 + }, + { + "epoch": 0.88, + "learning_rate": 2.0020063464509492e-07, + "logits/chosen": -1.7658112049102783, + "logits/rejected": -1.5300390720367432, + "logps/chosen": -301.8518981933594, + "logps/rejected": -277.26312255859375, + "loss": 0.5413, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22260832786560059, + "rewards/margins": 0.6469441056251526, + "rewards/rejected": -0.8695524334907532, + "step": 6760 + }, + { + "epoch": 0.89, + "learning_rate": 1.957465337480191e-07, + "logits/chosen": -1.3755944967269897, + "logits/rejected": -1.4968217611312866, + "logps/chosen": -290.97979736328125, + "logps/rejected": -297.4565734863281, + "loss": 0.5622, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3093852400779724, + "rewards/margins": 0.536158561706543, + "rewards/rejected": -0.8455438613891602, + "step": 6770 + }, + { + "epoch": 0.89, + "learning_rate": 1.9134052023928622e-07, + "logits/chosen": -1.5235660076141357, + "logits/rejected": -1.531207799911499, + "logps/chosen": -287.3821716308594, + "logps/rejected": -325.2586364746094, + "loss": 0.6285, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47017472982406616, + "rewards/margins": 0.40035057067871094, + "rewards/rejected": -0.8705252408981323, + "step": 6780 + }, + { + "epoch": 0.89, + "learning_rate": 1.8698268606784392e-07, + "logits/chosen": -1.5265495777130127, + "logits/rejected": -1.494541883468628, + "logps/chosen": -291.1766662597656, + "logps/rejected": -358.11322021484375, + "loss": 0.4506, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1897139549255371, + "rewards/margins": 0.714760959148407, + "rewards/rejected": -0.9044749140739441, + "step": 6790 + }, + { + "epoch": 0.89, + "learning_rate": 1.826731221771866e-07, + "logits/chosen": -1.642133355140686, + "logits/rejected": -1.729996681213379, + "logps/chosen": -378.7098693847656, + "logps/rejected": -375.87841796875, + "loss": 0.5854, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.38714170455932617, + "rewards/margins": 0.63156658411026, + "rewards/rejected": -1.0187082290649414, + "step": 6800 + }, + { + "epoch": 0.89, + "learning_rate": 1.7841191850345967e-07, + "logits/chosen": -1.6351066827774048, + "logits/rejected": -1.674530029296875, + "logps/chosen": -332.3482971191406, + "logps/rejected": -379.23907470703125, + "loss": 0.5899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.49106454849243164, + "rewards/margins": 0.4738168716430664, + "rewards/rejected": -0.964881420135498, + "step": 6810 + }, + { + "epoch": 0.89, + "learning_rate": 1.7419916397357905e-07, + "logits/chosen": -1.557543158531189, + "logits/rejected": -1.252177119255066, + "logps/chosen": -276.99005126953125, + "logps/rejected": -310.1368103027344, + "loss": 0.5307, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4249421954154968, + "rewards/margins": 0.3563283681869507, + "rewards/rejected": -0.7812705636024475, + "step": 6820 + }, + { + "epoch": 0.89, + "learning_rate": 1.700349465033782e-07, + "logits/chosen": -1.6811773777008057, + "logits/rejected": -1.598712682723999, + "logps/chosen": -363.93902587890625, + "logps/rejected": -336.55316162109375, + "loss": 0.6046, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4810393750667572, + "rewards/margins": 0.5341545939445496, + "rewards/rejected": -1.0151939392089844, + "step": 6830 + }, + { + "epoch": 0.9, + "learning_rate": 1.6591935299577227e-07, + "logits/chosen": -1.5646713972091675, + "logits/rejected": -1.5634334087371826, + "logps/chosen": -360.24652099609375, + "logps/rejected": -402.7674865722656, + "loss": 0.5407, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4407511353492737, + "rewards/margins": 0.5985373258590698, + "rewards/rejected": -1.0392884016036987, + "step": 6840 + }, + { + "epoch": 0.9, + "learning_rate": 1.6185246933894338e-07, + "logits/chosen": -1.7110605239868164, + "logits/rejected": -1.6546802520751953, + "logps/chosen": -366.2001037597656, + "logps/rejected": -404.2633361816406, + "loss": 0.5417, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5710258483886719, + "rewards/margins": 0.541045606136322, + "rewards/rejected": -1.1120713949203491, + "step": 6850 + }, + { + "epoch": 0.9, + "learning_rate": 1.5783438040455097e-07, + "logits/chosen": -1.5091670751571655, + "logits/rejected": -1.7258085012435913, + "logps/chosen": -373.1365661621094, + "logps/rejected": -347.4209899902344, + "loss": 0.5846, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.49558359384536743, + "rewards/margins": 0.2369447648525238, + "rewards/rejected": -0.7325283288955688, + "step": 6860 + }, + { + "epoch": 0.9, + "learning_rate": 1.538651700459576e-07, + "logits/chosen": -1.77816641330719, + "logits/rejected": -1.663820505142212, + "logps/chosen": -413.3456115722656, + "logps/rejected": -389.6212463378906, + "loss": 0.5605, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5972879528999329, + "rewards/margins": 0.5378345251083374, + "rewards/rejected": -1.1351226568222046, + "step": 6870 + }, + { + "epoch": 0.9, + "learning_rate": 1.4994492109648151e-07, + "logits/chosen": -1.613743543624878, + "logits/rejected": -1.3955706357955933, + "logps/chosen": -276.959228515625, + "logps/rejected": -380.3712463378906, + "loss": 0.5612, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3962731957435608, + "rewards/margins": 0.41871780157089233, + "rewards/rejected": -0.8149908781051636, + "step": 6880 + }, + { + "epoch": 0.9, + "learning_rate": 1.4607371536766695e-07, + "logits/chosen": -1.560004472732544, + "logits/rejected": -1.6445074081420898, + "logps/chosen": -298.59051513671875, + "logps/rejected": -352.2249450683594, + "loss": 0.6466, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5622106790542603, + "rewards/margins": 0.34344035387039185, + "rewards/rejected": -0.9056510925292969, + "step": 6890 + }, + { + "epoch": 0.9, + "learning_rate": 1.4225163364757655e-07, + "logits/chosen": -1.7169958353042603, + "logits/rejected": -1.6699968576431274, + "logps/chosen": -378.76513671875, + "logps/rejected": -373.4566650390625, + "loss": 0.4902, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.32254570722579956, + "rewards/margins": 0.5398565530776978, + "rewards/rejected": -0.8624023199081421, + "step": 6900 + }, + { + "epoch": 0.9, + "learning_rate": 1.3847875569910462e-07, + "logits/chosen": -1.5286998748779297, + "logits/rejected": -1.672623634338379, + "logps/chosen": -282.03466796875, + "logps/rejected": -305.9883728027344, + "loss": 0.5737, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5421910881996155, + "rewards/margins": 0.3866000175476074, + "rewards/rejected": -0.9287910461425781, + "step": 6910 + }, + { + "epoch": 0.91, + "learning_rate": 1.3475516025831552e-07, + "logits/chosen": -1.627009391784668, + "logits/rejected": -1.6533962488174438, + "logps/chosen": -255.2958526611328, + "logps/rejected": -315.05145263671875, + "loss": 0.55, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.37814396619796753, + "rewards/margins": 0.3874396085739136, + "rewards/rejected": -0.7655835747718811, + "step": 6920 + }, + { + "epoch": 0.91, + "learning_rate": 1.310809250327974e-07, + "logits/chosen": -1.5562437772750854, + "logits/rejected": -1.3938499689102173, + "logps/chosen": -274.3265075683594, + "logps/rejected": -342.0391540527344, + "loss": 0.5151, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3347598910331726, + "rewards/margins": 0.5865308046340942, + "rewards/rejected": -0.9212905764579773, + "step": 6930 + }, + { + "epoch": 0.91, + "learning_rate": 1.2745612670004153e-07, + "logits/chosen": -1.6131515502929688, + "logits/rejected": -1.4667682647705078, + "logps/chosen": -307.2747802734375, + "logps/rejected": -353.781005859375, + "loss": 0.5227, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4944078326225281, + "rewards/margins": 0.7529302835464478, + "rewards/rejected": -1.2473381757736206, + "step": 6940 + }, + { + "epoch": 0.91, + "learning_rate": 1.2388084090584395e-07, + "logits/chosen": -1.6917057037353516, + "logits/rejected": -1.5495412349700928, + "logps/chosen": -304.48956298828125, + "logps/rejected": -308.452880859375, + "loss": 0.5999, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.34110361337661743, + "rewards/margins": 0.5534747838973999, + "rewards/rejected": -0.8945783376693726, + "step": 6950 + }, + { + "epoch": 0.91, + "learning_rate": 1.2035514226272305e-07, + "logits/chosen": -1.5473452806472778, + "logits/rejected": -1.7397979497909546, + "logps/chosen": -334.78125, + "logps/rejected": -356.70611572265625, + "loss": 0.5691, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.485074907541275, + "rewards/margins": 0.33868488669395447, + "rewards/rejected": -0.8237597346305847, + "step": 6960 + }, + { + "epoch": 0.91, + "learning_rate": 1.1687910434836607e-07, + "logits/chosen": -1.311933159828186, + "logits/rejected": -1.2862112522125244, + "logps/chosen": -274.31280517578125, + "logps/rejected": -400.0562744140625, + "loss": 0.5095, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3520568013191223, + "rewards/margins": 0.6713727712631226, + "rewards/rejected": -1.0234296321868896, + "step": 6970 + }, + { + "epoch": 0.91, + "learning_rate": 1.1345279970409128e-07, + "logits/chosen": -1.8304306268692017, + "logits/rejected": -1.623238205909729, + "logps/chosen": -318.59942626953125, + "logps/rejected": -380.1607666015625, + "loss": 0.613, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4191969037055969, + "rewards/margins": 0.2825840711593628, + "rewards/rejected": -0.7017809748649597, + "step": 6980 + }, + { + "epoch": 0.91, + "learning_rate": 1.1007629983333629e-07, + "logits/chosen": -1.4548087120056152, + "logits/rejected": -1.4763644933700562, + "logps/chosen": -304.5970764160156, + "logps/rejected": -365.972900390625, + "loss": 0.4899, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2399648129940033, + "rewards/margins": 0.9897271394729614, + "rewards/rejected": -1.2296921014785767, + "step": 6990 + }, + { + "epoch": 0.92, + "learning_rate": 1.067496752001626e-07, + "logits/chosen": -1.4812662601470947, + "logits/rejected": -1.5030823945999146, + "logps/chosen": -267.0977478027344, + "logps/rejected": -352.5782775878906, + "loss": 0.5518, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.32225170731544495, + "rewards/margins": 0.5641762614250183, + "rewards/rejected": -0.8864279985427856, + "step": 7000 + }, + { + "epoch": 0.92, + "learning_rate": 1.0347299522778909e-07, + "logits/chosen": -1.5863200426101685, + "logits/rejected": -1.6913690567016602, + "logps/chosen": -290.74163818359375, + "logps/rejected": -334.97772216796875, + "loss": 0.6684, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5021691918373108, + "rewards/margins": 0.25347837805747986, + "rewards/rejected": -0.7556475400924683, + "step": 7010 + }, + { + "epoch": 0.92, + "learning_rate": 1.0024632829713971e-07, + "logits/chosen": -1.70602548122406, + "logits/rejected": -1.567915678024292, + "logps/chosen": -265.72235107421875, + "logps/rejected": -366.5525207519531, + "loss": 0.5519, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2836194932460785, + "rewards/margins": 0.5175027251243591, + "rewards/rejected": -0.8011223077774048, + "step": 7020 + }, + { + "epoch": 0.92, + "learning_rate": 9.706974174541889e-08, + "logits/chosen": -1.7524486780166626, + "logits/rejected": -1.5641875267028809, + "logps/chosen": -351.00909423828125, + "logps/rejected": -349.38421630859375, + "loss": 0.5617, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4362090528011322, + "rewards/margins": 0.3804861307144165, + "rewards/rejected": -0.8166952133178711, + "step": 7030 + }, + { + "epoch": 0.92, + "learning_rate": 9.39433018647043e-08, + "logits/chosen": -1.8955605030059814, + "logits/rejected": -1.4509342908859253, + "logps/chosen": -305.69818115234375, + "logps/rejected": -318.576171875, + "loss": 0.5621, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38727378845214844, + "rewards/margins": 0.5361100435256958, + "rewards/rejected": -0.923383891582489, + "step": 7040 + }, + { + "epoch": 0.92, + "learning_rate": 9.086707390056543e-08, + "logits/chosen": -1.8034330606460571, + "logits/rejected": -1.7121871709823608, + "logps/chosen": -342.47088623046875, + "logps/rejected": -384.3644104003906, + "loss": 0.6432, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5218561887741089, + "rewards/margins": 0.3391360640525818, + "rewards/rejected": -0.8609922528266907, + "step": 7050 + }, + { + "epoch": 0.92, + "learning_rate": 8.784112205070083e-08, + "logits/chosen": -1.7136493921279907, + "logits/rejected": -1.817949652671814, + "logps/chosen": -290.65972900390625, + "logps/rejected": -341.6028137207031, + "loss": 0.5079, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20523662865161896, + "rewards/margins": 0.6784642338752747, + "rewards/rejected": -0.883700966835022, + "step": 7060 + }, + { + "epoch": 0.93, + "learning_rate": 8.486550946359779e-08, + "logits/chosen": -1.6075270175933838, + "logits/rejected": -1.669390082359314, + "logps/chosen": -307.3182067871094, + "logps/rejected": -315.9342956542969, + "loss": 0.6333, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47917962074279785, + "rewards/margins": 0.23002251982688904, + "rewards/rejected": -0.7092021107673645, + "step": 7070 + }, + { + "epoch": 0.93, + "learning_rate": 8.194029823721556e-08, + "logits/chosen": -1.6609010696411133, + "logits/rejected": -1.722688913345337, + "logps/chosen": -355.6262512207031, + "logps/rejected": -353.5245361328125, + "loss": 0.653, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4032622277736664, + "rewards/margins": 0.32975420355796814, + "rewards/rejected": -0.7330164909362793, + "step": 7080 + }, + { + "epoch": 0.93, + "learning_rate": 7.906554941768896e-08, + "logits/chosen": -1.6966356039047241, + "logits/rejected": -1.3942993879318237, + "logps/chosen": -285.47344970703125, + "logps/rejected": -364.02099609375, + "loss": 0.5669, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.26427119970321655, + "rewards/margins": 0.47898778319358826, + "rewards/rejected": -0.7432589530944824, + "step": 7090 + }, + { + "epoch": 0.93, + "learning_rate": 7.624132299805575e-08, + "logits/chosen": -1.848637342453003, + "logits/rejected": -1.7091423273086548, + "logps/chosen": -331.6549072265625, + "logps/rejected": -381.8346252441406, + "loss": 0.4888, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2276802957057953, + "rewards/margins": 0.8029883503913879, + "rewards/rejected": -1.0306686162948608, + "step": 7100 + }, + { + "epoch": 0.93, + "learning_rate": 7.346767791700127e-08, + "logits/chosen": -1.4324672222137451, + "logits/rejected": -1.3924691677093506, + "logps/chosen": -313.3305358886719, + "logps/rejected": -351.5447692871094, + "loss": 0.5605, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4062958359718323, + "rewards/margins": 0.5314801335334778, + "rewards/rejected": -0.9377759695053101, + "step": 7110 + }, + { + "epoch": 0.93, + "learning_rate": 7.07446720576327e-08, + "logits/chosen": -1.6411396265029907, + "logits/rejected": -1.5413631200790405, + "logps/chosen": -345.20513916015625, + "logps/rejected": -364.2772521972656, + "loss": 0.5775, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4215497374534607, + "rewards/margins": 0.5629733204841614, + "rewards/rejected": -0.9845231175422668, + "step": 7120 + }, + { + "epoch": 0.93, + "learning_rate": 6.807236224626701e-08, + "logits/chosen": -1.4900765419006348, + "logits/rejected": -1.2240946292877197, + "logps/chosen": -273.09796142578125, + "logps/rejected": -317.6832580566406, + "loss": 0.585, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4732946455478668, + "rewards/margins": 0.2619306147098541, + "rewards/rejected": -0.7352252006530762, + "step": 7130 + }, + { + "epoch": 0.93, + "learning_rate": 6.545080425124888e-08, + "logits/chosen": -1.5523796081542969, + "logits/rejected": -1.6293872594833374, + "logps/chosen": -302.70318603515625, + "logps/rejected": -325.1712646484375, + "loss": 0.6416, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4694761335849762, + "rewards/margins": 0.5290634036064148, + "rewards/rejected": -0.9985395669937134, + "step": 7140 + }, + { + "epoch": 0.94, + "learning_rate": 6.288005278178382e-08, + "logits/chosen": -1.9297126531600952, + "logits/rejected": -1.798872709274292, + "logps/chosen": -315.6032409667969, + "logps/rejected": -339.0154724121094, + "loss": 0.5691, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.24254803359508514, + "rewards/margins": 0.5197073221206665, + "rewards/rejected": -0.7622553110122681, + "step": 7150 + }, + { + "epoch": 0.94, + "learning_rate": 6.036016148679825e-08, + "logits/chosen": -1.5070799589157104, + "logits/rejected": -1.504810094833374, + "logps/chosen": -321.612060546875, + "logps/rejected": -363.10546875, + "loss": 0.5456, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4957321286201477, + "rewards/margins": 0.47185593843460083, + "rewards/rejected": -0.9675882458686829, + "step": 7160 + }, + { + "epoch": 0.94, + "learning_rate": 5.7891182953819235e-08, + "logits/chosen": -1.494229793548584, + "logits/rejected": -1.5541422367095947, + "logps/chosen": -338.26751708984375, + "logps/rejected": -339.1259765625, + "loss": 0.6236, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5544977784156799, + "rewards/margins": 0.20619161427021027, + "rewards/rejected": -0.760689377784729, + "step": 7170 + }, + { + "epoch": 0.94, + "learning_rate": 5.547316870787689e-08, + "logits/chosen": -1.687003493309021, + "logits/rejected": -1.7221145629882812, + "logps/chosen": -360.98309326171875, + "logps/rejected": -369.8673095703125, + "loss": 0.6123, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.38970285654067993, + "rewards/margins": 0.45449987053871155, + "rewards/rejected": -0.8442028164863586, + "step": 7180 + }, + { + "epoch": 0.94, + "learning_rate": 5.310616921042927e-08, + "logits/chosen": -1.4303717613220215, + "logits/rejected": -1.5265449285507202, + "logps/chosen": -295.11077880859375, + "logps/rejected": -288.38470458984375, + "loss": 0.5451, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3154394328594208, + "rewards/margins": 0.49589186906814575, + "rewards/rejected": -0.8113313913345337, + "step": 7190 + }, + { + "epoch": 0.94, + "learning_rate": 5.079023385830939e-08, + "logits/chosen": -1.6070473194122314, + "logits/rejected": -1.632866621017456, + "logps/chosen": -323.0823974609375, + "logps/rejected": -323.40765380859375, + "loss": 0.574, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3265395164489746, + "rewards/margins": 0.4337926506996155, + "rewards/rejected": -0.7603321671485901, + "step": 7200 + }, + { + "epoch": 0.94, + "learning_rate": 4.8525410982695476e-08, + "logits/chosen": -1.8001930713653564, + "logits/rejected": -1.4306509494781494, + "logps/chosen": -387.4703674316406, + "logps/rejected": -377.4649353027344, + "loss": 0.583, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4647182822227478, + "rewards/margins": 0.456787109375, + "rewards/rejected": -0.9215054512023926, + "step": 7210 + }, + { + "epoch": 0.94, + "learning_rate": 4.6311747848099e-08, + "logits/chosen": -1.9012269973754883, + "logits/rejected": -1.4623916149139404, + "logps/chosen": -340.5151672363281, + "logps/rejected": -369.2735290527344, + "loss": 0.5178, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.29656848311424255, + "rewards/margins": 0.6395286917686462, + "rewards/rejected": -0.936097264289856, + "step": 7220 + }, + { + "epoch": 0.95, + "learning_rate": 4.4149290651382405e-08, + "logits/chosen": -1.5386755466461182, + "logits/rejected": -1.789312720298767, + "logps/chosen": -267.41357421875, + "logps/rejected": -372.3899230957031, + "loss": 0.5163, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.40368080139160156, + "rewards/margins": 0.702645480632782, + "rewards/rejected": -1.1063263416290283, + "step": 7230 + }, + { + "epoch": 0.95, + "learning_rate": 4.203808452079211e-08, + "logits/chosen": -1.474572777748108, + "logits/rejected": -1.540837049484253, + "logps/chosen": -253.24990844726562, + "logps/rejected": -291.01092529296875, + "loss": 0.5515, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4242551922798157, + "rewards/margins": 0.3596297800540924, + "rewards/rejected": -0.7838850021362305, + "step": 7240 + }, + { + "epoch": 0.95, + "learning_rate": 3.9978173515018427e-08, + "logits/chosen": -1.769574761390686, + "logits/rejected": -1.5908442735671997, + "logps/chosen": -328.3680114746094, + "logps/rejected": -364.4908142089844, + "loss": 0.6279, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.545204758644104, + "rewards/margins": 0.28715890645980835, + "rewards/rejected": -0.8323636054992676, + "step": 7250 + }, + { + "epoch": 0.95, + "learning_rate": 3.7969600622274614e-08, + "logits/chosen": -1.5526801347732544, + "logits/rejected": -1.3907220363616943, + "logps/chosen": -396.95635986328125, + "logps/rejected": -361.26739501953125, + "loss": 0.5034, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4022447466850281, + "rewards/margins": 0.6844984292984009, + "rewards/rejected": -1.0867431163787842, + "step": 7260 + }, + { + "epoch": 0.95, + "learning_rate": 3.601240775940151e-08, + "logits/chosen": -1.929166555404663, + "logits/rejected": -1.719903588294983, + "logps/chosen": -407.12506103515625, + "logps/rejected": -418.3841857910156, + "loss": 0.5749, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3601692318916321, + "rewards/margins": 0.6154331564903259, + "rewards/rejected": -0.9756024479866028, + "step": 7270 + }, + { + "epoch": 0.95, + "learning_rate": 3.410663577099071e-08, + "logits/chosen": -1.4150351285934448, + "logits/rejected": -1.2052710056304932, + "logps/chosen": -317.63018798828125, + "logps/rejected": -355.95574951171875, + "loss": 0.5317, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.31728240847587585, + "rewards/margins": 0.6196717023849487, + "rewards/rejected": -0.936954140663147, + "step": 7280 + }, + { + "epoch": 0.95, + "learning_rate": 3.2252324428534986e-08, + "logits/chosen": -1.6764023303985596, + "logits/rejected": -1.2567373514175415, + "logps/chosen": -308.662353515625, + "logps/rejected": -343.0526428222656, + "loss": 0.5146, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3319694399833679, + "rewards/margins": 0.7371099591255188, + "rewards/rejected": -1.0690793991088867, + "step": 7290 + }, + { + "epoch": 0.96, + "learning_rate": 3.0449512429594486e-08, + "logits/chosen": -1.783299446105957, + "logits/rejected": -1.506151556968689, + "logps/chosen": -346.70037841796875, + "logps/rejected": -396.6324768066406, + "loss": 0.5199, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3853856921195984, + "rewards/margins": 0.5645334124565125, + "rewards/rejected": -0.9499190449714661, + "step": 7300 + }, + { + "epoch": 0.96, + "learning_rate": 2.8698237396992956e-08, + "logits/chosen": -1.536474347114563, + "logits/rejected": -1.4593311548233032, + "logps/chosen": -291.959228515625, + "logps/rejected": -332.8354187011719, + "loss": 0.595, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.42144399881362915, + "rewards/margins": 0.644050121307373, + "rewards/rejected": -1.065494179725647, + "step": 7310 + }, + { + "epoch": 0.96, + "learning_rate": 2.6998535878030584e-08, + "logits/chosen": -1.6568374633789062, + "logits/rejected": -1.3942255973815918, + "logps/chosen": -336.87506103515625, + "logps/rejected": -322.36883544921875, + "loss": 0.5506, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.42959165573120117, + "rewards/margins": 0.485457181930542, + "rewards/rejected": -0.9150488972663879, + "step": 7320 + }, + { + "epoch": 0.96, + "learning_rate": 2.535044334372072e-08, + "logits/chosen": -1.3848296403884888, + "logits/rejected": -1.67227303981781, + "logps/chosen": -346.5140686035156, + "logps/rejected": -391.8655090332031, + "loss": 0.5347, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5090900659561157, + "rewards/margins": 0.6221020817756653, + "rewards/rejected": -1.1311920881271362, + "step": 7330 + }, + { + "epoch": 0.96, + "learning_rate": 2.3753994188051853e-08, + "logits/chosen": -1.2568069696426392, + "logits/rejected": -1.3741756677627563, + "logps/chosen": -344.8403015136719, + "logps/rejected": -384.7103576660156, + "loss": 0.5564, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4671897292137146, + "rewards/margins": 0.6004025340080261, + "rewards/rejected": -1.0675921440124512, + "step": 7340 + }, + { + "epoch": 0.96, + "learning_rate": 2.220922172726764e-08, + "logits/chosen": -1.6049737930297852, + "logits/rejected": -1.5539507865905762, + "logps/chosen": -322.817626953125, + "logps/rejected": -311.04742431640625, + "loss": 0.5911, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4312373101711273, + "rewards/margins": 0.29300612211227417, + "rewards/rejected": -0.7242434620857239, + "step": 7350 + }, + { + "epoch": 0.96, + "learning_rate": 2.071615819917244e-08, + "logits/chosen": -1.5915446281433105, + "logits/rejected": -1.6095921993255615, + "logps/chosen": -305.8124694824219, + "logps/rejected": -345.87445068359375, + "loss": 0.6088, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.318966805934906, + "rewards/margins": 0.5085685849189758, + "rewards/rejected": -0.8275354504585266, + "step": 7360 + }, + { + "epoch": 0.96, + "learning_rate": 1.9274834762459393e-08, + "logits/chosen": -1.3576655387878418, + "logits/rejected": -1.6684898138046265, + "logps/chosen": -383.6067810058594, + "logps/rejected": -348.5184020996094, + "loss": 0.5855, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5847973227500916, + "rewards/margins": 0.2677474021911621, + "rewards/rejected": -0.8525447845458984, + "step": 7370 + }, + { + "epoch": 0.97, + "learning_rate": 1.7885281496058947e-08, + "logits/chosen": -1.6968488693237305, + "logits/rejected": -1.62750244140625, + "logps/chosen": -303.8607482910156, + "logps/rejected": -316.8861389160156, + "loss": 0.569, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.14647522568702698, + "rewards/margins": 0.46435341238975525, + "rewards/rejected": -0.610828697681427, + "step": 7380 + }, + { + "epoch": 0.97, + "learning_rate": 1.654752739851134e-08, + "logits/chosen": -1.392565131187439, + "logits/rejected": -1.6380723714828491, + "logps/chosen": -265.1003112792969, + "logps/rejected": -347.75311279296875, + "loss": 0.4965, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4391438066959381, + "rewards/margins": 0.5243684649467468, + "rewards/rejected": -0.9635123014450073, + "step": 7390 + }, + { + "epoch": 0.97, + "learning_rate": 1.526160038736235e-08, + "logits/chosen": -1.5403153896331787, + "logits/rejected": -1.464784860610962, + "logps/chosen": -344.7210693359375, + "logps/rejected": -336.9613037109375, + "loss": 0.5732, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.48358240723609924, + "rewards/margins": 0.5803691148757935, + "rewards/rejected": -1.0639514923095703, + "step": 7400 + }, + { + "epoch": 0.97, + "learning_rate": 1.402752729857959e-08, + "logits/chosen": -1.4535226821899414, + "logits/rejected": -1.5209060907363892, + "logps/chosen": -287.3475036621094, + "logps/rejected": -312.4744873046875, + "loss": 0.5826, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.47510308027267456, + "rewards/margins": 0.2499203383922577, + "rewards/rejected": -0.7250233888626099, + "step": 7410 + }, + { + "epoch": 0.97, + "learning_rate": 1.2845333885992683e-08, + "logits/chosen": -1.440726399421692, + "logits/rejected": -1.6101270914077759, + "logps/chosen": -295.6780090332031, + "logps/rejected": -341.66168212890625, + "loss": 0.4965, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.29168501496315, + "rewards/margins": 0.7195547819137573, + "rewards/rejected": -1.0112398862838745, + "step": 7420 + }, + { + "epoch": 0.97, + "learning_rate": 1.171504482075675e-08, + "logits/chosen": -1.5670578479766846, + "logits/rejected": -1.5231409072875977, + "logps/chosen": -299.3634338378906, + "logps/rejected": -336.41046142578125, + "loss": 0.5166, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3135831654071808, + "rewards/margins": 0.6909624338150024, + "rewards/rejected": -1.0045455694198608, + "step": 7430 + }, + { + "epoch": 0.97, + "learning_rate": 1.0636683690836147e-08, + "logits/chosen": -1.5388962030410767, + "logits/rejected": -1.49226975440979, + "logps/chosen": -286.24981689453125, + "logps/rejected": -295.71099853515625, + "loss": 0.5434, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.34777525067329407, + "rewards/margins": 0.4854651093482971, + "rewards/rejected": -0.8332403302192688, + "step": 7440 + }, + { + "epoch": 0.97, + "learning_rate": 9.610273000513203e-09, + "logits/chosen": -1.6899350881576538, + "logits/rejected": -1.6042506694793701, + "logps/chosen": -323.87164306640625, + "logps/rejected": -427.4601135253906, + "loss": 0.5415, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4221875071525574, + "rewards/margins": 0.6695303916931152, + "rewards/rejected": -1.0917179584503174, + "step": 7450 + }, + { + "epoch": 0.98, + "learning_rate": 8.635834169918312e-09, + "logits/chosen": -1.6366380453109741, + "logits/rejected": -1.7314704656600952, + "logps/chosen": -346.11444091796875, + "logps/rejected": -341.15435791015625, + "loss": 0.598, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.608862042427063, + "rewards/margins": 0.23399853706359863, + "rewards/rejected": -0.8428605794906616, + "step": 7460 + }, + { + "epoch": 0.98, + "learning_rate": 7.713387534582506e-09, + "logits/chosen": -1.5489892959594727, + "logits/rejected": -1.8460471630096436, + "logps/chosen": -381.9287109375, + "logps/rejected": -415.3206481933594, + "loss": 0.5759, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5151020288467407, + "rewards/margins": 0.620938777923584, + "rewards/rejected": -1.1360408067703247, + "step": 7470 + }, + { + "epoch": 0.98, + "learning_rate": 6.84295234501392e-09, + "logits/chosen": -1.4336878061294556, + "logits/rejected": -1.4673105478286743, + "logps/chosen": -311.045166015625, + "logps/rejected": -328.20208740234375, + "loss": 0.5102, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3480697274208069, + "rewards/margins": 0.9048670530319214, + "rewards/rejected": -1.252936601638794, + "step": 7480 + }, + { + "epoch": 0.98, + "learning_rate": 6.024546766295325e-09, + "logits/chosen": -1.7255401611328125, + "logits/rejected": -1.5067691802978516, + "logps/chosen": -316.95538330078125, + "logps/rejected": -349.18310546875, + "loss": 0.5016, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.29429227113723755, + "rewards/margins": 0.6684979200363159, + "rewards/rejected": -0.962790310382843, + "step": 7490 + }, + { + "epoch": 0.98, + "learning_rate": 5.2581878777049895e-09, + "logits/chosen": -1.5067514181137085, + "logits/rejected": -1.4448752403259277, + "logps/chosen": -283.251220703125, + "logps/rejected": -307.5720520019531, + "loss": 0.5516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3607473075389862, + "rewards/margins": 0.5495454668998718, + "rewards/rejected": -0.9102927446365356, + "step": 7500 + }, + { + "epoch": 0.98, + "learning_rate": 4.543891672361411e-09, + "logits/chosen": -1.6705787181854248, + "logits/rejected": -1.6046011447906494, + "logps/chosen": -320.6553039550781, + "logps/rejected": -366.6043395996094, + "loss": 0.5602, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3309764266014099, + "rewards/margins": 0.5032334923744202, + "rewards/rejected": -0.8342097997665405, + "step": 7510 + }, + { + "epoch": 0.98, + "learning_rate": 3.881673056887747e-09, + "logits/chosen": -1.4984352588653564, + "logits/rejected": -1.6474872827529907, + "logps/chosen": -259.31341552734375, + "logps/rejected": -301.51605224609375, + "loss": 0.5359, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.39567244052886963, + "rewards/margins": 0.5540647506713867, + "rewards/rejected": -0.9497373700141907, + "step": 7520 + }, + { + "epoch": 0.99, + "learning_rate": 3.2715458511023425e-09, + "logits/chosen": -1.481465220451355, + "logits/rejected": -1.4603790044784546, + "logps/chosen": -298.96112060546875, + "logps/rejected": -338.4877014160156, + "loss": 0.5098, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3760393261909485, + "rewards/margins": 0.4767029285430908, + "rewards/rejected": -0.8527423143386841, + "step": 7530 + }, + { + "epoch": 0.99, + "learning_rate": 2.7135227877289617e-09, + "logits/chosen": -1.2965840101242065, + "logits/rejected": -1.5486103296279907, + "logps/chosen": -279.961181640625, + "logps/rejected": -380.9527282714844, + "loss": 0.5766, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4916453957557678, + "rewards/margins": 0.2411082237958908, + "rewards/rejected": -0.732753574848175, + "step": 7540 + }, + { + "epoch": 0.99, + "learning_rate": 2.2076155121328326e-09, + "logits/chosen": -1.6218845844268799, + "logits/rejected": -1.6690431833267212, + "logps/chosen": -352.5205078125, + "logps/rejected": -405.63385009765625, + "loss": 0.5374, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3985586166381836, + "rewards/margins": 0.48652762174606323, + "rewards/rejected": -0.8850862383842468, + "step": 7550 + }, + { + "epoch": 0.99, + "learning_rate": 1.7538345820755641e-09, + "logits/chosen": -1.4722051620483398, + "logits/rejected": -1.327138900756836, + "logps/chosen": -295.45318603515625, + "logps/rejected": -358.9852600097656, + "loss": 0.6031, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5219311118125916, + "rewards/margins": 0.34917739033699036, + "rewards/rejected": -0.8711085319519043, + "step": 7560 + }, + { + "epoch": 0.99, + "learning_rate": 1.3521894674961567e-09, + "logits/chosen": -1.7795778512954712, + "logits/rejected": -1.587660551071167, + "logps/chosen": -288.39691162109375, + "logps/rejected": -340.3475036621094, + "loss": 0.5372, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.40456143021583557, + "rewards/margins": 0.5181632041931152, + "rewards/rejected": -0.9227245450019836, + "step": 7570 + }, + { + "epoch": 0.99, + "learning_rate": 1.0026885503131023e-09, + "logits/chosen": -1.6608966588974, + "logits/rejected": -1.5927197933197021, + "logps/chosen": -286.88323974609375, + "logps/rejected": -334.52569580078125, + "loss": 0.6633, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2914097309112549, + "rewards/margins": 0.5180963277816772, + "rewards/rejected": -0.8095060586929321, + "step": 7580 + }, + { + "epoch": 0.99, + "learning_rate": 7.053391242492491e-10, + "logits/chosen": -1.3557955026626587, + "logits/rejected": -1.2569801807403564, + "logps/chosen": -306.7195739746094, + "logps/rejected": -337.4743957519531, + "loss": 0.5532, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4369729459285736, + "rewards/margins": 0.4693797528743744, + "rewards/rejected": -0.906352698802948, + "step": 7590 + }, + { + "epoch": 0.99, + "learning_rate": 4.6014739467997725e-10, + "logits/chosen": -1.6417070627212524, + "logits/rejected": -1.3917691707611084, + "logps/chosen": -304.7096252441406, + "logps/rejected": -377.54290771484375, + "loss": 0.5922, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.42271485924720764, + "rewards/margins": 0.6610064506530762, + "rewards/rejected": -1.083721399307251, + "step": 7600 + }, + { + "epoch": 1.0, + "learning_rate": 2.671184785033032e-10, + "logits/chosen": -1.8365352153778076, + "logits/rejected": -1.7656691074371338, + "logps/chosen": -367.3888244628906, + "logps/rejected": -384.15850830078125, + "loss": 0.6426, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4984728693962097, + "rewards/margins": 0.37515193223953247, + "rewards/rejected": -0.8736248016357422, + "step": 7610 + }, + { + "epoch": 1.0, + "learning_rate": 1.2625640403302054e-10, + "logits/chosen": -1.3983865976333618, + "logits/rejected": -1.320116400718689, + "logps/chosen": -274.30023193359375, + "logps/rejected": -357.8987731933594, + "loss": 0.5182, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.35549062490463257, + "rewards/margins": 0.6714946627616882, + "rewards/rejected": -1.0269852876663208, + "step": 7620 + }, + { + "epoch": 1.0, + "learning_rate": 3.756411091515588e-11, + "logits/chosen": -1.7367241382598877, + "logits/rejected": -1.7251498699188232, + "logps/chosen": -366.3235168457031, + "logps/rejected": -350.53240966796875, + "loss": 0.5732, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31917113065719604, + "rewards/margins": 0.5942750573158264, + "rewards/rejected": -0.9134461283683777, + "step": 7630 + }, + { + "epoch": 1.0, + "learning_rate": 1.0434500657963143e-12, + "logits/chosen": -1.8501018285751343, + "logits/rejected": -1.6034660339355469, + "logps/chosen": -341.7672424316406, + "logps/rejected": -369.6858825683594, + "loss": 0.5452, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4152866303920746, + "rewards/margins": 0.4775542616844177, + "rewards/rejected": -0.8928408622741699, + "step": 7640 + }, + { + "epoch": 1.0, + "step": 7642, + "total_flos": 0.0, + "train_loss": 0.589801928247341, + "train_runtime": 31982.1679, + "train_samples_per_second": 1.912, + "train_steps_per_second": 0.239 + } + ], + "logging_steps": 10, + "max_steps": 7642, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}