diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10280 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.998972954467648, + "eval_steps": 100, + "global_step": 6570, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.5662100456621e-10, + "logits/chosen": -0.4019157290458679, + "logits/rejected": -0.4092532694339752, + "logps/chosen": -116.08753204345703, + "logps/rejected": -122.52903747558594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 4.5662100456621e-09, + "logits/chosen": -0.4602813720703125, + "logits/rejected": -0.5173085927963257, + "logps/chosen": -126.46894073486328, + "logps/rejected": -92.41258239746094, + "loss": 0.8724, + "rewards/accuracies": 0.5694444179534912, + "rewards/chosen": 0.33254384994506836, + "rewards/margins": 0.4581206142902374, + "rewards/rejected": -0.12557676434516907, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 9.1324200913242e-09, + "logits/chosen": -0.4416370391845703, + "logits/rejected": -0.49417153000831604, + "logps/chosen": -127.47574615478516, + "logps/rejected": -95.31398010253906, + "loss": 0.945, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.15608355402946472, + "rewards/margins": 0.04934122413396835, + "rewards/rejected": 0.10674233734607697, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 1.36986301369863e-08, + "logits/chosen": -0.4033452570438385, + "logits/rejected": -0.4765704572200775, + "logps/chosen": -129.85809326171875, + "logps/rejected": -93.86156463623047, + "loss": 0.9347, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.02109346352517605, + "rewards/margins": 0.07605800777673721, + "rewards/rejected": -0.054964542388916016, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 1.82648401826484e-08, + "logits/chosen": -0.43521660566329956, + "logits/rejected": -0.4863054156303406, + "logps/chosen": -134.07125854492188, + "logps/rejected": -95.79774475097656, + "loss": 0.9597, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.2760757505893707, + "rewards/margins": -0.2107580155134201, + "rewards/rejected": -0.06531772762537003, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 2.28310502283105e-08, + "logits/chosen": -0.4325088858604431, + "logits/rejected": -0.48078474402427673, + "logps/chosen": -114.5306625366211, + "logps/rejected": -87.7462158203125, + "loss": 0.9379, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.11705958843231201, + "rewards/margins": -0.04724755138158798, + "rewards/rejected": 0.1643071472644806, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 2.73972602739726e-08, + "logits/chosen": -0.42710214853286743, + "logits/rejected": -0.49376431107521057, + "logps/chosen": -125.63948822021484, + "logps/rejected": -95.20314025878906, + "loss": 0.8973, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.12470320612192154, + "rewards/margins": -0.04657207801938057, + "rewards/rejected": -0.07813112437725067, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 3.19634703196347e-08, + "logits/chosen": -0.4520903527736664, + "logits/rejected": -0.49421629309654236, + "logps/chosen": -128.04258728027344, + "logps/rejected": -95.23574829101562, + "loss": 0.9252, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.27560052275657654, + "rewards/margins": 0.32960376143455505, + "rewards/rejected": -0.054003216326236725, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 3.65296803652968e-08, + "logits/chosen": -0.4498293995857239, + "logits/rejected": -0.49676957726478577, + "logps/chosen": -122.39030456542969, + "logps/rejected": -91.36656188964844, + "loss": 0.9493, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.04946544021368027, + "rewards/margins": -0.14646394550800323, + "rewards/rejected": 0.1959293782711029, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.10958904109589e-08, + "logits/chosen": -0.4375010132789612, + "logits/rejected": -0.4830726981163025, + "logps/chosen": -116.85307312011719, + "logps/rejected": -93.01774597167969, + "loss": 0.8934, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.009796512313187122, + "rewards/margins": 0.009660542011260986, + "rewards/rejected": -0.019457053393125534, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.5662100456621e-08, + "logits/chosen": -0.4444386959075928, + "logits/rejected": -0.5023002028465271, + "logps/chosen": -127.2782211303711, + "logps/rejected": -93.0546646118164, + "loss": 0.8825, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.14289195835590363, + "rewards/margins": 0.13167095184326172, + "rewards/rejected": 0.011221003718674183, + "step": 100 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -0.4521982967853546, + "eval_logits/rejected": -0.5016786456108093, + "eval_logps/chosen": -122.78299713134766, + "eval_logps/rejected": -90.6888656616211, + "eval_loss": 0.8872030973434448, + "eval_rewards/accuracies": 0.505586564540863, + "eval_rewards/chosen": 0.18837407231330872, + "eval_rewards/margins": 0.06797005981206894, + "eval_rewards/rejected": 0.12040401250123978, + "eval_runtime": 913.9439, + "eval_samples_per_second": 3.131, + "eval_steps_per_second": 0.196, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 5.02283105022831e-08, + "logits/chosen": -0.4448448121547699, + "logits/rejected": -0.4973272383213043, + "logps/chosen": -129.8375701904297, + "logps/rejected": -94.05601501464844, + "loss": 0.9452, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.08486747741699219, + "rewards/margins": 0.03504283353686333, + "rewards/rejected": 0.04982464388012886, + "step": 110 + }, + { + "epoch": 0.05, + "learning_rate": 5.47945205479452e-08, + "logits/chosen": -0.4298163950443268, + "logits/rejected": -0.48838791251182556, + "logps/chosen": -130.39566040039062, + "logps/rejected": -91.33364868164062, + "loss": 0.88, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.10634370148181915, + "rewards/margins": 0.24824686348438263, + "rewards/rejected": -0.14190316200256348, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 5.93607305936073e-08, + "logits/chosen": -0.4313521385192871, + "logits/rejected": -0.4968926012516022, + "logps/chosen": -133.50741577148438, + "logps/rejected": -91.91551208496094, + "loss": 0.8437, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.014166712760925293, + "rewards/margins": 0.0034161806106567383, + "rewards/rejected": -0.01758289337158203, + "step": 130 + }, + { + "epoch": 0.06, + "learning_rate": 6.39269406392694e-08, + "logits/chosen": -0.4153892993927002, + "logits/rejected": -0.4767012596130371, + "logps/chosen": -131.85374450683594, + "logps/rejected": -96.32195281982422, + "loss": 0.8648, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.16102957725524902, + "rewards/margins": 0.05374450609087944, + "rewards/rejected": 0.10728506743907928, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 6.84931506849315e-08, + "logits/chosen": -0.44299325346946716, + "logits/rejected": -0.5104792714118958, + "logps/chosen": -125.35621643066406, + "logps/rejected": -89.10494232177734, + "loss": 0.9226, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.062367748469114304, + "rewards/margins": 0.08922012150287628, + "rewards/rejected": -0.02685236930847168, + "step": 150 + }, + { + "epoch": 0.07, + "learning_rate": 7.30593607305936e-08, + "logits/chosen": -0.41546908020973206, + "logits/rejected": -0.4675370752811432, + "logps/chosen": -124.44963073730469, + "logps/rejected": -97.21520233154297, + "loss": 0.8653, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.1597222536802292, + "rewards/margins": 0.09017050266265869, + "rewards/rejected": 0.0695517510175705, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 7.76255707762557e-08, + "logits/chosen": -0.420665442943573, + "logits/rejected": -0.4751351475715637, + "logps/chosen": -128.25018310546875, + "logps/rejected": -93.02848052978516, + "loss": 0.8364, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.15812595188617706, + "rewards/margins": 0.24063143134117126, + "rewards/rejected": -0.0825054869055748, + "step": 170 + }, + { + "epoch": 0.08, + "learning_rate": 8.21917808219178e-08, + "logits/chosen": -0.43356451392173767, + "logits/rejected": -0.5012267827987671, + "logps/chosen": -134.00485229492188, + "logps/rejected": -91.32147979736328, + "loss": 0.8665, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.2619454562664032, + "rewards/margins": 0.23557980358600616, + "rewards/rejected": 0.02636566199362278, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 8.67579908675799e-08, + "logits/chosen": -0.45523887872695923, + "logits/rejected": -0.5100681185722351, + "logps/chosen": -125.0759506225586, + "logps/rejected": -89.36471557617188, + "loss": 0.846, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.24546337127685547, + "rewards/margins": 0.27231401205062866, + "rewards/rejected": -0.02685065194964409, + "step": 190 + }, + { + "epoch": 0.09, + "learning_rate": 9.1324200913242e-08, + "logits/chosen": -0.43358325958251953, + "logits/rejected": -0.488663911819458, + "logps/chosen": -115.94734191894531, + "logps/rejected": -88.48348236083984, + "loss": 0.9136, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.04173297807574272, + "rewards/margins": -0.21215708553791046, + "rewards/rejected": 0.17042410373687744, + "step": 200 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -0.44474586844444275, + "eval_logits/rejected": -0.49599677324295044, + "eval_logps/chosen": -122.50910186767578, + "eval_logps/rejected": -90.78695678710938, + "eval_loss": 0.832494854927063, + "eval_rewards/accuracies": 0.589385449886322, + "eval_rewards/chosen": 0.32531994581222534, + "eval_rewards/margins": 0.2539590895175934, + "eval_rewards/rejected": 0.07136084139347076, + "eval_runtime": 887.9753, + "eval_samples_per_second": 3.223, + "eval_steps_per_second": 0.202, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 9.58904109589041e-08, + "logits/chosen": -0.44061392545700073, + "logits/rejected": -0.48843497037887573, + "logps/chosen": -121.23575592041016, + "logps/rejected": -91.54434204101562, + "loss": 0.88, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.15113969147205353, + "rewards/margins": 0.015409660525619984, + "rewards/rejected": 0.13573002815246582, + "step": 210 + }, + { + "epoch": 0.1, + "learning_rate": 1.004566210045662e-07, + "logits/chosen": -0.44614124298095703, + "logits/rejected": -0.48937082290649414, + "logps/chosen": -115.96971130371094, + "logps/rejected": -90.40010833740234, + "loss": 0.9042, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.017821501940488815, + "rewards/margins": -0.00064764020498842, + "rewards/rejected": 0.01846914365887642, + "step": 220 + }, + { + "epoch": 0.1, + "learning_rate": 1.050228310502283e-07, + "logits/chosen": -0.40590643882751465, + "logits/rejected": -0.4714388847351074, + "logps/chosen": -135.4023895263672, + "logps/rejected": -95.57572937011719, + "loss": 0.8178, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.42753997445106506, + "rewards/margins": 0.28569334745407104, + "rewards/rejected": 0.14184658229351044, + "step": 230 + }, + { + "epoch": 0.11, + "learning_rate": 1.095890410958904e-07, + "logits/chosen": -0.44258102774620056, + "logits/rejected": -0.4934941232204437, + "logps/chosen": -124.71822357177734, + "logps/rejected": -90.02816009521484, + "loss": 0.8509, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17398934066295624, + "rewards/margins": 0.06253180652856827, + "rewards/rejected": 0.11145754158496857, + "step": 240 + }, + { + "epoch": 0.11, + "learning_rate": 1.141552511415525e-07, + "logits/chosen": -0.43292659521102905, + "logits/rejected": -0.49328017234802246, + "logps/chosen": -129.06802368164062, + "logps/rejected": -92.1496353149414, + "loss": 0.8357, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.2504062056541443, + "rewards/margins": 0.1820850372314453, + "rewards/rejected": 0.06832118332386017, + "step": 250 + }, + { + "epoch": 0.12, + "learning_rate": 1.187214611872146e-07, + "logits/chosen": -0.452961266040802, + "logits/rejected": -0.49590611457824707, + "logps/chosen": -130.70712280273438, + "logps/rejected": -89.96910095214844, + "loss": 0.8312, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.38505929708480835, + "rewards/margins": 0.07876329123973846, + "rewards/rejected": 0.3062959909439087, + "step": 260 + }, + { + "epoch": 0.12, + "learning_rate": 1.232876712328767e-07, + "logits/chosen": -0.43949493765830994, + "logits/rejected": -0.49638843536376953, + "logps/chosen": -131.95510864257812, + "logps/rejected": -91.30790710449219, + "loss": 0.7985, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.44583946466445923, + "rewards/margins": 0.32409507036209106, + "rewards/rejected": 0.12174437195062637, + "step": 270 + }, + { + "epoch": 0.13, + "learning_rate": 1.278538812785388e-07, + "logits/chosen": -0.4356844425201416, + "logits/rejected": -0.4804006516933441, + "logps/chosen": -122.7051010131836, + "logps/rejected": -95.61629486083984, + "loss": 0.8578, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.38490691781044006, + "rewards/margins": -0.046961426734924316, + "rewards/rejected": 0.43186837434768677, + "step": 280 + }, + { + "epoch": 0.13, + "learning_rate": 1.324200913242009e-07, + "logits/chosen": -0.4335559010505676, + "logits/rejected": -0.48386988043785095, + "logps/chosen": -125.57695007324219, + "logps/rejected": -93.93878936767578, + "loss": 0.7524, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7190027236938477, + "rewards/margins": 0.5058382749557495, + "rewards/rejected": 0.21316440403461456, + "step": 290 + }, + { + "epoch": 0.14, + "learning_rate": 1.36986301369863e-07, + "logits/chosen": -0.4185541570186615, + "logits/rejected": -0.4697909355163574, + "logps/chosen": -124.8671875, + "logps/rejected": -89.48695373535156, + "loss": 0.7507, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7953338623046875, + "rewards/margins": 0.46773916482925415, + "rewards/rejected": 0.32759472727775574, + "step": 300 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -0.44264963269233704, + "eval_logits/rejected": -0.4909226894378662, + "eval_logps/chosen": -122.01156616210938, + "eval_logps/rejected": -90.3702621459961, + "eval_loss": 0.7816197872161865, + "eval_rewards/accuracies": 0.5670391321182251, + "eval_rewards/chosen": 0.5740950107574463, + "eval_rewards/margins": 0.29438889026641846, + "eval_rewards/rejected": 0.27970612049102783, + "eval_runtime": 858.3912, + "eval_samples_per_second": 3.334, + "eval_steps_per_second": 0.209, + "step": 300 + }, + { + "epoch": 0.14, + "learning_rate": 1.415525114155251e-07, + "logits/chosen": -0.4151129722595215, + "logits/rejected": -0.4674547612667084, + "logps/chosen": -120.4942855834961, + "logps/rejected": -88.7030258178711, + "loss": 0.7411, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.721222996711731, + "rewards/margins": 0.4279160499572754, + "rewards/rejected": 0.2933068871498108, + "step": 310 + }, + { + "epoch": 0.15, + "learning_rate": 1.461187214611872e-07, + "logits/chosen": -0.4228796362876892, + "logits/rejected": -0.4746120572090149, + "logps/chosen": -130.20228576660156, + "logps/rejected": -95.73072814941406, + "loss": 0.7596, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.8771559000015259, + "rewards/margins": 0.4412263035774231, + "rewards/rejected": 0.435929536819458, + "step": 320 + }, + { + "epoch": 0.15, + "learning_rate": 1.506849315068493e-07, + "logits/chosen": -0.4374767243862152, + "logits/rejected": -0.48573336005210876, + "logps/chosen": -118.52967834472656, + "logps/rejected": -92.9192886352539, + "loss": 0.7083, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.713616669178009, + "rewards/margins": 0.4573392868041992, + "rewards/rejected": 0.25627732276916504, + "step": 330 + }, + { + "epoch": 0.16, + "learning_rate": 1.552511415525114e-07, + "logits/chosen": -0.42817750573158264, + "logits/rejected": -0.4862712025642395, + "logps/chosen": -118.77824401855469, + "logps/rejected": -85.43115997314453, + "loss": 0.7041, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.8094140887260437, + "rewards/margins": 0.5132724642753601, + "rewards/rejected": 0.29614168405532837, + "step": 340 + }, + { + "epoch": 0.16, + "learning_rate": 1.598173515981735e-07, + "logits/chosen": -0.4172348380088806, + "logits/rejected": -0.4674188196659088, + "logps/chosen": -117.78670501708984, + "logps/rejected": -93.87962341308594, + "loss": 0.6998, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6671510934829712, + "rewards/margins": 0.4894055426120758, + "rewards/rejected": 0.1777455061674118, + "step": 350 + }, + { + "epoch": 0.16, + "learning_rate": 1.643835616438356e-07, + "logits/chosen": -0.3993512988090515, + "logits/rejected": -0.4529343247413635, + "logps/chosen": -122.50328063964844, + "logps/rejected": -92.97706604003906, + "loss": 0.7234, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.9135759472846985, + "rewards/margins": 0.6725237965583801, + "rewards/rejected": 0.24105218052864075, + "step": 360 + }, + { + "epoch": 0.17, + "learning_rate": 1.689497716894977e-07, + "logits/chosen": -0.38976728916168213, + "logits/rejected": -0.44725877046585083, + "logps/chosen": -123.70979309082031, + "logps/rejected": -93.47122192382812, + "loss": 0.7021, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8796418905258179, + "rewards/margins": 0.516387939453125, + "rewards/rejected": 0.36325401067733765, + "step": 370 + }, + { + "epoch": 0.17, + "learning_rate": 1.735159817351598e-07, + "logits/chosen": -0.411344051361084, + "logits/rejected": -0.48015648126602173, + "logps/chosen": -128.3126678466797, + "logps/rejected": -86.05162048339844, + "loss": 0.7003, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.9648821949958801, + "rewards/margins": 0.6015781760215759, + "rewards/rejected": 0.3633040189743042, + "step": 380 + }, + { + "epoch": 0.18, + "learning_rate": 1.780821917808219e-07, + "logits/chosen": -0.4367973208427429, + "logits/rejected": -0.48391270637512207, + "logps/chosen": -123.42488098144531, + "logps/rejected": -86.6032943725586, + "loss": 0.6979, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.9070743322372437, + "rewards/margins": 0.5875338315963745, + "rewards/rejected": 0.31954047083854675, + "step": 390 + }, + { + "epoch": 0.18, + "learning_rate": 1.82648401826484e-07, + "logits/chosen": -0.39601242542266846, + "logits/rejected": -0.4429762363433838, + "logps/chosen": -115.47049713134766, + "logps/rejected": -92.88420104980469, + "loss": 0.6142, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.1646414995193481, + "rewards/margins": 0.6666213870048523, + "rewards/rejected": 0.49802008271217346, + "step": 400 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -0.4321858584880829, + "eval_logits/rejected": -0.4793085753917694, + "eval_logps/chosen": -121.00923156738281, + "eval_logps/rejected": -90.04886627197266, + "eval_loss": 0.6434672474861145, + "eval_rewards/accuracies": 0.6368715167045593, + "eval_rewards/chosen": 1.0752556324005127, + "eval_rewards/margins": 0.6348468065261841, + "eval_rewards/rejected": 0.4404087960720062, + "eval_runtime": 919.755, + "eval_samples_per_second": 3.112, + "eval_steps_per_second": 0.195, + "step": 400 + }, + { + "epoch": 0.19, + "learning_rate": 1.872146118721461e-07, + "logits/chosen": -0.4185422360897064, + "logits/rejected": -0.48887625336647034, + "logps/chosen": -137.428466796875, + "logps/rejected": -89.48545837402344, + "loss": 0.639, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.2363882064819336, + "rewards/margins": 0.7988500595092773, + "rewards/rejected": 0.43753814697265625, + "step": 410 + }, + { + "epoch": 0.19, + "learning_rate": 1.917808219178082e-07, + "logits/chosen": -0.402921199798584, + "logits/rejected": -0.4546676278114319, + "logps/chosen": -117.1082534790039, + "logps/rejected": -92.3219223022461, + "loss": 0.6449, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.8971641659736633, + "rewards/margins": 0.6290008425712585, + "rewards/rejected": 0.2681633532047272, + "step": 420 + }, + { + "epoch": 0.2, + "learning_rate": 1.963470319634703e-07, + "logits/chosen": -0.44212061166763306, + "logits/rejected": -0.4884345531463623, + "logps/chosen": -116.8946533203125, + "logps/rejected": -88.76360321044922, + "loss": 0.6229, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 1.1071293354034424, + "rewards/margins": 0.6796278357505798, + "rewards/rejected": 0.42750149965286255, + "step": 430 + }, + { + "epoch": 0.2, + "learning_rate": 2.009132420091324e-07, + "logits/chosen": -0.4305177330970764, + "logits/rejected": -0.48330944776535034, + "logps/chosen": -118.5647201538086, + "logps/rejected": -90.13600158691406, + "loss": 0.6259, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 1.3376901149749756, + "rewards/margins": 0.8060137033462524, + "rewards/rejected": 0.5316765308380127, + "step": 440 + }, + { + "epoch": 0.21, + "learning_rate": 2.054794520547945e-07, + "logits/chosen": -0.401493638753891, + "logits/rejected": -0.4512600302696228, + "logps/chosen": -123.88890075683594, + "logps/rejected": -86.47048950195312, + "loss": 0.5399, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.291102647781372, + "rewards/margins": 1.0892807245254517, + "rewards/rejected": 0.2018217146396637, + "step": 450 + }, + { + "epoch": 0.21, + "learning_rate": 2.100456621004566e-07, + "logits/chosen": -0.414111465215683, + "logits/rejected": -0.4572978913784027, + "logps/chosen": -130.4043426513672, + "logps/rejected": -93.8148422241211, + "loss": 0.5633, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.4818243980407715, + "rewards/margins": 1.0796071290969849, + "rewards/rejected": 0.40221720933914185, + "step": 460 + }, + { + "epoch": 0.21, + "learning_rate": 2.146118721461187e-07, + "logits/chosen": -0.3844316601753235, + "logits/rejected": -0.43920645117759705, + "logps/chosen": -129.4476318359375, + "logps/rejected": -93.88046264648438, + "loss": 0.6163, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 1.0575988292694092, + "rewards/margins": 0.5015383958816528, + "rewards/rejected": 0.5560603737831116, + "step": 470 + }, + { + "epoch": 0.22, + "learning_rate": 2.191780821917808e-07, + "logits/chosen": -0.4316393733024597, + "logits/rejected": -0.46863240003585815, + "logps/chosen": -116.6776123046875, + "logps/rejected": -89.31513977050781, + "loss": 0.5295, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 1.6314785480499268, + "rewards/margins": 1.1132431030273438, + "rewards/rejected": 0.5182352066040039, + "step": 480 + }, + { + "epoch": 0.22, + "learning_rate": 2.237442922374429e-07, + "logits/chosen": -0.3807242810726166, + "logits/rejected": -0.4394947588443756, + "logps/chosen": -125.314453125, + "logps/rejected": -93.21080017089844, + "loss": 0.5445, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 1.4549810886383057, + "rewards/margins": 1.1398333311080933, + "rewards/rejected": 0.3151477575302124, + "step": 490 + }, + { + "epoch": 0.23, + "learning_rate": 2.28310502283105e-07, + "logits/chosen": -0.41063275933265686, + "logits/rejected": -0.4601810574531555, + "logps/chosen": -125.56761169433594, + "logps/rejected": -90.01634979248047, + "loss": 0.519, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 1.459787368774414, + "rewards/margins": 0.9859598278999329, + "rewards/rejected": 0.47382766008377075, + "step": 500 + }, + { + "epoch": 0.23, + "eval_logits/chosen": -0.4084192216396332, + "eval_logits/rejected": -0.45594409108161926, + "eval_logps/chosen": -119.71713256835938, + "eval_logps/rejected": -89.80496978759766, + "eval_loss": 0.5196122527122498, + "eval_rewards/accuracies": 0.74301677942276, + "eval_rewards/chosen": 1.7213094234466553, + "eval_rewards/margins": 1.1589573621749878, + "eval_rewards/rejected": 0.5623520016670227, + "eval_runtime": 904.862, + "eval_samples_per_second": 3.163, + "eval_steps_per_second": 0.198, + "step": 500 + }, + { + "epoch": 0.23, + "learning_rate": 2.328767123287671e-07, + "logits/chosen": -0.4057396948337555, + "logits/rejected": -0.45863184332847595, + "logps/chosen": -117.3805923461914, + "logps/rejected": -86.75962829589844, + "loss": 0.5427, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 1.8113657236099243, + "rewards/margins": 1.3476877212524414, + "rewards/rejected": 0.46367788314819336, + "step": 510 + }, + { + "epoch": 0.24, + "learning_rate": 2.374429223744292e-07, + "logits/chosen": -0.4061856269836426, + "logits/rejected": -0.4540451467037201, + "logps/chosen": -129.16978454589844, + "logps/rejected": -92.90943908691406, + "loss": 0.4964, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.632061243057251, + "rewards/margins": 1.0041301250457764, + "rewards/rejected": 0.6279311776161194, + "step": 520 + }, + { + "epoch": 0.24, + "learning_rate": 2.420091324200913e-07, + "logits/chosen": -0.4023277163505554, + "logits/rejected": -0.4562614858150482, + "logps/chosen": -124.7136459350586, + "logps/rejected": -92.33967590332031, + "loss": 0.5186, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.5743153095245361, + "rewards/margins": 1.1845461130142212, + "rewards/rejected": 0.38976913690567017, + "step": 530 + }, + { + "epoch": 0.25, + "learning_rate": 2.465753424657534e-07, + "logits/chosen": -0.3930456042289734, + "logits/rejected": -0.44681698083877563, + "logps/chosen": -128.5962677001953, + "logps/rejected": -96.91268157958984, + "loss": 0.4592, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.9018471240997314, + "rewards/margins": 1.5477434396743774, + "rewards/rejected": 0.3541035056114197, + "step": 540 + }, + { + "epoch": 0.25, + "learning_rate": 2.511415525114155e-07, + "logits/chosen": -0.4221636652946472, + "logits/rejected": -0.462319552898407, + "logps/chosen": -128.9414520263672, + "logps/rejected": -93.53214263916016, + "loss": 0.5019, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.7227509021759033, + "rewards/margins": 0.957754909992218, + "rewards/rejected": 0.7649960517883301, + "step": 550 + }, + { + "epoch": 0.26, + "learning_rate": 2.557077625570776e-07, + "logits/chosen": -0.3653411269187927, + "logits/rejected": -0.4162854254245758, + "logps/chosen": -121.7026596069336, + "logps/rejected": -91.18373107910156, + "loss": 0.5392, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.8344913721084595, + "rewards/margins": 1.1174156665802002, + "rewards/rejected": 0.717075526714325, + "step": 560 + }, + { + "epoch": 0.26, + "learning_rate": 2.602739726027397e-07, + "logits/chosen": -0.3987251818180084, + "logits/rejected": -0.4511111378669739, + "logps/chosen": -123.21244049072266, + "logps/rejected": -90.11592864990234, + "loss": 0.4567, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 1.992093801498413, + "rewards/margins": 1.3696999549865723, + "rewards/rejected": 0.6223939061164856, + "step": 570 + }, + { + "epoch": 0.26, + "learning_rate": 2.648401826484018e-07, + "logits/chosen": -0.3962245583534241, + "logits/rejected": -0.4511072635650635, + "logps/chosen": -117.782470703125, + "logps/rejected": -86.41405487060547, + "loss": 0.4408, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 2.0794925689697266, + "rewards/margins": 1.7970508337020874, + "rewards/rejected": 0.2824416756629944, + "step": 580 + }, + { + "epoch": 0.27, + "learning_rate": 2.694063926940639e-07, + "logits/chosen": -0.3933233916759491, + "logits/rejected": -0.4369097650051117, + "logps/chosen": -118.86439514160156, + "logps/rejected": -93.77821350097656, + "loss": 0.4553, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 2.0210790634155273, + "rewards/margins": 1.2972638607025146, + "rewards/rejected": 0.7238151431083679, + "step": 590 + }, + { + "epoch": 0.27, + "learning_rate": 2.73972602739726e-07, + "logits/chosen": -0.3547651171684265, + "logits/rejected": -0.4130920469760895, + "logps/chosen": -121.15980529785156, + "logps/rejected": -90.80103302001953, + "loss": 0.4858, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.020519733428955, + "rewards/margins": 1.2199289798736572, + "rewards/rejected": 0.8005906939506531, + "step": 600 + }, + { + "epoch": 0.27, + "eval_logits/chosen": -0.41381627321243286, + "eval_logits/rejected": -0.4592490494251251, + "eval_logps/chosen": -118.74276733398438, + "eval_logps/rejected": -89.74501037597656, + "eval_loss": 0.4350966513156891, + "eval_rewards/accuracies": 0.7877094745635986, + "eval_rewards/chosen": 2.208491086959839, + "eval_rewards/margins": 1.6161593198776245, + "eval_rewards/rejected": 0.5923314690589905, + "eval_runtime": 922.2358, + "eval_samples_per_second": 3.103, + "eval_steps_per_second": 0.194, + "step": 600 + }, + { + "epoch": 0.28, + "learning_rate": 2.785388127853881e-07, + "logits/chosen": -0.3632197082042694, + "logits/rejected": -0.4115583896636963, + "logps/chosen": -126.2831039428711, + "logps/rejected": -91.31169128417969, + "loss": 0.4499, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 2.2875258922576904, + "rewards/margins": 1.4958405494689941, + "rewards/rejected": 0.7916852831840515, + "step": 610 + }, + { + "epoch": 0.28, + "learning_rate": 2.831050228310502e-07, + "logits/chosen": -0.40923231840133667, + "logits/rejected": -0.44826406240463257, + "logps/chosen": -119.70330810546875, + "logps/rejected": -94.67719268798828, + "loss": 0.4786, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.9047887325286865, + "rewards/margins": 1.1927855014801025, + "rewards/rejected": 0.7120033502578735, + "step": 620 + }, + { + "epoch": 0.29, + "learning_rate": 2.876712328767123e-07, + "logits/chosen": -0.40331321954727173, + "logits/rejected": -0.442889541387558, + "logps/chosen": -119.29844665527344, + "logps/rejected": -92.73041534423828, + "loss": 0.4586, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 2.220543384552002, + "rewards/margins": 1.537954330444336, + "rewards/rejected": 0.6825889945030212, + "step": 630 + }, + { + "epoch": 0.29, + "learning_rate": 2.922374429223744e-07, + "logits/chosen": -0.3773882985115051, + "logits/rejected": -0.43467479944229126, + "logps/chosen": -126.56062316894531, + "logps/rejected": -90.71516418457031, + "loss": 0.4235, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 1.9954169988632202, + "rewards/margins": 1.6813433170318604, + "rewards/rejected": 0.3140736222267151, + "step": 640 + }, + { + "epoch": 0.3, + "learning_rate": 2.968036529680365e-07, + "logits/chosen": -0.3841812312602997, + "logits/rejected": -0.4249148368835449, + "logps/chosen": -123.25791931152344, + "logps/rejected": -89.29008483886719, + "loss": 0.4447, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 2.2786223888397217, + "rewards/margins": 1.6475741863250732, + "rewards/rejected": 0.631048321723938, + "step": 650 + }, + { + "epoch": 0.3, + "learning_rate": 2.998477929984779e-07, + "logits/chosen": -0.3526962995529175, + "logits/rejected": -0.41638392210006714, + "logps/chosen": -123.1829605102539, + "logps/rejected": -91.3757095336914, + "loss": 0.4117, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.311939239501953, + "rewards/margins": 1.7149779796600342, + "rewards/rejected": 0.5969613194465637, + "step": 660 + }, + { + "epoch": 0.31, + "learning_rate": 2.993404363267377e-07, + "logits/chosen": -0.36860209703445435, + "logits/rejected": -0.4237852990627289, + "logps/chosen": -122.25120544433594, + "logps/rejected": -89.36021423339844, + "loss": 0.4159, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.1927528381347656, + "rewards/margins": 1.558342456817627, + "rewards/rejected": 0.6344104409217834, + "step": 670 + }, + { + "epoch": 0.31, + "learning_rate": 2.9883307965499743e-07, + "logits/chosen": -0.39097389578819275, + "logits/rejected": -0.4458232820034027, + "logps/chosen": -120.35401916503906, + "logps/rejected": -93.7239761352539, + "loss": 0.4295, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 2.214139461517334, + "rewards/margins": 1.479650855064392, + "rewards/rejected": 0.7344885468482971, + "step": 680 + }, + { + "epoch": 0.31, + "learning_rate": 2.983257229832572e-07, + "logits/chosen": -0.3879626393318176, + "logits/rejected": -0.4396095871925354, + "logps/chosen": -118.8515853881836, + "logps/rejected": -88.3510513305664, + "loss": 0.4265, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.529470682144165, + "rewards/margins": 2.0332565307617188, + "rewards/rejected": 0.4962140917778015, + "step": 690 + }, + { + "epoch": 0.32, + "learning_rate": 2.9781836631151696e-07, + "logits/chosen": -0.39537498354911804, + "logits/rejected": -0.4360222816467285, + "logps/chosen": -128.79356384277344, + "logps/rejected": -91.8262710571289, + "loss": 0.4048, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.615938901901245, + "rewards/margins": 1.7755300998687744, + "rewards/rejected": 0.8404089212417603, + "step": 700 + }, + { + "epoch": 0.32, + "eval_logits/chosen": -0.3952995538711548, + "eval_logits/rejected": -0.43975549936294556, + "eval_logps/chosen": -117.93878936767578, + "eval_logps/rejected": -89.78253936767578, + "eval_loss": 0.3877631723880768, + "eval_rewards/accuracies": 0.832402229309082, + "eval_rewards/chosen": 2.610471487045288, + "eval_rewards/margins": 2.036900758743286, + "eval_rewards/rejected": 0.5735709071159363, + "eval_runtime": 879.8552, + "eval_samples_per_second": 3.253, + "eval_steps_per_second": 0.203, + "step": 700 + }, + { + "epoch": 0.32, + "learning_rate": 2.9731100963977676e-07, + "logits/chosen": -0.3934364318847656, + "logits/rejected": -0.43131130933761597, + "logps/chosen": -122.0168228149414, + "logps/rejected": -95.09669494628906, + "loss": 0.3735, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.480156898498535, + "rewards/margins": 2.1081740856170654, + "rewards/rejected": 0.3719825744628906, + "step": 710 + }, + { + "epoch": 0.33, + "learning_rate": 2.968036529680365e-07, + "logits/chosen": -0.36254000663757324, + "logits/rejected": -0.39851677417755127, + "logps/chosen": -113.37959289550781, + "logps/rejected": -93.94721984863281, + "loss": 0.3979, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.346477746963501, + "rewards/margins": 1.4738929271697998, + "rewards/rejected": 0.8725847005844116, + "step": 720 + }, + { + "epoch": 0.33, + "learning_rate": 2.962962962962963e-07, + "logits/chosen": -0.37916380167007446, + "logits/rejected": -0.4347075819969177, + "logps/chosen": -120.58372497558594, + "logps/rejected": -91.06839752197266, + "loss": 0.4037, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 2.639740467071533, + "rewards/margins": 2.126530170440674, + "rewards/rejected": 0.5132103562355042, + "step": 730 + }, + { + "epoch": 0.34, + "learning_rate": 2.9578893962455603e-07, + "logits/chosen": -0.38863658905029297, + "logits/rejected": -0.43652552366256714, + "logps/chosen": -118.41629791259766, + "logps/rejected": -91.13375854492188, + "loss": 0.4023, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 2.4551138877868652, + "rewards/margins": 1.8103752136230469, + "rewards/rejected": 0.6447389721870422, + "step": 740 + }, + { + "epoch": 0.34, + "learning_rate": 2.952815829528158e-07, + "logits/chosen": -0.3511260449886322, + "logits/rejected": -0.3955584168434143, + "logps/chosen": -122.5355224609375, + "logps/rejected": -94.37186431884766, + "loss": 0.3418, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.681807041168213, + "rewards/margins": 2.366050958633423, + "rewards/rejected": 0.31575626134872437, + "step": 750 + }, + { + "epoch": 0.35, + "learning_rate": 2.9477422628107556e-07, + "logits/chosen": -0.36248472332954407, + "logits/rejected": -0.41749343276023865, + "logps/chosen": -127.51533508300781, + "logps/rejected": -89.82352447509766, + "loss": 0.3551, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 2.612884283065796, + "rewards/margins": 2.194282054901123, + "rewards/rejected": 0.4186023771762848, + "step": 760 + }, + { + "epoch": 0.35, + "learning_rate": 2.9426686960933536e-07, + "logits/chosen": -0.39361852407455444, + "logits/rejected": -0.4406144618988037, + "logps/chosen": -120.9937515258789, + "logps/rejected": -91.25408935546875, + "loss": 0.3773, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 2.731214761734009, + "rewards/margins": 2.3300070762634277, + "rewards/rejected": 0.4012075364589691, + "step": 770 + }, + { + "epoch": 0.36, + "learning_rate": 2.937595129375951e-07, + "logits/chosen": -0.37514811754226685, + "logits/rejected": -0.4206709861755371, + "logps/chosen": -121.06993103027344, + "logps/rejected": -90.89588165283203, + "loss": 0.3809, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 2.543487548828125, + "rewards/margins": 1.973149299621582, + "rewards/rejected": 0.5703383684158325, + "step": 780 + }, + { + "epoch": 0.36, + "learning_rate": 2.932521562658549e-07, + "logits/chosen": -0.3764232099056244, + "logits/rejected": -0.42200201749801636, + "logps/chosen": -118.751220703125, + "logps/rejected": -90.91673278808594, + "loss": 0.3829, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 2.8071541786193848, + "rewards/margins": 2.2911696434020996, + "rewards/rejected": 0.5159844756126404, + "step": 790 + }, + { + "epoch": 0.37, + "learning_rate": 2.9274479959411463e-07, + "logits/chosen": -0.4004458487033844, + "logits/rejected": -0.45174115896224976, + "logps/chosen": -121.1677474975586, + "logps/rejected": -94.82032775878906, + "loss": 0.3623, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 2.8220152854919434, + "rewards/margins": 1.960198998451233, + "rewards/rejected": 0.8618165254592896, + "step": 800 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -0.40461838245391846, + "eval_logits/rejected": -0.449243426322937, + "eval_logps/chosen": -117.74871063232422, + "eval_logps/rejected": -90.00775146484375, + "eval_loss": 0.33831077814102173, + "eval_rewards/accuracies": 0.8519552946090698, + "eval_rewards/chosen": 2.7055187225341797, + "eval_rewards/margins": 2.244556427001953, + "eval_rewards/rejected": 0.4609623849391937, + "eval_runtime": 884.177, + "eval_samples_per_second": 3.237, + "eval_steps_per_second": 0.202, + "step": 800 + }, + { + "epoch": 0.37, + "learning_rate": 2.922374429223744e-07, + "logits/chosen": -0.39423003792762756, + "logits/rejected": -0.4309159219264984, + "logps/chosen": -115.3284912109375, + "logps/rejected": -89.27734375, + "loss": 0.3521, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.6883575916290283, + "rewards/margins": 2.4514644145965576, + "rewards/rejected": 0.2368932068347931, + "step": 810 + }, + { + "epoch": 0.37, + "learning_rate": 2.9173008625063416e-07, + "logits/chosen": -0.38897836208343506, + "logits/rejected": -0.43199315667152405, + "logps/chosen": -115.42658996582031, + "logps/rejected": -88.20014953613281, + "loss": 0.3956, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.8831889629364014, + "rewards/margins": 2.162771701812744, + "rewards/rejected": 0.7204176187515259, + "step": 820 + }, + { + "epoch": 0.38, + "learning_rate": 2.9122272957889396e-07, + "logits/chosen": -0.3778613209724426, + "logits/rejected": -0.42081737518310547, + "logps/chosen": -122.856689453125, + "logps/rejected": -90.86555480957031, + "loss": 0.313, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.869499683380127, + "rewards/margins": 2.4971015453338623, + "rewards/rejected": 0.372397780418396, + "step": 830 + }, + { + "epoch": 0.38, + "learning_rate": 2.907153729071537e-07, + "logits/chosen": -0.3957747519016266, + "logits/rejected": -0.44605112075805664, + "logps/chosen": -118.08308410644531, + "logps/rejected": -92.13973236083984, + "loss": 0.3128, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.6560986042022705, + "rewards/margins": 2.404651641845703, + "rewards/rejected": 0.251446932554245, + "step": 840 + }, + { + "epoch": 0.39, + "learning_rate": 2.902080162354135e-07, + "logits/chosen": -0.3683229982852936, + "logits/rejected": -0.41419917345046997, + "logps/chosen": -116.5284194946289, + "logps/rejected": -90.90771484375, + "loss": 0.3368, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 2.9755072593688965, + "rewards/margins": 2.519197940826416, + "rewards/rejected": 0.45630955696105957, + "step": 850 + }, + { + "epoch": 0.39, + "learning_rate": 2.8970065956367323e-07, + "logits/chosen": -0.38079267740249634, + "logits/rejected": -0.42664599418640137, + "logps/chosen": -124.321533203125, + "logps/rejected": -93.04421997070312, + "loss": 0.3352, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 2.8211874961853027, + "rewards/margins": 2.7853455543518066, + "rewards/rejected": 0.035841844975948334, + "step": 860 + }, + { + "epoch": 0.4, + "learning_rate": 2.89193302891933e-07, + "logits/chosen": -0.43004846572875977, + "logits/rejected": -0.4682585299015045, + "logps/chosen": -120.579833984375, + "logps/rejected": -94.06327819824219, + "loss": 0.3491, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 2.4778380393981934, + "rewards/margins": 1.947243094444275, + "rewards/rejected": 0.5305950045585632, + "step": 870 + }, + { + "epoch": 0.4, + "learning_rate": 2.8868594622019276e-07, + "logits/chosen": -0.3675630986690521, + "logits/rejected": -0.43043774366378784, + "logps/chosen": -127.04805755615234, + "logps/rejected": -90.72000122070312, + "loss": 0.3348, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.3756585121154785, + "rewards/margins": 2.680732011795044, + "rewards/rejected": 0.6949266195297241, + "step": 880 + }, + { + "epoch": 0.41, + "learning_rate": 2.8817858954845256e-07, + "logits/chosen": -0.3831022381782532, + "logits/rejected": -0.4073728621006012, + "logps/chosen": -112.22914123535156, + "logps/rejected": -92.7939453125, + "loss": 0.3258, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 2.583207845687866, + "rewards/margins": 2.2793221473693848, + "rewards/rejected": 0.3038859963417053, + "step": 890 + }, + { + "epoch": 0.41, + "learning_rate": 2.876712328767123e-07, + "logits/chosen": -0.36432862281799316, + "logits/rejected": -0.4049452841281891, + "logps/chosen": -114.65323638916016, + "logps/rejected": -89.1798324584961, + "loss": 0.308, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.7544989585876465, + "rewards/margins": 2.288748025894165, + "rewards/rejected": 0.4657509922981262, + "step": 900 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -0.39713504910469055, + "eval_logits/rejected": -0.43814149498939514, + "eval_logps/chosen": -117.21139526367188, + "eval_logps/rejected": -90.22852325439453, + "eval_loss": 0.31449276208877563, + "eval_rewards/accuracies": 0.8519552946090698, + "eval_rewards/chosen": 2.9741804599761963, + "eval_rewards/margins": 2.6236064434051514, + "eval_rewards/rejected": 0.3505741357803345, + "eval_runtime": 906.6261, + "eval_samples_per_second": 3.157, + "eval_steps_per_second": 0.197, + "step": 900 + }, + { + "epoch": 0.42, + "learning_rate": 2.871638762049721e-07, + "logits/chosen": -0.37541183829307556, + "logits/rejected": -0.4352906346321106, + "logps/chosen": -122.44512939453125, + "logps/rejected": -92.67110443115234, + "loss": 0.311, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 3.144416570663452, + "rewards/margins": 2.9438998699188232, + "rewards/rejected": 0.200516939163208, + "step": 910 + }, + { + "epoch": 0.42, + "learning_rate": 2.8665651953323183e-07, + "logits/chosen": -0.4028325080871582, + "logits/rejected": -0.43853726983070374, + "logps/chosen": -122.57420349121094, + "logps/rejected": -88.1255874633789, + "loss": 0.2813, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.4015986919403076, + "rewards/margins": 2.7751407623291016, + "rewards/rejected": 0.6264580488204956, + "step": 920 + }, + { + "epoch": 0.42, + "learning_rate": 2.861491628614916e-07, + "logits/chosen": -0.38051438331604004, + "logits/rejected": -0.42849215865135193, + "logps/chosen": -121.06298828125, + "logps/rejected": -91.55492401123047, + "loss": 0.3263, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 3.2129135131835938, + "rewards/margins": 3.027736186981201, + "rewards/rejected": 0.18517741560935974, + "step": 930 + }, + { + "epoch": 0.43, + "learning_rate": 2.8564180618975136e-07, + "logits/chosen": -0.3525943458080292, + "logits/rejected": -0.40666908025741577, + "logps/chosen": -126.42658996582031, + "logps/rejected": -95.4771499633789, + "loss": 0.3093, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.0789132118225098, + "rewards/margins": 2.573848009109497, + "rewards/rejected": 0.505064845085144, + "step": 940 + }, + { + "epoch": 0.43, + "learning_rate": 2.8513444951801116e-07, + "logits/chosen": -0.37190571427345276, + "logits/rejected": -0.4196457862854004, + "logps/chosen": -114.93281555175781, + "logps/rejected": -90.08222198486328, + "loss": 0.3446, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 2.7400577068328857, + "rewards/margins": 2.4779579639434814, + "rewards/rejected": 0.26209989190101624, + "step": 950 + }, + { + "epoch": 0.44, + "learning_rate": 2.846270928462709e-07, + "logits/chosen": -0.3697434663772583, + "logits/rejected": -0.4116531312465668, + "logps/chosen": -120.270751953125, + "logps/rejected": -96.9891357421875, + "loss": 0.3247, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 2.8343663215637207, + "rewards/margins": 2.7533464431762695, + "rewards/rejected": 0.08101978152990341, + "step": 960 + }, + { + "epoch": 0.44, + "learning_rate": 2.841197361745307e-07, + "logits/chosen": -0.38262155652046204, + "logits/rejected": -0.43416157364845276, + "logps/chosen": -118.68754577636719, + "logps/rejected": -91.05323791503906, + "loss": 0.273, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 3.267169237136841, + "rewards/margins": 2.936789035797119, + "rewards/rejected": 0.33038023114204407, + "step": 970 + }, + { + "epoch": 0.45, + "learning_rate": 2.8361237950279043e-07, + "logits/chosen": -0.3827953338623047, + "logits/rejected": -0.4415118098258972, + "logps/chosen": -131.87547302246094, + "logps/rejected": -93.18158721923828, + "loss": 0.2977, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 3.0228755474090576, + "rewards/margins": 3.0450031757354736, + "rewards/rejected": -0.022127319127321243, + "step": 980 + }, + { + "epoch": 0.45, + "learning_rate": 2.831050228310502e-07, + "logits/chosen": -0.3674587905406952, + "logits/rejected": -0.4189482629299164, + "logps/chosen": -119.06890869140625, + "logps/rejected": -90.67839050292969, + "loss": 0.319, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 3.407522678375244, + "rewards/margins": 3.029474973678589, + "rewards/rejected": 0.3780475854873657, + "step": 990 + }, + { + "epoch": 0.46, + "learning_rate": 2.8259766615930996e-07, + "logits/chosen": -0.3974978029727936, + "logits/rejected": -0.43341922760009766, + "logps/chosen": -123.0480728149414, + "logps/rejected": -88.08891296386719, + "loss": 0.3092, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 3.463684558868408, + "rewards/margins": 3.0378475189208984, + "rewards/rejected": 0.42583686113357544, + "step": 1000 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -0.3925662040710449, + "eval_logits/rejected": -0.4275723695755005, + "eval_logps/chosen": -116.85150909423828, + "eval_logps/rejected": -90.39224243164062, + "eval_loss": 0.31252291798591614, + "eval_rewards/accuracies": 0.8351955413818359, + "eval_rewards/chosen": 3.1541225910186768, + "eval_rewards/margins": 2.8854100704193115, + "eval_rewards/rejected": 0.2687124013900757, + "eval_runtime": 851.8679, + "eval_samples_per_second": 3.36, + "eval_steps_per_second": 0.21, + "step": 1000 + }, + { + "epoch": 0.46, + "learning_rate": 2.8209030948756976e-07, + "logits/chosen": -0.3884614109992981, + "logits/rejected": -0.42015600204467773, + "logps/chosen": -121.27522277832031, + "logps/rejected": -96.0818862915039, + "loss": 0.3186, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 3.2017555236816406, + "rewards/margins": 3.0563178062438965, + "rewards/rejected": 0.14543786644935608, + "step": 1010 + }, + { + "epoch": 0.47, + "learning_rate": 2.815829528158295e-07, + "logits/chosen": -0.3918910622596741, + "logits/rejected": -0.4254131317138672, + "logps/chosen": -106.18526458740234, + "logps/rejected": -85.96565246582031, + "loss": 0.2848, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.9074699878692627, + "rewards/margins": 2.4807074069976807, + "rewards/rejected": 0.4267624020576477, + "step": 1020 + }, + { + "epoch": 0.47, + "learning_rate": 2.810755961440893e-07, + "logits/chosen": -0.3705700933933258, + "logits/rejected": -0.42028117179870605, + "logps/chosen": -112.59422302246094, + "logps/rejected": -94.79884338378906, + "loss": 0.3074, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 3.024829626083374, + "rewards/margins": 2.7108585834503174, + "rewards/rejected": 0.31397122144699097, + "step": 1030 + }, + { + "epoch": 0.47, + "learning_rate": 2.8056823947234903e-07, + "logits/chosen": -0.38808757066726685, + "logits/rejected": -0.43867096304893494, + "logps/chosen": -122.36810302734375, + "logps/rejected": -91.27690124511719, + "loss": 0.2752, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.4875054359436035, + "rewards/margins": 3.586559295654297, + "rewards/rejected": -0.09905393421649933, + "step": 1040 + }, + { + "epoch": 0.48, + "learning_rate": 2.800608828006088e-07, + "logits/chosen": -0.38201209902763367, + "logits/rejected": -0.4219762682914734, + "logps/chosen": -112.345458984375, + "logps/rejected": -93.32908630371094, + "loss": 0.2762, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.233158826828003, + "rewards/margins": 2.842348575592041, + "rewards/rejected": 0.3908103108406067, + "step": 1050 + }, + { + "epoch": 0.48, + "learning_rate": 2.7955352612886856e-07, + "logits/chosen": -0.36564648151397705, + "logits/rejected": -0.4152706563472748, + "logps/chosen": -115.9142837524414, + "logps/rejected": -94.08984375, + "loss": 0.3157, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 2.917323589324951, + "rewards/margins": 2.925326108932495, + "rewards/rejected": -0.008002638816833496, + "step": 1060 + }, + { + "epoch": 0.49, + "learning_rate": 2.7904616945712836e-07, + "logits/chosen": -0.38626644015312195, + "logits/rejected": -0.4261588156223297, + "logps/chosen": -118.94515228271484, + "logps/rejected": -91.15392303466797, + "loss": 0.2888, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 2.740856647491455, + "rewards/margins": 2.7697622776031494, + "rewards/rejected": -0.028905678540468216, + "step": 1070 + }, + { + "epoch": 0.49, + "learning_rate": 2.785388127853881e-07, + "logits/chosen": -0.3952401578426361, + "logits/rejected": -0.43863511085510254, + "logps/chosen": -118.4677734375, + "logps/rejected": -85.22468566894531, + "loss": 0.3047, + "rewards/accuracies": 0.8125, + "rewards/chosen": 3.0166592597961426, + "rewards/margins": 2.933008909225464, + "rewards/rejected": 0.08365003764629364, + "step": 1080 + }, + { + "epoch": 0.5, + "learning_rate": 2.780314561136479e-07, + "logits/chosen": -0.39297205209732056, + "logits/rejected": -0.439324289560318, + "logps/chosen": -115.67972564697266, + "logps/rejected": -93.59664916992188, + "loss": 0.3381, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8857316970825195, + "rewards/margins": 2.8312604427337646, + "rewards/rejected": 0.05447094514966011, + "step": 1090 + }, + { + "epoch": 0.5, + "learning_rate": 2.7752409944190763e-07, + "logits/chosen": -0.39699214696884155, + "logits/rejected": -0.4308001399040222, + "logps/chosen": -117.69087219238281, + "logps/rejected": -93.83064270019531, + "loss": 0.2765, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 2.924699068069458, + "rewards/margins": 2.9215705394744873, + "rewards/rejected": 0.003128147218376398, + "step": 1100 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -0.421588659286499, + "eval_logits/rejected": -0.461477130651474, + "eval_logps/chosen": -116.91805267333984, + "eval_logps/rejected": -90.63465118408203, + "eval_loss": 0.29390230774879456, + "eval_rewards/accuracies": 0.8603351712226868, + "eval_rewards/chosen": 3.1208434104919434, + "eval_rewards/margins": 2.973330020904541, + "eval_rewards/rejected": 0.1475135087966919, + "eval_runtime": 877.9945, + "eval_samples_per_second": 3.26, + "eval_steps_per_second": 0.204, + "step": 1100 + }, + { + "epoch": 0.51, + "learning_rate": 2.770167427701674e-07, + "logits/chosen": -0.3756471872329712, + "logits/rejected": -0.428661584854126, + "logps/chosen": -124.7108154296875, + "logps/rejected": -93.08118438720703, + "loss": 0.3038, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 3.762239933013916, + "rewards/margins": 3.627725601196289, + "rewards/rejected": 0.13451430201530457, + "step": 1110 + }, + { + "epoch": 0.51, + "learning_rate": 2.7650938609842716e-07, + "logits/chosen": -0.36341890692710876, + "logits/rejected": -0.4196922183036804, + "logps/chosen": -120.17652893066406, + "logps/rejected": -88.23072814941406, + "loss": 0.2855, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 3.30224871635437, + "rewards/margins": 2.936652898788452, + "rewards/rejected": 0.3655957281589508, + "step": 1120 + }, + { + "epoch": 0.52, + "learning_rate": 2.7600202942668696e-07, + "logits/chosen": -0.3739013969898224, + "logits/rejected": -0.4190409779548645, + "logps/chosen": -124.47865295410156, + "logps/rejected": -91.2358169555664, + "loss": 0.2698, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 3.268986225128174, + "rewards/margins": 3.0331108570098877, + "rewards/rejected": 0.2358749359846115, + "step": 1130 + }, + { + "epoch": 0.52, + "learning_rate": 2.754946727549467e-07, + "logits/chosen": -0.40065139532089233, + "logits/rejected": -0.43831387162208557, + "logps/chosen": -121.75138854980469, + "logps/rejected": -98.39310455322266, + "loss": 0.2537, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.195882558822632, + "rewards/margins": 3.1010377407073975, + "rewards/rejected": 0.09484489262104034, + "step": 1140 + }, + { + "epoch": 0.52, + "learning_rate": 2.749873160832065e-07, + "logits/chosen": -0.3941956162452698, + "logits/rejected": -0.4352952539920807, + "logps/chosen": -116.66845703125, + "logps/rejected": -88.73519897460938, + "loss": 0.2587, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 2.9066343307495117, + "rewards/margins": 2.9923722743988037, + "rewards/rejected": -0.08573801815509796, + "step": 1150 + }, + { + "epoch": 0.53, + "learning_rate": 2.7447995941146623e-07, + "logits/chosen": -0.3892219662666321, + "logits/rejected": -0.42604565620422363, + "logps/chosen": -123.84007263183594, + "logps/rejected": -92.99417877197266, + "loss": 0.2535, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.3677380084991455, + "rewards/margins": 3.233222484588623, + "rewards/rejected": 0.13451552391052246, + "step": 1160 + }, + { + "epoch": 0.53, + "learning_rate": 2.73972602739726e-07, + "logits/chosen": -0.4173711836338043, + "logits/rejected": -0.44396501779556274, + "logps/chosen": -115.49979400634766, + "logps/rejected": -95.75325775146484, + "loss": 0.3049, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 3.100471258163452, + "rewards/margins": 2.7626137733459473, + "rewards/rejected": 0.3378572463989258, + "step": 1170 + }, + { + "epoch": 0.54, + "learning_rate": 2.7346524606798576e-07, + "logits/chosen": -0.3970886170864105, + "logits/rejected": -0.43107110261917114, + "logps/chosen": -119.1224365234375, + "logps/rejected": -94.46098327636719, + "loss": 0.2798, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.862344264984131, + "rewards/margins": 3.4456276893615723, + "rewards/rejected": -0.5832830667495728, + "step": 1180 + }, + { + "epoch": 0.54, + "learning_rate": 2.7295788939624556e-07, + "logits/chosen": -0.41212111711502075, + "logits/rejected": -0.4597795903682709, + "logps/chosen": -122.7989273071289, + "logps/rejected": -91.87464904785156, + "loss": 0.2644, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 3.475785732269287, + "rewards/margins": 3.3043880462646484, + "rewards/rejected": 0.17139773070812225, + "step": 1190 + }, + { + "epoch": 0.55, + "learning_rate": 2.724505327245053e-07, + "logits/chosen": -0.4002062678337097, + "logits/rejected": -0.44224053621292114, + "logps/chosen": -122.39054870605469, + "logps/rejected": -93.33247375488281, + "loss": 0.3058, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 3.027909755706787, + "rewards/margins": 3.0626347064971924, + "rewards/rejected": -0.0347248800098896, + "step": 1200 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -0.38868534564971924, + "eval_logits/rejected": -0.42488640546798706, + "eval_logps/chosen": -117.1875, + "eval_logps/rejected": -91.20378875732422, + "eval_loss": 0.2772454023361206, + "eval_rewards/accuracies": 0.8770949840545654, + "eval_rewards/chosen": 2.9861292839050293, + "eval_rewards/margins": 3.123185396194458, + "eval_rewards/rejected": -0.13705596327781677, + "eval_runtime": 857.7682, + "eval_samples_per_second": 3.337, + "eval_steps_per_second": 0.209, + "step": 1200 + }, + { + "epoch": 0.55, + "learning_rate": 2.719431760527651e-07, + "logits/chosen": -0.3781608045101166, + "logits/rejected": -0.4132018983364105, + "logps/chosen": -113.97611236572266, + "logps/rejected": -89.44263458251953, + "loss": 0.2443, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.1568641662597656, + "rewards/margins": 2.9170334339141846, + "rewards/rejected": 0.2398306131362915, + "step": 1210 + }, + { + "epoch": 0.56, + "learning_rate": 2.7143581938102483e-07, + "logits/chosen": -0.40766286849975586, + "logits/rejected": -0.4394180178642273, + "logps/chosen": -126.20048522949219, + "logps/rejected": -91.31429290771484, + "loss": 0.2705, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 3.362502336502075, + "rewards/margins": 3.2503905296325684, + "rewards/rejected": 0.11211173236370087, + "step": 1220 + }, + { + "epoch": 0.56, + "learning_rate": 2.709284627092846e-07, + "logits/chosen": -0.40174245834350586, + "logits/rejected": -0.44669684767723083, + "logps/chosen": -119.60395812988281, + "logps/rejected": -94.1490707397461, + "loss": 0.2448, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.1044528484344482, + "rewards/margins": 3.6764514446258545, + "rewards/rejected": -0.5719983577728271, + "step": 1230 + }, + { + "epoch": 0.57, + "learning_rate": 2.7042110603754436e-07, + "logits/chosen": -0.4040297865867615, + "logits/rejected": -0.448894739151001, + "logps/chosen": -125.84278869628906, + "logps/rejected": -96.10501861572266, + "loss": 0.2429, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 3.768805742263794, + "rewards/margins": 3.6327590942382812, + "rewards/rejected": 0.13604608178138733, + "step": 1240 + }, + { + "epoch": 0.57, + "learning_rate": 2.6991374936580416e-07, + "logits/chosen": -0.3972640931606293, + "logits/rejected": -0.4390404224395752, + "logps/chosen": -121.781494140625, + "logps/rejected": -97.40977478027344, + "loss": 0.2876, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 3.0396571159362793, + "rewards/margins": 3.047499179840088, + "rewards/rejected": -0.007842063903808594, + "step": 1250 + }, + { + "epoch": 0.58, + "learning_rate": 2.694063926940639e-07, + "logits/chosen": -0.3991449177265167, + "logits/rejected": -0.4396139681339264, + "logps/chosen": -128.02182006835938, + "logps/rejected": -90.22634887695312, + "loss": 0.2382, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 3.1005046367645264, + "rewards/margins": 3.1185288429260254, + "rewards/rejected": -0.018024539574980736, + "step": 1260 + }, + { + "epoch": 0.58, + "learning_rate": 2.688990360223237e-07, + "logits/chosen": -0.3941733241081238, + "logits/rejected": -0.4330524504184723, + "logps/chosen": -116.48948669433594, + "logps/rejected": -93.32965850830078, + "loss": 0.2589, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 3.3650689125061035, + "rewards/margins": 3.3145668506622314, + "rewards/rejected": 0.05050189420580864, + "step": 1270 + }, + { + "epoch": 0.58, + "learning_rate": 2.6839167935058343e-07, + "logits/chosen": -0.36839810013771057, + "logits/rejected": -0.409420907497406, + "logps/chosen": -114.0684814453125, + "logps/rejected": -95.41332244873047, + "loss": 0.2674, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 3.460172176361084, + "rewards/margins": 3.2212772369384766, + "rewards/rejected": 0.2388952225446701, + "step": 1280 + }, + { + "epoch": 0.59, + "learning_rate": 2.678843226788432e-07, + "logits/chosen": -0.40016645193099976, + "logits/rejected": -0.43591636419296265, + "logps/chosen": -116.75439453125, + "logps/rejected": -94.6172103881836, + "loss": 0.2954, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.891127109527588, + "rewards/margins": 3.583353042602539, + "rewards/rejected": 0.3077741265296936, + "step": 1290 + }, + { + "epoch": 0.59, + "learning_rate": 2.6737696600710296e-07, + "logits/chosen": -0.40336519479751587, + "logits/rejected": -0.45011377334594727, + "logps/chosen": -122.32728576660156, + "logps/rejected": -92.63130950927734, + "loss": 0.2702, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 4.126340866088867, + "rewards/margins": 4.417415618896484, + "rewards/rejected": -0.2910749316215515, + "step": 1300 + }, + { + "epoch": 0.59, + "eval_logits/chosen": -0.41128745675086975, + "eval_logits/rejected": -0.44969016313552856, + "eval_logps/chosen": -116.51630401611328, + "eval_logps/rejected": -91.05738830566406, + "eval_loss": 0.2592301368713379, + "eval_rewards/accuracies": 0.8715083599090576, + "eval_rewards/chosen": 3.3217227458953857, + "eval_rewards/margins": 3.3855812549591064, + "eval_rewards/rejected": -0.06385818123817444, + "eval_runtime": 913.966, + "eval_samples_per_second": 3.131, + "eval_steps_per_second": 0.196, + "step": 1300 + }, + { + "epoch": 0.6, + "learning_rate": 2.6686960933536276e-07, + "logits/chosen": -0.4039461612701416, + "logits/rejected": -0.43267756700515747, + "logps/chosen": -115.5996322631836, + "logps/rejected": -95.9083023071289, + "loss": 0.2591, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.894240617752075, + "rewards/margins": 2.9390251636505127, + "rewards/rejected": -0.04478471353650093, + "step": 1310 + }, + { + "epoch": 0.6, + "learning_rate": 2.663622526636225e-07, + "logits/chosen": -0.39787545800209045, + "logits/rejected": -0.4207298755645752, + "logps/chosen": -112.7799301147461, + "logps/rejected": -93.30327606201172, + "loss": 0.2804, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 2.9737071990966797, + "rewards/margins": 2.6691317558288574, + "rewards/rejected": 0.3045758605003357, + "step": 1320 + }, + { + "epoch": 0.61, + "learning_rate": 2.658548959918823e-07, + "logits/chosen": -0.36739581823349, + "logits/rejected": -0.4091414511203766, + "logps/chosen": -130.01556396484375, + "logps/rejected": -96.4112548828125, + "loss": 0.2588, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.783113956451416, + "rewards/margins": 3.4915924072265625, + "rewards/rejected": 0.2915215492248535, + "step": 1330 + }, + { + "epoch": 0.61, + "learning_rate": 2.6534753932014203e-07, + "logits/chosen": -0.37962180376052856, + "logits/rejected": -0.4211583733558655, + "logps/chosen": -114.8764419555664, + "logps/rejected": -93.41658020019531, + "loss": 0.2598, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.3292477130889893, + "rewards/margins": 3.3758292198181152, + "rewards/rejected": -0.04658186435699463, + "step": 1340 + }, + { + "epoch": 0.62, + "learning_rate": 2.648401826484018e-07, + "logits/chosen": -0.3829995393753052, + "logits/rejected": -0.43469125032424927, + "logps/chosen": -126.12850189208984, + "logps/rejected": -92.5066146850586, + "loss": 0.251, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.180144786834717, + "rewards/margins": 4.059059143066406, + "rewards/rejected": 0.12108540534973145, + "step": 1350 + }, + { + "epoch": 0.62, + "learning_rate": 2.6433282597666156e-07, + "logits/chosen": -0.39498692750930786, + "logits/rejected": -0.4442169666290283, + "logps/chosen": -124.14341735839844, + "logps/rejected": -90.19200134277344, + "loss": 0.2411, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 4.100016117095947, + "rewards/margins": 4.147299289703369, + "rewards/rejected": -0.04728341102600098, + "step": 1360 + }, + { + "epoch": 0.63, + "learning_rate": 2.6382546930492135e-07, + "logits/chosen": -0.4066733419895172, + "logits/rejected": -0.44404226541519165, + "logps/chosen": -115.19721984863281, + "logps/rejected": -93.01029968261719, + "loss": 0.2586, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 3.18514084815979, + "rewards/margins": 3.501539707183838, + "rewards/rejected": -0.3163986802101135, + "step": 1370 + }, + { + "epoch": 0.63, + "learning_rate": 2.633181126331811e-07, + "logits/chosen": -0.38769611716270447, + "logits/rejected": -0.4122231900691986, + "logps/chosen": -112.17032623291016, + "logps/rejected": -91.19657135009766, + "loss": 0.2633, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.9161415100097656, + "rewards/margins": 3.0066184997558594, + "rewards/rejected": -0.09047670662403107, + "step": 1380 + }, + { + "epoch": 0.63, + "learning_rate": 2.628107559614409e-07, + "logits/chosen": -0.38696232438087463, + "logits/rejected": -0.4295097291469574, + "logps/chosen": -122.4570541381836, + "logps/rejected": -92.13167572021484, + "loss": 0.2477, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 3.494009494781494, + "rewards/margins": 3.7299437522888184, + "rewards/rejected": -0.23593413829803467, + "step": 1390 + }, + { + "epoch": 0.64, + "learning_rate": 2.6230339928970063e-07, + "logits/chosen": -0.42557573318481445, + "logits/rejected": -0.46044450998306274, + "logps/chosen": -113.8060073852539, + "logps/rejected": -92.50498962402344, + "loss": 0.2316, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.077052354812622, + "rewards/margins": 3.4654159545898438, + "rewards/rejected": -0.3883635103702545, + "step": 1400 + }, + { + "epoch": 0.64, + "eval_logits/chosen": -0.3868881165981293, + "eval_logits/rejected": -0.42342227697372437, + "eval_logps/chosen": -116.44770050048828, + "eval_logps/rejected": -91.5164794921875, + "eval_loss": 0.24913275241851807, + "eval_rewards/accuracies": 0.8854748606681824, + "eval_rewards/chosen": 3.3560283184051514, + "eval_rewards/margins": 3.649433135986328, + "eval_rewards/rejected": -0.2934047281742096, + "eval_runtime": 827.5196, + "eval_samples_per_second": 3.459, + "eval_steps_per_second": 0.216, + "step": 1400 + }, + { + "epoch": 0.64, + "learning_rate": 2.617960426179604e-07, + "logits/chosen": -0.3558892607688904, + "logits/rejected": -0.3981640934944153, + "logps/chosen": -122.8342514038086, + "logps/rejected": -95.00323486328125, + "loss": 0.2715, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.5448246002197266, + "rewards/margins": 3.403717041015625, + "rewards/rejected": 0.14110735058784485, + "step": 1410 + }, + { + "epoch": 0.65, + "learning_rate": 2.6128868594622016e-07, + "logits/chosen": -0.3855481445789337, + "logits/rejected": -0.42138853669166565, + "logps/chosen": -114.02071380615234, + "logps/rejected": -91.90477752685547, + "loss": 0.2708, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.9640955924987793, + "rewards/margins": 2.885097026824951, + "rewards/rejected": 0.07899872958660126, + "step": 1420 + }, + { + "epoch": 0.65, + "learning_rate": 2.6078132927447995e-07, + "logits/chosen": -0.3998476564884186, + "logits/rejected": -0.44831499457359314, + "logps/chosen": -115.67866516113281, + "logps/rejected": -90.3123779296875, + "loss": 0.2185, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.520200729370117, + "rewards/margins": 3.7672977447509766, + "rewards/rejected": -0.2470969408750534, + "step": 1430 + }, + { + "epoch": 0.66, + "learning_rate": 2.602739726027397e-07, + "logits/chosen": -0.3800181746482849, + "logits/rejected": -0.42868027091026306, + "logps/chosen": -119.03788757324219, + "logps/rejected": -92.73046112060547, + "loss": 0.2751, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.975376844406128, + "rewards/margins": 3.5816245079040527, + "rewards/rejected": -0.6062482595443726, + "step": 1440 + }, + { + "epoch": 0.66, + "learning_rate": 2.597666159309995e-07, + "logits/chosen": -0.4055546820163727, + "logits/rejected": -0.44052910804748535, + "logps/chosen": -119.14704895019531, + "logps/rejected": -92.04376220703125, + "loss": 0.2639, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 3.3340554237365723, + "rewards/margins": 3.4932751655578613, + "rewards/rejected": -0.15921981632709503, + "step": 1450 + }, + { + "epoch": 0.67, + "learning_rate": 2.5925925925925923e-07, + "logits/chosen": -0.3804454803466797, + "logits/rejected": -0.4156479835510254, + "logps/chosen": -118.11918640136719, + "logps/rejected": -90.77540588378906, + "loss": 0.2421, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 3.448434352874756, + "rewards/margins": 3.3643860816955566, + "rewards/rejected": 0.08404884487390518, + "step": 1460 + }, + { + "epoch": 0.67, + "learning_rate": 2.58751902587519e-07, + "logits/chosen": -0.36997154355049133, + "logits/rejected": -0.42205578088760376, + "logps/chosen": -124.9430160522461, + "logps/rejected": -92.24369812011719, + "loss": 0.2817, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.5387015342712402, + "rewards/margins": 4.0014214515686035, + "rewards/rejected": -0.4627200961112976, + "step": 1470 + }, + { + "epoch": 0.68, + "learning_rate": 2.5824454591577876e-07, + "logits/chosen": -0.38390421867370605, + "logits/rejected": -0.4294559359550476, + "logps/chosen": -117.3547592163086, + "logps/rejected": -87.96221160888672, + "loss": 0.2619, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.335505723953247, + "rewards/margins": 3.4297802448272705, + "rewards/rejected": -0.09427466243505478, + "step": 1480 + }, + { + "epoch": 0.68, + "learning_rate": 2.5773718924403855e-07, + "logits/chosen": -0.38687005639076233, + "logits/rejected": -0.42344173789024353, + "logps/chosen": -115.8824462890625, + "logps/rejected": -90.14027404785156, + "loss": 0.2513, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.5795180797576904, + "rewards/margins": 3.676868438720703, + "rewards/rejected": -0.09735004603862762, + "step": 1490 + }, + { + "epoch": 0.68, + "learning_rate": 2.572298325722983e-07, + "logits/chosen": -0.4296097159385681, + "logits/rejected": -0.4675825536251068, + "logps/chosen": -124.1937484741211, + "logps/rejected": -94.04402160644531, + "loss": 0.2344, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.2074170112609863, + "rewards/margins": 3.9851672649383545, + "rewards/rejected": -0.7777503728866577, + "step": 1500 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -0.4151133894920349, + "eval_logits/rejected": -0.45148965716362, + "eval_logps/chosen": -116.71522521972656, + "eval_logps/rejected": -91.37801361083984, + "eval_loss": 0.2505827844142914, + "eval_rewards/accuracies": 0.8687151074409485, + "eval_rewards/chosen": 3.222259044647217, + "eval_rewards/margins": 3.446425676345825, + "eval_rewards/rejected": -0.2241670787334442, + "eval_runtime": 916.2433, + "eval_samples_per_second": 3.124, + "eval_steps_per_second": 0.195, + "step": 1500 + }, + { + "epoch": 0.69, + "learning_rate": 2.567224759005581e-07, + "logits/chosen": -0.4068973660469055, + "logits/rejected": -0.44249239563941956, + "logps/chosen": -121.63665771484375, + "logps/rejected": -89.6568603515625, + "loss": 0.2302, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.7271816730499268, + "rewards/margins": 4.20591926574707, + "rewards/rejected": -0.4787377715110779, + "step": 1510 + }, + { + "epoch": 0.69, + "learning_rate": 2.5621511922881783e-07, + "logits/chosen": -0.3942483365535736, + "logits/rejected": -0.4460867941379547, + "logps/chosen": -119.01606750488281, + "logps/rejected": -93.5682601928711, + "loss": 0.2069, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.9261646270751953, + "rewards/margins": 4.379087924957275, + "rewards/rejected": -0.45292338728904724, + "step": 1520 + }, + { + "epoch": 0.7, + "learning_rate": 2.557077625570776e-07, + "logits/chosen": -0.39618119597435, + "logits/rejected": -0.4257192611694336, + "logps/chosen": -113.54156494140625, + "logps/rejected": -94.97087097167969, + "loss": 0.2673, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.967313289642334, + "rewards/margins": 3.0766537189483643, + "rewards/rejected": -0.10934066772460938, + "step": 1530 + }, + { + "epoch": 0.7, + "learning_rate": 2.5520040588533736e-07, + "logits/chosen": -0.39150649309158325, + "logits/rejected": -0.4350226819515228, + "logps/chosen": -123.06941986083984, + "logps/rejected": -90.06202697753906, + "loss": 0.2895, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.420095920562744, + "rewards/margins": 3.699439287185669, + "rewards/rejected": -0.27934330701828003, + "step": 1540 + }, + { + "epoch": 0.71, + "learning_rate": 2.5469304921359715e-07, + "logits/chosen": -0.3911879062652588, + "logits/rejected": -0.4290149211883545, + "logps/chosen": -116.4568099975586, + "logps/rejected": -95.0242691040039, + "loss": 0.2433, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.3553338050842285, + "rewards/margins": 3.6205520629882812, + "rewards/rejected": -0.26521843671798706, + "step": 1550 + }, + { + "epoch": 0.71, + "learning_rate": 2.541856925418569e-07, + "logits/chosen": -0.4253864288330078, + "logits/rejected": -0.4458925127983093, + "logps/chosen": -116.5554428100586, + "logps/rejected": -93.46299743652344, + "loss": 0.2617, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 3.091580867767334, + "rewards/margins": 3.3575541973114014, + "rewards/rejected": -0.2659732699394226, + "step": 1560 + }, + { + "epoch": 0.72, + "learning_rate": 2.536783358701167e-07, + "logits/chosen": -0.40082424879074097, + "logits/rejected": -0.43754005432128906, + "logps/chosen": -120.23848724365234, + "logps/rejected": -91.8831558227539, + "loss": 0.269, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 3.5315544605255127, + "rewards/margins": 3.7128093242645264, + "rewards/rejected": -0.1812548190355301, + "step": 1570 + }, + { + "epoch": 0.72, + "learning_rate": 2.5317097919837643e-07, + "logits/chosen": -0.4118991792201996, + "logits/rejected": -0.4498567581176758, + "logps/chosen": -116.05540466308594, + "logps/rejected": -98.47431945800781, + "loss": 0.2344, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 2.790273904800415, + "rewards/margins": 3.317415237426758, + "rewards/rejected": -0.5271413922309875, + "step": 1580 + }, + { + "epoch": 0.73, + "learning_rate": 2.526636225266362e-07, + "logits/chosen": -0.37646594643592834, + "logits/rejected": -0.42422208189964294, + "logps/chosen": -117.7269515991211, + "logps/rejected": -96.44020080566406, + "loss": 0.2108, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.832566738128662, + "rewards/margins": 4.462157726287842, + "rewards/rejected": -0.6295905113220215, + "step": 1590 + }, + { + "epoch": 0.73, + "learning_rate": 2.5215626585489596e-07, + "logits/chosen": -0.3981241285800934, + "logits/rejected": -0.43437424302101135, + "logps/chosen": -109.82353210449219, + "logps/rejected": -89.00421142578125, + "loss": 0.2332, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.4090182781219482, + "rewards/margins": 3.632111072540283, + "rewards/rejected": -0.22309276461601257, + "step": 1600 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -0.3935575485229492, + "eval_logits/rejected": -0.4299449324607849, + "eval_logps/chosen": -116.73237609863281, + "eval_logps/rejected": -91.7436294555664, + "eval_loss": 0.2350289225578308, + "eval_rewards/accuracies": 0.8854748606681824, + "eval_rewards/chosen": 3.2136857509613037, + "eval_rewards/margins": 3.6206626892089844, + "eval_rewards/rejected": -0.4069768786430359, + "eval_runtime": 902.2925, + "eval_samples_per_second": 3.172, + "eval_steps_per_second": 0.198, + "step": 1600 + }, + { + "epoch": 0.73, + "learning_rate": 2.5164890918315575e-07, + "logits/chosen": -0.41735535860061646, + "logits/rejected": -0.44540077447891235, + "logps/chosen": -116.6932144165039, + "logps/rejected": -92.57784271240234, + "loss": 0.2426, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.7736692428588867, + "rewards/margins": 3.2737278938293457, + "rewards/rejected": -0.5000584125518799, + "step": 1610 + }, + { + "epoch": 0.74, + "learning_rate": 2.511415525114155e-07, + "logits/chosen": -0.37435075640678406, + "logits/rejected": -0.41276389360427856, + "logps/chosen": -115.96415710449219, + "logps/rejected": -92.0928726196289, + "loss": 0.2427, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 3.167051315307617, + "rewards/margins": 3.515188217163086, + "rewards/rejected": -0.3481367230415344, + "step": 1620 + }, + { + "epoch": 0.74, + "learning_rate": 2.506341958396753e-07, + "logits/chosen": -0.3907029628753662, + "logits/rejected": -0.4280622601509094, + "logps/chosen": -120.83988952636719, + "logps/rejected": -92.36735534667969, + "loss": 0.2151, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.1539828777313232, + "rewards/margins": 4.034056186676025, + "rewards/rejected": -0.880073070526123, + "step": 1630 + }, + { + "epoch": 0.75, + "learning_rate": 2.5012683916793503e-07, + "logits/chosen": -0.3848786950111389, + "logits/rejected": -0.4281742572784424, + "logps/chosen": -126.47706604003906, + "logps/rejected": -93.86870574951172, + "loss": 0.2415, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.9007632732391357, + "rewards/margins": 4.612415313720703, + "rewards/rejected": -0.7116523385047913, + "step": 1640 + }, + { + "epoch": 0.75, + "learning_rate": 2.496194824961948e-07, + "logits/chosen": -0.42201462388038635, + "logits/rejected": -0.44740739464759827, + "logps/chosen": -116.1539535522461, + "logps/rejected": -93.58564758300781, + "loss": 0.2175, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.071413516998291, + "rewards/margins": 3.6266441345214844, + "rewards/rejected": -0.5552308559417725, + "step": 1650 + }, + { + "epoch": 0.76, + "learning_rate": 2.4911212582445456e-07, + "logits/chosen": -0.3866708278656006, + "logits/rejected": -0.4200906753540039, + "logps/chosen": -115.0282211303711, + "logps/rejected": -95.74185180664062, + "loss": 0.2136, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.5832972526550293, + "rewards/margins": 3.8258235454559326, + "rewards/rejected": -0.24252581596374512, + "step": 1660 + }, + { + "epoch": 0.76, + "learning_rate": 2.4860476915271435e-07, + "logits/chosen": -0.39738452434539795, + "logits/rejected": -0.43101343512535095, + "logps/chosen": -118.44441223144531, + "logps/rejected": -95.55250549316406, + "loss": 0.2669, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 3.5531890392303467, + "rewards/margins": 3.563720226287842, + "rewards/rejected": -0.010531162843108177, + "step": 1670 + }, + { + "epoch": 0.77, + "learning_rate": 2.480974124809741e-07, + "logits/chosen": -0.3905322253704071, + "logits/rejected": -0.4394722580909729, + "logps/chosen": -127.36441802978516, + "logps/rejected": -97.71236419677734, + "loss": 0.2488, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.985499858856201, + "rewards/margins": 4.582793712615967, + "rewards/rejected": -0.5972935557365417, + "step": 1680 + }, + { + "epoch": 0.77, + "learning_rate": 2.475900558092339e-07, + "logits/chosen": -0.36849355697631836, + "logits/rejected": -0.40935856103897095, + "logps/chosen": -119.0000991821289, + "logps/rejected": -89.49138641357422, + "loss": 0.2409, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.6050148010253906, + "rewards/margins": 4.3091864585876465, + "rewards/rejected": -0.7041717767715454, + "step": 1690 + }, + { + "epoch": 0.78, + "learning_rate": 2.4708269913749363e-07, + "logits/chosen": -0.37423986196517944, + "logits/rejected": -0.4061339497566223, + "logps/chosen": -122.22186279296875, + "logps/rejected": -95.46275329589844, + "loss": 0.2258, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.297147274017334, + "rewards/margins": 4.038428783416748, + "rewards/rejected": -0.7412816882133484, + "step": 1700 + }, + { + "epoch": 0.78, + "eval_logits/chosen": -0.3959788680076599, + "eval_logits/rejected": -0.43158969283103943, + "eval_logps/chosen": -116.98088073730469, + "eval_logps/rejected": -92.04763793945312, + "eval_loss": 0.24770790338516235, + "eval_rewards/accuracies": 0.8938547372817993, + "eval_rewards/chosen": 3.0894315242767334, + "eval_rewards/margins": 3.6484169960021973, + "eval_rewards/rejected": -0.5589855313301086, + "eval_runtime": 874.9831, + "eval_samples_per_second": 3.271, + "eval_steps_per_second": 0.205, + "step": 1700 + }, + { + "epoch": 0.78, + "learning_rate": 2.465753424657534e-07, + "logits/chosen": -0.4236866533756256, + "logits/rejected": -0.46107596158981323, + "logps/chosen": -116.57310485839844, + "logps/rejected": -92.78831481933594, + "loss": 0.2263, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.3795197010040283, + "rewards/margins": 4.285675525665283, + "rewards/rejected": -0.9061552286148071, + "step": 1710 + }, + { + "epoch": 0.79, + "learning_rate": 2.4606798579401316e-07, + "logits/chosen": -0.3939421474933624, + "logits/rejected": -0.4401417374610901, + "logps/chosen": -123.36474609375, + "logps/rejected": -96.49834442138672, + "loss": 0.2373, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.518620014190674, + "rewards/margins": 4.276294708251953, + "rewards/rejected": -0.7576751708984375, + "step": 1720 + }, + { + "epoch": 0.79, + "learning_rate": 2.4556062912227295e-07, + "logits/chosen": -0.4078288674354553, + "logits/rejected": -0.4479657709598541, + "logps/chosen": -120.38444519042969, + "logps/rejected": -90.84424591064453, + "loss": 0.1966, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.933053970336914, + "rewards/margins": 3.6727993488311768, + "rewards/rejected": -0.739745020866394, + "step": 1730 + }, + { + "epoch": 0.79, + "learning_rate": 2.450532724505327e-07, + "logits/chosen": -0.4132927358150482, + "logits/rejected": -0.4574710428714752, + "logps/chosen": -123.63642883300781, + "logps/rejected": -93.76477813720703, + "loss": 0.2602, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.631747007369995, + "rewards/margins": 4.368203639984131, + "rewards/rejected": -0.736456573009491, + "step": 1740 + }, + { + "epoch": 0.8, + "learning_rate": 2.445459157787925e-07, + "logits/chosen": -0.39394044876098633, + "logits/rejected": -0.43226075172424316, + "logps/chosen": -117.86421966552734, + "logps/rejected": -90.55984497070312, + "loss": 0.243, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.801774263381958, + "rewards/margins": 4.736522674560547, + "rewards/rejected": -0.9347483515739441, + "step": 1750 + }, + { + "epoch": 0.8, + "learning_rate": 2.4403855910705223e-07, + "logits/chosen": -0.4240695536136627, + "logits/rejected": -0.45410633087158203, + "logps/chosen": -115.75899505615234, + "logps/rejected": -93.46293640136719, + "loss": 0.2529, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 3.036893129348755, + "rewards/margins": 3.4991378784179688, + "rewards/rejected": -0.46224460005760193, + "step": 1760 + }, + { + "epoch": 0.81, + "learning_rate": 2.43531202435312e-07, + "logits/chosen": -0.4136069715023041, + "logits/rejected": -0.4493675231933594, + "logps/chosen": -124.36299133300781, + "logps/rejected": -90.94700622558594, + "loss": 0.2342, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.2333157062530518, + "rewards/margins": 3.8581371307373047, + "rewards/rejected": -0.6248215436935425, + "step": 1770 + }, + { + "epoch": 0.81, + "learning_rate": 2.4302384576357176e-07, + "logits/chosen": -0.40225839614868164, + "logits/rejected": -0.4295947551727295, + "logps/chosen": -123.95500183105469, + "logps/rejected": -103.24928283691406, + "loss": 0.2232, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 2.355786085128784, + "rewards/margins": 3.5081162452697754, + "rewards/rejected": -1.1523301601409912, + "step": 1780 + }, + { + "epoch": 0.82, + "learning_rate": 2.4251648909183155e-07, + "logits/chosen": -0.4294680953025818, + "logits/rejected": -0.4623151421546936, + "logps/chosen": -121.99348449707031, + "logps/rejected": -96.97917175292969, + "loss": 0.2182, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.0072503089904785, + "rewards/margins": 4.567361354827881, + "rewards/rejected": -0.5601118206977844, + "step": 1790 + }, + { + "epoch": 0.82, + "learning_rate": 2.420091324200913e-07, + "logits/chosen": -0.42369261384010315, + "logits/rejected": -0.45343533158302307, + "logps/chosen": -121.0447006225586, + "logps/rejected": -98.198486328125, + "loss": 0.2526, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 2.7009620666503906, + "rewards/margins": 3.142019510269165, + "rewards/rejected": -0.44105762243270874, + "step": 1800 + }, + { + "epoch": 0.82, + "eval_logits/chosen": -0.4075545072555542, + "eval_logits/rejected": -0.4420177936553955, + "eval_logps/chosen": -116.59071350097656, + "eval_logps/rejected": -92.0351333618164, + "eval_loss": 0.22767914831638336, + "eval_rewards/accuracies": 0.8770949840545654, + "eval_rewards/chosen": 3.2845191955566406, + "eval_rewards/margins": 3.8372511863708496, + "eval_rewards/rejected": -0.5527323484420776, + "eval_runtime": 879.6455, + "eval_samples_per_second": 3.254, + "eval_steps_per_second": 0.203, + "step": 1800 + }, + { + "epoch": 0.83, + "learning_rate": 2.415017757483511e-07, + "logits/chosen": -0.3963635563850403, + "logits/rejected": -0.4376469552516937, + "logps/chosen": -120.19471740722656, + "logps/rejected": -97.24011993408203, + "loss": 0.219, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.7244651317596436, + "rewards/margins": 4.514935493469238, + "rewards/rejected": -0.7904707193374634, + "step": 1810 + }, + { + "epoch": 0.83, + "learning_rate": 2.409944190766108e-07, + "logits/chosen": -0.43789929151535034, + "logits/rejected": -0.4696916937828064, + "logps/chosen": -121.25593566894531, + "logps/rejected": -96.01130676269531, + "loss": 0.2236, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.057839870452881, + "rewards/margins": 3.6217963695526123, + "rewards/rejected": -0.5639564394950867, + "step": 1820 + }, + { + "epoch": 0.84, + "learning_rate": 2.404870624048706e-07, + "logits/chosen": -0.4191747307777405, + "logits/rejected": -0.45385512709617615, + "logps/chosen": -117.59214782714844, + "logps/rejected": -94.40740203857422, + "loss": 0.2352, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.6873555183410645, + "rewards/margins": 4.0088911056518555, + "rewards/rejected": -0.32153594493865967, + "step": 1830 + }, + { + "epoch": 0.84, + "learning_rate": 2.3997970573313036e-07, + "logits/chosen": -0.38190537691116333, + "logits/rejected": -0.4171864986419678, + "logps/chosen": -120.65101623535156, + "logps/rejected": -91.7802963256836, + "loss": 0.2291, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 2.9036548137664795, + "rewards/margins": 3.2020695209503174, + "rewards/rejected": -0.2984146475791931, + "step": 1840 + }, + { + "epoch": 0.84, + "learning_rate": 2.3947234906139015e-07, + "logits/chosen": -0.41627272963523865, + "logits/rejected": -0.45198947191238403, + "logps/chosen": -121.30501556396484, + "logps/rejected": -92.88670349121094, + "loss": 0.2236, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.955059766769409, + "rewards/margins": 4.721311569213867, + "rewards/rejected": -0.7662516832351685, + "step": 1850 + }, + { + "epoch": 0.85, + "learning_rate": 2.389649923896499e-07, + "logits/chosen": -0.3740800619125366, + "logits/rejected": -0.4138231873512268, + "logps/chosen": -114.4834976196289, + "logps/rejected": -92.96529388427734, + "loss": 0.2118, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.1827633380889893, + "rewards/margins": 4.073010444641113, + "rewards/rejected": -0.8902471661567688, + "step": 1860 + }, + { + "epoch": 0.85, + "learning_rate": 2.384576357179097e-07, + "logits/chosen": -0.41853219270706177, + "logits/rejected": -0.4531814455986023, + "logps/chosen": -117.73091125488281, + "logps/rejected": -94.66040802001953, + "loss": 0.2155, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 3.427842617034912, + "rewards/margins": 4.3185224533081055, + "rewards/rejected": -0.8906799554824829, + "step": 1870 + }, + { + "epoch": 0.86, + "learning_rate": 2.3795027904616943e-07, + "logits/chosen": -0.40005069971084595, + "logits/rejected": -0.433000385761261, + "logps/chosen": -120.5252685546875, + "logps/rejected": -94.85150909423828, + "loss": 0.2294, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.2125244140625, + "rewards/margins": 4.113189697265625, + "rewards/rejected": -0.9006654620170593, + "step": 1880 + }, + { + "epoch": 0.86, + "learning_rate": 2.374429223744292e-07, + "logits/chosen": -0.3957889676094055, + "logits/rejected": -0.424477756023407, + "logps/chosen": -116.66868591308594, + "logps/rejected": -92.9976577758789, + "loss": 0.254, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 2.8800389766693115, + "rewards/margins": 3.438807249069214, + "rewards/rejected": -0.5587679743766785, + "step": 1890 + }, + { + "epoch": 0.87, + "learning_rate": 2.3693556570268896e-07, + "logits/chosen": -0.38361310958862305, + "logits/rejected": -0.41146165132522583, + "logps/chosen": -117.5991439819336, + "logps/rejected": -99.95137786865234, + "loss": 0.2025, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.4426445960998535, + "rewards/margins": 4.4284892082214355, + "rewards/rejected": -0.9858444333076477, + "step": 1900 + }, + { + "epoch": 0.87, + "eval_logits/chosen": -0.3973536193370819, + "eval_logits/rejected": -0.43190258741378784, + "eval_logps/chosen": -116.74763488769531, + "eval_logps/rejected": -92.54963684082031, + "eval_loss": 0.2182014286518097, + "eval_rewards/accuracies": 0.9022346138954163, + "eval_rewards/chosen": 3.2060580253601074, + "eval_rewards/margins": 4.016037940979004, + "eval_rewards/rejected": -0.8099795579910278, + "eval_runtime": 875.3985, + "eval_samples_per_second": 3.269, + "eval_steps_per_second": 0.204, + "step": 1900 + }, + { + "epoch": 0.87, + "learning_rate": 2.3642820903094873e-07, + "logits/chosen": -0.40541213750839233, + "logits/rejected": -0.44309115409851074, + "logps/chosen": -126.1652603149414, + "logps/rejected": -96.5681381225586, + "loss": 0.2123, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.576422929763794, + "rewards/margins": 4.267624855041504, + "rewards/rejected": -0.6912025809288025, + "step": 1910 + }, + { + "epoch": 0.88, + "learning_rate": 2.359208523592085e-07, + "logits/chosen": -0.39431583881378174, + "logits/rejected": -0.42382940649986267, + "logps/chosen": -116.77348327636719, + "logps/rejected": -91.7231216430664, + "loss": 0.2451, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.1586005687713623, + "rewards/margins": 3.866903305053711, + "rewards/rejected": -0.7083033323287964, + "step": 1920 + }, + { + "epoch": 0.88, + "learning_rate": 2.3541349568746826e-07, + "logits/chosen": -0.40828245878219604, + "logits/rejected": -0.4404390752315521, + "logps/chosen": -118.46229553222656, + "logps/rejected": -90.4027328491211, + "loss": 0.2174, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.344696044921875, + "rewards/margins": 4.355304718017578, + "rewards/rejected": -1.0106089115142822, + "step": 1930 + }, + { + "epoch": 0.89, + "learning_rate": 2.3490613901572803e-07, + "logits/chosen": -0.41974538564682007, + "logits/rejected": -0.45281344652175903, + "logps/chosen": -112.36436462402344, + "logps/rejected": -94.63860321044922, + "loss": 0.2256, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.5663247108459473, + "rewards/margins": 3.390653133392334, + "rewards/rejected": -0.8243284225463867, + "step": 1940 + }, + { + "epoch": 0.89, + "learning_rate": 2.343987823439878e-07, + "logits/chosen": -0.41871196031570435, + "logits/rejected": -0.4626835286617279, + "logps/chosen": -118.2319564819336, + "logps/rejected": -91.87098693847656, + "loss": 0.2033, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.2902045249938965, + "rewards/margins": 4.170986652374268, + "rewards/rejected": -0.8807823061943054, + "step": 1950 + }, + { + "epoch": 0.89, + "learning_rate": 2.3389142567224756e-07, + "logits/chosen": -0.41229549050331116, + "logits/rejected": -0.44218960404396057, + "logps/chosen": -118.4122543334961, + "logps/rejected": -99.02762603759766, + "loss": 0.2217, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.3712990283966064, + "rewards/margins": 4.0574212074279785, + "rewards/rejected": -0.6861221194267273, + "step": 1960 + }, + { + "epoch": 0.9, + "learning_rate": 2.3338406900050733e-07, + "logits/chosen": -0.4184509217739105, + "logits/rejected": -0.4491788446903229, + "logps/chosen": -124.60710144042969, + "logps/rejected": -96.12918090820312, + "loss": 0.1937, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.895785093307495, + "rewards/margins": 4.719235420227051, + "rewards/rejected": -0.8234502077102661, + "step": 1970 + }, + { + "epoch": 0.9, + "learning_rate": 2.328767123287671e-07, + "logits/chosen": -0.4528725743293762, + "logits/rejected": -0.4842372536659241, + "logps/chosen": -124.16194152832031, + "logps/rejected": -97.17805480957031, + "loss": 0.2338, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 3.83451771736145, + "rewards/margins": 4.723608016967773, + "rewards/rejected": -0.8890898823738098, + "step": 1980 + }, + { + "epoch": 0.91, + "learning_rate": 2.3236935565702686e-07, + "logits/chosen": -0.3931097388267517, + "logits/rejected": -0.43126392364501953, + "logps/chosen": -119.52880859375, + "logps/rejected": -94.76363372802734, + "loss": 0.2115, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.683422565460205, + "rewards/margins": 4.176725387573242, + "rewards/rejected": -0.4933028221130371, + "step": 1990 + }, + { + "epoch": 0.91, + "learning_rate": 2.3186199898528663e-07, + "logits/chosen": -0.40987688302993774, + "logits/rejected": -0.4448050856590271, + "logps/chosen": -117.04490661621094, + "logps/rejected": -95.28965759277344, + "loss": 0.2253, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 3.0975899696350098, + "rewards/margins": 3.8567593097686768, + "rewards/rejected": -0.7591692209243774, + "step": 2000 + }, + { + "epoch": 0.91, + "eval_logits/chosen": -0.4023212492465973, + "eval_logits/rejected": -0.4390886723995209, + "eval_logps/chosen": -116.60671997070312, + "eval_logps/rejected": -92.88094329833984, + "eval_loss": 0.2149447649717331, + "eval_rewards/accuracies": 0.9078212380409241, + "eval_rewards/chosen": 3.2765114307403564, + "eval_rewards/margins": 4.2521491050720215, + "eval_rewards/rejected": -0.975637674331665, + "eval_runtime": 907.6422, + "eval_samples_per_second": 3.153, + "eval_steps_per_second": 0.197, + "step": 2000 + }, + { + "epoch": 0.92, + "learning_rate": 2.313546423135464e-07, + "logits/chosen": -0.3986409902572632, + "logits/rejected": -0.43848711252212524, + "logps/chosen": -123.74125671386719, + "logps/rejected": -92.8211669921875, + "loss": 0.2196, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.1617798805236816, + "rewards/margins": 4.177075386047363, + "rewards/rejected": -1.0152956247329712, + "step": 2010 + }, + { + "epoch": 0.92, + "learning_rate": 2.3084728564180616e-07, + "logits/chosen": -0.4132860600948334, + "logits/rejected": -0.43701285123825073, + "logps/chosen": -115.518798828125, + "logps/rejected": -96.49732208251953, + "loss": 0.2174, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.845722198486328, + "rewards/margins": 3.5684571266174316, + "rewards/rejected": -0.7227347493171692, + "step": 2020 + }, + { + "epoch": 0.93, + "learning_rate": 2.3033992897006593e-07, + "logits/chosen": -0.44160446524620056, + "logits/rejected": -0.4706133008003235, + "logps/chosen": -126.89131164550781, + "logps/rejected": -94.20338439941406, + "loss": 0.2095, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.952413558959961, + "rewards/margins": 5.020051002502441, + "rewards/rejected": -1.0676372051239014, + "step": 2030 + }, + { + "epoch": 0.93, + "learning_rate": 2.298325722983257e-07, + "logits/chosen": -0.40945592522621155, + "logits/rejected": -0.4388090670108795, + "logps/chosen": -124.21551513671875, + "logps/rejected": -95.3039321899414, + "loss": 0.2448, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.4655966758728027, + "rewards/margins": 4.477190971374512, + "rewards/rejected": -1.011594533920288, + "step": 2040 + }, + { + "epoch": 0.94, + "learning_rate": 2.2932521562658546e-07, + "logits/chosen": -0.41212978959083557, + "logits/rejected": -0.4488631784915924, + "logps/chosen": -121.19816589355469, + "logps/rejected": -92.48617553710938, + "loss": 0.2242, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.629365921020508, + "rewards/margins": 4.309345245361328, + "rewards/rejected": -0.6799794435501099, + "step": 2050 + }, + { + "epoch": 0.94, + "learning_rate": 2.2881785895484523e-07, + "logits/chosen": -0.3852509558200836, + "logits/rejected": -0.4215177595615387, + "logps/chosen": -122.44758605957031, + "logps/rejected": -93.50701141357422, + "loss": 0.2282, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.269455671310425, + "rewards/margins": 4.175347328186035, + "rewards/rejected": -0.9058906435966492, + "step": 2060 + }, + { + "epoch": 0.94, + "learning_rate": 2.28310502283105e-07, + "logits/chosen": -0.39701271057128906, + "logits/rejected": -0.42783480882644653, + "logps/chosen": -112.2025375366211, + "logps/rejected": -91.7884292602539, + "loss": 0.2028, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.7778406143188477, + "rewards/margins": 4.39522647857666, + "rewards/rejected": -1.6173861026763916, + "step": 2070 + }, + { + "epoch": 0.95, + "learning_rate": 2.2780314561136476e-07, + "logits/chosen": -0.4007197916507721, + "logits/rejected": -0.4381144940853119, + "logps/chosen": -121.38664245605469, + "logps/rejected": -91.06873321533203, + "loss": 0.1845, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.398432970046997, + "rewards/margins": 4.025904178619385, + "rewards/rejected": -0.6274709701538086, + "step": 2080 + }, + { + "epoch": 0.95, + "learning_rate": 2.2729578893962453e-07, + "logits/chosen": -0.3872908651828766, + "logits/rejected": -0.42537689208984375, + "logps/chosen": -116.7512435913086, + "logps/rejected": -93.22492218017578, + "loss": 0.1976, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.5508179664611816, + "rewards/margins": 4.1629509925842285, + "rewards/rejected": -0.6121331453323364, + "step": 2090 + }, + { + "epoch": 0.96, + "learning_rate": 2.267884322678843e-07, + "logits/chosen": -0.37515270709991455, + "logits/rejected": -0.41785183548927307, + "logps/chosen": -116.22972106933594, + "logps/rejected": -88.31718444824219, + "loss": 0.2084, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 3.261925220489502, + "rewards/margins": 3.7819576263427734, + "rewards/rejected": -0.5200322866439819, + "step": 2100 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -0.3953567445278168, + "eval_logits/rejected": -0.428348183631897, + "eval_logps/chosen": -116.9277114868164, + "eval_logps/rejected": -93.06153869628906, + "eval_loss": 0.22232365608215332, + "eval_rewards/accuracies": 0.8938547372817993, + "eval_rewards/chosen": 3.116020679473877, + "eval_rewards/margins": 4.181955814361572, + "eval_rewards/rejected": -1.0659351348876953, + "eval_runtime": 921.8014, + "eval_samples_per_second": 3.105, + "eval_steps_per_second": 0.194, + "step": 2100 + }, + { + "epoch": 0.96, + "learning_rate": 2.2628107559614406e-07, + "logits/chosen": -0.3972451686859131, + "logits/rejected": -0.429997980594635, + "logps/chosen": -115.76712799072266, + "logps/rejected": -97.56238555908203, + "loss": 0.2216, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4144790172576904, + "rewards/margins": 3.9687142372131348, + "rewards/rejected": -0.5542353987693787, + "step": 2110 + }, + { + "epoch": 0.97, + "learning_rate": 2.2577371892440383e-07, + "logits/chosen": -0.4206092953681946, + "logits/rejected": -0.4511509835720062, + "logps/chosen": -113.78810119628906, + "logps/rejected": -93.98689270019531, + "loss": 0.1785, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.7417354583740234, + "rewards/margins": 4.476273059844971, + "rewards/rejected": -0.734538197517395, + "step": 2120 + }, + { + "epoch": 0.97, + "learning_rate": 2.252663622526636e-07, + "logits/chosen": -0.42873579263687134, + "logits/rejected": -0.4546934962272644, + "logps/chosen": -120.35272216796875, + "logps/rejected": -96.99116516113281, + "loss": 0.2561, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.1441822052001953, + "rewards/margins": 4.1696553230285645, + "rewards/rejected": -1.0254729986190796, + "step": 2130 + }, + { + "epoch": 0.98, + "learning_rate": 2.2475900558092336e-07, + "logits/chosen": -0.4174647331237793, + "logits/rejected": -0.45524922013282776, + "logps/chosen": -113.0732650756836, + "logps/rejected": -93.70841979980469, + "loss": 0.238, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.7171530723571777, + "rewards/margins": 4.351462364196777, + "rewards/rejected": -0.6343096494674683, + "step": 2140 + }, + { + "epoch": 0.98, + "learning_rate": 2.2425164890918313e-07, + "logits/chosen": -0.39059361815452576, + "logits/rejected": -0.4388251304626465, + "logps/chosen": -131.6301727294922, + "logps/rejected": -97.77519989013672, + "loss": 0.2028, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.9817802906036377, + "rewards/margins": 4.451568603515625, + "rewards/rejected": -0.4697890281677246, + "step": 2150 + }, + { + "epoch": 0.99, + "learning_rate": 2.237442922374429e-07, + "logits/chosen": -0.4017879366874695, + "logits/rejected": -0.4366172254085541, + "logps/chosen": -114.01628112792969, + "logps/rejected": -90.86222076416016, + "loss": 0.1971, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.2794747352600098, + "rewards/margins": 3.6212470531463623, + "rewards/rejected": -0.34177258610725403, + "step": 2160 + }, + { + "epoch": 0.99, + "learning_rate": 2.2323693556570266e-07, + "logits/chosen": -0.37799277901649475, + "logits/rejected": -0.4189319610595703, + "logps/chosen": -124.58580017089844, + "logps/rejected": -98.61467742919922, + "loss": 0.1885, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.8187057971954346, + "rewards/margins": 4.701231956481934, + "rewards/rejected": -0.8825258016586304, + "step": 2170 + }, + { + "epoch": 1.0, + "learning_rate": 2.2272957889396242e-07, + "logits/chosen": -0.4155615270137787, + "logits/rejected": -0.4478856921195984, + "logps/chosen": -118.5948715209961, + "logps/rejected": -94.40876770019531, + "loss": 0.1998, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.646564483642578, + "rewards/margins": 4.50596284866333, + "rewards/rejected": -0.8593980073928833, + "step": 2180 + }, + { + "epoch": 1.0, + "learning_rate": 2.222222222222222e-07, + "logits/chosen": -0.4222278594970703, + "logits/rejected": -0.44699034094810486, + "logps/chosen": -118.62066650390625, + "logps/rejected": -90.86624908447266, + "loss": 0.1933, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.708111524581909, + "rewards/margins": 3.947237491607666, + "rewards/rejected": -1.239126205444336, + "step": 2190 + }, + { + "epoch": 1.0, + "learning_rate": 2.2171486555048196e-07, + "logits/chosen": -0.3967632055282593, + "logits/rejected": -0.43386539816856384, + "logps/chosen": -127.67298889160156, + "logps/rejected": -100.56233978271484, + "loss": 0.1896, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.8950390815734863, + "rewards/margins": 5.041460990905762, + "rewards/rejected": -1.146422266960144, + "step": 2200 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -0.4154352843761444, + "eval_logits/rejected": -0.45174553990364075, + "eval_logps/chosen": -116.79273223876953, + "eval_logps/rejected": -92.95588684082031, + "eval_loss": 0.20996138453483582, + "eval_rewards/accuracies": 0.8910614252090454, + "eval_rewards/chosen": 3.183509588241577, + "eval_rewards/margins": 4.196613788604736, + "eval_rewards/rejected": -1.0131044387817383, + "eval_runtime": 896.2038, + "eval_samples_per_second": 3.193, + "eval_steps_per_second": 0.2, + "step": 2200 + }, + { + "epoch": 1.01, + "learning_rate": 2.2120750887874172e-07, + "logits/chosen": -0.381591260433197, + "logits/rejected": -0.4185353219509125, + "logps/chosen": -123.46016693115234, + "logps/rejected": -97.816650390625, + "loss": 0.1978, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.249309539794922, + "rewards/margins": 4.560460090637207, + "rewards/rejected": -1.3111507892608643, + "step": 2210 + }, + { + "epoch": 1.01, + "learning_rate": 2.207001522070015e-07, + "logits/chosen": -0.39903947710990906, + "logits/rejected": -0.43686169385910034, + "logps/chosen": -122.17398834228516, + "logps/rejected": -101.26841735839844, + "loss": 0.1726, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.203260898590088, + "rewards/margins": 3.9990131855010986, + "rewards/rejected": -0.7957520484924316, + "step": 2220 + }, + { + "epoch": 1.02, + "learning_rate": 2.2019279553526126e-07, + "logits/chosen": -0.42441973090171814, + "logits/rejected": -0.4595940113067627, + "logps/chosen": -121.41120910644531, + "logps/rejected": -87.17717742919922, + "loss": 0.1692, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.520594358444214, + "rewards/margins": 4.398536682128906, + "rewards/rejected": -0.8779422044754028, + "step": 2230 + }, + { + "epoch": 1.02, + "learning_rate": 2.1968543886352102e-07, + "logits/chosen": -0.41687169671058655, + "logits/rejected": -0.442725270986557, + "logps/chosen": -113.2309341430664, + "logps/rejected": -93.45161437988281, + "loss": 0.1922, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.1109657287597656, + "rewards/margins": 4.183775424957275, + "rewards/rejected": -1.0728092193603516, + "step": 2240 + }, + { + "epoch": 1.03, + "learning_rate": 2.191780821917808e-07, + "logits/chosen": -0.45746049284935, + "logits/rejected": -0.47277164459228516, + "logps/chosen": -115.0101318359375, + "logps/rejected": -92.96504211425781, + "loss": 0.2032, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.604362726211548, + "rewards/margins": 3.793149948120117, + "rewards/rejected": -1.1887872219085693, + "step": 2250 + }, + { + "epoch": 1.03, + "learning_rate": 2.1867072552004056e-07, + "logits/chosen": -0.4215884208679199, + "logits/rejected": -0.45461931824684143, + "logps/chosen": -119.3857192993164, + "logps/rejected": -94.49113464355469, + "loss": 0.17, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.2537574768066406, + "rewards/margins": 4.495412826538086, + "rewards/rejected": -1.2416555881500244, + "step": 2260 + }, + { + "epoch": 1.04, + "learning_rate": 2.1816336884830032e-07, + "logits/chosen": -0.3961917459964752, + "logits/rejected": -0.4312414526939392, + "logps/chosen": -112.6959457397461, + "logps/rejected": -91.66386413574219, + "loss": 0.213, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5616722106933594, + "rewards/margins": 4.888741970062256, + "rewards/rejected": -1.3270705938339233, + "step": 2270 + }, + { + "epoch": 1.04, + "learning_rate": 2.176560121765601e-07, + "logits/chosen": -0.43959441781044006, + "logits/rejected": -0.472449392080307, + "logps/chosen": -122.6341323852539, + "logps/rejected": -97.61588287353516, + "loss": 0.1952, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.182515621185303, + "rewards/margins": 5.238072395324707, + "rewards/rejected": -1.0555566549301147, + "step": 2280 + }, + { + "epoch": 1.05, + "learning_rate": 2.1714865550481986e-07, + "logits/chosen": -0.4102107882499695, + "logits/rejected": -0.4536592364311218, + "logps/chosen": -120.60931396484375, + "logps/rejected": -92.90963745117188, + "loss": 0.2096, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.7443687915802, + "rewards/margins": 4.461726188659668, + "rewards/rejected": -0.7173573970794678, + "step": 2290 + }, + { + "epoch": 1.05, + "learning_rate": 2.1664129883307962e-07, + "logits/chosen": -0.42524951696395874, + "logits/rejected": -0.4480830729007721, + "logps/chosen": -115.37422943115234, + "logps/rejected": -102.08689880371094, + "loss": 0.2294, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 2.9731292724609375, + "rewards/margins": 4.231186389923096, + "rewards/rejected": -1.2580569982528687, + "step": 2300 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -0.4051222503185272, + "eval_logits/rejected": -0.44120845198631287, + "eval_logps/chosen": -116.918701171875, + "eval_logps/rejected": -93.1042709350586, + "eval_loss": 0.20699380338191986, + "eval_rewards/accuracies": 0.8938547372817993, + "eval_rewards/chosen": 3.120523691177368, + "eval_rewards/margins": 4.207824230194092, + "eval_rewards/rejected": -1.087300419807434, + "eval_runtime": 870.8814, + "eval_samples_per_second": 3.286, + "eval_steps_per_second": 0.206, + "step": 2300 + }, + { + "epoch": 1.05, + "learning_rate": 2.161339421613394e-07, + "logits/chosen": -0.4129010736942291, + "logits/rejected": -0.4455359876155853, + "logps/chosen": -114.3980712890625, + "logps/rejected": -98.12439727783203, + "loss": 0.2081, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.4188950061798096, + "rewards/margins": 4.589454174041748, + "rewards/rejected": -1.170559048652649, + "step": 2310 + }, + { + "epoch": 1.06, + "learning_rate": 2.1562658548959916e-07, + "logits/chosen": -0.4106817841529846, + "logits/rejected": -0.4454409182071686, + "logps/chosen": -124.04959869384766, + "logps/rejected": -92.8015365600586, + "loss": 0.179, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.623793363571167, + "rewards/margins": 4.647304058074951, + "rewards/rejected": -1.0235108137130737, + "step": 2320 + }, + { + "epoch": 1.06, + "learning_rate": 2.1511922881785892e-07, + "logits/chosen": -0.4041675627231598, + "logits/rejected": -0.44240039587020874, + "logps/chosen": -113.25587463378906, + "logps/rejected": -94.64698791503906, + "loss": 0.2092, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.898808002471924, + "rewards/margins": 4.1147894859313965, + "rewards/rejected": -1.2159812450408936, + "step": 2330 + }, + { + "epoch": 1.07, + "learning_rate": 2.146118721461187e-07, + "logits/chosen": -0.43367472290992737, + "logits/rejected": -0.47439584136009216, + "logps/chosen": -122.84818267822266, + "logps/rejected": -97.22991180419922, + "loss": 0.1672, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.218113422393799, + "rewards/margins": 4.060845375061035, + "rewards/rejected": -0.8427319526672363, + "step": 2340 + }, + { + "epoch": 1.07, + "learning_rate": 2.1410451547437846e-07, + "logits/chosen": -0.4216938018798828, + "logits/rejected": -0.4517236649990082, + "logps/chosen": -113.54508209228516, + "logps/rejected": -98.01654052734375, + "loss": 0.1917, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.2716426849365234, + "rewards/margins": 4.241575717926025, + "rewards/rejected": -0.9699331521987915, + "step": 2350 + }, + { + "epoch": 1.08, + "learning_rate": 2.1359715880263822e-07, + "logits/chosen": -0.39088428020477295, + "logits/rejected": -0.4316210150718689, + "logps/chosen": -121.35298919677734, + "logps/rejected": -93.39158630371094, + "loss": 0.2117, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.365229368209839, + "rewards/margins": 3.8260416984558105, + "rewards/rejected": -0.460812509059906, + "step": 2360 + }, + { + "epoch": 1.08, + "learning_rate": 2.13089802130898e-07, + "logits/chosen": -0.43318310379981995, + "logits/rejected": -0.47380322217941284, + "logps/chosen": -113.5303955078125, + "logps/rejected": -95.12970733642578, + "loss": 0.2116, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.399010419845581, + "rewards/margins": 4.622774124145508, + "rewards/rejected": -1.2237637042999268, + "step": 2370 + }, + { + "epoch": 1.09, + "learning_rate": 2.1258244545915776e-07, + "logits/chosen": -0.4225497245788574, + "logits/rejected": -0.45139962434768677, + "logps/chosen": -120.57337951660156, + "logps/rejected": -97.2274398803711, + "loss": 0.206, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.4524986743927, + "rewards/margins": 4.2372307777404785, + "rewards/rejected": -0.7847325801849365, + "step": 2380 + }, + { + "epoch": 1.09, + "learning_rate": 2.1207508878741752e-07, + "logits/chosen": -0.41160812973976135, + "logits/rejected": -0.45344653725624084, + "logps/chosen": -110.57585144042969, + "logps/rejected": -87.49763488769531, + "loss": 0.2314, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 3.0225930213928223, + "rewards/margins": 4.383017539978027, + "rewards/rejected": -1.3604247570037842, + "step": 2390 + }, + { + "epoch": 1.1, + "learning_rate": 2.115677321156773e-07, + "logits/chosen": -0.4245067536830902, + "logits/rejected": -0.4589248597621918, + "logps/chosen": -125.22467041015625, + "logps/rejected": -100.40376281738281, + "loss": 0.1897, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.4412460327148438, + "rewards/margins": 4.521696090698242, + "rewards/rejected": -1.0804498195648193, + "step": 2400 + }, + { + "epoch": 1.1, + "eval_logits/chosen": -0.41355904936790466, + "eval_logits/rejected": -0.44829556345939636, + "eval_logps/chosen": -116.84915924072266, + "eval_logps/rejected": -93.104736328125, + "eval_loss": 0.20109796524047852, + "eval_rewards/accuracies": 0.9050279259681702, + "eval_rewards/chosen": 3.155294179916382, + "eval_rewards/margins": 4.242827415466309, + "eval_rewards/rejected": -1.0875334739685059, + "eval_runtime": 925.0662, + "eval_samples_per_second": 3.094, + "eval_steps_per_second": 0.193, + "step": 2400 + }, + { + "epoch": 1.1, + "learning_rate": 2.1106037544393706e-07, + "logits/chosen": -0.4162351191043854, + "logits/rejected": -0.44722890853881836, + "logps/chosen": -117.66087341308594, + "logps/rejected": -98.16817474365234, + "loss": 0.2215, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.286454439163208, + "rewards/margins": 4.626716136932373, + "rewards/rejected": -1.3402621746063232, + "step": 2410 + }, + { + "epoch": 1.1, + "learning_rate": 2.1055301877219682e-07, + "logits/chosen": -0.3990441560745239, + "logits/rejected": -0.44646185636520386, + "logps/chosen": -127.4264144897461, + "logps/rejected": -94.5553207397461, + "loss": 0.2092, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.721330165863037, + "rewards/margins": 4.82289981842041, + "rewards/rejected": -1.1015697717666626, + "step": 2420 + }, + { + "epoch": 1.11, + "learning_rate": 2.100456621004566e-07, + "logits/chosen": -0.4313054084777832, + "logits/rejected": -0.4540146291255951, + "logps/chosen": -107.0306396484375, + "logps/rejected": -92.35552215576172, + "loss": 0.1953, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.239025115966797, + "rewards/margins": 4.2459869384765625, + "rewards/rejected": -1.0069619417190552, + "step": 2430 + }, + { + "epoch": 1.11, + "learning_rate": 2.0953830542871636e-07, + "logits/chosen": -0.4273985028266907, + "logits/rejected": -0.45803728699684143, + "logps/chosen": -120.7825927734375, + "logps/rejected": -95.52888488769531, + "loss": 0.2017, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.346850633621216, + "rewards/margins": 4.3342814445495605, + "rewards/rejected": -0.9874309301376343, + "step": 2440 + }, + { + "epoch": 1.12, + "learning_rate": 2.0903094875697612e-07, + "logits/chosen": -0.42989325523376465, + "logits/rejected": -0.47014039754867554, + "logps/chosen": -127.97347259521484, + "logps/rejected": -96.18221282958984, + "loss": 0.1925, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.130273342132568, + "rewards/margins": 4.938960075378418, + "rewards/rejected": -0.8086867332458496, + "step": 2450 + }, + { + "epoch": 1.12, + "learning_rate": 2.085235920852359e-07, + "logits/chosen": -0.4441000819206238, + "logits/rejected": -0.47532176971435547, + "logps/chosen": -118.6019058227539, + "logps/rejected": -97.86750793457031, + "loss": 0.1793, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.886178493499756, + "rewards/margins": 4.492079257965088, + "rewards/rejected": -1.605900526046753, + "step": 2460 + }, + { + "epoch": 1.13, + "learning_rate": 2.0801623541349566e-07, + "logits/chosen": -0.39648327231407166, + "logits/rejected": -0.4312874674797058, + "logps/chosen": -118.57756042480469, + "logps/rejected": -95.03660583496094, + "loss": 0.2082, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.6688880920410156, + "rewards/margins": 4.01635217666626, + "rewards/rejected": -0.3474644124507904, + "step": 2470 + }, + { + "epoch": 1.13, + "learning_rate": 2.0750887874175542e-07, + "logits/chosen": -0.42028895020484924, + "logits/rejected": -0.46101123094558716, + "logps/chosen": -120.1390151977539, + "logps/rejected": -102.4979019165039, + "loss": 0.162, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.363370418548584, + "rewards/margins": 5.152722358703613, + "rewards/rejected": -1.7893520593643188, + "step": 2480 + }, + { + "epoch": 1.14, + "learning_rate": 2.070015220700152e-07, + "logits/chosen": -0.4217614531517029, + "logits/rejected": -0.4549930989742279, + "logps/chosen": -120.854736328125, + "logps/rejected": -94.92854309082031, + "loss": 0.1858, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.3437397480010986, + "rewards/margins": 4.921185493469238, + "rewards/rejected": -1.5774452686309814, + "step": 2490 + }, + { + "epoch": 1.14, + "learning_rate": 2.0649416539827496e-07, + "logits/chosen": -0.38463494181632996, + "logits/rejected": -0.42476749420166016, + "logps/chosen": -121.46852111816406, + "logps/rejected": -97.03733825683594, + "loss": 0.1943, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.1613824367523193, + "rewards/margins": 4.206856727600098, + "rewards/rejected": -1.0454740524291992, + "step": 2500 + }, + { + "epoch": 1.14, + "eval_logits/chosen": -0.4137432873249054, + "eval_logits/rejected": -0.4488118886947632, + "eval_logps/chosen": -116.49640655517578, + "eval_logps/rejected": -93.38188171386719, + "eval_loss": 0.1952887624502182, + "eval_rewards/accuracies": 0.9022346138954163, + "eval_rewards/chosen": 3.3316712379455566, + "eval_rewards/margins": 4.5577778816223145, + "eval_rewards/rejected": -1.2261064052581787, + "eval_runtime": 871.869, + "eval_samples_per_second": 3.283, + "eval_steps_per_second": 0.205, + "step": 2500 + }, + { + "epoch": 1.15, + "learning_rate": 2.0598680872653472e-07, + "logits/chosen": -0.4275178015232086, + "logits/rejected": -0.4599391520023346, + "logps/chosen": -121.42420959472656, + "logps/rejected": -96.46556854248047, + "loss": 0.1868, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.031966209411621, + "rewards/margins": 5.205144882202148, + "rewards/rejected": -1.1731784343719482, + "step": 2510 + }, + { + "epoch": 1.15, + "learning_rate": 2.054794520547945e-07, + "logits/chosen": -0.40186938643455505, + "logits/rejected": -0.4418622851371765, + "logps/chosen": -119.48040771484375, + "logps/rejected": -94.07899475097656, + "loss": 0.1948, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.7327773571014404, + "rewards/margins": 4.83219051361084, + "rewards/rejected": -1.0994136333465576, + "step": 2520 + }, + { + "epoch": 1.15, + "learning_rate": 2.0497209538305426e-07, + "logits/chosen": -0.4523470401763916, + "logits/rejected": -0.4795070290565491, + "logps/chosen": -117.58869934082031, + "logps/rejected": -92.097412109375, + "loss": 0.1957, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.2375411987304688, + "rewards/margins": 4.7822418212890625, + "rewards/rejected": -1.5447006225585938, + "step": 2530 + }, + { + "epoch": 1.16, + "learning_rate": 2.0446473871131402e-07, + "logits/chosen": -0.42697858810424805, + "logits/rejected": -0.4653555452823639, + "logps/chosen": -128.19395446777344, + "logps/rejected": -104.2452621459961, + "loss": 0.1956, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.7017173767089844, + "rewards/margins": 5.180688381195068, + "rewards/rejected": -1.478971004486084, + "step": 2540 + }, + { + "epoch": 1.16, + "learning_rate": 2.039573820395738e-07, + "logits/chosen": -0.4241272807121277, + "logits/rejected": -0.4700315594673157, + "logps/chosen": -122.24373626708984, + "logps/rejected": -92.90812683105469, + "loss": 0.2147, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.6799728870391846, + "rewards/margins": 4.794487476348877, + "rewards/rejected": -1.114514946937561, + "step": 2550 + }, + { + "epoch": 1.17, + "learning_rate": 2.0345002536783356e-07, + "logits/chosen": -0.4255804419517517, + "logits/rejected": -0.45682722330093384, + "logps/chosen": -117.48539733886719, + "logps/rejected": -91.08617401123047, + "loss": 0.2029, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.406527042388916, + "rewards/margins": 4.893421649932861, + "rewards/rejected": -1.486893892288208, + "step": 2560 + }, + { + "epoch": 1.17, + "learning_rate": 2.0294266869609332e-07, + "logits/chosen": -0.4311680793762207, + "logits/rejected": -0.46691998839378357, + "logps/chosen": -116.72953033447266, + "logps/rejected": -91.28812408447266, + "loss": 0.1983, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.2318546772003174, + "rewards/margins": 4.533720970153809, + "rewards/rejected": -1.301865816116333, + "step": 2570 + }, + { + "epoch": 1.18, + "learning_rate": 2.024353120243531e-07, + "logits/chosen": -0.4406512677669525, + "logits/rejected": -0.471552312374115, + "logps/chosen": -112.9683609008789, + "logps/rejected": -98.54328918457031, + "loss": 0.2039, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4355063438415527, + "rewards/margins": 4.474089622497559, + "rewards/rejected": -1.0385830402374268, + "step": 2580 + }, + { + "epoch": 1.18, + "learning_rate": 2.0192795535261286e-07, + "logits/chosen": -0.41342782974243164, + "logits/rejected": -0.45259562134742737, + "logps/chosen": -122.47785949707031, + "logps/rejected": -98.93025970458984, + "loss": 0.1774, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.7889657020568848, + "rewards/margins": 5.727280616760254, + "rewards/rejected": -1.9383147954940796, + "step": 2590 + }, + { + "epoch": 1.19, + "learning_rate": 2.0142059868087262e-07, + "logits/chosen": -0.43133312463760376, + "logits/rejected": -0.45189160108566284, + "logps/chosen": -118.82820129394531, + "logps/rejected": -95.33827209472656, + "loss": 0.1749, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.4565823078155518, + "rewards/margins": 4.577003479003906, + "rewards/rejected": -1.1204214096069336, + "step": 2600 + }, + { + "epoch": 1.19, + "eval_logits/chosen": -0.4159504175186157, + "eval_logits/rejected": -0.4499986171722412, + "eval_logps/chosen": -116.7225112915039, + "eval_logps/rejected": -93.57614135742188, + "eval_loss": 0.19750715792179108, + "eval_rewards/accuracies": 0.8910614252090454, + "eval_rewards/chosen": 3.2186174392700195, + "eval_rewards/margins": 4.541853427886963, + "eval_rewards/rejected": -1.323236107826233, + "eval_runtime": 917.0951, + "eval_samples_per_second": 3.121, + "eval_steps_per_second": 0.195, + "step": 2600 + }, + { + "epoch": 1.19, + "learning_rate": 2.009132420091324e-07, + "logits/chosen": -0.37940889596939087, + "logits/rejected": -0.4150822162628174, + "logps/chosen": -111.95550537109375, + "logps/rejected": -94.47859954833984, + "loss": 0.1907, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.825603485107422, + "rewards/margins": 5.173952102661133, + "rewards/rejected": -1.3483483791351318, + "step": 2610 + }, + { + "epoch": 1.2, + "learning_rate": 2.0040588533739216e-07, + "logits/chosen": -0.4204653203487396, + "logits/rejected": -0.44999074935913086, + "logps/chosen": -124.92396545410156, + "logps/rejected": -98.2974853515625, + "loss": 0.1823, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0983715057373047, + "rewards/margins": 4.7994771003723145, + "rewards/rejected": -1.7011057138442993, + "step": 2620 + }, + { + "epoch": 1.2, + "learning_rate": 1.9989852866565192e-07, + "logits/chosen": -0.42279934883117676, + "logits/rejected": -0.4580332636833191, + "logps/chosen": -119.20674133300781, + "logps/rejected": -94.82040405273438, + "loss": 0.1818, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.2803149223327637, + "rewards/margins": 4.06785774230957, + "rewards/rejected": -0.7875427603721619, + "step": 2630 + }, + { + "epoch": 1.21, + "learning_rate": 1.993911719939117e-07, + "logits/chosen": -0.39321383833885193, + "logits/rejected": -0.43814224004745483, + "logps/chosen": -117.86689758300781, + "logps/rejected": -91.55029296875, + "loss": 0.1832, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.1397879123687744, + "rewards/margins": 4.142951488494873, + "rewards/rejected": -1.0031640529632568, + "step": 2640 + }, + { + "epoch": 1.21, + "learning_rate": 1.9888381532217146e-07, + "logits/chosen": -0.4433029294013977, + "logits/rejected": -0.46814584732055664, + "logps/chosen": -111.34849548339844, + "logps/rejected": -95.34590911865234, + "loss": 0.1731, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.7614011764526367, + "rewards/margins": 4.330539703369141, + "rewards/rejected": -1.5691388845443726, + "step": 2650 + }, + { + "epoch": 1.21, + "learning_rate": 1.9837645865043122e-07, + "logits/chosen": -0.4222814440727234, + "logits/rejected": -0.45477691292762756, + "logps/chosen": -114.8381576538086, + "logps/rejected": -93.96653747558594, + "loss": 0.1705, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.6209235191345215, + "rewards/margins": 5.391648769378662, + "rewards/rejected": -1.7707252502441406, + "step": 2660 + }, + { + "epoch": 1.22, + "learning_rate": 1.97869101978691e-07, + "logits/chosen": -0.4320278763771057, + "logits/rejected": -0.4549098610877991, + "logps/chosen": -116.82099914550781, + "logps/rejected": -95.1173324584961, + "loss": 0.1619, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4755518436431885, + "rewards/margins": 3.844717025756836, + "rewards/rejected": -1.3691655397415161, + "step": 2670 + }, + { + "epoch": 1.22, + "learning_rate": 1.9736174530695076e-07, + "logits/chosen": -0.3963245451450348, + "logits/rejected": -0.4262765347957611, + "logps/chosen": -116.55233001708984, + "logps/rejected": -92.63807678222656, + "loss": 0.1714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.089799165725708, + "rewards/margins": 4.424617767333984, + "rewards/rejected": -1.3348182439804077, + "step": 2680 + }, + { + "epoch": 1.23, + "learning_rate": 1.9685438863521052e-07, + "logits/chosen": -0.41304507851600647, + "logits/rejected": -0.4391093850135803, + "logps/chosen": -114.98722839355469, + "logps/rejected": -95.16316223144531, + "loss": 0.1799, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.718804121017456, + "rewards/margins": 4.613897800445557, + "rewards/rejected": -0.8950934410095215, + "step": 2690 + }, + { + "epoch": 1.23, + "learning_rate": 1.963470319634703e-07, + "logits/chosen": -0.39430639147758484, + "logits/rejected": -0.4395686089992523, + "logps/chosen": -122.29681396484375, + "logps/rejected": -94.48133087158203, + "loss": 0.1881, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.283339738845825, + "rewards/margins": 4.763991355895996, + "rewards/rejected": -1.480652093887329, + "step": 2700 + }, + { + "epoch": 1.23, + "eval_logits/chosen": -0.39620405435562134, + "eval_logits/rejected": -0.4262426793575287, + "eval_logps/chosen": -116.51838684082031, + "eval_logps/rejected": -93.5943603515625, + "eval_loss": 0.18383081257343292, + "eval_rewards/accuracies": 0.9273743033409119, + "eval_rewards/chosen": 3.320681095123291, + "eval_rewards/margins": 4.653022766113281, + "eval_rewards/rejected": -1.3323419094085693, + "eval_runtime": 882.9525, + "eval_samples_per_second": 3.241, + "eval_steps_per_second": 0.203, + "step": 2700 + }, + { + "epoch": 1.24, + "learning_rate": 1.9583967529173006e-07, + "logits/chosen": -0.37125352025032043, + "logits/rejected": -0.40501752495765686, + "logps/chosen": -122.63427734375, + "logps/rejected": -95.20475006103516, + "loss": 0.1623, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.9033172130584717, + "rewards/margins": 4.929192543029785, + "rewards/rejected": -1.0258758068084717, + "step": 2710 + }, + { + "epoch": 1.24, + "learning_rate": 1.9533231861998982e-07, + "logits/chosen": -0.4100262522697449, + "logits/rejected": -0.44318699836730957, + "logps/chosen": -123.42138671875, + "logps/rejected": -100.46942901611328, + "loss": 0.1807, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.8883254528045654, + "rewards/margins": 5.432125568389893, + "rewards/rejected": -1.5438001155853271, + "step": 2720 + }, + { + "epoch": 1.25, + "learning_rate": 1.948249619482496e-07, + "logits/chosen": -0.4184117913246155, + "logits/rejected": -0.4501233994960785, + "logps/chosen": -116.8466796875, + "logps/rejected": -93.37806701660156, + "loss": 0.1818, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 2.737032413482666, + "rewards/margins": 4.598247528076172, + "rewards/rejected": -1.8612152338027954, + "step": 2730 + }, + { + "epoch": 1.25, + "learning_rate": 1.9431760527650936e-07, + "logits/chosen": -0.39006081223487854, + "logits/rejected": -0.43392476439476013, + "logps/chosen": -128.30226135253906, + "logps/rejected": -95.20794677734375, + "loss": 0.172, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.975538730621338, + "rewards/margins": 5.608933448791504, + "rewards/rejected": -1.6333945989608765, + "step": 2740 + }, + { + "epoch": 1.26, + "learning_rate": 1.9381024860476912e-07, + "logits/chosen": -0.42632046341896057, + "logits/rejected": -0.4491303861141205, + "logps/chosen": -127.4406509399414, + "logps/rejected": -97.50559997558594, + "loss": 0.1711, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2022480964660645, + "rewards/margins": 5.0386505126953125, + "rewards/rejected": -1.836402177810669, + "step": 2750 + }, + { + "epoch": 1.26, + "learning_rate": 1.933028919330289e-07, + "logits/chosen": -0.4237644076347351, + "logits/rejected": -0.44978874921798706, + "logps/chosen": -119.2694320678711, + "logps/rejected": -96.05281066894531, + "loss": 0.1957, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 3.3220973014831543, + "rewards/margins": 4.4357171058654785, + "rewards/rejected": -1.1136192083358765, + "step": 2760 + }, + { + "epoch": 1.26, + "learning_rate": 1.9279553526128866e-07, + "logits/chosen": -0.40960246324539185, + "logits/rejected": -0.44518280029296875, + "logps/chosen": -121.33802795410156, + "logps/rejected": -94.39434814453125, + "loss": 0.222, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.6748435497283936, + "rewards/margins": 5.344630718231201, + "rewards/rejected": -1.6697871685028076, + "step": 2770 + }, + { + "epoch": 1.27, + "learning_rate": 1.9228817858954842e-07, + "logits/chosen": -0.38372522592544556, + "logits/rejected": -0.4217826724052429, + "logps/chosen": -116.77619934082031, + "logps/rejected": -94.98020935058594, + "loss": 0.1494, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.7456507682800293, + "rewards/margins": 5.059269905090332, + "rewards/rejected": -1.3136188983917236, + "step": 2780 + }, + { + "epoch": 1.27, + "learning_rate": 1.917808219178082e-07, + "logits/chosen": -0.41423946619033813, + "logits/rejected": -0.4451848864555359, + "logps/chosen": -109.53131103515625, + "logps/rejected": -95.5805435180664, + "loss": 0.1633, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.9713897705078125, + "rewards/margins": 4.25122594833374, + "rewards/rejected": -1.279836654663086, + "step": 2790 + }, + { + "epoch": 1.28, + "learning_rate": 1.9127346524606796e-07, + "logits/chosen": -0.3843603730201721, + "logits/rejected": -0.41677069664001465, + "logps/chosen": -121.75700378417969, + "logps/rejected": -96.72332763671875, + "loss": 0.1611, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.5085883140563965, + "rewards/margins": 4.820291042327881, + "rewards/rejected": -1.3117033243179321, + "step": 2800 + }, + { + "epoch": 1.28, + "eval_logits/chosen": -0.4090903401374817, + "eval_logits/rejected": -0.44037947058677673, + "eval_logps/chosen": -116.5835189819336, + "eval_logps/rejected": -93.6472396850586, + "eval_loss": 0.1833222210407257, + "eval_rewards/accuracies": 0.910614550113678, + "eval_rewards/chosen": 3.288114309310913, + "eval_rewards/margins": 4.646894454956055, + "eval_rewards/rejected": -1.358780026435852, + "eval_runtime": 899.7748, + "eval_samples_per_second": 3.181, + "eval_steps_per_second": 0.199, + "step": 2800 + }, + { + "epoch": 1.28, + "learning_rate": 1.9076610857432772e-07, + "logits/chosen": -0.4082648754119873, + "logits/rejected": -0.4388160705566406, + "logps/chosen": -119.51053619384766, + "logps/rejected": -98.2977066040039, + "loss": 0.1952, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.516996383666992, + "rewards/margins": 4.8721923828125, + "rewards/rejected": -1.3551959991455078, + "step": 2810 + }, + { + "epoch": 1.29, + "learning_rate": 1.902587519025875e-07, + "logits/chosen": -0.39836177229881287, + "logits/rejected": -0.43699654936790466, + "logps/chosen": -130.80030822753906, + "logps/rejected": -98.20820617675781, + "loss": 0.1628, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.9940390586853027, + "rewards/margins": 5.695845603942871, + "rewards/rejected": -1.701806664466858, + "step": 2820 + }, + { + "epoch": 1.29, + "learning_rate": 1.8975139523084726e-07, + "logits/chosen": -0.4374879002571106, + "logits/rejected": -0.4642358720302582, + "logps/chosen": -124.3564224243164, + "logps/rejected": -101.50624084472656, + "loss": 0.1992, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 2.7666969299316406, + "rewards/margins": 4.858794212341309, + "rewards/rejected": -2.092097520828247, + "step": 2830 + }, + { + "epoch": 1.3, + "learning_rate": 1.8924403855910702e-07, + "logits/chosen": -0.4187402129173279, + "logits/rejected": -0.4483548700809479, + "logps/chosen": -116.7706527709961, + "logps/rejected": -92.22254943847656, + "loss": 0.1856, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.8258209228515625, + "rewards/margins": 4.667115688323975, + "rewards/rejected": -1.841294527053833, + "step": 2840 + }, + { + "epoch": 1.3, + "learning_rate": 1.887366818873668e-07, + "logits/chosen": -0.4137202799320221, + "logits/rejected": -0.4449295401573181, + "logps/chosen": -117.85295104980469, + "logps/rejected": -96.74876403808594, + "loss": 0.1435, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.32916259765625, + "rewards/margins": 4.432304859161377, + "rewards/rejected": -1.1031419038772583, + "step": 2850 + }, + { + "epoch": 1.31, + "learning_rate": 1.8822932521562656e-07, + "logits/chosen": -0.3739413619041443, + "logits/rejected": -0.4138815999031067, + "logps/chosen": -116.3170394897461, + "logps/rejected": -97.33705139160156, + "loss": 0.1594, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.9688007831573486, + "rewards/margins": 5.0333571434021, + "rewards/rejected": -1.0645564794540405, + "step": 2860 + }, + { + "epoch": 1.31, + "learning_rate": 1.8772196854388632e-07, + "logits/chosen": -0.43607673048973083, + "logits/rejected": -0.4659034311771393, + "logps/chosen": -117.14688873291016, + "logps/rejected": -95.03040313720703, + "loss": 0.1743, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.77264404296875, + "rewards/margins": 5.061250686645508, + "rewards/rejected": -1.288606882095337, + "step": 2870 + }, + { + "epoch": 1.31, + "learning_rate": 1.872146118721461e-07, + "logits/chosen": -0.4345594346523285, + "logits/rejected": -0.4634518623352051, + "logps/chosen": -128.435791015625, + "logps/rejected": -94.08797454833984, + "loss": 0.1772, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.388913154602051, + "rewards/margins": 5.8124918937683105, + "rewards/rejected": -1.4235796928405762, + "step": 2880 + }, + { + "epoch": 1.32, + "learning_rate": 1.8670725520040586e-07, + "logits/chosen": -0.438603937625885, + "logits/rejected": -0.46849116683006287, + "logps/chosen": -121.33575439453125, + "logps/rejected": -96.94755554199219, + "loss": 0.1648, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.078803062438965, + "rewards/margins": 5.43464469909668, + "rewards/rejected": -1.3558417558670044, + "step": 2890 + }, + { + "epoch": 1.32, + "learning_rate": 1.8619989852866562e-07, + "logits/chosen": -0.40312066674232483, + "logits/rejected": -0.4359334111213684, + "logps/chosen": -115.02134704589844, + "logps/rejected": -96.40849304199219, + "loss": 0.1653, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.6323986053466797, + "rewards/margins": 5.246556282043457, + "rewards/rejected": -1.614158272743225, + "step": 2900 + }, + { + "epoch": 1.32, + "eval_logits/chosen": -0.3995789587497711, + "eval_logits/rejected": -0.4252215325832367, + "eval_logps/chosen": -116.65079498291016, + "eval_logps/rejected": -94.15835571289062, + "eval_loss": 0.195852130651474, + "eval_rewards/accuracies": 0.9189944267272949, + "eval_rewards/chosen": 3.2544755935668945, + "eval_rewards/margins": 4.868815898895264, + "eval_rewards/rejected": -1.6143405437469482, + "eval_runtime": 907.9432, + "eval_samples_per_second": 3.152, + "eval_steps_per_second": 0.197, + "step": 2900 + }, + { + "epoch": 1.33, + "learning_rate": 1.856925418569254e-07, + "logits/chosen": -0.43267160654067993, + "logits/rejected": -0.4583393931388855, + "logps/chosen": -127.33109283447266, + "logps/rejected": -98.23350524902344, + "loss": 0.1736, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 4.151562690734863, + "rewards/margins": 6.144377708435059, + "rewards/rejected": -1.992815375328064, + "step": 2910 + }, + { + "epoch": 1.33, + "learning_rate": 1.8518518518518516e-07, + "logits/chosen": -0.41695013642311096, + "logits/rejected": -0.4449712336063385, + "logps/chosen": -115.79557037353516, + "logps/rejected": -96.62245178222656, + "loss": 0.1925, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.329120635986328, + "rewards/margins": 4.9218339920043945, + "rewards/rejected": -1.5927129983901978, + "step": 2920 + }, + { + "epoch": 1.34, + "learning_rate": 1.8467782851344492e-07, + "logits/chosen": -0.40566110610961914, + "logits/rejected": -0.44443726539611816, + "logps/chosen": -116.14949798583984, + "logps/rejected": -94.5146484375, + "loss": 0.2166, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.645608425140381, + "rewards/margins": 4.7818427085876465, + "rewards/rejected": -1.1362345218658447, + "step": 2930 + }, + { + "epoch": 1.34, + "learning_rate": 1.841704718417047e-07, + "logits/chosen": -0.4052400588989258, + "logits/rejected": -0.4479829668998718, + "logps/chosen": -119.50074768066406, + "logps/rejected": -96.74118041992188, + "loss": 0.1592, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.713555097579956, + "rewards/margins": 5.739047527313232, + "rewards/rejected": -2.0254924297332764, + "step": 2940 + }, + { + "epoch": 1.35, + "learning_rate": 1.8366311516996446e-07, + "logits/chosen": -0.3960481286048889, + "logits/rejected": -0.4279060363769531, + "logps/chosen": -113.77364349365234, + "logps/rejected": -89.41834259033203, + "loss": 0.1519, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.5727920532226562, + "rewards/margins": 4.469664096832275, + "rewards/rejected": -0.8968713879585266, + "step": 2950 + }, + { + "epoch": 1.35, + "learning_rate": 1.8315575849822422e-07, + "logits/chosen": -0.3719174563884735, + "logits/rejected": -0.41467922925949097, + "logps/chosen": -122.60404205322266, + "logps/rejected": -98.62785339355469, + "loss": 0.1757, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.8493294715881348, + "rewards/margins": 4.896603584289551, + "rewards/rejected": -1.0472742319107056, + "step": 2960 + }, + { + "epoch": 1.36, + "learning_rate": 1.82648401826484e-07, + "logits/chosen": -0.4145309031009674, + "logits/rejected": -0.4517810344696045, + "logps/chosen": -117.5206527709961, + "logps/rejected": -91.79268646240234, + "loss": 0.1798, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.707047700881958, + "rewards/margins": 4.669058322906494, + "rewards/rejected": -0.9620102643966675, + "step": 2970 + }, + { + "epoch": 1.36, + "learning_rate": 1.8214104515474375e-07, + "logits/chosen": -0.41848722100257874, + "logits/rejected": -0.44989675283432007, + "logps/chosen": -121.28948974609375, + "logps/rejected": -97.52169036865234, + "loss": 0.1789, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.630518674850464, + "rewards/margins": 5.358605861663818, + "rewards/rejected": -1.7280871868133545, + "step": 2980 + }, + { + "epoch": 1.36, + "learning_rate": 1.8163368848300352e-07, + "logits/chosen": -0.41235464811325073, + "logits/rejected": -0.4453458786010742, + "logps/chosen": -115.3282699584961, + "logps/rejected": -101.21430969238281, + "loss": 0.1992, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.833465576171875, + "rewards/margins": 5.110463619232178, + "rewards/rejected": -1.2769982814788818, + "step": 2990 + }, + { + "epoch": 1.37, + "learning_rate": 1.811263318112633e-07, + "logits/chosen": -0.4016539454460144, + "logits/rejected": -0.43046021461486816, + "logps/chosen": -115.31640625, + "logps/rejected": -94.98763275146484, + "loss": 0.1613, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.6097488403320312, + "rewards/margins": 4.730071067810059, + "rewards/rejected": -1.1203219890594482, + "step": 3000 + }, + { + "epoch": 1.37, + "eval_logits/chosen": -0.4070635735988617, + "eval_logits/rejected": -0.43743789196014404, + "eval_logps/chosen": -116.37444305419922, + "eval_logps/rejected": -93.96776580810547, + "eval_loss": 0.17788465321063995, + "eval_rewards/accuracies": 0.9217877388000488, + "eval_rewards/chosen": 3.392648935317993, + "eval_rewards/margins": 4.911698818206787, + "eval_rewards/rejected": -1.5190494060516357, + "eval_runtime": 878.9478, + "eval_samples_per_second": 3.256, + "eval_steps_per_second": 0.204, + "step": 3000 + }, + { + "epoch": 1.37, + "learning_rate": 1.8061897513952305e-07, + "logits/chosen": -0.41279563307762146, + "logits/rejected": -0.45051974058151245, + "logps/chosen": -124.44229888916016, + "logps/rejected": -95.6501235961914, + "loss": 0.1596, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.279679298400879, + "rewards/margins": 5.73773193359375, + "rewards/rejected": -1.4580527544021606, + "step": 3010 + }, + { + "epoch": 1.38, + "learning_rate": 1.8011161846778282e-07, + "logits/chosen": -0.4213915765285492, + "logits/rejected": -0.45495182275772095, + "logps/chosen": -122.38909912109375, + "logps/rejected": -95.29975891113281, + "loss": 0.1814, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.3720550537109375, + "rewards/margins": 4.8198981285095215, + "rewards/rejected": -1.4478428363800049, + "step": 3020 + }, + { + "epoch": 1.38, + "learning_rate": 1.796042617960426e-07, + "logits/chosen": -0.422152578830719, + "logits/rejected": -0.4571755826473236, + "logps/chosen": -127.7369155883789, + "logps/rejected": -95.61505889892578, + "loss": 0.1488, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.14093279838562, + "rewards/margins": 4.787783622741699, + "rewards/rejected": -1.6468513011932373, + "step": 3030 + }, + { + "epoch": 1.39, + "learning_rate": 1.7909690512430235e-07, + "logits/chosen": -0.43789142370224, + "logits/rejected": -0.4707309305667877, + "logps/chosen": -117.2684097290039, + "logps/rejected": -94.7235336303711, + "loss": 0.1642, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.235317230224609, + "rewards/margins": 5.704260349273682, + "rewards/rejected": -1.4689429998397827, + "step": 3040 + }, + { + "epoch": 1.39, + "learning_rate": 1.7858954845256212e-07, + "logits/chosen": -0.39825141429901123, + "logits/rejected": -0.4274836480617523, + "logps/chosen": -117.91035461425781, + "logps/rejected": -99.51007843017578, + "loss": 0.1547, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.6470961570739746, + "rewards/margins": 5.261767864227295, + "rewards/rejected": -1.6146717071533203, + "step": 3050 + }, + { + "epoch": 1.4, + "learning_rate": 1.780821917808219e-07, + "logits/chosen": -0.41886386275291443, + "logits/rejected": -0.44838714599609375, + "logps/chosen": -117.38224792480469, + "logps/rejected": -101.48182678222656, + "loss": 0.1534, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.344783067703247, + "rewards/margins": 4.771211624145508, + "rewards/rejected": -1.426428198814392, + "step": 3060 + }, + { + "epoch": 1.4, + "learning_rate": 1.7757483510908165e-07, + "logits/chosen": -0.4153475761413574, + "logits/rejected": -0.4372943043708801, + "logps/chosen": -113.58921813964844, + "logps/rejected": -96.70695495605469, + "loss": 0.1712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.796396255493164, + "rewards/margins": 5.346527099609375, + "rewards/rejected": -1.55013108253479, + "step": 3070 + }, + { + "epoch": 1.41, + "learning_rate": 1.7706747843734142e-07, + "logits/chosen": -0.40615734457969666, + "logits/rejected": -0.4455091953277588, + "logps/chosen": -115.18067932128906, + "logps/rejected": -89.9873275756836, + "loss": 0.1599, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.7986888885498047, + "rewards/margins": 5.377463340759277, + "rewards/rejected": -1.5787745714187622, + "step": 3080 + }, + { + "epoch": 1.41, + "learning_rate": 1.765601217656012e-07, + "logits/chosen": -0.39819687604904175, + "logits/rejected": -0.4362192153930664, + "logps/chosen": -120.87335205078125, + "logps/rejected": -93.75814056396484, + "loss": 0.1579, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.934561252593994, + "rewards/margins": 5.506875514984131, + "rewards/rejected": -1.572314977645874, + "step": 3090 + }, + { + "epoch": 1.42, + "learning_rate": 1.7605276509386095e-07, + "logits/chosen": -0.3975370526313782, + "logits/rejected": -0.4346223473548889, + "logps/chosen": -115.0535888671875, + "logps/rejected": -96.48837280273438, + "loss": 0.1785, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.473949432373047, + "rewards/margins": 4.882634162902832, + "rewards/rejected": -1.4086847305297852, + "step": 3100 + }, + { + "epoch": 1.42, + "eval_logits/chosen": -0.39869609475135803, + "eval_logits/rejected": -0.42796045541763306, + "eval_logps/chosen": -116.34907531738281, + "eval_logps/rejected": -94.18682098388672, + "eval_loss": 0.18402154743671417, + "eval_rewards/accuracies": 0.924580991268158, + "eval_rewards/chosen": 3.4053359031677246, + "eval_rewards/margins": 5.033909797668457, + "eval_rewards/rejected": -1.6285736560821533, + "eval_runtime": 898.2256, + "eval_samples_per_second": 3.186, + "eval_steps_per_second": 0.199, + "step": 3100 + }, + { + "epoch": 1.42, + "learning_rate": 1.7554540842212072e-07, + "logits/chosen": -0.4134501516819, + "logits/rejected": -0.44677990674972534, + "logps/chosen": -121.47572326660156, + "logps/rejected": -93.6028060913086, + "loss": 0.1444, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5641727447509766, + "rewards/margins": 4.663364410400391, + "rewards/rejected": -1.0991913080215454, + "step": 3110 + }, + { + "epoch": 1.42, + "learning_rate": 1.750380517503805e-07, + "logits/chosen": -0.4042896330356598, + "logits/rejected": -0.44077545404434204, + "logps/chosen": -121.50472259521484, + "logps/rejected": -96.93872833251953, + "loss": 0.1782, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.516829013824463, + "rewards/margins": 5.27219295501709, + "rewards/rejected": -1.7553646564483643, + "step": 3120 + }, + { + "epoch": 1.43, + "learning_rate": 1.7453069507864025e-07, + "logits/chosen": -0.4291275441646576, + "logits/rejected": -0.4636690020561218, + "logps/chosen": -116.3439712524414, + "logps/rejected": -87.82887268066406, + "loss": 0.1881, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.2129287719726562, + "rewards/margins": 4.926560401916504, + "rewards/rejected": -1.7136313915252686, + "step": 3130 + }, + { + "epoch": 1.43, + "learning_rate": 1.7402333840690002e-07, + "logits/chosen": -0.42489296197891235, + "logits/rejected": -0.4597654938697815, + "logps/chosen": -119.62519836425781, + "logps/rejected": -92.69261169433594, + "loss": 0.1501, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.7139596939086914, + "rewards/margins": 4.3475847244262695, + "rewards/rejected": -1.6336256265640259, + "step": 3140 + }, + { + "epoch": 1.44, + "learning_rate": 1.735159817351598e-07, + "logits/chosen": -0.38904017210006714, + "logits/rejected": -0.4342716336250305, + "logps/chosen": -114.5102310180664, + "logps/rejected": -99.30461120605469, + "loss": 0.1625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.767416000366211, + "rewards/margins": 5.650642395019531, + "rewards/rejected": -1.8832263946533203, + "step": 3150 + }, + { + "epoch": 1.44, + "learning_rate": 1.7300862506341955e-07, + "logits/chosen": -0.41989022493362427, + "logits/rejected": -0.44726991653442383, + "logps/chosen": -127.139892578125, + "logps/rejected": -98.16439819335938, + "loss": 0.1631, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.7642390727996826, + "rewards/margins": 5.772668361663818, + "rewards/rejected": -2.008429765701294, + "step": 3160 + }, + { + "epoch": 1.45, + "learning_rate": 1.7250126839167932e-07, + "logits/chosen": -0.3980824053287506, + "logits/rejected": -0.4251154065132141, + "logps/chosen": -122.60902404785156, + "logps/rejected": -99.0849380493164, + "loss": 0.1854, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.4130959510803223, + "rewards/margins": 4.698065757751465, + "rewards/rejected": -1.2849695682525635, + "step": 3170 + }, + { + "epoch": 1.45, + "learning_rate": 1.719939117199391e-07, + "logits/chosen": -0.42671066522598267, + "logits/rejected": -0.4508728086948395, + "logps/chosen": -117.6341323852539, + "logps/rejected": -98.55741882324219, + "loss": 0.1278, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 4.02409029006958, + "rewards/margins": 5.484737396240234, + "rewards/rejected": -1.4606469869613647, + "step": 3180 + }, + { + "epoch": 1.46, + "learning_rate": 1.7148655504819885e-07, + "logits/chosen": -0.43290767073631287, + "logits/rejected": -0.47308388352394104, + "logps/chosen": -118.79693603515625, + "logps/rejected": -98.24039459228516, + "loss": 0.1406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.303812026977539, + "rewards/margins": 6.049398422241211, + "rewards/rejected": -1.7455860376358032, + "step": 3190 + }, + { + "epoch": 1.46, + "learning_rate": 1.7097919837645862e-07, + "logits/chosen": -0.4589117169380188, + "logits/rejected": -0.48393720388412476, + "logps/chosen": -120.695556640625, + "logps/rejected": -98.32371520996094, + "loss": 0.1544, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.8730056285858154, + "rewards/margins": 5.28024435043335, + "rewards/rejected": -2.407238483428955, + "step": 3200 + }, + { + "epoch": 1.46, + "eval_logits/chosen": -0.4308675527572632, + "eval_logits/rejected": -0.4623854160308838, + "eval_logps/chosen": -116.15389251708984, + "eval_logps/rejected": -94.2074966430664, + "eval_loss": 0.16856719553470612, + "eval_rewards/accuracies": 0.9217877388000488, + "eval_rewards/chosen": 3.5029242038726807, + "eval_rewards/margins": 5.141838073730469, + "eval_rewards/rejected": -1.6389143466949463, + "eval_runtime": 914.3632, + "eval_samples_per_second": 3.13, + "eval_steps_per_second": 0.196, + "step": 3200 + }, + { + "epoch": 1.47, + "learning_rate": 1.704718417047184e-07, + "logits/chosen": -0.4398719370365143, + "logits/rejected": -0.4648303985595703, + "logps/chosen": -111.4161376953125, + "logps/rejected": -92.91444396972656, + "loss": 0.15, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 2.9489493370056152, + "rewards/margins": 4.732758522033691, + "rewards/rejected": -1.783808708190918, + "step": 3210 + }, + { + "epoch": 1.47, + "learning_rate": 1.6996448503297815e-07, + "logits/chosen": -0.40833503007888794, + "logits/rejected": -0.43593111634254456, + "logps/chosen": -113.5337905883789, + "logps/rejected": -99.6334228515625, + "loss": 0.1369, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.5159003734588623, + "rewards/margins": 5.7945661544799805, + "rewards/rejected": -2.2786660194396973, + "step": 3220 + }, + { + "epoch": 1.47, + "learning_rate": 1.6945712836123792e-07, + "logits/chosen": -0.4148642122745514, + "logits/rejected": -0.45154523849487305, + "logps/chosen": -114.74641418457031, + "logps/rejected": -94.3124008178711, + "loss": 0.148, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.7251458168029785, + "rewards/margins": 5.643283843994141, + "rewards/rejected": -1.9181379079818726, + "step": 3230 + }, + { + "epoch": 1.48, + "learning_rate": 1.689497716894977e-07, + "logits/chosen": -0.40678897500038147, + "logits/rejected": -0.4366432726383209, + "logps/chosen": -121.0203628540039, + "logps/rejected": -100.17606353759766, + "loss": 0.1528, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.0067360401153564, + "rewards/margins": 5.704981803894043, + "rewards/rejected": -2.6982460021972656, + "step": 3240 + }, + { + "epoch": 1.48, + "learning_rate": 1.6844241501775745e-07, + "logits/chosen": -0.41396284103393555, + "logits/rejected": -0.44056564569473267, + "logps/chosen": -117.8901138305664, + "logps/rejected": -96.82243347167969, + "loss": 0.1619, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.752368927001953, + "rewards/margins": 5.435792922973633, + "rewards/rejected": -1.6834239959716797, + "step": 3250 + }, + { + "epoch": 1.49, + "learning_rate": 1.6793505834601722e-07, + "logits/chosen": -0.4228527545928955, + "logits/rejected": -0.4528846740722656, + "logps/chosen": -110.54814147949219, + "logps/rejected": -98.03990936279297, + "loss": 0.1739, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.3309082984924316, + "rewards/margins": 4.948666095733643, + "rewards/rejected": -1.617757797241211, + "step": 3260 + }, + { + "epoch": 1.49, + "learning_rate": 1.67427701674277e-07, + "logits/chosen": -0.4388189911842346, + "logits/rejected": -0.4577499330043793, + "logps/chosen": -114.64045715332031, + "logps/rejected": -94.00943756103516, + "loss": 0.1794, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.3593318462371826, + "rewards/margins": 5.009249687194824, + "rewards/rejected": -1.6499179601669312, + "step": 3270 + }, + { + "epoch": 1.5, + "learning_rate": 1.6692034500253675e-07, + "logits/chosen": -0.4215324819087982, + "logits/rejected": -0.44902530312538147, + "logps/chosen": -116.54881286621094, + "logps/rejected": -97.64686584472656, + "loss": 0.1837, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.203897953033447, + "rewards/margins": 5.84643030166626, + "rewards/rejected": -1.6425319910049438, + "step": 3280 + }, + { + "epoch": 1.5, + "learning_rate": 1.6641298833079652e-07, + "logits/chosen": -0.4341822564601898, + "logits/rejected": -0.4602430760860443, + "logps/chosen": -116.70965576171875, + "logps/rejected": -92.82569885253906, + "loss": 0.14, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.9068729877471924, + "rewards/margins": 5.627469062805176, + "rewards/rejected": -1.7205965518951416, + "step": 3290 + }, + { + "epoch": 1.51, + "learning_rate": 1.659056316590563e-07, + "logits/chosen": -0.39262399077415466, + "logits/rejected": -0.42815399169921875, + "logps/chosen": -110.6005630493164, + "logps/rejected": -90.9563217163086, + "loss": 0.1492, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.0928051471710205, + "rewards/margins": 4.729269027709961, + "rewards/rejected": -1.6364643573760986, + "step": 3300 + }, + { + "epoch": 1.51, + "eval_logits/chosen": -0.39426207542419434, + "eval_logits/rejected": -0.41480591893196106, + "eval_logps/chosen": -116.58888244628906, + "eval_logps/rejected": -94.54850006103516, + "eval_loss": 0.17055107653141022, + "eval_rewards/accuracies": 0.9329608678817749, + "eval_rewards/chosen": 3.2854251861572266, + "eval_rewards/margins": 5.0948357582092285, + "eval_rewards/rejected": -1.8094104528427124, + "eval_runtime": 859.6929, + "eval_samples_per_second": 3.329, + "eval_steps_per_second": 0.208, + "step": 3300 + }, + { + "epoch": 1.51, + "learning_rate": 1.6539827498731605e-07, + "logits/chosen": -0.4324941635131836, + "logits/rejected": -0.46528664231300354, + "logps/chosen": -120.0330581665039, + "logps/rejected": -95.51181030273438, + "loss": 0.1694, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.3808555603027344, + "rewards/margins": 5.3127055168151855, + "rewards/rejected": -1.9318501949310303, + "step": 3310 + }, + { + "epoch": 1.52, + "learning_rate": 1.6489091831557582e-07, + "logits/chosen": -0.3996530771255493, + "logits/rejected": -0.42007988691329956, + "logps/chosen": -122.48661804199219, + "logps/rejected": -97.3335189819336, + "loss": 0.1591, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3983426094055176, + "rewards/margins": 5.0991926193237305, + "rewards/rejected": -1.7008495330810547, + "step": 3320 + }, + { + "epoch": 1.52, + "learning_rate": 1.643835616438356e-07, + "logits/chosen": -0.41088947653770447, + "logits/rejected": -0.4415004849433899, + "logps/chosen": -115.9881591796875, + "logps/rejected": -96.96780395507812, + "loss": 0.1691, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.4160733222961426, + "rewards/margins": 4.396543979644775, + "rewards/rejected": -1.980470895767212, + "step": 3330 + }, + { + "epoch": 1.52, + "learning_rate": 1.6387620497209535e-07, + "logits/chosen": -0.4284709095954895, + "logits/rejected": -0.45423832535743713, + "logps/chosen": -112.177734375, + "logps/rejected": -97.37242126464844, + "loss": 0.1564, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.750192165374756, + "rewards/margins": 5.037611961364746, + "rewards/rejected": -1.2874199151992798, + "step": 3340 + }, + { + "epoch": 1.53, + "learning_rate": 1.6336884830035512e-07, + "logits/chosen": -0.42687755823135376, + "logits/rejected": -0.4660683572292328, + "logps/chosen": -115.24296569824219, + "logps/rejected": -96.22698974609375, + "loss": 0.1809, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.4257748126983643, + "rewards/margins": 5.423032760620117, + "rewards/rejected": -1.997257947921753, + "step": 3350 + }, + { + "epoch": 1.53, + "learning_rate": 1.6286149162861489e-07, + "logits/chosen": -0.4044269621372223, + "logits/rejected": -0.44033893942832947, + "logps/chosen": -117.3617935180664, + "logps/rejected": -96.61048889160156, + "loss": 0.1539, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.7865517139434814, + "rewards/margins": 5.485646724700928, + "rewards/rejected": -1.6990951299667358, + "step": 3360 + }, + { + "epoch": 1.54, + "learning_rate": 1.6235413495687465e-07, + "logits/chosen": -0.4385055899620056, + "logits/rejected": -0.46612605452537537, + "logps/chosen": -115.01204681396484, + "logps/rejected": -89.59496307373047, + "loss": 0.1509, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.0572566986083984, + "rewards/margins": 4.691444396972656, + "rewards/rejected": -1.6341876983642578, + "step": 3370 + }, + { + "epoch": 1.54, + "learning_rate": 1.6184677828513442e-07, + "logits/chosen": -0.414304256439209, + "logits/rejected": -0.44513431191444397, + "logps/chosen": -121.54780578613281, + "logps/rejected": -95.90415954589844, + "loss": 0.1737, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.6485812664031982, + "rewards/margins": 5.4005937576293945, + "rewards/rejected": -1.7520118951797485, + "step": 3380 + }, + { + "epoch": 1.55, + "learning_rate": 1.613394216133942e-07, + "logits/chosen": -0.42347556352615356, + "logits/rejected": -0.4630081057548523, + "logps/chosen": -119.10302734375, + "logps/rejected": -93.24076843261719, + "loss": 0.1655, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.597658157348633, + "rewards/margins": 5.3917436599731445, + "rewards/rejected": -1.7940857410430908, + "step": 3390 + }, + { + "epoch": 1.55, + "learning_rate": 1.6083206494165398e-07, + "logits/chosen": -0.42666107416152954, + "logits/rejected": -0.45836448669433594, + "logps/chosen": -119.47938537597656, + "logps/rejected": -92.59451293945312, + "loss": 0.1719, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.071565628051758, + "rewards/margins": 4.682357311248779, + "rewards/rejected": -1.6107919216156006, + "step": 3400 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -0.42531585693359375, + "eval_logits/rejected": -0.45423340797424316, + "eval_logps/chosen": -116.13007354736328, + "eval_logps/rejected": -94.42098236083984, + "eval_loss": 0.16907574236392975, + "eval_rewards/accuracies": 0.9273743033409119, + "eval_rewards/chosen": 3.514838695526123, + "eval_rewards/margins": 5.260493278503418, + "eval_rewards/rejected": -1.7456541061401367, + "eval_runtime": 896.728, + "eval_samples_per_second": 3.192, + "eval_steps_per_second": 0.2, + "step": 3400 + }, + { + "epoch": 1.56, + "learning_rate": 1.6032470826991375e-07, + "logits/chosen": -0.40940365195274353, + "logits/rejected": -0.4444305896759033, + "logps/chosen": -111.24534606933594, + "logps/rejected": -94.37378692626953, + "loss": 0.1535, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.5870659351348877, + "rewards/margins": 5.215790271759033, + "rewards/rejected": -1.628724455833435, + "step": 3410 + }, + { + "epoch": 1.56, + "learning_rate": 1.598173515981735e-07, + "logits/chosen": -0.42105579376220703, + "logits/rejected": -0.45929020643234253, + "logps/chosen": -118.6488037109375, + "logps/rejected": -92.52164459228516, + "loss": 0.1443, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.7879090309143066, + "rewards/margins": 5.258182048797607, + "rewards/rejected": -1.4702732563018799, + "step": 3420 + }, + { + "epoch": 1.57, + "learning_rate": 1.5930999492643328e-07, + "logits/chosen": -0.4476688504219055, + "logits/rejected": -0.46989208459854126, + "logps/chosen": -118.64167785644531, + "logps/rejected": -95.72476959228516, + "loss": 0.1678, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.612441301345825, + "rewards/margins": 5.394201755523682, + "rewards/rejected": -1.7817604541778564, + "step": 3430 + }, + { + "epoch": 1.57, + "learning_rate": 1.5880263825469305e-07, + "logits/chosen": -0.41882842779159546, + "logits/rejected": -0.4446406364440918, + "logps/chosen": -116.1468276977539, + "logps/rejected": -94.4334945678711, + "loss": 0.1471, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.578800678253174, + "rewards/margins": 5.660478591918945, + "rewards/rejected": -2.0816779136657715, + "step": 3440 + }, + { + "epoch": 1.57, + "learning_rate": 1.582952815829528e-07, + "logits/chosen": -0.3946971893310547, + "logits/rejected": -0.4405464231967926, + "logps/chosen": -121.08565521240234, + "logps/rejected": -97.06616973876953, + "loss": 0.1506, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.8751914501190186, + "rewards/margins": 5.678435325622559, + "rewards/rejected": -1.8032439947128296, + "step": 3450 + }, + { + "epoch": 1.58, + "learning_rate": 1.5778792491121258e-07, + "logits/chosen": -0.4397282004356384, + "logits/rejected": -0.46519118547439575, + "logps/chosen": -120.30741882324219, + "logps/rejected": -94.88092803955078, + "loss": 0.1525, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.6533420085906982, + "rewards/margins": 5.589390754699707, + "rewards/rejected": -1.9360488653182983, + "step": 3460 + }, + { + "epoch": 1.58, + "learning_rate": 1.5728056823947235e-07, + "logits/chosen": -0.39912813901901245, + "logits/rejected": -0.43202313780784607, + "logps/chosen": -112.98824310302734, + "logps/rejected": -94.73600769042969, + "loss": 0.1654, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.3695907592773438, + "rewards/margins": 5.6515021324157715, + "rewards/rejected": -2.2819111347198486, + "step": 3470 + }, + { + "epoch": 1.59, + "learning_rate": 1.567732115677321e-07, + "logits/chosen": -0.42645302414894104, + "logits/rejected": -0.4477473199367523, + "logps/chosen": -118.22267150878906, + "logps/rejected": -100.73336029052734, + "loss": 0.1504, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.873511791229248, + "rewards/margins": 5.4625935554504395, + "rewards/rejected": -1.5890812873840332, + "step": 3480 + }, + { + "epoch": 1.59, + "learning_rate": 1.5626585489599188e-07, + "logits/chosen": -0.43139228224754333, + "logits/rejected": -0.4608747065067291, + "logps/chosen": -120.2907943725586, + "logps/rejected": -100.96830749511719, + "loss": 0.153, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3151302337646484, + "rewards/margins": 5.562079906463623, + "rewards/rejected": -2.246950149536133, + "step": 3490 + }, + { + "epoch": 1.6, + "learning_rate": 1.5575849822425165e-07, + "logits/chosen": -0.4321955144405365, + "logits/rejected": -0.46664899587631226, + "logps/chosen": -114.3071060180664, + "logps/rejected": -94.70503234863281, + "loss": 0.1905, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 3.5885214805603027, + "rewards/margins": 5.260445594787598, + "rewards/rejected": -1.671923279762268, + "step": 3500 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.4189249873161316, + "eval_logits/rejected": -0.4479340612888336, + "eval_logps/chosen": -116.17154693603516, + "eval_logps/rejected": -94.42037963867188, + "eval_loss": 0.17187124490737915, + "eval_rewards/accuracies": 0.924580991268158, + "eval_rewards/chosen": 3.4941017627716064, + "eval_rewards/margins": 5.239455699920654, + "eval_rewards/rejected": -1.7453538179397583, + "eval_runtime": 908.4719, + "eval_samples_per_second": 3.15, + "eval_steps_per_second": 0.197, + "step": 3500 + }, + { + "epoch": 1.6, + "learning_rate": 1.552511415525114e-07, + "logits/chosen": -0.4244809150695801, + "logits/rejected": -0.44996365904808044, + "logps/chosen": -115.8191146850586, + "logps/rejected": -95.07186126708984, + "loss": 0.1318, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3521361351013184, + "rewards/margins": 5.482035160064697, + "rewards/rejected": -2.1298987865448, + "step": 3510 + }, + { + "epoch": 1.61, + "learning_rate": 1.5474378488077118e-07, + "logits/chosen": -0.3949509263038635, + "logits/rejected": -0.4299197793006897, + "logps/chosen": -120.37969970703125, + "logps/rejected": -100.93434143066406, + "loss": 0.135, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.4647655487060547, + "rewards/margins": 5.028170108795166, + "rewards/rejected": -1.5634050369262695, + "step": 3520 + }, + { + "epoch": 1.61, + "learning_rate": 1.5423642820903095e-07, + "logits/chosen": -0.43281999230384827, + "logits/rejected": -0.46290507912635803, + "logps/chosen": -121.1761474609375, + "logps/rejected": -98.9190673828125, + "loss": 0.1699, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.336357831954956, + "rewards/margins": 4.971770286560059, + "rewards/rejected": -1.635412573814392, + "step": 3530 + }, + { + "epoch": 1.62, + "learning_rate": 1.537290715372907e-07, + "logits/chosen": -0.42768391966819763, + "logits/rejected": -0.44896143674850464, + "logps/chosen": -116.26780700683594, + "logps/rejected": -94.6815414428711, + "loss": 0.1486, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.7715530395507812, + "rewards/margins": 5.036046504974365, + "rewards/rejected": -1.264493465423584, + "step": 3540 + }, + { + "epoch": 1.62, + "learning_rate": 1.5322171486555048e-07, + "logits/chosen": -0.4007837176322937, + "logits/rejected": -0.44758158922195435, + "logps/chosen": -130.20358276367188, + "logps/rejected": -99.24810028076172, + "loss": 0.1621, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 5.0301408767700195, + "rewards/margins": 6.414588928222656, + "rewards/rejected": -1.3844481706619263, + "step": 3550 + }, + { + "epoch": 1.63, + "learning_rate": 1.5271435819381025e-07, + "logits/chosen": -0.38824278116226196, + "logits/rejected": -0.4197087287902832, + "logps/chosen": -126.24420166015625, + "logps/rejected": -102.49394226074219, + "loss": 0.141, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.869976758956909, + "rewards/margins": 5.813518524169922, + "rewards/rejected": -1.9435417652130127, + "step": 3560 + }, + { + "epoch": 1.63, + "learning_rate": 1.5220700152207e-07, + "logits/chosen": -0.4258262515068054, + "logits/rejected": -0.4580133855342865, + "logps/chosen": -123.7559585571289, + "logps/rejected": -95.52310180664062, + "loss": 0.1509, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 4.197176456451416, + "rewards/margins": 5.937564849853516, + "rewards/rejected": -1.74038827419281, + "step": 3570 + }, + { + "epoch": 1.63, + "learning_rate": 1.5169964485032978e-07, + "logits/chosen": -0.43848925828933716, + "logits/rejected": -0.4572725296020508, + "logps/chosen": -113.8507080078125, + "logps/rejected": -95.45315551757812, + "loss": 0.1559, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.581429958343506, + "rewards/margins": 5.606729507446289, + "rewards/rejected": -2.025299310684204, + "step": 3580 + }, + { + "epoch": 1.64, + "learning_rate": 1.5119228817858955e-07, + "logits/chosen": -0.4294334053993225, + "logits/rejected": -0.44948524236679077, + "logps/chosen": -124.73026275634766, + "logps/rejected": -98.81682586669922, + "loss": 0.1388, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.6138317584991455, + "rewards/margins": 5.703227996826172, + "rewards/rejected": -2.0893959999084473, + "step": 3590 + }, + { + "epoch": 1.64, + "learning_rate": 1.506849315068493e-07, + "logits/chosen": -0.4376956820487976, + "logits/rejected": -0.4688642621040344, + "logps/chosen": -121.80577087402344, + "logps/rejected": -94.1232681274414, + "loss": 0.1354, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.6063103675842285, + "rewards/margins": 5.1961469650268555, + "rewards/rejected": -1.5898375511169434, + "step": 3600 + }, + { + "epoch": 1.64, + "eval_logits/chosen": -0.4303247928619385, + "eval_logits/rejected": -0.4608076214790344, + "eval_logps/chosen": -116.08953857421875, + "eval_logps/rejected": -94.33447265625, + "eval_loss": 0.1748729944229126, + "eval_rewards/accuracies": 0.910614550113678, + "eval_rewards/chosen": 3.535107374191284, + "eval_rewards/margins": 5.23750638961792, + "eval_rewards/rejected": -1.702398657798767, + "eval_runtime": 917.8792, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 0.195, + "step": 3600 + }, + { + "epoch": 1.65, + "learning_rate": 1.5017757483510908e-07, + "logits/chosen": -0.43969064950942993, + "logits/rejected": -0.47763586044311523, + "logps/chosen": -118.63066101074219, + "logps/rejected": -100.40098571777344, + "loss": 0.1322, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.01759672164917, + "rewards/margins": 6.038904666900635, + "rewards/rejected": -2.0213077068328857, + "step": 3610 + }, + { + "epoch": 1.65, + "learning_rate": 1.4967021816336885e-07, + "logits/chosen": -0.44949302077293396, + "logits/rejected": -0.47637391090393066, + "logps/chosen": -113.33602142333984, + "logps/rejected": -91.21766662597656, + "loss": 0.1721, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.409508466720581, + "rewards/margins": 5.6164231300354, + "rewards/rejected": -2.2069146633148193, + "step": 3620 + }, + { + "epoch": 1.66, + "learning_rate": 1.491628614916286e-07, + "logits/chosen": -0.44279003143310547, + "logits/rejected": -0.47035011649131775, + "logps/chosen": -124.08707427978516, + "logps/rejected": -104.35624694824219, + "loss": 0.1456, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.6684200763702393, + "rewards/margins": 5.043892860412598, + "rewards/rejected": -1.375472903251648, + "step": 3630 + }, + { + "epoch": 1.66, + "learning_rate": 1.4865550481988838e-07, + "logits/chosen": -0.4371957778930664, + "logits/rejected": -0.4689660966396332, + "logps/chosen": -115.41943359375, + "logps/rejected": -98.3012466430664, + "loss": 0.1469, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.81248140335083, + "rewards/margins": 5.492468357086182, + "rewards/rejected": -1.6799873113632202, + "step": 3640 + }, + { + "epoch": 1.67, + "learning_rate": 1.4814814814814815e-07, + "logits/chosen": -0.4131147265434265, + "logits/rejected": -0.4399244785308838, + "logps/chosen": -124.3476333618164, + "logps/rejected": -95.99518585205078, + "loss": 0.1569, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.6779167652130127, + "rewards/margins": 4.953145503997803, + "rewards/rejected": -1.2752286195755005, + "step": 3650 + }, + { + "epoch": 1.67, + "learning_rate": 1.476407914764079e-07, + "logits/chosen": -0.4251777231693268, + "logits/rejected": -0.4502864480018616, + "logps/chosen": -124.25196838378906, + "logps/rejected": -97.07969665527344, + "loss": 0.1475, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.5780227184295654, + "rewards/margins": 5.2519121170043945, + "rewards/rejected": -1.67388916015625, + "step": 3660 + }, + { + "epoch": 1.68, + "learning_rate": 1.4713343480466768e-07, + "logits/chosen": -0.41656866669654846, + "logits/rejected": -0.4453061521053314, + "logps/chosen": -115.84672546386719, + "logps/rejected": -97.28424072265625, + "loss": 0.1387, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.4806201457977295, + "rewards/margins": 5.611165523529053, + "rewards/rejected": -2.1305456161499023, + "step": 3670 + }, + { + "epoch": 1.68, + "learning_rate": 1.4662607813292745e-07, + "logits/chosen": -0.4140985906124115, + "logits/rejected": -0.4425446093082428, + "logps/chosen": -122.37489318847656, + "logps/rejected": -94.66325378417969, + "loss": 0.1612, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.579188585281372, + "rewards/margins": 5.4639482498168945, + "rewards/rejected": -1.8847599029541016, + "step": 3680 + }, + { + "epoch": 1.68, + "learning_rate": 1.461187214611872e-07, + "logits/chosen": -0.45093196630477905, + "logits/rejected": -0.4685111939907074, + "logps/chosen": -117.92506408691406, + "logps/rejected": -95.20411682128906, + "loss": 0.1499, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.947640895843506, + "rewards/margins": 5.4805779457092285, + "rewards/rejected": -1.5329368114471436, + "step": 3690 + }, + { + "epoch": 1.69, + "learning_rate": 1.4561136478944698e-07, + "logits/chosen": -0.4353989064693451, + "logits/rejected": -0.4571017324924469, + "logps/chosen": -116.68096923828125, + "logps/rejected": -99.04996490478516, + "loss": 0.1644, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.5987677574157715, + "rewards/margins": 5.514004707336426, + "rewards/rejected": -1.9152368307113647, + "step": 3700 + }, + { + "epoch": 1.69, + "eval_logits/chosen": -0.41919150948524475, + "eval_logits/rejected": -0.4468615651130676, + "eval_logps/chosen": -116.0125503540039, + "eval_logps/rejected": -94.24568176269531, + "eval_loss": 0.1596728265285492, + "eval_rewards/accuracies": 0.924580991268158, + "eval_rewards/chosen": 3.5735957622528076, + "eval_rewards/margins": 5.23159646987915, + "eval_rewards/rejected": -1.6580007076263428, + "eval_runtime": 905.4856, + "eval_samples_per_second": 3.161, + "eval_steps_per_second": 0.198, + "step": 3700 + }, + { + "epoch": 1.69, + "learning_rate": 1.4510400811770675e-07, + "logits/chosen": -0.444713830947876, + "logits/rejected": -0.4759383201599121, + "logps/chosen": -132.91668701171875, + "logps/rejected": -95.53848266601562, + "loss": 0.1504, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.616512775421143, + "rewards/margins": 5.9059953689575195, + "rewards/rejected": -1.2894827127456665, + "step": 3710 + }, + { + "epoch": 1.7, + "learning_rate": 1.445966514459665e-07, + "logits/chosen": -0.42255353927612305, + "logits/rejected": -0.45838356018066406, + "logps/chosen": -115.45365142822266, + "logps/rejected": -95.69837951660156, + "loss": 0.1571, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.2308132648468018, + "rewards/margins": 5.298513889312744, + "rewards/rejected": -2.0677008628845215, + "step": 3720 + }, + { + "epoch": 1.7, + "learning_rate": 1.4408929477422628e-07, + "logits/chosen": -0.4201040267944336, + "logits/rejected": -0.44681286811828613, + "logps/chosen": -119.80631256103516, + "logps/rejected": -95.5534439086914, + "loss": 0.1594, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.08919620513916, + "rewards/margins": 5.784335613250732, + "rewards/rejected": -1.6951395273208618, + "step": 3730 + }, + { + "epoch": 1.71, + "learning_rate": 1.4358193810248604e-07, + "logits/chosen": -0.4364323019981384, + "logits/rejected": -0.4687287211418152, + "logps/chosen": -117.48023986816406, + "logps/rejected": -95.4148178100586, + "loss": 0.1455, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.810602903366089, + "rewards/margins": 5.867162227630615, + "rewards/rejected": -2.0565590858459473, + "step": 3740 + }, + { + "epoch": 1.71, + "learning_rate": 1.430745814307458e-07, + "logits/chosen": -0.4337928295135498, + "logits/rejected": -0.4612889289855957, + "logps/chosen": -118.15765380859375, + "logps/rejected": -97.46696472167969, + "loss": 0.1727, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.426650285720825, + "rewards/margins": 5.878968238830566, + "rewards/rejected": -2.452317953109741, + "step": 3750 + }, + { + "epoch": 1.72, + "learning_rate": 1.4256722475900558e-07, + "logits/chosen": -0.40918678045272827, + "logits/rejected": -0.4386422634124756, + "logps/chosen": -117.19795989990234, + "logps/rejected": -95.10436248779297, + "loss": 0.1812, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.7097976207733154, + "rewards/margins": 5.490555763244629, + "rewards/rejected": -1.7807576656341553, + "step": 3760 + }, + { + "epoch": 1.72, + "learning_rate": 1.4205986808726534e-07, + "logits/chosen": -0.41621828079223633, + "logits/rejected": -0.43877357244491577, + "logps/chosen": -118.92469787597656, + "logps/rejected": -101.1650619506836, + "loss": 0.1475, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.605703830718994, + "rewards/margins": 5.508579254150391, + "rewards/rejected": -1.9028756618499756, + "step": 3770 + }, + { + "epoch": 1.73, + "learning_rate": 1.415525114155251e-07, + "logits/chosen": -0.43570631742477417, + "logits/rejected": -0.4763062000274658, + "logps/chosen": -125.0984878540039, + "logps/rejected": -97.47032165527344, + "loss": 0.1453, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.5961880683898926, + "rewards/margins": 5.880617618560791, + "rewards/rejected": -2.2844297885894775, + "step": 3780 + }, + { + "epoch": 1.73, + "learning_rate": 1.4104515474378488e-07, + "logits/chosen": -0.4550606608390808, + "logits/rejected": -0.4833446145057678, + "logps/chosen": -123.15104675292969, + "logps/rejected": -93.86997985839844, + "loss": 0.1366, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.7832932472229004, + "rewards/margins": 5.126801490783691, + "rewards/rejected": -1.343508005142212, + "step": 3790 + }, + { + "epoch": 1.73, + "learning_rate": 1.4053779807204464e-07, + "logits/chosen": -0.4106081426143646, + "logits/rejected": -0.4471622407436371, + "logps/chosen": -119.42298889160156, + "logps/rejected": -92.5013198852539, + "loss": 0.1598, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.9610819816589355, + "rewards/margins": 4.718194007873535, + "rewards/rejected": -1.7571115493774414, + "step": 3800 + }, + { + "epoch": 1.73, + "eval_logits/chosen": -0.4349251985549927, + "eval_logits/rejected": -0.4631372094154358, + "eval_logps/chosen": -115.83056640625, + "eval_logps/rejected": -94.33674621582031, + "eval_loss": 0.1612546443939209, + "eval_rewards/accuracies": 0.9078212380409241, + "eval_rewards/chosen": 3.6645917892456055, + "eval_rewards/margins": 5.368130683898926, + "eval_rewards/rejected": -1.7035386562347412, + "eval_runtime": 908.5705, + "eval_samples_per_second": 3.15, + "eval_steps_per_second": 0.197, + "step": 3800 + }, + { + "epoch": 1.74, + "learning_rate": 1.400304414003044e-07, + "logits/chosen": -0.4099394679069519, + "logits/rejected": -0.43724679946899414, + "logps/chosen": -116.51655578613281, + "logps/rejected": -99.12494659423828, + "loss": 0.1608, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.131943941116333, + "rewards/margins": 5.072139263153076, + "rewards/rejected": -1.9401954412460327, + "step": 3810 + }, + { + "epoch": 1.74, + "learning_rate": 1.3952308472856418e-07, + "logits/chosen": -0.4199654161930084, + "logits/rejected": -0.44734907150268555, + "logps/chosen": -116.4480209350586, + "logps/rejected": -91.37544250488281, + "loss": 0.1617, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.9373676776885986, + "rewards/margins": 5.529845237731934, + "rewards/rejected": -1.592477560043335, + "step": 3820 + }, + { + "epoch": 1.75, + "learning_rate": 1.3901572805682394e-07, + "logits/chosen": -0.4397161602973938, + "logits/rejected": -0.4739462435245514, + "logps/chosen": -123.17852783203125, + "logps/rejected": -97.94535827636719, + "loss": 0.1313, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.092246055603027, + "rewards/margins": 6.0095133781433105, + "rewards/rejected": -1.9172674417495728, + "step": 3830 + }, + { + "epoch": 1.75, + "learning_rate": 1.385083713850837e-07, + "logits/chosen": -0.44692462682724, + "logits/rejected": -0.47341424226760864, + "logps/chosen": -121.00887298583984, + "logps/rejected": -92.6983413696289, + "loss": 0.1729, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.020684719085693, + "rewards/margins": 6.22866678237915, + "rewards/rejected": -2.207981824874878, + "step": 3840 + }, + { + "epoch": 1.76, + "learning_rate": 1.3800101471334348e-07, + "logits/chosen": -0.39461749792099, + "logits/rejected": -0.4253208637237549, + "logps/chosen": -118.1397476196289, + "logps/rejected": -89.09568786621094, + "loss": 0.1343, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.4894938468933105, + "rewards/margins": 5.35275411605835, + "rewards/rejected": -1.8632599115371704, + "step": 3850 + }, + { + "epoch": 1.76, + "learning_rate": 1.3749365804160324e-07, + "logits/chosen": -0.4318575859069824, + "logits/rejected": -0.4667425751686096, + "logps/chosen": -123.15715026855469, + "logps/rejected": -96.49995422363281, + "loss": 0.1608, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.9990055561065674, + "rewards/margins": 5.423766136169434, + "rewards/rejected": -1.4247612953186035, + "step": 3860 + }, + { + "epoch": 1.77, + "learning_rate": 1.36986301369863e-07, + "logits/chosen": -0.42642942070961, + "logits/rejected": -0.46308574080467224, + "logps/chosen": -121.68193054199219, + "logps/rejected": -100.54917907714844, + "loss": 0.1442, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.072659015655518, + "rewards/margins": 6.057482719421387, + "rewards/rejected": -1.9848238229751587, + "step": 3870 + }, + { + "epoch": 1.77, + "learning_rate": 1.3647894469812278e-07, + "logits/chosen": -0.4128958582878113, + "logits/rejected": -0.43185925483703613, + "logps/chosen": -123.2667007446289, + "logps/rejected": -95.40806579589844, + "loss": 0.1661, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.089308738708496, + "rewards/margins": 5.6935648918151855, + "rewards/rejected": -1.6042560338974, + "step": 3880 + }, + { + "epoch": 1.78, + "learning_rate": 1.3597158802638254e-07, + "logits/chosen": -0.41117334365844727, + "logits/rejected": -0.431837260723114, + "logps/chosen": -117.51399230957031, + "logps/rejected": -99.26336669921875, + "loss": 0.1432, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.4673914909362793, + "rewards/margins": 5.661924839019775, + "rewards/rejected": -2.194532632827759, + "step": 3890 + }, + { + "epoch": 1.78, + "learning_rate": 1.354642313546423e-07, + "logits/chosen": -0.42940855026245117, + "logits/rejected": -0.4633054733276367, + "logps/chosen": -115.96989440917969, + "logps/rejected": -92.38642120361328, + "loss": 0.1337, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.4177119731903076, + "rewards/margins": 5.029784679412842, + "rewards/rejected": -1.6120731830596924, + "step": 3900 + }, + { + "epoch": 1.78, + "eval_logits/chosen": -0.43678709864616394, + "eval_logits/rejected": -0.4658050537109375, + "eval_logps/chosen": -116.05931091308594, + "eval_logps/rejected": -94.6183853149414, + "eval_loss": 0.15831047296524048, + "eval_rewards/accuracies": 0.9134078025817871, + "eval_rewards/chosen": 3.550219774246216, + "eval_rewards/margins": 5.394576549530029, + "eval_rewards/rejected": -1.844356894493103, + "eval_runtime": 910.9099, + "eval_samples_per_second": 3.142, + "eval_steps_per_second": 0.197, + "step": 3900 + }, + { + "epoch": 1.78, + "learning_rate": 1.3495687468290208e-07, + "logits/chosen": -0.4411635994911194, + "logits/rejected": -0.46993690729141235, + "logps/chosen": -121.70280456542969, + "logps/rejected": -93.5676040649414, + "loss": 0.1384, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.199113845825195, + "rewards/margins": 6.246798992156982, + "rewards/rejected": -2.0476861000061035, + "step": 3910 + }, + { + "epoch": 1.79, + "learning_rate": 1.3444951801116184e-07, + "logits/chosen": -0.40244975686073303, + "logits/rejected": -0.4448125958442688, + "logps/chosen": -121.76031494140625, + "logps/rejected": -94.28700256347656, + "loss": 0.1325, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.4426021575927734, + "rewards/margins": 5.515939235687256, + "rewards/rejected": -2.0733370780944824, + "step": 3920 + }, + { + "epoch": 1.79, + "learning_rate": 1.339421613394216e-07, + "logits/chosen": -0.39999261498451233, + "logits/rejected": -0.4254913330078125, + "logps/chosen": -120.90069580078125, + "logps/rejected": -101.6038818359375, + "loss": 0.1789, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.0842125415802, + "rewards/margins": 5.103273868560791, + "rewards/rejected": -2.0190606117248535, + "step": 3930 + }, + { + "epoch": 1.8, + "learning_rate": 1.3343480466768138e-07, + "logits/chosen": -0.4498108923435211, + "logits/rejected": -0.48698073625564575, + "logps/chosen": -115.94972229003906, + "logps/rejected": -92.61611938476562, + "loss": 0.1599, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5224971771240234, + "rewards/margins": 5.299635410308838, + "rewards/rejected": -1.7771377563476562, + "step": 3940 + }, + { + "epoch": 1.8, + "learning_rate": 1.3292744799594114e-07, + "logits/chosen": -0.43744587898254395, + "logits/rejected": -0.47521620988845825, + "logps/chosen": -114.79386138916016, + "logps/rejected": -94.7384262084961, + "loss": 0.1607, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.8626174926757812, + "rewards/margins": 5.312564373016357, + "rewards/rejected": -1.4499469995498657, + "step": 3950 + }, + { + "epoch": 1.81, + "learning_rate": 1.324200913242009e-07, + "logits/chosen": -0.4359627664089203, + "logits/rejected": -0.44901376962661743, + "logps/chosen": -116.04862976074219, + "logps/rejected": -94.40897369384766, + "loss": 0.136, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.450641632080078, + "rewards/margins": 5.271419525146484, + "rewards/rejected": -1.820778250694275, + "step": 3960 + }, + { + "epoch": 1.81, + "learning_rate": 1.3191273465246068e-07, + "logits/chosen": -0.42982253432273865, + "logits/rejected": -0.4628722071647644, + "logps/chosen": -110.03547668457031, + "logps/rejected": -97.66302490234375, + "loss": 0.1322, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.349524736404419, + "rewards/margins": 5.647238731384277, + "rewards/rejected": -2.2977142333984375, + "step": 3970 + }, + { + "epoch": 1.82, + "learning_rate": 1.3140537798072044e-07, + "logits/chosen": -0.4669429659843445, + "logits/rejected": -0.49395719170570374, + "logps/chosen": -114.62618255615234, + "logps/rejected": -90.14851379394531, + "loss": 0.1472, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.9443836212158203, + "rewards/margins": 5.539119243621826, + "rewards/rejected": -1.594735860824585, + "step": 3980 + }, + { + "epoch": 1.82, + "learning_rate": 1.308980213089802e-07, + "logits/chosen": -0.44839197397232056, + "logits/rejected": -0.4741789400577545, + "logps/chosen": -116.9367446899414, + "logps/rejected": -95.77188873291016, + "loss": 0.1628, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.986290693283081, + "rewards/margins": 6.274405479431152, + "rewards/rejected": -2.2881150245666504, + "step": 3990 + }, + { + "epoch": 1.83, + "learning_rate": 1.3039066463723998e-07, + "logits/chosen": -0.43531838059425354, + "logits/rejected": -0.4600156843662262, + "logps/chosen": -120.1506118774414, + "logps/rejected": -94.55183410644531, + "loss": 0.1534, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.6019279956817627, + "rewards/margins": 5.652145862579346, + "rewards/rejected": -2.0502171516418457, + "step": 4000 + }, + { + "epoch": 1.83, + "eval_logits/chosen": -0.43281227350234985, + "eval_logits/rejected": -0.46104538440704346, + "eval_logps/chosen": -116.1445541381836, + "eval_logps/rejected": -94.75707244873047, + "eval_loss": 0.1572478860616684, + "eval_rewards/accuracies": 0.9189944267272949, + "eval_rewards/chosen": 3.5075979232788086, + "eval_rewards/margins": 5.421296119689941, + "eval_rewards/rejected": -1.913697600364685, + "eval_runtime": 909.0488, + "eval_samples_per_second": 3.148, + "eval_steps_per_second": 0.197, + "step": 4000 + }, + { + "epoch": 1.83, + "learning_rate": 1.2988330796549974e-07, + "logits/chosen": -0.42936331033706665, + "logits/rejected": -0.46228766441345215, + "logps/chosen": -114.8753433227539, + "logps/rejected": -94.108154296875, + "loss": 0.1411, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.468700885772705, + "rewards/margins": 5.894384384155273, + "rewards/rejected": -2.4256832599639893, + "step": 4010 + }, + { + "epoch": 1.83, + "learning_rate": 1.293759512937595e-07, + "logits/chosen": -0.43876391649246216, + "logits/rejected": -0.46565374732017517, + "logps/chosen": -118.2698745727539, + "logps/rejected": -98.43639373779297, + "loss": 0.1611, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9941275119781494, + "rewards/margins": 5.229546546936035, + "rewards/rejected": -2.2354190349578857, + "step": 4020 + }, + { + "epoch": 1.84, + "learning_rate": 1.2886859462201928e-07, + "logits/chosen": -0.48172348737716675, + "logits/rejected": -0.5027529001235962, + "logps/chosen": -116.6221694946289, + "logps/rejected": -98.63446807861328, + "loss": 0.1486, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.6433708667755127, + "rewards/margins": 5.4944939613342285, + "rewards/rejected": -1.8511232137680054, + "step": 4030 + }, + { + "epoch": 1.84, + "learning_rate": 1.2836123795027904e-07, + "logits/chosen": -0.47252827882766724, + "logits/rejected": -0.4866446554660797, + "logps/chosen": -112.34371185302734, + "logps/rejected": -95.17266845703125, + "loss": 0.1148, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.1450066566467285, + "rewards/margins": 5.001776695251465, + "rewards/rejected": -1.8567705154418945, + "step": 4040 + }, + { + "epoch": 1.85, + "learning_rate": 1.278538812785388e-07, + "logits/chosen": -0.4539973735809326, + "logits/rejected": -0.47425252199172974, + "logps/chosen": -125.0869140625, + "logps/rejected": -99.01017761230469, + "loss": 0.1351, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.614443778991699, + "rewards/margins": 6.311370372772217, + "rewards/rejected": -1.6969267129898071, + "step": 4050 + }, + { + "epoch": 1.85, + "learning_rate": 1.2734652460679858e-07, + "logits/chosen": -0.41173696517944336, + "logits/rejected": -0.4408145546913147, + "logps/chosen": -124.49433898925781, + "logps/rejected": -98.87799835205078, + "loss": 0.138, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.2565371990203857, + "rewards/margins": 5.311087608337402, + "rewards/rejected": -2.0545499324798584, + "step": 4060 + }, + { + "epoch": 1.86, + "learning_rate": 1.2683916793505834e-07, + "logits/chosen": -0.46297794580459595, + "logits/rejected": -0.4824606478214264, + "logps/chosen": -112.1919174194336, + "logps/rejected": -98.0719223022461, + "loss": 0.1493, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 2.8646275997161865, + "rewards/margins": 5.0431928634643555, + "rewards/rejected": -2.178565502166748, + "step": 4070 + }, + { + "epoch": 1.86, + "learning_rate": 1.263318112633181e-07, + "logits/chosen": -0.4470444619655609, + "logits/rejected": -0.47431832551956177, + "logps/chosen": -121.6833724975586, + "logps/rejected": -94.71736907958984, + "loss": 0.1406, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.3270621299743652, + "rewards/margins": 5.1658525466918945, + "rewards/rejected": -1.8387901782989502, + "step": 4080 + }, + { + "epoch": 1.87, + "learning_rate": 1.2582445459157788e-07, + "logits/chosen": -0.4353705048561096, + "logits/rejected": -0.4633060395717621, + "logps/chosen": -117.09333801269531, + "logps/rejected": -96.31623840332031, + "loss": 0.1741, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.7191150188446045, + "rewards/margins": 5.496578693389893, + "rewards/rejected": -1.7774631977081299, + "step": 4090 + }, + { + "epoch": 1.87, + "learning_rate": 1.2531709791983764e-07, + "logits/chosen": -0.4220728874206543, + "logits/rejected": -0.4554152488708496, + "logps/chosen": -123.45231628417969, + "logps/rejected": -99.41911315917969, + "loss": 0.1327, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.361505508422852, + "rewards/margins": 6.2839765548706055, + "rewards/rejected": -1.9224706888198853, + "step": 4100 + }, + { + "epoch": 1.87, + "eval_logits/chosen": -0.4152924418449402, + "eval_logits/rejected": -0.4403891861438751, + "eval_logps/chosen": -116.01750946044922, + "eval_logps/rejected": -94.75827026367188, + "eval_loss": 0.16070948541164398, + "eval_rewards/accuracies": 0.9217877388000488, + "eval_rewards/chosen": 3.5711212158203125, + "eval_rewards/margins": 5.485420227050781, + "eval_rewards/rejected": -1.9142990112304688, + "eval_runtime": 878.437, + "eval_samples_per_second": 3.258, + "eval_steps_per_second": 0.204, + "step": 4100 + }, + { + "epoch": 1.88, + "learning_rate": 1.248097412480974e-07, + "logits/chosen": -0.4266030788421631, + "logits/rejected": -0.4478863775730133, + "logps/chosen": -130.07701110839844, + "logps/rejected": -99.33512878417969, + "loss": 0.1339, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.12672758102417, + "rewards/margins": 5.217502117156982, + "rewards/rejected": -2.0907750129699707, + "step": 4110 + }, + { + "epoch": 1.88, + "learning_rate": 1.2430238457635718e-07, + "logits/chosen": -0.4379239082336426, + "logits/rejected": -0.4631032347679138, + "logps/chosen": -114.28575134277344, + "logps/rejected": -98.11552429199219, + "loss": 0.151, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.4146289825439453, + "rewards/margins": 5.491271018981934, + "rewards/rejected": -2.0766425132751465, + "step": 4120 + }, + { + "epoch": 1.89, + "learning_rate": 1.2379502790461694e-07, + "logits/chosen": -0.4479547441005707, + "logits/rejected": -0.48350486159324646, + "logps/chosen": -112.89759826660156, + "logps/rejected": -89.53084564208984, + "loss": 0.1645, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.695988893508911, + "rewards/margins": 5.8413238525390625, + "rewards/rejected": -2.1453351974487305, + "step": 4130 + }, + { + "epoch": 1.89, + "learning_rate": 1.232876712328767e-07, + "logits/chosen": -0.42592209577560425, + "logits/rejected": -0.46377819776535034, + "logps/chosen": -118.2711181640625, + "logps/rejected": -91.99664306640625, + "loss": 0.1556, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.791079044342041, + "rewards/margins": 5.7715983390808105, + "rewards/rejected": -1.9805190563201904, + "step": 4140 + }, + { + "epoch": 1.89, + "learning_rate": 1.2278031456113648e-07, + "logits/chosen": -0.42525094747543335, + "logits/rejected": -0.4680793881416321, + "logps/chosen": -131.39756774902344, + "logps/rejected": -96.14659881591797, + "loss": 0.1193, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.135112762451172, + "rewards/margins": 6.375372886657715, + "rewards/rejected": -2.2402608394622803, + "step": 4150 + }, + { + "epoch": 1.9, + "learning_rate": 1.2227295788939624e-07, + "logits/chosen": -0.4264507293701172, + "logits/rejected": -0.4579479694366455, + "logps/chosen": -124.99159240722656, + "logps/rejected": -97.59441375732422, + "loss": 0.1456, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.248692035675049, + "rewards/margins": 5.797226428985596, + "rewards/rejected": -2.548535108566284, + "step": 4160 + }, + { + "epoch": 1.9, + "learning_rate": 1.21765601217656e-07, + "logits/chosen": -0.44981488585472107, + "logits/rejected": -0.4686676561832428, + "logps/chosen": -113.22137451171875, + "logps/rejected": -94.71104431152344, + "loss": 0.1379, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.8100783824920654, + "rewards/margins": 5.2517571449279785, + "rewards/rejected": -1.4416786432266235, + "step": 4170 + }, + { + "epoch": 1.91, + "learning_rate": 1.2125824454591578e-07, + "logits/chosen": -0.44809216260910034, + "logits/rejected": -0.4698753356933594, + "logps/chosen": -121.99101257324219, + "logps/rejected": -99.77064514160156, + "loss": 0.1487, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.432638168334961, + "rewards/margins": 6.586982727050781, + "rewards/rejected": -2.1543452739715576, + "step": 4180 + }, + { + "epoch": 1.91, + "learning_rate": 1.2075088787417554e-07, + "logits/chosen": -0.4335380494594574, + "logits/rejected": -0.4673110544681549, + "logps/chosen": -117.7331771850586, + "logps/rejected": -91.36144256591797, + "loss": 0.1445, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.4432575702667236, + "rewards/margins": 5.076163291931152, + "rewards/rejected": -1.6329059600830078, + "step": 4190 + }, + { + "epoch": 1.92, + "learning_rate": 1.202435312024353e-07, + "logits/chosen": -0.441293329000473, + "logits/rejected": -0.474077045917511, + "logps/chosen": -120.2531509399414, + "logps/rejected": -93.95128631591797, + "loss": 0.162, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.6226940155029297, + "rewards/margins": 4.789700508117676, + "rewards/rejected": -1.1670061349868774, + "step": 4200 + }, + { + "epoch": 1.92, + "eval_logits/chosen": -0.4373014569282532, + "eval_logits/rejected": -0.46410053968429565, + "eval_logps/chosen": -116.1893081665039, + "eval_logps/rejected": -94.95684814453125, + "eval_loss": 0.15646876394748688, + "eval_rewards/accuracies": 0.9329608678817749, + "eval_rewards/chosen": 3.485214948654175, + "eval_rewards/margins": 5.498796463012695, + "eval_rewards/rejected": -2.0135817527770996, + "eval_runtime": 905.3895, + "eval_samples_per_second": 3.161, + "eval_steps_per_second": 0.198, + "step": 4200 + }, + { + "epoch": 1.92, + "learning_rate": 1.1973617453069508e-07, + "logits/chosen": -0.41942963004112244, + "logits/rejected": -0.45433226227760315, + "logps/chosen": -119.24137878417969, + "logps/rejected": -93.80500793457031, + "loss": 0.1342, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.265337944030762, + "rewards/margins": 5.979970455169678, + "rewards/rejected": -1.714632272720337, + "step": 4210 + }, + { + "epoch": 1.93, + "learning_rate": 1.1922881785895484e-07, + "logits/chosen": -0.476146936416626, + "logits/rejected": -0.493946373462677, + "logps/chosen": -118.25067138671875, + "logps/rejected": -99.85133361816406, + "loss": 0.1604, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.482466220855713, + "rewards/margins": 5.256065368652344, + "rewards/rejected": -1.7735998630523682, + "step": 4220 + }, + { + "epoch": 1.93, + "learning_rate": 1.187214611872146e-07, + "logits/chosen": -0.4835086762905121, + "logits/rejected": -0.5142003297805786, + "logps/chosen": -119.5131607055664, + "logps/rejected": -100.95896911621094, + "loss": 0.1474, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.157654285430908, + "rewards/margins": 5.392757892608643, + "rewards/rejected": -2.2351036071777344, + "step": 4230 + }, + { + "epoch": 1.94, + "learning_rate": 1.1821410451547436e-07, + "logits/chosen": -0.40879902243614197, + "logits/rejected": -0.428755521774292, + "logps/chosen": -113.699462890625, + "logps/rejected": -99.03292083740234, + "loss": 0.1403, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.027031421661377, + "rewards/margins": 5.47978401184082, + "rewards/rejected": -1.4527524709701538, + "step": 4240 + }, + { + "epoch": 1.94, + "learning_rate": 1.1770674784373413e-07, + "logits/chosen": -0.46195000410079956, + "logits/rejected": -0.47916096448898315, + "logps/chosen": -110.07051086425781, + "logps/rejected": -96.48656463623047, + "loss": 0.1403, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9599082469940186, + "rewards/margins": 4.992161750793457, + "rewards/rejected": -2.0322537422180176, + "step": 4250 + }, + { + "epoch": 1.94, + "learning_rate": 1.171993911719939e-07, + "logits/chosen": -0.4431841969490051, + "logits/rejected": -0.46271103620529175, + "logps/chosen": -113.02568054199219, + "logps/rejected": -98.78375244140625, + "loss": 0.1526, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.211625576019287, + "rewards/margins": 5.480974197387695, + "rewards/rejected": -2.269348621368408, + "step": 4260 + }, + { + "epoch": 1.95, + "learning_rate": 1.1669203450025366e-07, + "logits/chosen": -0.4586679935455322, + "logits/rejected": -0.4842369556427002, + "logps/chosen": -113.3587646484375, + "logps/rejected": -94.2232437133789, + "loss": 0.18, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.053977012634277, + "rewards/margins": 5.603518009185791, + "rewards/rejected": -1.5495409965515137, + "step": 4270 + }, + { + "epoch": 1.95, + "learning_rate": 1.1618467782851343e-07, + "logits/chosen": -0.42036324739456177, + "logits/rejected": -0.4484184682369232, + "logps/chosen": -118.8740463256836, + "logps/rejected": -97.7596435546875, + "loss": 0.1245, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.5045456886291504, + "rewards/margins": 5.583371162414551, + "rewards/rejected": -2.0788257122039795, + "step": 4280 + }, + { + "epoch": 1.96, + "learning_rate": 1.156773211567732e-07, + "logits/chosen": -0.45418959856033325, + "logits/rejected": -0.48503002524375916, + "logps/chosen": -115.06591796875, + "logps/rejected": -91.06563568115234, + "loss": 0.1351, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.340029239654541, + "rewards/margins": 5.9402241706848145, + "rewards/rejected": -2.6001949310302734, + "step": 4290 + }, + { + "epoch": 1.96, + "learning_rate": 1.1516996448503296e-07, + "logits/chosen": -0.47121506929397583, + "logits/rejected": -0.4892233908176422, + "logps/chosen": -119.47066497802734, + "logps/rejected": -100.1035385131836, + "loss": 0.1471, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.297662258148193, + "rewards/margins": 6.468558311462402, + "rewards/rejected": -2.170895576477051, + "step": 4300 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -0.43375614285469055, + "eval_logits/rejected": -0.46272942423820496, + "eval_logps/chosen": -116.03189086914062, + "eval_logps/rejected": -94.88296508789062, + "eval_loss": 0.15242531895637512, + "eval_rewards/accuracies": 0.924580991268158, + "eval_rewards/chosen": 3.563926935195923, + "eval_rewards/margins": 5.540570259094238, + "eval_rewards/rejected": -1.976643681526184, + "eval_runtime": 903.7216, + "eval_samples_per_second": 3.167, + "eval_steps_per_second": 0.198, + "step": 4300 + }, + { + "epoch": 1.97, + "learning_rate": 1.1466260781329273e-07, + "logits/chosen": -0.4415101110935211, + "logits/rejected": -0.4698669910430908, + "logps/chosen": -116.846923828125, + "logps/rejected": -97.8760986328125, + "loss": 0.145, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.341931104660034, + "rewards/margins": 5.32661771774292, + "rewards/rejected": -1.9846864938735962, + "step": 4310 + }, + { + "epoch": 1.97, + "learning_rate": 1.141552511415525e-07, + "logits/chosen": -0.4346837103366852, + "logits/rejected": -0.46257075667381287, + "logps/chosen": -123.95588684082031, + "logps/rejected": -94.39106750488281, + "loss": 0.1233, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.201815605163574, + "rewards/margins": 6.11990213394165, + "rewards/rejected": -1.9180864095687866, + "step": 4320 + }, + { + "epoch": 1.98, + "learning_rate": 1.1364789446981226e-07, + "logits/chosen": -0.4392518997192383, + "logits/rejected": -0.46720820665359497, + "logps/chosen": -122.53446960449219, + "logps/rejected": -101.41107177734375, + "loss": 0.1364, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.7970690727233887, + "rewards/margins": 6.052388668060303, + "rewards/rejected": -2.2553200721740723, + "step": 4330 + }, + { + "epoch": 1.98, + "learning_rate": 1.1314053779807203e-07, + "logits/chosen": -0.4463014602661133, + "logits/rejected": -0.4627785086631775, + "logps/chosen": -108.80528259277344, + "logps/rejected": -92.49729919433594, + "loss": 0.1301, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.5569469928741455, + "rewards/margins": 5.680870056152344, + "rewards/rejected": -2.1239230632781982, + "step": 4340 + }, + { + "epoch": 1.99, + "learning_rate": 1.126331811263318e-07, + "logits/chosen": -0.4043458104133606, + "logits/rejected": -0.4282744526863098, + "logps/chosen": -125.00662994384766, + "logps/rejected": -96.8555908203125, + "loss": 0.1329, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.774332046508789, + "rewards/margins": 5.595047473907471, + "rewards/rejected": -1.8207143545150757, + "step": 4350 + }, + { + "epoch": 1.99, + "learning_rate": 1.1212582445459156e-07, + "logits/chosen": -0.424589067697525, + "logits/rejected": -0.4528760313987732, + "logps/chosen": -123.8675537109375, + "logps/rejected": -96.47154998779297, + "loss": 0.1358, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.940141201019287, + "rewards/margins": 5.887082576751709, + "rewards/rejected": -1.9469407796859741, + "step": 4360 + }, + { + "epoch": 1.99, + "learning_rate": 1.1161846778285133e-07, + "logits/chosen": -0.4298287332057953, + "logits/rejected": -0.44278186559677124, + "logps/chosen": -111.9197998046875, + "logps/rejected": -96.86292266845703, + "loss": 0.1402, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.8632125854492188, + "rewards/margins": 5.875947952270508, + "rewards/rejected": -2.01273512840271, + "step": 4370 + }, + { + "epoch": 2.0, + "learning_rate": 1.111111111111111e-07, + "logits/chosen": -0.44265303015708923, + "logits/rejected": -0.48326772451400757, + "logps/chosen": -120.85821533203125, + "logps/rejected": -97.12603759765625, + "loss": 0.1353, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.985802173614502, + "rewards/margins": 6.470803260803223, + "rewards/rejected": -2.4850013256073, + "step": 4380 + }, + { + "epoch": 2.0, + "learning_rate": 1.1060375443937086e-07, + "logits/chosen": -0.4284425377845764, + "logits/rejected": -0.4572037160396576, + "logps/chosen": -119.15400695800781, + "logps/rejected": -92.87478637695312, + "loss": 0.1248, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.157296180725098, + "rewards/margins": 6.6621294021606445, + "rewards/rejected": -2.5048327445983887, + "step": 4390 + }, + { + "epoch": 2.01, + "learning_rate": 1.1009639776763063e-07, + "logits/chosen": -0.4259072244167328, + "logits/rejected": -0.4534842073917389, + "logps/chosen": -116.02922058105469, + "logps/rejected": -98.30314636230469, + "loss": 0.1333, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.9443373680114746, + "rewards/margins": 5.107085227966309, + "rewards/rejected": -2.162747621536255, + "step": 4400 + }, + { + "epoch": 2.01, + "eval_logits/chosen": -0.43279382586479187, + "eval_logits/rejected": -0.4608496427536011, + "eval_logps/chosen": -115.92514038085938, + "eval_logps/rejected": -94.87165069580078, + "eval_loss": 0.14179793000221252, + "eval_rewards/accuracies": 0.916201114654541, + "eval_rewards/chosen": 3.6173088550567627, + "eval_rewards/margins": 5.588301658630371, + "eval_rewards/rejected": -1.9709923267364502, + "eval_runtime": 873.7722, + "eval_samples_per_second": 3.275, + "eval_steps_per_second": 0.205, + "step": 4400 + }, + { + "epoch": 2.01, + "learning_rate": 1.095890410958904e-07, + "logits/chosen": -0.4704256057739258, + "logits/rejected": -0.48912009596824646, + "logps/chosen": -116.41622161865234, + "logps/rejected": -102.0516586303711, + "loss": 0.1023, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.746499538421631, + "rewards/margins": 6.171090126037598, + "rewards/rejected": -2.4245896339416504, + "step": 4410 + }, + { + "epoch": 2.02, + "learning_rate": 1.0908168442415016e-07, + "logits/chosen": -0.43700629472732544, + "logits/rejected": -0.4626290202140808, + "logps/chosen": -115.08659362792969, + "logps/rejected": -94.25322723388672, + "loss": 0.1331, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.027894020080566, + "rewards/margins": 6.144411563873291, + "rewards/rejected": -2.1165177822113037, + "step": 4420 + }, + { + "epoch": 2.02, + "learning_rate": 1.0857432775240993e-07, + "logits/chosen": -0.42565712332725525, + "logits/rejected": -0.44756293296813965, + "logps/chosen": -115.29083251953125, + "logps/rejected": -98.44151306152344, + "loss": 0.1637, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.85151743888855, + "rewards/margins": 5.8956618309021, + "rewards/rejected": -2.044144868850708, + "step": 4430 + }, + { + "epoch": 2.03, + "learning_rate": 1.080669710806697e-07, + "logits/chosen": -0.4473974108695984, + "logits/rejected": -0.47457534074783325, + "logps/chosen": -117.94417572021484, + "logps/rejected": -96.59383392333984, + "loss": 0.1394, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.129402160644531, + "rewards/margins": 6.365184307098389, + "rewards/rejected": -2.2357823848724365, + "step": 4440 + }, + { + "epoch": 2.03, + "learning_rate": 1.0755961440892946e-07, + "logits/chosen": -0.44540318846702576, + "logits/rejected": -0.47266215085983276, + "logps/chosen": -112.82417297363281, + "logps/rejected": -96.38731384277344, + "loss": 0.126, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.4327216148376465, + "rewards/margins": 5.731344223022461, + "rewards/rejected": -2.2986226081848145, + "step": 4450 + }, + { + "epoch": 2.04, + "learning_rate": 1.0705225773718923e-07, + "logits/chosen": -0.44705715775489807, + "logits/rejected": -0.4615131914615631, + "logps/chosen": -116.78440856933594, + "logps/rejected": -96.42256927490234, + "loss": 0.1444, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.238809585571289, + "rewards/margins": 4.722464561462402, + "rewards/rejected": -1.4836552143096924, + "step": 4460 + }, + { + "epoch": 2.04, + "learning_rate": 1.06544901065449e-07, + "logits/chosen": -0.4247209429740906, + "logits/rejected": -0.4695788025856018, + "logps/chosen": -123.95927429199219, + "logps/rejected": -95.46394348144531, + "loss": 0.1399, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.9008700847625732, + "rewards/margins": 5.591902732849121, + "rewards/rejected": -1.6910324096679688, + "step": 4470 + }, + { + "epoch": 2.04, + "learning_rate": 1.0603754439370876e-07, + "logits/chosen": -0.41793909668922424, + "logits/rejected": -0.449241578578949, + "logps/chosen": -121.62298583984375, + "logps/rejected": -96.32791900634766, + "loss": 0.1114, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.407382011413574, + "rewards/margins": 6.299051761627197, + "rewards/rejected": -1.891669511795044, + "step": 4480 + }, + { + "epoch": 2.05, + "learning_rate": 1.0553018772196853e-07, + "logits/chosen": -0.41730570793151855, + "logits/rejected": -0.45211511850357056, + "logps/chosen": -116.56009674072266, + "logps/rejected": -94.37024688720703, + "loss": 0.1273, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.634586811065674, + "rewards/margins": 5.516324996948242, + "rewards/rejected": -1.881738305091858, + "step": 4490 + }, + { + "epoch": 2.05, + "learning_rate": 1.050228310502283e-07, + "logits/chosen": -0.43000978231430054, + "logits/rejected": -0.4528748095035553, + "logps/chosen": -112.00526428222656, + "logps/rejected": -96.77906799316406, + "loss": 0.13, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.7136778831481934, + "rewards/margins": 5.948361873626709, + "rewards/rejected": -2.2346842288970947, + "step": 4500 + }, + { + "epoch": 2.05, + "eval_logits/chosen": -0.43186989426612854, + "eval_logits/rejected": -0.4604170024394989, + "eval_logps/chosen": -115.90472412109375, + "eval_logps/rejected": -94.9027328491211, + "eval_loss": 0.14848706126213074, + "eval_rewards/accuracies": 0.9357541799545288, + "eval_rewards/chosen": 3.627511978149414, + "eval_rewards/margins": 5.61404275894165, + "eval_rewards/rejected": -1.986531138420105, + "eval_runtime": 898.6793, + "eval_samples_per_second": 3.185, + "eval_steps_per_second": 0.199, + "step": 4500 + }, + { + "epoch": 2.06, + "learning_rate": 1.0451547437848806e-07, + "logits/chosen": -0.41770705580711365, + "logits/rejected": -0.4475606381893158, + "logps/chosen": -126.3756332397461, + "logps/rejected": -96.57072448730469, + "loss": 0.1226, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.7878124713897705, + "rewards/margins": 6.038453102111816, + "rewards/rejected": -2.250640392303467, + "step": 4510 + }, + { + "epoch": 2.06, + "learning_rate": 1.0400811770674783e-07, + "logits/chosen": -0.45794838666915894, + "logits/rejected": -0.4789225161075592, + "logps/chosen": -108.92681884765625, + "logps/rejected": -93.19290161132812, + "loss": 0.1066, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.9215340614318848, + "rewards/margins": 6.354188442230225, + "rewards/rejected": -2.432654857635498, + "step": 4520 + }, + { + "epoch": 2.07, + "learning_rate": 1.035007610350076e-07, + "logits/chosen": -0.43893688917160034, + "logits/rejected": -0.46274012327194214, + "logps/chosen": -118.29610443115234, + "logps/rejected": -96.06135559082031, + "loss": 0.1437, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5714943408966064, + "rewards/margins": 5.405307769775391, + "rewards/rejected": -1.8338134288787842, + "step": 4530 + }, + { + "epoch": 2.07, + "learning_rate": 1.0299340436326736e-07, + "logits/chosen": -0.4240226745605469, + "logits/rejected": -0.4497091770172119, + "logps/chosen": -113.27973937988281, + "logps/rejected": -93.89836883544922, + "loss": 0.1385, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.1435554027557373, + "rewards/margins": 4.301648139953613, + "rewards/rejected": -1.1580922603607178, + "step": 4540 + }, + { + "epoch": 2.08, + "learning_rate": 1.0248604769152713e-07, + "logits/chosen": -0.41185903549194336, + "logits/rejected": -0.436403751373291, + "logps/chosen": -111.78126525878906, + "logps/rejected": -97.95333099365234, + "loss": 0.1297, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.2165274620056152, + "rewards/margins": 5.033143520355225, + "rewards/rejected": -1.8166160583496094, + "step": 4550 + }, + { + "epoch": 2.08, + "learning_rate": 1.019786910197869e-07, + "logits/chosen": -0.44074559211730957, + "logits/rejected": -0.4656568467617035, + "logps/chosen": -114.5299301147461, + "logps/rejected": -94.2115478515625, + "loss": 0.1205, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.349818706512451, + "rewards/margins": 5.186150550842285, + "rewards/rejected": -1.8363316059112549, + "step": 4560 + }, + { + "epoch": 2.09, + "learning_rate": 1.0147133434804666e-07, + "logits/chosen": -0.43615585565567017, + "logits/rejected": -0.4757886826992035, + "logps/chosen": -124.17915344238281, + "logps/rejected": -93.8154525756836, + "loss": 0.1529, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.06646203994751, + "rewards/margins": 5.818515300750732, + "rewards/rejected": -1.7520534992218018, + "step": 4570 + }, + { + "epoch": 2.09, + "learning_rate": 1.0096397767630643e-07, + "logits/chosen": -0.4336267411708832, + "logits/rejected": -0.45681333541870117, + "logps/chosen": -122.39564514160156, + "logps/rejected": -96.02613830566406, + "loss": 0.1194, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.0391130447387695, + "rewards/margins": 6.454281806945801, + "rewards/rejected": -2.415168046951294, + "step": 4580 + }, + { + "epoch": 2.1, + "learning_rate": 1.004566210045662e-07, + "logits/chosen": -0.43071794509887695, + "logits/rejected": -0.44464653730392456, + "logps/chosen": -116.609619140625, + "logps/rejected": -94.95137023925781, + "loss": 0.1326, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.4324917793273926, + "rewards/margins": 4.9647088050842285, + "rewards/rejected": -1.5322175025939941, + "step": 4590 + }, + { + "epoch": 2.1, + "learning_rate": 9.994926433282596e-08, + "logits/chosen": -0.4383629858493805, + "logits/rejected": -0.46357983350753784, + "logps/chosen": -117.3117446899414, + "logps/rejected": -98.81439208984375, + "loss": 0.1311, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.8937549591064453, + "rewards/margins": 5.831150531768799, + "rewards/rejected": -1.937395691871643, + "step": 4600 + }, + { + "epoch": 2.1, + "eval_logits/chosen": -0.4123355746269226, + "eval_logits/rejected": -0.4404549300670624, + "eval_logps/chosen": -116.21281433105469, + "eval_logps/rejected": -95.16839599609375, + "eval_loss": 0.15031887590885162, + "eval_rewards/accuracies": 0.9134078025817871, + "eval_rewards/chosen": 3.4734678268432617, + "eval_rewards/margins": 5.592827796936035, + "eval_rewards/rejected": -2.1193599700927734, + "eval_runtime": 907.3168, + "eval_samples_per_second": 3.154, + "eval_steps_per_second": 0.197, + "step": 4600 + }, + { + "epoch": 2.1, + "learning_rate": 9.944190766108573e-08, + "logits/chosen": -0.4133322834968567, + "logits/rejected": -0.4418833255767822, + "logps/chosen": -119.86898040771484, + "logps/rejected": -93.79594421386719, + "loss": 0.1389, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.694523334503174, + "rewards/margins": 5.3314409255981445, + "rewards/rejected": -1.6369177103042603, + "step": 4610 + }, + { + "epoch": 2.11, + "learning_rate": 9.89345509893455e-08, + "logits/chosen": -0.4273843765258789, + "logits/rejected": -0.450580894947052, + "logps/chosen": -112.46533203125, + "logps/rejected": -97.37897491455078, + "loss": 0.1475, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3234305381774902, + "rewards/margins": 5.877073764801025, + "rewards/rejected": -2.553642988204956, + "step": 4620 + }, + { + "epoch": 2.11, + "learning_rate": 9.842719431760526e-08, + "logits/chosen": -0.45100849866867065, + "logits/rejected": -0.4823623597621918, + "logps/chosen": -121.07530212402344, + "logps/rejected": -96.33805084228516, + "loss": 0.1407, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.24837589263916, + "rewards/margins": 6.4495439529418945, + "rewards/rejected": -2.201167583465576, + "step": 4630 + }, + { + "epoch": 2.12, + "learning_rate": 9.791983764586503e-08, + "logits/chosen": -0.4720466732978821, + "logits/rejected": -0.5029277801513672, + "logps/chosen": -115.62332916259766, + "logps/rejected": -89.04841613769531, + "loss": 0.1305, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.728707790374756, + "rewards/margins": 5.528870105743408, + "rewards/rejected": -1.8001619577407837, + "step": 4640 + }, + { + "epoch": 2.12, + "learning_rate": 9.74124809741248e-08, + "logits/chosen": -0.4487905502319336, + "logits/rejected": -0.4747520089149475, + "logps/chosen": -115.44490051269531, + "logps/rejected": -97.00244903564453, + "loss": 0.1243, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.714205503463745, + "rewards/margins": 5.9575676918029785, + "rewards/rejected": -2.2433624267578125, + "step": 4650 + }, + { + "epoch": 2.13, + "learning_rate": 9.690512430238456e-08, + "logits/chosen": -0.4337824285030365, + "logits/rejected": -0.4505422115325928, + "logps/chosen": -118.36312103271484, + "logps/rejected": -93.47285461425781, + "loss": 0.1489, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.8410346508026123, + "rewards/margins": 5.889695167541504, + "rewards/rejected": -2.0486607551574707, + "step": 4660 + }, + { + "epoch": 2.13, + "learning_rate": 9.639776763064433e-08, + "logits/chosen": -0.4305817484855652, + "logits/rejected": -0.4584842622280121, + "logps/chosen": -118.65266418457031, + "logps/rejected": -98.68658447265625, + "loss": 0.1346, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.668896436691284, + "rewards/margins": 5.532942771911621, + "rewards/rejected": -1.8640460968017578, + "step": 4670 + }, + { + "epoch": 2.14, + "learning_rate": 9.58904109589041e-08, + "logits/chosen": -0.42450302839279175, + "logits/rejected": -0.45716771483421326, + "logps/chosen": -120.25984191894531, + "logps/rejected": -94.94532775878906, + "loss": 0.1458, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.8007895946502686, + "rewards/margins": 6.051023483276367, + "rewards/rejected": -2.2502334117889404, + "step": 4680 + }, + { + "epoch": 2.14, + "learning_rate": 9.538305428716386e-08, + "logits/chosen": -0.4451879560947418, + "logits/rejected": -0.4725143015384674, + "logps/chosen": -121.0210189819336, + "logps/rejected": -94.34584045410156, + "loss": 0.1471, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.8500618934631348, + "rewards/margins": 5.363143444061279, + "rewards/rejected": -1.513081669807434, + "step": 4690 + }, + { + "epoch": 2.15, + "learning_rate": 9.487569761542363e-08, + "logits/chosen": -0.4411678910255432, + "logits/rejected": -0.4673629403114319, + "logps/chosen": -120.62171936035156, + "logps/rejected": -93.69928741455078, + "loss": 0.1329, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.366326808929443, + "rewards/margins": 6.472535133361816, + "rewards/rejected": -2.1062076091766357, + "step": 4700 + }, + { + "epoch": 2.15, + "eval_logits/chosen": -0.4228588938713074, + "eval_logits/rejected": -0.45189815759658813, + "eval_logps/chosen": -116.00117492675781, + "eval_logps/rejected": -95.14153289794922, + "eval_loss": 0.14311543107032776, + "eval_rewards/accuracies": 0.9217877388000488, + "eval_rewards/chosen": 3.579282522201538, + "eval_rewards/margins": 5.685213565826416, + "eval_rewards/rejected": -2.105930805206299, + "eval_runtime": 884.9158, + "eval_samples_per_second": 3.234, + "eval_steps_per_second": 0.202, + "step": 4700 + }, + { + "epoch": 2.15, + "learning_rate": 9.43683409436834e-08, + "logits/chosen": -0.4521303176879883, + "logits/rejected": -0.4794772267341614, + "logps/chosen": -116.8443603515625, + "logps/rejected": -101.2288589477539, + "loss": 0.1377, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.3828024864196777, + "rewards/margins": 5.463811874389648, + "rewards/rejected": -2.0810093879699707, + "step": 4710 + }, + { + "epoch": 2.15, + "learning_rate": 9.386098427194316e-08, + "logits/chosen": -0.4442387521266937, + "logits/rejected": -0.47409456968307495, + "logps/chosen": -119.01100158691406, + "logps/rejected": -94.46539306640625, + "loss": 0.1313, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.3471286296844482, + "rewards/margins": 5.751844882965088, + "rewards/rejected": -2.4047162532806396, + "step": 4720 + }, + { + "epoch": 2.16, + "learning_rate": 9.335362760020293e-08, + "logits/chosen": -0.43975549936294556, + "logits/rejected": -0.46223530173301697, + "logps/chosen": -111.1578140258789, + "logps/rejected": -92.90426635742188, + "loss": 0.1386, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 2.7703962326049805, + "rewards/margins": 5.044751167297363, + "rewards/rejected": -2.274355411529541, + "step": 4730 + }, + { + "epoch": 2.16, + "learning_rate": 9.28462709284627e-08, + "logits/chosen": -0.44939175248146057, + "logits/rejected": -0.48308318853378296, + "logps/chosen": -118.98689270019531, + "logps/rejected": -95.54768371582031, + "loss": 0.1281, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.045827388763428, + "rewards/margins": 6.658857822418213, + "rewards/rejected": -2.6130292415618896, + "step": 4740 + }, + { + "epoch": 2.17, + "learning_rate": 9.233891425672246e-08, + "logits/chosen": -0.42080554366111755, + "logits/rejected": -0.45268353819847107, + "logps/chosen": -116.78912353515625, + "logps/rejected": -94.59027862548828, + "loss": 0.1419, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 4.422946929931641, + "rewards/margins": 6.427711486816406, + "rewards/rejected": -2.0047640800476074, + "step": 4750 + }, + { + "epoch": 2.17, + "learning_rate": 9.183155758498223e-08, + "logits/chosen": -0.4549104571342468, + "logits/rejected": -0.4835089147090912, + "logps/chosen": -120.5397720336914, + "logps/rejected": -99.68217468261719, + "loss": 0.1153, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.186094045639038, + "rewards/margins": 5.969161033630371, + "rewards/rejected": -2.783067226409912, + "step": 4760 + }, + { + "epoch": 2.18, + "learning_rate": 9.1324200913242e-08, + "logits/chosen": -0.42458558082580566, + "logits/rejected": -0.45287784934043884, + "logps/chosen": -121.1539535522461, + "logps/rejected": -100.77606201171875, + "loss": 0.1347, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.6020476818084717, + "rewards/margins": 5.12890100479126, + "rewards/rejected": -1.526853322982788, + "step": 4770 + }, + { + "epoch": 2.18, + "learning_rate": 9.081684424150176e-08, + "logits/chosen": -0.42954689264297485, + "logits/rejected": -0.46278315782546997, + "logps/chosen": -122.50996398925781, + "logps/rejected": -98.3419418334961, + "loss": 0.118, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.9317126274108887, + "rewards/margins": 5.562252521514893, + "rewards/rejected": -1.6305391788482666, + "step": 4780 + }, + { + "epoch": 2.19, + "learning_rate": 9.030948756976153e-08, + "logits/chosen": -0.43418097496032715, + "logits/rejected": -0.46971768140792847, + "logps/chosen": -122.99954986572266, + "logps/rejected": -94.2531509399414, + "loss": 0.1419, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.114560127258301, + "rewards/margins": 6.730491638183594, + "rewards/rejected": -2.6159310340881348, + "step": 4790 + }, + { + "epoch": 2.19, + "learning_rate": 8.98021308980213e-08, + "logits/chosen": -0.42841896414756775, + "logits/rejected": -0.46112218499183655, + "logps/chosen": -119.31453704833984, + "logps/rejected": -99.18404388427734, + "loss": 0.1346, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.7384109497070312, + "rewards/margins": 6.064143657684326, + "rewards/rejected": -2.325732946395874, + "step": 4800 + }, + { + "epoch": 2.19, + "eval_logits/chosen": -0.43315914273262024, + "eval_logits/rejected": -0.46388283371925354, + "eval_logps/chosen": -115.94792175292969, + "eval_logps/rejected": -95.05813598632812, + "eval_loss": 0.1493712216615677, + "eval_rewards/accuracies": 0.9273743033409119, + "eval_rewards/chosen": 3.6059112548828125, + "eval_rewards/margins": 5.670140266418457, + "eval_rewards/rejected": -2.0642290115356445, + "eval_runtime": 905.7607, + "eval_samples_per_second": 3.16, + "eval_steps_per_second": 0.198, + "step": 4800 + }, + { + "epoch": 2.2, + "learning_rate": 8.929477422628106e-08, + "logits/chosen": -0.42701220512390137, + "logits/rejected": -0.46004992723464966, + "logps/chosen": -125.9152603149414, + "logps/rejected": -98.69007873535156, + "loss": 0.1357, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 4.1814656257629395, + "rewards/margins": 6.167128086090088, + "rewards/rejected": -1.9856624603271484, + "step": 4810 + }, + { + "epoch": 2.2, + "learning_rate": 8.878741755454083e-08, + "logits/chosen": -0.4257637858390808, + "logits/rejected": -0.45527562499046326, + "logps/chosen": -124.95893859863281, + "logps/rejected": -99.27806091308594, + "loss": 0.1431, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.052617311477661, + "rewards/margins": 5.143216609954834, + "rewards/rejected": -2.090599536895752, + "step": 4820 + }, + { + "epoch": 2.2, + "learning_rate": 8.82800608828006e-08, + "logits/chosen": -0.4246063232421875, + "logits/rejected": -0.4535275399684906, + "logps/chosen": -120.38226318359375, + "logps/rejected": -95.41462707519531, + "loss": 0.1563, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.6748244762420654, + "rewards/margins": 5.4428935050964355, + "rewards/rejected": -1.7680695056915283, + "step": 4830 + }, + { + "epoch": 2.21, + "learning_rate": 8.777270421106036e-08, + "logits/chosen": -0.42737627029418945, + "logits/rejected": -0.45712152123451233, + "logps/chosen": -117.77052307128906, + "logps/rejected": -95.78623962402344, + "loss": 0.1244, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.292767286300659, + "rewards/margins": 5.416769504547119, + "rewards/rejected": -2.12400221824646, + "step": 4840 + }, + { + "epoch": 2.21, + "learning_rate": 8.726534753932013e-08, + "logits/chosen": -0.44735169410705566, + "logits/rejected": -0.4684707224369049, + "logps/chosen": -114.5677261352539, + "logps/rejected": -95.72280883789062, + "loss": 0.1371, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.0538907051086426, + "rewards/margins": 5.442469596862793, + "rewards/rejected": -2.388578414916992, + "step": 4850 + }, + { + "epoch": 2.22, + "learning_rate": 8.67579908675799e-08, + "logits/chosen": -0.44202107191085815, + "logits/rejected": -0.46817198395729065, + "logps/chosen": -112.7282485961914, + "logps/rejected": -96.51592254638672, + "loss": 0.121, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.4495131969451904, + "rewards/margins": 5.658398628234863, + "rewards/rejected": -2.208885669708252, + "step": 4860 + }, + { + "epoch": 2.22, + "learning_rate": 8.625063419583966e-08, + "logits/chosen": -0.4402497410774231, + "logits/rejected": -0.4670419692993164, + "logps/chosen": -113.29608154296875, + "logps/rejected": -94.75654602050781, + "loss": 0.1283, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.559720277786255, + "rewards/margins": 5.763693332672119, + "rewards/rejected": -2.203972816467285, + "step": 4870 + }, + { + "epoch": 2.23, + "learning_rate": 8.574327752409943e-08, + "logits/chosen": -0.42953333258628845, + "logits/rejected": -0.45160502195358276, + "logps/chosen": -109.01979064941406, + "logps/rejected": -94.32757568359375, + "loss": 0.1448, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.248141050338745, + "rewards/margins": 5.128489971160889, + "rewards/rejected": -1.880348563194275, + "step": 4880 + }, + { + "epoch": 2.23, + "learning_rate": 8.52359208523592e-08, + "logits/chosen": -0.42205095291137695, + "logits/rejected": -0.463623046875, + "logps/chosen": -121.49098205566406, + "logps/rejected": -97.1163101196289, + "loss": 0.1358, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.174238681793213, + "rewards/margins": 6.480024814605713, + "rewards/rejected": -2.3057861328125, + "step": 4890 + }, + { + "epoch": 2.24, + "learning_rate": 8.472856418061896e-08, + "logits/chosen": -0.4547550678253174, + "logits/rejected": -0.48270148038864136, + "logps/chosen": -124.11871337890625, + "logps/rejected": -98.28508758544922, + "loss": 0.1462, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.622130870819092, + "rewards/margins": 6.161963939666748, + "rewards/rejected": -2.5398333072662354, + "step": 4900 + }, + { + "epoch": 2.24, + "eval_logits/chosen": -0.425849050283432, + "eval_logits/rejected": -0.4553125500679016, + "eval_logps/chosen": -116.215576171875, + "eval_logps/rejected": -95.25931549072266, + "eval_loss": 0.14547978341579437, + "eval_rewards/accuracies": 0.9217877388000488, + "eval_rewards/chosen": 3.472090482711792, + "eval_rewards/margins": 5.636913299560547, + "eval_rewards/rejected": -2.164822816848755, + "eval_runtime": 921.1192, + "eval_samples_per_second": 3.107, + "eval_steps_per_second": 0.194, + "step": 4900 + }, + { + "epoch": 2.24, + "learning_rate": 8.422120750887873e-08, + "logits/chosen": -0.4150146543979645, + "logits/rejected": -0.43759116530418396, + "logps/chosen": -122.642333984375, + "logps/rejected": -95.4013442993164, + "loss": 0.1495, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.227237701416016, + "rewards/margins": 5.948737144470215, + "rewards/rejected": -1.7214996814727783, + "step": 4910 + }, + { + "epoch": 2.25, + "learning_rate": 8.37138508371385e-08, + "logits/chosen": -0.43968862295150757, + "logits/rejected": -0.4645746648311615, + "logps/chosen": -121.03364562988281, + "logps/rejected": -100.46788024902344, + "loss": 0.137, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.370578289031982, + "rewards/margins": 6.668444633483887, + "rewards/rejected": -2.297865867614746, + "step": 4920 + }, + { + "epoch": 2.25, + "learning_rate": 8.320649416539826e-08, + "logits/chosen": -0.42395099997520447, + "logits/rejected": -0.4579865038394928, + "logps/chosen": -117.4967041015625, + "logps/rejected": -96.34669494628906, + "loss": 0.1244, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.60498309135437, + "rewards/margins": 5.425591945648193, + "rewards/rejected": -1.8206090927124023, + "step": 4930 + }, + { + "epoch": 2.25, + "learning_rate": 8.269913749365803e-08, + "logits/chosen": -0.4292042851448059, + "logits/rejected": -0.4508039355278015, + "logps/chosen": -122.93159484863281, + "logps/rejected": -98.60295104980469, + "loss": 0.1383, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.629180431365967, + "rewards/margins": 5.639649391174316, + "rewards/rejected": -2.0104682445526123, + "step": 4940 + }, + { + "epoch": 2.26, + "learning_rate": 8.21917808219178e-08, + "logits/chosen": -0.42406630516052246, + "logits/rejected": -0.45318031311035156, + "logps/chosen": -123.0400161743164, + "logps/rejected": -100.5845718383789, + "loss": 0.1231, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.6148409843444824, + "rewards/margins": 5.732459545135498, + "rewards/rejected": -2.1176185607910156, + "step": 4950 + }, + { + "epoch": 2.26, + "learning_rate": 8.168442415017756e-08, + "logits/chosen": -0.4305325448513031, + "logits/rejected": -0.4503403604030609, + "logps/chosen": -122.27742004394531, + "logps/rejected": -96.4347152709961, + "loss": 0.1363, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.2944235801696777, + "rewards/margins": 5.731644630432129, + "rewards/rejected": -2.437220573425293, + "step": 4960 + }, + { + "epoch": 2.27, + "learning_rate": 8.117706747843733e-08, + "logits/chosen": -0.45811018347740173, + "logits/rejected": -0.48085397481918335, + "logps/chosen": -115.161376953125, + "logps/rejected": -95.84727478027344, + "loss": 0.1117, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.9942409992218018, + "rewards/margins": 6.176326274871826, + "rewards/rejected": -2.182084798812866, + "step": 4970 + }, + { + "epoch": 2.27, + "learning_rate": 8.06697108066971e-08, + "logits/chosen": -0.45269566774368286, + "logits/rejected": -0.4753597378730774, + "logps/chosen": -119.2424545288086, + "logps/rejected": -97.34378051757812, + "loss": 0.1201, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.708515167236328, + "rewards/margins": 6.624059200286865, + "rewards/rejected": -2.915544271469116, + "step": 4980 + }, + { + "epoch": 2.28, + "learning_rate": 8.016235413495687e-08, + "logits/chosen": -0.4186610281467438, + "logits/rejected": -0.44470691680908203, + "logps/chosen": -112.29804992675781, + "logps/rejected": -98.46720123291016, + "loss": 0.1281, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.875298500061035, + "rewards/margins": 4.640026569366455, + "rewards/rejected": -1.7647278308868408, + "step": 4990 + }, + { + "epoch": 2.28, + "learning_rate": 7.965499746321664e-08, + "logits/chosen": -0.42619308829307556, + "logits/rejected": -0.4579424262046814, + "logps/chosen": -124.06336975097656, + "logps/rejected": -101.90495300292969, + "loss": 0.1221, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.564988613128662, + "rewards/margins": 6.438859462738037, + "rewards/rejected": -1.8738704919815063, + "step": 5000 + }, + { + "epoch": 2.28, + "eval_logits/chosen": -0.42679208517074585, + "eval_logits/rejected": -0.45248451828956604, + "eval_logps/chosen": -115.90120697021484, + "eval_logps/rejected": -95.22400665283203, + "eval_loss": 0.15375325083732605, + "eval_rewards/accuracies": 0.9385474920272827, + "eval_rewards/chosen": 3.62927508354187, + "eval_rewards/margins": 5.7764434814453125, + "eval_rewards/rejected": -2.1471686363220215, + "eval_runtime": 917.3543, + "eval_samples_per_second": 3.12, + "eval_steps_per_second": 0.195, + "step": 5000 + }, + { + "epoch": 2.29, + "learning_rate": 7.91476407914764e-08, + "logits/chosen": -0.4315185546875, + "logits/rejected": -0.4569702744483948, + "logps/chosen": -116.04268646240234, + "logps/rejected": -93.1859130859375, + "loss": 0.1457, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.254358768463135, + "rewards/margins": 5.7949419021606445, + "rewards/rejected": -1.5405833721160889, + "step": 5010 + }, + { + "epoch": 2.29, + "learning_rate": 7.864028411973617e-08, + "logits/chosen": -0.4479445517063141, + "logits/rejected": -0.4728432297706604, + "logps/chosen": -115.8251724243164, + "logps/rejected": -91.85781860351562, + "loss": 0.145, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.826247453689575, + "rewards/margins": 5.700136184692383, + "rewards/rejected": -1.8738889694213867, + "step": 5020 + }, + { + "epoch": 2.3, + "learning_rate": 7.813292744799594e-08, + "logits/chosen": -0.42761698365211487, + "logits/rejected": -0.45577025413513184, + "logps/chosen": -118.49787902832031, + "logps/rejected": -97.28572845458984, + "loss": 0.1264, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.764814853668213, + "rewards/margins": 5.964900970458984, + "rewards/rejected": -2.2000865936279297, + "step": 5030 + }, + { + "epoch": 2.3, + "learning_rate": 7.76255707762557e-08, + "logits/chosen": -0.4219226837158203, + "logits/rejected": -0.46134597063064575, + "logps/chosen": -114.80213928222656, + "logps/rejected": -96.4936294555664, + "loss": 0.144, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.602666139602661, + "rewards/margins": 5.779239177703857, + "rewards/rejected": -2.1765735149383545, + "step": 5040 + }, + { + "epoch": 2.31, + "learning_rate": 7.711821410451547e-08, + "logits/chosen": -0.4301370084285736, + "logits/rejected": -0.4616777002811432, + "logps/chosen": -120.4800033569336, + "logps/rejected": -95.62647247314453, + "loss": 0.1161, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.8796589374542236, + "rewards/margins": 6.127512454986572, + "rewards/rejected": -2.2478537559509277, + "step": 5050 + }, + { + "epoch": 2.31, + "learning_rate": 7.661085743277524e-08, + "logits/chosen": -0.4376090466976166, + "logits/rejected": -0.4689910411834717, + "logps/chosen": -123.61064147949219, + "logps/rejected": -97.51295471191406, + "loss": 0.1214, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.019012451171875, + "rewards/margins": 5.775191307067871, + "rewards/rejected": -1.7561795711517334, + "step": 5060 + }, + { + "epoch": 2.31, + "learning_rate": 7.6103500761035e-08, + "logits/chosen": -0.43211430311203003, + "logits/rejected": -0.4615866541862488, + "logps/chosen": -114.49861907958984, + "logps/rejected": -94.89616394042969, + "loss": 0.131, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0946316719055176, + "rewards/margins": 5.596832752227783, + "rewards/rejected": -2.502201557159424, + "step": 5070 + }, + { + "epoch": 2.32, + "learning_rate": 7.559614408929477e-08, + "logits/chosen": -0.4486325681209564, + "logits/rejected": -0.47001224756240845, + "logps/chosen": -120.32537841796875, + "logps/rejected": -96.01399230957031, + "loss": 0.1241, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.4849534034729, + "rewards/margins": 6.655509948730469, + "rewards/rejected": -2.1705565452575684, + "step": 5080 + }, + { + "epoch": 2.32, + "learning_rate": 7.508878741755454e-08, + "logits/chosen": -0.44581979513168335, + "logits/rejected": -0.47158461809158325, + "logps/chosen": -121.13935852050781, + "logps/rejected": -96.64893341064453, + "loss": 0.1101, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.932180881500244, + "rewards/margins": 6.488729000091553, + "rewards/rejected": -2.556548833847046, + "step": 5090 + }, + { + "epoch": 2.33, + "learning_rate": 7.45814307458143e-08, + "logits/chosen": -0.4497924745082855, + "logits/rejected": -0.4717227518558502, + "logps/chosen": -119.8982925415039, + "logps/rejected": -101.00511169433594, + "loss": 0.1329, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.685342788696289, + "rewards/margins": 6.00387716293335, + "rewards/rejected": -2.3185343742370605, + "step": 5100 + }, + { + "epoch": 2.33, + "eval_logits/chosen": -0.43014708161354065, + "eval_logits/rejected": -0.45783746242523193, + "eval_logps/chosen": -116.2129898071289, + "eval_logps/rejected": -95.2852554321289, + "eval_loss": 0.1486155241727829, + "eval_rewards/accuracies": 0.9357541799545288, + "eval_rewards/chosen": 3.4733829498291016, + "eval_rewards/margins": 5.6511712074279785, + "eval_rewards/rejected": -2.177788019180298, + "eval_runtime": 912.1321, + "eval_samples_per_second": 3.138, + "eval_steps_per_second": 0.196, + "step": 5100 + }, + { + "epoch": 2.33, + "learning_rate": 7.407407407407407e-08, + "logits/chosen": -0.4249711036682129, + "logits/rejected": -0.4414646625518799, + "logps/chosen": -116.63139343261719, + "logps/rejected": -96.9327163696289, + "loss": 0.1374, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.5760879516601562, + "rewards/margins": 5.616466522216797, + "rewards/rejected": -2.040379047393799, + "step": 5110 + }, + { + "epoch": 2.34, + "learning_rate": 7.356671740233384e-08, + "logits/chosen": -0.44433823227882385, + "logits/rejected": -0.48357200622558594, + "logps/chosen": -119.763671875, + "logps/rejected": -92.0307388305664, + "loss": 0.1614, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.4530186653137207, + "rewards/margins": 5.796075344085693, + "rewards/rejected": -2.3430564403533936, + "step": 5120 + }, + { + "epoch": 2.34, + "learning_rate": 7.30593607305936e-08, + "logits/chosen": -0.4470491409301758, + "logits/rejected": -0.45599809288978577, + "logps/chosen": -119.12043762207031, + "logps/rejected": -97.7674789428711, + "loss": 0.1402, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.528527021408081, + "rewards/margins": 5.870127201080322, + "rewards/rejected": -2.341599702835083, + "step": 5130 + }, + { + "epoch": 2.35, + "learning_rate": 7.255200405885337e-08, + "logits/chosen": -0.41116419434547424, + "logits/rejected": -0.44234171509742737, + "logps/chosen": -118.84635925292969, + "logps/rejected": -98.58489990234375, + "loss": 0.1135, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.288877487182617, + "rewards/margins": 6.042611122131348, + "rewards/rejected": -1.7537336349487305, + "step": 5140 + }, + { + "epoch": 2.35, + "learning_rate": 7.204464738711314e-08, + "logits/chosen": -0.454406201839447, + "logits/rejected": -0.47148528695106506, + "logps/chosen": -120.2448959350586, + "logps/rejected": -97.86837768554688, + "loss": 0.1561, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.711885929107666, + "rewards/margins": 6.327667236328125, + "rewards/rejected": -2.6157820224761963, + "step": 5150 + }, + { + "epoch": 2.36, + "learning_rate": 7.15372907153729e-08, + "logits/chosen": -0.4334358274936676, + "logits/rejected": -0.45983433723449707, + "logps/chosen": -117.58463287353516, + "logps/rejected": -98.2049789428711, + "loss": 0.1475, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3072800636291504, + "rewards/margins": 5.857844352722168, + "rewards/rejected": -2.5505640506744385, + "step": 5160 + }, + { + "epoch": 2.36, + "learning_rate": 7.102993404363267e-08, + "logits/chosen": -0.4280088543891907, + "logits/rejected": -0.4603753089904785, + "logps/chosen": -120.7674331665039, + "logps/rejected": -96.3237533569336, + "loss": 0.1194, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.141296863555908, + "rewards/margins": 5.86881160736084, + "rewards/rejected": -1.7275148630142212, + "step": 5170 + }, + { + "epoch": 2.36, + "learning_rate": 7.052257737189244e-08, + "logits/chosen": -0.4364453852176666, + "logits/rejected": -0.45729178190231323, + "logps/chosen": -114.9288330078125, + "logps/rejected": -93.96846771240234, + "loss": 0.1222, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.952524185180664, + "rewards/margins": 5.895291805267334, + "rewards/rejected": -1.9427669048309326, + "step": 5180 + }, + { + "epoch": 2.37, + "learning_rate": 7.00152207001522e-08, + "logits/chosen": -0.4382234513759613, + "logits/rejected": -0.47471123933792114, + "logps/chosen": -116.1670913696289, + "logps/rejected": -96.3835678100586, + "loss": 0.1456, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.9336535930633545, + "rewards/margins": 6.241081237792969, + "rewards/rejected": -2.3074281215667725, + "step": 5190 + }, + { + "epoch": 2.37, + "learning_rate": 6.950786402841197e-08, + "logits/chosen": -0.44112008810043335, + "logits/rejected": -0.4550951421260834, + "logps/chosen": -121.0194320678711, + "logps/rejected": -97.9061508178711, + "loss": 0.1284, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.909855604171753, + "rewards/margins": 5.62261438369751, + "rewards/rejected": -2.7127585411071777, + "step": 5200 + }, + { + "epoch": 2.37, + "eval_logits/chosen": -0.4328519403934479, + "eval_logits/rejected": -0.46113479137420654, + "eval_logps/chosen": -116.19883728027344, + "eval_logps/rejected": -95.26362609863281, + "eval_loss": 0.1526959091424942, + "eval_rewards/accuracies": 0.9078212380409241, + "eval_rewards/chosen": 3.4804577827453613, + "eval_rewards/margins": 5.647433757781982, + "eval_rewards/rejected": -2.1669766902923584, + "eval_runtime": 871.9882, + "eval_samples_per_second": 3.282, + "eval_steps_per_second": 0.205, + "step": 5200 + }, + { + "epoch": 2.38, + "learning_rate": 6.900050735667174e-08, + "logits/chosen": -0.4210163950920105, + "logits/rejected": -0.4521384835243225, + "logps/chosen": -114.48892974853516, + "logps/rejected": -100.7722396850586, + "loss": 0.1248, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.135586738586426, + "rewards/margins": 6.100742340087891, + "rewards/rejected": -2.965155839920044, + "step": 5210 + }, + { + "epoch": 2.38, + "learning_rate": 6.84931506849315e-08, + "logits/chosen": -0.39191263914108276, + "logits/rejected": -0.4211401045322418, + "logps/chosen": -115.17881774902344, + "logps/rejected": -103.11189270019531, + "loss": 0.12, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.7481086254119873, + "rewards/margins": 5.617436408996582, + "rewards/rejected": -1.8693279027938843, + "step": 5220 + }, + { + "epoch": 2.39, + "learning_rate": 6.798579401319127e-08, + "logits/chosen": -0.42194071412086487, + "logits/rejected": -0.452617883682251, + "logps/chosen": -115.75611877441406, + "logps/rejected": -99.6901626586914, + "loss": 0.1173, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.275547504425049, + "rewards/margins": 6.447475433349609, + "rewards/rejected": -2.1719284057617188, + "step": 5230 + }, + { + "epoch": 2.39, + "learning_rate": 6.747843734145104e-08, + "logits/chosen": -0.4251808226108551, + "logits/rejected": -0.4505956768989563, + "logps/chosen": -113.19279479980469, + "logps/rejected": -95.99583435058594, + "loss": 0.1194, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.490676164627075, + "rewards/margins": 6.030789375305176, + "rewards/rejected": -2.540113687515259, + "step": 5240 + }, + { + "epoch": 2.4, + "learning_rate": 6.69710806697108e-08, + "logits/chosen": -0.44365444779396057, + "logits/rejected": -0.4735577702522278, + "logps/chosen": -119.94706726074219, + "logps/rejected": -96.17887878417969, + "loss": 0.1544, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.618509292602539, + "rewards/margins": 6.0719451904296875, + "rewards/rejected": -2.4534356594085693, + "step": 5250 + }, + { + "epoch": 2.4, + "learning_rate": 6.646372399797057e-08, + "logits/chosen": -0.405931293964386, + "logits/rejected": -0.43033894896507263, + "logps/chosen": -112.36710357666016, + "logps/rejected": -93.01887512207031, + "loss": 0.1402, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3639919757843018, + "rewards/margins": 5.533229827880859, + "rewards/rejected": -2.1692380905151367, + "step": 5260 + }, + { + "epoch": 2.41, + "learning_rate": 6.595636732623034e-08, + "logits/chosen": -0.4300483763217926, + "logits/rejected": -0.46487370133399963, + "logps/chosen": -117.56733703613281, + "logps/rejected": -97.90502166748047, + "loss": 0.1227, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.4772467613220215, + "rewards/margins": 6.186609745025635, + "rewards/rejected": -2.7093615531921387, + "step": 5270 + }, + { + "epoch": 2.41, + "learning_rate": 6.54490106544901e-08, + "logits/chosen": -0.44517087936401367, + "logits/rejected": -0.46531495451927185, + "logps/chosen": -116.51817321777344, + "logps/rejected": -96.64463806152344, + "loss": 0.1316, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.680448532104492, + "rewards/margins": 6.1895432472229, + "rewards/rejected": -2.509094715118408, + "step": 5280 + }, + { + "epoch": 2.41, + "learning_rate": 6.494165398274987e-08, + "logits/chosen": -0.4242396354675293, + "logits/rejected": -0.44850629568099976, + "logps/chosen": -125.0820083618164, + "logps/rejected": -97.7175521850586, + "loss": 0.1468, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.043990135192871, + "rewards/margins": 4.996575355529785, + "rewards/rejected": -1.9525845050811768, + "step": 5290 + }, + { + "epoch": 2.42, + "learning_rate": 6.443429731100964e-08, + "logits/chosen": -0.45390892028808594, + "logits/rejected": -0.48004603385925293, + "logps/chosen": -122.3233642578125, + "logps/rejected": -99.90754699707031, + "loss": 0.1238, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.569197416305542, + "rewards/margins": 5.976964473724365, + "rewards/rejected": -2.4077672958374023, + "step": 5300 + }, + { + "epoch": 2.42, + "eval_logits/chosen": -0.4191027879714966, + "eval_logits/rejected": -0.4451000988483429, + "eval_logps/chosen": -116.24568176269531, + "eval_logps/rejected": -95.28321075439453, + "eval_loss": 0.14333823323249817, + "eval_rewards/accuracies": 0.9273743033409119, + "eval_rewards/chosen": 3.457033634185791, + "eval_rewards/margins": 5.63380241394043, + "eval_rewards/rejected": -2.1767685413360596, + "eval_runtime": 900.5295, + "eval_samples_per_second": 3.178, + "eval_steps_per_second": 0.199, + "step": 5300 + }, + { + "epoch": 2.42, + "learning_rate": 6.39269406392694e-08, + "logits/chosen": -0.4350680410861969, + "logits/rejected": -0.4649884104728699, + "logps/chosen": -119.18917083740234, + "logps/rejected": -100.50810241699219, + "loss": 0.1225, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.4932448863983154, + "rewards/margins": 6.379704475402832, + "rewards/rejected": -2.886460304260254, + "step": 5310 + }, + { + "epoch": 2.43, + "learning_rate": 6.341958396752917e-08, + "logits/chosen": -0.4124290347099304, + "logits/rejected": -0.4478355050086975, + "logps/chosen": -121.8824691772461, + "logps/rejected": -96.27501678466797, + "loss": 0.1424, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.710458278656006, + "rewards/margins": 6.276509761810303, + "rewards/rejected": -2.566051959991455, + "step": 5320 + }, + { + "epoch": 2.43, + "learning_rate": 6.291222729578894e-08, + "logits/chosen": -0.41530436277389526, + "logits/rejected": -0.4465157389640808, + "logps/chosen": -121.0118637084961, + "logps/rejected": -96.7669677734375, + "loss": 0.1091, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.8442223072052, + "rewards/margins": 5.426248550415039, + "rewards/rejected": -1.582026481628418, + "step": 5330 + }, + { + "epoch": 2.44, + "learning_rate": 6.24048706240487e-08, + "logits/chosen": -0.4436219334602356, + "logits/rejected": -0.47822102904319763, + "logps/chosen": -124.68888854980469, + "logps/rejected": -95.68738555908203, + "loss": 0.1183, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.960723876953125, + "rewards/margins": 5.930281639099121, + "rewards/rejected": -1.9695583581924438, + "step": 5340 + }, + { + "epoch": 2.44, + "learning_rate": 6.189751395230847e-08, + "logits/chosen": -0.4435461163520813, + "logits/rejected": -0.46420255303382874, + "logps/chosen": -124.46327209472656, + "logps/rejected": -97.09564208984375, + "loss": 0.1126, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.607532501220703, + "rewards/margins": 5.740006446838379, + "rewards/rejected": -2.1324732303619385, + "step": 5350 + }, + { + "epoch": 2.45, + "learning_rate": 6.139015728056824e-08, + "logits/chosen": -0.4381752610206604, + "logits/rejected": -0.45854130387306213, + "logps/chosen": -115.46308898925781, + "logps/rejected": -96.0640869140625, + "loss": 0.1432, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.904341220855713, + "rewards/margins": 6.217533111572266, + "rewards/rejected": -2.313192129135132, + "step": 5360 + }, + { + "epoch": 2.45, + "learning_rate": 6.0882800608828e-08, + "logits/chosen": -0.43165189027786255, + "logits/rejected": -0.4557250440120697, + "logps/chosen": -115.7738037109375, + "logps/rejected": -97.3882064819336, + "loss": 0.132, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.428105592727661, + "rewards/margins": 5.922058582305908, + "rewards/rejected": -2.493953227996826, + "step": 5370 + }, + { + "epoch": 2.46, + "learning_rate": 6.037544393708777e-08, + "logits/chosen": -0.41999363899230957, + "logits/rejected": -0.44616618752479553, + "logps/chosen": -119.87571716308594, + "logps/rejected": -100.83201599121094, + "loss": 0.0991, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.8233256340026855, + "rewards/margins": 6.311063766479492, + "rewards/rejected": -2.4877381324768066, + "step": 5380 + }, + { + "epoch": 2.46, + "learning_rate": 5.986808726534754e-08, + "logits/chosen": -0.435232937335968, + "logits/rejected": -0.46104755997657776, + "logps/chosen": -122.05622863769531, + "logps/rejected": -95.90516662597656, + "loss": 0.1192, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.9345486164093018, + "rewards/margins": 6.371971607208252, + "rewards/rejected": -2.4374232292175293, + "step": 5390 + }, + { + "epoch": 2.46, + "learning_rate": 5.93607305936073e-08, + "logits/chosen": -0.4300897717475891, + "logits/rejected": -0.45790234208106995, + "logps/chosen": -116.21836853027344, + "logps/rejected": -94.72840881347656, + "loss": 0.1317, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.370344877243042, + "rewards/margins": 5.3175368309021, + "rewards/rejected": -1.947191834449768, + "step": 5400 + }, + { + "epoch": 2.46, + "eval_logits/chosen": -0.4342188835144043, + "eval_logits/rejected": -0.4564913213253021, + "eval_logps/chosen": -116.03028106689453, + "eval_logps/rejected": -95.37613677978516, + "eval_loss": 0.1420595645904541, + "eval_rewards/accuracies": 0.9329608678817749, + "eval_rewards/chosen": 3.564730644226074, + "eval_rewards/margins": 5.787961959838867, + "eval_rewards/rejected": -2.2232308387756348, + "eval_runtime": 900.7223, + "eval_samples_per_second": 3.177, + "eval_steps_per_second": 0.199, + "step": 5400 + }, + { + "epoch": 2.47, + "learning_rate": 5.8853373921867065e-08, + "logits/chosen": -0.43930888175964355, + "logits/rejected": -0.4661675989627838, + "logps/chosen": -118.8895492553711, + "logps/rejected": -101.40609741210938, + "loss": 0.1252, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.623162031173706, + "rewards/margins": 5.550017356872559, + "rewards/rejected": -1.9268558025360107, + "step": 5410 + }, + { + "epoch": 2.47, + "learning_rate": 5.834601725012683e-08, + "logits/chosen": -0.4218188226222992, + "logits/rejected": -0.4493798613548279, + "logps/chosen": -117.68571472167969, + "logps/rejected": -97.6932373046875, + "loss": 0.1242, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.941012144088745, + "rewards/margins": 5.962545871734619, + "rewards/rejected": -2.021533489227295, + "step": 5420 + }, + { + "epoch": 2.48, + "learning_rate": 5.78386605783866e-08, + "logits/chosen": -0.45547741651535034, + "logits/rejected": -0.47590240836143494, + "logps/chosen": -120.83634948730469, + "logps/rejected": -95.0353775024414, + "loss": 0.1355, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.9494616985321045, + "rewards/margins": 6.385022163391113, + "rewards/rejected": -2.435560464859009, + "step": 5430 + }, + { + "epoch": 2.48, + "learning_rate": 5.7331303906646365e-08, + "logits/chosen": -0.44121265411376953, + "logits/rejected": -0.46499747037887573, + "logps/chosen": -116.39122009277344, + "logps/rejected": -100.46307373046875, + "loss": 0.1434, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.17018985748291, + "rewards/margins": 5.512290000915527, + "rewards/rejected": -2.3421006202697754, + "step": 5440 + }, + { + "epoch": 2.49, + "learning_rate": 5.682394723490613e-08, + "logits/chosen": -0.41486072540283203, + "logits/rejected": -0.44767364859580994, + "logps/chosen": -125.267578125, + "logps/rejected": -96.89026641845703, + "loss": 0.1006, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.735731840133667, + "rewards/margins": 5.538851261138916, + "rewards/rejected": -1.8031189441680908, + "step": 5450 + }, + { + "epoch": 2.49, + "learning_rate": 5.63165905631659e-08, + "logits/chosen": -0.45300665497779846, + "logits/rejected": -0.4838689863681793, + "logps/chosen": -125.07342529296875, + "logps/rejected": -103.2747573852539, + "loss": 0.1553, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.719700574874878, + "rewards/margins": 6.036759376525879, + "rewards/rejected": -2.31705904006958, + "step": 5460 + }, + { + "epoch": 2.5, + "learning_rate": 5.5809233891425665e-08, + "logits/chosen": -0.41899123787879944, + "logits/rejected": -0.4467073380947113, + "logps/chosen": -117.2870864868164, + "logps/rejected": -91.2414321899414, + "loss": 0.1381, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.313997745513916, + "rewards/margins": 5.549351215362549, + "rewards/rejected": -2.235353469848633, + "step": 5470 + }, + { + "epoch": 2.5, + "learning_rate": 5.530187721968543e-08, + "logits/chosen": -0.40872034430503845, + "logits/rejected": -0.44184407591819763, + "logps/chosen": -122.60057067871094, + "logps/rejected": -97.71039581298828, + "loss": 0.1401, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.035274028778076, + "rewards/margins": 5.73886775970459, + "rewards/rejected": -1.7035942077636719, + "step": 5480 + }, + { + "epoch": 2.51, + "learning_rate": 5.47945205479452e-08, + "logits/chosen": -0.4714629054069519, + "logits/rejected": -0.4978507161140442, + "logps/chosen": -121.52668762207031, + "logps/rejected": -98.09647369384766, + "loss": 0.1282, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4877541065216064, + "rewards/margins": 5.688804626464844, + "rewards/rejected": -2.2010507583618164, + "step": 5490 + }, + { + "epoch": 2.51, + "learning_rate": 5.4287163876204964e-08, + "logits/chosen": -0.4360930919647217, + "logits/rejected": -0.4640362858772278, + "logps/chosen": -116.9971923828125, + "logps/rejected": -99.79209899902344, + "loss": 0.131, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.117398262023926, + "rewards/margins": 6.651013374328613, + "rewards/rejected": -2.5336146354675293, + "step": 5500 + }, + { + "epoch": 2.51, + "eval_logits/chosen": -0.4146881103515625, + "eval_logits/rejected": -0.44443634152412415, + "eval_logps/chosen": -116.3175277709961, + "eval_logps/rejected": -95.46585083007812, + "eval_loss": 0.14781765639781952, + "eval_rewards/accuracies": 0.9189944267272949, + "eval_rewards/chosen": 3.4211063385009766, + "eval_rewards/margins": 5.689195156097412, + "eval_rewards/rejected": -2.2680890560150146, + "eval_runtime": 926.1823, + "eval_samples_per_second": 3.09, + "eval_steps_per_second": 0.193, + "step": 5500 + }, + { + "epoch": 2.52, + "learning_rate": 5.377980720446473e-08, + "logits/chosen": -0.43201667070388794, + "logits/rejected": -0.46011418104171753, + "logps/chosen": -113.99064636230469, + "logps/rejected": -92.80901336669922, + "loss": 0.1115, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 2.70094633102417, + "rewards/margins": 5.098592281341553, + "rewards/rejected": -2.397646427154541, + "step": 5510 + }, + { + "epoch": 2.52, + "learning_rate": 5.32724505327245e-08, + "logits/chosen": -0.41051793098449707, + "logits/rejected": -0.437521755695343, + "logps/chosen": -120.39073181152344, + "logps/rejected": -99.1684799194336, + "loss": 0.1696, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 3.8455090522766113, + "rewards/margins": 5.564748287200928, + "rewards/rejected": -1.7192399501800537, + "step": 5520 + }, + { + "epoch": 2.52, + "learning_rate": 5.2765093860984264e-08, + "logits/chosen": -0.45660334825515747, + "logits/rejected": -0.47934332489967346, + "logps/chosen": -116.85618591308594, + "logps/rejected": -98.81648254394531, + "loss": 0.1406, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.6133334636688232, + "rewards/margins": 5.89406681060791, + "rewards/rejected": -2.280733108520508, + "step": 5530 + }, + { + "epoch": 2.53, + "learning_rate": 5.225773718924403e-08, + "logits/chosen": -0.42749834060668945, + "logits/rejected": -0.45927342772483826, + "logps/chosen": -115.58152770996094, + "logps/rejected": -91.99083709716797, + "loss": 0.1216, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 4.055908679962158, + "rewards/margins": 5.848606586456299, + "rewards/rejected": -1.7926979064941406, + "step": 5540 + }, + { + "epoch": 2.53, + "learning_rate": 5.17503805175038e-08, + "logits/chosen": -0.4737180173397064, + "logits/rejected": -0.49301090836524963, + "logps/chosen": -115.43916320800781, + "logps/rejected": -97.0264892578125, + "loss": 0.1495, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.613140821456909, + "rewards/margins": 5.604285717010498, + "rewards/rejected": -1.991145133972168, + "step": 5550 + }, + { + "epoch": 2.54, + "learning_rate": 5.1243023845763564e-08, + "logits/chosen": -0.4277074337005615, + "logits/rejected": -0.4595872759819031, + "logps/chosen": -120.0799789428711, + "logps/rejected": -96.8666000366211, + "loss": 0.124, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.7783608436584473, + "rewards/margins": 6.271768093109131, + "rewards/rejected": -2.4934072494506836, + "step": 5560 + }, + { + "epoch": 2.54, + "learning_rate": 5.073566717402333e-08, + "logits/chosen": -0.44470876455307007, + "logits/rejected": -0.46704134345054626, + "logps/chosen": -113.39207458496094, + "logps/rejected": -97.73541259765625, + "loss": 0.1385, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.5125949382781982, + "rewards/margins": 5.59378719329834, + "rewards/rejected": -2.0811917781829834, + "step": 5570 + }, + { + "epoch": 2.55, + "learning_rate": 5.02283105022831e-08, + "logits/chosen": -0.43218737840652466, + "logits/rejected": -0.46957531571388245, + "logps/chosen": -121.54664611816406, + "logps/rejected": -92.61653137207031, + "loss": 0.1492, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.846766710281372, + "rewards/margins": 5.690174102783203, + "rewards/rejected": -1.8434079885482788, + "step": 5580 + }, + { + "epoch": 2.55, + "learning_rate": 4.9720953830542864e-08, + "logits/chosen": -0.455254465341568, + "logits/rejected": -0.4841559827327728, + "logps/chosen": -116.07682800292969, + "logps/rejected": -99.91950988769531, + "loss": 0.1311, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.1617395877838135, + "rewards/margins": 5.66703462600708, + "rewards/rejected": -2.5052950382232666, + "step": 5590 + }, + { + "epoch": 2.56, + "learning_rate": 4.921359715880263e-08, + "logits/chosen": -0.4276158809661865, + "logits/rejected": -0.44826555252075195, + "logps/chosen": -116.87984466552734, + "logps/rejected": -93.79035186767578, + "loss": 0.1235, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.352602481842041, + "rewards/margins": 6.3349151611328125, + "rewards/rejected": -1.982312560081482, + "step": 5600 + }, + { + "epoch": 2.56, + "eval_logits/chosen": -0.4234235882759094, + "eval_logits/rejected": -0.4484900236129761, + "eval_logps/chosen": -116.10143280029297, + "eval_logps/rejected": -95.48921966552734, + "eval_loss": 0.14275720715522766, + "eval_rewards/accuracies": 0.9413408041000366, + "eval_rewards/chosen": 3.5291614532470703, + "eval_rewards/margins": 5.808929920196533, + "eval_rewards/rejected": -2.279768466949463, + "eval_runtime": 876.6048, + "eval_samples_per_second": 3.265, + "eval_steps_per_second": 0.204, + "step": 5600 + }, + { + "epoch": 2.56, + "learning_rate": 4.87062404870624e-08, + "logits/chosen": -0.452223539352417, + "logits/rejected": -0.47349920868873596, + "logps/chosen": -117.63553619384766, + "logps/rejected": -96.32161712646484, + "loss": 0.1157, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.280198574066162, + "rewards/margins": 6.476129055023193, + "rewards/rejected": -2.1959307193756104, + "step": 5610 + }, + { + "epoch": 2.57, + "learning_rate": 4.8198883815322164e-08, + "logits/chosen": -0.4440379738807678, + "logits/rejected": -0.4710288941860199, + "logps/chosen": -118.5327377319336, + "logps/rejected": -101.5977554321289, + "loss": 0.1398, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.8866469860076904, + "rewards/margins": 5.589003086090088, + "rewards/rejected": -2.7023556232452393, + "step": 5620 + }, + { + "epoch": 2.57, + "learning_rate": 4.769152714358193e-08, + "logits/chosen": -0.4269390106201172, + "logits/rejected": -0.4624324440956116, + "logps/chosen": -113.65742492675781, + "logps/rejected": -95.20040893554688, + "loss": 0.1168, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.0439372062683105, + "rewards/margins": 6.248236656188965, + "rewards/rejected": -2.204299211502075, + "step": 5630 + }, + { + "epoch": 2.57, + "learning_rate": 4.71841704718417e-08, + "logits/chosen": -0.4506987929344177, + "logits/rejected": -0.4784929156303406, + "logps/chosen": -114.27982330322266, + "logps/rejected": -92.17994689941406, + "loss": 0.125, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.4143104553222656, + "rewards/margins": 5.477658271789551, + "rewards/rejected": -2.063347816467285, + "step": 5640 + }, + { + "epoch": 2.58, + "learning_rate": 4.6676813800101464e-08, + "logits/chosen": -0.45349711179733276, + "logits/rejected": -0.47614622116088867, + "logps/chosen": -118.63285827636719, + "logps/rejected": -93.47701263427734, + "loss": 0.1465, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.212621212005615, + "rewards/margins": 6.03704833984375, + "rewards/rejected": -1.824427604675293, + "step": 5650 + }, + { + "epoch": 2.58, + "learning_rate": 4.616945712836123e-08, + "logits/chosen": -0.441559374332428, + "logits/rejected": -0.46678081154823303, + "logps/chosen": -119.7842025756836, + "logps/rejected": -93.7694320678711, + "loss": 0.1281, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.7817344665527344, + "rewards/margins": 6.05294132232666, + "rewards/rejected": -2.2712061405181885, + "step": 5660 + }, + { + "epoch": 2.59, + "learning_rate": 4.5662100456621e-08, + "logits/chosen": -0.4511653780937195, + "logits/rejected": -0.4725533425807953, + "logps/chosen": -116.87252044677734, + "logps/rejected": -98.78968048095703, + "loss": 0.1162, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.7561240196228027, + "rewards/margins": 5.7545342445373535, + "rewards/rejected": -1.9984098672866821, + "step": 5670 + }, + { + "epoch": 2.59, + "learning_rate": 4.5154743784880764e-08, + "logits/chosen": -0.4162279963493347, + "logits/rejected": -0.4527038037776947, + "logps/chosen": -111.1601791381836, + "logps/rejected": -100.4341812133789, + "loss": 0.1382, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.014040470123291, + "rewards/margins": 6.132898807525635, + "rewards/rejected": -2.1188578605651855, + "step": 5680 + }, + { + "epoch": 2.6, + "learning_rate": 4.464738711314053e-08, + "logits/chosen": -0.43241244554519653, + "logits/rejected": -0.4595095217227936, + "logps/chosen": -117.22080993652344, + "logps/rejected": -95.52210235595703, + "loss": 0.1458, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.282032012939453, + "rewards/margins": 5.7940144538879395, + "rewards/rejected": -2.5119822025299072, + "step": 5690 + }, + { + "epoch": 2.6, + "learning_rate": 4.41400304414003e-08, + "logits/chosen": -0.4618196487426758, + "logits/rejected": -0.4878638684749603, + "logps/chosen": -123.39935302734375, + "logps/rejected": -96.44064331054688, + "loss": 0.1122, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.916281223297119, + "rewards/margins": 6.677175998687744, + "rewards/rejected": -2.760894775390625, + "step": 5700 + }, + { + "epoch": 2.6, + "eval_logits/chosen": -0.4233376681804657, + "eval_logits/rejected": -0.44727030396461487, + "eval_logps/chosen": -115.93925476074219, + "eval_logps/rejected": -95.40226745605469, + "eval_loss": 0.1444796770811081, + "eval_rewards/accuracies": 0.9329608678817749, + "eval_rewards/chosen": 3.610247850418091, + "eval_rewards/margins": 5.846547603607178, + "eval_rewards/rejected": -2.2362990379333496, + "eval_runtime": 911.2103, + "eval_samples_per_second": 3.141, + "eval_steps_per_second": 0.196, + "step": 5700 + }, + { + "epoch": 2.61, + "learning_rate": 4.3632673769660064e-08, + "logits/chosen": -0.4469119906425476, + "logits/rejected": -0.4657912254333496, + "logps/chosen": -121.95216369628906, + "logps/rejected": -101.41227722167969, + "loss": 0.1031, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.711015224456787, + "rewards/margins": 5.949687957763672, + "rewards/rejected": -2.2386727333068848, + "step": 5710 + }, + { + "epoch": 2.61, + "learning_rate": 4.312531709791983e-08, + "logits/chosen": -0.4170357584953308, + "logits/rejected": -0.4365556836128235, + "logps/chosen": -120.26194763183594, + "logps/rejected": -102.38157653808594, + "loss": 0.1371, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.7953782081604004, + "rewards/margins": 5.940617084503174, + "rewards/rejected": -2.1452383995056152, + "step": 5720 + }, + { + "epoch": 2.62, + "learning_rate": 4.26179604261796e-08, + "logits/chosen": -0.4518299102783203, + "logits/rejected": -0.4835347533226013, + "logps/chosen": -112.53265380859375, + "logps/rejected": -90.11258697509766, + "loss": 0.1389, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.475214719772339, + "rewards/margins": 5.532036781311035, + "rewards/rejected": -2.0568222999572754, + "step": 5730 + }, + { + "epoch": 2.62, + "learning_rate": 4.2110603754439363e-08, + "logits/chosen": -0.40014880895614624, + "logits/rejected": -0.4240821897983551, + "logps/chosen": -115.626220703125, + "logps/rejected": -97.0552749633789, + "loss": 0.1141, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.930833101272583, + "rewards/margins": 6.204165458679199, + "rewards/rejected": -2.2733330726623535, + "step": 5740 + }, + { + "epoch": 2.62, + "learning_rate": 4.160324708269913e-08, + "logits/chosen": -0.4410339295864105, + "logits/rejected": -0.4653685986995697, + "logps/chosen": -120.30546569824219, + "logps/rejected": -99.77711486816406, + "loss": 0.1325, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5853590965270996, + "rewards/margins": 5.607728004455566, + "rewards/rejected": -2.022369146347046, + "step": 5750 + }, + { + "epoch": 2.63, + "learning_rate": 4.10958904109589e-08, + "logits/chosen": -0.4367760121822357, + "logits/rejected": -0.4674459397792816, + "logps/chosen": -108.0204849243164, + "logps/rejected": -93.1372299194336, + "loss": 0.1381, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.783496141433716, + "rewards/margins": 5.598851680755615, + "rewards/rejected": -1.8153560161590576, + "step": 5760 + }, + { + "epoch": 2.63, + "learning_rate": 4.0588533739218663e-08, + "logits/chosen": -0.4705559313297272, + "logits/rejected": -0.49414676427841187, + "logps/chosen": -114.16094970703125, + "logps/rejected": -91.7486801147461, + "loss": 0.1097, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.7645983695983887, + "rewards/margins": 6.290394306182861, + "rewards/rejected": -2.5257956981658936, + "step": 5770 + }, + { + "epoch": 2.64, + "learning_rate": 4.0081177067478437e-08, + "logits/chosen": -0.4423903822898865, + "logits/rejected": -0.46557703614234924, + "logps/chosen": -120.6229019165039, + "logps/rejected": -95.86766052246094, + "loss": 0.1412, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 4.339417457580566, + "rewards/margins": 6.628237724304199, + "rewards/rejected": -2.28882098197937, + "step": 5780 + }, + { + "epoch": 2.64, + "learning_rate": 3.95738203957382e-08, + "logits/chosen": -0.4328484535217285, + "logits/rejected": -0.4610370993614197, + "logps/chosen": -109.90287780761719, + "logps/rejected": -98.98536682128906, + "loss": 0.1247, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.143998861312866, + "rewards/margins": 5.320948600769043, + "rewards/rejected": -2.1769497394561768, + "step": 5790 + }, + { + "epoch": 2.65, + "learning_rate": 3.906646372399797e-08, + "logits/chosen": -0.4369220733642578, + "logits/rejected": -0.46063485741615295, + "logps/chosen": -123.28334045410156, + "logps/rejected": -99.22138977050781, + "loss": 0.1172, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.824085235595703, + "rewards/margins": 6.6783342361450195, + "rewards/rejected": -2.854248046875, + "step": 5800 + }, + { + "epoch": 2.65, + "eval_logits/chosen": -0.435650110244751, + "eval_logits/rejected": -0.46478769183158875, + "eval_logps/chosen": -115.99723815917969, + "eval_logps/rejected": -95.30946350097656, + "eval_loss": 0.14151296019554138, + "eval_rewards/accuracies": 0.924580991268158, + "eval_rewards/chosen": 3.5812554359436035, + "eval_rewards/margins": 5.771151065826416, + "eval_rewards/rejected": -2.1898951530456543, + "eval_runtime": 896.6878, + "eval_samples_per_second": 3.192, + "eval_steps_per_second": 0.2, + "step": 5800 + }, + { + "epoch": 2.65, + "learning_rate": 3.8559107052257736e-08, + "logits/chosen": -0.4274473190307617, + "logits/rejected": -0.45593366026878357, + "logps/chosen": -113.11031341552734, + "logps/rejected": -95.99092864990234, + "loss": 0.1283, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.605879068374634, + "rewards/margins": 5.030173301696777, + "rewards/rejected": -2.4242947101593018, + "step": 5810 + }, + { + "epoch": 2.66, + "learning_rate": 3.80517503805175e-08, + "logits/chosen": -0.42664894461631775, + "logits/rejected": -0.45183706283569336, + "logps/chosen": -127.3673095703125, + "logps/rejected": -99.54170227050781, + "loss": 0.1521, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.776498794555664, + "rewards/margins": 5.840052127838135, + "rewards/rejected": -2.0635530948638916, + "step": 5820 + }, + { + "epoch": 2.66, + "learning_rate": 3.754439370877727e-08, + "logits/chosen": -0.4793631136417389, + "logits/rejected": -0.504968523979187, + "logps/chosen": -116.24549865722656, + "logps/rejected": -94.78995513916016, + "loss": 0.143, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.5556461811065674, + "rewards/margins": 5.986898422241211, + "rewards/rejected": -2.4312522411346436, + "step": 5830 + }, + { + "epoch": 2.67, + "learning_rate": 3.7037037037037036e-08, + "logits/chosen": -0.41421228647232056, + "logits/rejected": -0.4470001757144928, + "logps/chosen": -113.46546936035156, + "logps/rejected": -95.14855194091797, + "loss": 0.0926, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.197138786315918, + "rewards/margins": 6.1041083335876465, + "rewards/rejected": -1.9069693088531494, + "step": 5840 + }, + { + "epoch": 2.67, + "learning_rate": 3.65296803652968e-08, + "logits/chosen": -0.4365863800048828, + "logits/rejected": -0.46884846687316895, + "logps/chosen": -113.07588195800781, + "logps/rejected": -91.25666046142578, + "loss": 0.1092, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.941469192504883, + "rewards/margins": 5.8505144119262695, + "rewards/rejected": -1.9090449810028076, + "step": 5850 + }, + { + "epoch": 2.67, + "learning_rate": 3.602232369355657e-08, + "logits/chosen": -0.457558810710907, + "logits/rejected": -0.48787039518356323, + "logps/chosen": -114.20429992675781, + "logps/rejected": -96.5516586303711, + "loss": 0.1184, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.194037437438965, + "rewards/margins": 6.516490936279297, + "rewards/rejected": -2.322453737258911, + "step": 5860 + }, + { + "epoch": 2.68, + "learning_rate": 3.5514967021816336e-08, + "logits/chosen": -0.42379432916641235, + "logits/rejected": -0.4607546925544739, + "logps/chosen": -121.55973815917969, + "logps/rejected": -99.63563537597656, + "loss": 0.1527, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.44954776763916, + "rewards/margins": 6.263568878173828, + "rewards/rejected": -1.8140218257904053, + "step": 5870 + }, + { + "epoch": 2.68, + "learning_rate": 3.50076103500761e-08, + "logits/chosen": -0.44541049003601074, + "logits/rejected": -0.46894732117652893, + "logps/chosen": -119.17094421386719, + "logps/rejected": -101.33116149902344, + "loss": 0.1135, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.428978443145752, + "rewards/margins": 6.683131217956543, + "rewards/rejected": -2.254152774810791, + "step": 5880 + }, + { + "epoch": 2.69, + "learning_rate": 3.450025367833587e-08, + "logits/chosen": -0.4532981514930725, + "logits/rejected": -0.4787921905517578, + "logps/chosen": -122.17115783691406, + "logps/rejected": -98.2051773071289, + "loss": 0.1307, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.271052598953247, + "rewards/margins": 5.8984832763671875, + "rewards/rejected": -2.627429962158203, + "step": 5890 + }, + { + "epoch": 2.69, + "learning_rate": 3.3992897006595636e-08, + "logits/chosen": -0.43592172861099243, + "logits/rejected": -0.4776608347892761, + "logps/chosen": -120.0385971069336, + "logps/rejected": -99.87791442871094, + "loss": 0.1257, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.9749057292938232, + "rewards/margins": 6.650933742523193, + "rewards/rejected": -2.6760289669036865, + "step": 5900 + }, + { + "epoch": 2.69, + "eval_logits/chosen": -0.42685261368751526, + "eval_logits/rejected": -0.4552892744541168, + "eval_logps/chosen": -116.34471893310547, + "eval_logps/rejected": -95.53901672363281, + "eval_loss": 0.14278730750083923, + "eval_rewards/accuracies": 0.9217877388000488, + "eval_rewards/chosen": 3.4075093269348145, + "eval_rewards/margins": 5.712179183959961, + "eval_rewards/rejected": -2.3046703338623047, + "eval_runtime": 898.1404, + "eval_samples_per_second": 3.187, + "eval_steps_per_second": 0.199, + "step": 5900 + }, + { + "epoch": 2.7, + "learning_rate": 3.34855403348554e-08, + "logits/chosen": -0.4431632459163666, + "logits/rejected": -0.48115652799606323, + "logps/chosen": -124.9809341430664, + "logps/rejected": -96.26667785644531, + "loss": 0.1237, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.165075778961182, + "rewards/margins": 6.46014928817749, + "rewards/rejected": -2.295074462890625, + "step": 5910 + }, + { + "epoch": 2.7, + "learning_rate": 3.297818366311517e-08, + "logits/chosen": -0.4241110682487488, + "logits/rejected": -0.4464682936668396, + "logps/chosen": -117.69609069824219, + "logps/rejected": -96.72189331054688, + "loss": 0.1075, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.744476318359375, + "rewards/margins": 5.3151750564575195, + "rewards/rejected": -1.570698618888855, + "step": 5920 + }, + { + "epoch": 2.71, + "learning_rate": 3.2470826991374936e-08, + "logits/chosen": -0.45314112305641174, + "logits/rejected": -0.4763621687889099, + "logps/chosen": -121.1375961303711, + "logps/rejected": -96.46546936035156, + "loss": 0.112, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.103701114654541, + "rewards/margins": 6.2286553382873535, + "rewards/rejected": -2.1249539852142334, + "step": 5930 + }, + { + "epoch": 2.71, + "learning_rate": 3.19634703196347e-08, + "logits/chosen": -0.4642356038093567, + "logits/rejected": -0.4867876470088959, + "logps/chosen": -119.946044921875, + "logps/rejected": -100.07732391357422, + "loss": 0.1153, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.106837749481201, + "rewards/margins": 5.389840126037598, + "rewards/rejected": -2.2830023765563965, + "step": 5940 + }, + { + "epoch": 2.72, + "learning_rate": 3.145611364789447e-08, + "logits/chosen": -0.4027109146118164, + "logits/rejected": -0.4404265880584717, + "logps/chosen": -114.95914459228516, + "logps/rejected": -98.45368957519531, + "loss": 0.1216, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.436598300933838, + "rewards/margins": 6.390365123748779, + "rewards/rejected": -2.9537672996520996, + "step": 5950 + }, + { + "epoch": 2.72, + "learning_rate": 3.0948756976154236e-08, + "logits/chosen": -0.43318143486976624, + "logits/rejected": -0.46780461072921753, + "logps/chosen": -116.51350402832031, + "logps/rejected": -94.24980163574219, + "loss": 0.1421, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.344388008117676, + "rewards/margins": 6.8773345947265625, + "rewards/rejected": -2.5329465866088867, + "step": 5960 + }, + { + "epoch": 2.73, + "learning_rate": 3.0441400304414e-08, + "logits/chosen": -0.42986053228378296, + "logits/rejected": -0.4519343972206116, + "logps/chosen": -116.03038024902344, + "logps/rejected": -98.72990417480469, + "loss": 0.1419, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.6905105113983154, + "rewards/margins": 5.332497596740723, + "rewards/rejected": -1.6419864892959595, + "step": 5970 + }, + { + "epoch": 2.73, + "learning_rate": 2.993404363267377e-08, + "logits/chosen": -0.4326365888118744, + "logits/rejected": -0.4565103054046631, + "logps/chosen": -107.49725341796875, + "logps/rejected": -99.25447082519531, + "loss": 0.1093, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.4035542011260986, + "rewards/margins": 6.07853364944458, + "rewards/rejected": -2.6749796867370605, + "step": 5980 + }, + { + "epoch": 2.73, + "learning_rate": 2.9426686960933532e-08, + "logits/chosen": -0.4541063904762268, + "logits/rejected": -0.4810553193092346, + "logps/chosen": -127.1908187866211, + "logps/rejected": -98.79722595214844, + "loss": 0.1272, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.745981216430664, + "rewards/margins": 5.777894496917725, + "rewards/rejected": -2.0319135189056396, + "step": 5990 + }, + { + "epoch": 2.74, + "learning_rate": 2.89193302891933e-08, + "logits/chosen": -0.417894184589386, + "logits/rejected": -0.45300230383872986, + "logps/chosen": -118.5451431274414, + "logps/rejected": -100.79390716552734, + "loss": 0.1441, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.300325393676758, + "rewards/margins": 5.615603446960449, + "rewards/rejected": -2.3152778148651123, + "step": 6000 + }, + { + "epoch": 2.74, + "eval_logits/chosen": -0.44005295634269714, + "eval_logits/rejected": -0.46727845072746277, + "eval_logps/chosen": -116.30235290527344, + "eval_logps/rejected": -95.57170867919922, + "eval_loss": 0.14259207248687744, + "eval_rewards/accuracies": 0.9189944267272949, + "eval_rewards/chosen": 3.4286997318267822, + "eval_rewards/margins": 5.749711036682129, + "eval_rewards/rejected": -2.3210110664367676, + "eval_runtime": 905.9645, + "eval_samples_per_second": 3.159, + "eval_steps_per_second": 0.198, + "step": 6000 + }, + { + "epoch": 2.74, + "learning_rate": 2.8411973617453066e-08, + "logits/chosen": -0.44576215744018555, + "logits/rejected": -0.46743813157081604, + "logps/chosen": -115.3594741821289, + "logps/rejected": -98.07334899902344, + "loss": 0.1339, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.71684193611145, + "rewards/margins": 5.9137444496154785, + "rewards/rejected": -2.1969027519226074, + "step": 6010 + }, + { + "epoch": 2.75, + "learning_rate": 2.7904616945712832e-08, + "logits/chosen": -0.40970954298973083, + "logits/rejected": -0.4393082559108734, + "logps/chosen": -124.13734436035156, + "logps/rejected": -93.94351959228516, + "loss": 0.1018, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.169826030731201, + "rewards/margins": 6.453677177429199, + "rewards/rejected": -2.283851146697998, + "step": 6020 + }, + { + "epoch": 2.75, + "learning_rate": 2.73972602739726e-08, + "logits/chosen": -0.4212943911552429, + "logits/rejected": -0.4489540159702301, + "logps/chosen": -122.31058502197266, + "logps/rejected": -96.13169860839844, + "loss": 0.1357, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.3273720741271973, + "rewards/margins": 5.711031913757324, + "rewards/rejected": -2.383660316467285, + "step": 6030 + }, + { + "epoch": 2.76, + "learning_rate": 2.6889903602232366e-08, + "logits/chosen": -0.4667048454284668, + "logits/rejected": -0.49229010939598083, + "logps/chosen": -122.15106201171875, + "logps/rejected": -99.2340087890625, + "loss": 0.1091, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.865828037261963, + "rewards/margins": 6.067671775817871, + "rewards/rejected": -2.2018446922302246, + "step": 6040 + }, + { + "epoch": 2.76, + "learning_rate": 2.6382546930492132e-08, + "logits/chosen": -0.4627668857574463, + "logits/rejected": -0.4806435704231262, + "logps/chosen": -122.12646484375, + "logps/rejected": -95.74661254882812, + "loss": 0.1049, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.7494614124298096, + "rewards/margins": 5.866511344909668, + "rewards/rejected": -2.1170496940612793, + "step": 6050 + }, + { + "epoch": 2.77, + "learning_rate": 2.58751902587519e-08, + "logits/chosen": -0.4114772379398346, + "logits/rejected": -0.44379204511642456, + "logps/chosen": -117.29730224609375, + "logps/rejected": -94.56267547607422, + "loss": 0.1538, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.685570478439331, + "rewards/margins": 6.307326316833496, + "rewards/rejected": -2.621755599975586, + "step": 6060 + }, + { + "epoch": 2.77, + "learning_rate": 2.5367833587011665e-08, + "logits/chosen": -0.4341645836830139, + "logits/rejected": -0.46204155683517456, + "logps/chosen": -122.454345703125, + "logps/rejected": -96.84683227539062, + "loss": 0.1169, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.7362613677978516, + "rewards/margins": 6.1316447257995605, + "rewards/rejected": -2.3953843116760254, + "step": 6070 + }, + { + "epoch": 2.78, + "learning_rate": 2.4860476915271432e-08, + "logits/chosen": -0.45279574394226074, + "logits/rejected": -0.477097749710083, + "logps/chosen": -128.41921997070312, + "logps/rejected": -93.85469818115234, + "loss": 0.1335, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.203787803649902, + "rewards/margins": 6.0743513107299805, + "rewards/rejected": -1.8705631494522095, + "step": 6080 + }, + { + "epoch": 2.78, + "learning_rate": 2.43531202435312e-08, + "logits/chosen": -0.4524506628513336, + "logits/rejected": -0.4743841290473938, + "logps/chosen": -106.57120513916016, + "logps/rejected": -93.22120666503906, + "loss": 0.1122, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.411867618560791, + "rewards/margins": 5.438270568847656, + "rewards/rejected": -2.0264029502868652, + "step": 6090 + }, + { + "epoch": 2.78, + "learning_rate": 2.3845763571790965e-08, + "logits/chosen": -0.46355438232421875, + "logits/rejected": -0.4836340546607971, + "logps/chosen": -118.38304138183594, + "logps/rejected": -95.64445495605469, + "loss": 0.1359, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.004835605621338, + "rewards/margins": 6.421379089355469, + "rewards/rejected": -2.4165430068969727, + "step": 6100 + }, + { + "epoch": 2.78, + "eval_logits/chosen": -0.4173344075679779, + "eval_logits/rejected": -0.4409016966819763, + "eval_logps/chosen": -116.19310760498047, + "eval_logps/rejected": -95.52818298339844, + "eval_loss": 0.14793427288532257, + "eval_rewards/accuracies": 0.9357541799545288, + "eval_rewards/chosen": 3.4833199977874756, + "eval_rewards/margins": 5.782570838928223, + "eval_rewards/rejected": -2.299251079559326, + "eval_runtime": 926.5032, + "eval_samples_per_second": 3.089, + "eval_steps_per_second": 0.193, + "step": 6100 + }, + { + "epoch": 2.79, + "learning_rate": 2.3338406900050732e-08, + "logits/chosen": -0.4381275177001953, + "logits/rejected": -0.4650820195674896, + "logps/chosen": -118.45979309082031, + "logps/rejected": -89.73667907714844, + "loss": 0.141, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.4960122108459473, + "rewards/margins": 5.491837024688721, + "rewards/rejected": -1.9958248138427734, + "step": 6110 + }, + { + "epoch": 2.79, + "learning_rate": 2.28310502283105e-08, + "logits/chosen": -0.43747448921203613, + "logits/rejected": -0.45705556869506836, + "logps/chosen": -119.84431457519531, + "logps/rejected": -100.47638702392578, + "loss": 0.1412, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.7409744262695312, + "rewards/margins": 5.933928489685059, + "rewards/rejected": -2.1929538249969482, + "step": 6120 + }, + { + "epoch": 2.8, + "learning_rate": 2.2323693556570265e-08, + "logits/chosen": -0.44301462173461914, + "logits/rejected": -0.4626496732234955, + "logps/chosen": -112.18086242675781, + "logps/rejected": -90.97620391845703, + "loss": 0.1081, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.0974788665771484, + "rewards/margins": 5.079029083251953, + "rewards/rejected": -1.9815499782562256, + "step": 6130 + }, + { + "epoch": 2.8, + "learning_rate": 2.1816336884830032e-08, + "logits/chosen": -0.44184190034866333, + "logits/rejected": -0.4639458656311035, + "logps/chosen": -115.1020736694336, + "logps/rejected": -94.1207504272461, + "loss": 0.1555, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.652765989303589, + "rewards/margins": 5.641629219055176, + "rewards/rejected": -1.9888633489608765, + "step": 6140 + }, + { + "epoch": 2.81, + "learning_rate": 2.13089802130898e-08, + "logits/chosen": -0.44147539138793945, + "logits/rejected": -0.4748079776763916, + "logps/chosen": -123.27156066894531, + "logps/rejected": -92.42192077636719, + "loss": 0.1192, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.140544891357422, + "rewards/margins": 6.6083197593688965, + "rewards/rejected": -2.4677751064300537, + "step": 6150 + }, + { + "epoch": 2.81, + "learning_rate": 2.0801623541349565e-08, + "logits/chosen": -0.42793139815330505, + "logits/rejected": -0.4529266953468323, + "logps/chosen": -119.36943054199219, + "logps/rejected": -103.21095275878906, + "loss": 0.1234, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.7451796531677246, + "rewards/margins": 5.694420337677002, + "rewards/rejected": -1.9492400884628296, + "step": 6160 + }, + { + "epoch": 2.82, + "learning_rate": 2.0294266869609332e-08, + "logits/chosen": -0.4539063572883606, + "logits/rejected": -0.4759596288204193, + "logps/chosen": -118.61543273925781, + "logps/rejected": -97.66639709472656, + "loss": 0.1363, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.301544189453125, + "rewards/margins": 5.550912380218506, + "rewards/rejected": -2.24936842918396, + "step": 6170 + }, + { + "epoch": 2.82, + "learning_rate": 1.97869101978691e-08, + "logits/chosen": -0.4578167498111725, + "logits/rejected": -0.48147234320640564, + "logps/chosen": -123.6511459350586, + "logps/rejected": -99.3853759765625, + "loss": 0.1132, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.5267632007598877, + "rewards/margins": 6.09173059463501, + "rewards/rejected": -2.564967155456543, + "step": 6180 + }, + { + "epoch": 2.83, + "learning_rate": 1.9279553526128868e-08, + "logits/chosen": -0.44004470109939575, + "logits/rejected": -0.47797951102256775, + "logps/chosen": -120.70310974121094, + "logps/rejected": -98.59986877441406, + "loss": 0.1371, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.0213212966918945, + "rewards/margins": 6.670281887054443, + "rewards/rejected": -2.648960590362549, + "step": 6190 + }, + { + "epoch": 2.83, + "learning_rate": 1.8772196854388635e-08, + "logits/chosen": -0.46676498651504517, + "logits/rejected": -0.4783889651298523, + "logps/chosen": -114.30501556396484, + "logps/rejected": -95.9433364868164, + "loss": 0.1332, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.327721118927002, + "rewards/margins": 5.742402076721191, + "rewards/rejected": -2.414680004119873, + "step": 6200 + }, + { + "epoch": 2.83, + "eval_logits/chosen": -0.4261917769908905, + "eval_logits/rejected": -0.451227068901062, + "eval_logps/chosen": -116.21163940429688, + "eval_logps/rejected": -95.4748306274414, + "eval_loss": 0.14415588974952698, + "eval_rewards/accuracies": 0.9329608678817749, + "eval_rewards/chosen": 3.4740583896636963, + "eval_rewards/margins": 5.7466325759887695, + "eval_rewards/rejected": -2.2725744247436523, + "eval_runtime": 922.4424, + "eval_samples_per_second": 3.103, + "eval_steps_per_second": 0.194, + "step": 6200 + }, + { + "epoch": 2.83, + "learning_rate": 1.82648401826484e-08, + "logits/chosen": -0.44827452301979065, + "logits/rejected": -0.466301828622818, + "logps/chosen": -120.3111801147461, + "logps/rejected": -96.71390533447266, + "loss": 0.1551, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.323174238204956, + "rewards/margins": 5.543818473815918, + "rewards/rejected": -2.220643997192383, + "step": 6210 + }, + { + "epoch": 2.84, + "learning_rate": 1.7757483510908168e-08, + "logits/chosen": -0.4289180636405945, + "logits/rejected": -0.450812965631485, + "logps/chosen": -112.69013977050781, + "logps/rejected": -97.18971252441406, + "loss": 0.1224, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.9697463512420654, + "rewards/margins": 6.175944805145264, + "rewards/rejected": -2.2061984539031982, + "step": 6220 + }, + { + "epoch": 2.84, + "learning_rate": 1.7250126839167935e-08, + "logits/chosen": -0.44035395979881287, + "logits/rejected": -0.45798999071121216, + "logps/chosen": -105.89752197265625, + "logps/rejected": -93.09783935546875, + "loss": 0.1339, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.038432598114014, + "rewards/margins": 5.471198081970215, + "rewards/rejected": -1.4327653646469116, + "step": 6230 + }, + { + "epoch": 2.85, + "learning_rate": 1.67427701674277e-08, + "logits/chosen": -0.4607762396335602, + "logits/rejected": -0.47269120812416077, + "logps/chosen": -118.00733947753906, + "logps/rejected": -96.32691955566406, + "loss": 0.1193, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8974251747131348, + "rewards/margins": 6.761053562164307, + "rewards/rejected": -2.86362886428833, + "step": 6240 + }, + { + "epoch": 2.85, + "learning_rate": 1.6235413495687468e-08, + "logits/chosen": -0.4478604197502136, + "logits/rejected": -0.464190810918808, + "logps/chosen": -115.123291015625, + "logps/rejected": -98.07180786132812, + "loss": 0.126, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.360918998718262, + "rewards/margins": 5.949367046356201, + "rewards/rejected": -1.5884480476379395, + "step": 6250 + }, + { + "epoch": 2.86, + "learning_rate": 1.5728056823947235e-08, + "logits/chosen": -0.44280537962913513, + "logits/rejected": -0.46292656660079956, + "logps/chosen": -117.03279876708984, + "logps/rejected": -98.99063873291016, + "loss": 0.1115, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.106674671173096, + "rewards/margins": 6.606393337249756, + "rewards/rejected": -2.499718427658081, + "step": 6260 + }, + { + "epoch": 2.86, + "learning_rate": 1.5220700152207e-08, + "logits/chosen": -0.4295724332332611, + "logits/rejected": -0.4528141915798187, + "logps/chosen": -110.50325012207031, + "logps/rejected": -98.00163269042969, + "loss": 0.1204, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.804098606109619, + "rewards/margins": 6.637032508850098, + "rewards/rejected": -2.8329339027404785, + "step": 6270 + }, + { + "epoch": 2.87, + "learning_rate": 1.4713343480466766e-08, + "logits/chosen": -0.439584881067276, + "logits/rejected": -0.4629645347595215, + "logps/chosen": -122.1606216430664, + "logps/rejected": -97.83927154541016, + "loss": 0.1304, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.3962371349334717, + "rewards/margins": 5.390679359436035, + "rewards/rejected": -1.9944416284561157, + "step": 6280 + }, + { + "epoch": 2.87, + "learning_rate": 1.4205986808726533e-08, + "logits/chosen": -0.4244330823421478, + "logits/rejected": -0.44999808073043823, + "logps/chosen": -124.31058502197266, + "logps/rejected": -93.81672668457031, + "loss": 0.1086, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.9259040355682373, + "rewards/margins": 6.213841438293457, + "rewards/rejected": -2.2879371643066406, + "step": 6290 + }, + { + "epoch": 2.88, + "learning_rate": 1.36986301369863e-08, + "logits/chosen": -0.4378216862678528, + "logits/rejected": -0.4623119831085205, + "logps/chosen": -112.90274810791016, + "logps/rejected": -95.5672607421875, + "loss": 0.1454, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.6615824699401855, + "rewards/margins": 5.905343055725098, + "rewards/rejected": -2.243760824203491, + "step": 6300 + }, + { + "epoch": 2.88, + "eval_logits/chosen": -0.4354705512523651, + "eval_logits/rejected": -0.46043190360069275, + "eval_logps/chosen": -116.27782440185547, + "eval_logps/rejected": -95.51180267333984, + "eval_loss": 0.13972659409046173, + "eval_rewards/accuracies": 0.9357541799545288, + "eval_rewards/chosen": 3.4409618377685547, + "eval_rewards/margins": 5.732030391693115, + "eval_rewards/rejected": -2.2910685539245605, + "eval_runtime": 909.5558, + "eval_samples_per_second": 3.147, + "eval_steps_per_second": 0.197, + "step": 6300 + }, + { + "epoch": 2.88, + "learning_rate": 1.3191273465246066e-08, + "logits/chosen": -0.4530218541622162, + "logits/rejected": -0.4725785255432129, + "logps/chosen": -124.86883544921875, + "logps/rejected": -98.3955078125, + "loss": 0.1308, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5713772773742676, + "rewards/margins": 5.833029270172119, + "rewards/rejected": -2.2616517543792725, + "step": 6310 + }, + { + "epoch": 2.88, + "learning_rate": 1.2683916793505833e-08, + "logits/chosen": -0.4388354420661926, + "logits/rejected": -0.4666750431060791, + "logps/chosen": -123.91282653808594, + "logps/rejected": -101.6700668334961, + "loss": 0.1498, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.2302632331848145, + "rewards/margins": 6.3310418128967285, + "rewards/rejected": -2.100778579711914, + "step": 6320 + }, + { + "epoch": 2.89, + "learning_rate": 1.21765601217656e-08, + "logits/chosen": -0.45121073722839355, + "logits/rejected": -0.474397748708725, + "logps/chosen": -113.64533996582031, + "logps/rejected": -96.31367492675781, + "loss": 0.1256, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.1905887126922607, + "rewards/margins": 5.855005741119385, + "rewards/rejected": -2.664416790008545, + "step": 6330 + }, + { + "epoch": 2.89, + "learning_rate": 1.1669203450025366e-08, + "logits/chosen": -0.4414314329624176, + "logits/rejected": -0.4682813286781311, + "logps/chosen": -118.41851806640625, + "logps/rejected": -98.83721160888672, + "loss": 0.1285, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.3815665245056152, + "rewards/margins": 5.590354919433594, + "rewards/rejected": -2.2087883949279785, + "step": 6340 + }, + { + "epoch": 2.9, + "learning_rate": 1.1161846778285133e-08, + "logits/chosen": -0.4640694558620453, + "logits/rejected": -0.4832492470741272, + "logps/chosen": -118.1152114868164, + "logps/rejected": -95.31473541259766, + "loss": 0.116, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.576420307159424, + "rewards/margins": 5.862317085266113, + "rewards/rejected": -2.2858967781066895, + "step": 6350 + }, + { + "epoch": 2.9, + "learning_rate": 1.06544901065449e-08, + "logits/chosen": -0.433235228061676, + "logits/rejected": -0.46126851439476013, + "logps/chosen": -119.97880554199219, + "logps/rejected": -96.08879852294922, + "loss": 0.1267, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.7901527881622314, + "rewards/margins": 6.419007778167725, + "rewards/rejected": -2.628854751586914, + "step": 6360 + }, + { + "epoch": 2.91, + "learning_rate": 1.0147133434804666e-08, + "logits/chosen": -0.4347938001155853, + "logits/rejected": -0.46376532316207886, + "logps/chosen": -120.90704345703125, + "logps/rejected": -96.43917083740234, + "loss": 0.1411, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.9222941398620605, + "rewards/margins": 6.398364067077637, + "rewards/rejected": -2.4760701656341553, + "step": 6370 + }, + { + "epoch": 2.91, + "learning_rate": 9.639776763064434e-09, + "logits/chosen": -0.43002352118492126, + "logits/rejected": -0.454629123210907, + "logps/chosen": -116.3228988647461, + "logps/rejected": -94.8536605834961, + "loss": 0.1246, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.674093246459961, + "rewards/margins": 6.51773738861084, + "rewards/rejected": -2.843644618988037, + "step": 6380 + }, + { + "epoch": 2.92, + "learning_rate": 9.1324200913242e-09, + "logits/chosen": -0.4467340409755707, + "logits/rejected": -0.47312459349632263, + "logps/chosen": -115.29522705078125, + "logps/rejected": -95.09002685546875, + "loss": 0.117, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.3362815380096436, + "rewards/margins": 5.706353187561035, + "rewards/rejected": -2.3700718879699707, + "step": 6390 + }, + { + "epoch": 2.92, + "learning_rate": 8.625063419583967e-09, + "logits/chosen": -0.439365953207016, + "logits/rejected": -0.46871843934059143, + "logps/chosen": -117.05690002441406, + "logps/rejected": -96.14810943603516, + "loss": 0.1355, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.502894878387451, + "rewards/margins": 6.180233955383301, + "rewards/rejected": -2.6773390769958496, + "step": 6400 + }, + { + "epoch": 2.92, + "eval_logits/chosen": -0.4225231111049652, + "eval_logits/rejected": -0.44727426767349243, + "eval_logps/chosen": -116.41168212890625, + "eval_logps/rejected": -95.67748260498047, + "eval_loss": 0.1470990926027298, + "eval_rewards/accuracies": 0.9329608678817749, + "eval_rewards/chosen": 3.3740320205688477, + "eval_rewards/margins": 5.747939586639404, + "eval_rewards/rejected": -2.3739078044891357, + "eval_runtime": 896.7307, + "eval_samples_per_second": 3.192, + "eval_steps_per_second": 0.2, + "step": 6400 + }, + { + "epoch": 2.93, + "learning_rate": 8.117706747843734e-09, + "logits/chosen": -0.42712831497192383, + "logits/rejected": -0.44280409812927246, + "logps/chosen": -119.4900894165039, + "logps/rejected": -98.96273803710938, + "loss": 0.1182, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.9419543743133545, + "rewards/margins": 6.928774356842041, + "rewards/rejected": -2.9868204593658447, + "step": 6410 + }, + { + "epoch": 2.93, + "learning_rate": 7.6103500761035e-09, + "logits/chosen": -0.41072121262550354, + "logits/rejected": -0.43890589475631714, + "logps/chosen": -118.20353698730469, + "logps/rejected": -94.67088317871094, + "loss": 0.1151, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.44974422454834, + "rewards/margins": 6.641238212585449, + "rewards/rejected": -2.191495180130005, + "step": 6420 + }, + { + "epoch": 2.94, + "learning_rate": 7.1029934043632664e-09, + "logits/chosen": -0.4472702145576477, + "logits/rejected": -0.46455731987953186, + "logps/chosen": -123.02734375, + "logps/rejected": -95.70638275146484, + "loss": 0.1486, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.530532121658325, + "rewards/margins": 6.119112491607666, + "rewards/rejected": -2.58858060836792, + "step": 6430 + }, + { + "epoch": 2.94, + "learning_rate": 6.595636732623033e-09, + "logits/chosen": -0.4400475025177002, + "logits/rejected": -0.4567417502403259, + "logps/chosen": -113.33174896240234, + "logps/rejected": -98.16852569580078, + "loss": 0.1225, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.7850425243377686, + "rewards/margins": 5.5529022216796875, + "rewards/rejected": -1.767859697341919, + "step": 6440 + }, + { + "epoch": 2.94, + "learning_rate": 6.0882800608828e-09, + "logits/chosen": -0.4608462452888489, + "logits/rejected": -0.4785988926887512, + "logps/chosen": -119.42830657958984, + "logps/rejected": -101.83099365234375, + "loss": 0.1226, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.2882943153381348, + "rewards/margins": 5.341013431549072, + "rewards/rejected": -2.0527195930480957, + "step": 6450 + }, + { + "epoch": 2.95, + "learning_rate": 5.580923389142566e-09, + "logits/chosen": -0.42657288908958435, + "logits/rejected": -0.4634782671928406, + "logps/chosen": -118.7545166015625, + "logps/rejected": -99.45500183105469, + "loss": 0.1135, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.7978293895721436, + "rewards/margins": 6.2398295402526855, + "rewards/rejected": -2.442000150680542, + "step": 6460 + }, + { + "epoch": 2.95, + "learning_rate": 5.073566717402333e-09, + "logits/chosen": -0.4478042721748352, + "logits/rejected": -0.4692462980747223, + "logps/chosen": -125.03216552734375, + "logps/rejected": -98.91600036621094, + "loss": 0.1383, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.3301844596862793, + "rewards/margins": 5.73842716217041, + "rewards/rejected": -2.40824294090271, + "step": 6470 + }, + { + "epoch": 2.96, + "learning_rate": 4.5662100456621e-09, + "logits/chosen": -0.44531145691871643, + "logits/rejected": -0.4770316183567047, + "logps/chosen": -119.36907958984375, + "logps/rejected": -96.36643981933594, + "loss": 0.1276, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.4794089794158936, + "rewards/margins": 5.956811904907227, + "rewards/rejected": -2.477403163909912, + "step": 6480 + }, + { + "epoch": 2.96, + "learning_rate": 4.058853373921867e-09, + "logits/chosen": -0.42967623472213745, + "logits/rejected": -0.4564761221408844, + "logps/chosen": -125.42822265625, + "logps/rejected": -97.00397491455078, + "loss": 0.1277, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.159353733062744, + "rewards/margins": 6.9187469482421875, + "rewards/rejected": -2.7593941688537598, + "step": 6490 + }, + { + "epoch": 2.97, + "learning_rate": 3.5514967021816332e-09, + "logits/chosen": -0.44123202562332153, + "logits/rejected": -0.46388721466064453, + "logps/chosen": -115.01322937011719, + "logps/rejected": -94.85794830322266, + "loss": 0.1114, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.9512016773223877, + "rewards/margins": 5.423955917358398, + "rewards/rejected": -1.4727542400360107, + "step": 6500 + }, + { + "epoch": 2.97, + "eval_logits/chosen": -0.4345143139362335, + "eval_logits/rejected": -0.4595170319080353, + "eval_logps/chosen": -116.18891906738281, + "eval_logps/rejected": -95.57404327392578, + "eval_loss": 0.1396893560886383, + "eval_rewards/accuracies": 0.9301676154136658, + "eval_rewards/chosen": 3.4854142665863037, + "eval_rewards/margins": 5.8075995445251465, + "eval_rewards/rejected": -2.3221850395202637, + "eval_runtime": 909.3942, + "eval_samples_per_second": 3.147, + "eval_steps_per_second": 0.197, + "step": 6500 + }, + { + "epoch": 2.97, + "learning_rate": 3.0441400304414e-09, + "logits/chosen": -0.43526285886764526, + "logits/rejected": -0.4636686444282532, + "logps/chosen": -117.7777099609375, + "logps/rejected": -97.91990661621094, + "loss": 0.1216, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.961691379547119, + "rewards/margins": 5.777933120727539, + "rewards/rejected": -1.8162418603897095, + "step": 6510 + }, + { + "epoch": 2.98, + "learning_rate": 2.5367833587011665e-09, + "logits/chosen": -0.43703460693359375, + "logits/rejected": -0.4599098563194275, + "logps/chosen": -125.13053894042969, + "logps/rejected": -97.0875473022461, + "loss": 0.1135, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.697622299194336, + "rewards/margins": 5.988824367523193, + "rewards/rejected": -2.2912018299102783, + "step": 6520 + }, + { + "epoch": 2.98, + "learning_rate": 2.0294266869609335e-09, + "logits/chosen": -0.4473207890987396, + "logits/rejected": -0.4712890684604645, + "logps/chosen": -111.90199279785156, + "logps/rejected": -96.57246398925781, + "loss": 0.1362, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5259852409362793, + "rewards/margins": 5.500835418701172, + "rewards/rejected": -1.974850058555603, + "step": 6530 + }, + { + "epoch": 2.99, + "learning_rate": 1.5220700152207e-09, + "logits/chosen": -0.4719008803367615, + "logits/rejected": -0.4840930104255676, + "logps/chosen": -113.07936096191406, + "logps/rejected": -94.11990356445312, + "loss": 0.1438, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.121089935302734, + "rewards/margins": 5.908658027648926, + "rewards/rejected": -1.7875677347183228, + "step": 6540 + }, + { + "epoch": 2.99, + "learning_rate": 1.0147133434804667e-09, + "logits/chosen": -0.43527278304100037, + "logits/rejected": -0.4640693664550781, + "logps/chosen": -127.6871566772461, + "logps/rejected": -93.20718383789062, + "loss": 0.143, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.136395454406738, + "rewards/margins": 5.834487438201904, + "rewards/rejected": -1.6980924606323242, + "step": 6550 + }, + { + "epoch": 2.99, + "learning_rate": 5.073566717402334e-10, + "logits/chosen": -0.4505973756313324, + "logits/rejected": -0.46244126558303833, + "logps/chosen": -120.34539794921875, + "logps/rejected": -96.04461669921875, + "loss": 0.1235, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.4686827659606934, + "rewards/margins": 5.904242038726807, + "rewards/rejected": -2.435559034347534, + "step": 6560 + }, + { + "epoch": 3.0, + "learning_rate": 0.0, + "logits/chosen": -0.44836997985839844, + "logits/rejected": -0.47291040420532227, + "logps/chosen": -113.19990539550781, + "logps/rejected": -96.67423248291016, + "loss": 0.1073, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.5085701942443848, + "rewards/margins": 5.106671333312988, + "rewards/rejected": -2.5981011390686035, + "step": 6570 + }, + { + "epoch": 3.0, + "step": 6570, + "total_flos": 0.0, + "train_loss": 0.2317562538376319, + "train_runtime": 114179.2227, + "train_samples_per_second": 3.684, + "train_steps_per_second": 0.058 + } + ], + "logging_steps": 10, + "max_steps": 6570, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}