{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998972954467648, "eval_steps": 100, "global_step": 6570, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.5662100456621e-10, "logits/chosen": -0.4019157290458679, "logits/rejected": -0.4092532694339752, "logps/chosen": -116.08753204345703, "logps/rejected": -122.52903747558594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 4.5662100456621e-09, "logits/chosen": -0.4602813720703125, "logits/rejected": -0.5173085927963257, "logps/chosen": -126.46894073486328, "logps/rejected": -92.41258239746094, "loss": 0.8724, "rewards/accuracies": 0.5694444179534912, "rewards/chosen": 0.33254384994506836, "rewards/margins": 0.4581206142902374, "rewards/rejected": -0.12557676434516907, "step": 10 }, { "epoch": 0.01, "learning_rate": 9.1324200913242e-09, "logits/chosen": -0.4416370391845703, "logits/rejected": -0.49417153000831604, "logps/chosen": -127.47574615478516, "logps/rejected": -95.31398010253906, "loss": 0.945, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.15608355402946472, "rewards/margins": 0.04934122413396835, "rewards/rejected": 0.10674233734607697, "step": 20 }, { "epoch": 0.01, "learning_rate": 1.36986301369863e-08, "logits/chosen": -0.4033452570438385, "logits/rejected": -0.4765704572200775, "logps/chosen": -129.85809326171875, "logps/rejected": -93.86156463623047, "loss": 0.9347, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.02109346352517605, "rewards/margins": 0.07605800777673721, "rewards/rejected": -0.054964542388916016, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.82648401826484e-08, "logits/chosen": -0.43521660566329956, "logits/rejected": -0.4863054156303406, "logps/chosen": -134.07125854492188, "logps/rejected": -95.79774475097656, "loss": 0.9597, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.2760757505893707, "rewards/margins": -0.2107580155134201, "rewards/rejected": -0.06531772762537003, "step": 40 }, { "epoch": 0.02, "learning_rate": 2.28310502283105e-08, "logits/chosen": -0.4325088858604431, "logits/rejected": -0.48078474402427673, "logps/chosen": -114.5306625366211, "logps/rejected": -87.7462158203125, "loss": 0.9379, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.11705958843231201, "rewards/margins": -0.04724755138158798, "rewards/rejected": 0.1643071472644806, "step": 50 }, { "epoch": 0.03, "learning_rate": 2.73972602739726e-08, "logits/chosen": -0.42710214853286743, "logits/rejected": -0.49376431107521057, "logps/chosen": -125.63948822021484, "logps/rejected": -95.20314025878906, "loss": 0.8973, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.12470320612192154, "rewards/margins": -0.04657207801938057, "rewards/rejected": -0.07813112437725067, "step": 60 }, { "epoch": 0.03, "learning_rate": 3.19634703196347e-08, "logits/chosen": -0.4520903527736664, "logits/rejected": -0.49421629309654236, "logps/chosen": -128.04258728027344, "logps/rejected": -95.23574829101562, "loss": 0.9252, "rewards/accuracies": 0.5625, "rewards/chosen": 0.27560052275657654, "rewards/margins": 0.32960376143455505, "rewards/rejected": -0.054003216326236725, "step": 70 }, { "epoch": 0.04, "learning_rate": 3.65296803652968e-08, "logits/chosen": -0.4498293995857239, "logits/rejected": -0.49676957726478577, "logps/chosen": -122.39030456542969, "logps/rejected": -91.36656188964844, "loss": 0.9493, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.04946544021368027, "rewards/margins": -0.14646394550800323, "rewards/rejected": 0.1959293782711029, "step": 80 }, { "epoch": 0.04, "learning_rate": 4.10958904109589e-08, "logits/chosen": -0.4375010132789612, "logits/rejected": -0.4830726981163025, "logps/chosen": -116.85307312011719, "logps/rejected": -93.01774597167969, "loss": 0.8934, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.009796512313187122, "rewards/margins": 0.009660542011260986, "rewards/rejected": -0.019457053393125534, "step": 90 }, { "epoch": 0.05, "learning_rate": 4.5662100456621e-08, "logits/chosen": -0.4444386959075928, "logits/rejected": -0.5023002028465271, "logps/chosen": -127.2782211303711, "logps/rejected": -93.0546646118164, "loss": 0.8825, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.14289195835590363, "rewards/margins": 0.13167095184326172, "rewards/rejected": 0.011221003718674183, "step": 100 }, { "epoch": 0.05, "eval_logits/chosen": -0.4521982967853546, "eval_logits/rejected": -0.5016786456108093, "eval_logps/chosen": -122.78299713134766, "eval_logps/rejected": -90.6888656616211, "eval_loss": 0.8872030973434448, "eval_rewards/accuracies": 0.505586564540863, "eval_rewards/chosen": 0.18837407231330872, "eval_rewards/margins": 0.06797005981206894, "eval_rewards/rejected": 0.12040401250123978, "eval_runtime": 913.9439, "eval_samples_per_second": 3.131, "eval_steps_per_second": 0.196, "step": 100 }, { "epoch": 0.05, "learning_rate": 5.02283105022831e-08, "logits/chosen": -0.4448448121547699, "logits/rejected": -0.4973272383213043, "logps/chosen": -129.8375701904297, "logps/rejected": -94.05601501464844, "loss": 0.9452, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.08486747741699219, "rewards/margins": 0.03504283353686333, "rewards/rejected": 0.04982464388012886, "step": 110 }, { "epoch": 0.05, "learning_rate": 5.47945205479452e-08, "logits/chosen": -0.4298163950443268, "logits/rejected": -0.48838791251182556, "logps/chosen": -130.39566040039062, "logps/rejected": -91.33364868164062, "loss": 0.88, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.10634370148181915, "rewards/margins": 0.24824686348438263, "rewards/rejected": -0.14190316200256348, "step": 120 }, { "epoch": 0.06, "learning_rate": 5.93607305936073e-08, "logits/chosen": -0.4313521385192871, "logits/rejected": -0.4968926012516022, "logps/chosen": -133.50741577148438, "logps/rejected": -91.91551208496094, "loss": 0.8437, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.014166712760925293, "rewards/margins": 0.0034161806106567383, "rewards/rejected": -0.01758289337158203, "step": 130 }, { "epoch": 0.06, "learning_rate": 6.39269406392694e-08, "logits/chosen": -0.4153892993927002, "logits/rejected": -0.4767012596130371, "logps/chosen": -131.85374450683594, "logps/rejected": -96.32195281982422, "loss": 0.8648, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.16102957725524902, "rewards/margins": 0.05374450609087944, "rewards/rejected": 0.10728506743907928, "step": 140 }, { "epoch": 0.07, "learning_rate": 6.84931506849315e-08, "logits/chosen": -0.44299325346946716, "logits/rejected": -0.5104792714118958, "logps/chosen": -125.35621643066406, "logps/rejected": -89.10494232177734, "loss": 0.9226, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.062367748469114304, "rewards/margins": 0.08922012150287628, "rewards/rejected": -0.02685236930847168, "step": 150 }, { "epoch": 0.07, "learning_rate": 7.30593607305936e-08, "logits/chosen": -0.41546908020973206, "logits/rejected": -0.4675370752811432, "logps/chosen": -124.44963073730469, "logps/rejected": -97.21520233154297, "loss": 0.8653, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.1597222536802292, "rewards/margins": 0.09017050266265869, "rewards/rejected": 0.0695517510175705, "step": 160 }, { "epoch": 0.08, "learning_rate": 7.76255707762557e-08, "logits/chosen": -0.420665442943573, "logits/rejected": -0.4751351475715637, "logps/chosen": -128.25018310546875, "logps/rejected": -93.02848052978516, "loss": 0.8364, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.15812595188617706, "rewards/margins": 0.24063143134117126, "rewards/rejected": -0.0825054869055748, "step": 170 }, { "epoch": 0.08, "learning_rate": 8.21917808219178e-08, "logits/chosen": -0.43356451392173767, "logits/rejected": -0.5012267827987671, "logps/chosen": -134.00485229492188, "logps/rejected": -91.32147979736328, "loss": 0.8665, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.2619454562664032, "rewards/margins": 0.23557980358600616, "rewards/rejected": 0.02636566199362278, "step": 180 }, { "epoch": 0.09, "learning_rate": 8.67579908675799e-08, "logits/chosen": -0.45523887872695923, "logits/rejected": -0.5100681185722351, "logps/chosen": -125.0759506225586, "logps/rejected": -89.36471557617188, "loss": 0.846, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.24546337127685547, "rewards/margins": 0.27231401205062866, "rewards/rejected": -0.02685065194964409, "step": 190 }, { "epoch": 0.09, "learning_rate": 9.1324200913242e-08, "logits/chosen": -0.43358325958251953, "logits/rejected": -0.488663911819458, "logps/chosen": -115.94734191894531, "logps/rejected": -88.48348236083984, "loss": 0.9136, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.04173297807574272, "rewards/margins": -0.21215708553791046, "rewards/rejected": 0.17042410373687744, "step": 200 }, { "epoch": 0.09, "eval_logits/chosen": -0.44474586844444275, "eval_logits/rejected": -0.49599677324295044, "eval_logps/chosen": -122.50910186767578, "eval_logps/rejected": -90.78695678710938, "eval_loss": 0.832494854927063, "eval_rewards/accuracies": 0.589385449886322, "eval_rewards/chosen": 0.32531994581222534, "eval_rewards/margins": 0.2539590895175934, "eval_rewards/rejected": 0.07136084139347076, "eval_runtime": 887.9753, "eval_samples_per_second": 3.223, "eval_steps_per_second": 0.202, "step": 200 }, { "epoch": 0.1, "learning_rate": 9.58904109589041e-08, "logits/chosen": -0.44061392545700073, "logits/rejected": -0.48843497037887573, "logps/chosen": -121.23575592041016, "logps/rejected": -91.54434204101562, "loss": 0.88, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.15113969147205353, "rewards/margins": 0.015409660525619984, "rewards/rejected": 0.13573002815246582, "step": 210 }, { "epoch": 0.1, "learning_rate": 1.004566210045662e-07, "logits/chosen": -0.44614124298095703, "logits/rejected": -0.48937082290649414, "logps/chosen": -115.96971130371094, "logps/rejected": -90.40010833740234, "loss": 0.9042, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.017821501940488815, "rewards/margins": -0.00064764020498842, "rewards/rejected": 0.01846914365887642, "step": 220 }, { "epoch": 0.1, "learning_rate": 1.050228310502283e-07, "logits/chosen": -0.40590643882751465, "logits/rejected": -0.4714388847351074, "logps/chosen": -135.4023895263672, "logps/rejected": -95.57572937011719, "loss": 0.8178, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.42753997445106506, "rewards/margins": 0.28569334745407104, "rewards/rejected": 0.14184658229351044, "step": 230 }, { "epoch": 0.11, "learning_rate": 1.095890410958904e-07, "logits/chosen": -0.44258102774620056, "logits/rejected": -0.4934941232204437, "logps/chosen": -124.71822357177734, "logps/rejected": -90.02816009521484, "loss": 0.8509, "rewards/accuracies": 0.5, "rewards/chosen": 0.17398934066295624, "rewards/margins": 0.06253180652856827, "rewards/rejected": 0.11145754158496857, "step": 240 }, { "epoch": 0.11, "learning_rate": 1.141552511415525e-07, "logits/chosen": -0.43292659521102905, "logits/rejected": -0.49328017234802246, "logps/chosen": -129.06802368164062, "logps/rejected": -92.1496353149414, "loss": 0.8357, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.2504062056541443, "rewards/margins": 0.1820850372314453, "rewards/rejected": 0.06832118332386017, "step": 250 }, { "epoch": 0.12, "learning_rate": 1.187214611872146e-07, "logits/chosen": -0.452961266040802, "logits/rejected": -0.49590611457824707, "logps/chosen": -130.70712280273438, "logps/rejected": -89.96910095214844, "loss": 0.8312, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.38505929708480835, "rewards/margins": 0.07876329123973846, "rewards/rejected": 0.3062959909439087, "step": 260 }, { "epoch": 0.12, "learning_rate": 1.232876712328767e-07, "logits/chosen": -0.43949493765830994, "logits/rejected": -0.49638843536376953, "logps/chosen": -131.95510864257812, "logps/rejected": -91.30790710449219, "loss": 0.7985, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.44583946466445923, "rewards/margins": 0.32409507036209106, "rewards/rejected": 0.12174437195062637, "step": 270 }, { "epoch": 0.13, "learning_rate": 1.278538812785388e-07, "logits/chosen": -0.4356844425201416, "logits/rejected": -0.4804006516933441, "logps/chosen": -122.7051010131836, "logps/rejected": -95.61629486083984, "loss": 0.8578, "rewards/accuracies": 0.4375, "rewards/chosen": 0.38490691781044006, "rewards/margins": -0.046961426734924316, "rewards/rejected": 0.43186837434768677, "step": 280 }, { "epoch": 0.13, "learning_rate": 1.324200913242009e-07, "logits/chosen": -0.4335559010505676, "logits/rejected": -0.48386988043785095, "logps/chosen": -125.57695007324219, "logps/rejected": -93.93878936767578, "loss": 0.7524, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7190027236938477, "rewards/margins": 0.5058382749557495, "rewards/rejected": 0.21316440403461456, "step": 290 }, { "epoch": 0.14, "learning_rate": 1.36986301369863e-07, "logits/chosen": -0.4185541570186615, "logits/rejected": -0.4697909355163574, "logps/chosen": -124.8671875, "logps/rejected": -89.48695373535156, "loss": 0.7507, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7953338623046875, "rewards/margins": 0.46773916482925415, "rewards/rejected": 0.32759472727775574, "step": 300 }, { "epoch": 0.14, "eval_logits/chosen": -0.44264963269233704, "eval_logits/rejected": -0.4909226894378662, "eval_logps/chosen": -122.01156616210938, "eval_logps/rejected": -90.3702621459961, "eval_loss": 0.7816197872161865, "eval_rewards/accuracies": 0.5670391321182251, "eval_rewards/chosen": 0.5740950107574463, "eval_rewards/margins": 0.29438889026641846, "eval_rewards/rejected": 0.27970612049102783, "eval_runtime": 858.3912, "eval_samples_per_second": 3.334, "eval_steps_per_second": 0.209, "step": 300 }, { "epoch": 0.14, "learning_rate": 1.415525114155251e-07, "logits/chosen": -0.4151129722595215, "logits/rejected": -0.4674547612667084, "logps/chosen": -120.4942855834961, "logps/rejected": -88.7030258178711, "loss": 0.7411, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.721222996711731, "rewards/margins": 0.4279160499572754, "rewards/rejected": 0.2933068871498108, "step": 310 }, { "epoch": 0.15, "learning_rate": 1.461187214611872e-07, "logits/chosen": -0.4228796362876892, "logits/rejected": -0.4746120572090149, "logps/chosen": -130.20228576660156, "logps/rejected": -95.73072814941406, "loss": 0.7596, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.8771559000015259, "rewards/margins": 0.4412263035774231, "rewards/rejected": 0.435929536819458, "step": 320 }, { "epoch": 0.15, "learning_rate": 1.506849315068493e-07, "logits/chosen": -0.4374767243862152, "logits/rejected": -0.48573336005210876, "logps/chosen": -118.52967834472656, "logps/rejected": -92.9192886352539, "loss": 0.7083, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.713616669178009, "rewards/margins": 0.4573392868041992, "rewards/rejected": 0.25627732276916504, "step": 330 }, { "epoch": 0.16, "learning_rate": 1.552511415525114e-07, "logits/chosen": -0.42817750573158264, "logits/rejected": -0.4862712025642395, "logps/chosen": -118.77824401855469, "logps/rejected": -85.43115997314453, "loss": 0.7041, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.8094140887260437, "rewards/margins": 0.5132724642753601, "rewards/rejected": 0.29614168405532837, "step": 340 }, { "epoch": 0.16, "learning_rate": 1.598173515981735e-07, "logits/chosen": -0.4172348380088806, "logits/rejected": -0.4674188196659088, "logps/chosen": -117.78670501708984, "logps/rejected": -93.87962341308594, "loss": 0.6998, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6671510934829712, "rewards/margins": 0.4894055426120758, "rewards/rejected": 0.1777455061674118, "step": 350 }, { "epoch": 0.16, "learning_rate": 1.643835616438356e-07, "logits/chosen": -0.3993512988090515, "logits/rejected": -0.4529343247413635, "logps/chosen": -122.50328063964844, "logps/rejected": -92.97706604003906, "loss": 0.7234, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9135759472846985, "rewards/margins": 0.6725237965583801, "rewards/rejected": 0.24105218052864075, "step": 360 }, { "epoch": 0.17, "learning_rate": 1.689497716894977e-07, "logits/chosen": -0.38976728916168213, "logits/rejected": -0.44725877046585083, "logps/chosen": -123.70979309082031, "logps/rejected": -93.47122192382812, "loss": 0.7021, "rewards/accuracies": 0.625, "rewards/chosen": 0.8796418905258179, "rewards/margins": 0.516387939453125, "rewards/rejected": 0.36325401067733765, "step": 370 }, { "epoch": 0.17, "learning_rate": 1.735159817351598e-07, "logits/chosen": -0.411344051361084, "logits/rejected": -0.48015648126602173, "logps/chosen": -128.3126678466797, "logps/rejected": -86.05162048339844, "loss": 0.7003, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.9648821949958801, "rewards/margins": 0.6015781760215759, "rewards/rejected": 0.3633040189743042, "step": 380 }, { "epoch": 0.18, "learning_rate": 1.780821917808219e-07, "logits/chosen": -0.4367973208427429, "logits/rejected": -0.48391270637512207, "logps/chosen": -123.42488098144531, "logps/rejected": -86.6032943725586, "loss": 0.6979, "rewards/accuracies": 0.625, "rewards/chosen": 0.9070743322372437, "rewards/margins": 0.5875338315963745, "rewards/rejected": 0.31954047083854675, "step": 390 }, { "epoch": 0.18, "learning_rate": 1.82648401826484e-07, "logits/chosen": -0.39601242542266846, "logits/rejected": -0.4429762363433838, "logps/chosen": -115.47049713134766, "logps/rejected": -92.88420104980469, "loss": 0.6142, "rewards/accuracies": 0.625, "rewards/chosen": 1.1646414995193481, "rewards/margins": 0.6666213870048523, "rewards/rejected": 0.49802008271217346, "step": 400 }, { "epoch": 0.18, "eval_logits/chosen": -0.4321858584880829, "eval_logits/rejected": -0.4793085753917694, "eval_logps/chosen": -121.00923156738281, "eval_logps/rejected": -90.04886627197266, "eval_loss": 0.6434672474861145, "eval_rewards/accuracies": 0.6368715167045593, "eval_rewards/chosen": 1.0752556324005127, "eval_rewards/margins": 0.6348468065261841, "eval_rewards/rejected": 0.4404087960720062, "eval_runtime": 919.755, "eval_samples_per_second": 3.112, "eval_steps_per_second": 0.195, "step": 400 }, { "epoch": 0.19, "learning_rate": 1.872146118721461e-07, "logits/chosen": -0.4185422360897064, "logits/rejected": -0.48887625336647034, "logps/chosen": -137.428466796875, "logps/rejected": -89.48545837402344, "loss": 0.639, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.2363882064819336, "rewards/margins": 0.7988500595092773, "rewards/rejected": 0.43753814697265625, "step": 410 }, { "epoch": 0.19, "learning_rate": 1.917808219178082e-07, "logits/chosen": -0.402921199798584, "logits/rejected": -0.4546676278114319, "logps/chosen": -117.1082534790039, "logps/rejected": -92.3219223022461, "loss": 0.6449, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.8971641659736633, "rewards/margins": 0.6290008425712585, "rewards/rejected": 0.2681633532047272, "step": 420 }, { "epoch": 0.2, "learning_rate": 1.963470319634703e-07, "logits/chosen": -0.44212061166763306, "logits/rejected": -0.4884345531463623, "logps/chosen": -116.8946533203125, "logps/rejected": -88.76360321044922, "loss": 0.6229, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.1071293354034424, "rewards/margins": 0.6796278357505798, "rewards/rejected": 0.42750149965286255, "step": 430 }, { "epoch": 0.2, "learning_rate": 2.009132420091324e-07, "logits/chosen": -0.4305177330970764, "logits/rejected": -0.48330944776535034, "logps/chosen": -118.5647201538086, "logps/rejected": -90.13600158691406, "loss": 0.6259, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.3376901149749756, "rewards/margins": 0.8060137033462524, "rewards/rejected": 0.5316765308380127, "step": 440 }, { "epoch": 0.21, "learning_rate": 2.054794520547945e-07, "logits/chosen": -0.401493638753891, "logits/rejected": -0.4512600302696228, "logps/chosen": -123.88890075683594, "logps/rejected": -86.47048950195312, "loss": 0.5399, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.291102647781372, "rewards/margins": 1.0892807245254517, "rewards/rejected": 0.2018217146396637, "step": 450 }, { "epoch": 0.21, "learning_rate": 2.100456621004566e-07, "logits/chosen": -0.414111465215683, "logits/rejected": -0.4572978913784027, "logps/chosen": -130.4043426513672, "logps/rejected": -93.8148422241211, "loss": 0.5633, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4818243980407715, "rewards/margins": 1.0796071290969849, "rewards/rejected": 0.40221720933914185, "step": 460 }, { "epoch": 0.21, "learning_rate": 2.146118721461187e-07, "logits/chosen": -0.3844316601753235, "logits/rejected": -0.43920645117759705, "logps/chosen": -129.4476318359375, "logps/rejected": -93.88046264648438, "loss": 0.6163, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.0575988292694092, "rewards/margins": 0.5015383958816528, "rewards/rejected": 0.5560603737831116, "step": 470 }, { "epoch": 0.22, "learning_rate": 2.191780821917808e-07, "logits/chosen": -0.4316393733024597, "logits/rejected": -0.46863240003585815, "logps/chosen": -116.6776123046875, "logps/rejected": -89.31513977050781, "loss": 0.5295, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.6314785480499268, "rewards/margins": 1.1132431030273438, "rewards/rejected": 0.5182352066040039, "step": 480 }, { "epoch": 0.22, "learning_rate": 2.237442922374429e-07, "logits/chosen": -0.3807242810726166, "logits/rejected": -0.4394947588443756, "logps/chosen": -125.314453125, "logps/rejected": -93.21080017089844, "loss": 0.5445, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.4549810886383057, "rewards/margins": 1.1398333311080933, "rewards/rejected": 0.3151477575302124, "step": 490 }, { "epoch": 0.23, "learning_rate": 2.28310502283105e-07, "logits/chosen": -0.41063275933265686, "logits/rejected": -0.4601810574531555, "logps/chosen": -125.56761169433594, "logps/rejected": -90.01634979248047, "loss": 0.519, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.459787368774414, "rewards/margins": 0.9859598278999329, "rewards/rejected": 0.47382766008377075, "step": 500 }, { "epoch": 0.23, "eval_logits/chosen": -0.4084192216396332, "eval_logits/rejected": -0.45594409108161926, "eval_logps/chosen": -119.71713256835938, "eval_logps/rejected": -89.80496978759766, "eval_loss": 0.5196122527122498, "eval_rewards/accuracies": 0.74301677942276, "eval_rewards/chosen": 1.7213094234466553, "eval_rewards/margins": 1.1589573621749878, "eval_rewards/rejected": 0.5623520016670227, "eval_runtime": 904.862, "eval_samples_per_second": 3.163, "eval_steps_per_second": 0.198, "step": 500 }, { "epoch": 0.23, "learning_rate": 2.328767123287671e-07, "logits/chosen": -0.4057396948337555, "logits/rejected": -0.45863184332847595, "logps/chosen": -117.3805923461914, "logps/rejected": -86.75962829589844, "loss": 0.5427, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.8113657236099243, "rewards/margins": 1.3476877212524414, "rewards/rejected": 0.46367788314819336, "step": 510 }, { "epoch": 0.24, "learning_rate": 2.374429223744292e-07, "logits/chosen": -0.4061856269836426, "logits/rejected": -0.4540451467037201, "logps/chosen": -129.16978454589844, "logps/rejected": -92.90943908691406, "loss": 0.4964, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.632061243057251, "rewards/margins": 1.0041301250457764, "rewards/rejected": 0.6279311776161194, "step": 520 }, { "epoch": 0.24, "learning_rate": 2.420091324200913e-07, "logits/chosen": -0.4023277163505554, "logits/rejected": -0.4562614858150482, "logps/chosen": -124.7136459350586, "logps/rejected": -92.33967590332031, "loss": 0.5186, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5743153095245361, "rewards/margins": 1.1845461130142212, "rewards/rejected": 0.38976913690567017, "step": 530 }, { "epoch": 0.25, "learning_rate": 2.465753424657534e-07, "logits/chosen": -0.3930456042289734, "logits/rejected": -0.44681698083877563, "logps/chosen": -128.5962677001953, "logps/rejected": -96.91268157958984, "loss": 0.4592, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.9018471240997314, "rewards/margins": 1.5477434396743774, "rewards/rejected": 0.3541035056114197, "step": 540 }, { "epoch": 0.25, "learning_rate": 2.511415525114155e-07, "logits/chosen": -0.4221636652946472, "logits/rejected": -0.462319552898407, "logps/chosen": -128.9414520263672, "logps/rejected": -93.53214263916016, "loss": 0.5019, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.7227509021759033, "rewards/margins": 0.957754909992218, "rewards/rejected": 0.7649960517883301, "step": 550 }, { "epoch": 0.26, "learning_rate": 2.557077625570776e-07, "logits/chosen": -0.3653411269187927, "logits/rejected": -0.4162854254245758, "logps/chosen": -121.7026596069336, "logps/rejected": -91.18373107910156, "loss": 0.5392, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8344913721084595, "rewards/margins": 1.1174156665802002, "rewards/rejected": 0.717075526714325, "step": 560 }, { "epoch": 0.26, "learning_rate": 2.602739726027397e-07, "logits/chosen": -0.3987251818180084, "logits/rejected": -0.4511111378669739, "logps/chosen": -123.21244049072266, "logps/rejected": -90.11592864990234, "loss": 0.4567, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.992093801498413, "rewards/margins": 1.3696999549865723, "rewards/rejected": 0.6223939061164856, "step": 570 }, { "epoch": 0.26, "learning_rate": 2.648401826484018e-07, "logits/chosen": -0.3962245583534241, "logits/rejected": -0.4511072635650635, "logps/chosen": -117.782470703125, "logps/rejected": -86.41405487060547, "loss": 0.4408, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 2.0794925689697266, "rewards/margins": 1.7970508337020874, "rewards/rejected": 0.2824416756629944, "step": 580 }, { "epoch": 0.27, "learning_rate": 2.694063926940639e-07, "logits/chosen": -0.3933233916759491, "logits/rejected": -0.4369097650051117, "logps/chosen": -118.86439514160156, "logps/rejected": -93.77821350097656, "loss": 0.4553, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 2.0210790634155273, "rewards/margins": 1.2972638607025146, "rewards/rejected": 0.7238151431083679, "step": 590 }, { "epoch": 0.27, "learning_rate": 2.73972602739726e-07, "logits/chosen": -0.3547651171684265, "logits/rejected": -0.4130920469760895, "logps/chosen": -121.15980529785156, "logps/rejected": -90.80103302001953, "loss": 0.4858, "rewards/accuracies": 0.6875, "rewards/chosen": 2.020519733428955, "rewards/margins": 1.2199289798736572, "rewards/rejected": 0.8005906939506531, "step": 600 }, { "epoch": 0.27, "eval_logits/chosen": -0.41381627321243286, "eval_logits/rejected": -0.4592490494251251, "eval_logps/chosen": -118.74276733398438, "eval_logps/rejected": -89.74501037597656, "eval_loss": 0.4350966513156891, "eval_rewards/accuracies": 0.7877094745635986, "eval_rewards/chosen": 2.208491086959839, "eval_rewards/margins": 1.6161593198776245, "eval_rewards/rejected": 0.5923314690589905, "eval_runtime": 922.2358, "eval_samples_per_second": 3.103, "eval_steps_per_second": 0.194, "step": 600 }, { "epoch": 0.28, "learning_rate": 2.785388127853881e-07, "logits/chosen": -0.3632197082042694, "logits/rejected": -0.4115583896636963, "logps/chosen": -126.2831039428711, "logps/rejected": -91.31169128417969, "loss": 0.4499, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.2875258922576904, "rewards/margins": 1.4958405494689941, "rewards/rejected": 0.7916852831840515, "step": 610 }, { "epoch": 0.28, "learning_rate": 2.831050228310502e-07, "logits/chosen": -0.40923231840133667, "logits/rejected": -0.44826406240463257, "logps/chosen": -119.70330810546875, "logps/rejected": -94.67719268798828, "loss": 0.4786, "rewards/accuracies": 0.6875, "rewards/chosen": 1.9047887325286865, "rewards/margins": 1.1927855014801025, "rewards/rejected": 0.7120033502578735, "step": 620 }, { "epoch": 0.29, "learning_rate": 2.876712328767123e-07, "logits/chosen": -0.40331321954727173, "logits/rejected": -0.442889541387558, "logps/chosen": -119.29844665527344, "logps/rejected": -92.73041534423828, "loss": 0.4586, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.220543384552002, "rewards/margins": 1.537954330444336, "rewards/rejected": 0.6825889945030212, "step": 630 }, { "epoch": 0.29, "learning_rate": 2.922374429223744e-07, "logits/chosen": -0.3773882985115051, "logits/rejected": -0.43467479944229126, "logps/chosen": -126.56062316894531, "logps/rejected": -90.71516418457031, "loss": 0.4235, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.9954169988632202, "rewards/margins": 1.6813433170318604, "rewards/rejected": 0.3140736222267151, "step": 640 }, { "epoch": 0.3, "learning_rate": 2.968036529680365e-07, "logits/chosen": -0.3841812312602997, "logits/rejected": -0.4249148368835449, "logps/chosen": -123.25791931152344, "logps/rejected": -89.29008483886719, "loss": 0.4447, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.2786223888397217, "rewards/margins": 1.6475741863250732, "rewards/rejected": 0.631048321723938, "step": 650 }, { "epoch": 0.3, "learning_rate": 2.998477929984779e-07, "logits/chosen": -0.3526962995529175, "logits/rejected": -0.41638392210006714, "logps/chosen": -123.1829605102539, "logps/rejected": -91.3757095336914, "loss": 0.4117, "rewards/accuracies": 0.75, "rewards/chosen": 2.311939239501953, "rewards/margins": 1.7149779796600342, "rewards/rejected": 0.5969613194465637, "step": 660 }, { "epoch": 0.31, "learning_rate": 2.993404363267377e-07, "logits/chosen": -0.36860209703445435, "logits/rejected": -0.4237852990627289, "logps/chosen": -122.25120544433594, "logps/rejected": -89.36021423339844, "loss": 0.4159, "rewards/accuracies": 0.75, "rewards/chosen": 2.1927528381347656, "rewards/margins": 1.558342456817627, "rewards/rejected": 0.6344104409217834, "step": 670 }, { "epoch": 0.31, "learning_rate": 2.9883307965499743e-07, "logits/chosen": -0.39097389578819275, "logits/rejected": -0.4458232820034027, "logps/chosen": -120.35401916503906, "logps/rejected": -93.7239761352539, "loss": 0.4295, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.214139461517334, "rewards/margins": 1.479650855064392, "rewards/rejected": 0.7344885468482971, "step": 680 }, { "epoch": 0.31, "learning_rate": 2.983257229832572e-07, "logits/chosen": -0.3879626393318176, "logits/rejected": -0.4396095871925354, "logps/chosen": -118.8515853881836, "logps/rejected": -88.3510513305664, "loss": 0.4265, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.529470682144165, "rewards/margins": 2.0332565307617188, "rewards/rejected": 0.4962140917778015, "step": 690 }, { "epoch": 0.32, "learning_rate": 2.9781836631151696e-07, "logits/chosen": -0.39537498354911804, "logits/rejected": -0.4360222816467285, "logps/chosen": -128.79356384277344, "logps/rejected": -91.8262710571289, "loss": 0.4048, "rewards/accuracies": 0.8125, "rewards/chosen": 2.615938901901245, "rewards/margins": 1.7755300998687744, "rewards/rejected": 0.8404089212417603, "step": 700 }, { "epoch": 0.32, "eval_logits/chosen": -0.3952995538711548, "eval_logits/rejected": -0.43975549936294556, "eval_logps/chosen": -117.93878936767578, "eval_logps/rejected": -89.78253936767578, "eval_loss": 0.3877631723880768, "eval_rewards/accuracies": 0.832402229309082, "eval_rewards/chosen": 2.610471487045288, "eval_rewards/margins": 2.036900758743286, "eval_rewards/rejected": 0.5735709071159363, "eval_runtime": 879.8552, "eval_samples_per_second": 3.253, "eval_steps_per_second": 0.203, "step": 700 }, { "epoch": 0.32, "learning_rate": 2.9731100963977676e-07, "logits/chosen": -0.3934364318847656, "logits/rejected": -0.43131130933761597, "logps/chosen": -122.0168228149414, "logps/rejected": -95.09669494628906, "loss": 0.3735, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.480156898498535, "rewards/margins": 2.1081740856170654, "rewards/rejected": 0.3719825744628906, "step": 710 }, { "epoch": 0.33, "learning_rate": 2.968036529680365e-07, "logits/chosen": -0.36254000663757324, "logits/rejected": -0.39851677417755127, "logps/chosen": -113.37959289550781, "logps/rejected": -93.94721984863281, "loss": 0.3979, "rewards/accuracies": 0.6875, "rewards/chosen": 2.346477746963501, "rewards/margins": 1.4738929271697998, "rewards/rejected": 0.8725847005844116, "step": 720 }, { "epoch": 0.33, "learning_rate": 2.962962962962963e-07, "logits/chosen": -0.37916380167007446, "logits/rejected": -0.4347075819969177, "logps/chosen": -120.58372497558594, "logps/rejected": -91.06839752197266, "loss": 0.4037, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 2.639740467071533, "rewards/margins": 2.126530170440674, "rewards/rejected": 0.5132103562355042, "step": 730 }, { "epoch": 0.34, "learning_rate": 2.9578893962455603e-07, "logits/chosen": -0.38863658905029297, "logits/rejected": -0.43652552366256714, "logps/chosen": -118.41629791259766, "logps/rejected": -91.13375854492188, "loss": 0.4023, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 2.4551138877868652, "rewards/margins": 1.8103752136230469, "rewards/rejected": 0.6447389721870422, "step": 740 }, { "epoch": 0.34, "learning_rate": 2.952815829528158e-07, "logits/chosen": -0.3511260449886322, "logits/rejected": -0.3955584168434143, "logps/chosen": -122.5355224609375, "logps/rejected": -94.37186431884766, "loss": 0.3418, "rewards/accuracies": 0.875, "rewards/chosen": 2.681807041168213, "rewards/margins": 2.366050958633423, "rewards/rejected": 0.31575626134872437, "step": 750 }, { "epoch": 0.35, "learning_rate": 2.9477422628107556e-07, "logits/chosen": -0.36248472332954407, "logits/rejected": -0.41749343276023865, "logps/chosen": -127.51533508300781, "logps/rejected": -89.82352447509766, "loss": 0.3551, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.612884283065796, "rewards/margins": 2.194282054901123, "rewards/rejected": 0.4186023771762848, "step": 760 }, { "epoch": 0.35, "learning_rate": 2.9426686960933536e-07, "logits/chosen": -0.39361852407455444, "logits/rejected": -0.4406144618988037, "logps/chosen": -120.9937515258789, "logps/rejected": -91.25408935546875, "loss": 0.3773, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.731214761734009, "rewards/margins": 2.3300070762634277, "rewards/rejected": 0.4012075364589691, "step": 770 }, { "epoch": 0.36, "learning_rate": 2.937595129375951e-07, "logits/chosen": -0.37514811754226685, "logits/rejected": -0.4206709861755371, "logps/chosen": -121.06993103027344, "logps/rejected": -90.89588165283203, "loss": 0.3809, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.543487548828125, "rewards/margins": 1.973149299621582, "rewards/rejected": 0.5703383684158325, "step": 780 }, { "epoch": 0.36, "learning_rate": 2.932521562658549e-07, "logits/chosen": -0.3764232099056244, "logits/rejected": -0.42200201749801636, "logps/chosen": -118.751220703125, "logps/rejected": -90.91673278808594, "loss": 0.3829, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.8071541786193848, "rewards/margins": 2.2911696434020996, "rewards/rejected": 0.5159844756126404, "step": 790 }, { "epoch": 0.37, "learning_rate": 2.9274479959411463e-07, "logits/chosen": -0.4004458487033844, "logits/rejected": -0.45174115896224976, "logps/chosen": -121.1677474975586, "logps/rejected": -94.82032775878906, "loss": 0.3623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8220152854919434, "rewards/margins": 1.960198998451233, "rewards/rejected": 0.8618165254592896, "step": 800 }, { "epoch": 0.37, "eval_logits/chosen": -0.40461838245391846, "eval_logits/rejected": -0.449243426322937, "eval_logps/chosen": -117.74871063232422, "eval_logps/rejected": -90.00775146484375, "eval_loss": 0.33831077814102173, "eval_rewards/accuracies": 0.8519552946090698, "eval_rewards/chosen": 2.7055187225341797, "eval_rewards/margins": 2.244556427001953, "eval_rewards/rejected": 0.4609623849391937, "eval_runtime": 884.177, "eval_samples_per_second": 3.237, "eval_steps_per_second": 0.202, "step": 800 }, { "epoch": 0.37, "learning_rate": 2.922374429223744e-07, "logits/chosen": -0.39423003792762756, "logits/rejected": -0.4309159219264984, "logps/chosen": -115.3284912109375, "logps/rejected": -89.27734375, "loss": 0.3521, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.6883575916290283, "rewards/margins": 2.4514644145965576, "rewards/rejected": 0.2368932068347931, "step": 810 }, { "epoch": 0.37, "learning_rate": 2.9173008625063416e-07, "logits/chosen": -0.38897836208343506, "logits/rejected": -0.43199315667152405, "logps/chosen": -115.42658996582031, "logps/rejected": -88.20014953613281, "loss": 0.3956, "rewards/accuracies": 0.8125, "rewards/chosen": 2.8831889629364014, "rewards/margins": 2.162771701812744, "rewards/rejected": 0.7204176187515259, "step": 820 }, { "epoch": 0.38, "learning_rate": 2.9122272957889396e-07, "logits/chosen": -0.3778613209724426, "logits/rejected": -0.42081737518310547, "logps/chosen": -122.856689453125, "logps/rejected": -90.86555480957031, "loss": 0.313, "rewards/accuracies": 0.875, "rewards/chosen": 2.869499683380127, "rewards/margins": 2.4971015453338623, "rewards/rejected": 0.372397780418396, "step": 830 }, { "epoch": 0.38, "learning_rate": 2.907153729071537e-07, "logits/chosen": -0.3957747519016266, "logits/rejected": -0.44605112075805664, "logps/chosen": -118.08308410644531, "logps/rejected": -92.13973236083984, "loss": 0.3128, "rewards/accuracies": 0.8125, "rewards/chosen": 2.6560986042022705, "rewards/margins": 2.404651641845703, "rewards/rejected": 0.251446932554245, "step": 840 }, { "epoch": 0.39, "learning_rate": 2.902080162354135e-07, "logits/chosen": -0.3683229982852936, "logits/rejected": -0.41419917345046997, "logps/chosen": -116.5284194946289, "logps/rejected": -90.90771484375, "loss": 0.3368, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.9755072593688965, "rewards/margins": 2.519197940826416, "rewards/rejected": 0.45630955696105957, "step": 850 }, { "epoch": 0.39, "learning_rate": 2.8970065956367323e-07, "logits/chosen": -0.38079267740249634, "logits/rejected": -0.42664599418640137, "logps/chosen": -124.321533203125, "logps/rejected": -93.04421997070312, "loss": 0.3352, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.8211874961853027, "rewards/margins": 2.7853455543518066, "rewards/rejected": 0.035841844975948334, "step": 860 }, { "epoch": 0.4, "learning_rate": 2.89193302891933e-07, "logits/chosen": -0.43004846572875977, "logits/rejected": -0.4682585299015045, "logps/chosen": -120.579833984375, "logps/rejected": -94.06327819824219, "loss": 0.3491, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4778380393981934, "rewards/margins": 1.947243094444275, "rewards/rejected": 0.5305950045585632, "step": 870 }, { "epoch": 0.4, "learning_rate": 2.8868594622019276e-07, "logits/chosen": -0.3675630986690521, "logits/rejected": -0.43043774366378784, "logps/chosen": -127.04805755615234, "logps/rejected": -90.72000122070312, "loss": 0.3348, "rewards/accuracies": 0.8125, "rewards/chosen": 3.3756585121154785, "rewards/margins": 2.680732011795044, "rewards/rejected": 0.6949266195297241, "step": 880 }, { "epoch": 0.41, "learning_rate": 2.8817858954845256e-07, "logits/chosen": -0.3831022381782532, "logits/rejected": -0.4073728621006012, "logps/chosen": -112.22914123535156, "logps/rejected": -92.7939453125, "loss": 0.3258, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.583207845687866, "rewards/margins": 2.2793221473693848, "rewards/rejected": 0.3038859963417053, "step": 890 }, { "epoch": 0.41, "learning_rate": 2.876712328767123e-07, "logits/chosen": -0.36432862281799316, "logits/rejected": -0.4049452841281891, "logps/chosen": -114.65323638916016, "logps/rejected": -89.1798324584961, "loss": 0.308, "rewards/accuracies": 0.75, "rewards/chosen": 2.7544989585876465, "rewards/margins": 2.288748025894165, "rewards/rejected": 0.4657509922981262, "step": 900 }, { "epoch": 0.41, "eval_logits/chosen": -0.39713504910469055, "eval_logits/rejected": -0.43814149498939514, "eval_logps/chosen": -117.21139526367188, "eval_logps/rejected": -90.22852325439453, "eval_loss": 0.31449276208877563, "eval_rewards/accuracies": 0.8519552946090698, "eval_rewards/chosen": 2.9741804599761963, "eval_rewards/margins": 2.6236064434051514, "eval_rewards/rejected": 0.3505741357803345, "eval_runtime": 906.6261, "eval_samples_per_second": 3.157, "eval_steps_per_second": 0.197, "step": 900 }, { "epoch": 0.42, "learning_rate": 2.871638762049721e-07, "logits/chosen": -0.37541183829307556, "logits/rejected": -0.4352906346321106, "logps/chosen": -122.44512939453125, "logps/rejected": -92.67110443115234, "loss": 0.311, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.144416570663452, "rewards/margins": 2.9438998699188232, "rewards/rejected": 0.200516939163208, "step": 910 }, { "epoch": 0.42, "learning_rate": 2.8665651953323183e-07, "logits/chosen": -0.4028325080871582, "logits/rejected": -0.43853726983070374, "logps/chosen": -122.57420349121094, "logps/rejected": -88.1255874633789, "loss": 0.2813, "rewards/accuracies": 0.875, "rewards/chosen": 3.4015986919403076, "rewards/margins": 2.7751407623291016, "rewards/rejected": 0.6264580488204956, "step": 920 }, { "epoch": 0.42, "learning_rate": 2.861491628614916e-07, "logits/chosen": -0.38051438331604004, "logits/rejected": -0.42849215865135193, "logps/chosen": -121.06298828125, "logps/rejected": -91.55492401123047, "loss": 0.3263, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.2129135131835938, "rewards/margins": 3.027736186981201, "rewards/rejected": 0.18517741560935974, "step": 930 }, { "epoch": 0.43, "learning_rate": 2.8564180618975136e-07, "logits/chosen": -0.3525943458080292, "logits/rejected": -0.40666908025741577, "logps/chosen": -126.42658996582031, "logps/rejected": -95.4771499633789, "loss": 0.3093, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.0789132118225098, "rewards/margins": 2.573848009109497, "rewards/rejected": 0.505064845085144, "step": 940 }, { "epoch": 0.43, "learning_rate": 2.8513444951801116e-07, "logits/chosen": -0.37190571427345276, "logits/rejected": -0.4196457862854004, "logps/chosen": -114.93281555175781, "logps/rejected": -90.08222198486328, "loss": 0.3446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7400577068328857, "rewards/margins": 2.4779579639434814, "rewards/rejected": 0.26209989190101624, "step": 950 }, { "epoch": 0.44, "learning_rate": 2.846270928462709e-07, "logits/chosen": -0.3697434663772583, "logits/rejected": -0.4116531312465668, "logps/chosen": -120.270751953125, "logps/rejected": -96.9891357421875, "loss": 0.3247, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.8343663215637207, "rewards/margins": 2.7533464431762695, "rewards/rejected": 0.08101978152990341, "step": 960 }, { "epoch": 0.44, "learning_rate": 2.841197361745307e-07, "logits/chosen": -0.38262155652046204, "logits/rejected": -0.43416157364845276, "logps/chosen": -118.68754577636719, "logps/rejected": -91.05323791503906, "loss": 0.273, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.267169237136841, "rewards/margins": 2.936789035797119, "rewards/rejected": 0.33038023114204407, "step": 970 }, { "epoch": 0.45, "learning_rate": 2.8361237950279043e-07, "logits/chosen": -0.3827953338623047, "logits/rejected": -0.4415118098258972, "logps/chosen": -131.87547302246094, "logps/rejected": -93.18158721923828, "loss": 0.2977, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.0228755474090576, "rewards/margins": 3.0450031757354736, "rewards/rejected": -0.022127319127321243, "step": 980 }, { "epoch": 0.45, "learning_rate": 2.831050228310502e-07, "logits/chosen": -0.3674587905406952, "logits/rejected": -0.4189482629299164, "logps/chosen": -119.06890869140625, "logps/rejected": -90.67839050292969, "loss": 0.319, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 3.407522678375244, "rewards/margins": 3.029474973678589, "rewards/rejected": 0.3780475854873657, "step": 990 }, { "epoch": 0.46, "learning_rate": 2.8259766615930996e-07, "logits/chosen": -0.3974978029727936, "logits/rejected": -0.43341922760009766, "logps/chosen": -123.0480728149414, "logps/rejected": -88.08891296386719, "loss": 0.3092, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.463684558868408, "rewards/margins": 3.0378475189208984, "rewards/rejected": 0.42583686113357544, "step": 1000 }, { "epoch": 0.46, "eval_logits/chosen": -0.3925662040710449, "eval_logits/rejected": -0.4275723695755005, "eval_logps/chosen": -116.85150909423828, "eval_logps/rejected": -90.39224243164062, "eval_loss": 0.31252291798591614, "eval_rewards/accuracies": 0.8351955413818359, "eval_rewards/chosen": 3.1541225910186768, "eval_rewards/margins": 2.8854100704193115, "eval_rewards/rejected": 0.2687124013900757, "eval_runtime": 851.8679, "eval_samples_per_second": 3.36, "eval_steps_per_second": 0.21, "step": 1000 }, { "epoch": 0.46, "learning_rate": 2.8209030948756976e-07, "logits/chosen": -0.3884614109992981, "logits/rejected": -0.42015600204467773, "logps/chosen": -121.27522277832031, "logps/rejected": -96.0818862915039, "loss": 0.3186, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.2017555236816406, "rewards/margins": 3.0563178062438965, "rewards/rejected": 0.14543786644935608, "step": 1010 }, { "epoch": 0.47, "learning_rate": 2.815829528158295e-07, "logits/chosen": -0.3918910622596741, "logits/rejected": -0.4254131317138672, "logps/chosen": -106.18526458740234, "logps/rejected": -85.96565246582031, "loss": 0.2848, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.9074699878692627, "rewards/margins": 2.4807074069976807, "rewards/rejected": 0.4267624020576477, "step": 1020 }, { "epoch": 0.47, "learning_rate": 2.810755961440893e-07, "logits/chosen": -0.3705700933933258, "logits/rejected": -0.42028117179870605, "logps/chosen": -112.59422302246094, "logps/rejected": -94.79884338378906, "loss": 0.3074, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.024829626083374, "rewards/margins": 2.7108585834503174, "rewards/rejected": 0.31397122144699097, "step": 1030 }, { "epoch": 0.47, "learning_rate": 2.8056823947234903e-07, "logits/chosen": -0.38808757066726685, "logits/rejected": -0.43867096304893494, "logps/chosen": -122.36810302734375, "logps/rejected": -91.27690124511719, "loss": 0.2752, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.4875054359436035, "rewards/margins": 3.586559295654297, "rewards/rejected": -0.09905393421649933, "step": 1040 }, { "epoch": 0.48, "learning_rate": 2.800608828006088e-07, "logits/chosen": -0.38201209902763367, "logits/rejected": -0.4219762682914734, "logps/chosen": -112.345458984375, "logps/rejected": -93.32908630371094, "loss": 0.2762, "rewards/accuracies": 0.8125, "rewards/chosen": 3.233158826828003, "rewards/margins": 2.842348575592041, "rewards/rejected": 0.3908103108406067, "step": 1050 }, { "epoch": 0.48, "learning_rate": 2.7955352612886856e-07, "logits/chosen": -0.36564648151397705, "logits/rejected": -0.4152706563472748, "logps/chosen": -115.9142837524414, "logps/rejected": -94.08984375, "loss": 0.3157, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.917323589324951, "rewards/margins": 2.925326108932495, "rewards/rejected": -0.008002638816833496, "step": 1060 }, { "epoch": 0.49, "learning_rate": 2.7904616945712836e-07, "logits/chosen": -0.38626644015312195, "logits/rejected": -0.4261588156223297, "logps/chosen": -118.94515228271484, "logps/rejected": -91.15392303466797, "loss": 0.2888, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 2.740856647491455, "rewards/margins": 2.7697622776031494, "rewards/rejected": -0.028905678540468216, "step": 1070 }, { "epoch": 0.49, "learning_rate": 2.785388127853881e-07, "logits/chosen": -0.3952401578426361, "logits/rejected": -0.43863511085510254, "logps/chosen": -118.4677734375, "logps/rejected": -85.22468566894531, "loss": 0.3047, "rewards/accuracies": 0.8125, "rewards/chosen": 3.0166592597961426, "rewards/margins": 2.933008909225464, "rewards/rejected": 0.08365003764629364, "step": 1080 }, { "epoch": 0.5, "learning_rate": 2.780314561136479e-07, "logits/chosen": -0.39297205209732056, "logits/rejected": -0.439324289560318, "logps/chosen": -115.67972564697266, "logps/rejected": -93.59664916992188, "loss": 0.3381, "rewards/accuracies": 0.875, "rewards/chosen": 2.8857316970825195, "rewards/margins": 2.8312604427337646, "rewards/rejected": 0.05447094514966011, "step": 1090 }, { "epoch": 0.5, "learning_rate": 2.7752409944190763e-07, "logits/chosen": -0.39699214696884155, "logits/rejected": -0.4308001399040222, "logps/chosen": -117.69087219238281, "logps/rejected": -93.83064270019531, "loss": 0.2765, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.924699068069458, "rewards/margins": 2.9215705394744873, "rewards/rejected": 0.003128147218376398, "step": 1100 }, { "epoch": 0.5, "eval_logits/chosen": -0.421588659286499, "eval_logits/rejected": -0.461477130651474, "eval_logps/chosen": -116.91805267333984, "eval_logps/rejected": -90.63465118408203, "eval_loss": 0.29390230774879456, "eval_rewards/accuracies": 0.8603351712226868, "eval_rewards/chosen": 3.1208434104919434, "eval_rewards/margins": 2.973330020904541, "eval_rewards/rejected": 0.1475135087966919, "eval_runtime": 877.9945, "eval_samples_per_second": 3.26, "eval_steps_per_second": 0.204, "step": 1100 }, { "epoch": 0.51, "learning_rate": 2.770167427701674e-07, "logits/chosen": -0.3756471872329712, "logits/rejected": -0.428661584854126, "logps/chosen": -124.7108154296875, "logps/rejected": -93.08118438720703, "loss": 0.3038, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.762239933013916, "rewards/margins": 3.627725601196289, "rewards/rejected": 0.13451430201530457, "step": 1110 }, { "epoch": 0.51, "learning_rate": 2.7650938609842716e-07, "logits/chosen": -0.36341890692710876, "logits/rejected": -0.4196922183036804, "logps/chosen": -120.17652893066406, "logps/rejected": -88.23072814941406, "loss": 0.2855, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.30224871635437, "rewards/margins": 2.936652898788452, "rewards/rejected": 0.3655957281589508, "step": 1120 }, { "epoch": 0.52, "learning_rate": 2.7600202942668696e-07, "logits/chosen": -0.3739013969898224, "logits/rejected": -0.4190409779548645, "logps/chosen": -124.47865295410156, "logps/rejected": -91.2358169555664, "loss": 0.2698, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.268986225128174, "rewards/margins": 3.0331108570098877, "rewards/rejected": 0.2358749359846115, "step": 1130 }, { "epoch": 0.52, "learning_rate": 2.754946727549467e-07, "logits/chosen": -0.40065139532089233, "logits/rejected": -0.43831387162208557, "logps/chosen": -121.75138854980469, "logps/rejected": -98.39310455322266, "loss": 0.2537, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.195882558822632, "rewards/margins": 3.1010377407073975, "rewards/rejected": 0.09484489262104034, "step": 1140 }, { "epoch": 0.52, "learning_rate": 2.749873160832065e-07, "logits/chosen": -0.3941956162452698, "logits/rejected": -0.4352952539920807, "logps/chosen": -116.66845703125, "logps/rejected": -88.73519897460938, "loss": 0.2587, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.9066343307495117, "rewards/margins": 2.9923722743988037, "rewards/rejected": -0.08573801815509796, "step": 1150 }, { "epoch": 0.53, "learning_rate": 2.7447995941146623e-07, "logits/chosen": -0.3892219662666321, "logits/rejected": -0.42604565620422363, "logps/chosen": -123.84007263183594, "logps/rejected": -92.99417877197266, "loss": 0.2535, "rewards/accuracies": 0.875, "rewards/chosen": 3.3677380084991455, "rewards/margins": 3.233222484588623, "rewards/rejected": 0.13451552391052246, "step": 1160 }, { "epoch": 0.53, "learning_rate": 2.73972602739726e-07, "logits/chosen": -0.4173711836338043, "logits/rejected": -0.44396501779556274, "logps/chosen": -115.49979400634766, "logps/rejected": -95.75325775146484, "loss": 0.3049, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.100471258163452, "rewards/margins": 2.7626137733459473, "rewards/rejected": 0.3378572463989258, "step": 1170 }, { "epoch": 0.54, "learning_rate": 2.7346524606798576e-07, "logits/chosen": -0.3970886170864105, "logits/rejected": -0.43107110261917114, "logps/chosen": -119.1224365234375, "logps/rejected": -94.46098327636719, "loss": 0.2798, "rewards/accuracies": 0.875, "rewards/chosen": 2.862344264984131, "rewards/margins": 3.4456276893615723, "rewards/rejected": -0.5832830667495728, "step": 1180 }, { "epoch": 0.54, "learning_rate": 2.7295788939624556e-07, "logits/chosen": -0.41212111711502075, "logits/rejected": -0.4597795903682709, "logps/chosen": -122.7989273071289, "logps/rejected": -91.87464904785156, "loss": 0.2644, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 3.475785732269287, "rewards/margins": 3.3043880462646484, "rewards/rejected": 0.17139773070812225, "step": 1190 }, { "epoch": 0.55, "learning_rate": 2.724505327245053e-07, "logits/chosen": -0.4002062678337097, "logits/rejected": -0.44224053621292114, "logps/chosen": -122.39054870605469, "logps/rejected": -93.33247375488281, "loss": 0.3058, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 3.027909755706787, "rewards/margins": 3.0626347064971924, "rewards/rejected": -0.0347248800098896, "step": 1200 }, { "epoch": 0.55, "eval_logits/chosen": -0.38868534564971924, "eval_logits/rejected": -0.42488640546798706, "eval_logps/chosen": -117.1875, "eval_logps/rejected": -91.20378875732422, "eval_loss": 0.2772454023361206, "eval_rewards/accuracies": 0.8770949840545654, "eval_rewards/chosen": 2.9861292839050293, "eval_rewards/margins": 3.123185396194458, "eval_rewards/rejected": -0.13705596327781677, "eval_runtime": 857.7682, "eval_samples_per_second": 3.337, "eval_steps_per_second": 0.209, "step": 1200 }, { "epoch": 0.55, "learning_rate": 2.719431760527651e-07, "logits/chosen": -0.3781608045101166, "logits/rejected": -0.4132018983364105, "logps/chosen": -113.97611236572266, "logps/rejected": -89.44263458251953, "loss": 0.2443, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.1568641662597656, "rewards/margins": 2.9170334339141846, "rewards/rejected": 0.2398306131362915, "step": 1210 }, { "epoch": 0.56, "learning_rate": 2.7143581938102483e-07, "logits/chosen": -0.40766286849975586, "logits/rejected": -0.4394180178642273, "logps/chosen": -126.20048522949219, "logps/rejected": -91.31429290771484, "loss": 0.2705, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.362502336502075, "rewards/margins": 3.2503905296325684, "rewards/rejected": 0.11211173236370087, "step": 1220 }, { "epoch": 0.56, "learning_rate": 2.709284627092846e-07, "logits/chosen": -0.40174245834350586, "logits/rejected": -0.44669684767723083, "logps/chosen": -119.60395812988281, "logps/rejected": -94.1490707397461, "loss": 0.2448, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.1044528484344482, "rewards/margins": 3.6764514446258545, "rewards/rejected": -0.5719983577728271, "step": 1230 }, { "epoch": 0.57, "learning_rate": 2.7042110603754436e-07, "logits/chosen": -0.4040297865867615, "logits/rejected": -0.448894739151001, "logps/chosen": -125.84278869628906, "logps/rejected": -96.10501861572266, "loss": 0.2429, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 3.768805742263794, "rewards/margins": 3.6327590942382812, "rewards/rejected": 0.13604608178138733, "step": 1240 }, { "epoch": 0.57, "learning_rate": 2.6991374936580416e-07, "logits/chosen": -0.3972640931606293, "logits/rejected": -0.4390404224395752, "logps/chosen": -121.781494140625, "logps/rejected": -97.40977478027344, "loss": 0.2876, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.0396571159362793, "rewards/margins": 3.047499179840088, "rewards/rejected": -0.007842063903808594, "step": 1250 }, { "epoch": 0.58, "learning_rate": 2.694063926940639e-07, "logits/chosen": -0.3991449177265167, "logits/rejected": -0.4396139681339264, "logps/chosen": -128.02182006835938, "logps/rejected": -90.22634887695312, "loss": 0.2382, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.1005046367645264, "rewards/margins": 3.1185288429260254, "rewards/rejected": -0.018024539574980736, "step": 1260 }, { "epoch": 0.58, "learning_rate": 2.688990360223237e-07, "logits/chosen": -0.3941733241081238, "logits/rejected": -0.4330524504184723, "logps/chosen": -116.48948669433594, "logps/rejected": -93.32965850830078, "loss": 0.2589, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.3650689125061035, "rewards/margins": 3.3145668506622314, "rewards/rejected": 0.05050189420580864, "step": 1270 }, { "epoch": 0.58, "learning_rate": 2.6839167935058343e-07, "logits/chosen": -0.36839810013771057, "logits/rejected": -0.409420907497406, "logps/chosen": -114.0684814453125, "logps/rejected": -95.41332244873047, "loss": 0.2674, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 3.460172176361084, "rewards/margins": 3.2212772369384766, "rewards/rejected": 0.2388952225446701, "step": 1280 }, { "epoch": 0.59, "learning_rate": 2.678843226788432e-07, "logits/chosen": -0.40016645193099976, "logits/rejected": -0.43591636419296265, "logps/chosen": -116.75439453125, "logps/rejected": -94.6172103881836, "loss": 0.2954, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.891127109527588, "rewards/margins": 3.583353042602539, "rewards/rejected": 0.3077741265296936, "step": 1290 }, { "epoch": 0.59, "learning_rate": 2.6737696600710296e-07, "logits/chosen": -0.40336519479751587, "logits/rejected": -0.45011377334594727, "logps/chosen": -122.32728576660156, "logps/rejected": -92.63130950927734, "loss": 0.2702, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 4.126340866088867, "rewards/margins": 4.417415618896484, "rewards/rejected": -0.2910749316215515, "step": 1300 }, { "epoch": 0.59, "eval_logits/chosen": -0.41128745675086975, "eval_logits/rejected": -0.44969016313552856, "eval_logps/chosen": -116.51630401611328, "eval_logps/rejected": -91.05738830566406, "eval_loss": 0.2592301368713379, "eval_rewards/accuracies": 0.8715083599090576, "eval_rewards/chosen": 3.3217227458953857, "eval_rewards/margins": 3.3855812549591064, "eval_rewards/rejected": -0.06385818123817444, "eval_runtime": 913.966, "eval_samples_per_second": 3.131, "eval_steps_per_second": 0.196, "step": 1300 }, { "epoch": 0.6, "learning_rate": 2.6686960933536276e-07, "logits/chosen": -0.4039461612701416, "logits/rejected": -0.43267756700515747, "logps/chosen": -115.5996322631836, "logps/rejected": -95.9083023071289, "loss": 0.2591, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.894240617752075, "rewards/margins": 2.9390251636505127, "rewards/rejected": -0.04478471353650093, "step": 1310 }, { "epoch": 0.6, "learning_rate": 2.663622526636225e-07, "logits/chosen": -0.39787545800209045, "logits/rejected": -0.4207298755645752, "logps/chosen": -112.7799301147461, "logps/rejected": -93.30327606201172, "loss": 0.2804, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.9737071990966797, "rewards/margins": 2.6691317558288574, "rewards/rejected": 0.3045758605003357, "step": 1320 }, { "epoch": 0.61, "learning_rate": 2.658548959918823e-07, "logits/chosen": -0.36739581823349, "logits/rejected": -0.4091414511203766, "logps/chosen": -130.01556396484375, "logps/rejected": -96.4112548828125, "loss": 0.2588, "rewards/accuracies": 0.875, "rewards/chosen": 3.783113956451416, "rewards/margins": 3.4915924072265625, "rewards/rejected": 0.2915215492248535, "step": 1330 }, { "epoch": 0.61, "learning_rate": 2.6534753932014203e-07, "logits/chosen": -0.37962180376052856, "logits/rejected": -0.4211583733558655, "logps/chosen": -114.8764419555664, "logps/rejected": -93.41658020019531, "loss": 0.2598, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.3292477130889893, "rewards/margins": 3.3758292198181152, "rewards/rejected": -0.04658186435699463, "step": 1340 }, { "epoch": 0.62, "learning_rate": 2.648401826484018e-07, "logits/chosen": -0.3829995393753052, "logits/rejected": -0.43469125032424927, "logps/chosen": -126.12850189208984, "logps/rejected": -92.5066146850586, "loss": 0.251, "rewards/accuracies": 0.9375, "rewards/chosen": 4.180144786834717, "rewards/margins": 4.059059143066406, "rewards/rejected": 0.12108540534973145, "step": 1350 }, { "epoch": 0.62, "learning_rate": 2.6433282597666156e-07, "logits/chosen": -0.39498692750930786, "logits/rejected": -0.4442169666290283, "logps/chosen": -124.14341735839844, "logps/rejected": -90.19200134277344, "loss": 0.2411, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 4.100016117095947, "rewards/margins": 4.147299289703369, "rewards/rejected": -0.04728341102600098, "step": 1360 }, { "epoch": 0.63, "learning_rate": 2.6382546930492135e-07, "logits/chosen": -0.4066733419895172, "logits/rejected": -0.44404226541519165, "logps/chosen": -115.19721984863281, "logps/rejected": -93.01029968261719, "loss": 0.2586, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.18514084815979, "rewards/margins": 3.501539707183838, "rewards/rejected": -0.3163986802101135, "step": 1370 }, { "epoch": 0.63, "learning_rate": 2.633181126331811e-07, "logits/chosen": -0.38769611716270447, "logits/rejected": -0.4122231900691986, "logps/chosen": -112.17032623291016, "logps/rejected": -91.19657135009766, "loss": 0.2633, "rewards/accuracies": 0.8125, "rewards/chosen": 2.9161415100097656, "rewards/margins": 3.0066184997558594, "rewards/rejected": -0.09047670662403107, "step": 1380 }, { "epoch": 0.63, "learning_rate": 2.628107559614409e-07, "logits/chosen": -0.38696232438087463, "logits/rejected": -0.4295097291469574, "logps/chosen": -122.4570541381836, "logps/rejected": -92.13167572021484, "loss": 0.2477, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.494009494781494, "rewards/margins": 3.7299437522888184, "rewards/rejected": -0.23593413829803467, "step": 1390 }, { "epoch": 0.64, "learning_rate": 2.6230339928970063e-07, "logits/chosen": -0.42557573318481445, "logits/rejected": -0.46044450998306274, "logps/chosen": -113.8060073852539, "logps/rejected": -92.50498962402344, "loss": 0.2316, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.077052354812622, "rewards/margins": 3.4654159545898438, "rewards/rejected": -0.3883635103702545, "step": 1400 }, { "epoch": 0.64, "eval_logits/chosen": -0.3868881165981293, "eval_logits/rejected": -0.42342227697372437, "eval_logps/chosen": -116.44770050048828, "eval_logps/rejected": -91.5164794921875, "eval_loss": 0.24913275241851807, "eval_rewards/accuracies": 0.8854748606681824, "eval_rewards/chosen": 3.3560283184051514, "eval_rewards/margins": 3.649433135986328, "eval_rewards/rejected": -0.2934047281742096, "eval_runtime": 827.5196, "eval_samples_per_second": 3.459, "eval_steps_per_second": 0.216, "step": 1400 }, { "epoch": 0.64, "learning_rate": 2.617960426179604e-07, "logits/chosen": -0.3558892607688904, "logits/rejected": -0.3981640934944153, "logps/chosen": -122.8342514038086, "logps/rejected": -95.00323486328125, "loss": 0.2715, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.5448246002197266, "rewards/margins": 3.403717041015625, "rewards/rejected": 0.14110735058784485, "step": 1410 }, { "epoch": 0.65, "learning_rate": 2.6128868594622016e-07, "logits/chosen": -0.3855481445789337, "logits/rejected": -0.42138853669166565, "logps/chosen": -114.02071380615234, "logps/rejected": -91.90477752685547, "loss": 0.2708, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.9640955924987793, "rewards/margins": 2.885097026824951, "rewards/rejected": 0.07899872958660126, "step": 1420 }, { "epoch": 0.65, "learning_rate": 2.6078132927447995e-07, "logits/chosen": -0.3998476564884186, "logits/rejected": -0.44831499457359314, "logps/chosen": -115.67866516113281, "logps/rejected": -90.3123779296875, "loss": 0.2185, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.520200729370117, "rewards/margins": 3.7672977447509766, "rewards/rejected": -0.2470969408750534, "step": 1430 }, { "epoch": 0.66, "learning_rate": 2.602739726027397e-07, "logits/chosen": -0.3800181746482849, "logits/rejected": -0.42868027091026306, "logps/chosen": -119.03788757324219, "logps/rejected": -92.73046112060547, "loss": 0.2751, "rewards/accuracies": 0.875, "rewards/chosen": 2.975376844406128, "rewards/margins": 3.5816245079040527, "rewards/rejected": -0.6062482595443726, "step": 1440 }, { "epoch": 0.66, "learning_rate": 2.597666159309995e-07, "logits/chosen": -0.4055546820163727, "logits/rejected": -0.44052910804748535, "logps/chosen": -119.14704895019531, "logps/rejected": -92.04376220703125, "loss": 0.2639, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.3340554237365723, "rewards/margins": 3.4932751655578613, "rewards/rejected": -0.15921981632709503, "step": 1450 }, { "epoch": 0.67, "learning_rate": 2.5925925925925923e-07, "logits/chosen": -0.3804454803466797, "logits/rejected": -0.4156479835510254, "logps/chosen": -118.11918640136719, "logps/rejected": -90.77540588378906, "loss": 0.2421, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.448434352874756, "rewards/margins": 3.3643860816955566, "rewards/rejected": 0.08404884487390518, "step": 1460 }, { "epoch": 0.67, "learning_rate": 2.58751902587519e-07, "logits/chosen": -0.36997154355049133, "logits/rejected": -0.42205578088760376, "logps/chosen": -124.9430160522461, "logps/rejected": -92.24369812011719, "loss": 0.2817, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.5387015342712402, "rewards/margins": 4.0014214515686035, "rewards/rejected": -0.4627200961112976, "step": 1470 }, { "epoch": 0.68, "learning_rate": 2.5824454591577876e-07, "logits/chosen": -0.38390421867370605, "logits/rejected": -0.4294559359550476, "logps/chosen": -117.3547592163086, "logps/rejected": -87.96221160888672, "loss": 0.2619, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.335505723953247, "rewards/margins": 3.4297802448272705, "rewards/rejected": -0.09427466243505478, "step": 1480 }, { "epoch": 0.68, "learning_rate": 2.5773718924403855e-07, "logits/chosen": -0.38687005639076233, "logits/rejected": -0.42344173789024353, "logps/chosen": -115.8824462890625, "logps/rejected": -90.14027404785156, "loss": 0.2513, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5795180797576904, "rewards/margins": 3.676868438720703, "rewards/rejected": -0.09735004603862762, "step": 1490 }, { "epoch": 0.68, "learning_rate": 2.572298325722983e-07, "logits/chosen": -0.4296097159385681, "logits/rejected": -0.4675825536251068, "logps/chosen": -124.1937484741211, "logps/rejected": -94.04402160644531, "loss": 0.2344, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.2074170112609863, "rewards/margins": 3.9851672649383545, "rewards/rejected": -0.7777503728866577, "step": 1500 }, { "epoch": 0.68, "eval_logits/chosen": -0.4151133894920349, "eval_logits/rejected": -0.45148965716362, "eval_logps/chosen": -116.71522521972656, "eval_logps/rejected": -91.37801361083984, "eval_loss": 0.2505827844142914, "eval_rewards/accuracies": 0.8687151074409485, "eval_rewards/chosen": 3.222259044647217, "eval_rewards/margins": 3.446425676345825, "eval_rewards/rejected": -0.2241670787334442, "eval_runtime": 916.2433, "eval_samples_per_second": 3.124, "eval_steps_per_second": 0.195, "step": 1500 }, { "epoch": 0.69, "learning_rate": 2.567224759005581e-07, "logits/chosen": -0.4068973660469055, "logits/rejected": -0.44249239563941956, "logps/chosen": -121.63665771484375, "logps/rejected": -89.6568603515625, "loss": 0.2302, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.7271816730499268, "rewards/margins": 4.20591926574707, "rewards/rejected": -0.4787377715110779, "step": 1510 }, { "epoch": 0.69, "learning_rate": 2.5621511922881783e-07, "logits/chosen": -0.3942483365535736, "logits/rejected": -0.4460867941379547, "logps/chosen": -119.01606750488281, "logps/rejected": -93.5682601928711, "loss": 0.2069, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.9261646270751953, "rewards/margins": 4.379087924957275, "rewards/rejected": -0.45292338728904724, "step": 1520 }, { "epoch": 0.7, "learning_rate": 2.557077625570776e-07, "logits/chosen": -0.39618119597435, "logits/rejected": -0.4257192611694336, "logps/chosen": -113.54156494140625, "logps/rejected": -94.97087097167969, "loss": 0.2673, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.967313289642334, "rewards/margins": 3.0766537189483643, "rewards/rejected": -0.10934066772460938, "step": 1530 }, { "epoch": 0.7, "learning_rate": 2.5520040588533736e-07, "logits/chosen": -0.39150649309158325, "logits/rejected": -0.4350226819515228, "logps/chosen": -123.06941986083984, "logps/rejected": -90.06202697753906, "loss": 0.2895, "rewards/accuracies": 0.9375, "rewards/chosen": 3.420095920562744, "rewards/margins": 3.699439287185669, "rewards/rejected": -0.27934330701828003, "step": 1540 }, { "epoch": 0.71, "learning_rate": 2.5469304921359715e-07, "logits/chosen": -0.3911879062652588, "logits/rejected": -0.4290149211883545, "logps/chosen": -116.4568099975586, "logps/rejected": -95.0242691040039, "loss": 0.2433, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3553338050842285, "rewards/margins": 3.6205520629882812, "rewards/rejected": -0.26521843671798706, "step": 1550 }, { "epoch": 0.71, "learning_rate": 2.541856925418569e-07, "logits/chosen": -0.4253864288330078, "logits/rejected": -0.4458925127983093, "logps/chosen": -116.5554428100586, "logps/rejected": -93.46299743652344, "loss": 0.2617, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 3.091580867767334, "rewards/margins": 3.3575541973114014, "rewards/rejected": -0.2659732699394226, "step": 1560 }, { "epoch": 0.72, "learning_rate": 2.536783358701167e-07, "logits/chosen": -0.40082424879074097, "logits/rejected": -0.43754005432128906, "logps/chosen": -120.23848724365234, "logps/rejected": -91.8831558227539, "loss": 0.269, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.5315544605255127, "rewards/margins": 3.7128093242645264, "rewards/rejected": -0.1812548190355301, "step": 1570 }, { "epoch": 0.72, "learning_rate": 2.5317097919837643e-07, "logits/chosen": -0.4118991792201996, "logits/rejected": -0.4498567581176758, "logps/chosen": -116.05540466308594, "logps/rejected": -98.47431945800781, "loss": 0.2344, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.790273904800415, "rewards/margins": 3.317415237426758, "rewards/rejected": -0.5271413922309875, "step": 1580 }, { "epoch": 0.73, "learning_rate": 2.526636225266362e-07, "logits/chosen": -0.37646594643592834, "logits/rejected": -0.42422208189964294, "logps/chosen": -117.7269515991211, "logps/rejected": -96.44020080566406, "loss": 0.2108, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.832566738128662, "rewards/margins": 4.462157726287842, "rewards/rejected": -0.6295905113220215, "step": 1590 }, { "epoch": 0.73, "learning_rate": 2.5215626585489596e-07, "logits/chosen": -0.3981241285800934, "logits/rejected": -0.43437424302101135, "logps/chosen": -109.82353210449219, "logps/rejected": -89.00421142578125, "loss": 0.2332, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4090182781219482, "rewards/margins": 3.632111072540283, "rewards/rejected": -0.22309276461601257, "step": 1600 }, { "epoch": 0.73, "eval_logits/chosen": -0.3935575485229492, "eval_logits/rejected": -0.4299449324607849, "eval_logps/chosen": -116.73237609863281, "eval_logps/rejected": -91.7436294555664, "eval_loss": 0.2350289225578308, "eval_rewards/accuracies": 0.8854748606681824, "eval_rewards/chosen": 3.2136857509613037, "eval_rewards/margins": 3.6206626892089844, "eval_rewards/rejected": -0.4069768786430359, "eval_runtime": 902.2925, "eval_samples_per_second": 3.172, "eval_steps_per_second": 0.198, "step": 1600 }, { "epoch": 0.73, "learning_rate": 2.5164890918315575e-07, "logits/chosen": -0.41735535860061646, "logits/rejected": -0.44540077447891235, "logps/chosen": -116.6932144165039, "logps/rejected": -92.57784271240234, "loss": 0.2426, "rewards/accuracies": 0.875, "rewards/chosen": 2.7736692428588867, "rewards/margins": 3.2737278938293457, "rewards/rejected": -0.5000584125518799, "step": 1610 }, { "epoch": 0.74, "learning_rate": 2.511415525114155e-07, "logits/chosen": -0.37435075640678406, "logits/rejected": -0.41276389360427856, "logps/chosen": -115.96415710449219, "logps/rejected": -92.0928726196289, "loss": 0.2427, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.167051315307617, "rewards/margins": 3.515188217163086, "rewards/rejected": -0.3481367230415344, "step": 1620 }, { "epoch": 0.74, "learning_rate": 2.506341958396753e-07, "logits/chosen": -0.3907029628753662, "logits/rejected": -0.4280622601509094, "logps/chosen": -120.83988952636719, "logps/rejected": -92.36735534667969, "loss": 0.2151, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.1539828777313232, "rewards/margins": 4.034056186676025, "rewards/rejected": -0.880073070526123, "step": 1630 }, { "epoch": 0.75, "learning_rate": 2.5012683916793503e-07, "logits/chosen": -0.3848786950111389, "logits/rejected": -0.4281742572784424, "logps/chosen": -126.47706604003906, "logps/rejected": -93.86870574951172, "loss": 0.2415, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9007632732391357, "rewards/margins": 4.612415313720703, "rewards/rejected": -0.7116523385047913, "step": 1640 }, { "epoch": 0.75, "learning_rate": 2.496194824961948e-07, "logits/chosen": -0.42201462388038635, "logits/rejected": -0.44740739464759827, "logps/chosen": -116.1539535522461, "logps/rejected": -93.58564758300781, "loss": 0.2175, "rewards/accuracies": 0.9375, "rewards/chosen": 3.071413516998291, "rewards/margins": 3.6266441345214844, "rewards/rejected": -0.5552308559417725, "step": 1650 }, { "epoch": 0.76, "learning_rate": 2.4911212582445456e-07, "logits/chosen": -0.3866708278656006, "logits/rejected": -0.4200906753540039, "logps/chosen": -115.0282211303711, "logps/rejected": -95.74185180664062, "loss": 0.2136, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.5832972526550293, "rewards/margins": 3.8258235454559326, "rewards/rejected": -0.24252581596374512, "step": 1660 }, { "epoch": 0.76, "learning_rate": 2.4860476915271435e-07, "logits/chosen": -0.39738452434539795, "logits/rejected": -0.43101343512535095, "logps/chosen": -118.44441223144531, "logps/rejected": -95.55250549316406, "loss": 0.2669, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.5531890392303467, "rewards/margins": 3.563720226287842, "rewards/rejected": -0.010531162843108177, "step": 1670 }, { "epoch": 0.77, "learning_rate": 2.480974124809741e-07, "logits/chosen": -0.3905322253704071, "logits/rejected": -0.4394722580909729, "logps/chosen": -127.36441802978516, "logps/rejected": -97.71236419677734, "loss": 0.2488, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.985499858856201, "rewards/margins": 4.582793712615967, "rewards/rejected": -0.5972935557365417, "step": 1680 }, { "epoch": 0.77, "learning_rate": 2.475900558092339e-07, "logits/chosen": -0.36849355697631836, "logits/rejected": -0.40935856103897095, "logps/chosen": -119.0000991821289, "logps/rejected": -89.49138641357422, "loss": 0.2409, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.6050148010253906, "rewards/margins": 4.3091864585876465, "rewards/rejected": -0.7041717767715454, "step": 1690 }, { "epoch": 0.78, "learning_rate": 2.4708269913749363e-07, "logits/chosen": -0.37423986196517944, "logits/rejected": -0.4061339497566223, "logps/chosen": -122.22186279296875, "logps/rejected": -95.46275329589844, "loss": 0.2258, "rewards/accuracies": 0.9375, "rewards/chosen": 3.297147274017334, "rewards/margins": 4.038428783416748, "rewards/rejected": -0.7412816882133484, "step": 1700 }, { "epoch": 0.78, "eval_logits/chosen": -0.3959788680076599, "eval_logits/rejected": -0.43158969283103943, "eval_logps/chosen": -116.98088073730469, "eval_logps/rejected": -92.04763793945312, "eval_loss": 0.24770790338516235, "eval_rewards/accuracies": 0.8938547372817993, "eval_rewards/chosen": 3.0894315242767334, "eval_rewards/margins": 3.6484169960021973, "eval_rewards/rejected": -0.5589855313301086, "eval_runtime": 874.9831, "eval_samples_per_second": 3.271, "eval_steps_per_second": 0.205, "step": 1700 }, { "epoch": 0.78, "learning_rate": 2.465753424657534e-07, "logits/chosen": -0.4236866533756256, "logits/rejected": -0.46107596158981323, "logps/chosen": -116.57310485839844, "logps/rejected": -92.78831481933594, "loss": 0.2263, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.3795197010040283, "rewards/margins": 4.285675525665283, "rewards/rejected": -0.9061552286148071, "step": 1710 }, { "epoch": 0.79, "learning_rate": 2.4606798579401316e-07, "logits/chosen": -0.3939421474933624, "logits/rejected": -0.4401417374610901, "logps/chosen": -123.36474609375, "logps/rejected": -96.49834442138672, "loss": 0.2373, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.518620014190674, "rewards/margins": 4.276294708251953, "rewards/rejected": -0.7576751708984375, "step": 1720 }, { "epoch": 0.79, "learning_rate": 2.4556062912227295e-07, "logits/chosen": -0.4078288674354553, "logits/rejected": -0.4479657709598541, "logps/chosen": -120.38444519042969, "logps/rejected": -90.84424591064453, "loss": 0.1966, "rewards/accuracies": 0.875, "rewards/chosen": 2.933053970336914, "rewards/margins": 3.6727993488311768, "rewards/rejected": -0.739745020866394, "step": 1730 }, { "epoch": 0.79, "learning_rate": 2.450532724505327e-07, "logits/chosen": -0.4132927358150482, "logits/rejected": -0.4574710428714752, "logps/chosen": -123.63642883300781, "logps/rejected": -93.76477813720703, "loss": 0.2602, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.631747007369995, "rewards/margins": 4.368203639984131, "rewards/rejected": -0.736456573009491, "step": 1740 }, { "epoch": 0.8, "learning_rate": 2.445459157787925e-07, "logits/chosen": -0.39394044876098633, "logits/rejected": -0.43226075172424316, "logps/chosen": -117.86421966552734, "logps/rejected": -90.55984497070312, "loss": 0.243, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.801774263381958, "rewards/margins": 4.736522674560547, "rewards/rejected": -0.9347483515739441, "step": 1750 }, { "epoch": 0.8, "learning_rate": 2.4403855910705223e-07, "logits/chosen": -0.4240695536136627, "logits/rejected": -0.45410633087158203, "logps/chosen": -115.75899505615234, "logps/rejected": -93.46293640136719, "loss": 0.2529, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 3.036893129348755, "rewards/margins": 3.4991378784179688, "rewards/rejected": -0.46224460005760193, "step": 1760 }, { "epoch": 0.81, "learning_rate": 2.43531202435312e-07, "logits/chosen": -0.4136069715023041, "logits/rejected": -0.4493675231933594, "logps/chosen": -124.36299133300781, "logps/rejected": -90.94700622558594, "loss": 0.2342, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.2333157062530518, "rewards/margins": 3.8581371307373047, "rewards/rejected": -0.6248215436935425, "step": 1770 }, { "epoch": 0.81, "learning_rate": 2.4302384576357176e-07, "logits/chosen": -0.40225839614868164, "logits/rejected": -0.4295947551727295, "logps/chosen": -123.95500183105469, "logps/rejected": -103.24928283691406, "loss": 0.2232, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.355786085128784, "rewards/margins": 3.5081162452697754, "rewards/rejected": -1.1523301601409912, "step": 1780 }, { "epoch": 0.82, "learning_rate": 2.4251648909183155e-07, "logits/chosen": -0.4294680953025818, "logits/rejected": -0.4623151421546936, "logps/chosen": -121.99348449707031, "logps/rejected": -96.97917175292969, "loss": 0.2182, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.0072503089904785, "rewards/margins": 4.567361354827881, "rewards/rejected": -0.5601118206977844, "step": 1790 }, { "epoch": 0.82, "learning_rate": 2.420091324200913e-07, "logits/chosen": -0.42369261384010315, "logits/rejected": -0.45343533158302307, "logps/chosen": -121.0447006225586, "logps/rejected": -98.198486328125, "loss": 0.2526, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.7009620666503906, "rewards/margins": 3.142019510269165, "rewards/rejected": -0.44105762243270874, "step": 1800 }, { "epoch": 0.82, "eval_logits/chosen": -0.4075545072555542, "eval_logits/rejected": -0.4420177936553955, "eval_logps/chosen": -116.59071350097656, "eval_logps/rejected": -92.0351333618164, "eval_loss": 0.22767914831638336, "eval_rewards/accuracies": 0.8770949840545654, "eval_rewards/chosen": 3.2845191955566406, "eval_rewards/margins": 3.8372511863708496, "eval_rewards/rejected": -0.5527323484420776, "eval_runtime": 879.6455, "eval_samples_per_second": 3.254, "eval_steps_per_second": 0.203, "step": 1800 }, { "epoch": 0.83, "learning_rate": 2.415017757483511e-07, "logits/chosen": -0.3963635563850403, "logits/rejected": -0.4376469552516937, "logps/chosen": -120.19471740722656, "logps/rejected": -97.24011993408203, "loss": 0.219, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.7244651317596436, "rewards/margins": 4.514935493469238, "rewards/rejected": -0.7904707193374634, "step": 1810 }, { "epoch": 0.83, "learning_rate": 2.409944190766108e-07, "logits/chosen": -0.43789929151535034, "logits/rejected": -0.4696916937828064, "logps/chosen": -121.25593566894531, "logps/rejected": -96.01130676269531, "loss": 0.2236, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.057839870452881, "rewards/margins": 3.6217963695526123, "rewards/rejected": -0.5639564394950867, "step": 1820 }, { "epoch": 0.84, "learning_rate": 2.404870624048706e-07, "logits/chosen": -0.4191747307777405, "logits/rejected": -0.45385512709617615, "logps/chosen": -117.59214782714844, "logps/rejected": -94.40740203857422, "loss": 0.2352, "rewards/accuracies": 0.875, "rewards/chosen": 3.6873555183410645, "rewards/margins": 4.0088911056518555, "rewards/rejected": -0.32153594493865967, "step": 1830 }, { "epoch": 0.84, "learning_rate": 2.3997970573313036e-07, "logits/chosen": -0.38190537691116333, "logits/rejected": -0.4171864986419678, "logps/chosen": -120.65101623535156, "logps/rejected": -91.7802963256836, "loss": 0.2291, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.9036548137664795, "rewards/margins": 3.2020695209503174, "rewards/rejected": -0.2984146475791931, "step": 1840 }, { "epoch": 0.84, "learning_rate": 2.3947234906139015e-07, "logits/chosen": -0.41627272963523865, "logits/rejected": -0.45198947191238403, "logps/chosen": -121.30501556396484, "logps/rejected": -92.88670349121094, "loss": 0.2236, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.955059766769409, "rewards/margins": 4.721311569213867, "rewards/rejected": -0.7662516832351685, "step": 1850 }, { "epoch": 0.85, "learning_rate": 2.389649923896499e-07, "logits/chosen": -0.3740800619125366, "logits/rejected": -0.4138231873512268, "logps/chosen": -114.4834976196289, "logps/rejected": -92.96529388427734, "loss": 0.2118, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1827633380889893, "rewards/margins": 4.073010444641113, "rewards/rejected": -0.8902471661567688, "step": 1860 }, { "epoch": 0.85, "learning_rate": 2.384576357179097e-07, "logits/chosen": -0.41853219270706177, "logits/rejected": -0.4531814455986023, "logps/chosen": -117.73091125488281, "logps/rejected": -94.66040802001953, "loss": 0.2155, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 3.427842617034912, "rewards/margins": 4.3185224533081055, "rewards/rejected": -0.8906799554824829, "step": 1870 }, { "epoch": 0.86, "learning_rate": 2.3795027904616943e-07, "logits/chosen": -0.40005069971084595, "logits/rejected": -0.433000385761261, "logps/chosen": -120.5252685546875, "logps/rejected": -94.85150909423828, "loss": 0.2294, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.2125244140625, "rewards/margins": 4.113189697265625, "rewards/rejected": -0.9006654620170593, "step": 1880 }, { "epoch": 0.86, "learning_rate": 2.374429223744292e-07, "logits/chosen": -0.3957889676094055, "logits/rejected": -0.424477756023407, "logps/chosen": -116.66868591308594, "logps/rejected": -92.9976577758789, "loss": 0.254, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.8800389766693115, "rewards/margins": 3.438807249069214, "rewards/rejected": -0.5587679743766785, "step": 1890 }, { "epoch": 0.87, "learning_rate": 2.3693556570268896e-07, "logits/chosen": -0.38361310958862305, "logits/rejected": -0.41146165132522583, "logps/chosen": -117.5991439819336, "logps/rejected": -99.95137786865234, "loss": 0.2025, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.4426445960998535, "rewards/margins": 4.4284892082214355, "rewards/rejected": -0.9858444333076477, "step": 1900 }, { "epoch": 0.87, "eval_logits/chosen": -0.3973536193370819, "eval_logits/rejected": -0.43190258741378784, "eval_logps/chosen": -116.74763488769531, "eval_logps/rejected": -92.54963684082031, "eval_loss": 0.2182014286518097, "eval_rewards/accuracies": 0.9022346138954163, "eval_rewards/chosen": 3.2060580253601074, "eval_rewards/margins": 4.016037940979004, "eval_rewards/rejected": -0.8099795579910278, "eval_runtime": 875.3985, "eval_samples_per_second": 3.269, "eval_steps_per_second": 0.204, "step": 1900 }, { "epoch": 0.87, "learning_rate": 2.3642820903094873e-07, "logits/chosen": -0.40541213750839233, "logits/rejected": -0.44309115409851074, "logps/chosen": -126.1652603149414, "logps/rejected": -96.5681381225586, "loss": 0.2123, "rewards/accuracies": 0.875, "rewards/chosen": 3.576422929763794, "rewards/margins": 4.267624855041504, "rewards/rejected": -0.6912025809288025, "step": 1910 }, { "epoch": 0.88, "learning_rate": 2.359208523592085e-07, "logits/chosen": -0.39431583881378174, "logits/rejected": -0.42382940649986267, "logps/chosen": -116.77348327636719, "logps/rejected": -91.7231216430664, "loss": 0.2451, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1586005687713623, "rewards/margins": 3.866903305053711, "rewards/rejected": -0.7083033323287964, "step": 1920 }, { "epoch": 0.88, "learning_rate": 2.3541349568746826e-07, "logits/chosen": -0.40828245878219604, "logits/rejected": -0.4404390752315521, "logps/chosen": -118.46229553222656, "logps/rejected": -90.4027328491211, "loss": 0.2174, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.344696044921875, "rewards/margins": 4.355304718017578, "rewards/rejected": -1.0106089115142822, "step": 1930 }, { "epoch": 0.89, "learning_rate": 2.3490613901572803e-07, "logits/chosen": -0.41974538564682007, "logits/rejected": -0.45281344652175903, "logps/chosen": -112.36436462402344, "logps/rejected": -94.63860321044922, "loss": 0.2256, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5663247108459473, "rewards/margins": 3.390653133392334, "rewards/rejected": -0.8243284225463867, "step": 1940 }, { "epoch": 0.89, "learning_rate": 2.343987823439878e-07, "logits/chosen": -0.41871196031570435, "logits/rejected": -0.4626835286617279, "logps/chosen": -118.2319564819336, "logps/rejected": -91.87098693847656, "loss": 0.2033, "rewards/accuracies": 0.875, "rewards/chosen": 3.2902045249938965, "rewards/margins": 4.170986652374268, "rewards/rejected": -0.8807823061943054, "step": 1950 }, { "epoch": 0.89, "learning_rate": 2.3389142567224756e-07, "logits/chosen": -0.41229549050331116, "logits/rejected": -0.44218960404396057, "logps/chosen": -118.4122543334961, "logps/rejected": -99.02762603759766, "loss": 0.2217, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.3712990283966064, "rewards/margins": 4.0574212074279785, "rewards/rejected": -0.6861221194267273, "step": 1960 }, { "epoch": 0.9, "learning_rate": 2.3338406900050733e-07, "logits/chosen": -0.4184509217739105, "logits/rejected": -0.4491788446903229, "logps/chosen": -124.60710144042969, "logps/rejected": -96.12918090820312, "loss": 0.1937, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.895785093307495, "rewards/margins": 4.719235420227051, "rewards/rejected": -0.8234502077102661, "step": 1970 }, { "epoch": 0.9, "learning_rate": 2.328767123287671e-07, "logits/chosen": -0.4528725743293762, "logits/rejected": -0.4842372536659241, "logps/chosen": -124.16194152832031, "logps/rejected": -97.17805480957031, "loss": 0.2338, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.83451771736145, "rewards/margins": 4.723608016967773, "rewards/rejected": -0.8890898823738098, "step": 1980 }, { "epoch": 0.91, "learning_rate": 2.3236935565702686e-07, "logits/chosen": -0.3931097388267517, "logits/rejected": -0.43126392364501953, "logps/chosen": -119.52880859375, "logps/rejected": -94.76363372802734, "loss": 0.2115, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.683422565460205, "rewards/margins": 4.176725387573242, "rewards/rejected": -0.4933028221130371, "step": 1990 }, { "epoch": 0.91, "learning_rate": 2.3186199898528663e-07, "logits/chosen": -0.40987688302993774, "logits/rejected": -0.4448050856590271, "logps/chosen": -117.04490661621094, "logps/rejected": -95.28965759277344, "loss": 0.2253, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.0975899696350098, "rewards/margins": 3.8567593097686768, "rewards/rejected": -0.7591692209243774, "step": 2000 }, { "epoch": 0.91, "eval_logits/chosen": -0.4023212492465973, "eval_logits/rejected": -0.4390886723995209, "eval_logps/chosen": -116.60671997070312, "eval_logps/rejected": -92.88094329833984, "eval_loss": 0.2149447649717331, "eval_rewards/accuracies": 0.9078212380409241, "eval_rewards/chosen": 3.2765114307403564, "eval_rewards/margins": 4.2521491050720215, "eval_rewards/rejected": -0.975637674331665, "eval_runtime": 907.6422, "eval_samples_per_second": 3.153, "eval_steps_per_second": 0.197, "step": 2000 }, { "epoch": 0.92, "learning_rate": 2.313546423135464e-07, "logits/chosen": -0.3986409902572632, "logits/rejected": -0.43848711252212524, "logps/chosen": -123.74125671386719, "logps/rejected": -92.8211669921875, "loss": 0.2196, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.1617798805236816, "rewards/margins": 4.177075386047363, "rewards/rejected": -1.0152956247329712, "step": 2010 }, { "epoch": 0.92, "learning_rate": 2.3084728564180616e-07, "logits/chosen": -0.4132860600948334, "logits/rejected": -0.43701285123825073, "logps/chosen": -115.518798828125, "logps/rejected": -96.49732208251953, "loss": 0.2174, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.845722198486328, "rewards/margins": 3.5684571266174316, "rewards/rejected": -0.7227347493171692, "step": 2020 }, { "epoch": 0.93, "learning_rate": 2.3033992897006593e-07, "logits/chosen": -0.44160446524620056, "logits/rejected": -0.4706133008003235, "logps/chosen": -126.89131164550781, "logps/rejected": -94.20338439941406, "loss": 0.2095, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.952413558959961, "rewards/margins": 5.020051002502441, "rewards/rejected": -1.0676372051239014, "step": 2030 }, { "epoch": 0.93, "learning_rate": 2.298325722983257e-07, "logits/chosen": -0.40945592522621155, "logits/rejected": -0.4388090670108795, "logps/chosen": -124.21551513671875, "logps/rejected": -95.3039321899414, "loss": 0.2448, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4655966758728027, "rewards/margins": 4.477190971374512, "rewards/rejected": -1.011594533920288, "step": 2040 }, { "epoch": 0.94, "learning_rate": 2.2932521562658546e-07, "logits/chosen": -0.41212978959083557, "logits/rejected": -0.4488631784915924, "logps/chosen": -121.19816589355469, "logps/rejected": -92.48617553710938, "loss": 0.2242, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.629365921020508, "rewards/margins": 4.309345245361328, "rewards/rejected": -0.6799794435501099, "step": 2050 }, { "epoch": 0.94, "learning_rate": 2.2881785895484523e-07, "logits/chosen": -0.3852509558200836, "logits/rejected": -0.4215177595615387, "logps/chosen": -122.44758605957031, "logps/rejected": -93.50701141357422, "loss": 0.2282, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.269455671310425, "rewards/margins": 4.175347328186035, "rewards/rejected": -0.9058906435966492, "step": 2060 }, { "epoch": 0.94, "learning_rate": 2.28310502283105e-07, "logits/chosen": -0.39701271057128906, "logits/rejected": -0.42783480882644653, "logps/chosen": -112.2025375366211, "logps/rejected": -91.7884292602539, "loss": 0.2028, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.7778406143188477, "rewards/margins": 4.39522647857666, "rewards/rejected": -1.6173861026763916, "step": 2070 }, { "epoch": 0.95, "learning_rate": 2.2780314561136476e-07, "logits/chosen": -0.4007197916507721, "logits/rejected": -0.4381144940853119, "logps/chosen": -121.38664245605469, "logps/rejected": -91.06873321533203, "loss": 0.1845, "rewards/accuracies": 0.875, "rewards/chosen": 3.398432970046997, "rewards/margins": 4.025904178619385, "rewards/rejected": -0.6274709701538086, "step": 2080 }, { "epoch": 0.95, "learning_rate": 2.2729578893962453e-07, "logits/chosen": -0.3872908651828766, "logits/rejected": -0.42537689208984375, "logps/chosen": -116.7512435913086, "logps/rejected": -93.22492218017578, "loss": 0.1976, "rewards/accuracies": 0.875, "rewards/chosen": 3.5508179664611816, "rewards/margins": 4.1629509925842285, "rewards/rejected": -0.6121331453323364, "step": 2090 }, { "epoch": 0.96, "learning_rate": 2.267884322678843e-07, "logits/chosen": -0.37515270709991455, "logits/rejected": -0.41785183548927307, "logps/chosen": -116.22972106933594, "logps/rejected": -88.31718444824219, "loss": 0.2084, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.261925220489502, "rewards/margins": 3.7819576263427734, "rewards/rejected": -0.5200322866439819, "step": 2100 }, { "epoch": 0.96, "eval_logits/chosen": -0.3953567445278168, "eval_logits/rejected": -0.428348183631897, "eval_logps/chosen": -116.9277114868164, "eval_logps/rejected": -93.06153869628906, "eval_loss": 0.22232365608215332, "eval_rewards/accuracies": 0.8938547372817993, "eval_rewards/chosen": 3.116020679473877, "eval_rewards/margins": 4.181955814361572, "eval_rewards/rejected": -1.0659351348876953, "eval_runtime": 921.8014, "eval_samples_per_second": 3.105, "eval_steps_per_second": 0.194, "step": 2100 }, { "epoch": 0.96, "learning_rate": 2.2628107559614406e-07, "logits/chosen": -0.3972451686859131, "logits/rejected": -0.429997980594635, "logps/chosen": -115.76712799072266, "logps/rejected": -97.56238555908203, "loss": 0.2216, "rewards/accuracies": 0.9375, "rewards/chosen": 3.4144790172576904, "rewards/margins": 3.9687142372131348, "rewards/rejected": -0.5542353987693787, "step": 2110 }, { "epoch": 0.97, "learning_rate": 2.2577371892440383e-07, "logits/chosen": -0.4206092953681946, "logits/rejected": -0.4511509835720062, "logps/chosen": -113.78810119628906, "logps/rejected": -93.98689270019531, "loss": 0.1785, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.7417354583740234, "rewards/margins": 4.476273059844971, "rewards/rejected": -0.734538197517395, "step": 2120 }, { "epoch": 0.97, "learning_rate": 2.252663622526636e-07, "logits/chosen": -0.42873579263687134, "logits/rejected": -0.4546934962272644, "logps/chosen": -120.35272216796875, "logps/rejected": -96.99116516113281, "loss": 0.2561, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1441822052001953, "rewards/margins": 4.1696553230285645, "rewards/rejected": -1.0254729986190796, "step": 2130 }, { "epoch": 0.98, "learning_rate": 2.2475900558092336e-07, "logits/chosen": -0.4174647331237793, "logits/rejected": -0.45524922013282776, "logps/chosen": -113.0732650756836, "logps/rejected": -93.70841979980469, "loss": 0.238, "rewards/accuracies": 0.875, "rewards/chosen": 3.7171530723571777, "rewards/margins": 4.351462364196777, "rewards/rejected": -0.6343096494674683, "step": 2140 }, { "epoch": 0.98, "learning_rate": 2.2425164890918313e-07, "logits/chosen": -0.39059361815452576, "logits/rejected": -0.4388251304626465, "logps/chosen": -131.6301727294922, "logps/rejected": -97.77519989013672, "loss": 0.2028, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.9817802906036377, "rewards/margins": 4.451568603515625, "rewards/rejected": -0.4697890281677246, "step": 2150 }, { "epoch": 0.99, "learning_rate": 2.237442922374429e-07, "logits/chosen": -0.4017879366874695, "logits/rejected": -0.4366172254085541, "logps/chosen": -114.01628112792969, "logps/rejected": -90.86222076416016, "loss": 0.1971, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.2794747352600098, "rewards/margins": 3.6212470531463623, "rewards/rejected": -0.34177258610725403, "step": 2160 }, { "epoch": 0.99, "learning_rate": 2.2323693556570266e-07, "logits/chosen": -0.37799277901649475, "logits/rejected": -0.4189319610595703, "logps/chosen": -124.58580017089844, "logps/rejected": -98.61467742919922, "loss": 0.1885, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.8187057971954346, "rewards/margins": 4.701231956481934, "rewards/rejected": -0.8825258016586304, "step": 2170 }, { "epoch": 1.0, "learning_rate": 2.2272957889396242e-07, "logits/chosen": -0.4155615270137787, "logits/rejected": -0.4478856921195984, "logps/chosen": -118.5948715209961, "logps/rejected": -94.40876770019531, "loss": 0.1998, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.646564483642578, "rewards/margins": 4.50596284866333, "rewards/rejected": -0.8593980073928833, "step": 2180 }, { "epoch": 1.0, "learning_rate": 2.222222222222222e-07, "logits/chosen": -0.4222278594970703, "logits/rejected": -0.44699034094810486, "logps/chosen": -118.62066650390625, "logps/rejected": -90.86624908447266, "loss": 0.1933, "rewards/accuracies": 0.875, "rewards/chosen": 2.708111524581909, "rewards/margins": 3.947237491607666, "rewards/rejected": -1.239126205444336, "step": 2190 }, { "epoch": 1.0, "learning_rate": 2.2171486555048196e-07, "logits/chosen": -0.3967632055282593, "logits/rejected": -0.43386539816856384, "logps/chosen": -127.67298889160156, "logps/rejected": -100.56233978271484, "loss": 0.1896, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.8950390815734863, "rewards/margins": 5.041460990905762, "rewards/rejected": -1.146422266960144, "step": 2200 }, { "epoch": 1.0, "eval_logits/chosen": -0.4154352843761444, "eval_logits/rejected": -0.45174553990364075, "eval_logps/chosen": -116.79273223876953, "eval_logps/rejected": -92.95588684082031, "eval_loss": 0.20996138453483582, "eval_rewards/accuracies": 0.8910614252090454, "eval_rewards/chosen": 3.183509588241577, "eval_rewards/margins": 4.196613788604736, "eval_rewards/rejected": -1.0131044387817383, "eval_runtime": 896.2038, "eval_samples_per_second": 3.193, "eval_steps_per_second": 0.2, "step": 2200 }, { "epoch": 1.01, "learning_rate": 2.2120750887874172e-07, "logits/chosen": -0.381591260433197, "logits/rejected": -0.4185353219509125, "logps/chosen": -123.46016693115234, "logps/rejected": -97.816650390625, "loss": 0.1978, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.249309539794922, "rewards/margins": 4.560460090637207, "rewards/rejected": -1.3111507892608643, "step": 2210 }, { "epoch": 1.01, "learning_rate": 2.207001522070015e-07, "logits/chosen": -0.39903947710990906, "logits/rejected": -0.43686169385910034, "logps/chosen": -122.17398834228516, "logps/rejected": -101.26841735839844, "loss": 0.1726, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.203260898590088, "rewards/margins": 3.9990131855010986, "rewards/rejected": -0.7957520484924316, "step": 2220 }, { "epoch": 1.02, "learning_rate": 2.2019279553526126e-07, "logits/chosen": -0.42441973090171814, "logits/rejected": -0.4595940113067627, "logps/chosen": -121.41120910644531, "logps/rejected": -87.17717742919922, "loss": 0.1692, "rewards/accuracies": 0.9375, "rewards/chosen": 3.520594358444214, "rewards/margins": 4.398536682128906, "rewards/rejected": -0.8779422044754028, "step": 2230 }, { "epoch": 1.02, "learning_rate": 2.1968543886352102e-07, "logits/chosen": -0.41687169671058655, "logits/rejected": -0.442725270986557, "logps/chosen": -113.2309341430664, "logps/rejected": -93.45161437988281, "loss": 0.1922, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.1109657287597656, "rewards/margins": 4.183775424957275, "rewards/rejected": -1.0728092193603516, "step": 2240 }, { "epoch": 1.03, "learning_rate": 2.191780821917808e-07, "logits/chosen": -0.45746049284935, "logits/rejected": -0.47277164459228516, "logps/chosen": -115.0101318359375, "logps/rejected": -92.96504211425781, "loss": 0.2032, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.604362726211548, "rewards/margins": 3.793149948120117, "rewards/rejected": -1.1887872219085693, "step": 2250 }, { "epoch": 1.03, "learning_rate": 2.1867072552004056e-07, "logits/chosen": -0.4215884208679199, "logits/rejected": -0.45461931824684143, "logps/chosen": -119.3857192993164, "logps/rejected": -94.49113464355469, "loss": 0.17, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.2537574768066406, "rewards/margins": 4.495412826538086, "rewards/rejected": -1.2416555881500244, "step": 2260 }, { "epoch": 1.04, "learning_rate": 2.1816336884830032e-07, "logits/chosen": -0.3961917459964752, "logits/rejected": -0.4312414526939392, "logps/chosen": -112.6959457397461, "logps/rejected": -91.66386413574219, "loss": 0.213, "rewards/accuracies": 0.9375, "rewards/chosen": 3.5616722106933594, "rewards/margins": 4.888741970062256, "rewards/rejected": -1.3270705938339233, "step": 2270 }, { "epoch": 1.04, "learning_rate": 2.176560121765601e-07, "logits/chosen": -0.43959441781044006, "logits/rejected": -0.472449392080307, "logps/chosen": -122.6341323852539, "logps/rejected": -97.61588287353516, "loss": 0.1952, "rewards/accuracies": 0.9375, "rewards/chosen": 4.182515621185303, "rewards/margins": 5.238072395324707, "rewards/rejected": -1.0555566549301147, "step": 2280 }, { "epoch": 1.05, "learning_rate": 2.1714865550481986e-07, "logits/chosen": -0.4102107882499695, "logits/rejected": -0.4536592364311218, "logps/chosen": -120.60931396484375, "logps/rejected": -92.90963745117188, "loss": 0.2096, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7443687915802, "rewards/margins": 4.461726188659668, "rewards/rejected": -0.7173573970794678, "step": 2290 }, { "epoch": 1.05, "learning_rate": 2.1664129883307962e-07, "logits/chosen": -0.42524951696395874, "logits/rejected": -0.4480830729007721, "logps/chosen": -115.37422943115234, "logps/rejected": -102.08689880371094, "loss": 0.2294, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.9731292724609375, "rewards/margins": 4.231186389923096, "rewards/rejected": -1.2580569982528687, "step": 2300 }, { "epoch": 1.05, "eval_logits/chosen": -0.4051222503185272, "eval_logits/rejected": -0.44120845198631287, "eval_logps/chosen": -116.918701171875, "eval_logps/rejected": -93.1042709350586, "eval_loss": 0.20699380338191986, "eval_rewards/accuracies": 0.8938547372817993, "eval_rewards/chosen": 3.120523691177368, "eval_rewards/margins": 4.207824230194092, "eval_rewards/rejected": -1.087300419807434, "eval_runtime": 870.8814, "eval_samples_per_second": 3.286, "eval_steps_per_second": 0.206, "step": 2300 }, { "epoch": 1.05, "learning_rate": 2.161339421613394e-07, "logits/chosen": -0.4129010736942291, "logits/rejected": -0.4455359876155853, "logps/chosen": -114.3980712890625, "logps/rejected": -98.12439727783203, "loss": 0.2081, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.4188950061798096, "rewards/margins": 4.589454174041748, "rewards/rejected": -1.170559048652649, "step": 2310 }, { "epoch": 1.06, "learning_rate": 2.1562658548959916e-07, "logits/chosen": -0.4106817841529846, "logits/rejected": -0.4454409182071686, "logps/chosen": -124.04959869384766, "logps/rejected": -92.8015365600586, "loss": 0.179, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.623793363571167, "rewards/margins": 4.647304058074951, "rewards/rejected": -1.0235108137130737, "step": 2320 }, { "epoch": 1.06, "learning_rate": 2.1511922881785892e-07, "logits/chosen": -0.4041675627231598, "logits/rejected": -0.44240039587020874, "logps/chosen": -113.25587463378906, "logps/rejected": -94.64698791503906, "loss": 0.2092, "rewards/accuracies": 0.875, "rewards/chosen": 2.898808002471924, "rewards/margins": 4.1147894859313965, "rewards/rejected": -1.2159812450408936, "step": 2330 }, { "epoch": 1.07, "learning_rate": 2.146118721461187e-07, "logits/chosen": -0.43367472290992737, "logits/rejected": -0.47439584136009216, "logps/chosen": -122.84818267822266, "logps/rejected": -97.22991180419922, "loss": 0.1672, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.218113422393799, "rewards/margins": 4.060845375061035, "rewards/rejected": -0.8427319526672363, "step": 2340 }, { "epoch": 1.07, "learning_rate": 2.1410451547437846e-07, "logits/chosen": -0.4216938018798828, "logits/rejected": -0.4517236649990082, "logps/chosen": -113.54508209228516, "logps/rejected": -98.01654052734375, "loss": 0.1917, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.2716426849365234, "rewards/margins": 4.241575717926025, "rewards/rejected": -0.9699331521987915, "step": 2350 }, { "epoch": 1.08, "learning_rate": 2.1359715880263822e-07, "logits/chosen": -0.39088428020477295, "logits/rejected": -0.4316210150718689, "logps/chosen": -121.35298919677734, "logps/rejected": -93.39158630371094, "loss": 0.2117, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.365229368209839, "rewards/margins": 3.8260416984558105, "rewards/rejected": -0.460812509059906, "step": 2360 }, { "epoch": 1.08, "learning_rate": 2.13089802130898e-07, "logits/chosen": -0.43318310379981995, "logits/rejected": -0.47380322217941284, "logps/chosen": -113.5303955078125, "logps/rejected": -95.12970733642578, "loss": 0.2116, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.399010419845581, "rewards/margins": 4.622774124145508, "rewards/rejected": -1.2237637042999268, "step": 2370 }, { "epoch": 1.09, "learning_rate": 2.1258244545915776e-07, "logits/chosen": -0.4225497245788574, "logits/rejected": -0.45139962434768677, "logps/chosen": -120.57337951660156, "logps/rejected": -97.2274398803711, "loss": 0.206, "rewards/accuracies": 0.875, "rewards/chosen": 3.4524986743927, "rewards/margins": 4.2372307777404785, "rewards/rejected": -0.7847325801849365, "step": 2380 }, { "epoch": 1.09, "learning_rate": 2.1207508878741752e-07, "logits/chosen": -0.41160812973976135, "logits/rejected": -0.45344653725624084, "logps/chosen": -110.57585144042969, "logps/rejected": -87.49763488769531, "loss": 0.2314, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.0225930213928223, "rewards/margins": 4.383017539978027, "rewards/rejected": -1.3604247570037842, "step": 2390 }, { "epoch": 1.1, "learning_rate": 2.115677321156773e-07, "logits/chosen": -0.4245067536830902, "logits/rejected": -0.4589248597621918, "logps/chosen": -125.22467041015625, "logps/rejected": -100.40376281738281, "loss": 0.1897, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4412460327148438, "rewards/margins": 4.521696090698242, "rewards/rejected": -1.0804498195648193, "step": 2400 }, { "epoch": 1.1, "eval_logits/chosen": -0.41355904936790466, "eval_logits/rejected": -0.44829556345939636, "eval_logps/chosen": -116.84915924072266, "eval_logps/rejected": -93.104736328125, "eval_loss": 0.20109796524047852, "eval_rewards/accuracies": 0.9050279259681702, "eval_rewards/chosen": 3.155294179916382, "eval_rewards/margins": 4.242827415466309, "eval_rewards/rejected": -1.0875334739685059, "eval_runtime": 925.0662, "eval_samples_per_second": 3.094, "eval_steps_per_second": 0.193, "step": 2400 }, { "epoch": 1.1, "learning_rate": 2.1106037544393706e-07, "logits/chosen": -0.4162351191043854, "logits/rejected": -0.44722890853881836, "logps/chosen": -117.66087341308594, "logps/rejected": -98.16817474365234, "loss": 0.2215, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.286454439163208, "rewards/margins": 4.626716136932373, "rewards/rejected": -1.3402621746063232, "step": 2410 }, { "epoch": 1.1, "learning_rate": 2.1055301877219682e-07, "logits/chosen": -0.3990441560745239, "logits/rejected": -0.44646185636520386, "logps/chosen": -127.4264144897461, "logps/rejected": -94.5553207397461, "loss": 0.2092, "rewards/accuracies": 0.9375, "rewards/chosen": 3.721330165863037, "rewards/margins": 4.82289981842041, "rewards/rejected": -1.1015697717666626, "step": 2420 }, { "epoch": 1.11, "learning_rate": 2.100456621004566e-07, "logits/chosen": -0.4313054084777832, "logits/rejected": -0.4540146291255951, "logps/chosen": -107.0306396484375, "logps/rejected": -92.35552215576172, "loss": 0.1953, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.239025115966797, "rewards/margins": 4.2459869384765625, "rewards/rejected": -1.0069619417190552, "step": 2430 }, { "epoch": 1.11, "learning_rate": 2.0953830542871636e-07, "logits/chosen": -0.4273985028266907, "logits/rejected": -0.45803728699684143, "logps/chosen": -120.7825927734375, "logps/rejected": -95.52888488769531, "loss": 0.2017, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.346850633621216, "rewards/margins": 4.3342814445495605, "rewards/rejected": -0.9874309301376343, "step": 2440 }, { "epoch": 1.12, "learning_rate": 2.0903094875697612e-07, "logits/chosen": -0.42989325523376465, "logits/rejected": -0.47014039754867554, "logps/chosen": -127.97347259521484, "logps/rejected": -96.18221282958984, "loss": 0.1925, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.130273342132568, "rewards/margins": 4.938960075378418, "rewards/rejected": -0.8086867332458496, "step": 2450 }, { "epoch": 1.12, "learning_rate": 2.085235920852359e-07, "logits/chosen": -0.4441000819206238, "logits/rejected": -0.47532176971435547, "logps/chosen": -118.6019058227539, "logps/rejected": -97.86750793457031, "loss": 0.1793, "rewards/accuracies": 0.9375, "rewards/chosen": 2.886178493499756, "rewards/margins": 4.492079257965088, "rewards/rejected": -1.605900526046753, "step": 2460 }, { "epoch": 1.13, "learning_rate": 2.0801623541349566e-07, "logits/chosen": -0.39648327231407166, "logits/rejected": -0.4312874674797058, "logps/chosen": -118.57756042480469, "logps/rejected": -95.03660583496094, "loss": 0.2082, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.6688880920410156, "rewards/margins": 4.01635217666626, "rewards/rejected": -0.3474644124507904, "step": 2470 }, { "epoch": 1.13, "learning_rate": 2.0750887874175542e-07, "logits/chosen": -0.42028895020484924, "logits/rejected": -0.46101123094558716, "logps/chosen": -120.1390151977539, "logps/rejected": -102.4979019165039, "loss": 0.162, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.363370418548584, "rewards/margins": 5.152722358703613, "rewards/rejected": -1.7893520593643188, "step": 2480 }, { "epoch": 1.14, "learning_rate": 2.070015220700152e-07, "logits/chosen": -0.4217614531517029, "logits/rejected": -0.4549930989742279, "logps/chosen": -120.854736328125, "logps/rejected": -94.92854309082031, "loss": 0.1858, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3437397480010986, "rewards/margins": 4.921185493469238, "rewards/rejected": -1.5774452686309814, "step": 2490 }, { "epoch": 1.14, "learning_rate": 2.0649416539827496e-07, "logits/chosen": -0.38463494181632996, "logits/rejected": -0.42476749420166016, "logps/chosen": -121.46852111816406, "logps/rejected": -97.03733825683594, "loss": 0.1943, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.1613824367523193, "rewards/margins": 4.206856727600098, "rewards/rejected": -1.0454740524291992, "step": 2500 }, { "epoch": 1.14, "eval_logits/chosen": -0.4137432873249054, "eval_logits/rejected": -0.4488118886947632, "eval_logps/chosen": -116.49640655517578, "eval_logps/rejected": -93.38188171386719, "eval_loss": 0.1952887624502182, "eval_rewards/accuracies": 0.9022346138954163, "eval_rewards/chosen": 3.3316712379455566, "eval_rewards/margins": 4.5577778816223145, "eval_rewards/rejected": -1.2261064052581787, "eval_runtime": 871.869, "eval_samples_per_second": 3.283, "eval_steps_per_second": 0.205, "step": 2500 }, { "epoch": 1.15, "learning_rate": 2.0598680872653472e-07, "logits/chosen": -0.4275178015232086, "logits/rejected": -0.4599391520023346, "logps/chosen": -121.42420959472656, "logps/rejected": -96.46556854248047, "loss": 0.1868, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.031966209411621, "rewards/margins": 5.205144882202148, "rewards/rejected": -1.1731784343719482, "step": 2510 }, { "epoch": 1.15, "learning_rate": 2.054794520547945e-07, "logits/chosen": -0.40186938643455505, "logits/rejected": -0.4418622851371765, "logps/chosen": -119.48040771484375, "logps/rejected": -94.07899475097656, "loss": 0.1948, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.7327773571014404, "rewards/margins": 4.83219051361084, "rewards/rejected": -1.0994136333465576, "step": 2520 }, { "epoch": 1.15, "learning_rate": 2.0497209538305426e-07, "logits/chosen": -0.4523470401763916, "logits/rejected": -0.4795070290565491, "logps/chosen": -117.58869934082031, "logps/rejected": -92.097412109375, "loss": 0.1957, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.2375411987304688, "rewards/margins": 4.7822418212890625, "rewards/rejected": -1.5447006225585938, "step": 2530 }, { "epoch": 1.16, "learning_rate": 2.0446473871131402e-07, "logits/chosen": -0.42697858810424805, "logits/rejected": -0.4653555452823639, "logps/chosen": -128.19395446777344, "logps/rejected": -104.2452621459961, "loss": 0.1956, "rewards/accuracies": 0.9375, "rewards/chosen": 3.7017173767089844, "rewards/margins": 5.180688381195068, "rewards/rejected": -1.478971004486084, "step": 2540 }, { "epoch": 1.16, "learning_rate": 2.039573820395738e-07, "logits/chosen": -0.4241272807121277, "logits/rejected": -0.4700315594673157, "logps/chosen": -122.24373626708984, "logps/rejected": -92.90812683105469, "loss": 0.2147, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.6799728870391846, "rewards/margins": 4.794487476348877, "rewards/rejected": -1.114514946937561, "step": 2550 }, { "epoch": 1.17, "learning_rate": 2.0345002536783356e-07, "logits/chosen": -0.4255804419517517, "logits/rejected": -0.45682722330093384, "logps/chosen": -117.48539733886719, "logps/rejected": -91.08617401123047, "loss": 0.2029, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.406527042388916, "rewards/margins": 4.893421649932861, "rewards/rejected": -1.486893892288208, "step": 2560 }, { "epoch": 1.17, "learning_rate": 2.0294266869609332e-07, "logits/chosen": -0.4311680793762207, "logits/rejected": -0.46691998839378357, "logps/chosen": -116.72953033447266, "logps/rejected": -91.28812408447266, "loss": 0.1983, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.2318546772003174, "rewards/margins": 4.533720970153809, "rewards/rejected": -1.301865816116333, "step": 2570 }, { "epoch": 1.18, "learning_rate": 2.024353120243531e-07, "logits/chosen": -0.4406512677669525, "logits/rejected": -0.471552312374115, "logps/chosen": -112.9683609008789, "logps/rejected": -98.54328918457031, "loss": 0.2039, "rewards/accuracies": 0.9375, "rewards/chosen": 3.4355063438415527, "rewards/margins": 4.474089622497559, "rewards/rejected": -1.0385830402374268, "step": 2580 }, { "epoch": 1.18, "learning_rate": 2.0192795535261286e-07, "logits/chosen": -0.41342782974243164, "logits/rejected": -0.45259562134742737, "logps/chosen": -122.47785949707031, "logps/rejected": -98.93025970458984, "loss": 0.1774, "rewards/accuracies": 0.9375, "rewards/chosen": 3.7889657020568848, "rewards/margins": 5.727280616760254, "rewards/rejected": -1.9383147954940796, "step": 2590 }, { "epoch": 1.19, "learning_rate": 2.0142059868087262e-07, "logits/chosen": -0.43133312463760376, "logits/rejected": -0.45189160108566284, "logps/chosen": -118.82820129394531, "logps/rejected": -95.33827209472656, "loss": 0.1749, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.4565823078155518, "rewards/margins": 4.577003479003906, "rewards/rejected": -1.1204214096069336, "step": 2600 }, { "epoch": 1.19, "eval_logits/chosen": -0.4159504175186157, "eval_logits/rejected": -0.4499986171722412, "eval_logps/chosen": -116.7225112915039, "eval_logps/rejected": -93.57614135742188, "eval_loss": 0.19750715792179108, "eval_rewards/accuracies": 0.8910614252090454, "eval_rewards/chosen": 3.2186174392700195, "eval_rewards/margins": 4.541853427886963, "eval_rewards/rejected": -1.323236107826233, "eval_runtime": 917.0951, "eval_samples_per_second": 3.121, "eval_steps_per_second": 0.195, "step": 2600 }, { "epoch": 1.19, "learning_rate": 2.009132420091324e-07, "logits/chosen": -0.37940889596939087, "logits/rejected": -0.4150822162628174, "logps/chosen": -111.95550537109375, "logps/rejected": -94.47859954833984, "loss": 0.1907, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.825603485107422, "rewards/margins": 5.173952102661133, "rewards/rejected": -1.3483483791351318, "step": 2610 }, { "epoch": 1.2, "learning_rate": 2.0040588533739216e-07, "logits/chosen": -0.4204653203487396, "logits/rejected": -0.44999074935913086, "logps/chosen": -124.92396545410156, "logps/rejected": -98.2974853515625, "loss": 0.1823, "rewards/accuracies": 0.875, "rewards/chosen": 3.0983715057373047, "rewards/margins": 4.7994771003723145, "rewards/rejected": -1.7011057138442993, "step": 2620 }, { "epoch": 1.2, "learning_rate": 1.9989852866565192e-07, "logits/chosen": -0.42279934883117676, "logits/rejected": -0.4580332636833191, "logps/chosen": -119.20674133300781, "logps/rejected": -94.82040405273438, "loss": 0.1818, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2803149223327637, "rewards/margins": 4.06785774230957, "rewards/rejected": -0.7875427603721619, "step": 2630 }, { "epoch": 1.21, "learning_rate": 1.993911719939117e-07, "logits/chosen": -0.39321383833885193, "logits/rejected": -0.43814224004745483, "logps/chosen": -117.86689758300781, "logps/rejected": -91.55029296875, "loss": 0.1832, "rewards/accuracies": 0.875, "rewards/chosen": 3.1397879123687744, "rewards/margins": 4.142951488494873, "rewards/rejected": -1.0031640529632568, "step": 2640 }, { "epoch": 1.21, "learning_rate": 1.9888381532217146e-07, "logits/chosen": -0.4433029294013977, "logits/rejected": -0.46814584732055664, "logps/chosen": -111.34849548339844, "logps/rejected": -95.34590911865234, "loss": 0.1731, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.7614011764526367, "rewards/margins": 4.330539703369141, "rewards/rejected": -1.5691388845443726, "step": 2650 }, { "epoch": 1.21, "learning_rate": 1.9837645865043122e-07, "logits/chosen": -0.4222814440727234, "logits/rejected": -0.45477691292762756, "logps/chosen": -114.8381576538086, "logps/rejected": -93.96653747558594, "loss": 0.1705, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.6209235191345215, "rewards/margins": 5.391648769378662, "rewards/rejected": -1.7707252502441406, "step": 2660 }, { "epoch": 1.22, "learning_rate": 1.97869101978691e-07, "logits/chosen": -0.4320278763771057, "logits/rejected": -0.4549098610877991, "logps/chosen": -116.82099914550781, "logps/rejected": -95.1173324584961, "loss": 0.1619, "rewards/accuracies": 0.875, "rewards/chosen": 2.4755518436431885, "rewards/margins": 3.844717025756836, "rewards/rejected": -1.3691655397415161, "step": 2670 }, { "epoch": 1.22, "learning_rate": 1.9736174530695076e-07, "logits/chosen": -0.3963245451450348, "logits/rejected": -0.4262765347957611, "logps/chosen": -116.55233001708984, "logps/rejected": -92.63807678222656, "loss": 0.1714, "rewards/accuracies": 0.9375, "rewards/chosen": 3.089799165725708, "rewards/margins": 4.424617767333984, "rewards/rejected": -1.3348182439804077, "step": 2680 }, { "epoch": 1.23, "learning_rate": 1.9685438863521052e-07, "logits/chosen": -0.41304507851600647, "logits/rejected": -0.4391093850135803, "logps/chosen": -114.98722839355469, "logps/rejected": -95.16316223144531, "loss": 0.1799, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.718804121017456, "rewards/margins": 4.613897800445557, "rewards/rejected": -0.8950934410095215, "step": 2690 }, { "epoch": 1.23, "learning_rate": 1.963470319634703e-07, "logits/chosen": -0.39430639147758484, "logits/rejected": -0.4395686089992523, "logps/chosen": -122.29681396484375, "logps/rejected": -94.48133087158203, "loss": 0.1881, "rewards/accuracies": 0.875, "rewards/chosen": 3.283339738845825, "rewards/margins": 4.763991355895996, "rewards/rejected": -1.480652093887329, "step": 2700 }, { "epoch": 1.23, "eval_logits/chosen": -0.39620405435562134, "eval_logits/rejected": -0.4262426793575287, "eval_logps/chosen": -116.51838684082031, "eval_logps/rejected": -93.5943603515625, "eval_loss": 0.18383081257343292, "eval_rewards/accuracies": 0.9273743033409119, "eval_rewards/chosen": 3.320681095123291, "eval_rewards/margins": 4.653022766113281, "eval_rewards/rejected": -1.3323419094085693, "eval_runtime": 882.9525, "eval_samples_per_second": 3.241, "eval_steps_per_second": 0.203, "step": 2700 }, { "epoch": 1.24, "learning_rate": 1.9583967529173006e-07, "logits/chosen": -0.37125352025032043, "logits/rejected": -0.40501752495765686, "logps/chosen": -122.63427734375, "logps/rejected": -95.20475006103516, "loss": 0.1623, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.9033172130584717, "rewards/margins": 4.929192543029785, "rewards/rejected": -1.0258758068084717, "step": 2710 }, { "epoch": 1.24, "learning_rate": 1.9533231861998982e-07, "logits/chosen": -0.4100262522697449, "logits/rejected": -0.44318699836730957, "logps/chosen": -123.42138671875, "logps/rejected": -100.46942901611328, "loss": 0.1807, "rewards/accuracies": 0.9375, "rewards/chosen": 3.8883254528045654, "rewards/margins": 5.432125568389893, "rewards/rejected": -1.5438001155853271, "step": 2720 }, { "epoch": 1.25, "learning_rate": 1.948249619482496e-07, "logits/chosen": -0.4184117913246155, "logits/rejected": -0.4501233994960785, "logps/chosen": -116.8466796875, "logps/rejected": -93.37806701660156, "loss": 0.1818, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.737032413482666, "rewards/margins": 4.598247528076172, "rewards/rejected": -1.8612152338027954, "step": 2730 }, { "epoch": 1.25, "learning_rate": 1.9431760527650936e-07, "logits/chosen": -0.39006081223487854, "logits/rejected": -0.43392476439476013, "logps/chosen": -128.30226135253906, "logps/rejected": -95.20794677734375, "loss": 0.172, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.975538730621338, "rewards/margins": 5.608933448791504, "rewards/rejected": -1.6333945989608765, "step": 2740 }, { "epoch": 1.26, "learning_rate": 1.9381024860476912e-07, "logits/chosen": -0.42632046341896057, "logits/rejected": -0.4491303861141205, "logps/chosen": -127.4406509399414, "logps/rejected": -97.50559997558594, "loss": 0.1711, "rewards/accuracies": 0.9375, "rewards/chosen": 3.2022480964660645, "rewards/margins": 5.0386505126953125, "rewards/rejected": -1.836402177810669, "step": 2750 }, { "epoch": 1.26, "learning_rate": 1.933028919330289e-07, "logits/chosen": -0.4237644076347351, "logits/rejected": -0.44978874921798706, "logps/chosen": -119.2694320678711, "logps/rejected": -96.05281066894531, "loss": 0.1957, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.3220973014831543, "rewards/margins": 4.4357171058654785, "rewards/rejected": -1.1136192083358765, "step": 2760 }, { "epoch": 1.26, "learning_rate": 1.9279553526128866e-07, "logits/chosen": -0.40960246324539185, "logits/rejected": -0.44518280029296875, "logps/chosen": -121.33802795410156, "logps/rejected": -94.39434814453125, "loss": 0.222, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6748435497283936, "rewards/margins": 5.344630718231201, "rewards/rejected": -1.6697871685028076, "step": 2770 }, { "epoch": 1.27, "learning_rate": 1.9228817858954842e-07, "logits/chosen": -0.38372522592544556, "logits/rejected": -0.4217826724052429, "logps/chosen": -116.77619934082031, "logps/rejected": -94.98020935058594, "loss": 0.1494, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.7456507682800293, "rewards/margins": 5.059269905090332, "rewards/rejected": -1.3136188983917236, "step": 2780 }, { "epoch": 1.27, "learning_rate": 1.917808219178082e-07, "logits/chosen": -0.41423946619033813, "logits/rejected": -0.4451848864555359, "logps/chosen": -109.53131103515625, "logps/rejected": -95.5805435180664, "loss": 0.1633, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.9713897705078125, "rewards/margins": 4.25122594833374, "rewards/rejected": -1.279836654663086, "step": 2790 }, { "epoch": 1.28, "learning_rate": 1.9127346524606796e-07, "logits/chosen": -0.3843603730201721, "logits/rejected": -0.41677069664001465, "logps/chosen": -121.75700378417969, "logps/rejected": -96.72332763671875, "loss": 0.1611, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.5085883140563965, "rewards/margins": 4.820291042327881, "rewards/rejected": -1.3117033243179321, "step": 2800 }, { "epoch": 1.28, "eval_logits/chosen": -0.4090903401374817, "eval_logits/rejected": -0.44037947058677673, "eval_logps/chosen": -116.5835189819336, "eval_logps/rejected": -93.6472396850586, "eval_loss": 0.1833222210407257, "eval_rewards/accuracies": 0.910614550113678, "eval_rewards/chosen": 3.288114309310913, "eval_rewards/margins": 4.646894454956055, "eval_rewards/rejected": -1.358780026435852, "eval_runtime": 899.7748, "eval_samples_per_second": 3.181, "eval_steps_per_second": 0.199, "step": 2800 }, { "epoch": 1.28, "learning_rate": 1.9076610857432772e-07, "logits/chosen": -0.4082648754119873, "logits/rejected": -0.4388160705566406, "logps/chosen": -119.51053619384766, "logps/rejected": -98.2977066040039, "loss": 0.1952, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.516996383666992, "rewards/margins": 4.8721923828125, "rewards/rejected": -1.3551959991455078, "step": 2810 }, { "epoch": 1.29, "learning_rate": 1.902587519025875e-07, "logits/chosen": -0.39836177229881287, "logits/rejected": -0.43699654936790466, "logps/chosen": -130.80030822753906, "logps/rejected": -98.20820617675781, "loss": 0.1628, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.9940390586853027, "rewards/margins": 5.695845603942871, "rewards/rejected": -1.701806664466858, "step": 2820 }, { "epoch": 1.29, "learning_rate": 1.8975139523084726e-07, "logits/chosen": -0.4374879002571106, "logits/rejected": -0.4642358720302582, "logps/chosen": -124.3564224243164, "logps/rejected": -101.50624084472656, "loss": 0.1992, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.7666969299316406, "rewards/margins": 4.858794212341309, "rewards/rejected": -2.092097520828247, "step": 2830 }, { "epoch": 1.3, "learning_rate": 1.8924403855910702e-07, "logits/chosen": -0.4187402129173279, "logits/rejected": -0.4483548700809479, "logps/chosen": -116.7706527709961, "logps/rejected": -92.22254943847656, "loss": 0.1856, "rewards/accuracies": 0.875, "rewards/chosen": 2.8258209228515625, "rewards/margins": 4.667115688323975, "rewards/rejected": -1.841294527053833, "step": 2840 }, { "epoch": 1.3, "learning_rate": 1.887366818873668e-07, "logits/chosen": -0.4137202799320221, "logits/rejected": -0.4449295401573181, "logps/chosen": -117.85295104980469, "logps/rejected": -96.74876403808594, "loss": 0.1435, "rewards/accuracies": 0.9375, "rewards/chosen": 3.32916259765625, "rewards/margins": 4.432304859161377, "rewards/rejected": -1.1031419038772583, "step": 2850 }, { "epoch": 1.31, "learning_rate": 1.8822932521562656e-07, "logits/chosen": -0.3739413619041443, "logits/rejected": -0.4138815999031067, "logps/chosen": -116.3170394897461, "logps/rejected": -97.33705139160156, "loss": 0.1594, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.9688007831573486, "rewards/margins": 5.0333571434021, "rewards/rejected": -1.0645564794540405, "step": 2860 }, { "epoch": 1.31, "learning_rate": 1.8772196854388632e-07, "logits/chosen": -0.43607673048973083, "logits/rejected": -0.4659034311771393, "logps/chosen": -117.14688873291016, "logps/rejected": -95.03040313720703, "loss": 0.1743, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.77264404296875, "rewards/margins": 5.061250686645508, "rewards/rejected": -1.288606882095337, "step": 2870 }, { "epoch": 1.31, "learning_rate": 1.872146118721461e-07, "logits/chosen": -0.4345594346523285, "logits/rejected": -0.4634518623352051, "logps/chosen": -128.435791015625, "logps/rejected": -94.08797454833984, "loss": 0.1772, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.388913154602051, "rewards/margins": 5.8124918937683105, "rewards/rejected": -1.4235796928405762, "step": 2880 }, { "epoch": 1.32, "learning_rate": 1.8670725520040586e-07, "logits/chosen": -0.438603937625885, "logits/rejected": -0.46849116683006287, "logps/chosen": -121.33575439453125, "logps/rejected": -96.94755554199219, "loss": 0.1648, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.078803062438965, "rewards/margins": 5.43464469909668, "rewards/rejected": -1.3558417558670044, "step": 2890 }, { "epoch": 1.32, "learning_rate": 1.8619989852866562e-07, "logits/chosen": -0.40312066674232483, "logits/rejected": -0.4359334111213684, "logps/chosen": -115.02134704589844, "logps/rejected": -96.40849304199219, "loss": 0.1653, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.6323986053466797, "rewards/margins": 5.246556282043457, "rewards/rejected": -1.614158272743225, "step": 2900 }, { "epoch": 1.32, "eval_logits/chosen": -0.3995789587497711, "eval_logits/rejected": -0.4252215325832367, "eval_logps/chosen": -116.65079498291016, "eval_logps/rejected": -94.15835571289062, "eval_loss": 0.195852130651474, "eval_rewards/accuracies": 0.9189944267272949, "eval_rewards/chosen": 3.2544755935668945, "eval_rewards/margins": 4.868815898895264, "eval_rewards/rejected": -1.6143405437469482, "eval_runtime": 907.9432, "eval_samples_per_second": 3.152, "eval_steps_per_second": 0.197, "step": 2900 }, { "epoch": 1.33, "learning_rate": 1.856925418569254e-07, "logits/chosen": -0.43267160654067993, "logits/rejected": -0.4583393931388855, "logps/chosen": -127.33109283447266, "logps/rejected": -98.23350524902344, "loss": 0.1736, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 4.151562690734863, "rewards/margins": 6.144377708435059, "rewards/rejected": -1.992815375328064, "step": 2910 }, { "epoch": 1.33, "learning_rate": 1.8518518518518516e-07, "logits/chosen": -0.41695013642311096, "logits/rejected": -0.4449712336063385, "logps/chosen": -115.79557037353516, "logps/rejected": -96.62245178222656, "loss": 0.1925, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.329120635986328, "rewards/margins": 4.9218339920043945, "rewards/rejected": -1.5927129983901978, "step": 2920 }, { "epoch": 1.34, "learning_rate": 1.8467782851344492e-07, "logits/chosen": -0.40566110610961914, "logits/rejected": -0.44443726539611816, "logps/chosen": -116.14949798583984, "logps/rejected": -94.5146484375, "loss": 0.2166, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.645608425140381, "rewards/margins": 4.7818427085876465, "rewards/rejected": -1.1362345218658447, "step": 2930 }, { "epoch": 1.34, "learning_rate": 1.841704718417047e-07, "logits/chosen": -0.4052400588989258, "logits/rejected": -0.4479829668998718, "logps/chosen": -119.50074768066406, "logps/rejected": -96.74118041992188, "loss": 0.1592, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.713555097579956, "rewards/margins": 5.739047527313232, "rewards/rejected": -2.0254924297332764, "step": 2940 }, { "epoch": 1.35, "learning_rate": 1.8366311516996446e-07, "logits/chosen": -0.3960481286048889, "logits/rejected": -0.4279060363769531, "logps/chosen": -113.77364349365234, "logps/rejected": -89.41834259033203, "loss": 0.1519, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.5727920532226562, "rewards/margins": 4.469664096832275, "rewards/rejected": -0.8968713879585266, "step": 2950 }, { "epoch": 1.35, "learning_rate": 1.8315575849822422e-07, "logits/chosen": -0.3719174563884735, "logits/rejected": -0.41467922925949097, "logps/chosen": -122.60404205322266, "logps/rejected": -98.62785339355469, "loss": 0.1757, "rewards/accuracies": 0.9375, "rewards/chosen": 3.8493294715881348, "rewards/margins": 4.896603584289551, "rewards/rejected": -1.0472742319107056, "step": 2960 }, { "epoch": 1.36, "learning_rate": 1.82648401826484e-07, "logits/chosen": -0.4145309031009674, "logits/rejected": -0.4517810344696045, "logps/chosen": -117.5206527709961, "logps/rejected": -91.79268646240234, "loss": 0.1798, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.707047700881958, "rewards/margins": 4.669058322906494, "rewards/rejected": -0.9620102643966675, "step": 2970 }, { "epoch": 1.36, "learning_rate": 1.8214104515474375e-07, "logits/chosen": -0.41848722100257874, "logits/rejected": -0.44989675283432007, "logps/chosen": -121.28948974609375, "logps/rejected": -97.52169036865234, "loss": 0.1789, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.630518674850464, "rewards/margins": 5.358605861663818, "rewards/rejected": -1.7280871868133545, "step": 2980 }, { "epoch": 1.36, "learning_rate": 1.8163368848300352e-07, "logits/chosen": -0.41235464811325073, "logits/rejected": -0.4453458786010742, "logps/chosen": -115.3282699584961, "logps/rejected": -101.21430969238281, "loss": 0.1992, "rewards/accuracies": 0.9375, "rewards/chosen": 3.833465576171875, "rewards/margins": 5.110463619232178, "rewards/rejected": -1.2769982814788818, "step": 2990 }, { "epoch": 1.37, "learning_rate": 1.811263318112633e-07, "logits/chosen": -0.4016539454460144, "logits/rejected": -0.43046021461486816, "logps/chosen": -115.31640625, "logps/rejected": -94.98763275146484, "loss": 0.1613, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.6097488403320312, "rewards/margins": 4.730071067810059, "rewards/rejected": -1.1203219890594482, "step": 3000 }, { "epoch": 1.37, "eval_logits/chosen": -0.4070635735988617, "eval_logits/rejected": -0.43743789196014404, "eval_logps/chosen": -116.37444305419922, "eval_logps/rejected": -93.96776580810547, "eval_loss": 0.17788465321063995, "eval_rewards/accuracies": 0.9217877388000488, "eval_rewards/chosen": 3.392648935317993, "eval_rewards/margins": 4.911698818206787, "eval_rewards/rejected": -1.5190494060516357, "eval_runtime": 878.9478, "eval_samples_per_second": 3.256, "eval_steps_per_second": 0.204, "step": 3000 }, { "epoch": 1.37, "learning_rate": 1.8061897513952305e-07, "logits/chosen": -0.41279563307762146, "logits/rejected": -0.45051974058151245, "logps/chosen": -124.44229888916016, "logps/rejected": -95.6501235961914, "loss": 0.1596, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.279679298400879, "rewards/margins": 5.73773193359375, "rewards/rejected": -1.4580527544021606, "step": 3010 }, { "epoch": 1.38, "learning_rate": 1.8011161846778282e-07, "logits/chosen": -0.4213915765285492, "logits/rejected": -0.45495182275772095, "logps/chosen": -122.38909912109375, "logps/rejected": -95.29975891113281, "loss": 0.1814, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.3720550537109375, "rewards/margins": 4.8198981285095215, "rewards/rejected": -1.4478428363800049, "step": 3020 }, { "epoch": 1.38, "learning_rate": 1.796042617960426e-07, "logits/chosen": -0.422152578830719, "logits/rejected": -0.4571755826473236, "logps/chosen": -127.7369155883789, "logps/rejected": -95.61505889892578, "loss": 0.1488, "rewards/accuracies": 0.875, "rewards/chosen": 3.14093279838562, "rewards/margins": 4.787783622741699, "rewards/rejected": -1.6468513011932373, "step": 3030 }, { "epoch": 1.39, "learning_rate": 1.7909690512430235e-07, "logits/chosen": -0.43789142370224, "logits/rejected": -0.4707309305667877, "logps/chosen": -117.2684097290039, "logps/rejected": -94.7235336303711, "loss": 0.1642, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.235317230224609, "rewards/margins": 5.704260349273682, "rewards/rejected": -1.4689429998397827, "step": 3040 }, { "epoch": 1.39, "learning_rate": 1.7858954845256212e-07, "logits/chosen": -0.39825141429901123, "logits/rejected": -0.4274836480617523, "logps/chosen": -117.91035461425781, "logps/rejected": -99.51007843017578, "loss": 0.1547, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.6470961570739746, "rewards/margins": 5.261767864227295, "rewards/rejected": -1.6146717071533203, "step": 3050 }, { "epoch": 1.4, "learning_rate": 1.780821917808219e-07, "logits/chosen": -0.41886386275291443, "logits/rejected": -0.44838714599609375, "logps/chosen": -117.38224792480469, "logps/rejected": -101.48182678222656, "loss": 0.1534, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.344783067703247, "rewards/margins": 4.771211624145508, "rewards/rejected": -1.426428198814392, "step": 3060 }, { "epoch": 1.4, "learning_rate": 1.7757483510908165e-07, "logits/chosen": -0.4153475761413574, "logits/rejected": -0.4372943043708801, "logps/chosen": -113.58921813964844, "logps/rejected": -96.70695495605469, "loss": 0.1712, "rewards/accuracies": 0.9375, "rewards/chosen": 3.796396255493164, "rewards/margins": 5.346527099609375, "rewards/rejected": -1.55013108253479, "step": 3070 }, { "epoch": 1.41, "learning_rate": 1.7706747843734142e-07, "logits/chosen": -0.40615734457969666, "logits/rejected": -0.4455091953277588, "logps/chosen": -115.18067932128906, "logps/rejected": -89.9873275756836, "loss": 0.1599, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.7986888885498047, "rewards/margins": 5.377463340759277, "rewards/rejected": -1.5787745714187622, "step": 3080 }, { "epoch": 1.41, "learning_rate": 1.765601217656012e-07, "logits/chosen": -0.39819687604904175, "logits/rejected": -0.4362192153930664, "logps/chosen": -120.87335205078125, "logps/rejected": -93.75814056396484, "loss": 0.1579, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.934561252593994, "rewards/margins": 5.506875514984131, "rewards/rejected": -1.572314977645874, "step": 3090 }, { "epoch": 1.42, "learning_rate": 1.7605276509386095e-07, "logits/chosen": -0.3975370526313782, "logits/rejected": -0.4346223473548889, "logps/chosen": -115.0535888671875, "logps/rejected": -96.48837280273438, "loss": 0.1785, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.473949432373047, "rewards/margins": 4.882634162902832, "rewards/rejected": -1.4086847305297852, "step": 3100 }, { "epoch": 1.42, "eval_logits/chosen": -0.39869609475135803, "eval_logits/rejected": -0.42796045541763306, "eval_logps/chosen": -116.34907531738281, "eval_logps/rejected": -94.18682098388672, "eval_loss": 0.18402154743671417, "eval_rewards/accuracies": 0.924580991268158, "eval_rewards/chosen": 3.4053359031677246, "eval_rewards/margins": 5.033909797668457, "eval_rewards/rejected": -1.6285736560821533, "eval_runtime": 898.2256, "eval_samples_per_second": 3.186, "eval_steps_per_second": 0.199, "step": 3100 }, { "epoch": 1.42, "learning_rate": 1.7554540842212072e-07, "logits/chosen": -0.4134501516819, "logits/rejected": -0.44677990674972534, "logps/chosen": -121.47572326660156, "logps/rejected": -93.6028060913086, "loss": 0.1444, "rewards/accuracies": 0.9375, "rewards/chosen": 3.5641727447509766, "rewards/margins": 4.663364410400391, "rewards/rejected": -1.0991913080215454, "step": 3110 }, { "epoch": 1.42, "learning_rate": 1.750380517503805e-07, "logits/chosen": -0.4042896330356598, "logits/rejected": -0.44077545404434204, "logps/chosen": -121.50472259521484, "logps/rejected": -96.93872833251953, "loss": 0.1782, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.516829013824463, "rewards/margins": 5.27219295501709, "rewards/rejected": -1.7553646564483643, "step": 3120 }, { "epoch": 1.43, "learning_rate": 1.7453069507864025e-07, "logits/chosen": -0.4291275441646576, "logits/rejected": -0.4636690020561218, "logps/chosen": -116.3439712524414, "logps/rejected": -87.82887268066406, "loss": 0.1881, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.2129287719726562, "rewards/margins": 4.926560401916504, "rewards/rejected": -1.7136313915252686, "step": 3130 }, { "epoch": 1.43, "learning_rate": 1.7402333840690002e-07, "logits/chosen": -0.42489296197891235, "logits/rejected": -0.4597654938697815, "logps/chosen": -119.62519836425781, "logps/rejected": -92.69261169433594, "loss": 0.1501, "rewards/accuracies": 0.9375, "rewards/chosen": 2.7139596939086914, "rewards/margins": 4.3475847244262695, "rewards/rejected": -1.6336256265640259, "step": 3140 }, { "epoch": 1.44, "learning_rate": 1.735159817351598e-07, "logits/chosen": -0.38904017210006714, "logits/rejected": -0.4342716336250305, "logps/chosen": -114.5102310180664, "logps/rejected": -99.30461120605469, "loss": 0.1625, "rewards/accuracies": 0.9375, "rewards/chosen": 3.767416000366211, "rewards/margins": 5.650642395019531, "rewards/rejected": -1.8832263946533203, "step": 3150 }, { "epoch": 1.44, "learning_rate": 1.7300862506341955e-07, "logits/chosen": -0.41989022493362427, "logits/rejected": -0.44726991653442383, "logps/chosen": -127.139892578125, "logps/rejected": -98.16439819335938, "loss": 0.1631, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.7642390727996826, "rewards/margins": 5.772668361663818, "rewards/rejected": -2.008429765701294, "step": 3160 }, { "epoch": 1.45, "learning_rate": 1.7250126839167932e-07, "logits/chosen": -0.3980824053287506, "logits/rejected": -0.4251154065132141, "logps/chosen": -122.60902404785156, "logps/rejected": -99.0849380493164, "loss": 0.1854, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.4130959510803223, "rewards/margins": 4.698065757751465, "rewards/rejected": -1.2849695682525635, "step": 3170 }, { "epoch": 1.45, "learning_rate": 1.719939117199391e-07, "logits/chosen": -0.42671066522598267, "logits/rejected": -0.4508728086948395, "logps/chosen": -117.6341323852539, "logps/rejected": -98.55741882324219, "loss": 0.1278, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 4.02409029006958, "rewards/margins": 5.484737396240234, "rewards/rejected": -1.4606469869613647, "step": 3180 }, { "epoch": 1.46, "learning_rate": 1.7148655504819885e-07, "logits/chosen": -0.43290767073631287, "logits/rejected": -0.47308388352394104, "logps/chosen": -118.79693603515625, "logps/rejected": -98.24039459228516, "loss": 0.1406, "rewards/accuracies": 0.9375, "rewards/chosen": 4.303812026977539, "rewards/margins": 6.049398422241211, "rewards/rejected": -1.7455860376358032, "step": 3190 }, { "epoch": 1.46, "learning_rate": 1.7097919837645862e-07, "logits/chosen": -0.4589117169380188, "logits/rejected": -0.48393720388412476, "logps/chosen": -120.695556640625, "logps/rejected": -98.32371520996094, "loss": 0.1544, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.8730056285858154, "rewards/margins": 5.28024435043335, "rewards/rejected": -2.407238483428955, "step": 3200 }, { "epoch": 1.46, "eval_logits/chosen": -0.4308675527572632, "eval_logits/rejected": -0.4623854160308838, "eval_logps/chosen": -116.15389251708984, "eval_logps/rejected": -94.2074966430664, "eval_loss": 0.16856719553470612, "eval_rewards/accuracies": 0.9217877388000488, "eval_rewards/chosen": 3.5029242038726807, "eval_rewards/margins": 5.141838073730469, "eval_rewards/rejected": -1.6389143466949463, "eval_runtime": 914.3632, "eval_samples_per_second": 3.13, "eval_steps_per_second": 0.196, "step": 3200 }, { "epoch": 1.47, "learning_rate": 1.704718417047184e-07, "logits/chosen": -0.4398719370365143, "logits/rejected": -0.4648303985595703, "logps/chosen": -111.4161376953125, "logps/rejected": -92.91444396972656, "loss": 0.15, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.9489493370056152, "rewards/margins": 4.732758522033691, "rewards/rejected": -1.783808708190918, "step": 3210 }, { "epoch": 1.47, "learning_rate": 1.6996448503297815e-07, "logits/chosen": -0.40833503007888794, "logits/rejected": -0.43593111634254456, "logps/chosen": -113.5337905883789, "logps/rejected": -99.6334228515625, "loss": 0.1369, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.5159003734588623, "rewards/margins": 5.7945661544799805, "rewards/rejected": -2.2786660194396973, "step": 3220 }, { "epoch": 1.47, "learning_rate": 1.6945712836123792e-07, "logits/chosen": -0.4148642122745514, "logits/rejected": -0.45154523849487305, "logps/chosen": -114.74641418457031, "logps/rejected": -94.3124008178711, "loss": 0.148, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.7251458168029785, "rewards/margins": 5.643283843994141, "rewards/rejected": -1.9181379079818726, "step": 3230 }, { "epoch": 1.48, "learning_rate": 1.689497716894977e-07, "logits/chosen": -0.40678897500038147, "logits/rejected": -0.4366432726383209, "logps/chosen": -121.0203628540039, "logps/rejected": -100.17606353759766, "loss": 0.1528, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.0067360401153564, "rewards/margins": 5.704981803894043, "rewards/rejected": -2.6982460021972656, "step": 3240 }, { "epoch": 1.48, "learning_rate": 1.6844241501775745e-07, "logits/chosen": -0.41396284103393555, "logits/rejected": -0.44056564569473267, "logps/chosen": -117.8901138305664, "logps/rejected": -96.82243347167969, "loss": 0.1619, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.752368927001953, "rewards/margins": 5.435792922973633, "rewards/rejected": -1.6834239959716797, "step": 3250 }, { "epoch": 1.49, "learning_rate": 1.6793505834601722e-07, "logits/chosen": -0.4228527545928955, "logits/rejected": -0.4528846740722656, "logps/chosen": -110.54814147949219, "logps/rejected": -98.03990936279297, "loss": 0.1739, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.3309082984924316, "rewards/margins": 4.948666095733643, "rewards/rejected": -1.617757797241211, "step": 3260 }, { "epoch": 1.49, "learning_rate": 1.67427701674277e-07, "logits/chosen": -0.4388189911842346, "logits/rejected": -0.4577499330043793, "logps/chosen": -114.64045715332031, "logps/rejected": -94.00943756103516, "loss": 0.1794, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3593318462371826, "rewards/margins": 5.009249687194824, "rewards/rejected": -1.6499179601669312, "step": 3270 }, { "epoch": 1.5, "learning_rate": 1.6692034500253675e-07, "logits/chosen": -0.4215324819087982, "logits/rejected": -0.44902530312538147, "logps/chosen": -116.54881286621094, "logps/rejected": -97.64686584472656, "loss": 0.1837, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.203897953033447, "rewards/margins": 5.84643030166626, "rewards/rejected": -1.6425319910049438, "step": 3280 }, { "epoch": 1.5, "learning_rate": 1.6641298833079652e-07, "logits/chosen": -0.4341822564601898, "logits/rejected": -0.4602430760860443, "logps/chosen": -116.70965576171875, "logps/rejected": -92.82569885253906, "loss": 0.14, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.9068729877471924, "rewards/margins": 5.627469062805176, "rewards/rejected": -1.7205965518951416, "step": 3290 }, { "epoch": 1.51, "learning_rate": 1.659056316590563e-07, "logits/chosen": -0.39262399077415466, "logits/rejected": -0.42815399169921875, "logps/chosen": -110.6005630493164, "logps/rejected": -90.9563217163086, "loss": 0.1492, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.0928051471710205, "rewards/margins": 4.729269027709961, "rewards/rejected": -1.6364643573760986, "step": 3300 }, { "epoch": 1.51, "eval_logits/chosen": -0.39426207542419434, "eval_logits/rejected": -0.41480591893196106, "eval_logps/chosen": -116.58888244628906, "eval_logps/rejected": -94.54850006103516, "eval_loss": 0.17055107653141022, "eval_rewards/accuracies": 0.9329608678817749, "eval_rewards/chosen": 3.2854251861572266, "eval_rewards/margins": 5.0948357582092285, "eval_rewards/rejected": -1.8094104528427124, "eval_runtime": 859.6929, "eval_samples_per_second": 3.329, "eval_steps_per_second": 0.208, "step": 3300 }, { "epoch": 1.51, "learning_rate": 1.6539827498731605e-07, "logits/chosen": -0.4324941635131836, "logits/rejected": -0.46528664231300354, "logps/chosen": -120.0330581665039, "logps/rejected": -95.51181030273438, "loss": 0.1694, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3808555603027344, "rewards/margins": 5.3127055168151855, "rewards/rejected": -1.9318501949310303, "step": 3310 }, { "epoch": 1.52, "learning_rate": 1.6489091831557582e-07, "logits/chosen": -0.3996530771255493, "logits/rejected": -0.42007988691329956, "logps/chosen": -122.48661804199219, "logps/rejected": -97.3335189819336, "loss": 0.1591, "rewards/accuracies": 0.9375, "rewards/chosen": 3.3983426094055176, "rewards/margins": 5.0991926193237305, "rewards/rejected": -1.7008495330810547, "step": 3320 }, { "epoch": 1.52, "learning_rate": 1.643835616438356e-07, "logits/chosen": -0.41088947653770447, "logits/rejected": -0.4415004849433899, "logps/chosen": -115.9881591796875, "logps/rejected": -96.96780395507812, "loss": 0.1691, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4160733222961426, "rewards/margins": 4.396543979644775, "rewards/rejected": -1.980470895767212, "step": 3330 }, { "epoch": 1.52, "learning_rate": 1.6387620497209535e-07, "logits/chosen": -0.4284709095954895, "logits/rejected": -0.45423832535743713, "logps/chosen": -112.177734375, "logps/rejected": -97.37242126464844, "loss": 0.1564, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.750192165374756, "rewards/margins": 5.037611961364746, "rewards/rejected": -1.2874199151992798, "step": 3340 }, { "epoch": 1.53, "learning_rate": 1.6336884830035512e-07, "logits/chosen": -0.42687755823135376, "logits/rejected": -0.4660683572292328, "logps/chosen": -115.24296569824219, "logps/rejected": -96.22698974609375, "loss": 0.1809, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4257748126983643, "rewards/margins": 5.423032760620117, "rewards/rejected": -1.997257947921753, "step": 3350 }, { "epoch": 1.53, "learning_rate": 1.6286149162861489e-07, "logits/chosen": -0.4044269621372223, "logits/rejected": -0.44033893942832947, "logps/chosen": -117.3617935180664, "logps/rejected": -96.61048889160156, "loss": 0.1539, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.7865517139434814, "rewards/margins": 5.485646724700928, "rewards/rejected": -1.6990951299667358, "step": 3360 }, { "epoch": 1.54, "learning_rate": 1.6235413495687465e-07, "logits/chosen": -0.4385055899620056, "logits/rejected": -0.46612605452537537, "logps/chosen": -115.01204681396484, "logps/rejected": -89.59496307373047, "loss": 0.1509, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.0572566986083984, "rewards/margins": 4.691444396972656, "rewards/rejected": -1.6341876983642578, "step": 3370 }, { "epoch": 1.54, "learning_rate": 1.6184677828513442e-07, "logits/chosen": -0.414304256439209, "logits/rejected": -0.44513431191444397, "logps/chosen": -121.54780578613281, "logps/rejected": -95.90415954589844, "loss": 0.1737, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6485812664031982, "rewards/margins": 5.4005937576293945, "rewards/rejected": -1.7520118951797485, "step": 3380 }, { "epoch": 1.55, "learning_rate": 1.613394216133942e-07, "logits/chosen": -0.42347556352615356, "logits/rejected": -0.4630081057548523, "logps/chosen": -119.10302734375, "logps/rejected": -93.24076843261719, "loss": 0.1655, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.597658157348633, "rewards/margins": 5.3917436599731445, "rewards/rejected": -1.7940857410430908, "step": 3390 }, { "epoch": 1.55, "learning_rate": 1.6083206494165398e-07, "logits/chosen": -0.42666107416152954, "logits/rejected": -0.45836448669433594, "logps/chosen": -119.47938537597656, "logps/rejected": -92.59451293945312, "loss": 0.1719, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.071565628051758, "rewards/margins": 4.682357311248779, "rewards/rejected": -1.6107919216156006, "step": 3400 }, { "epoch": 1.55, "eval_logits/chosen": -0.42531585693359375, "eval_logits/rejected": -0.45423340797424316, "eval_logps/chosen": -116.13007354736328, "eval_logps/rejected": -94.42098236083984, "eval_loss": 0.16907574236392975, "eval_rewards/accuracies": 0.9273743033409119, "eval_rewards/chosen": 3.514838695526123, "eval_rewards/margins": 5.260493278503418, "eval_rewards/rejected": -1.7456541061401367, "eval_runtime": 896.728, "eval_samples_per_second": 3.192, "eval_steps_per_second": 0.2, "step": 3400 }, { "epoch": 1.56, "learning_rate": 1.6032470826991375e-07, "logits/chosen": -0.40940365195274353, "logits/rejected": -0.4444305896759033, "logps/chosen": -111.24534606933594, "logps/rejected": -94.37378692626953, "loss": 0.1535, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.5870659351348877, "rewards/margins": 5.215790271759033, "rewards/rejected": -1.628724455833435, "step": 3410 }, { "epoch": 1.56, "learning_rate": 1.598173515981735e-07, "logits/chosen": -0.42105579376220703, "logits/rejected": -0.45929020643234253, "logps/chosen": -118.6488037109375, "logps/rejected": -92.52164459228516, "loss": 0.1443, "rewards/accuracies": 0.9375, "rewards/chosen": 3.7879090309143066, "rewards/margins": 5.258182048797607, "rewards/rejected": -1.4702732563018799, "step": 3420 }, { "epoch": 1.57, "learning_rate": 1.5930999492643328e-07, "logits/chosen": -0.4476688504219055, "logits/rejected": -0.46989208459854126, "logps/chosen": -118.64167785644531, "logps/rejected": -95.72476959228516, "loss": 0.1678, "rewards/accuracies": 0.9375, "rewards/chosen": 3.612441301345825, "rewards/margins": 5.394201755523682, "rewards/rejected": -1.7817604541778564, "step": 3430 }, { "epoch": 1.57, "learning_rate": 1.5880263825469305e-07, "logits/chosen": -0.41882842779159546, "logits/rejected": -0.4446406364440918, "logps/chosen": -116.1468276977539, "logps/rejected": -94.4334945678711, "loss": 0.1471, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.578800678253174, "rewards/margins": 5.660478591918945, "rewards/rejected": -2.0816779136657715, "step": 3440 }, { "epoch": 1.57, "learning_rate": 1.582952815829528e-07, "logits/chosen": -0.3946971893310547, "logits/rejected": -0.4405464231967926, "logps/chosen": -121.08565521240234, "logps/rejected": -97.06616973876953, "loss": 0.1506, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.8751914501190186, "rewards/margins": 5.678435325622559, "rewards/rejected": -1.8032439947128296, "step": 3450 }, { "epoch": 1.58, "learning_rate": 1.5778792491121258e-07, "logits/chosen": -0.4397282004356384, "logits/rejected": -0.46519118547439575, "logps/chosen": -120.30741882324219, "logps/rejected": -94.88092803955078, "loss": 0.1525, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.6533420085906982, "rewards/margins": 5.589390754699707, "rewards/rejected": -1.9360488653182983, "step": 3460 }, { "epoch": 1.58, "learning_rate": 1.5728056823947235e-07, "logits/chosen": -0.39912813901901245, "logits/rejected": -0.43202313780784607, "logps/chosen": -112.98824310302734, "logps/rejected": -94.73600769042969, "loss": 0.1654, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.3695907592773438, "rewards/margins": 5.6515021324157715, "rewards/rejected": -2.2819111347198486, "step": 3470 }, { "epoch": 1.59, "learning_rate": 1.567732115677321e-07, "logits/chosen": -0.42645302414894104, "logits/rejected": -0.4477473199367523, "logps/chosen": -118.22267150878906, "logps/rejected": -100.73336029052734, "loss": 0.1504, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.873511791229248, "rewards/margins": 5.4625935554504395, "rewards/rejected": -1.5890812873840332, "step": 3480 }, { "epoch": 1.59, "learning_rate": 1.5626585489599188e-07, "logits/chosen": -0.43139228224754333, "logits/rejected": -0.4608747065067291, "logps/chosen": -120.2907943725586, "logps/rejected": -100.96830749511719, "loss": 0.153, "rewards/accuracies": 0.9375, "rewards/chosen": 3.3151302337646484, "rewards/margins": 5.562079906463623, "rewards/rejected": -2.246950149536133, "step": 3490 }, { "epoch": 1.6, "learning_rate": 1.5575849822425165e-07, "logits/chosen": -0.4321955144405365, "logits/rejected": -0.46664899587631226, "logps/chosen": -114.3071060180664, "logps/rejected": -94.70503234863281, "loss": 0.1905, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.5885214805603027, "rewards/margins": 5.260445594787598, "rewards/rejected": -1.671923279762268, "step": 3500 }, { "epoch": 1.6, "eval_logits/chosen": -0.4189249873161316, "eval_logits/rejected": -0.4479340612888336, "eval_logps/chosen": -116.17154693603516, "eval_logps/rejected": -94.42037963867188, "eval_loss": 0.17187124490737915, "eval_rewards/accuracies": 0.924580991268158, "eval_rewards/chosen": 3.4941017627716064, "eval_rewards/margins": 5.239455699920654, "eval_rewards/rejected": -1.7453538179397583, "eval_runtime": 908.4719, "eval_samples_per_second": 3.15, "eval_steps_per_second": 0.197, "step": 3500 }, { "epoch": 1.6, "learning_rate": 1.552511415525114e-07, "logits/chosen": -0.4244809150695801, "logits/rejected": -0.44996365904808044, "logps/chosen": -115.8191146850586, "logps/rejected": -95.07186126708984, "loss": 0.1318, "rewards/accuracies": 0.9375, "rewards/chosen": 3.3521361351013184, "rewards/margins": 5.482035160064697, "rewards/rejected": -2.1298987865448, "step": 3510 }, { "epoch": 1.61, "learning_rate": 1.5474378488077118e-07, "logits/chosen": -0.3949509263038635, "logits/rejected": -0.4299197793006897, "logps/chosen": -120.37969970703125, "logps/rejected": -100.93434143066406, "loss": 0.135, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.4647655487060547, "rewards/margins": 5.028170108795166, "rewards/rejected": -1.5634050369262695, "step": 3520 }, { "epoch": 1.61, "learning_rate": 1.5423642820903095e-07, "logits/chosen": -0.43281999230384827, "logits/rejected": -0.46290507912635803, "logps/chosen": -121.1761474609375, "logps/rejected": -98.9190673828125, "loss": 0.1699, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.336357831954956, "rewards/margins": 4.971770286560059, "rewards/rejected": -1.635412573814392, "step": 3530 }, { "epoch": 1.62, "learning_rate": 1.537290715372907e-07, "logits/chosen": -0.42768391966819763, "logits/rejected": -0.44896143674850464, "logps/chosen": -116.26780700683594, "logps/rejected": -94.6815414428711, "loss": 0.1486, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.7715530395507812, "rewards/margins": 5.036046504974365, "rewards/rejected": -1.264493465423584, "step": 3540 }, { "epoch": 1.62, "learning_rate": 1.5322171486555048e-07, "logits/chosen": -0.4007837176322937, "logits/rejected": -0.44758158922195435, "logps/chosen": -130.20358276367188, "logps/rejected": -99.24810028076172, "loss": 0.1621, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 5.0301408767700195, "rewards/margins": 6.414588928222656, "rewards/rejected": -1.3844481706619263, "step": 3550 }, { "epoch": 1.63, "learning_rate": 1.5271435819381025e-07, "logits/chosen": -0.38824278116226196, "logits/rejected": -0.4197087287902832, "logps/chosen": -126.24420166015625, "logps/rejected": -102.49394226074219, "loss": 0.141, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.869976758956909, "rewards/margins": 5.813518524169922, "rewards/rejected": -1.9435417652130127, "step": 3560 }, { "epoch": 1.63, "learning_rate": 1.5220700152207e-07, "logits/chosen": -0.4258262515068054, "logits/rejected": -0.4580133855342865, "logps/chosen": -123.7559585571289, "logps/rejected": -95.52310180664062, "loss": 0.1509, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 4.197176456451416, "rewards/margins": 5.937564849853516, "rewards/rejected": -1.74038827419281, "step": 3570 }, { "epoch": 1.63, "learning_rate": 1.5169964485032978e-07, "logits/chosen": -0.43848925828933716, "logits/rejected": -0.4572725296020508, "logps/chosen": -113.8507080078125, "logps/rejected": -95.45315551757812, "loss": 0.1559, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.581429958343506, "rewards/margins": 5.606729507446289, "rewards/rejected": -2.025299310684204, "step": 3580 }, { "epoch": 1.64, "learning_rate": 1.5119228817858955e-07, "logits/chosen": -0.4294334053993225, "logits/rejected": -0.44948524236679077, "logps/chosen": -124.73026275634766, "logps/rejected": -98.81682586669922, "loss": 0.1388, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.6138317584991455, "rewards/margins": 5.703227996826172, "rewards/rejected": -2.0893959999084473, "step": 3590 }, { "epoch": 1.64, "learning_rate": 1.506849315068493e-07, "logits/chosen": -0.4376956820487976, "logits/rejected": -0.4688642621040344, "logps/chosen": -121.80577087402344, "logps/rejected": -94.1232681274414, "loss": 0.1354, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.6063103675842285, "rewards/margins": 5.1961469650268555, "rewards/rejected": -1.5898375511169434, "step": 3600 }, { "epoch": 1.64, "eval_logits/chosen": -0.4303247928619385, "eval_logits/rejected": -0.4608076214790344, "eval_logps/chosen": -116.08953857421875, "eval_logps/rejected": -94.33447265625, "eval_loss": 0.1748729944229126, "eval_rewards/accuracies": 0.910614550113678, "eval_rewards/chosen": 3.535107374191284, "eval_rewards/margins": 5.23750638961792, "eval_rewards/rejected": -1.702398657798767, "eval_runtime": 917.8792, "eval_samples_per_second": 3.118, "eval_steps_per_second": 0.195, "step": 3600 }, { "epoch": 1.65, "learning_rate": 1.5017757483510908e-07, "logits/chosen": -0.43969064950942993, "logits/rejected": -0.47763586044311523, "logps/chosen": -118.63066101074219, "logps/rejected": -100.40098571777344, "loss": 0.1322, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.01759672164917, "rewards/margins": 6.038904666900635, "rewards/rejected": -2.0213077068328857, "step": 3610 }, { "epoch": 1.65, "learning_rate": 1.4967021816336885e-07, "logits/chosen": -0.44949302077293396, "logits/rejected": -0.47637391090393066, "logps/chosen": -113.33602142333984, "logps/rejected": -91.21766662597656, "loss": 0.1721, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.409508466720581, "rewards/margins": 5.6164231300354, "rewards/rejected": -2.2069146633148193, "step": 3620 }, { "epoch": 1.66, "learning_rate": 1.491628614916286e-07, "logits/chosen": -0.44279003143310547, "logits/rejected": -0.47035011649131775, "logps/chosen": -124.08707427978516, "logps/rejected": -104.35624694824219, "loss": 0.1456, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.6684200763702393, "rewards/margins": 5.043892860412598, "rewards/rejected": -1.375472903251648, "step": 3630 }, { "epoch": 1.66, "learning_rate": 1.4865550481988838e-07, "logits/chosen": -0.4371957778930664, "logits/rejected": -0.4689660966396332, "logps/chosen": -115.41943359375, "logps/rejected": -98.3012466430664, "loss": 0.1469, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.81248140335083, "rewards/margins": 5.492468357086182, "rewards/rejected": -1.6799873113632202, "step": 3640 }, { "epoch": 1.67, "learning_rate": 1.4814814814814815e-07, "logits/chosen": -0.4131147265434265, "logits/rejected": -0.4399244785308838, "logps/chosen": -124.3476333618164, "logps/rejected": -95.99518585205078, "loss": 0.1569, "rewards/accuracies": 0.9375, "rewards/chosen": 3.6779167652130127, "rewards/margins": 4.953145503997803, "rewards/rejected": -1.2752286195755005, "step": 3650 }, { "epoch": 1.67, "learning_rate": 1.476407914764079e-07, "logits/chosen": -0.4251777231693268, "logits/rejected": -0.4502864480018616, "logps/chosen": -124.25196838378906, "logps/rejected": -97.07969665527344, "loss": 0.1475, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.5780227184295654, "rewards/margins": 5.2519121170043945, "rewards/rejected": -1.67388916015625, "step": 3660 }, { "epoch": 1.68, "learning_rate": 1.4713343480466768e-07, "logits/chosen": -0.41656866669654846, "logits/rejected": -0.4453061521053314, "logps/chosen": -115.84672546386719, "logps/rejected": -97.28424072265625, "loss": 0.1387, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.4806201457977295, "rewards/margins": 5.611165523529053, "rewards/rejected": -2.1305456161499023, "step": 3670 }, { "epoch": 1.68, "learning_rate": 1.4662607813292745e-07, "logits/chosen": -0.4140985906124115, "logits/rejected": -0.4425446093082428, "logps/chosen": -122.37489318847656, "logps/rejected": -94.66325378417969, "loss": 0.1612, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.579188585281372, "rewards/margins": 5.4639482498168945, "rewards/rejected": -1.8847599029541016, "step": 3680 }, { "epoch": 1.68, "learning_rate": 1.461187214611872e-07, "logits/chosen": -0.45093196630477905, "logits/rejected": -0.4685111939907074, "logps/chosen": -117.92506408691406, "logps/rejected": -95.20411682128906, "loss": 0.1499, "rewards/accuracies": 0.9375, "rewards/chosen": 3.947640895843506, "rewards/margins": 5.4805779457092285, "rewards/rejected": -1.5329368114471436, "step": 3690 }, { "epoch": 1.69, "learning_rate": 1.4561136478944698e-07, "logits/chosen": -0.4353989064693451, "logits/rejected": -0.4571017324924469, "logps/chosen": -116.68096923828125, "logps/rejected": -99.04996490478516, "loss": 0.1644, "rewards/accuracies": 0.875, "rewards/chosen": 3.5987677574157715, "rewards/margins": 5.514004707336426, "rewards/rejected": -1.9152368307113647, "step": 3700 }, { "epoch": 1.69, "eval_logits/chosen": -0.41919150948524475, "eval_logits/rejected": -0.4468615651130676, "eval_logps/chosen": -116.0125503540039, "eval_logps/rejected": -94.24568176269531, "eval_loss": 0.1596728265285492, "eval_rewards/accuracies": 0.924580991268158, "eval_rewards/chosen": 3.5735957622528076, "eval_rewards/margins": 5.23159646987915, "eval_rewards/rejected": -1.6580007076263428, "eval_runtime": 905.4856, "eval_samples_per_second": 3.161, "eval_steps_per_second": 0.198, "step": 3700 }, { "epoch": 1.69, "learning_rate": 1.4510400811770675e-07, "logits/chosen": -0.444713830947876, "logits/rejected": -0.4759383201599121, "logps/chosen": -132.91668701171875, "logps/rejected": -95.53848266601562, "loss": 0.1504, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.616512775421143, "rewards/margins": 5.9059953689575195, "rewards/rejected": -1.2894827127456665, "step": 3710 }, { "epoch": 1.7, "learning_rate": 1.445966514459665e-07, "logits/chosen": -0.42255353927612305, "logits/rejected": -0.45838356018066406, "logps/chosen": -115.45365142822266, "logps/rejected": -95.69837951660156, "loss": 0.1571, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.2308132648468018, "rewards/margins": 5.298513889312744, "rewards/rejected": -2.0677008628845215, "step": 3720 }, { "epoch": 1.7, "learning_rate": 1.4408929477422628e-07, "logits/chosen": -0.4201040267944336, "logits/rejected": -0.44681286811828613, "logps/chosen": -119.80631256103516, "logps/rejected": -95.5534439086914, "loss": 0.1594, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.08919620513916, "rewards/margins": 5.784335613250732, "rewards/rejected": -1.6951395273208618, "step": 3730 }, { "epoch": 1.71, "learning_rate": 1.4358193810248604e-07, "logits/chosen": -0.4364323019981384, "logits/rejected": -0.4687287211418152, "logps/chosen": -117.48023986816406, "logps/rejected": -95.4148178100586, "loss": 0.1455, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.810602903366089, "rewards/margins": 5.867162227630615, "rewards/rejected": -2.0565590858459473, "step": 3740 }, { "epoch": 1.71, "learning_rate": 1.430745814307458e-07, "logits/chosen": -0.4337928295135498, "logits/rejected": -0.4612889289855957, "logps/chosen": -118.15765380859375, "logps/rejected": -97.46696472167969, "loss": 0.1727, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.426650285720825, "rewards/margins": 5.878968238830566, "rewards/rejected": -2.452317953109741, "step": 3750 }, { "epoch": 1.72, "learning_rate": 1.4256722475900558e-07, "logits/chosen": -0.40918678045272827, "logits/rejected": -0.4386422634124756, "logps/chosen": -117.19795989990234, "logps/rejected": -95.10436248779297, "loss": 0.1812, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.7097976207733154, "rewards/margins": 5.490555763244629, "rewards/rejected": -1.7807576656341553, "step": 3760 }, { "epoch": 1.72, "learning_rate": 1.4205986808726534e-07, "logits/chosen": -0.41621828079223633, "logits/rejected": -0.43877357244491577, "logps/chosen": -118.92469787597656, "logps/rejected": -101.1650619506836, "loss": 0.1475, "rewards/accuracies": 0.9375, "rewards/chosen": 3.605703830718994, "rewards/margins": 5.508579254150391, "rewards/rejected": -1.9028756618499756, "step": 3770 }, { "epoch": 1.73, "learning_rate": 1.415525114155251e-07, "logits/chosen": -0.43570631742477417, "logits/rejected": -0.4763062000274658, "logps/chosen": -125.0984878540039, "logps/rejected": -97.47032165527344, "loss": 0.1453, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.5961880683898926, "rewards/margins": 5.880617618560791, "rewards/rejected": -2.2844297885894775, "step": 3780 }, { "epoch": 1.73, "learning_rate": 1.4104515474378488e-07, "logits/chosen": -0.4550606608390808, "logits/rejected": -0.4833446145057678, "logps/chosen": -123.15104675292969, "logps/rejected": -93.86997985839844, "loss": 0.1366, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.7832932472229004, "rewards/margins": 5.126801490783691, "rewards/rejected": -1.343508005142212, "step": 3790 }, { "epoch": 1.73, "learning_rate": 1.4053779807204464e-07, "logits/chosen": -0.4106081426143646, "logits/rejected": -0.4471622407436371, "logps/chosen": -119.42298889160156, "logps/rejected": -92.5013198852539, "loss": 0.1598, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.9610819816589355, "rewards/margins": 4.718194007873535, "rewards/rejected": -1.7571115493774414, "step": 3800 }, { "epoch": 1.73, "eval_logits/chosen": -0.4349251985549927, "eval_logits/rejected": -0.4631372094154358, "eval_logps/chosen": -115.83056640625, "eval_logps/rejected": -94.33674621582031, "eval_loss": 0.1612546443939209, "eval_rewards/accuracies": 0.9078212380409241, "eval_rewards/chosen": 3.6645917892456055, "eval_rewards/margins": 5.368130683898926, "eval_rewards/rejected": -1.7035386562347412, "eval_runtime": 908.5705, "eval_samples_per_second": 3.15, "eval_steps_per_second": 0.197, "step": 3800 }, { "epoch": 1.74, "learning_rate": 1.400304414003044e-07, "logits/chosen": -0.4099394679069519, "logits/rejected": -0.43724679946899414, "logps/chosen": -116.51655578613281, "logps/rejected": -99.12494659423828, "loss": 0.1608, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.131943941116333, "rewards/margins": 5.072139263153076, "rewards/rejected": -1.9401954412460327, "step": 3810 }, { "epoch": 1.74, "learning_rate": 1.3952308472856418e-07, "logits/chosen": -0.4199654161930084, "logits/rejected": -0.44734907150268555, "logps/chosen": -116.4480209350586, "logps/rejected": -91.37544250488281, "loss": 0.1617, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.9373676776885986, "rewards/margins": 5.529845237731934, "rewards/rejected": -1.592477560043335, "step": 3820 }, { "epoch": 1.75, "learning_rate": 1.3901572805682394e-07, "logits/chosen": -0.4397161602973938, "logits/rejected": -0.4739462435245514, "logps/chosen": -123.17852783203125, "logps/rejected": -97.94535827636719, "loss": 0.1313, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.092246055603027, "rewards/margins": 6.0095133781433105, "rewards/rejected": -1.9172674417495728, "step": 3830 }, { "epoch": 1.75, "learning_rate": 1.385083713850837e-07, "logits/chosen": -0.44692462682724, "logits/rejected": -0.47341424226760864, "logps/chosen": -121.00887298583984, "logps/rejected": -92.6983413696289, "loss": 0.1729, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.020684719085693, "rewards/margins": 6.22866678237915, "rewards/rejected": -2.207981824874878, "step": 3840 }, { "epoch": 1.76, "learning_rate": 1.3800101471334348e-07, "logits/chosen": -0.39461749792099, "logits/rejected": -0.4253208637237549, "logps/chosen": -118.1397476196289, "logps/rejected": -89.09568786621094, "loss": 0.1343, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4894938468933105, "rewards/margins": 5.35275411605835, "rewards/rejected": -1.8632599115371704, "step": 3850 }, { "epoch": 1.76, "learning_rate": 1.3749365804160324e-07, "logits/chosen": -0.4318575859069824, "logits/rejected": -0.4667425751686096, "logps/chosen": -123.15715026855469, "logps/rejected": -96.49995422363281, "loss": 0.1608, "rewards/accuracies": 0.875, "rewards/chosen": 3.9990055561065674, "rewards/margins": 5.423766136169434, "rewards/rejected": -1.4247612953186035, "step": 3860 }, { "epoch": 1.77, "learning_rate": 1.36986301369863e-07, "logits/chosen": -0.42642942070961, "logits/rejected": -0.46308574080467224, "logps/chosen": -121.68193054199219, "logps/rejected": -100.54917907714844, "loss": 0.1442, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.072659015655518, "rewards/margins": 6.057482719421387, "rewards/rejected": -1.9848238229751587, "step": 3870 }, { "epoch": 1.77, "learning_rate": 1.3647894469812278e-07, "logits/chosen": -0.4128958582878113, "logits/rejected": -0.43185925483703613, "logps/chosen": -123.2667007446289, "logps/rejected": -95.40806579589844, "loss": 0.1661, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.089308738708496, "rewards/margins": 5.6935648918151855, "rewards/rejected": -1.6042560338974, "step": 3880 }, { "epoch": 1.78, "learning_rate": 1.3597158802638254e-07, "logits/chosen": -0.41117334365844727, "logits/rejected": -0.431837260723114, "logps/chosen": -117.51399230957031, "logps/rejected": -99.26336669921875, "loss": 0.1432, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.4673914909362793, "rewards/margins": 5.661924839019775, "rewards/rejected": -2.194532632827759, "step": 3890 }, { "epoch": 1.78, "learning_rate": 1.354642313546423e-07, "logits/chosen": -0.42940855026245117, "logits/rejected": -0.4633054733276367, "logps/chosen": -115.96989440917969, "logps/rejected": -92.38642120361328, "loss": 0.1337, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4177119731903076, "rewards/margins": 5.029784679412842, "rewards/rejected": -1.6120731830596924, "step": 3900 }, { "epoch": 1.78, "eval_logits/chosen": -0.43678709864616394, "eval_logits/rejected": -0.4658050537109375, "eval_logps/chosen": -116.05931091308594, "eval_logps/rejected": -94.6183853149414, "eval_loss": 0.15831047296524048, "eval_rewards/accuracies": 0.9134078025817871, "eval_rewards/chosen": 3.550219774246216, "eval_rewards/margins": 5.394576549530029, "eval_rewards/rejected": -1.844356894493103, "eval_runtime": 910.9099, "eval_samples_per_second": 3.142, "eval_steps_per_second": 0.197, "step": 3900 }, { "epoch": 1.78, "learning_rate": 1.3495687468290208e-07, "logits/chosen": -0.4411635994911194, "logits/rejected": -0.46993690729141235, "logps/chosen": -121.70280456542969, "logps/rejected": -93.5676040649414, "loss": 0.1384, "rewards/accuracies": 0.9375, "rewards/chosen": 4.199113845825195, "rewards/margins": 6.246798992156982, "rewards/rejected": -2.0476861000061035, "step": 3910 }, { "epoch": 1.79, "learning_rate": 1.3444951801116184e-07, "logits/chosen": -0.40244975686073303, "logits/rejected": -0.4448125958442688, "logps/chosen": -121.76031494140625, "logps/rejected": -94.28700256347656, "loss": 0.1325, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.4426021575927734, "rewards/margins": 5.515939235687256, "rewards/rejected": -2.0733370780944824, "step": 3920 }, { "epoch": 1.79, "learning_rate": 1.339421613394216e-07, "logits/chosen": -0.39999261498451233, "logits/rejected": -0.4254913330078125, "logps/chosen": -120.90069580078125, "logps/rejected": -101.6038818359375, "loss": 0.1789, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.0842125415802, "rewards/margins": 5.103273868560791, "rewards/rejected": -2.0190606117248535, "step": 3930 }, { "epoch": 1.8, "learning_rate": 1.3343480466768138e-07, "logits/chosen": -0.4498108923435211, "logits/rejected": -0.48698073625564575, "logps/chosen": -115.94972229003906, "logps/rejected": -92.61611938476562, "loss": 0.1599, "rewards/accuracies": 0.9375, "rewards/chosen": 3.5224971771240234, "rewards/margins": 5.299635410308838, "rewards/rejected": -1.7771377563476562, "step": 3940 }, { "epoch": 1.8, "learning_rate": 1.3292744799594114e-07, "logits/chosen": -0.43744587898254395, "logits/rejected": -0.47521620988845825, "logps/chosen": -114.79386138916016, "logps/rejected": -94.7384262084961, "loss": 0.1607, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.8626174926757812, "rewards/margins": 5.312564373016357, "rewards/rejected": -1.4499469995498657, "step": 3950 }, { "epoch": 1.81, "learning_rate": 1.324200913242009e-07, "logits/chosen": -0.4359627664089203, "logits/rejected": -0.44901376962661743, "logps/chosen": -116.04862976074219, "logps/rejected": -94.40897369384766, "loss": 0.136, "rewards/accuracies": 0.9375, "rewards/chosen": 3.450641632080078, "rewards/margins": 5.271419525146484, "rewards/rejected": -1.820778250694275, "step": 3960 }, { "epoch": 1.81, "learning_rate": 1.3191273465246068e-07, "logits/chosen": -0.42982253432273865, "logits/rejected": -0.4628722071647644, "logps/chosen": -110.03547668457031, "logps/rejected": -97.66302490234375, "loss": 0.1322, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.349524736404419, "rewards/margins": 5.647238731384277, "rewards/rejected": -2.2977142333984375, "step": 3970 }, { "epoch": 1.82, "learning_rate": 1.3140537798072044e-07, "logits/chosen": -0.4669429659843445, "logits/rejected": -0.49395719170570374, "logps/chosen": -114.62618255615234, "logps/rejected": -90.14851379394531, "loss": 0.1472, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.9443836212158203, "rewards/margins": 5.539119243621826, "rewards/rejected": -1.594735860824585, "step": 3980 }, { "epoch": 1.82, "learning_rate": 1.308980213089802e-07, "logits/chosen": -0.44839197397232056, "logits/rejected": -0.4741789400577545, "logps/chosen": -116.9367446899414, "logps/rejected": -95.77188873291016, "loss": 0.1628, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.986290693283081, "rewards/margins": 6.274405479431152, "rewards/rejected": -2.2881150245666504, "step": 3990 }, { "epoch": 1.83, "learning_rate": 1.3039066463723998e-07, "logits/chosen": -0.43531838059425354, "logits/rejected": -0.4600156843662262, "logps/chosen": -120.1506118774414, "logps/rejected": -94.55183410644531, "loss": 0.1534, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.6019279956817627, "rewards/margins": 5.652145862579346, "rewards/rejected": -2.0502171516418457, "step": 4000 }, { "epoch": 1.83, "eval_logits/chosen": -0.43281227350234985, "eval_logits/rejected": -0.46104538440704346, "eval_logps/chosen": -116.1445541381836, "eval_logps/rejected": -94.75707244873047, "eval_loss": 0.1572478860616684, "eval_rewards/accuracies": 0.9189944267272949, "eval_rewards/chosen": 3.5075979232788086, "eval_rewards/margins": 5.421296119689941, "eval_rewards/rejected": -1.913697600364685, "eval_runtime": 909.0488, "eval_samples_per_second": 3.148, "eval_steps_per_second": 0.197, "step": 4000 }, { "epoch": 1.83, "learning_rate": 1.2988330796549974e-07, "logits/chosen": -0.42936331033706665, "logits/rejected": -0.46228766441345215, "logps/chosen": -114.8753433227539, "logps/rejected": -94.108154296875, "loss": 0.1411, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.468700885772705, "rewards/margins": 5.894384384155273, "rewards/rejected": -2.4256832599639893, "step": 4010 }, { "epoch": 1.83, "learning_rate": 1.293759512937595e-07, "logits/chosen": -0.43876391649246216, "logits/rejected": -0.46565374732017517, "logps/chosen": -118.2698745727539, "logps/rejected": -98.43639373779297, "loss": 0.1611, "rewards/accuracies": 0.9375, "rewards/chosen": 2.9941275119781494, "rewards/margins": 5.229546546936035, "rewards/rejected": -2.2354190349578857, "step": 4020 }, { "epoch": 1.84, "learning_rate": 1.2886859462201928e-07, "logits/chosen": -0.48172348737716675, "logits/rejected": -0.5027529001235962, "logps/chosen": -116.6221694946289, "logps/rejected": -98.63446807861328, "loss": 0.1486, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6433708667755127, "rewards/margins": 5.4944939613342285, "rewards/rejected": -1.8511232137680054, "step": 4030 }, { "epoch": 1.84, "learning_rate": 1.2836123795027904e-07, "logits/chosen": -0.47252827882766724, "logits/rejected": -0.4866446554660797, "logps/chosen": -112.34371185302734, "logps/rejected": -95.17266845703125, "loss": 0.1148, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.1450066566467285, "rewards/margins": 5.001776695251465, "rewards/rejected": -1.8567705154418945, "step": 4040 }, { "epoch": 1.85, "learning_rate": 1.278538812785388e-07, "logits/chosen": -0.4539973735809326, "logits/rejected": -0.47425252199172974, "logps/chosen": -125.0869140625, "logps/rejected": -99.01017761230469, "loss": 0.1351, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.614443778991699, "rewards/margins": 6.311370372772217, "rewards/rejected": -1.6969267129898071, "step": 4050 }, { "epoch": 1.85, "learning_rate": 1.2734652460679858e-07, "logits/chosen": -0.41173696517944336, "logits/rejected": -0.4408145546913147, "logps/chosen": -124.49433898925781, "logps/rejected": -98.87799835205078, "loss": 0.138, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.2565371990203857, "rewards/margins": 5.311087608337402, "rewards/rejected": -2.0545499324798584, "step": 4060 }, { "epoch": 1.86, "learning_rate": 1.2683916793505834e-07, "logits/chosen": -0.46297794580459595, "logits/rejected": -0.4824606478214264, "logps/chosen": -112.1919174194336, "logps/rejected": -98.0719223022461, "loss": 0.1493, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.8646275997161865, "rewards/margins": 5.0431928634643555, "rewards/rejected": -2.178565502166748, "step": 4070 }, { "epoch": 1.86, "learning_rate": 1.263318112633181e-07, "logits/chosen": -0.4470444619655609, "logits/rejected": -0.47431832551956177, "logps/chosen": -121.6833724975586, "logps/rejected": -94.71736907958984, "loss": 0.1406, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.3270621299743652, "rewards/margins": 5.1658525466918945, "rewards/rejected": -1.8387901782989502, "step": 4080 }, { "epoch": 1.87, "learning_rate": 1.2582445459157788e-07, "logits/chosen": -0.4353705048561096, "logits/rejected": -0.4633060395717621, "logps/chosen": -117.09333801269531, "logps/rejected": -96.31623840332031, "loss": 0.1741, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.7191150188446045, "rewards/margins": 5.496578693389893, "rewards/rejected": -1.7774631977081299, "step": 4090 }, { "epoch": 1.87, "learning_rate": 1.2531709791983764e-07, "logits/chosen": -0.4220728874206543, "logits/rejected": -0.4554152488708496, "logps/chosen": -123.45231628417969, "logps/rejected": -99.41911315917969, "loss": 0.1327, "rewards/accuracies": 0.9375, "rewards/chosen": 4.361505508422852, "rewards/margins": 6.2839765548706055, "rewards/rejected": -1.9224706888198853, "step": 4100 }, { "epoch": 1.87, "eval_logits/chosen": -0.4152924418449402, "eval_logits/rejected": -0.4403891861438751, "eval_logps/chosen": -116.01750946044922, "eval_logps/rejected": -94.75827026367188, "eval_loss": 0.16070948541164398, "eval_rewards/accuracies": 0.9217877388000488, "eval_rewards/chosen": 3.5711212158203125, "eval_rewards/margins": 5.485420227050781, "eval_rewards/rejected": -1.9142990112304688, "eval_runtime": 878.437, "eval_samples_per_second": 3.258, "eval_steps_per_second": 0.204, "step": 4100 }, { "epoch": 1.88, "learning_rate": 1.248097412480974e-07, "logits/chosen": -0.4266030788421631, "logits/rejected": -0.4478863775730133, "logps/chosen": -130.07701110839844, "logps/rejected": -99.33512878417969, "loss": 0.1339, "rewards/accuracies": 0.9375, "rewards/chosen": 3.12672758102417, "rewards/margins": 5.217502117156982, "rewards/rejected": -2.0907750129699707, "step": 4110 }, { "epoch": 1.88, "learning_rate": 1.2430238457635718e-07, "logits/chosen": -0.4379239082336426, "logits/rejected": -0.4631032347679138, "logps/chosen": -114.28575134277344, "logps/rejected": -98.11552429199219, "loss": 0.151, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.4146289825439453, "rewards/margins": 5.491271018981934, "rewards/rejected": -2.0766425132751465, "step": 4120 }, { "epoch": 1.89, "learning_rate": 1.2379502790461694e-07, "logits/chosen": -0.4479547441005707, "logits/rejected": -0.48350486159324646, "logps/chosen": -112.89759826660156, "logps/rejected": -89.53084564208984, "loss": 0.1645, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.695988893508911, "rewards/margins": 5.8413238525390625, "rewards/rejected": -2.1453351974487305, "step": 4130 }, { "epoch": 1.89, "learning_rate": 1.232876712328767e-07, "logits/chosen": -0.42592209577560425, "logits/rejected": -0.46377819776535034, "logps/chosen": -118.2711181640625, "logps/rejected": -91.99664306640625, "loss": 0.1556, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.791079044342041, "rewards/margins": 5.7715983390808105, "rewards/rejected": -1.9805190563201904, "step": 4140 }, { "epoch": 1.89, "learning_rate": 1.2278031456113648e-07, "logits/chosen": -0.42525094747543335, "logits/rejected": -0.4680793881416321, "logps/chosen": -131.39756774902344, "logps/rejected": -96.14659881591797, "loss": 0.1193, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.135112762451172, "rewards/margins": 6.375372886657715, "rewards/rejected": -2.2402608394622803, "step": 4150 }, { "epoch": 1.9, "learning_rate": 1.2227295788939624e-07, "logits/chosen": -0.4264507293701172, "logits/rejected": -0.4579479694366455, "logps/chosen": -124.99159240722656, "logps/rejected": -97.59441375732422, "loss": 0.1456, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.248692035675049, "rewards/margins": 5.797226428985596, "rewards/rejected": -2.548535108566284, "step": 4160 }, { "epoch": 1.9, "learning_rate": 1.21765601217656e-07, "logits/chosen": -0.44981488585472107, "logits/rejected": -0.4686676561832428, "logps/chosen": -113.22137451171875, "logps/rejected": -94.71104431152344, "loss": 0.1379, "rewards/accuracies": 0.9375, "rewards/chosen": 3.8100783824920654, "rewards/margins": 5.2517571449279785, "rewards/rejected": -1.4416786432266235, "step": 4170 }, { "epoch": 1.91, "learning_rate": 1.2125824454591578e-07, "logits/chosen": -0.44809216260910034, "logits/rejected": -0.4698753356933594, "logps/chosen": -121.99101257324219, "logps/rejected": -99.77064514160156, "loss": 0.1487, "rewards/accuracies": 1.0, "rewards/chosen": 4.432638168334961, "rewards/margins": 6.586982727050781, "rewards/rejected": -2.1543452739715576, "step": 4180 }, { "epoch": 1.91, "learning_rate": 1.2075088787417554e-07, "logits/chosen": -0.4335380494594574, "logits/rejected": -0.4673110544681549, "logps/chosen": -117.7331771850586, "logps/rejected": -91.36144256591797, "loss": 0.1445, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.4432575702667236, "rewards/margins": 5.076163291931152, "rewards/rejected": -1.6329059600830078, "step": 4190 }, { "epoch": 1.92, "learning_rate": 1.202435312024353e-07, "logits/chosen": -0.441293329000473, "logits/rejected": -0.474077045917511, "logps/chosen": -120.2531509399414, "logps/rejected": -93.95128631591797, "loss": 0.162, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6226940155029297, "rewards/margins": 4.789700508117676, "rewards/rejected": -1.1670061349868774, "step": 4200 }, { "epoch": 1.92, "eval_logits/chosen": -0.4373014569282532, "eval_logits/rejected": -0.46410053968429565, "eval_logps/chosen": -116.1893081665039, "eval_logps/rejected": -94.95684814453125, "eval_loss": 0.15646876394748688, "eval_rewards/accuracies": 0.9329608678817749, "eval_rewards/chosen": 3.485214948654175, "eval_rewards/margins": 5.498796463012695, "eval_rewards/rejected": -2.0135817527770996, "eval_runtime": 905.3895, "eval_samples_per_second": 3.161, "eval_steps_per_second": 0.198, "step": 4200 }, { "epoch": 1.92, "learning_rate": 1.1973617453069508e-07, "logits/chosen": -0.41942963004112244, "logits/rejected": -0.45433226227760315, "logps/chosen": -119.24137878417969, "logps/rejected": -93.80500793457031, "loss": 0.1342, "rewards/accuracies": 0.9375, "rewards/chosen": 4.265337944030762, "rewards/margins": 5.979970455169678, "rewards/rejected": -1.714632272720337, "step": 4210 }, { "epoch": 1.93, "learning_rate": 1.1922881785895484e-07, "logits/chosen": -0.476146936416626, "logits/rejected": -0.493946373462677, "logps/chosen": -118.25067138671875, "logps/rejected": -99.85133361816406, "loss": 0.1604, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.482466220855713, "rewards/margins": 5.256065368652344, "rewards/rejected": -1.7735998630523682, "step": 4220 }, { "epoch": 1.93, "learning_rate": 1.187214611872146e-07, "logits/chosen": -0.4835086762905121, "logits/rejected": -0.5142003297805786, "logps/chosen": -119.5131607055664, "logps/rejected": -100.95896911621094, "loss": 0.1474, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.157654285430908, "rewards/margins": 5.392757892608643, "rewards/rejected": -2.2351036071777344, "step": 4230 }, { "epoch": 1.94, "learning_rate": 1.1821410451547436e-07, "logits/chosen": -0.40879902243614197, "logits/rejected": -0.428755521774292, "logps/chosen": -113.699462890625, "logps/rejected": -99.03292083740234, "loss": 0.1403, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.027031421661377, "rewards/margins": 5.47978401184082, "rewards/rejected": -1.4527524709701538, "step": 4240 }, { "epoch": 1.94, "learning_rate": 1.1770674784373413e-07, "logits/chosen": -0.46195000410079956, "logits/rejected": -0.47916096448898315, "logps/chosen": -110.07051086425781, "logps/rejected": -96.48656463623047, "loss": 0.1403, "rewards/accuracies": 0.875, "rewards/chosen": 2.9599082469940186, "rewards/margins": 4.992161750793457, "rewards/rejected": -2.0322537422180176, "step": 4250 }, { "epoch": 1.94, "learning_rate": 1.171993911719939e-07, "logits/chosen": -0.4431841969490051, "logits/rejected": -0.46271103620529175, "logps/chosen": -113.02568054199219, "logps/rejected": -98.78375244140625, "loss": 0.1526, "rewards/accuracies": 0.9375, "rewards/chosen": 3.211625576019287, "rewards/margins": 5.480974197387695, "rewards/rejected": -2.269348621368408, "step": 4260 }, { "epoch": 1.95, "learning_rate": 1.1669203450025366e-07, "logits/chosen": -0.4586679935455322, "logits/rejected": -0.4842369556427002, "logps/chosen": -113.3587646484375, "logps/rejected": -94.2232437133789, "loss": 0.18, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.053977012634277, "rewards/margins": 5.603518009185791, "rewards/rejected": -1.5495409965515137, "step": 4270 }, { "epoch": 1.95, "learning_rate": 1.1618467782851343e-07, "logits/chosen": -0.42036324739456177, "logits/rejected": -0.4484184682369232, "logps/chosen": -118.8740463256836, "logps/rejected": -97.7596435546875, "loss": 0.1245, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.5045456886291504, "rewards/margins": 5.583371162414551, "rewards/rejected": -2.0788257122039795, "step": 4280 }, { "epoch": 1.96, "learning_rate": 1.156773211567732e-07, "logits/chosen": -0.45418959856033325, "logits/rejected": -0.48503002524375916, "logps/chosen": -115.06591796875, "logps/rejected": -91.06563568115234, "loss": 0.1351, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.340029239654541, "rewards/margins": 5.9402241706848145, "rewards/rejected": -2.6001949310302734, "step": 4290 }, { "epoch": 1.96, "learning_rate": 1.1516996448503296e-07, "logits/chosen": -0.47121506929397583, "logits/rejected": -0.4892233908176422, "logps/chosen": -119.47066497802734, "logps/rejected": -100.1035385131836, "loss": 0.1471, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.297662258148193, "rewards/margins": 6.468558311462402, "rewards/rejected": -2.170895576477051, "step": 4300 }, { "epoch": 1.96, "eval_logits/chosen": -0.43375614285469055, "eval_logits/rejected": -0.46272942423820496, "eval_logps/chosen": -116.03189086914062, "eval_logps/rejected": -94.88296508789062, "eval_loss": 0.15242531895637512, "eval_rewards/accuracies": 0.924580991268158, "eval_rewards/chosen": 3.563926935195923, "eval_rewards/margins": 5.540570259094238, "eval_rewards/rejected": -1.976643681526184, "eval_runtime": 903.7216, "eval_samples_per_second": 3.167, "eval_steps_per_second": 0.198, "step": 4300 }, { "epoch": 1.97, "learning_rate": 1.1466260781329273e-07, "logits/chosen": -0.4415101110935211, "logits/rejected": -0.4698669910430908, "logps/chosen": -116.846923828125, "logps/rejected": -97.8760986328125, "loss": 0.145, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.341931104660034, "rewards/margins": 5.32661771774292, "rewards/rejected": -1.9846864938735962, "step": 4310 }, { "epoch": 1.97, "learning_rate": 1.141552511415525e-07, "logits/chosen": -0.4346837103366852, "logits/rejected": -0.46257075667381287, "logps/chosen": -123.95588684082031, "logps/rejected": -94.39106750488281, "loss": 0.1233, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.201815605163574, "rewards/margins": 6.11990213394165, "rewards/rejected": -1.9180864095687866, "step": 4320 }, { "epoch": 1.98, "learning_rate": 1.1364789446981226e-07, "logits/chosen": -0.4392518997192383, "logits/rejected": -0.46720820665359497, "logps/chosen": -122.53446960449219, "logps/rejected": -101.41107177734375, "loss": 0.1364, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.7970690727233887, "rewards/margins": 6.052388668060303, "rewards/rejected": -2.2553200721740723, "step": 4330 }, { "epoch": 1.98, "learning_rate": 1.1314053779807203e-07, "logits/chosen": -0.4463014602661133, "logits/rejected": -0.4627785086631775, "logps/chosen": -108.80528259277344, "logps/rejected": -92.49729919433594, "loss": 0.1301, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.5569469928741455, "rewards/margins": 5.680870056152344, "rewards/rejected": -2.1239230632781982, "step": 4340 }, { "epoch": 1.99, "learning_rate": 1.126331811263318e-07, "logits/chosen": -0.4043458104133606, "logits/rejected": -0.4282744526863098, "logps/chosen": -125.00662994384766, "logps/rejected": -96.8555908203125, "loss": 0.1329, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.774332046508789, "rewards/margins": 5.595047473907471, "rewards/rejected": -1.8207143545150757, "step": 4350 }, { "epoch": 1.99, "learning_rate": 1.1212582445459156e-07, "logits/chosen": -0.424589067697525, "logits/rejected": -0.4528760313987732, "logps/chosen": -123.8675537109375, "logps/rejected": -96.47154998779297, "loss": 0.1358, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.940141201019287, "rewards/margins": 5.887082576751709, "rewards/rejected": -1.9469407796859741, "step": 4360 }, { "epoch": 1.99, "learning_rate": 1.1161846778285133e-07, "logits/chosen": -0.4298287332057953, "logits/rejected": -0.44278186559677124, "logps/chosen": -111.9197998046875, "logps/rejected": -96.86292266845703, "loss": 0.1402, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.8632125854492188, "rewards/margins": 5.875947952270508, "rewards/rejected": -2.01273512840271, "step": 4370 }, { "epoch": 2.0, "learning_rate": 1.111111111111111e-07, "logits/chosen": -0.44265303015708923, "logits/rejected": -0.48326772451400757, "logps/chosen": -120.85821533203125, "logps/rejected": -97.12603759765625, "loss": 0.1353, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.985802173614502, "rewards/margins": 6.470803260803223, "rewards/rejected": -2.4850013256073, "step": 4380 }, { "epoch": 2.0, "learning_rate": 1.1060375443937086e-07, "logits/chosen": -0.4284425377845764, "logits/rejected": -0.4572037160396576, "logps/chosen": -119.15400695800781, "logps/rejected": -92.87478637695312, "loss": 0.1248, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.157296180725098, "rewards/margins": 6.6621294021606445, "rewards/rejected": -2.5048327445983887, "step": 4390 }, { "epoch": 2.01, "learning_rate": 1.1009639776763063e-07, "logits/chosen": -0.4259072244167328, "logits/rejected": -0.4534842073917389, "logps/chosen": -116.02922058105469, "logps/rejected": -98.30314636230469, "loss": 0.1333, "rewards/accuracies": 0.9375, "rewards/chosen": 2.9443373680114746, "rewards/margins": 5.107085227966309, "rewards/rejected": -2.162747621536255, "step": 4400 }, { "epoch": 2.01, "eval_logits/chosen": -0.43279382586479187, "eval_logits/rejected": -0.4608496427536011, "eval_logps/chosen": -115.92514038085938, "eval_logps/rejected": -94.87165069580078, "eval_loss": 0.14179793000221252, "eval_rewards/accuracies": 0.916201114654541, "eval_rewards/chosen": 3.6173088550567627, "eval_rewards/margins": 5.588301658630371, "eval_rewards/rejected": -1.9709923267364502, "eval_runtime": 873.7722, "eval_samples_per_second": 3.275, "eval_steps_per_second": 0.205, "step": 4400 }, { "epoch": 2.01, "learning_rate": 1.095890410958904e-07, "logits/chosen": -0.4704256057739258, "logits/rejected": -0.48912009596824646, "logps/chosen": -116.41622161865234, "logps/rejected": -102.0516586303711, "loss": 0.1023, "rewards/accuracies": 0.9375, "rewards/chosen": 3.746499538421631, "rewards/margins": 6.171090126037598, "rewards/rejected": -2.4245896339416504, "step": 4410 }, { "epoch": 2.02, "learning_rate": 1.0908168442415016e-07, "logits/chosen": -0.43700629472732544, "logits/rejected": -0.4626290202140808, "logps/chosen": -115.08659362792969, "logps/rejected": -94.25322723388672, "loss": 0.1331, "rewards/accuracies": 0.9375, "rewards/chosen": 4.027894020080566, "rewards/margins": 6.144411563873291, "rewards/rejected": -2.1165177822113037, "step": 4420 }, { "epoch": 2.02, "learning_rate": 1.0857432775240993e-07, "logits/chosen": -0.42565712332725525, "logits/rejected": -0.44756293296813965, "logps/chosen": -115.29083251953125, "logps/rejected": -98.44151306152344, "loss": 0.1637, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.85151743888855, "rewards/margins": 5.8956618309021, "rewards/rejected": -2.044144868850708, "step": 4430 }, { "epoch": 2.03, "learning_rate": 1.080669710806697e-07, "logits/chosen": -0.4473974108695984, "logits/rejected": -0.47457534074783325, "logps/chosen": -117.94417572021484, "logps/rejected": -96.59383392333984, "loss": 0.1394, "rewards/accuracies": 0.9375, "rewards/chosen": 4.129402160644531, "rewards/margins": 6.365184307098389, "rewards/rejected": -2.2357823848724365, "step": 4440 }, { "epoch": 2.03, "learning_rate": 1.0755961440892946e-07, "logits/chosen": -0.44540318846702576, "logits/rejected": -0.47266215085983276, "logps/chosen": -112.82417297363281, "logps/rejected": -96.38731384277344, "loss": 0.126, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.4327216148376465, "rewards/margins": 5.731344223022461, "rewards/rejected": -2.2986226081848145, "step": 4450 }, { "epoch": 2.04, "learning_rate": 1.0705225773718923e-07, "logits/chosen": -0.44705715775489807, "logits/rejected": -0.4615131914615631, "logps/chosen": -116.78440856933594, "logps/rejected": -96.42256927490234, "loss": 0.1444, "rewards/accuracies": 0.875, "rewards/chosen": 3.238809585571289, "rewards/margins": 4.722464561462402, "rewards/rejected": -1.4836552143096924, "step": 4460 }, { "epoch": 2.04, "learning_rate": 1.06544901065449e-07, "logits/chosen": -0.4247209429740906, "logits/rejected": -0.4695788025856018, "logps/chosen": -123.95927429199219, "logps/rejected": -95.46394348144531, "loss": 0.1399, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.9008700847625732, "rewards/margins": 5.591902732849121, "rewards/rejected": -1.6910324096679688, "step": 4470 }, { "epoch": 2.04, "learning_rate": 1.0603754439370876e-07, "logits/chosen": -0.41793909668922424, "logits/rejected": -0.449241578578949, "logps/chosen": -121.62298583984375, "logps/rejected": -96.32791900634766, "loss": 0.1114, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.407382011413574, "rewards/margins": 6.299051761627197, "rewards/rejected": -1.891669511795044, "step": 4480 }, { "epoch": 2.05, "learning_rate": 1.0553018772196853e-07, "logits/chosen": -0.41730570793151855, "logits/rejected": -0.45211511850357056, "logps/chosen": -116.56009674072266, "logps/rejected": -94.37024688720703, "loss": 0.1273, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.634586811065674, "rewards/margins": 5.516324996948242, "rewards/rejected": -1.881738305091858, "step": 4490 }, { "epoch": 2.05, "learning_rate": 1.050228310502283e-07, "logits/chosen": -0.43000978231430054, "logits/rejected": -0.4528748095035553, "logps/chosen": -112.00526428222656, "logps/rejected": -96.77906799316406, "loss": 0.13, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7136778831481934, "rewards/margins": 5.948361873626709, "rewards/rejected": -2.2346842288970947, "step": 4500 }, { "epoch": 2.05, "eval_logits/chosen": -0.43186989426612854, "eval_logits/rejected": -0.4604170024394989, "eval_logps/chosen": -115.90472412109375, "eval_logps/rejected": -94.9027328491211, "eval_loss": 0.14848706126213074, "eval_rewards/accuracies": 0.9357541799545288, "eval_rewards/chosen": 3.627511978149414, "eval_rewards/margins": 5.61404275894165, "eval_rewards/rejected": -1.986531138420105, "eval_runtime": 898.6793, "eval_samples_per_second": 3.185, "eval_steps_per_second": 0.199, "step": 4500 }, { "epoch": 2.06, "learning_rate": 1.0451547437848806e-07, "logits/chosen": -0.41770705580711365, "logits/rejected": -0.4475606381893158, "logps/chosen": -126.3756332397461, "logps/rejected": -96.57072448730469, "loss": 0.1226, "rewards/accuracies": 0.9375, "rewards/chosen": 3.7878124713897705, "rewards/margins": 6.038453102111816, "rewards/rejected": -2.250640392303467, "step": 4510 }, { "epoch": 2.06, "learning_rate": 1.0400811770674783e-07, "logits/chosen": -0.45794838666915894, "logits/rejected": -0.4789225161075592, "logps/chosen": -108.92681884765625, "logps/rejected": -93.19290161132812, "loss": 0.1066, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.9215340614318848, "rewards/margins": 6.354188442230225, "rewards/rejected": -2.432654857635498, "step": 4520 }, { "epoch": 2.07, "learning_rate": 1.035007610350076e-07, "logits/chosen": -0.43893688917160034, "logits/rejected": -0.46274012327194214, "logps/chosen": -118.29610443115234, "logps/rejected": -96.06135559082031, "loss": 0.1437, "rewards/accuracies": 0.9375, "rewards/chosen": 3.5714943408966064, "rewards/margins": 5.405307769775391, "rewards/rejected": -1.8338134288787842, "step": 4530 }, { "epoch": 2.07, "learning_rate": 1.0299340436326736e-07, "logits/chosen": -0.4240226745605469, "logits/rejected": -0.4497091770172119, "logps/chosen": -113.27973937988281, "logps/rejected": -93.89836883544922, "loss": 0.1385, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1435554027557373, "rewards/margins": 4.301648139953613, "rewards/rejected": -1.1580922603607178, "step": 4540 }, { "epoch": 2.08, "learning_rate": 1.0248604769152713e-07, "logits/chosen": -0.41185903549194336, "logits/rejected": -0.436403751373291, "logps/chosen": -111.78126525878906, "logps/rejected": -97.95333099365234, "loss": 0.1297, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.2165274620056152, "rewards/margins": 5.033143520355225, "rewards/rejected": -1.8166160583496094, "step": 4550 }, { "epoch": 2.08, "learning_rate": 1.019786910197869e-07, "logits/chosen": -0.44074559211730957, "logits/rejected": -0.4656568467617035, "logps/chosen": -114.5299301147461, "logps/rejected": -94.2115478515625, "loss": 0.1205, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.349818706512451, "rewards/margins": 5.186150550842285, "rewards/rejected": -1.8363316059112549, "step": 4560 }, { "epoch": 2.09, "learning_rate": 1.0147133434804666e-07, "logits/chosen": -0.43615585565567017, "logits/rejected": -0.4757886826992035, "logps/chosen": -124.17915344238281, "logps/rejected": -93.8154525756836, "loss": 0.1529, "rewards/accuracies": 0.9375, "rewards/chosen": 4.06646203994751, "rewards/margins": 5.818515300750732, "rewards/rejected": -1.7520534992218018, "step": 4570 }, { "epoch": 2.09, "learning_rate": 1.0096397767630643e-07, "logits/chosen": -0.4336267411708832, "logits/rejected": -0.45681333541870117, "logps/chosen": -122.39564514160156, "logps/rejected": -96.02613830566406, "loss": 0.1194, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.0391130447387695, "rewards/margins": 6.454281806945801, "rewards/rejected": -2.415168046951294, "step": 4580 }, { "epoch": 2.1, "learning_rate": 1.004566210045662e-07, "logits/chosen": -0.43071794509887695, "logits/rejected": -0.44464653730392456, "logps/chosen": -116.609619140625, "logps/rejected": -94.95137023925781, "loss": 0.1326, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.4324917793273926, "rewards/margins": 4.9647088050842285, "rewards/rejected": -1.5322175025939941, "step": 4590 }, { "epoch": 2.1, "learning_rate": 9.994926433282596e-08, "logits/chosen": -0.4383629858493805, "logits/rejected": -0.46357983350753784, "logps/chosen": -117.3117446899414, "logps/rejected": -98.81439208984375, "loss": 0.1311, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.8937549591064453, "rewards/margins": 5.831150531768799, "rewards/rejected": -1.937395691871643, "step": 4600 }, { "epoch": 2.1, "eval_logits/chosen": -0.4123355746269226, "eval_logits/rejected": -0.4404549300670624, "eval_logps/chosen": -116.21281433105469, "eval_logps/rejected": -95.16839599609375, "eval_loss": 0.15031887590885162, "eval_rewards/accuracies": 0.9134078025817871, "eval_rewards/chosen": 3.4734678268432617, "eval_rewards/margins": 5.592827796936035, "eval_rewards/rejected": -2.1193599700927734, "eval_runtime": 907.3168, "eval_samples_per_second": 3.154, "eval_steps_per_second": 0.197, "step": 4600 }, { "epoch": 2.1, "learning_rate": 9.944190766108573e-08, "logits/chosen": -0.4133322834968567, "logits/rejected": -0.4418833255767822, "logps/chosen": -119.86898040771484, "logps/rejected": -93.79594421386719, "loss": 0.1389, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.694523334503174, "rewards/margins": 5.3314409255981445, "rewards/rejected": -1.6369177103042603, "step": 4610 }, { "epoch": 2.11, "learning_rate": 9.89345509893455e-08, "logits/chosen": -0.4273843765258789, "logits/rejected": -0.450580894947052, "logps/chosen": -112.46533203125, "logps/rejected": -97.37897491455078, "loss": 0.1475, "rewards/accuracies": 0.9375, "rewards/chosen": 3.3234305381774902, "rewards/margins": 5.877073764801025, "rewards/rejected": -2.553642988204956, "step": 4620 }, { "epoch": 2.11, "learning_rate": 9.842719431760526e-08, "logits/chosen": -0.45100849866867065, "logits/rejected": -0.4823623597621918, "logps/chosen": -121.07530212402344, "logps/rejected": -96.33805084228516, "loss": 0.1407, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.24837589263916, "rewards/margins": 6.4495439529418945, "rewards/rejected": -2.201167583465576, "step": 4630 }, { "epoch": 2.12, "learning_rate": 9.791983764586503e-08, "logits/chosen": -0.4720466732978821, "logits/rejected": -0.5029277801513672, "logps/chosen": -115.62332916259766, "logps/rejected": -89.04841613769531, "loss": 0.1305, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.728707790374756, "rewards/margins": 5.528870105743408, "rewards/rejected": -1.8001619577407837, "step": 4640 }, { "epoch": 2.12, "learning_rate": 9.74124809741248e-08, "logits/chosen": -0.4487905502319336, "logits/rejected": -0.4747520089149475, "logps/chosen": -115.44490051269531, "logps/rejected": -97.00244903564453, "loss": 0.1243, "rewards/accuracies": 0.9375, "rewards/chosen": 3.714205503463745, "rewards/margins": 5.9575676918029785, "rewards/rejected": -2.2433624267578125, "step": 4650 }, { "epoch": 2.13, "learning_rate": 9.690512430238456e-08, "logits/chosen": -0.4337824285030365, "logits/rejected": -0.4505422115325928, "logps/chosen": -118.36312103271484, "logps/rejected": -93.47285461425781, "loss": 0.1489, "rewards/accuracies": 0.9375, "rewards/chosen": 3.8410346508026123, "rewards/margins": 5.889695167541504, "rewards/rejected": -2.0486607551574707, "step": 4660 }, { "epoch": 2.13, "learning_rate": 9.639776763064433e-08, "logits/chosen": -0.4305817484855652, "logits/rejected": -0.4584842622280121, "logps/chosen": -118.65266418457031, "logps/rejected": -98.68658447265625, "loss": 0.1346, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.668896436691284, "rewards/margins": 5.532942771911621, "rewards/rejected": -1.8640460968017578, "step": 4670 }, { "epoch": 2.14, "learning_rate": 9.58904109589041e-08, "logits/chosen": -0.42450302839279175, "logits/rejected": -0.45716771483421326, "logps/chosen": -120.25984191894531, "logps/rejected": -94.94532775878906, "loss": 0.1458, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.8007895946502686, "rewards/margins": 6.051023483276367, "rewards/rejected": -2.2502334117889404, "step": 4680 }, { "epoch": 2.14, "learning_rate": 9.538305428716386e-08, "logits/chosen": -0.4451879560947418, "logits/rejected": -0.4725143015384674, "logps/chosen": -121.0210189819336, "logps/rejected": -94.34584045410156, "loss": 0.1471, "rewards/accuracies": 0.9375, "rewards/chosen": 3.8500618934631348, "rewards/margins": 5.363143444061279, "rewards/rejected": -1.513081669807434, "step": 4690 }, { "epoch": 2.15, "learning_rate": 9.487569761542363e-08, "logits/chosen": -0.4411678910255432, "logits/rejected": -0.4673629403114319, "logps/chosen": -120.62171936035156, "logps/rejected": -93.69928741455078, "loss": 0.1329, "rewards/accuracies": 0.9375, "rewards/chosen": 4.366326808929443, "rewards/margins": 6.472535133361816, "rewards/rejected": -2.1062076091766357, "step": 4700 }, { "epoch": 2.15, "eval_logits/chosen": -0.4228588938713074, "eval_logits/rejected": -0.45189815759658813, "eval_logps/chosen": -116.00117492675781, "eval_logps/rejected": -95.14153289794922, "eval_loss": 0.14311543107032776, "eval_rewards/accuracies": 0.9217877388000488, "eval_rewards/chosen": 3.579282522201538, "eval_rewards/margins": 5.685213565826416, "eval_rewards/rejected": -2.105930805206299, "eval_runtime": 884.9158, "eval_samples_per_second": 3.234, "eval_steps_per_second": 0.202, "step": 4700 }, { "epoch": 2.15, "learning_rate": 9.43683409436834e-08, "logits/chosen": -0.4521303176879883, "logits/rejected": -0.4794772267341614, "logps/chosen": -116.8443603515625, "logps/rejected": -101.2288589477539, "loss": 0.1377, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.3828024864196777, "rewards/margins": 5.463811874389648, "rewards/rejected": -2.0810093879699707, "step": 4710 }, { "epoch": 2.15, "learning_rate": 9.386098427194316e-08, "logits/chosen": -0.4442387521266937, "logits/rejected": -0.47409456968307495, "logps/chosen": -119.01100158691406, "logps/rejected": -94.46539306640625, "loss": 0.1313, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3471286296844482, "rewards/margins": 5.751844882965088, "rewards/rejected": -2.4047162532806396, "step": 4720 }, { "epoch": 2.16, "learning_rate": 9.335362760020293e-08, "logits/chosen": -0.43975549936294556, "logits/rejected": -0.46223530173301697, "logps/chosen": -111.1578140258789, "logps/rejected": -92.90426635742188, "loss": 0.1386, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.7703962326049805, "rewards/margins": 5.044751167297363, "rewards/rejected": -2.274355411529541, "step": 4730 }, { "epoch": 2.16, "learning_rate": 9.28462709284627e-08, "logits/chosen": -0.44939175248146057, "logits/rejected": -0.48308318853378296, "logps/chosen": -118.98689270019531, "logps/rejected": -95.54768371582031, "loss": 0.1281, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.045827388763428, "rewards/margins": 6.658857822418213, "rewards/rejected": -2.6130292415618896, "step": 4740 }, { "epoch": 2.17, "learning_rate": 9.233891425672246e-08, "logits/chosen": -0.42080554366111755, "logits/rejected": -0.45268353819847107, "logps/chosen": -116.78912353515625, "logps/rejected": -94.59027862548828, "loss": 0.1419, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 4.422946929931641, "rewards/margins": 6.427711486816406, "rewards/rejected": -2.0047640800476074, "step": 4750 }, { "epoch": 2.17, "learning_rate": 9.183155758498223e-08, "logits/chosen": -0.4549104571342468, "logits/rejected": -0.4835089147090912, "logps/chosen": -120.5397720336914, "logps/rejected": -99.68217468261719, "loss": 0.1153, "rewards/accuracies": 0.9375, "rewards/chosen": 3.186094045639038, "rewards/margins": 5.969161033630371, "rewards/rejected": -2.783067226409912, "step": 4760 }, { "epoch": 2.18, "learning_rate": 9.1324200913242e-08, "logits/chosen": -0.42458558082580566, "logits/rejected": -0.45287784934043884, "logps/chosen": -121.1539535522461, "logps/rejected": -100.77606201171875, "loss": 0.1347, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.6020476818084717, "rewards/margins": 5.12890100479126, "rewards/rejected": -1.526853322982788, "step": 4770 }, { "epoch": 2.18, "learning_rate": 9.081684424150176e-08, "logits/chosen": -0.42954689264297485, "logits/rejected": -0.46278315782546997, "logps/chosen": -122.50996398925781, "logps/rejected": -98.3419418334961, "loss": 0.118, "rewards/accuracies": 0.9375, "rewards/chosen": 3.9317126274108887, "rewards/margins": 5.562252521514893, "rewards/rejected": -1.6305391788482666, "step": 4780 }, { "epoch": 2.19, "learning_rate": 9.030948756976153e-08, "logits/chosen": -0.43418097496032715, "logits/rejected": -0.46971768140792847, "logps/chosen": -122.99954986572266, "logps/rejected": -94.2531509399414, "loss": 0.1419, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.114560127258301, "rewards/margins": 6.730491638183594, "rewards/rejected": -2.6159310340881348, "step": 4790 }, { "epoch": 2.19, "learning_rate": 8.98021308980213e-08, "logits/chosen": -0.42841896414756775, "logits/rejected": -0.46112218499183655, "logps/chosen": -119.31453704833984, "logps/rejected": -99.18404388427734, "loss": 0.1346, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.7384109497070312, "rewards/margins": 6.064143657684326, "rewards/rejected": -2.325732946395874, "step": 4800 }, { "epoch": 2.19, "eval_logits/chosen": -0.43315914273262024, "eval_logits/rejected": -0.46388283371925354, "eval_logps/chosen": -115.94792175292969, "eval_logps/rejected": -95.05813598632812, "eval_loss": 0.1493712216615677, "eval_rewards/accuracies": 0.9273743033409119, "eval_rewards/chosen": 3.6059112548828125, "eval_rewards/margins": 5.670140266418457, "eval_rewards/rejected": -2.0642290115356445, "eval_runtime": 905.7607, "eval_samples_per_second": 3.16, "eval_steps_per_second": 0.198, "step": 4800 }, { "epoch": 2.2, "learning_rate": 8.929477422628106e-08, "logits/chosen": -0.42701220512390137, "logits/rejected": -0.46004992723464966, "logps/chosen": -125.9152603149414, "logps/rejected": -98.69007873535156, "loss": 0.1357, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 4.1814656257629395, "rewards/margins": 6.167128086090088, "rewards/rejected": -1.9856624603271484, "step": 4810 }, { "epoch": 2.2, "learning_rate": 8.878741755454083e-08, "logits/chosen": -0.4257637858390808, "logits/rejected": -0.45527562499046326, "logps/chosen": -124.95893859863281, "logps/rejected": -99.27806091308594, "loss": 0.1431, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.052617311477661, "rewards/margins": 5.143216609954834, "rewards/rejected": -2.090599536895752, "step": 4820 }, { "epoch": 2.2, "learning_rate": 8.82800608828006e-08, "logits/chosen": -0.4246063232421875, "logits/rejected": -0.4535275399684906, "logps/chosen": -120.38226318359375, "logps/rejected": -95.41462707519531, "loss": 0.1563, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.6748244762420654, "rewards/margins": 5.4428935050964355, "rewards/rejected": -1.7680695056915283, "step": 4830 }, { "epoch": 2.21, "learning_rate": 8.777270421106036e-08, "logits/chosen": -0.42737627029418945, "logits/rejected": -0.45712152123451233, "logps/chosen": -117.77052307128906, "logps/rejected": -95.78623962402344, "loss": 0.1244, "rewards/accuracies": 0.9375, "rewards/chosen": 3.292767286300659, "rewards/margins": 5.416769504547119, "rewards/rejected": -2.12400221824646, "step": 4840 }, { "epoch": 2.21, "learning_rate": 8.726534753932013e-08, "logits/chosen": -0.44735169410705566, "logits/rejected": -0.4684707224369049, "logps/chosen": -114.5677261352539, "logps/rejected": -95.72280883789062, "loss": 0.1371, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.0538907051086426, "rewards/margins": 5.442469596862793, "rewards/rejected": -2.388578414916992, "step": 4850 }, { "epoch": 2.22, "learning_rate": 8.67579908675799e-08, "logits/chosen": -0.44202107191085815, "logits/rejected": -0.46817198395729065, "logps/chosen": -112.7282485961914, "logps/rejected": -96.51592254638672, "loss": 0.121, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.4495131969451904, "rewards/margins": 5.658398628234863, "rewards/rejected": -2.208885669708252, "step": 4860 }, { "epoch": 2.22, "learning_rate": 8.625063419583966e-08, "logits/chosen": -0.4402497410774231, "logits/rejected": -0.4670419692993164, "logps/chosen": -113.29608154296875, "logps/rejected": -94.75654602050781, "loss": 0.1283, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.559720277786255, "rewards/margins": 5.763693332672119, "rewards/rejected": -2.203972816467285, "step": 4870 }, { "epoch": 2.23, "learning_rate": 8.574327752409943e-08, "logits/chosen": -0.42953333258628845, "logits/rejected": -0.45160502195358276, "logps/chosen": -109.01979064941406, "logps/rejected": -94.32757568359375, "loss": 0.1448, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.248141050338745, "rewards/margins": 5.128489971160889, "rewards/rejected": -1.880348563194275, "step": 4880 }, { "epoch": 2.23, "learning_rate": 8.52359208523592e-08, "logits/chosen": -0.42205095291137695, "logits/rejected": -0.463623046875, "logps/chosen": -121.49098205566406, "logps/rejected": -97.1163101196289, "loss": 0.1358, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.174238681793213, "rewards/margins": 6.480024814605713, "rewards/rejected": -2.3057861328125, "step": 4890 }, { "epoch": 2.24, "learning_rate": 8.472856418061896e-08, "logits/chosen": -0.4547550678253174, "logits/rejected": -0.48270148038864136, "logps/chosen": -124.11871337890625, "logps/rejected": -98.28508758544922, "loss": 0.1462, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.622130870819092, "rewards/margins": 6.161963939666748, "rewards/rejected": -2.5398333072662354, "step": 4900 }, { "epoch": 2.24, "eval_logits/chosen": -0.425849050283432, "eval_logits/rejected": -0.4553125500679016, "eval_logps/chosen": -116.215576171875, "eval_logps/rejected": -95.25931549072266, "eval_loss": 0.14547978341579437, "eval_rewards/accuracies": 0.9217877388000488, "eval_rewards/chosen": 3.472090482711792, "eval_rewards/margins": 5.636913299560547, "eval_rewards/rejected": -2.164822816848755, "eval_runtime": 921.1192, "eval_samples_per_second": 3.107, "eval_steps_per_second": 0.194, "step": 4900 }, { "epoch": 2.24, "learning_rate": 8.422120750887873e-08, "logits/chosen": -0.4150146543979645, "logits/rejected": -0.43759116530418396, "logps/chosen": -122.642333984375, "logps/rejected": -95.4013442993164, "loss": 0.1495, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.227237701416016, "rewards/margins": 5.948737144470215, "rewards/rejected": -1.7214996814727783, "step": 4910 }, { "epoch": 2.25, "learning_rate": 8.37138508371385e-08, "logits/chosen": -0.43968862295150757, "logits/rejected": -0.4645746648311615, "logps/chosen": -121.03364562988281, "logps/rejected": -100.46788024902344, "loss": 0.137, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.370578289031982, "rewards/margins": 6.668444633483887, "rewards/rejected": -2.297865867614746, "step": 4920 }, { "epoch": 2.25, "learning_rate": 8.320649416539826e-08, "logits/chosen": -0.42395099997520447, "logits/rejected": -0.4579865038394928, "logps/chosen": -117.4967041015625, "logps/rejected": -96.34669494628906, "loss": 0.1244, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.60498309135437, "rewards/margins": 5.425591945648193, "rewards/rejected": -1.8206090927124023, "step": 4930 }, { "epoch": 2.25, "learning_rate": 8.269913749365803e-08, "logits/chosen": -0.4292042851448059, "logits/rejected": -0.4508039355278015, "logps/chosen": -122.93159484863281, "logps/rejected": -98.60295104980469, "loss": 0.1383, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.629180431365967, "rewards/margins": 5.639649391174316, "rewards/rejected": -2.0104682445526123, "step": 4940 }, { "epoch": 2.26, "learning_rate": 8.21917808219178e-08, "logits/chosen": -0.42406630516052246, "logits/rejected": -0.45318031311035156, "logps/chosen": -123.0400161743164, "logps/rejected": -100.5845718383789, "loss": 0.1231, "rewards/accuracies": 0.9375, "rewards/chosen": 3.6148409843444824, "rewards/margins": 5.732459545135498, "rewards/rejected": -2.1176185607910156, "step": 4950 }, { "epoch": 2.26, "learning_rate": 8.168442415017756e-08, "logits/chosen": -0.4305325448513031, "logits/rejected": -0.4503403604030609, "logps/chosen": -122.27742004394531, "logps/rejected": -96.4347152709961, "loss": 0.1363, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.2944235801696777, "rewards/margins": 5.731644630432129, "rewards/rejected": -2.437220573425293, "step": 4960 }, { "epoch": 2.27, "learning_rate": 8.117706747843733e-08, "logits/chosen": -0.45811018347740173, "logits/rejected": -0.48085397481918335, "logps/chosen": -115.161376953125, "logps/rejected": -95.84727478027344, "loss": 0.1117, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.9942409992218018, "rewards/margins": 6.176326274871826, "rewards/rejected": -2.182084798812866, "step": 4970 }, { "epoch": 2.27, "learning_rate": 8.06697108066971e-08, "logits/chosen": -0.45269566774368286, "logits/rejected": -0.4753597378730774, "logps/chosen": -119.2424545288086, "logps/rejected": -97.34378051757812, "loss": 0.1201, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.708515167236328, "rewards/margins": 6.624059200286865, "rewards/rejected": -2.915544271469116, "step": 4980 }, { "epoch": 2.28, "learning_rate": 8.016235413495687e-08, "logits/chosen": -0.4186610281467438, "logits/rejected": -0.44470691680908203, "logps/chosen": -112.29804992675781, "logps/rejected": -98.46720123291016, "loss": 0.1281, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.875298500061035, "rewards/margins": 4.640026569366455, "rewards/rejected": -1.7647278308868408, "step": 4990 }, { "epoch": 2.28, "learning_rate": 7.965499746321664e-08, "logits/chosen": -0.42619308829307556, "logits/rejected": -0.4579424262046814, "logps/chosen": -124.06336975097656, "logps/rejected": -101.90495300292969, "loss": 0.1221, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.564988613128662, "rewards/margins": 6.438859462738037, "rewards/rejected": -1.8738704919815063, "step": 5000 }, { "epoch": 2.28, "eval_logits/chosen": -0.42679208517074585, "eval_logits/rejected": -0.45248451828956604, "eval_logps/chosen": -115.90120697021484, "eval_logps/rejected": -95.22400665283203, "eval_loss": 0.15375325083732605, "eval_rewards/accuracies": 0.9385474920272827, "eval_rewards/chosen": 3.62927508354187, "eval_rewards/margins": 5.7764434814453125, "eval_rewards/rejected": -2.1471686363220215, "eval_runtime": 917.3543, "eval_samples_per_second": 3.12, "eval_steps_per_second": 0.195, "step": 5000 }, { "epoch": 2.29, "learning_rate": 7.91476407914764e-08, "logits/chosen": -0.4315185546875, "logits/rejected": -0.4569702744483948, "logps/chosen": -116.04268646240234, "logps/rejected": -93.1859130859375, "loss": 0.1457, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.254358768463135, "rewards/margins": 5.7949419021606445, "rewards/rejected": -1.5405833721160889, "step": 5010 }, { "epoch": 2.29, "learning_rate": 7.864028411973617e-08, "logits/chosen": -0.4479445517063141, "logits/rejected": -0.4728432297706604, "logps/chosen": -115.8251724243164, "logps/rejected": -91.85781860351562, "loss": 0.145, "rewards/accuracies": 0.9375, "rewards/chosen": 3.826247453689575, "rewards/margins": 5.700136184692383, "rewards/rejected": -1.8738889694213867, "step": 5020 }, { "epoch": 2.3, "learning_rate": 7.813292744799594e-08, "logits/chosen": -0.42761698365211487, "logits/rejected": -0.45577025413513184, "logps/chosen": -118.49787902832031, "logps/rejected": -97.28572845458984, "loss": 0.1264, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.764814853668213, "rewards/margins": 5.964900970458984, "rewards/rejected": -2.2000865936279297, "step": 5030 }, { "epoch": 2.3, "learning_rate": 7.76255707762557e-08, "logits/chosen": -0.4219226837158203, "logits/rejected": -0.46134597063064575, "logps/chosen": -114.80213928222656, "logps/rejected": -96.4936294555664, "loss": 0.144, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.602666139602661, "rewards/margins": 5.779239177703857, "rewards/rejected": -2.1765735149383545, "step": 5040 }, { "epoch": 2.31, "learning_rate": 7.711821410451547e-08, "logits/chosen": -0.4301370084285736, "logits/rejected": -0.4616777002811432, "logps/chosen": -120.4800033569336, "logps/rejected": -95.62647247314453, "loss": 0.1161, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.8796589374542236, "rewards/margins": 6.127512454986572, "rewards/rejected": -2.2478537559509277, "step": 5050 }, { "epoch": 2.31, "learning_rate": 7.661085743277524e-08, "logits/chosen": -0.4376090466976166, "logits/rejected": -0.4689910411834717, "logps/chosen": -123.61064147949219, "logps/rejected": -97.51295471191406, "loss": 0.1214, "rewards/accuracies": 1.0, "rewards/chosen": 4.019012451171875, "rewards/margins": 5.775191307067871, "rewards/rejected": -1.7561795711517334, "step": 5060 }, { "epoch": 2.31, "learning_rate": 7.6103500761035e-08, "logits/chosen": -0.43211430311203003, "logits/rejected": -0.4615866541862488, "logps/chosen": -114.49861907958984, "logps/rejected": -94.89616394042969, "loss": 0.131, "rewards/accuracies": 0.9375, "rewards/chosen": 3.0946316719055176, "rewards/margins": 5.596832752227783, "rewards/rejected": -2.502201557159424, "step": 5070 }, { "epoch": 2.32, "learning_rate": 7.559614408929477e-08, "logits/chosen": -0.4486325681209564, "logits/rejected": -0.47001224756240845, "logps/chosen": -120.32537841796875, "logps/rejected": -96.01399230957031, "loss": 0.1241, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.4849534034729, "rewards/margins": 6.655509948730469, "rewards/rejected": -2.1705565452575684, "step": 5080 }, { "epoch": 2.32, "learning_rate": 7.508878741755454e-08, "logits/chosen": -0.44581979513168335, "logits/rejected": -0.47158461809158325, "logps/chosen": -121.13935852050781, "logps/rejected": -96.64893341064453, "loss": 0.1101, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.932180881500244, "rewards/margins": 6.488729000091553, "rewards/rejected": -2.556548833847046, "step": 5090 }, { "epoch": 2.33, "learning_rate": 7.45814307458143e-08, "logits/chosen": -0.4497924745082855, "logits/rejected": -0.4717227518558502, "logps/chosen": -119.8982925415039, "logps/rejected": -101.00511169433594, "loss": 0.1329, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.685342788696289, "rewards/margins": 6.00387716293335, "rewards/rejected": -2.3185343742370605, "step": 5100 }, { "epoch": 2.33, "eval_logits/chosen": -0.43014708161354065, "eval_logits/rejected": -0.45783746242523193, "eval_logps/chosen": -116.2129898071289, "eval_logps/rejected": -95.2852554321289, "eval_loss": 0.1486155241727829, "eval_rewards/accuracies": 0.9357541799545288, "eval_rewards/chosen": 3.4733829498291016, "eval_rewards/margins": 5.6511712074279785, "eval_rewards/rejected": -2.177788019180298, "eval_runtime": 912.1321, "eval_samples_per_second": 3.138, "eval_steps_per_second": 0.196, "step": 5100 }, { "epoch": 2.33, "learning_rate": 7.407407407407407e-08, "logits/chosen": -0.4249711036682129, "logits/rejected": -0.4414646625518799, "logps/chosen": -116.63139343261719, "logps/rejected": -96.9327163696289, "loss": 0.1374, "rewards/accuracies": 0.875, "rewards/chosen": 3.5760879516601562, "rewards/margins": 5.616466522216797, "rewards/rejected": -2.040379047393799, "step": 5110 }, { "epoch": 2.34, "learning_rate": 7.356671740233384e-08, "logits/chosen": -0.44433823227882385, "logits/rejected": -0.48357200622558594, "logps/chosen": -119.763671875, "logps/rejected": -92.0307388305664, "loss": 0.1614, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.4530186653137207, "rewards/margins": 5.796075344085693, "rewards/rejected": -2.3430564403533936, "step": 5120 }, { "epoch": 2.34, "learning_rate": 7.30593607305936e-08, "logits/chosen": -0.4470491409301758, "logits/rejected": -0.45599809288978577, "logps/chosen": -119.12043762207031, "logps/rejected": -97.7674789428711, "loss": 0.1402, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.528527021408081, "rewards/margins": 5.870127201080322, "rewards/rejected": -2.341599702835083, "step": 5130 }, { "epoch": 2.35, "learning_rate": 7.255200405885337e-08, "logits/chosen": -0.41116419434547424, "logits/rejected": -0.44234171509742737, "logps/chosen": -118.84635925292969, "logps/rejected": -98.58489990234375, "loss": 0.1135, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.288877487182617, "rewards/margins": 6.042611122131348, "rewards/rejected": -1.7537336349487305, "step": 5140 }, { "epoch": 2.35, "learning_rate": 7.204464738711314e-08, "logits/chosen": -0.454406201839447, "logits/rejected": -0.47148528695106506, "logps/chosen": -120.2448959350586, "logps/rejected": -97.86837768554688, "loss": 0.1561, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.711885929107666, "rewards/margins": 6.327667236328125, "rewards/rejected": -2.6157820224761963, "step": 5150 }, { "epoch": 2.36, "learning_rate": 7.15372907153729e-08, "logits/chosen": -0.4334358274936676, "logits/rejected": -0.45983433723449707, "logps/chosen": -117.58463287353516, "logps/rejected": -98.2049789428711, "loss": 0.1475, "rewards/accuracies": 0.9375, "rewards/chosen": 3.3072800636291504, "rewards/margins": 5.857844352722168, "rewards/rejected": -2.5505640506744385, "step": 5160 }, { "epoch": 2.36, "learning_rate": 7.102993404363267e-08, "logits/chosen": -0.4280088543891907, "logits/rejected": -0.4603753089904785, "logps/chosen": -120.7674331665039, "logps/rejected": -96.3237533569336, "loss": 0.1194, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.141296863555908, "rewards/margins": 5.86881160736084, "rewards/rejected": -1.7275148630142212, "step": 5170 }, { "epoch": 2.36, "learning_rate": 7.052257737189244e-08, "logits/chosen": -0.4364453852176666, "logits/rejected": -0.45729178190231323, "logps/chosen": -114.9288330078125, "logps/rejected": -93.96846771240234, "loss": 0.1222, "rewards/accuracies": 0.9375, "rewards/chosen": 3.952524185180664, "rewards/margins": 5.895291805267334, "rewards/rejected": -1.9427669048309326, "step": 5180 }, { "epoch": 2.37, "learning_rate": 7.00152207001522e-08, "logits/chosen": -0.4382234513759613, "logits/rejected": -0.47471123933792114, "logps/chosen": -116.1670913696289, "logps/rejected": -96.3835678100586, "loss": 0.1456, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.9336535930633545, "rewards/margins": 6.241081237792969, "rewards/rejected": -2.3074281215667725, "step": 5190 }, { "epoch": 2.37, "learning_rate": 6.950786402841197e-08, "logits/chosen": -0.44112008810043335, "logits/rejected": -0.4550951421260834, "logps/chosen": -121.0194320678711, "logps/rejected": -97.9061508178711, "loss": 0.1284, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.909855604171753, "rewards/margins": 5.62261438369751, "rewards/rejected": -2.7127585411071777, "step": 5200 }, { "epoch": 2.37, "eval_logits/chosen": -0.4328519403934479, "eval_logits/rejected": -0.46113479137420654, "eval_logps/chosen": -116.19883728027344, "eval_logps/rejected": -95.26362609863281, "eval_loss": 0.1526959091424942, "eval_rewards/accuracies": 0.9078212380409241, "eval_rewards/chosen": 3.4804577827453613, "eval_rewards/margins": 5.647433757781982, "eval_rewards/rejected": -2.1669766902923584, "eval_runtime": 871.9882, "eval_samples_per_second": 3.282, "eval_steps_per_second": 0.205, "step": 5200 }, { "epoch": 2.38, "learning_rate": 6.900050735667174e-08, "logits/chosen": -0.4210163950920105, "logits/rejected": -0.4521384835243225, "logps/chosen": -114.48892974853516, "logps/rejected": -100.7722396850586, "loss": 0.1248, "rewards/accuracies": 0.9375, "rewards/chosen": 3.135586738586426, "rewards/margins": 6.100742340087891, "rewards/rejected": -2.965155839920044, "step": 5210 }, { "epoch": 2.38, "learning_rate": 6.84931506849315e-08, "logits/chosen": -0.39191263914108276, "logits/rejected": -0.4211401045322418, "logps/chosen": -115.17881774902344, "logps/rejected": -103.11189270019531, "loss": 0.12, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.7481086254119873, "rewards/margins": 5.617436408996582, "rewards/rejected": -1.8693279027938843, "step": 5220 }, { "epoch": 2.39, "learning_rate": 6.798579401319127e-08, "logits/chosen": -0.42194071412086487, "logits/rejected": -0.452617883682251, "logps/chosen": -115.75611877441406, "logps/rejected": -99.6901626586914, "loss": 0.1173, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.275547504425049, "rewards/margins": 6.447475433349609, "rewards/rejected": -2.1719284057617188, "step": 5230 }, { "epoch": 2.39, "learning_rate": 6.747843734145104e-08, "logits/chosen": -0.4251808226108551, "logits/rejected": -0.4505956768989563, "logps/chosen": -113.19279479980469, "logps/rejected": -95.99583435058594, "loss": 0.1194, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.490676164627075, "rewards/margins": 6.030789375305176, "rewards/rejected": -2.540113687515259, "step": 5240 }, { "epoch": 2.4, "learning_rate": 6.69710806697108e-08, "logits/chosen": -0.44365444779396057, "logits/rejected": -0.4735577702522278, "logps/chosen": -119.94706726074219, "logps/rejected": -96.17887878417969, "loss": 0.1544, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.618509292602539, "rewards/margins": 6.0719451904296875, "rewards/rejected": -2.4534356594085693, "step": 5250 }, { "epoch": 2.4, "learning_rate": 6.646372399797057e-08, "logits/chosen": -0.405931293964386, "logits/rejected": -0.43033894896507263, "logps/chosen": -112.36710357666016, "logps/rejected": -93.01887512207031, "loss": 0.1402, "rewards/accuracies": 0.9375, "rewards/chosen": 3.3639919757843018, "rewards/margins": 5.533229827880859, "rewards/rejected": -2.1692380905151367, "step": 5260 }, { "epoch": 2.41, "learning_rate": 6.595636732623034e-08, "logits/chosen": -0.4300483763217926, "logits/rejected": -0.46487370133399963, "logps/chosen": -117.56733703613281, "logps/rejected": -97.90502166748047, "loss": 0.1227, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.4772467613220215, "rewards/margins": 6.186609745025635, "rewards/rejected": -2.7093615531921387, "step": 5270 }, { "epoch": 2.41, "learning_rate": 6.54490106544901e-08, "logits/chosen": -0.44517087936401367, "logits/rejected": -0.46531495451927185, "logps/chosen": -116.51817321777344, "logps/rejected": -96.64463806152344, "loss": 0.1316, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.680448532104492, "rewards/margins": 6.1895432472229, "rewards/rejected": -2.509094715118408, "step": 5280 }, { "epoch": 2.41, "learning_rate": 6.494165398274987e-08, "logits/chosen": -0.4242396354675293, "logits/rejected": -0.44850629568099976, "logps/chosen": -125.0820083618164, "logps/rejected": -97.7175521850586, "loss": 0.1468, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.043990135192871, "rewards/margins": 4.996575355529785, "rewards/rejected": -1.9525845050811768, "step": 5290 }, { "epoch": 2.42, "learning_rate": 6.443429731100964e-08, "logits/chosen": -0.45390892028808594, "logits/rejected": -0.48004603385925293, "logps/chosen": -122.3233642578125, "logps/rejected": -99.90754699707031, "loss": 0.1238, "rewards/accuracies": 0.9375, "rewards/chosen": 3.569197416305542, "rewards/margins": 5.976964473724365, "rewards/rejected": -2.4077672958374023, "step": 5300 }, { "epoch": 2.42, "eval_logits/chosen": -0.4191027879714966, "eval_logits/rejected": -0.4451000988483429, "eval_logps/chosen": -116.24568176269531, "eval_logps/rejected": -95.28321075439453, "eval_loss": 0.14333823323249817, "eval_rewards/accuracies": 0.9273743033409119, "eval_rewards/chosen": 3.457033634185791, "eval_rewards/margins": 5.63380241394043, "eval_rewards/rejected": -2.1767685413360596, "eval_runtime": 900.5295, "eval_samples_per_second": 3.178, "eval_steps_per_second": 0.199, "step": 5300 }, { "epoch": 2.42, "learning_rate": 6.39269406392694e-08, "logits/chosen": -0.4350680410861969, "logits/rejected": -0.4649884104728699, "logps/chosen": -119.18917083740234, "logps/rejected": -100.50810241699219, "loss": 0.1225, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.4932448863983154, "rewards/margins": 6.379704475402832, "rewards/rejected": -2.886460304260254, "step": 5310 }, { "epoch": 2.43, "learning_rate": 6.341958396752917e-08, "logits/chosen": -0.4124290347099304, "logits/rejected": -0.4478355050086975, "logps/chosen": -121.8824691772461, "logps/rejected": -96.27501678466797, "loss": 0.1424, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.710458278656006, "rewards/margins": 6.276509761810303, "rewards/rejected": -2.566051959991455, "step": 5320 }, { "epoch": 2.43, "learning_rate": 6.291222729578894e-08, "logits/chosen": -0.41530436277389526, "logits/rejected": -0.4465157389640808, "logps/chosen": -121.0118637084961, "logps/rejected": -96.7669677734375, "loss": 0.1091, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.8442223072052, "rewards/margins": 5.426248550415039, "rewards/rejected": -1.582026481628418, "step": 5330 }, { "epoch": 2.44, "learning_rate": 6.24048706240487e-08, "logits/chosen": -0.4436219334602356, "logits/rejected": -0.47822102904319763, "logps/chosen": -124.68888854980469, "logps/rejected": -95.68738555908203, "loss": 0.1183, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.960723876953125, "rewards/margins": 5.930281639099121, "rewards/rejected": -1.9695583581924438, "step": 5340 }, { "epoch": 2.44, "learning_rate": 6.189751395230847e-08, "logits/chosen": -0.4435461163520813, "logits/rejected": -0.46420255303382874, "logps/chosen": -124.46327209472656, "logps/rejected": -97.09564208984375, "loss": 0.1126, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.607532501220703, "rewards/margins": 5.740006446838379, "rewards/rejected": -2.1324732303619385, "step": 5350 }, { "epoch": 2.45, "learning_rate": 6.139015728056824e-08, "logits/chosen": -0.4381752610206604, "logits/rejected": -0.45854130387306213, "logps/chosen": -115.46308898925781, "logps/rejected": -96.0640869140625, "loss": 0.1432, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.904341220855713, "rewards/margins": 6.217533111572266, "rewards/rejected": -2.313192129135132, "step": 5360 }, { "epoch": 2.45, "learning_rate": 6.0882800608828e-08, "logits/chosen": -0.43165189027786255, "logits/rejected": -0.4557250440120697, "logps/chosen": -115.7738037109375, "logps/rejected": -97.3882064819336, "loss": 0.132, "rewards/accuracies": 0.9375, "rewards/chosen": 3.428105592727661, "rewards/margins": 5.922058582305908, "rewards/rejected": -2.493953227996826, "step": 5370 }, { "epoch": 2.46, "learning_rate": 6.037544393708777e-08, "logits/chosen": -0.41999363899230957, "logits/rejected": -0.44616618752479553, "logps/chosen": -119.87571716308594, "logps/rejected": -100.83201599121094, "loss": 0.0991, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.8233256340026855, "rewards/margins": 6.311063766479492, "rewards/rejected": -2.4877381324768066, "step": 5380 }, { "epoch": 2.46, "learning_rate": 5.986808726534754e-08, "logits/chosen": -0.435232937335968, "logits/rejected": -0.46104755997657776, "logps/chosen": -122.05622863769531, "logps/rejected": -95.90516662597656, "loss": 0.1192, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.9345486164093018, "rewards/margins": 6.371971607208252, "rewards/rejected": -2.4374232292175293, "step": 5390 }, { "epoch": 2.46, "learning_rate": 5.93607305936073e-08, "logits/chosen": -0.4300897717475891, "logits/rejected": -0.45790234208106995, "logps/chosen": -116.21836853027344, "logps/rejected": -94.72840881347656, "loss": 0.1317, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.370344877243042, "rewards/margins": 5.3175368309021, "rewards/rejected": -1.947191834449768, "step": 5400 }, { "epoch": 2.46, "eval_logits/chosen": -0.4342188835144043, "eval_logits/rejected": -0.4564913213253021, "eval_logps/chosen": -116.03028106689453, "eval_logps/rejected": -95.37613677978516, "eval_loss": 0.1420595645904541, "eval_rewards/accuracies": 0.9329608678817749, "eval_rewards/chosen": 3.564730644226074, "eval_rewards/margins": 5.787961959838867, "eval_rewards/rejected": -2.2232308387756348, "eval_runtime": 900.7223, "eval_samples_per_second": 3.177, "eval_steps_per_second": 0.199, "step": 5400 }, { "epoch": 2.47, "learning_rate": 5.8853373921867065e-08, "logits/chosen": -0.43930888175964355, "logits/rejected": -0.4661675989627838, "logps/chosen": -118.8895492553711, "logps/rejected": -101.40609741210938, "loss": 0.1252, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.623162031173706, "rewards/margins": 5.550017356872559, "rewards/rejected": -1.9268558025360107, "step": 5410 }, { "epoch": 2.47, "learning_rate": 5.834601725012683e-08, "logits/chosen": -0.4218188226222992, "logits/rejected": -0.4493798613548279, "logps/chosen": -117.68571472167969, "logps/rejected": -97.6932373046875, "loss": 0.1242, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.941012144088745, "rewards/margins": 5.962545871734619, "rewards/rejected": -2.021533489227295, "step": 5420 }, { "epoch": 2.48, "learning_rate": 5.78386605783866e-08, "logits/chosen": -0.45547741651535034, "logits/rejected": -0.47590240836143494, "logps/chosen": -120.83634948730469, "logps/rejected": -95.0353775024414, "loss": 0.1355, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.9494616985321045, "rewards/margins": 6.385022163391113, "rewards/rejected": -2.435560464859009, "step": 5430 }, { "epoch": 2.48, "learning_rate": 5.7331303906646365e-08, "logits/chosen": -0.44121265411376953, "logits/rejected": -0.46499747037887573, "logps/chosen": -116.39122009277344, "logps/rejected": -100.46307373046875, "loss": 0.1434, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.17018985748291, "rewards/margins": 5.512290000915527, "rewards/rejected": -2.3421006202697754, "step": 5440 }, { "epoch": 2.49, "learning_rate": 5.682394723490613e-08, "logits/chosen": -0.41486072540283203, "logits/rejected": -0.44767364859580994, "logps/chosen": -125.267578125, "logps/rejected": -96.89026641845703, "loss": 0.1006, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.735731840133667, "rewards/margins": 5.538851261138916, "rewards/rejected": -1.8031189441680908, "step": 5450 }, { "epoch": 2.49, "learning_rate": 5.63165905631659e-08, "logits/chosen": -0.45300665497779846, "logits/rejected": -0.4838689863681793, "logps/chosen": -125.07342529296875, "logps/rejected": -103.2747573852539, "loss": 0.1553, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.719700574874878, "rewards/margins": 6.036759376525879, "rewards/rejected": -2.31705904006958, "step": 5460 }, { "epoch": 2.5, "learning_rate": 5.5809233891425665e-08, "logits/chosen": -0.41899123787879944, "logits/rejected": -0.4467073380947113, "logps/chosen": -117.2870864868164, "logps/rejected": -91.2414321899414, "loss": 0.1381, "rewards/accuracies": 0.9375, "rewards/chosen": 3.313997745513916, "rewards/margins": 5.549351215362549, "rewards/rejected": -2.235353469848633, "step": 5470 }, { "epoch": 2.5, "learning_rate": 5.530187721968543e-08, "logits/chosen": -0.40872034430503845, "logits/rejected": -0.44184407591819763, "logps/chosen": -122.60057067871094, "logps/rejected": -97.71039581298828, "loss": 0.1401, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.035274028778076, "rewards/margins": 5.73886775970459, "rewards/rejected": -1.7035942077636719, "step": 5480 }, { "epoch": 2.51, "learning_rate": 5.47945205479452e-08, "logits/chosen": -0.4714629054069519, "logits/rejected": -0.4978507161140442, "logps/chosen": -121.52668762207031, "logps/rejected": -98.09647369384766, "loss": 0.1282, "rewards/accuracies": 0.9375, "rewards/chosen": 3.4877541065216064, "rewards/margins": 5.688804626464844, "rewards/rejected": -2.2010507583618164, "step": 5490 }, { "epoch": 2.51, "learning_rate": 5.4287163876204964e-08, "logits/chosen": -0.4360930919647217, "logits/rejected": -0.4640362858772278, "logps/chosen": -116.9971923828125, "logps/rejected": -99.79209899902344, "loss": 0.131, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.117398262023926, "rewards/margins": 6.651013374328613, "rewards/rejected": -2.5336146354675293, "step": 5500 }, { "epoch": 2.51, "eval_logits/chosen": -0.4146881103515625, "eval_logits/rejected": -0.44443634152412415, "eval_logps/chosen": -116.3175277709961, "eval_logps/rejected": -95.46585083007812, "eval_loss": 0.14781765639781952, "eval_rewards/accuracies": 0.9189944267272949, "eval_rewards/chosen": 3.4211063385009766, "eval_rewards/margins": 5.689195156097412, "eval_rewards/rejected": -2.2680890560150146, "eval_runtime": 926.1823, "eval_samples_per_second": 3.09, "eval_steps_per_second": 0.193, "step": 5500 }, { "epoch": 2.52, "learning_rate": 5.377980720446473e-08, "logits/chosen": -0.43201667070388794, "logits/rejected": -0.46011418104171753, "logps/chosen": -113.99064636230469, "logps/rejected": -92.80901336669922, "loss": 0.1115, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.70094633102417, "rewards/margins": 5.098592281341553, "rewards/rejected": -2.397646427154541, "step": 5510 }, { "epoch": 2.52, "learning_rate": 5.32724505327245e-08, "logits/chosen": -0.41051793098449707, "logits/rejected": -0.437521755695343, "logps/chosen": -120.39073181152344, "logps/rejected": -99.1684799194336, "loss": 0.1696, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.8455090522766113, "rewards/margins": 5.564748287200928, "rewards/rejected": -1.7192399501800537, "step": 5520 }, { "epoch": 2.52, "learning_rate": 5.2765093860984264e-08, "logits/chosen": -0.45660334825515747, "logits/rejected": -0.47934332489967346, "logps/chosen": -116.85618591308594, "logps/rejected": -98.81648254394531, "loss": 0.1406, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.6133334636688232, "rewards/margins": 5.89406681060791, "rewards/rejected": -2.280733108520508, "step": 5530 }, { "epoch": 2.53, "learning_rate": 5.225773718924403e-08, "logits/chosen": -0.42749834060668945, "logits/rejected": -0.45927342772483826, "logps/chosen": -115.58152770996094, "logps/rejected": -91.99083709716797, "loss": 0.1216, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 4.055908679962158, "rewards/margins": 5.848606586456299, "rewards/rejected": -1.7926979064941406, "step": 5540 }, { "epoch": 2.53, "learning_rate": 5.17503805175038e-08, "logits/chosen": -0.4737180173397064, "logits/rejected": -0.49301090836524963, "logps/chosen": -115.43916320800781, "logps/rejected": -97.0264892578125, "loss": 0.1495, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.613140821456909, "rewards/margins": 5.604285717010498, "rewards/rejected": -1.991145133972168, "step": 5550 }, { "epoch": 2.54, "learning_rate": 5.1243023845763564e-08, "logits/chosen": -0.4277074337005615, "logits/rejected": -0.4595872759819031, "logps/chosen": -120.0799789428711, "logps/rejected": -96.8666000366211, "loss": 0.124, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.7783608436584473, "rewards/margins": 6.271768093109131, "rewards/rejected": -2.4934072494506836, "step": 5560 }, { "epoch": 2.54, "learning_rate": 5.073566717402333e-08, "logits/chosen": -0.44470876455307007, "logits/rejected": -0.46704134345054626, "logps/chosen": -113.39207458496094, "logps/rejected": -97.73541259765625, "loss": 0.1385, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.5125949382781982, "rewards/margins": 5.59378719329834, "rewards/rejected": -2.0811917781829834, "step": 5570 }, { "epoch": 2.55, "learning_rate": 5.02283105022831e-08, "logits/chosen": -0.43218737840652466, "logits/rejected": -0.46957531571388245, "logps/chosen": -121.54664611816406, "logps/rejected": -92.61653137207031, "loss": 0.1492, "rewards/accuracies": 0.9375, "rewards/chosen": 3.846766710281372, "rewards/margins": 5.690174102783203, "rewards/rejected": -1.8434079885482788, "step": 5580 }, { "epoch": 2.55, "learning_rate": 4.9720953830542864e-08, "logits/chosen": -0.455254465341568, "logits/rejected": -0.4841559827327728, "logps/chosen": -116.07682800292969, "logps/rejected": -99.91950988769531, "loss": 0.1311, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.1617395877838135, "rewards/margins": 5.66703462600708, "rewards/rejected": -2.5052950382232666, "step": 5590 }, { "epoch": 2.56, "learning_rate": 4.921359715880263e-08, "logits/chosen": -0.4276158809661865, "logits/rejected": -0.44826555252075195, "logps/chosen": -116.87984466552734, "logps/rejected": -93.79035186767578, "loss": 0.1235, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 4.352602481842041, "rewards/margins": 6.3349151611328125, "rewards/rejected": -1.982312560081482, "step": 5600 }, { "epoch": 2.56, "eval_logits/chosen": -0.4234235882759094, "eval_logits/rejected": -0.4484900236129761, "eval_logps/chosen": -116.10143280029297, "eval_logps/rejected": -95.48921966552734, "eval_loss": 0.14275720715522766, "eval_rewards/accuracies": 0.9413408041000366, "eval_rewards/chosen": 3.5291614532470703, "eval_rewards/margins": 5.808929920196533, "eval_rewards/rejected": -2.279768466949463, "eval_runtime": 876.6048, "eval_samples_per_second": 3.265, "eval_steps_per_second": 0.204, "step": 5600 }, { "epoch": 2.56, "learning_rate": 4.87062404870624e-08, "logits/chosen": -0.452223539352417, "logits/rejected": -0.47349920868873596, "logps/chosen": -117.63553619384766, "logps/rejected": -96.32161712646484, "loss": 0.1157, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 4.280198574066162, "rewards/margins": 6.476129055023193, "rewards/rejected": -2.1959307193756104, "step": 5610 }, { "epoch": 2.57, "learning_rate": 4.8198883815322164e-08, "logits/chosen": -0.4440379738807678, "logits/rejected": -0.4710288941860199, "logps/chosen": -118.5327377319336, "logps/rejected": -101.5977554321289, "loss": 0.1398, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8866469860076904, "rewards/margins": 5.589003086090088, "rewards/rejected": -2.7023556232452393, "step": 5620 }, { "epoch": 2.57, "learning_rate": 4.769152714358193e-08, "logits/chosen": -0.4269390106201172, "logits/rejected": -0.4624324440956116, "logps/chosen": -113.65742492675781, "logps/rejected": -95.20040893554688, "loss": 0.1168, "rewards/accuracies": 0.9375, "rewards/chosen": 4.0439372062683105, "rewards/margins": 6.248236656188965, "rewards/rejected": -2.204299211502075, "step": 5630 }, { "epoch": 2.57, "learning_rate": 4.71841704718417e-08, "logits/chosen": -0.4506987929344177, "logits/rejected": -0.4784929156303406, "logps/chosen": -114.27982330322266, "logps/rejected": -92.17994689941406, "loss": 0.125, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.4143104553222656, "rewards/margins": 5.477658271789551, "rewards/rejected": -2.063347816467285, "step": 5640 }, { "epoch": 2.58, "learning_rate": 4.6676813800101464e-08, "logits/chosen": -0.45349711179733276, "logits/rejected": -0.47614622116088867, "logps/chosen": -118.63285827636719, "logps/rejected": -93.47701263427734, "loss": 0.1465, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.212621212005615, "rewards/margins": 6.03704833984375, "rewards/rejected": -1.824427604675293, "step": 5650 }, { "epoch": 2.58, "learning_rate": 4.616945712836123e-08, "logits/chosen": -0.441559374332428, "logits/rejected": -0.46678081154823303, "logps/chosen": -119.7842025756836, "logps/rejected": -93.7694320678711, "loss": 0.1281, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.7817344665527344, "rewards/margins": 6.05294132232666, "rewards/rejected": -2.2712061405181885, "step": 5660 }, { "epoch": 2.59, "learning_rate": 4.5662100456621e-08, "logits/chosen": -0.4511653780937195, "logits/rejected": -0.4725533425807953, "logps/chosen": -116.87252044677734, "logps/rejected": -98.78968048095703, "loss": 0.1162, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.7561240196228027, "rewards/margins": 5.7545342445373535, "rewards/rejected": -1.9984098672866821, "step": 5670 }, { "epoch": 2.59, "learning_rate": 4.5154743784880764e-08, "logits/chosen": -0.4162279963493347, "logits/rejected": -0.4527038037776947, "logps/chosen": -111.1601791381836, "logps/rejected": -100.4341812133789, "loss": 0.1382, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.014040470123291, "rewards/margins": 6.132898807525635, "rewards/rejected": -2.1188578605651855, "step": 5680 }, { "epoch": 2.6, "learning_rate": 4.464738711314053e-08, "logits/chosen": -0.43241244554519653, "logits/rejected": -0.4595095217227936, "logps/chosen": -117.22080993652344, "logps/rejected": -95.52210235595703, "loss": 0.1458, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.282032012939453, "rewards/margins": 5.7940144538879395, "rewards/rejected": -2.5119822025299072, "step": 5690 }, { "epoch": 2.6, "learning_rate": 4.41400304414003e-08, "logits/chosen": -0.4618196487426758, "logits/rejected": -0.4878638684749603, "logps/chosen": -123.39935302734375, "logps/rejected": -96.44064331054688, "loss": 0.1122, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.916281223297119, "rewards/margins": 6.677175998687744, "rewards/rejected": -2.760894775390625, "step": 5700 }, { "epoch": 2.6, "eval_logits/chosen": -0.4233376681804657, "eval_logits/rejected": -0.44727030396461487, "eval_logps/chosen": -115.93925476074219, "eval_logps/rejected": -95.40226745605469, "eval_loss": 0.1444796770811081, "eval_rewards/accuracies": 0.9329608678817749, "eval_rewards/chosen": 3.610247850418091, "eval_rewards/margins": 5.846547603607178, "eval_rewards/rejected": -2.2362990379333496, "eval_runtime": 911.2103, "eval_samples_per_second": 3.141, "eval_steps_per_second": 0.196, "step": 5700 }, { "epoch": 2.61, "learning_rate": 4.3632673769660064e-08, "logits/chosen": -0.4469119906425476, "logits/rejected": -0.4657912254333496, "logps/chosen": -121.95216369628906, "logps/rejected": -101.41227722167969, "loss": 0.1031, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.711015224456787, "rewards/margins": 5.949687957763672, "rewards/rejected": -2.2386727333068848, "step": 5710 }, { "epoch": 2.61, "learning_rate": 4.312531709791983e-08, "logits/chosen": -0.4170357584953308, "logits/rejected": -0.4365556836128235, "logps/chosen": -120.26194763183594, "logps/rejected": -102.38157653808594, "loss": 0.1371, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.7953782081604004, "rewards/margins": 5.940617084503174, "rewards/rejected": -2.1452383995056152, "step": 5720 }, { "epoch": 2.62, "learning_rate": 4.26179604261796e-08, "logits/chosen": -0.4518299102783203, "logits/rejected": -0.4835347533226013, "logps/chosen": -112.53265380859375, "logps/rejected": -90.11258697509766, "loss": 0.1389, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.475214719772339, "rewards/margins": 5.532036781311035, "rewards/rejected": -2.0568222999572754, "step": 5730 }, { "epoch": 2.62, "learning_rate": 4.2110603754439363e-08, "logits/chosen": -0.40014880895614624, "logits/rejected": -0.4240821897983551, "logps/chosen": -115.626220703125, "logps/rejected": -97.0552749633789, "loss": 0.1141, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.930833101272583, "rewards/margins": 6.204165458679199, "rewards/rejected": -2.2733330726623535, "step": 5740 }, { "epoch": 2.62, "learning_rate": 4.160324708269913e-08, "logits/chosen": -0.4410339295864105, "logits/rejected": -0.4653685986995697, "logps/chosen": -120.30546569824219, "logps/rejected": -99.77711486816406, "loss": 0.1325, "rewards/accuracies": 0.9375, "rewards/chosen": 3.5853590965270996, "rewards/margins": 5.607728004455566, "rewards/rejected": -2.022369146347046, "step": 5750 }, { "epoch": 2.63, "learning_rate": 4.10958904109589e-08, "logits/chosen": -0.4367760121822357, "logits/rejected": -0.4674459397792816, "logps/chosen": -108.0204849243164, "logps/rejected": -93.1372299194336, "loss": 0.1381, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.783496141433716, "rewards/margins": 5.598851680755615, "rewards/rejected": -1.8153560161590576, "step": 5760 }, { "epoch": 2.63, "learning_rate": 4.0588533739218663e-08, "logits/chosen": -0.4705559313297272, "logits/rejected": -0.49414676427841187, "logps/chosen": -114.16094970703125, "logps/rejected": -91.7486801147461, "loss": 0.1097, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.7645983695983887, "rewards/margins": 6.290394306182861, "rewards/rejected": -2.5257956981658936, "step": 5770 }, { "epoch": 2.64, "learning_rate": 4.0081177067478437e-08, "logits/chosen": -0.4423903822898865, "logits/rejected": -0.46557703614234924, "logps/chosen": -120.6229019165039, "logps/rejected": -95.86766052246094, "loss": 0.1412, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 4.339417457580566, "rewards/margins": 6.628237724304199, "rewards/rejected": -2.28882098197937, "step": 5780 }, { "epoch": 2.64, "learning_rate": 3.95738203957382e-08, "logits/chosen": -0.4328484535217285, "logits/rejected": -0.4610370993614197, "logps/chosen": -109.90287780761719, "logps/rejected": -98.98536682128906, "loss": 0.1247, "rewards/accuracies": 0.9375, "rewards/chosen": 3.143998861312866, "rewards/margins": 5.320948600769043, "rewards/rejected": -2.1769497394561768, "step": 5790 }, { "epoch": 2.65, "learning_rate": 3.906646372399797e-08, "logits/chosen": -0.4369220733642578, "logits/rejected": -0.46063485741615295, "logps/chosen": -123.28334045410156, "logps/rejected": -99.22138977050781, "loss": 0.1172, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.824085235595703, "rewards/margins": 6.6783342361450195, "rewards/rejected": -2.854248046875, "step": 5800 }, { "epoch": 2.65, "eval_logits/chosen": -0.435650110244751, "eval_logits/rejected": -0.46478769183158875, "eval_logps/chosen": -115.99723815917969, "eval_logps/rejected": -95.30946350097656, "eval_loss": 0.14151296019554138, "eval_rewards/accuracies": 0.924580991268158, "eval_rewards/chosen": 3.5812554359436035, "eval_rewards/margins": 5.771151065826416, "eval_rewards/rejected": -2.1898951530456543, "eval_runtime": 896.6878, "eval_samples_per_second": 3.192, "eval_steps_per_second": 0.2, "step": 5800 }, { "epoch": 2.65, "learning_rate": 3.8559107052257736e-08, "logits/chosen": -0.4274473190307617, "logits/rejected": -0.45593366026878357, "logps/chosen": -113.11031341552734, "logps/rejected": -95.99092864990234, "loss": 0.1283, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.605879068374634, "rewards/margins": 5.030173301696777, "rewards/rejected": -2.4242947101593018, "step": 5810 }, { "epoch": 2.66, "learning_rate": 3.80517503805175e-08, "logits/chosen": -0.42664894461631775, "logits/rejected": -0.45183706283569336, "logps/chosen": -127.3673095703125, "logps/rejected": -99.54170227050781, "loss": 0.1521, "rewards/accuracies": 0.9375, "rewards/chosen": 3.776498794555664, "rewards/margins": 5.840052127838135, "rewards/rejected": -2.0635530948638916, "step": 5820 }, { "epoch": 2.66, "learning_rate": 3.754439370877727e-08, "logits/chosen": -0.4793631136417389, "logits/rejected": -0.504968523979187, "logps/chosen": -116.24549865722656, "logps/rejected": -94.78995513916016, "loss": 0.143, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.5556461811065674, "rewards/margins": 5.986898422241211, "rewards/rejected": -2.4312522411346436, "step": 5830 }, { "epoch": 2.67, "learning_rate": 3.7037037037037036e-08, "logits/chosen": -0.41421228647232056, "logits/rejected": -0.4470001757144928, "logps/chosen": -113.46546936035156, "logps/rejected": -95.14855194091797, "loss": 0.0926, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.197138786315918, "rewards/margins": 6.1041083335876465, "rewards/rejected": -1.9069693088531494, "step": 5840 }, { "epoch": 2.67, "learning_rate": 3.65296803652968e-08, "logits/chosen": -0.4365863800048828, "logits/rejected": -0.46884846687316895, "logps/chosen": -113.07588195800781, "logps/rejected": -91.25666046142578, "loss": 0.1092, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.941469192504883, "rewards/margins": 5.8505144119262695, "rewards/rejected": -1.9090449810028076, "step": 5850 }, { "epoch": 2.67, "learning_rate": 3.602232369355657e-08, "logits/chosen": -0.457558810710907, "logits/rejected": -0.48787039518356323, "logps/chosen": -114.20429992675781, "logps/rejected": -96.5516586303711, "loss": 0.1184, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.194037437438965, "rewards/margins": 6.516490936279297, "rewards/rejected": -2.322453737258911, "step": 5860 }, { "epoch": 2.68, "learning_rate": 3.5514967021816336e-08, "logits/chosen": -0.42379432916641235, "logits/rejected": -0.4607546925544739, "logps/chosen": -121.55973815917969, "logps/rejected": -99.63563537597656, "loss": 0.1527, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.44954776763916, "rewards/margins": 6.263568878173828, "rewards/rejected": -1.8140218257904053, "step": 5870 }, { "epoch": 2.68, "learning_rate": 3.50076103500761e-08, "logits/chosen": -0.44541049003601074, "logits/rejected": -0.46894732117652893, "logps/chosen": -119.17094421386719, "logps/rejected": -101.33116149902344, "loss": 0.1135, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.428978443145752, "rewards/margins": 6.683131217956543, "rewards/rejected": -2.254152774810791, "step": 5880 }, { "epoch": 2.69, "learning_rate": 3.450025367833587e-08, "logits/chosen": -0.4532981514930725, "logits/rejected": -0.4787921905517578, "logps/chosen": -122.17115783691406, "logps/rejected": -98.2051773071289, "loss": 0.1307, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.271052598953247, "rewards/margins": 5.8984832763671875, "rewards/rejected": -2.627429962158203, "step": 5890 }, { "epoch": 2.69, "learning_rate": 3.3992897006595636e-08, "logits/chosen": -0.43592172861099243, "logits/rejected": -0.4776608347892761, "logps/chosen": -120.0385971069336, "logps/rejected": -99.87791442871094, "loss": 0.1257, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.9749057292938232, "rewards/margins": 6.650933742523193, "rewards/rejected": -2.6760289669036865, "step": 5900 }, { "epoch": 2.69, "eval_logits/chosen": -0.42685261368751526, "eval_logits/rejected": -0.4552892744541168, "eval_logps/chosen": -116.34471893310547, "eval_logps/rejected": -95.53901672363281, "eval_loss": 0.14278730750083923, "eval_rewards/accuracies": 0.9217877388000488, "eval_rewards/chosen": 3.4075093269348145, "eval_rewards/margins": 5.712179183959961, "eval_rewards/rejected": -2.3046703338623047, "eval_runtime": 898.1404, "eval_samples_per_second": 3.187, "eval_steps_per_second": 0.199, "step": 5900 }, { "epoch": 2.7, "learning_rate": 3.34855403348554e-08, "logits/chosen": -0.4431632459163666, "logits/rejected": -0.48115652799606323, "logps/chosen": -124.9809341430664, "logps/rejected": -96.26667785644531, "loss": 0.1237, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.165075778961182, "rewards/margins": 6.46014928817749, "rewards/rejected": -2.295074462890625, "step": 5910 }, { "epoch": 2.7, "learning_rate": 3.297818366311517e-08, "logits/chosen": -0.4241110682487488, "logits/rejected": -0.4464682936668396, "logps/chosen": -117.69609069824219, "logps/rejected": -96.72189331054688, "loss": 0.1075, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.744476318359375, "rewards/margins": 5.3151750564575195, "rewards/rejected": -1.570698618888855, "step": 5920 }, { "epoch": 2.71, "learning_rate": 3.2470826991374936e-08, "logits/chosen": -0.45314112305641174, "logits/rejected": -0.4763621687889099, "logps/chosen": -121.1375961303711, "logps/rejected": -96.46546936035156, "loss": 0.112, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.103701114654541, "rewards/margins": 6.2286553382873535, "rewards/rejected": -2.1249539852142334, "step": 5930 }, { "epoch": 2.71, "learning_rate": 3.19634703196347e-08, "logits/chosen": -0.4642356038093567, "logits/rejected": -0.4867876470088959, "logps/chosen": -119.946044921875, "logps/rejected": -100.07732391357422, "loss": 0.1153, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.106837749481201, "rewards/margins": 5.389840126037598, "rewards/rejected": -2.2830023765563965, "step": 5940 }, { "epoch": 2.72, "learning_rate": 3.145611364789447e-08, "logits/chosen": -0.4027109146118164, "logits/rejected": -0.4404265880584717, "logps/chosen": -114.95914459228516, "logps/rejected": -98.45368957519531, "loss": 0.1216, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.436598300933838, "rewards/margins": 6.390365123748779, "rewards/rejected": -2.9537672996520996, "step": 5950 }, { "epoch": 2.72, "learning_rate": 3.0948756976154236e-08, "logits/chosen": -0.43318143486976624, "logits/rejected": -0.46780461072921753, "logps/chosen": -116.51350402832031, "logps/rejected": -94.24980163574219, "loss": 0.1421, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.344388008117676, "rewards/margins": 6.8773345947265625, "rewards/rejected": -2.5329465866088867, "step": 5960 }, { "epoch": 2.73, "learning_rate": 3.0441400304414e-08, "logits/chosen": -0.42986053228378296, "logits/rejected": -0.4519343972206116, "logps/chosen": -116.03038024902344, "logps/rejected": -98.72990417480469, "loss": 0.1419, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.6905105113983154, "rewards/margins": 5.332497596740723, "rewards/rejected": -1.6419864892959595, "step": 5970 }, { "epoch": 2.73, "learning_rate": 2.993404363267377e-08, "logits/chosen": -0.4326365888118744, "logits/rejected": -0.4565103054046631, "logps/chosen": -107.49725341796875, "logps/rejected": -99.25447082519531, "loss": 0.1093, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.4035542011260986, "rewards/margins": 6.07853364944458, "rewards/rejected": -2.6749796867370605, "step": 5980 }, { "epoch": 2.73, "learning_rate": 2.9426686960933532e-08, "logits/chosen": -0.4541063904762268, "logits/rejected": -0.4810553193092346, "logps/chosen": -127.1908187866211, "logps/rejected": -98.79722595214844, "loss": 0.1272, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.745981216430664, "rewards/margins": 5.777894496917725, "rewards/rejected": -2.0319135189056396, "step": 5990 }, { "epoch": 2.74, "learning_rate": 2.89193302891933e-08, "logits/chosen": -0.417894184589386, "logits/rejected": -0.45300230383872986, "logps/chosen": -118.5451431274414, "logps/rejected": -100.79390716552734, "loss": 0.1441, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.300325393676758, "rewards/margins": 5.615603446960449, "rewards/rejected": -2.3152778148651123, "step": 6000 }, { "epoch": 2.74, "eval_logits/chosen": -0.44005295634269714, "eval_logits/rejected": -0.46727845072746277, "eval_logps/chosen": -116.30235290527344, "eval_logps/rejected": -95.57170867919922, "eval_loss": 0.14259207248687744, "eval_rewards/accuracies": 0.9189944267272949, "eval_rewards/chosen": 3.4286997318267822, "eval_rewards/margins": 5.749711036682129, "eval_rewards/rejected": -2.3210110664367676, "eval_runtime": 905.9645, "eval_samples_per_second": 3.159, "eval_steps_per_second": 0.198, "step": 6000 }, { "epoch": 2.74, "learning_rate": 2.8411973617453066e-08, "logits/chosen": -0.44576215744018555, "logits/rejected": -0.46743813157081604, "logps/chosen": -115.3594741821289, "logps/rejected": -98.07334899902344, "loss": 0.1339, "rewards/accuracies": 0.9375, "rewards/chosen": 3.71684193611145, "rewards/margins": 5.9137444496154785, "rewards/rejected": -2.1969027519226074, "step": 6010 }, { "epoch": 2.75, "learning_rate": 2.7904616945712832e-08, "logits/chosen": -0.40970954298973083, "logits/rejected": -0.4393082559108734, "logps/chosen": -124.13734436035156, "logps/rejected": -93.94351959228516, "loss": 0.1018, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.169826030731201, "rewards/margins": 6.453677177429199, "rewards/rejected": -2.283851146697998, "step": 6020 }, { "epoch": 2.75, "learning_rate": 2.73972602739726e-08, "logits/chosen": -0.4212943911552429, "logits/rejected": -0.4489540159702301, "logps/chosen": -122.31058502197266, "logps/rejected": -96.13169860839844, "loss": 0.1357, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.3273720741271973, "rewards/margins": 5.711031913757324, "rewards/rejected": -2.383660316467285, "step": 6030 }, { "epoch": 2.76, "learning_rate": 2.6889903602232366e-08, "logits/chosen": -0.4667048454284668, "logits/rejected": -0.49229010939598083, "logps/chosen": -122.15106201171875, "logps/rejected": -99.2340087890625, "loss": 0.1091, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.865828037261963, "rewards/margins": 6.067671775817871, "rewards/rejected": -2.2018446922302246, "step": 6040 }, { "epoch": 2.76, "learning_rate": 2.6382546930492132e-08, "logits/chosen": -0.4627668857574463, "logits/rejected": -0.4806435704231262, "logps/chosen": -122.12646484375, "logps/rejected": -95.74661254882812, "loss": 0.1049, "rewards/accuracies": 0.9375, "rewards/chosen": 3.7494614124298096, "rewards/margins": 5.866511344909668, "rewards/rejected": -2.1170496940612793, "step": 6050 }, { "epoch": 2.77, "learning_rate": 2.58751902587519e-08, "logits/chosen": -0.4114772379398346, "logits/rejected": -0.44379204511642456, "logps/chosen": -117.29730224609375, "logps/rejected": -94.56267547607422, "loss": 0.1538, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.685570478439331, "rewards/margins": 6.307326316833496, "rewards/rejected": -2.621755599975586, "step": 6060 }, { "epoch": 2.77, "learning_rate": 2.5367833587011665e-08, "logits/chosen": -0.4341645836830139, "logits/rejected": -0.46204155683517456, "logps/chosen": -122.454345703125, "logps/rejected": -96.84683227539062, "loss": 0.1169, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.7362613677978516, "rewards/margins": 6.1316447257995605, "rewards/rejected": -2.3953843116760254, "step": 6070 }, { "epoch": 2.78, "learning_rate": 2.4860476915271432e-08, "logits/chosen": -0.45279574394226074, "logits/rejected": -0.477097749710083, "logps/chosen": -128.41921997070312, "logps/rejected": -93.85469818115234, "loss": 0.1335, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.203787803649902, "rewards/margins": 6.0743513107299805, "rewards/rejected": -1.8705631494522095, "step": 6080 }, { "epoch": 2.78, "learning_rate": 2.43531202435312e-08, "logits/chosen": -0.4524506628513336, "logits/rejected": -0.4743841290473938, "logps/chosen": -106.57120513916016, "logps/rejected": -93.22120666503906, "loss": 0.1122, "rewards/accuracies": 0.9375, "rewards/chosen": 3.411867618560791, "rewards/margins": 5.438270568847656, "rewards/rejected": -2.0264029502868652, "step": 6090 }, { "epoch": 2.78, "learning_rate": 2.3845763571790965e-08, "logits/chosen": -0.46355438232421875, "logits/rejected": -0.4836340546607971, "logps/chosen": -118.38304138183594, "logps/rejected": -95.64445495605469, "loss": 0.1359, "rewards/accuracies": 0.9375, "rewards/chosen": 4.004835605621338, "rewards/margins": 6.421379089355469, "rewards/rejected": -2.4165430068969727, "step": 6100 }, { "epoch": 2.78, "eval_logits/chosen": -0.4173344075679779, "eval_logits/rejected": -0.4409016966819763, "eval_logps/chosen": -116.19310760498047, "eval_logps/rejected": -95.52818298339844, "eval_loss": 0.14793427288532257, "eval_rewards/accuracies": 0.9357541799545288, "eval_rewards/chosen": 3.4833199977874756, "eval_rewards/margins": 5.782570838928223, "eval_rewards/rejected": -2.299251079559326, "eval_runtime": 926.5032, "eval_samples_per_second": 3.089, "eval_steps_per_second": 0.193, "step": 6100 }, { "epoch": 2.79, "learning_rate": 2.3338406900050732e-08, "logits/chosen": -0.4381275177001953, "logits/rejected": -0.4650820195674896, "logps/chosen": -118.45979309082031, "logps/rejected": -89.73667907714844, "loss": 0.141, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.4960122108459473, "rewards/margins": 5.491837024688721, "rewards/rejected": -1.9958248138427734, "step": 6110 }, { "epoch": 2.79, "learning_rate": 2.28310502283105e-08, "logits/chosen": -0.43747448921203613, "logits/rejected": -0.45705556869506836, "logps/chosen": -119.84431457519531, "logps/rejected": -100.47638702392578, "loss": 0.1412, "rewards/accuracies": 0.9375, "rewards/chosen": 3.7409744262695312, "rewards/margins": 5.933928489685059, "rewards/rejected": -2.1929538249969482, "step": 6120 }, { "epoch": 2.8, "learning_rate": 2.2323693556570265e-08, "logits/chosen": -0.44301462173461914, "logits/rejected": -0.4626496732234955, "logps/chosen": -112.18086242675781, "logps/rejected": -90.97620391845703, "loss": 0.1081, "rewards/accuracies": 0.9375, "rewards/chosen": 3.0974788665771484, "rewards/margins": 5.079029083251953, "rewards/rejected": -1.9815499782562256, "step": 6130 }, { "epoch": 2.8, "learning_rate": 2.1816336884830032e-08, "logits/chosen": -0.44184190034866333, "logits/rejected": -0.4639458656311035, "logps/chosen": -115.1020736694336, "logps/rejected": -94.1207504272461, "loss": 0.1555, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.652765989303589, "rewards/margins": 5.641629219055176, "rewards/rejected": -1.9888633489608765, "step": 6140 }, { "epoch": 2.81, "learning_rate": 2.13089802130898e-08, "logits/chosen": -0.44147539138793945, "logits/rejected": -0.4748079776763916, "logps/chosen": -123.27156066894531, "logps/rejected": -92.42192077636719, "loss": 0.1192, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.140544891357422, "rewards/margins": 6.6083197593688965, "rewards/rejected": -2.4677751064300537, "step": 6150 }, { "epoch": 2.81, "learning_rate": 2.0801623541349565e-08, "logits/chosen": -0.42793139815330505, "logits/rejected": -0.4529266953468323, "logps/chosen": -119.36943054199219, "logps/rejected": -103.21095275878906, "loss": 0.1234, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.7451796531677246, "rewards/margins": 5.694420337677002, "rewards/rejected": -1.9492400884628296, "step": 6160 }, { "epoch": 2.82, "learning_rate": 2.0294266869609332e-08, "logits/chosen": -0.4539063572883606, "logits/rejected": -0.4759596288204193, "logps/chosen": -118.61543273925781, "logps/rejected": -97.66639709472656, "loss": 0.1363, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.301544189453125, "rewards/margins": 5.550912380218506, "rewards/rejected": -2.24936842918396, "step": 6170 }, { "epoch": 2.82, "learning_rate": 1.97869101978691e-08, "logits/chosen": -0.4578167498111725, "logits/rejected": -0.48147234320640564, "logps/chosen": -123.6511459350586, "logps/rejected": -99.3853759765625, "loss": 0.1132, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.5267632007598877, "rewards/margins": 6.09173059463501, "rewards/rejected": -2.564967155456543, "step": 6180 }, { "epoch": 2.83, "learning_rate": 1.9279553526128868e-08, "logits/chosen": -0.44004470109939575, "logits/rejected": -0.47797951102256775, "logps/chosen": -120.70310974121094, "logps/rejected": -98.59986877441406, "loss": 0.1371, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.0213212966918945, "rewards/margins": 6.670281887054443, "rewards/rejected": -2.648960590362549, "step": 6190 }, { "epoch": 2.83, "learning_rate": 1.8772196854388635e-08, "logits/chosen": -0.46676498651504517, "logits/rejected": -0.4783889651298523, "logps/chosen": -114.30501556396484, "logps/rejected": -95.9433364868164, "loss": 0.1332, "rewards/accuracies": 0.9375, "rewards/chosen": 3.327721118927002, "rewards/margins": 5.742402076721191, "rewards/rejected": -2.414680004119873, "step": 6200 }, { "epoch": 2.83, "eval_logits/chosen": -0.4261917769908905, "eval_logits/rejected": -0.451227068901062, "eval_logps/chosen": -116.21163940429688, "eval_logps/rejected": -95.4748306274414, "eval_loss": 0.14415588974952698, "eval_rewards/accuracies": 0.9329608678817749, "eval_rewards/chosen": 3.4740583896636963, "eval_rewards/margins": 5.7466325759887695, "eval_rewards/rejected": -2.2725744247436523, "eval_runtime": 922.4424, "eval_samples_per_second": 3.103, "eval_steps_per_second": 0.194, "step": 6200 }, { "epoch": 2.83, "learning_rate": 1.82648401826484e-08, "logits/chosen": -0.44827452301979065, "logits/rejected": -0.466301828622818, "logps/chosen": -120.3111801147461, "logps/rejected": -96.71390533447266, "loss": 0.1551, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.323174238204956, "rewards/margins": 5.543818473815918, "rewards/rejected": -2.220643997192383, "step": 6210 }, { "epoch": 2.84, "learning_rate": 1.7757483510908168e-08, "logits/chosen": -0.4289180636405945, "logits/rejected": -0.450812965631485, "logps/chosen": -112.69013977050781, "logps/rejected": -97.18971252441406, "loss": 0.1224, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.9697463512420654, "rewards/margins": 6.175944805145264, "rewards/rejected": -2.2061984539031982, "step": 6220 }, { "epoch": 2.84, "learning_rate": 1.7250126839167935e-08, "logits/chosen": -0.44035395979881287, "logits/rejected": -0.45798999071121216, "logps/chosen": -105.89752197265625, "logps/rejected": -93.09783935546875, "loss": 0.1339, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.038432598114014, "rewards/margins": 5.471198081970215, "rewards/rejected": -1.4327653646469116, "step": 6230 }, { "epoch": 2.85, "learning_rate": 1.67427701674277e-08, "logits/chosen": -0.4607762396335602, "logits/rejected": -0.47269120812416077, "logps/chosen": -118.00733947753906, "logps/rejected": -96.32691955566406, "loss": 0.1193, "rewards/accuracies": 1.0, "rewards/chosen": 3.8974251747131348, "rewards/margins": 6.761053562164307, "rewards/rejected": -2.86362886428833, "step": 6240 }, { "epoch": 2.85, "learning_rate": 1.6235413495687468e-08, "logits/chosen": -0.4478604197502136, "logits/rejected": -0.464190810918808, "logps/chosen": -115.123291015625, "logps/rejected": -98.07180786132812, "loss": 0.126, "rewards/accuracies": 0.9375, "rewards/chosen": 4.360918998718262, "rewards/margins": 5.949367046356201, "rewards/rejected": -1.5884480476379395, "step": 6250 }, { "epoch": 2.86, "learning_rate": 1.5728056823947235e-08, "logits/chosen": -0.44280537962913513, "logits/rejected": -0.46292656660079956, "logps/chosen": -117.03279876708984, "logps/rejected": -98.99063873291016, "loss": 0.1115, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.106674671173096, "rewards/margins": 6.606393337249756, "rewards/rejected": -2.499718427658081, "step": 6260 }, { "epoch": 2.86, "learning_rate": 1.5220700152207e-08, "logits/chosen": -0.4295724332332611, "logits/rejected": -0.4528141915798187, "logps/chosen": -110.50325012207031, "logps/rejected": -98.00163269042969, "loss": 0.1204, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.804098606109619, "rewards/margins": 6.637032508850098, "rewards/rejected": -2.8329339027404785, "step": 6270 }, { "epoch": 2.87, "learning_rate": 1.4713343480466766e-08, "logits/chosen": -0.439584881067276, "logits/rejected": -0.4629645347595215, "logps/chosen": -122.1606216430664, "logps/rejected": -97.83927154541016, "loss": 0.1304, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.3962371349334717, "rewards/margins": 5.390679359436035, "rewards/rejected": -1.9944416284561157, "step": 6280 }, { "epoch": 2.87, "learning_rate": 1.4205986808726533e-08, "logits/chosen": -0.4244330823421478, "logits/rejected": -0.44999808073043823, "logps/chosen": -124.31058502197266, "logps/rejected": -93.81672668457031, "loss": 0.1086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.9259040355682373, "rewards/margins": 6.213841438293457, "rewards/rejected": -2.2879371643066406, "step": 6290 }, { "epoch": 2.88, "learning_rate": 1.36986301369863e-08, "logits/chosen": -0.4378216862678528, "logits/rejected": -0.4623119831085205, "logps/chosen": -112.90274810791016, "logps/rejected": -95.5672607421875, "loss": 0.1454, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.6615824699401855, "rewards/margins": 5.905343055725098, "rewards/rejected": -2.243760824203491, "step": 6300 }, { "epoch": 2.88, "eval_logits/chosen": -0.4354705512523651, "eval_logits/rejected": -0.46043190360069275, "eval_logps/chosen": -116.27782440185547, "eval_logps/rejected": -95.51180267333984, "eval_loss": 0.13972659409046173, "eval_rewards/accuracies": 0.9357541799545288, "eval_rewards/chosen": 3.4409618377685547, "eval_rewards/margins": 5.732030391693115, "eval_rewards/rejected": -2.2910685539245605, "eval_runtime": 909.5558, "eval_samples_per_second": 3.147, "eval_steps_per_second": 0.197, "step": 6300 }, { "epoch": 2.88, "learning_rate": 1.3191273465246066e-08, "logits/chosen": -0.4530218541622162, "logits/rejected": -0.4725785255432129, "logps/chosen": -124.86883544921875, "logps/rejected": -98.3955078125, "loss": 0.1308, "rewards/accuracies": 0.9375, "rewards/chosen": 3.5713772773742676, "rewards/margins": 5.833029270172119, "rewards/rejected": -2.2616517543792725, "step": 6310 }, { "epoch": 2.88, "learning_rate": 1.2683916793505833e-08, "logits/chosen": -0.4388354420661926, "logits/rejected": -0.4666750431060791, "logps/chosen": -123.91282653808594, "logps/rejected": -101.6700668334961, "loss": 0.1498, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.2302632331848145, "rewards/margins": 6.3310418128967285, "rewards/rejected": -2.100778579711914, "step": 6320 }, { "epoch": 2.89, "learning_rate": 1.21765601217656e-08, "logits/chosen": -0.45121073722839355, "logits/rejected": -0.474397748708725, "logps/chosen": -113.64533996582031, "logps/rejected": -96.31367492675781, "loss": 0.1256, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.1905887126922607, "rewards/margins": 5.855005741119385, "rewards/rejected": -2.664416790008545, "step": 6330 }, { "epoch": 2.89, "learning_rate": 1.1669203450025366e-08, "logits/chosen": -0.4414314329624176, "logits/rejected": -0.4682813286781311, "logps/chosen": -118.41851806640625, "logps/rejected": -98.83721160888672, "loss": 0.1285, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.3815665245056152, "rewards/margins": 5.590354919433594, "rewards/rejected": -2.2087883949279785, "step": 6340 }, { "epoch": 2.9, "learning_rate": 1.1161846778285133e-08, "logits/chosen": -0.4640694558620453, "logits/rejected": -0.4832492470741272, "logps/chosen": -118.1152114868164, "logps/rejected": -95.31473541259766, "loss": 0.116, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.576420307159424, "rewards/margins": 5.862317085266113, "rewards/rejected": -2.2858967781066895, "step": 6350 }, { "epoch": 2.9, "learning_rate": 1.06544901065449e-08, "logits/chosen": -0.433235228061676, "logits/rejected": -0.46126851439476013, "logps/chosen": -119.97880554199219, "logps/rejected": -96.08879852294922, "loss": 0.1267, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.7901527881622314, "rewards/margins": 6.419007778167725, "rewards/rejected": -2.628854751586914, "step": 6360 }, { "epoch": 2.91, "learning_rate": 1.0147133434804666e-08, "logits/chosen": -0.4347938001155853, "logits/rejected": -0.46376532316207886, "logps/chosen": -120.90704345703125, "logps/rejected": -96.43917083740234, "loss": 0.1411, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.9222941398620605, "rewards/margins": 6.398364067077637, "rewards/rejected": -2.4760701656341553, "step": 6370 }, { "epoch": 2.91, "learning_rate": 9.639776763064434e-09, "logits/chosen": -0.43002352118492126, "logits/rejected": -0.454629123210907, "logps/chosen": -116.3228988647461, "logps/rejected": -94.8536605834961, "loss": 0.1246, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.674093246459961, "rewards/margins": 6.51773738861084, "rewards/rejected": -2.843644618988037, "step": 6380 }, { "epoch": 2.92, "learning_rate": 9.1324200913242e-09, "logits/chosen": -0.4467340409755707, "logits/rejected": -0.47312459349632263, "logps/chosen": -115.29522705078125, "logps/rejected": -95.09002685546875, "loss": 0.117, "rewards/accuracies": 0.9375, "rewards/chosen": 3.3362815380096436, "rewards/margins": 5.706353187561035, "rewards/rejected": -2.3700718879699707, "step": 6390 }, { "epoch": 2.92, "learning_rate": 8.625063419583967e-09, "logits/chosen": -0.439365953207016, "logits/rejected": -0.46871843934059143, "logps/chosen": -117.05690002441406, "logps/rejected": -96.14810943603516, "loss": 0.1355, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.502894878387451, "rewards/margins": 6.180233955383301, "rewards/rejected": -2.6773390769958496, "step": 6400 }, { "epoch": 2.92, "eval_logits/chosen": -0.4225231111049652, "eval_logits/rejected": -0.44727426767349243, "eval_logps/chosen": -116.41168212890625, "eval_logps/rejected": -95.67748260498047, "eval_loss": 0.1470990926027298, "eval_rewards/accuracies": 0.9329608678817749, "eval_rewards/chosen": 3.3740320205688477, "eval_rewards/margins": 5.747939586639404, "eval_rewards/rejected": -2.3739078044891357, "eval_runtime": 896.7307, "eval_samples_per_second": 3.192, "eval_steps_per_second": 0.2, "step": 6400 }, { "epoch": 2.93, "learning_rate": 8.117706747843734e-09, "logits/chosen": -0.42712831497192383, "logits/rejected": -0.44280409812927246, "logps/chosen": -119.4900894165039, "logps/rejected": -98.96273803710938, "loss": 0.1182, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.9419543743133545, "rewards/margins": 6.928774356842041, "rewards/rejected": -2.9868204593658447, "step": 6410 }, { "epoch": 2.93, "learning_rate": 7.6103500761035e-09, "logits/chosen": -0.41072121262550354, "logits/rejected": -0.43890589475631714, "logps/chosen": -118.20353698730469, "logps/rejected": -94.67088317871094, "loss": 0.1151, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.44974422454834, "rewards/margins": 6.641238212585449, "rewards/rejected": -2.191495180130005, "step": 6420 }, { "epoch": 2.94, "learning_rate": 7.1029934043632664e-09, "logits/chosen": -0.4472702145576477, "logits/rejected": -0.46455731987953186, "logps/chosen": -123.02734375, "logps/rejected": -95.70638275146484, "loss": 0.1486, "rewards/accuracies": 0.9375, "rewards/chosen": 3.530532121658325, "rewards/margins": 6.119112491607666, "rewards/rejected": -2.58858060836792, "step": 6430 }, { "epoch": 2.94, "learning_rate": 6.595636732623033e-09, "logits/chosen": -0.4400475025177002, "logits/rejected": -0.4567417502403259, "logps/chosen": -113.33174896240234, "logps/rejected": -98.16852569580078, "loss": 0.1225, "rewards/accuracies": 0.9375, "rewards/chosen": 3.7850425243377686, "rewards/margins": 5.5529022216796875, "rewards/rejected": -1.767859697341919, "step": 6440 }, { "epoch": 2.94, "learning_rate": 6.0882800608828e-09, "logits/chosen": -0.4608462452888489, "logits/rejected": -0.4785988926887512, "logps/chosen": -119.42830657958984, "logps/rejected": -101.83099365234375, "loss": 0.1226, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.2882943153381348, "rewards/margins": 5.341013431549072, "rewards/rejected": -2.0527195930480957, "step": 6450 }, { "epoch": 2.95, "learning_rate": 5.580923389142566e-09, "logits/chosen": -0.42657288908958435, "logits/rejected": -0.4634782671928406, "logps/chosen": -118.7545166015625, "logps/rejected": -99.45500183105469, "loss": 0.1135, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.7978293895721436, "rewards/margins": 6.2398295402526855, "rewards/rejected": -2.442000150680542, "step": 6460 }, { "epoch": 2.95, "learning_rate": 5.073566717402333e-09, "logits/chosen": -0.4478042721748352, "logits/rejected": -0.4692462980747223, "logps/chosen": -125.03216552734375, "logps/rejected": -98.91600036621094, "loss": 0.1383, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.3301844596862793, "rewards/margins": 5.73842716217041, "rewards/rejected": -2.40824294090271, "step": 6470 }, { "epoch": 2.96, "learning_rate": 4.5662100456621e-09, "logits/chosen": -0.44531145691871643, "logits/rejected": -0.4770316183567047, "logps/chosen": -119.36907958984375, "logps/rejected": -96.36643981933594, "loss": 0.1276, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.4794089794158936, "rewards/margins": 5.956811904907227, "rewards/rejected": -2.477403163909912, "step": 6480 }, { "epoch": 2.96, "learning_rate": 4.058853373921867e-09, "logits/chosen": -0.42967623472213745, "logits/rejected": -0.4564761221408844, "logps/chosen": -125.42822265625, "logps/rejected": -97.00397491455078, "loss": 0.1277, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.159353733062744, "rewards/margins": 6.9187469482421875, "rewards/rejected": -2.7593941688537598, "step": 6490 }, { "epoch": 2.97, "learning_rate": 3.5514967021816332e-09, "logits/chosen": -0.44123202562332153, "logits/rejected": -0.46388721466064453, "logps/chosen": -115.01322937011719, "logps/rejected": -94.85794830322266, "loss": 0.1114, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.9512016773223877, "rewards/margins": 5.423955917358398, "rewards/rejected": -1.4727542400360107, "step": 6500 }, { "epoch": 2.97, "eval_logits/chosen": -0.4345143139362335, "eval_logits/rejected": -0.4595170319080353, "eval_logps/chosen": -116.18891906738281, "eval_logps/rejected": -95.57404327392578, "eval_loss": 0.1396893560886383, "eval_rewards/accuracies": 0.9301676154136658, "eval_rewards/chosen": 3.4854142665863037, "eval_rewards/margins": 5.8075995445251465, "eval_rewards/rejected": -2.3221850395202637, "eval_runtime": 909.3942, "eval_samples_per_second": 3.147, "eval_steps_per_second": 0.197, "step": 6500 }, { "epoch": 2.97, "learning_rate": 3.0441400304414e-09, "logits/chosen": -0.43526285886764526, "logits/rejected": -0.4636686444282532, "logps/chosen": -117.7777099609375, "logps/rejected": -97.91990661621094, "loss": 0.1216, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.961691379547119, "rewards/margins": 5.777933120727539, "rewards/rejected": -1.8162418603897095, "step": 6510 }, { "epoch": 2.98, "learning_rate": 2.5367833587011665e-09, "logits/chosen": -0.43703460693359375, "logits/rejected": -0.4599098563194275, "logps/chosen": -125.13053894042969, "logps/rejected": -97.0875473022461, "loss": 0.1135, "rewards/accuracies": 0.9375, "rewards/chosen": 3.697622299194336, "rewards/margins": 5.988824367523193, "rewards/rejected": -2.2912018299102783, "step": 6520 }, { "epoch": 2.98, "learning_rate": 2.0294266869609335e-09, "logits/chosen": -0.4473207890987396, "logits/rejected": -0.4712890684604645, "logps/chosen": -111.90199279785156, "logps/rejected": -96.57246398925781, "loss": 0.1362, "rewards/accuracies": 0.9375, "rewards/chosen": 3.5259852409362793, "rewards/margins": 5.500835418701172, "rewards/rejected": -1.974850058555603, "step": 6530 }, { "epoch": 2.99, "learning_rate": 1.5220700152207e-09, "logits/chosen": -0.4719008803367615, "logits/rejected": -0.4840930104255676, "logps/chosen": -113.07936096191406, "logps/rejected": -94.11990356445312, "loss": 0.1438, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.121089935302734, "rewards/margins": 5.908658027648926, "rewards/rejected": -1.7875677347183228, "step": 6540 }, { "epoch": 2.99, "learning_rate": 1.0147133434804667e-09, "logits/chosen": -0.43527278304100037, "logits/rejected": -0.4640693664550781, "logps/chosen": -127.6871566772461, "logps/rejected": -93.20718383789062, "loss": 0.143, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.136395454406738, "rewards/margins": 5.834487438201904, "rewards/rejected": -1.6980924606323242, "step": 6550 }, { "epoch": 2.99, "learning_rate": 5.073566717402334e-10, "logits/chosen": -0.4505973756313324, "logits/rejected": -0.46244126558303833, "logps/chosen": -120.34539794921875, "logps/rejected": -96.04461669921875, "loss": 0.1235, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.4686827659606934, "rewards/margins": 5.904242038726807, "rewards/rejected": -2.435559034347534, "step": 6560 }, { "epoch": 3.0, "learning_rate": 0.0, "logits/chosen": -0.44836997985839844, "logits/rejected": -0.47291040420532227, "logps/chosen": -113.19990539550781, "logps/rejected": -96.67423248291016, "loss": 0.1073, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.5085701942443848, "rewards/margins": 5.106671333312988, "rewards/rejected": -2.5981011390686035, "step": 6570 }, { "epoch": 3.0, "step": 6570, "total_flos": 0.0, "train_loss": 0.2317562538376319, "train_runtime": 114179.2227, "train_samples_per_second": 3.684, "train_steps_per_second": 0.058 } ], "logging_steps": 10, "max_steps": 6570, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "trial_name": null, "trial_params": null }