MathOctopus-MAPO-DPO-13B / trainer_state.json
VincentVioletLx
commit from VincentLx
0fe2ce9
raw
history blame contribute delete
No virus
103 kB
{
"best_metric": 0.6881732940673828,
"best_model_checkpoint": "/mnt/data/shesj/Trained/RL4CoT/DPO/Parallel_13B_numglueCorrect_extend_10lang_v3_iter2.json/checkpoint-1000",
"epoch": 0.2508938091952581,
"eval_steps": 100,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1e-08,
"logits/chosen": -1.5961366891860962,
"logits/rejected": -1.4505422115325928,
"logps/chosen": -5.596881866455078,
"logps/rejected": -9.411199569702148,
"loss": 0.6934,
"rewards/accuracies": 0.26249998807907104,
"rewards/chosen": -0.0024300559889525175,
"rewards/margins": -0.003987783100455999,
"rewards/rejected": 0.00155772699508816,
"step": 5
},
{
"epoch": 0.0,
"learning_rate": 2e-08,
"logits/chosen": -1.7575080394744873,
"logits/rejected": -1.5709590911865234,
"logps/chosen": -5.7118635177612305,
"logps/rejected": -8.229207992553711,
"loss": 0.6932,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.005083759315311909,
"rewards/margins": 0.007960619404911995,
"rewards/rejected": -0.0028768605552613735,
"step": 10
},
{
"epoch": 0.0,
"learning_rate": 3e-08,
"logits/chosen": -1.3776355981826782,
"logits/rejected": -1.1297904253005981,
"logps/chosen": -5.6487884521484375,
"logps/rejected": -6.3460259437561035,
"loss": 0.6928,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0028433147817850113,
"rewards/margins": 0.004021753557026386,
"rewards/rejected": -0.0011784390080720186,
"step": 15
},
{
"epoch": 0.01,
"learning_rate": 4e-08,
"logits/chosen": -1.3416035175323486,
"logits/rejected": -1.4808504581451416,
"logps/chosen": -6.05244255065918,
"logps/rejected": -6.185896873474121,
"loss": 0.6939,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.002370675327256322,
"rewards/margins": -0.003694503800943494,
"rewards/rejected": 0.0013238281244412065,
"step": 20
},
{
"epoch": 0.01,
"learning_rate": 5e-08,
"logits/chosen": -1.5403258800506592,
"logits/rejected": -1.6286535263061523,
"logps/chosen": -5.387463569641113,
"logps/rejected": -6.760479927062988,
"loss": 0.6931,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.002958612982183695,
"rewards/margins": -0.0027186197694391012,
"rewards/rejected": -0.0002399933582637459,
"step": 25
},
{
"epoch": 0.01,
"learning_rate": 6e-08,
"logits/chosen": -1.3823367357254028,
"logits/rejected": -1.3627607822418213,
"logps/chosen": -5.9833083152771,
"logps/rejected": -5.850650787353516,
"loss": 0.6936,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.002619728446006775,
"rewards/margins": 0.004939082078635693,
"rewards/rejected": -0.002319354098290205,
"step": 30
},
{
"epoch": 0.01,
"learning_rate": 6.999999999999999e-08,
"logits/chosen": -1.4706413745880127,
"logits/rejected": -1.4250898361206055,
"logps/chosen": -5.871586799621582,
"logps/rejected": -7.329322814941406,
"loss": 0.6927,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0053069936111569405,
"rewards/margins": 0.006646360270678997,
"rewards/rejected": -0.001339366426691413,
"step": 35
},
{
"epoch": 0.01,
"learning_rate": 8e-08,
"logits/chosen": -1.3677151203155518,
"logits/rejected": -1.4711055755615234,
"logps/chosen": -5.220858573913574,
"logps/rejected": -6.019894599914551,
"loss": 0.6922,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.0007630128529854119,
"rewards/margins": 0.0009836136596277356,
"rewards/rejected": -0.0002206008939538151,
"step": 40
},
{
"epoch": 0.01,
"learning_rate": 9e-08,
"logits/chosen": -1.5524793863296509,
"logits/rejected": -1.5295554399490356,
"logps/chosen": -5.439484596252441,
"logps/rejected": -6.122335433959961,
"loss": 0.6937,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.0029087450820952654,
"rewards/margins": -0.0008771896245889366,
"rewards/rejected": -0.002031555399298668,
"step": 45
},
{
"epoch": 0.01,
"learning_rate": 1e-07,
"logits/chosen": -1.5465644598007202,
"logits/rejected": -1.5324052572250366,
"logps/chosen": -5.974602699279785,
"logps/rejected": -6.071439743041992,
"loss": 0.6932,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.00048004291602410376,
"rewards/margins": -0.0023091284092515707,
"rewards/rejected": 0.0027891716454178095,
"step": 50
},
{
"epoch": 0.01,
"learning_rate": 1.1e-07,
"logits/chosen": -1.4663435220718384,
"logits/rejected": -1.4614862203598022,
"logps/chosen": -5.4424333572387695,
"logps/rejected": -6.563809394836426,
"loss": 0.6937,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.0017671022797003388,
"rewards/margins": 0.00013901680358685553,
"rewards/rejected": 0.0016280856216326356,
"step": 55
},
{
"epoch": 0.02,
"learning_rate": 1.2e-07,
"logits/chosen": -1.3572139739990234,
"logits/rejected": -1.5018689632415771,
"logps/chosen": -5.387210845947266,
"logps/rejected": -5.628620147705078,
"loss": 0.6934,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0009455516701564193,
"rewards/margins": -0.0012193130096420646,
"rewards/rejected": 0.0002737622708082199,
"step": 60
},
{
"epoch": 0.02,
"learning_rate": 1.3e-07,
"logits/chosen": -1.4500153064727783,
"logits/rejected": -1.5059070587158203,
"logps/chosen": -5.2743024826049805,
"logps/rejected": -5.707627773284912,
"loss": 0.6932,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -8.441717363893986e-05,
"rewards/margins": -0.0039388397708535194,
"rewards/rejected": 0.0038544225972145796,
"step": 65
},
{
"epoch": 0.02,
"learning_rate": 1.3999999999999998e-07,
"logits/chosen": -1.400269627571106,
"logits/rejected": -1.4793713092803955,
"logps/chosen": -5.212708473205566,
"logps/rejected": -7.475738525390625,
"loss": 0.6938,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.0009079872397705913,
"rewards/margins": -0.004652161151170731,
"rewards/rejected": 0.0037441744934767485,
"step": 70
},
{
"epoch": 0.02,
"learning_rate": 1.5e-07,
"logits/chosen": -1.6910864114761353,
"logits/rejected": -1.6056768894195557,
"logps/chosen": -5.710541725158691,
"logps/rejected": -6.734696388244629,
"loss": 0.6925,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.0002835780323948711,
"rewards/margins": 0.001549507724121213,
"rewards/rejected": -0.0018330859020352364,
"step": 75
},
{
"epoch": 0.02,
"learning_rate": 1.6e-07,
"logits/chosen": -1.348565936088562,
"logits/rejected": -1.3284913301467896,
"logps/chosen": -5.4736833572387695,
"logps/rejected": -6.32749080657959,
"loss": 0.6916,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.004232033155858517,
"rewards/margins": 0.008012665435671806,
"rewards/rejected": -0.003780632745474577,
"step": 80
},
{
"epoch": 0.02,
"learning_rate": 1.7e-07,
"logits/chosen": -1.4623692035675049,
"logits/rejected": -1.3986611366271973,
"logps/chosen": -5.681046485900879,
"logps/rejected": -6.193971157073975,
"loss": 0.6923,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.003290227148681879,
"rewards/margins": 0.00016479431360494345,
"rewards/rejected": -0.003455021884292364,
"step": 85
},
{
"epoch": 0.02,
"learning_rate": 1.8e-07,
"logits/chosen": -1.6442101001739502,
"logits/rejected": -1.7291923761367798,
"logps/chosen": -6.625936985015869,
"logps/rejected": -8.430340766906738,
"loss": 0.6935,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.004968828056007624,
"rewards/margins": 0.004905154462903738,
"rewards/rejected": 6.367354944813997e-05,
"step": 90
},
{
"epoch": 0.02,
"learning_rate": 1.8999999999999998e-07,
"logits/chosen": -1.3575623035430908,
"logits/rejected": -1.3733503818511963,
"logps/chosen": -5.06931209564209,
"logps/rejected": -5.564260005950928,
"loss": 0.6925,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0030343024991452694,
"rewards/margins": 0.004197937436401844,
"rewards/rejected": -0.0011636342387646437,
"step": 95
},
{
"epoch": 0.03,
"learning_rate": 2e-07,
"logits/chosen": -1.4199776649475098,
"logits/rejected": -1.3951759338378906,
"logps/chosen": -4.820796012878418,
"logps/rejected": -5.737803936004639,
"loss": 0.6933,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.0034723032731562853,
"rewards/margins": -0.003412929829210043,
"rewards/rejected": -5.9372840041760355e-05,
"step": 100
},
{
"epoch": 0.03,
"eval_logits/chosen": -3.167680501937866,
"eval_logits/rejected": -3.1470484733581543,
"eval_logps/chosen": -6.138812065124512,
"eval_logps/rejected": -6.94625997543335,
"eval_loss": 0.6930756568908691,
"eval_rewards/accuracies": 0.5112179517745972,
"eval_rewards/chosen": 0.00018124215421266854,
"eval_rewards/margins": 0.00026574215735308826,
"eval_rewards/rejected": -8.450019231531769e-05,
"eval_runtime": 615.2356,
"eval_samples_per_second": 32.401,
"eval_steps_per_second": 0.507,
"step": 100
},
{
"epoch": 0.03,
"learning_rate": 1.9999658256641745e-07,
"logits/chosen": -1.5726451873779297,
"logits/rejected": -1.4517858028411865,
"logps/chosen": -6.400355339050293,
"logps/rejected": -8.746145248413086,
"loss": 0.6923,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.0017850773874670267,
"rewards/margins": 0.003671336220577359,
"rewards/rejected": -0.001886258483864367,
"step": 105
},
{
"epoch": 0.03,
"learning_rate": 1.999863304992469e-07,
"logits/chosen": -1.4806731939315796,
"logits/rejected": -1.4488584995269775,
"logps/chosen": -4.718806266784668,
"logps/rejected": -6.621995449066162,
"loss": 0.6931,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.003079561050981283,
"rewards/margins": -0.0010764991166070104,
"rewards/rejected": -0.002003061817958951,
"step": 110
},
{
"epoch": 0.03,
"learning_rate": 1.9996924449920347e-07,
"logits/chosen": -1.3951839208602905,
"logits/rejected": -1.5014991760253906,
"logps/chosen": -5.849172115325928,
"logps/rejected": -6.110888957977295,
"loss": 0.6916,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.002346306573599577,
"rewards/margins": -0.0002563331217970699,
"rewards/rejected": -0.0020899735391139984,
"step": 115
},
{
"epoch": 0.03,
"learning_rate": 1.999453257340926e-07,
"logits/chosen": -1.709111213684082,
"logits/rejected": -1.6502996683120728,
"logps/chosen": -5.375964164733887,
"logps/rejected": -6.7974419593811035,
"loss": 0.6908,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0006786620942875743,
"rewards/margins": 0.002161815296858549,
"rewards/rejected": -0.0014831533189862967,
"step": 120
},
{
"epoch": 0.03,
"learning_rate": 1.9991457583873009e-07,
"logits/chosen": -1.6699622869491577,
"logits/rejected": -1.6052825450897217,
"logps/chosen": -5.923104286193848,
"logps/rejected": -6.845878601074219,
"loss": 0.6904,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.002157662995159626,
"rewards/margins": 0.0037304076831787825,
"rewards/rejected": -0.0015727445716038346,
"step": 125
},
{
"epoch": 0.03,
"learning_rate": 1.9987699691483047e-07,
"logits/chosen": -1.5257129669189453,
"logits/rejected": -1.5561866760253906,
"logps/chosen": -4.64754581451416,
"logps/rejected": -6.103314399719238,
"loss": 0.69,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.00055070681264624,
"rewards/margins": 0.008159702643752098,
"rewards/rejected": -0.0076089962385594845,
"step": 130
},
{
"epoch": 0.03,
"learning_rate": 1.9983259153086325e-07,
"logits/chosen": -1.5833688974380493,
"logits/rejected": -1.5711325407028198,
"logps/chosen": -5.653990745544434,
"logps/rejected": -7.226287841796875,
"loss": 0.6918,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0021617007441818714,
"rewards/margins": 0.005132616497576237,
"rewards/rejected": -0.007294316776096821,
"step": 135
},
{
"epoch": 0.04,
"learning_rate": 1.9978136272187745e-07,
"logits/chosen": -1.3831666707992554,
"logits/rejected": -1.3798400163650513,
"logps/chosen": -4.3581929206848145,
"logps/rejected": -5.750641345977783,
"loss": 0.6905,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0044810837134718895,
"rewards/margins": 0.002624013228341937,
"rewards/rejected": 0.0018570702522993088,
"step": 140
},
{
"epoch": 0.04,
"learning_rate": 1.997233139892941e-07,
"logits/chosen": -1.4784475564956665,
"logits/rejected": -1.453616976737976,
"logps/chosen": -5.93430757522583,
"logps/rejected": -6.510165214538574,
"loss": 0.6904,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0023500092793256044,
"rewards/margins": 0.007515914738178253,
"rewards/rejected": -0.005165906623005867,
"step": 145
},
{
"epoch": 0.04,
"learning_rate": 1.9965844930066698e-07,
"logits/chosen": -1.4583405256271362,
"logits/rejected": -1.611533761024475,
"logps/chosen": -5.597588539123535,
"logps/rejected": -7.059754848480225,
"loss": 0.6884,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0004460577911231667,
"rewards/margins": 0.008472367189824581,
"rewards/rejected": -0.00891842506825924,
"step": 150
},
{
"epoch": 0.04,
"learning_rate": 1.9958677308941136e-07,
"logits/chosen": -1.590573787689209,
"logits/rejected": -1.5703933238983154,
"logps/chosen": -8.305377006530762,
"logps/rejected": -6.981376647949219,
"loss": 0.6898,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.0003787805908359587,
"rewards/margins": 0.006806156598031521,
"rewards/rejected": -0.006427376065403223,
"step": 155
},
{
"epoch": 0.04,
"learning_rate": 1.9950829025450114e-07,
"logits/chosen": -1.6168692111968994,
"logits/rejected": -1.3941491842269897,
"logps/chosen": -4.724443435668945,
"logps/rejected": -7.058165073394775,
"loss": 0.6897,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.002109301509335637,
"rewards/margins": 0.008472011424601078,
"rewards/rejected": -0.00636270921677351,
"step": 160
},
{
"epoch": 0.04,
"learning_rate": 1.9942300616013377e-07,
"logits/chosen": -1.7095611095428467,
"logits/rejected": -1.7674989700317383,
"logps/chosen": -5.134617328643799,
"logps/rejected": -5.987712383270264,
"loss": 0.6906,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.0022057683672755957,
"rewards/margins": 0.0033194683492183685,
"rewards/rejected": -0.0055252364836633205,
"step": 165
},
{
"epoch": 0.04,
"learning_rate": 1.993309266353638e-07,
"logits/chosen": -1.3613307476043701,
"logits/rejected": -1.4475274085998535,
"logps/chosen": -6.0252180099487305,
"logps/rejected": -6.769097328186035,
"loss": 0.6906,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.00019303560839034617,
"rewards/margins": 0.0034273392520844936,
"rewards/rejected": -0.0036203742492944,
"step": 170
},
{
"epoch": 0.04,
"learning_rate": 1.992320579737045e-07,
"logits/chosen": -1.2607237100601196,
"logits/rejected": -1.2645083665847778,
"logps/chosen": -5.982339859008789,
"logps/rejected": -5.792677879333496,
"loss": 0.687,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.006508027669042349,
"rewards/margins": 0.01678382232785225,
"rewards/rejected": -0.010275794193148613,
"step": 175
},
{
"epoch": 0.05,
"learning_rate": 1.9912640693269751e-07,
"logits/chosen": -1.4913660287857056,
"logits/rejected": -1.467761516571045,
"logps/chosen": -4.889002799987793,
"logps/rejected": -6.567469120025635,
"loss": 0.6895,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.007032574620097876,
"rewards/margins": 0.015939272940158844,
"rewards/rejected": -0.008906697854399681,
"step": 180
},
{
"epoch": 0.05,
"learning_rate": 1.9901398073345117e-07,
"logits/chosen": -1.6264896392822266,
"logits/rejected": -1.787503957748413,
"logps/chosen": -5.5176873207092285,
"logps/rejected": -7.238592624664307,
"loss": 0.6888,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.0024650241248309612,
"rewards/margins": 0.014559340663254261,
"rewards/rejected": -0.012094316072762012,
"step": 185
},
{
"epoch": 0.05,
"learning_rate": 1.9889478706014683e-07,
"logits/chosen": -1.5220094919204712,
"logits/rejected": -1.3731660842895508,
"logps/chosen": -4.48035192489624,
"logps/rejected": -5.973559856414795,
"loss": 0.6892,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.002238192595541477,
"rewards/margins": 0.0097486088052392,
"rewards/rejected": -0.007510416209697723,
"step": 190
},
{
"epoch": 0.05,
"learning_rate": 1.9876883405951376e-07,
"logits/chosen": -1.3608970642089844,
"logits/rejected": -1.2253252267837524,
"logps/chosen": -6.359659194946289,
"logps/rejected": -6.143754005432129,
"loss": 0.6891,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.0014768632827326655,
"rewards/margins": 0.004477277398109436,
"rewards/rejected": -0.0030004139989614487,
"step": 195
},
{
"epoch": 0.05,
"learning_rate": 1.9863613034027222e-07,
"logits/chosen": -1.3653035163879395,
"logits/rejected": -1.3042250871658325,
"logps/chosen": -5.9716901779174805,
"logps/rejected": -6.985511779785156,
"loss": 0.6853,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.003856885712593794,
"rewards/margins": 0.011315730400383472,
"rewards/rejected": -0.0074588460847735405,
"step": 200
},
{
"epoch": 0.05,
"eval_logits/chosen": -3.174004077911377,
"eval_logits/rejected": -3.153486490249634,
"eval_logps/chosen": -6.161444664001465,
"eval_logps/rejected": -6.977956771850586,
"eval_loss": 0.6930607557296753,
"eval_rewards/accuracies": 0.5140224099159241,
"eval_rewards/chosen": -0.0020819876808673143,
"eval_rewards/margins": 0.0011722741182893515,
"eval_rewards/rejected": -0.0032542620319873095,
"eval_runtime": 618.1553,
"eval_samples_per_second": 32.248,
"eval_steps_per_second": 0.505,
"step": 200
},
{
"epoch": 0.05,
"learning_rate": 1.9849668497254518e-07,
"logits/chosen": -1.366420030593872,
"logits/rejected": -1.418501377105713,
"logps/chosen": -4.841452121734619,
"logps/rejected": -7.0436296463012695,
"loss": 0.6882,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.0067129782401025295,
"rewards/margins": 0.013699628412723541,
"rewards/rejected": -0.006986652500927448,
"step": 205
},
{
"epoch": 0.05,
"learning_rate": 1.9835050748723822e-07,
"logits/chosen": -1.5236493349075317,
"logits/rejected": -1.4237031936645508,
"logps/chosen": -5.028156280517578,
"logps/rejected": -6.966952323913574,
"loss": 0.6862,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.004889342468231916,
"rewards/margins": 0.01445357222110033,
"rewards/rejected": -0.009564228355884552,
"step": 210
},
{
"epoch": 0.05,
"learning_rate": 1.9819760787538837e-07,
"logits/chosen": -1.7881724834442139,
"logits/rejected": -1.702300786972046,
"logps/chosen": -5.008362293243408,
"logps/rejected": -7.346070766448975,
"loss": 0.6862,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.003184701083227992,
"rewards/margins": 0.009327715262770653,
"rewards/rejected": -0.012512415647506714,
"step": 215
},
{
"epoch": 0.06,
"learning_rate": 1.9803799658748093e-07,
"logits/chosen": -1.7177143096923828,
"logits/rejected": -1.579524278640747,
"logps/chosen": -5.551783561706543,
"logps/rejected": -8.588147163391113,
"loss": 0.6859,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.00936056487262249,
"rewards/margins": -0.0009745029965415597,
"rewards/rejected": -0.00838606245815754,
"step": 220
},
{
"epoch": 0.06,
"learning_rate": 1.9787168453273545e-07,
"logits/chosen": -1.4372203350067139,
"logits/rejected": -1.4148085117340088,
"logps/chosen": -5.2672553062438965,
"logps/rejected": -5.7078680992126465,
"loss": 0.685,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.004485909827053547,
"rewards/margins": 0.01839793473482132,
"rewards/rejected": -0.013912022113800049,
"step": 225
},
{
"epoch": 0.06,
"learning_rate": 1.9769868307835993e-07,
"logits/chosen": -1.6145379543304443,
"logits/rejected": -1.536407709121704,
"logps/chosen": -5.295413970947266,
"logps/rejected": -7.089319705963135,
"loss": 0.6852,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.00589085603132844,
"rewards/margins": 0.01651725545525551,
"rewards/rejected": -0.010626398026943207,
"step": 230
},
{
"epoch": 0.06,
"learning_rate": 1.9751900404877398e-07,
"logits/chosen": -1.5907440185546875,
"logits/rejected": -1.5271718502044678,
"logps/chosen": -4.938467979431152,
"logps/rejected": -5.892498970031738,
"loss": 0.6846,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0029959643725305796,
"rewards/margins": 0.021143438294529915,
"rewards/rejected": -0.01814747229218483,
"step": 235
},
{
"epoch": 0.06,
"learning_rate": 1.9733265972480058e-07,
"logits/chosen": -1.6651685237884521,
"logits/rejected": -1.5356318950653076,
"logps/chosen": -7.418261528015137,
"logps/rejected": -8.457880973815918,
"loss": 0.6852,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.0033951636869460344,
"rewards/margins": 0.026372741907835007,
"rewards/rejected": -0.02297757938504219,
"step": 240
},
{
"epoch": 0.06,
"learning_rate": 1.9713966284282674e-07,
"logits/chosen": -1.4138095378875732,
"logits/rejected": -1.3163411617279053,
"logps/chosen": -6.462838172912598,
"logps/rejected": -7.8138837814331055,
"loss": 0.6839,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.007619897834956646,
"rewards/margins": 0.0050119319930672646,
"rewards/rejected": -0.012631828896701336,
"step": 245
},
{
"epoch": 0.06,
"learning_rate": 1.9694002659393302e-07,
"logits/chosen": -1.5196382999420166,
"logits/rejected": -1.477994680404663,
"logps/chosen": -5.463156700134277,
"logps/rejected": -6.823214530944824,
"loss": 0.6831,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.0001128343865275383,
"rewards/margins": 0.022026551887392998,
"rewards/rejected": -0.02213938534259796,
"step": 250
},
{
"epoch": 0.06,
"learning_rate": 1.9673376462299182e-07,
"logits/chosen": -1.522629976272583,
"logits/rejected": -1.4759032726287842,
"logps/chosen": -6.2718963623046875,
"logps/rejected": -7.001539707183838,
"loss": 0.6825,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.00030401311232708395,
"rewards/margins": 0.021813327446579933,
"rewards/rejected": -0.021509312093257904,
"step": 255
},
{
"epoch": 0.07,
"learning_rate": 1.9652089102773487e-07,
"logits/chosen": -1.815498948097229,
"logits/rejected": -1.9041208028793335,
"logps/chosen": -4.9399566650390625,
"logps/rejected": -6.410666465759277,
"loss": 0.6844,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.0037350181955844164,
"rewards/margins": 0.023170799016952515,
"rewards/rejected": -0.01943577639758587,
"step": 260
},
{
"epoch": 0.07,
"learning_rate": 1.963014203577896e-07,
"logits/chosen": -1.5250468254089355,
"logits/rejected": -1.5195215940475464,
"logps/chosen": -4.857203483581543,
"logps/rejected": -6.858325958251953,
"loss": 0.6829,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.007676561363041401,
"rewards/margins": 0.008902650326490402,
"rewards/rejected": -0.016579212620854378,
"step": 265
},
{
"epoch": 0.07,
"learning_rate": 1.9607536761368482e-07,
"logits/chosen": -1.398611068725586,
"logits/rejected": -1.4291143417358398,
"logps/chosen": -5.944151878356934,
"logps/rejected": -6.5233049392700195,
"loss": 0.682,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.004370951093733311,
"rewards/margins": 0.02115224488079548,
"rewards/rejected": -0.016781292855739594,
"step": 270
},
{
"epoch": 0.07,
"learning_rate": 1.9584274824582527e-07,
"logits/chosen": -1.3120192289352417,
"logits/rejected": -1.3574376106262207,
"logps/chosen": -4.558770179748535,
"logps/rejected": -5.6866583824157715,
"loss": 0.6821,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.004323200322687626,
"rewards/margins": 0.01929726079106331,
"rewards/rejected": -0.014974060468375683,
"step": 275
},
{
"epoch": 0.07,
"learning_rate": 1.9560357815343574e-07,
"logits/chosen": -1.5410289764404297,
"logits/rejected": -1.5422722101211548,
"logps/chosen": -7.481714725494385,
"logps/rejected": -6.8584442138671875,
"loss": 0.6795,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.004259726498275995,
"rewards/margins": 0.026613563299179077,
"rewards/rejected": -0.02235383726656437,
"step": 280
},
{
"epoch": 0.07,
"learning_rate": 1.9535787368347442e-07,
"logits/chosen": -1.6866031885147095,
"logits/rejected": -1.573209524154663,
"logps/chosen": -6.985440731048584,
"logps/rejected": -6.92074728012085,
"loss": 0.6813,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.003224983811378479,
"rewards/margins": 0.03093186393380165,
"rewards/rejected": -0.03415685147047043,
"step": 285
},
{
"epoch": 0.07,
"learning_rate": 1.9510565162951537e-07,
"logits/chosen": -1.5131175518035889,
"logits/rejected": -1.5239301919937134,
"logps/chosen": -5.186364650726318,
"logps/rejected": -6.2596564292907715,
"loss": 0.6808,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.012961247935891151,
"rewards/margins": 0.05755941942334175,
"rewards/rejected": -0.04459817335009575,
"step": 290
},
{
"epoch": 0.07,
"learning_rate": 1.9484692923060094e-07,
"logits/chosen": -1.540378212928772,
"logits/rejected": -1.5740854740142822,
"logps/chosen": -6.078642845153809,
"logps/rejected": -6.573060512542725,
"loss": 0.6809,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.011097406968474388,
"rewards/margins": 0.030860627070069313,
"rewards/rejected": -0.041958026587963104,
"step": 295
},
{
"epoch": 0.08,
"learning_rate": 1.9458172417006346e-07,
"logits/chosen": -1.5138806104660034,
"logits/rejected": -1.3892674446105957,
"logps/chosen": -5.267967224121094,
"logps/rejected": -6.3208441734313965,
"loss": 0.6801,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.0035496647469699383,
"rewards/margins": 0.038710370659828186,
"rewards/rejected": -0.03516070172190666,
"step": 300
},
{
"epoch": 0.08,
"eval_logits/chosen": -3.184242010116577,
"eval_logits/rejected": -3.1636037826538086,
"eval_logps/chosen": -6.222862720489502,
"eval_logps/rejected": -7.056950569152832,
"eval_loss": 0.6925042867660522,
"eval_rewards/accuracies": 0.5252403616905212,
"eval_rewards/chosen": -0.008223854936659336,
"eval_rewards/margins": 0.0029297315049916506,
"eval_rewards/rejected": -0.011153585277497768,
"eval_runtime": 620.1964,
"eval_samples_per_second": 32.141,
"eval_steps_per_second": 0.503,
"step": 300
},
{
"epoch": 0.08,
"learning_rate": 1.943100545743165e-07,
"logits/chosen": -1.6512733697891235,
"logits/rejected": -1.721056342124939,
"logps/chosen": -5.9783034324646,
"logps/rejected": -5.989422798156738,
"loss": 0.6787,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.009316826239228249,
"rewards/margins": 0.029956454411149025,
"rewards/rejected": -0.039273280650377274,
"step": 305
},
{
"epoch": 0.08,
"learning_rate": 1.9403193901161612e-07,
"logits/chosen": -1.4372011423110962,
"logits/rejected": -1.2608239650726318,
"logps/chosen": -5.216915130615234,
"logps/rejected": -7.368325233459473,
"loss": 0.6783,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.00843791477382183,
"rewards/margins": 0.019914906471967697,
"rewards/rejected": -0.028352823108434677,
"step": 310
},
{
"epoch": 0.08,
"learning_rate": 1.9374739649079154e-07,
"logits/chosen": -1.4851138591766357,
"logits/rejected": -1.5565675497055054,
"logps/chosen": -6.530230522155762,
"logps/rejected": -5.977784156799316,
"loss": 0.6795,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.005143959075212479,
"rewards/margins": 0.03382311016321182,
"rewards/rejected": -0.028679147362709045,
"step": 315
},
{
"epoch": 0.08,
"learning_rate": 1.9345644645994608e-07,
"logits/chosen": -1.7324178218841553,
"logits/rejected": -1.6093097925186157,
"logps/chosen": -5.668793201446533,
"logps/rejected": -8.58001708984375,
"loss": 0.6813,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.007929663173854351,
"rewards/margins": 0.016542982310056686,
"rewards/rejected": -0.02447264827787876,
"step": 320
},
{
"epoch": 0.08,
"learning_rate": 1.9315910880512788e-07,
"logits/chosen": -1.664690613746643,
"logits/rejected": -1.6959164142608643,
"logps/chosen": -4.798338413238525,
"logps/rejected": -6.650571346282959,
"loss": 0.6793,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.002946319757029414,
"rewards/margins": 0.03725341707468033,
"rewards/rejected": -0.04019974544644356,
"step": 325
},
{
"epoch": 0.08,
"learning_rate": 1.928554038489707e-07,
"logits/chosen": -1.6366933584213257,
"logits/rejected": -1.6312482357025146,
"logps/chosen": -5.757204532623291,
"logps/rejected": -6.9877214431762695,
"loss": 0.6771,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.004410495515912771,
"rewards/margins": 0.03300872817635536,
"rewards/rejected": -0.03741922602057457,
"step": 330
},
{
"epoch": 0.08,
"learning_rate": 1.9254535234930483e-07,
"logits/chosen": -1.383345365524292,
"logits/rejected": -1.4103121757507324,
"logps/chosen": -7.0129594802856445,
"logps/rejected": -6.204277038574219,
"loss": 0.6795,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0018229834968224168,
"rewards/margins": 0.018833067268133163,
"rewards/rejected": -0.020656052976846695,
"step": 335
},
{
"epoch": 0.09,
"learning_rate": 1.9222897549773846e-07,
"logits/chosen": -1.3323512077331543,
"logits/rejected": -1.3201320171356201,
"logps/chosen": -5.118114471435547,
"logps/rejected": -7.212060451507568,
"loss": 0.6776,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.009920697659254074,
"rewards/margins": 0.029569601640105247,
"rewards/rejected": -0.03949030116200447,
"step": 340
},
{
"epoch": 0.09,
"learning_rate": 1.9190629491820908e-07,
"logits/chosen": -1.4638566970825195,
"logits/rejected": -1.4451267719268799,
"logps/chosen": -5.2605671882629395,
"logps/rejected": -6.085940837860107,
"loss": 0.6739,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.005447469651699066,
"rewards/margins": 0.034440845251083374,
"rewards/rejected": -0.03988831490278244,
"step": 345
},
{
"epoch": 0.09,
"learning_rate": 1.9157733266550572e-07,
"logits/chosen": -1.5962337255477905,
"logits/rejected": -1.6220163106918335,
"logps/chosen": -6.018959045410156,
"logps/rejected": -6.889263153076172,
"loss": 0.674,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.0057447971776127815,
"rewards/margins": 0.07764319330453873,
"rewards/rejected": -0.08338797837495804,
"step": 350
},
{
"epoch": 0.09,
"learning_rate": 1.9124211122376135e-07,
"logits/chosen": -1.4028089046478271,
"logits/rejected": -1.3758150339126587,
"logps/chosen": -5.279055595397949,
"logps/rejected": -8.15546989440918,
"loss": 0.6746,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.004057350568473339,
"rewards/margins": 0.041755061596632004,
"rewards/rejected": -0.03769771382212639,
"step": 355
},
{
"epoch": 0.09,
"learning_rate": 1.9090065350491624e-07,
"logits/chosen": -1.466732144355774,
"logits/rejected": -1.505338430404663,
"logps/chosen": -6.279904365539551,
"logps/rejected": -6.962622165679932,
"loss": 0.6722,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.0070061227306723595,
"rewards/margins": 0.04479537159204483,
"rewards/rejected": -0.037789251655340195,
"step": 360
},
{
"epoch": 0.09,
"learning_rate": 1.905529828471519e-07,
"logits/chosen": -1.5488044023513794,
"logits/rejected": -1.4669849872589111,
"logps/chosen": -6.46566104888916,
"logps/rejected": -6.615389823913574,
"loss": 0.6752,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.009375613182783127,
"rewards/margins": 0.0315098911523819,
"rewards/rejected": -0.04088550806045532,
"step": 365
},
{
"epoch": 0.09,
"learning_rate": 1.901991230132959e-07,
"logits/chosen": -1.5748860836029053,
"logits/rejected": -1.5498300790786743,
"logps/chosen": -5.943437099456787,
"logps/rejected": -7.429045677185059,
"loss": 0.6752,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.012559416703879833,
"rewards/margins": 0.03320794180035591,
"rewards/rejected": -0.04576735943555832,
"step": 370
},
{
"epoch": 0.09,
"learning_rate": 1.8983909818919788e-07,
"logits/chosen": -1.4835641384124756,
"logits/rejected": -1.4734997749328613,
"logps/chosen": -5.776877403259277,
"logps/rejected": -6.821134090423584,
"loss": 0.6746,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 8.074291690718383e-05,
"rewards/margins": 0.041906945407390594,
"rewards/rejected": -0.04182619974017143,
"step": 375
},
{
"epoch": 0.1,
"learning_rate": 1.8947293298207635e-07,
"logits/chosen": -1.590511441230774,
"logits/rejected": -1.5636823177337646,
"logps/chosen": -6.252840042114258,
"logps/rejected": -7.568482398986816,
"loss": 0.6773,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.009611086919903755,
"rewards/margins": 0.04731304943561554,
"rewards/rejected": -0.056924134492874146,
"step": 380
},
{
"epoch": 0.1,
"learning_rate": 1.8910065241883678e-07,
"logits/chosen": -1.7463115453720093,
"logits/rejected": -1.552099347114563,
"logps/chosen": -5.755384922027588,
"logps/rejected": -9.108583450317383,
"loss": 0.6762,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.014299096539616585,
"rewards/margins": 0.04487228766083717,
"rewards/rejected": -0.05917138606309891,
"step": 385
},
{
"epoch": 0.1,
"learning_rate": 1.8872228194436116e-07,
"logits/chosen": -1.4452903270721436,
"logits/rejected": -1.4187732934951782,
"logps/chosen": -5.313692092895508,
"logps/rejected": -6.416788578033447,
"loss": 0.6713,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.017885476350784302,
"rewards/margins": 0.03439543396234512,
"rewards/rejected": -0.052280914038419724,
"step": 390
},
{
"epoch": 0.1,
"learning_rate": 1.8833784741976886e-07,
"logits/chosen": -1.5387766361236572,
"logits/rejected": -1.5287269353866577,
"logps/chosen": -5.880291938781738,
"logps/rejected": -6.970144748687744,
"loss": 0.6682,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.018293332308530807,
"rewards/margins": 0.044221844524145126,
"rewards/rejected": -0.06251517683267593,
"step": 395
},
{
"epoch": 0.1,
"learning_rate": 1.8794737512064888e-07,
"logits/chosen": -1.640610933303833,
"logits/rejected": -1.539394736289978,
"logps/chosen": -5.029040336608887,
"logps/rejected": -6.50949764251709,
"loss": 0.6676,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.008673004806041718,
"rewards/margins": 0.04148182272911072,
"rewards/rejected": -0.050154827535152435,
"step": 400
},
{
"epoch": 0.1,
"eval_logits/chosen": -3.1871755123138428,
"eval_logits/rejected": -3.1664505004882812,
"eval_logps/chosen": -6.327807903289795,
"eval_logps/rejected": -7.203298091888428,
"eval_loss": 0.6918210983276367,
"eval_rewards/accuracies": 0.5324519276618958,
"eval_rewards/chosen": -0.018718333914875984,
"eval_rewards/margins": 0.007069970481097698,
"eval_rewards/rejected": -0.025788303464651108,
"eval_runtime": 624.3628,
"eval_samples_per_second": 31.927,
"eval_steps_per_second": 0.5,
"step": 400
},
{
"epoch": 0.1,
"learning_rate": 1.875508917352643e-07,
"logits/chosen": -1.3193857669830322,
"logits/rejected": -1.3139396905899048,
"logps/chosen": -5.116377830505371,
"logps/rejected": -7.48989200592041,
"loss": 0.6702,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.00014799665950704366,
"rewards/margins": 0.06528617441654205,
"rewards/rejected": -0.06513817608356476,
"step": 405
},
{
"epoch": 0.1,
"learning_rate": 1.871484243627277e-07,
"logits/chosen": -1.6524467468261719,
"logits/rejected": -1.6226232051849365,
"logps/chosen": -5.752681732177734,
"logps/rejected": -7.157814979553223,
"loss": 0.6721,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.02660415694117546,
"rewards/margins": 0.04491695761680603,
"rewards/rejected": -0.07152111828327179,
"step": 410
},
{
"epoch": 0.1,
"learning_rate": 1.867400005111495e-07,
"logits/chosen": -1.4393689632415771,
"logits/rejected": -1.369827151298523,
"logps/chosen": -6.169909477233887,
"logps/rejected": -7.438270568847656,
"loss": 0.6732,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.005500434432178736,
"rewards/margins": 0.06915347278118134,
"rewards/rejected": -0.07465390115976334,
"step": 415
},
{
"epoch": 0.11,
"learning_rate": 1.8632564809575738e-07,
"logits/chosen": -1.622018814086914,
"logits/rejected": -1.367488145828247,
"logps/chosen": -5.794252872467041,
"logps/rejected": -7.3042449951171875,
"loss": 0.6702,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.016323495656251907,
"rewards/margins": 0.07359044253826141,
"rewards/rejected": -0.057266950607299805,
"step": 420
},
{
"epoch": 0.11,
"learning_rate": 1.859053954369885e-07,
"logits/chosen": -1.5450842380523682,
"logits/rejected": -1.5149590969085693,
"logps/chosen": -4.754348278045654,
"logps/rejected": -6.53006649017334,
"loss": 0.6752,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.005179319530725479,
"rewards/margins": 0.04943736642599106,
"rewards/rejected": -0.05461668223142624,
"step": 425
},
{
"epoch": 0.11,
"learning_rate": 1.854792712585539e-07,
"logits/chosen": -1.6106449365615845,
"logits/rejected": -1.472044587135315,
"logps/chosen": -5.688014030456543,
"logps/rejected": -7.954685211181641,
"loss": 0.6644,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.01195032149553299,
"rewards/margins": 0.08017265051603317,
"rewards/rejected": -0.09212296456098557,
"step": 430
},
{
"epoch": 0.11,
"learning_rate": 1.8504730468547506e-07,
"logits/chosen": -1.7078673839569092,
"logits/rejected": -1.6030772924423218,
"logps/chosen": -5.776875972747803,
"logps/rejected": -8.993513107299805,
"loss": 0.6652,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.03464338928461075,
"rewards/margins": 0.04098203405737877,
"rewards/rejected": -0.07562542706727982,
"step": 435
},
{
"epoch": 0.11,
"learning_rate": 1.846095252420935e-07,
"logits/chosen": -1.5690264701843262,
"logits/rejected": -1.466038465499878,
"logps/chosen": -5.061371326446533,
"logps/rejected": -7.400214195251465,
"loss": 0.6721,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.0034414425026625395,
"rewards/margins": 0.06261752545833588,
"rewards/rejected": -0.06605897843837738,
"step": 440
},
{
"epoch": 0.11,
"learning_rate": 1.841659628500527e-07,
"logits/chosen": -1.5850152969360352,
"logits/rejected": -1.626733422279358,
"logps/chosen": -5.730135440826416,
"logps/rejected": -7.101342678070068,
"loss": 0.6687,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0294067170470953,
"rewards/margins": 0.06425820291042328,
"rewards/rejected": -0.09366491436958313,
"step": 445
},
{
"epoch": 0.11,
"learning_rate": 1.8371664782625284e-07,
"logits/chosen": -1.4130651950836182,
"logits/rejected": -1.4390531778335571,
"logps/chosen": -6.015176296234131,
"logps/rejected": -6.557525634765625,
"loss": 0.6697,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.019521493464708328,
"rewards/margins": 0.048861414194107056,
"rewards/rejected": -0.06838290393352509,
"step": 450
},
{
"epoch": 0.11,
"learning_rate": 1.8326161088077904e-07,
"logits/chosen": -1.5105092525482178,
"logits/rejected": -1.5329506397247314,
"logps/chosen": -5.465790748596191,
"logps/rejected": -6.999060153961182,
"loss": 0.6671,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.01001508068293333,
"rewards/margins": 0.05736090615391731,
"rewards/rejected": -0.06737598031759262,
"step": 455
},
{
"epoch": 0.12,
"learning_rate": 1.82800883114802e-07,
"logits/chosen": -1.3845876455307007,
"logits/rejected": -1.4282658100128174,
"logps/chosen": -6.4336256980896,
"logps/rejected": -6.6461639404296875,
"loss": 0.6661,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.023983094841241837,
"rewards/margins": 0.0474400632083416,
"rewards/rejected": -0.07142315804958344,
"step": 460
},
{
"epoch": 0.12,
"learning_rate": 1.8233449601845256e-07,
"logits/chosen": -1.5965187549591064,
"logits/rejected": -1.6954765319824219,
"logps/chosen": -7.093475341796875,
"logps/rejected": -8.442846298217773,
"loss": 0.662,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0007729934295639396,
"rewards/margins": 0.09539145231246948,
"rewards/rejected": -0.09616444259881973,
"step": 465
},
{
"epoch": 0.12,
"learning_rate": 1.8186248146866925e-07,
"logits/chosen": -1.4333058595657349,
"logits/rejected": -1.3761308193206787,
"logps/chosen": -6.583089351654053,
"logps/rejected": -8.258574485778809,
"loss": 0.6664,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.0373983308672905,
"rewards/margins": 0.028671253472566605,
"rewards/rejected": -0.0660695806145668,
"step": 470
},
{
"epoch": 0.12,
"learning_rate": 1.8138487172701948e-07,
"logits/chosen": -1.4833250045776367,
"logits/rejected": -1.5289559364318848,
"logps/chosen": -5.485930442810059,
"logps/rejected": -6.312931060791016,
"loss": 0.6665,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.00883159227669239,
"rewards/margins": 0.05059976503252983,
"rewards/rejected": -0.059431351721286774,
"step": 475
},
{
"epoch": 0.12,
"learning_rate": 1.8090169943749475e-07,
"logits/chosen": -1.3064377307891846,
"logits/rejected": -1.246734619140625,
"logps/chosen": -4.533870220184326,
"logps/rejected": -6.509539604187012,
"loss": 0.6642,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.005517001263797283,
"rewards/margins": 0.06515659391880035,
"rewards/rejected": -0.07067359238862991,
"step": 480
},
{
"epoch": 0.12,
"learning_rate": 1.8041299762427914e-07,
"logits/chosen": -1.605373740196228,
"logits/rejected": -1.5201643705368042,
"logps/chosen": -6.204909801483154,
"logps/rejected": -8.210671424865723,
"loss": 0.6623,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.024663742631673813,
"rewards/margins": 0.06276446580886841,
"rewards/rejected": -0.08742821216583252,
"step": 485
},
{
"epoch": 0.12,
"learning_rate": 1.7991879968949247e-07,
"logits/chosen": -1.2218480110168457,
"logits/rejected": -1.1649186611175537,
"logps/chosen": -4.534018516540527,
"logps/rejected": -5.6343183517456055,
"loss": 0.6628,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.007990372367203236,
"rewards/margins": 0.07446859031915665,
"rewards/rejected": -0.0824589654803276,
"step": 490
},
{
"epoch": 0.12,
"learning_rate": 1.794191394109071e-07,
"logits/chosen": -1.4611269235610962,
"logits/rejected": -1.4972448348999023,
"logps/chosen": -6.087195873260498,
"logps/rejected": -7.726794242858887,
"loss": 0.6659,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.02641758695244789,
"rewards/margins": 0.03801130875945091,
"rewards/rejected": -0.0644288882613182,
"step": 495
},
{
"epoch": 0.13,
"learning_rate": 1.7891405093963936e-07,
"logits/chosen": -1.6527025699615479,
"logits/rejected": -1.5353628396987915,
"logps/chosen": -4.67386531829834,
"logps/rejected": -6.836350917816162,
"loss": 0.6582,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.008362488821148872,
"rewards/margins": 0.07853694260120392,
"rewards/rejected": -0.08689942955970764,
"step": 500
},
{
"epoch": 0.13,
"eval_logits/chosen": -3.196692705154419,
"eval_logits/rejected": -3.1757330894470215,
"eval_logps/chosen": -6.479369640350342,
"eval_logps/rejected": -7.3778076171875,
"eval_loss": 0.691510021686554,
"eval_rewards/accuracies": 0.5356570482254028,
"eval_rewards/chosen": -0.03387455642223358,
"eval_rewards/margins": 0.0093647176399827,
"eval_rewards/rejected": -0.043239280581474304,
"eval_runtime": 630.1142,
"eval_samples_per_second": 31.636,
"eval_steps_per_second": 0.495,
"step": 500
},
{
"epoch": 0.13,
"learning_rate": 1.7840356879781529e-07,
"logits/chosen": -1.5877869129180908,
"logits/rejected": -1.5104506015777588,
"logps/chosen": -6.052631378173828,
"logps/rejected": -9.69563102722168,
"loss": 0.6607,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.04113469645380974,
"rewards/margins": 0.08254175633192062,
"rewards/rejected": -0.12367645651102066,
"step": 505
},
{
"epoch": 0.13,
"learning_rate": 1.7788772787621125e-07,
"logits/chosen": -1.5799081325531006,
"logits/rejected": -1.6663821935653687,
"logps/chosen": -6.070633888244629,
"logps/rejected": -6.397655487060547,
"loss": 0.6615,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.03644927218556404,
"rewards/margins": 0.06373059004545212,
"rewards/rejected": -0.10017986595630646,
"step": 510
},
{
"epoch": 0.13,
"learning_rate": 1.7736656343186894e-07,
"logits/chosen": -1.5198477506637573,
"logits/rejected": -1.5320686101913452,
"logps/chosen": -4.8720879554748535,
"logps/rejected": -6.920307159423828,
"loss": 0.6546,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.011885623447597027,
"rewards/margins": 0.12990930676460266,
"rewards/rejected": -0.11802370846271515,
"step": 515
},
{
"epoch": 0.13,
"learning_rate": 1.768401110856859e-07,
"logits/chosen": -1.4357372522354126,
"logits/rejected": -1.3116753101348877,
"logps/chosen": -5.608946323394775,
"logps/rejected": -7.910216331481934,
"loss": 0.6588,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.03163328021764755,
"rewards/margins": 0.06858251243829727,
"rewards/rejected": -0.10021580755710602,
"step": 520
},
{
"epoch": 0.13,
"learning_rate": 1.7630840681998066e-07,
"logits/chosen": -1.5340244770050049,
"logits/rejected": -1.417262315750122,
"logps/chosen": -5.505238056182861,
"logps/rejected": -9.424348831176758,
"loss": 0.657,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.043297283351421356,
"rewards/margins": 0.06860554218292236,
"rewards/rejected": -0.11190283298492432,
"step": 525
},
{
"epoch": 0.13,
"learning_rate": 1.7577148697603348e-07,
"logits/chosen": -1.7233011722564697,
"logits/rejected": -1.6292282342910767,
"logps/chosen": -5.7183942794799805,
"logps/rejected": -6.829588890075684,
"loss": 0.6596,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.0456002876162529,
"rewards/margins": 0.06835009157657623,
"rewards/rejected": -0.11395038664340973,
"step": 530
},
{
"epoch": 0.13,
"learning_rate": 1.7522938825160247e-07,
"logits/chosen": -1.3610880374908447,
"logits/rejected": -1.3855262994766235,
"logps/chosen": -5.5726470947265625,
"logps/rejected": -7.635517120361328,
"loss": 0.6577,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.03400539606809616,
"rewards/margins": 0.03809204697608948,
"rewards/rejected": -0.07209744304418564,
"step": 535
},
{
"epoch": 0.14,
"learning_rate": 1.7468214769841538e-07,
"logits/chosen": -1.5978724956512451,
"logits/rejected": -1.4684964418411255,
"logps/chosen": -5.9864959716796875,
"logps/rejected": -7.737608432769775,
"loss": 0.6572,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.044120945036411285,
"rewards/margins": 0.06804494559764862,
"rewards/rejected": -0.11216588318347931,
"step": 540
},
{
"epoch": 0.14,
"learning_rate": 1.7412980271963708e-07,
"logits/chosen": -1.567631721496582,
"logits/rejected": -1.54317045211792,
"logps/chosen": -6.259519100189209,
"logps/rejected": -7.840844631195068,
"loss": 0.6614,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.006099678575992584,
"rewards/margins": 0.1069895401597023,
"rewards/rejected": -0.11308921873569489,
"step": 545
},
{
"epoch": 0.14,
"learning_rate": 1.7357239106731316e-07,
"logits/chosen": -1.6185524463653564,
"logits/rejected": -1.6444612741470337,
"logps/chosen": -7.109327793121338,
"logps/rejected": -8.976202964782715,
"loss": 0.6555,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.04382595047354698,
"rewards/margins": 0.09603992104530334,
"rewards/rejected": -0.13986587524414062,
"step": 550
},
{
"epoch": 0.14,
"learning_rate": 1.7300995083978961e-07,
"logits/chosen": -1.7173988819122314,
"logits/rejected": -1.5636038780212402,
"logps/chosen": -5.074532508850098,
"logps/rejected": -7.6101531982421875,
"loss": 0.6536,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.04033565893769264,
"rewards/margins": 0.08647169172763824,
"rewards/rejected": -0.1268073469400406,
"step": 555
},
{
"epoch": 0.14,
"learning_rate": 1.724425204791089e-07,
"logits/chosen": -1.3909645080566406,
"logits/rejected": -1.4084515571594238,
"logps/chosen": -5.7829999923706055,
"logps/rejected": -6.748002529144287,
"loss": 0.6638,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.03097168169915676,
"rewards/margins": 0.04933810234069824,
"rewards/rejected": -0.08030977845191956,
"step": 560
},
{
"epoch": 0.14,
"learning_rate": 1.7187013876838238e-07,
"logits/chosen": -1.8116487264633179,
"logits/rejected": -1.7086451053619385,
"logps/chosen": -6.233320236206055,
"logps/rejected": -8.50374698638916,
"loss": 0.6505,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.014792789705097675,
"rewards/margins": 0.09041319042444229,
"rewards/rejected": -0.1052059754729271,
"step": 565
},
{
"epoch": 0.14,
"learning_rate": 1.712928448291397e-07,
"logits/chosen": -1.4688892364501953,
"logits/rejected": -1.531427025794983,
"logps/chosen": -6.673917293548584,
"logps/rejected": -6.955280303955078,
"loss": 0.6468,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.013536572456359863,
"rewards/margins": 0.09923507273197174,
"rewards/rejected": -0.11277163028717041,
"step": 570
},
{
"epoch": 0.14,
"learning_rate": 1.7071067811865473e-07,
"logits/chosen": -1.7241573333740234,
"logits/rejected": -1.6227924823760986,
"logps/chosen": -6.4433488845825195,
"logps/rejected": -9.645414352416992,
"loss": 0.6523,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.054280836135149,
"rewards/margins": 0.09889966994524002,
"rewards/rejected": -0.15318050980567932,
"step": 575
},
{
"epoch": 0.15,
"learning_rate": 1.7012367842724884e-07,
"logits/chosen": -1.4540927410125732,
"logits/rejected": -1.448845624923706,
"logps/chosen": -7.165667533874512,
"logps/rejected": -9.222039222717285,
"loss": 0.6556,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.03299577161669731,
"rewards/margins": 0.09829618781805038,
"rewards/rejected": -0.1312919557094574,
"step": 580
},
{
"epoch": 0.15,
"learning_rate": 1.695318858755712e-07,
"logits/chosen": -1.6242682933807373,
"logits/rejected": -1.507509708404541,
"logps/chosen": -4.892175197601318,
"logps/rejected": -6.663909912109375,
"loss": 0.6587,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.05031831935048103,
"rewards/margins": 0.06618543714284897,
"rewards/rejected": -0.1165037602186203,
"step": 585
},
{
"epoch": 0.15,
"learning_rate": 1.6893534091185658e-07,
"logits/chosen": -1.655474305152893,
"logits/rejected": -1.6360971927642822,
"logps/chosen": -6.940272331237793,
"logps/rejected": -10.048660278320312,
"loss": 0.6463,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.006456449627876282,
"rewards/margins": 0.1727210134267807,
"rewards/rejected": -0.17917747795581818,
"step": 590
},
{
"epoch": 0.15,
"learning_rate": 1.6833408430916082e-07,
"logits/chosen": -1.3439010381698608,
"logits/rejected": -1.429510235786438,
"logps/chosen": -5.466313362121582,
"logps/rejected": -7.18454647064209,
"loss": 0.6487,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.05292614549398422,
"rewards/margins": 0.12489708513021469,
"rewards/rejected": -0.17782321572303772,
"step": 595
},
{
"epoch": 0.15,
"learning_rate": 1.6772815716257412e-07,
"logits/chosen": -1.51864492893219,
"logits/rejected": -1.455951452255249,
"logps/chosen": -6.4935503005981445,
"logps/rejected": -8.119839668273926,
"loss": 0.6537,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0410698801279068,
"rewards/margins": 0.07796063274145126,
"rewards/rejected": -0.11903052031993866,
"step": 600
},
{
"epoch": 0.15,
"eval_logits/chosen": -3.1874260902404785,
"eval_logits/rejected": -3.1666018962860107,
"eval_logps/chosen": -6.68336296081543,
"eval_logps/rejected": -7.635651588439941,
"eval_loss": 0.6910278797149658,
"eval_rewards/accuracies": 0.5352563858032227,
"eval_rewards/chosen": -0.05427387356758118,
"eval_rewards/margins": 0.014749797061085701,
"eval_rewards/rejected": -0.06902367621660233,
"eval_runtime": 633.9901,
"eval_samples_per_second": 31.442,
"eval_steps_per_second": 0.492,
"step": 600
},
{
"epoch": 0.15,
"learning_rate": 1.6711760088641197e-07,
"logits/chosen": -1.707196593284607,
"logits/rejected": -1.7934414148330688,
"logps/chosen": -6.166356086730957,
"logps/rejected": -8.222631454467773,
"loss": 0.6509,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.04317507520318031,
"rewards/margins": 0.09725725650787354,
"rewards/rejected": -0.14043232798576355,
"step": 605
},
{
"epoch": 0.15,
"learning_rate": 1.665024572113848e-07,
"logits/chosen": -1.545772910118103,
"logits/rejected": -1.6466634273529053,
"logps/chosen": -6.318427085876465,
"logps/rejected": -8.524378776550293,
"loss": 0.6514,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.0410161130130291,
"rewards/margins": 0.10211832821369171,
"rewards/rejected": -0.14313443005084991,
"step": 610
},
{
"epoch": 0.15,
"learning_rate": 1.6588276818174578e-07,
"logits/chosen": -1.8286006450653076,
"logits/rejected": -1.8894586563110352,
"logps/chosen": -6.069952011108398,
"logps/rejected": -9.206938743591309,
"loss": 0.651,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.020980229601264,
"rewards/margins": 0.11290951818227768,
"rewards/rejected": -0.13388976454734802,
"step": 615
},
{
"epoch": 0.16,
"learning_rate": 1.6525857615241686e-07,
"logits/chosen": -1.47043776512146,
"logits/rejected": -1.5266094207763672,
"logps/chosen": -6.51092529296875,
"logps/rejected": -7.585784912109375,
"loss": 0.6509,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.04623446241021156,
"rewards/margins": 0.08200391381978989,
"rewards/rejected": -0.12823837995529175,
"step": 620
},
{
"epoch": 0.16,
"learning_rate": 1.6462992378609406e-07,
"logits/chosen": -1.3878867626190186,
"logits/rejected": -1.427337884902954,
"logps/chosen": -5.593390464782715,
"logps/rejected": -8.398979187011719,
"loss": 0.6468,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.024669837206602097,
"rewards/margins": 0.16932611167430878,
"rewards/rejected": -0.19399593770503998,
"step": 625
},
{
"epoch": 0.16,
"learning_rate": 1.6399685405033166e-07,
"logits/chosen": -1.4157261848449707,
"logits/rejected": -1.3857667446136475,
"logps/chosen": -7.09283447265625,
"logps/rejected": -7.571598052978516,
"loss": 0.6464,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.06203197315335274,
"rewards/margins": 0.08331136405467987,
"rewards/rejected": -0.1453433334827423,
"step": 630
},
{
"epoch": 0.16,
"learning_rate": 1.6335941021460504e-07,
"logits/chosen": -1.4545994997024536,
"logits/rejected": -1.5172913074493408,
"logps/chosen": -6.2775068283081055,
"logps/rejected": -9.521524429321289,
"loss": 0.6483,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.045348554849624634,
"rewards/margins": 0.10597596317529678,
"rewards/rejected": -0.15132452547550201,
"step": 635
},
{
"epoch": 0.16,
"learning_rate": 1.627176358473537e-07,
"logits/chosen": -1.5027107000350952,
"logits/rejected": -1.3006885051727295,
"logps/chosen": -6.991432189941406,
"logps/rejected": -8.119719505310059,
"loss": 0.6497,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07398322969675064,
"rewards/margins": 0.03964931517839432,
"rewards/rejected": -0.11363253742456436,
"step": 640
},
{
"epoch": 0.16,
"learning_rate": 1.6207157481300312e-07,
"logits/chosen": -1.5265876054763794,
"logits/rejected": -1.4646865129470825,
"logps/chosen": -6.282595634460449,
"logps/rejected": -7.717820167541504,
"loss": 0.6474,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.07993953675031662,
"rewards/margins": 0.09873492270708084,
"rewards/rejected": -0.17867444455623627,
"step": 645
},
{
"epoch": 0.16,
"learning_rate": 1.614212712689668e-07,
"logits/chosen": -1.3973186016082764,
"logits/rejected": -1.3966352939605713,
"logps/chosen": -6.049260139465332,
"logps/rejected": -6.803788185119629,
"loss": 0.6457,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.053327836096286774,
"rewards/margins": 0.13262109458446503,
"rewards/rejected": -0.1859489530324936,
"step": 650
},
{
"epoch": 0.16,
"learning_rate": 1.607667696626281e-07,
"logits/chosen": -1.604760766029358,
"logits/rejected": -1.6347767114639282,
"logps/chosen": -5.876626014709473,
"logps/rejected": -9.119338989257812,
"loss": 0.6534,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.058147139847278595,
"rewards/margins": 0.09601394087076187,
"rewards/rejected": -0.15416109561920166,
"step": 655
},
{
"epoch": 0.17,
"learning_rate": 1.601081147283025e-07,
"logits/chosen": -1.517514944076538,
"logits/rejected": -1.4594268798828125,
"logps/chosen": -5.623353004455566,
"logps/rejected": -8.838793754577637,
"loss": 0.6423,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.034189894795417786,
"rewards/margins": 0.120149627327919,
"rewards/rejected": -0.1543395221233368,
"step": 660
},
{
"epoch": 0.17,
"learning_rate": 1.594453514841798e-07,
"logits/chosen": -1.4519226551055908,
"logits/rejected": -1.4916832447052002,
"logps/chosen": -6.024622917175293,
"logps/rejected": -8.156673431396484,
"loss": 0.6421,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.03883025795221329,
"rewards/margins": 0.14476829767227173,
"rewards/rejected": -0.18359854817390442,
"step": 665
},
{
"epoch": 0.17,
"learning_rate": 1.5877852522924732e-07,
"logits/chosen": -1.4973927736282349,
"logits/rejected": -1.5078349113464355,
"logps/chosen": -5.639887809753418,
"logps/rejected": -6.538842678070068,
"loss": 0.6519,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.046415191143751144,
"rewards/margins": 0.08505970239639282,
"rewards/rejected": -0.13147488236427307,
"step": 670
},
{
"epoch": 0.17,
"learning_rate": 1.5810768154019382e-07,
"logits/chosen": -1.6372696161270142,
"logits/rejected": -1.604660987854004,
"logps/chosen": -6.092626571655273,
"logps/rejected": -7.028026580810547,
"loss": 0.6493,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.07696966081857681,
"rewards/margins": 0.10025089979171753,
"rewards/rejected": -0.17722055315971375,
"step": 675
},
{
"epoch": 0.17,
"learning_rate": 1.5743286626829435e-07,
"logits/chosen": -1.3090213537216187,
"logits/rejected": -1.3185111284255981,
"logps/chosen": -4.761218070983887,
"logps/rejected": -6.58809757232666,
"loss": 0.6395,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.03409787639975548,
"rewards/margins": 0.1063770055770874,
"rewards/rejected": -0.14047487080097198,
"step": 680
},
{
"epoch": 0.17,
"learning_rate": 1.5675412553627636e-07,
"logits/chosen": -1.4627618789672852,
"logits/rejected": -1.4510043859481812,
"logps/chosen": -5.731337547302246,
"logps/rejected": -10.051878929138184,
"loss": 0.6498,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.03276892006397247,
"rewards/margins": 0.13961870968341827,
"rewards/rejected": -0.17238759994506836,
"step": 685
},
{
"epoch": 0.17,
"learning_rate": 1.5607150573516727e-07,
"logits/chosen": -1.450136423110962,
"logits/rejected": -1.4004848003387451,
"logps/chosen": -6.355518341064453,
"logps/rejected": -7.548236846923828,
"loss": 0.6462,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.07692820578813553,
"rewards/margins": 0.07390854507684708,
"rewards/rejected": -0.1508367359638214,
"step": 690
},
{
"epoch": 0.17,
"learning_rate": 1.5538505352112372e-07,
"logits/chosen": -1.7112722396850586,
"logits/rejected": -1.4571855068206787,
"logps/chosen": -6.127640724182129,
"logps/rejected": -7.857392311096191,
"loss": 0.6451,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.07270301878452301,
"rewards/margins": 0.13500170409679413,
"rewards/rejected": -0.20770475268363953,
"step": 695
},
{
"epoch": 0.18,
"learning_rate": 1.546948158122427e-07,
"logits/chosen": -1.7370418310165405,
"logits/rejected": -1.7001125812530518,
"logps/chosen": -5.915326118469238,
"logps/rejected": -8.323230743408203,
"loss": 0.6435,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.07308584451675415,
"rewards/margins": 0.15741673111915588,
"rewards/rejected": -0.23050260543823242,
"step": 700
},
{
"epoch": 0.18,
"eval_logits/chosen": -3.186901092529297,
"eval_logits/rejected": -3.1663575172424316,
"eval_logps/chosen": -6.852160930633545,
"eval_logps/rejected": -7.856788158416748,
"eval_loss": 0.6899130344390869,
"eval_rewards/accuracies": 0.5376602411270142,
"eval_rewards/chosen": -0.0711536630988121,
"eval_rewards/margins": 0.019983632490038872,
"eval_rewards/rejected": -0.09113729745149612,
"eval_runtime": 640.2435,
"eval_samples_per_second": 31.135,
"eval_steps_per_second": 0.487,
"step": 700
},
{
"epoch": 0.18,
"learning_rate": 1.540008397853547e-07,
"logits/chosen": -1.5285913944244385,
"logits/rejected": -1.506296992301941,
"logps/chosen": -4.79876184463501,
"logps/rejected": -7.281890869140625,
"loss": 0.6362,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.01859206147491932,
"rewards/margins": 0.11602876335382462,
"rewards/rejected": -0.1346208155155182,
"step": 705
},
{
"epoch": 0.18,
"learning_rate": 1.5330317287279937e-07,
"logits/chosen": -1.489984393119812,
"logits/rejected": -1.384037733078003,
"logps/chosen": -6.871964931488037,
"logps/rejected": -9.540796279907227,
"loss": 0.6343,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.09499697387218475,
"rewards/margins": 0.11305873095989227,
"rewards/rejected": -0.20805568993091583,
"step": 710
},
{
"epoch": 0.18,
"learning_rate": 1.526018627591834e-07,
"logits/chosen": -1.4063003063201904,
"logits/rejected": -1.392664909362793,
"logps/chosen": -7.688409328460693,
"logps/rejected": -8.560149192810059,
"loss": 0.6462,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.08843918889760971,
"rewards/margins": 0.12001106888055801,
"rewards/rejected": -0.20845024287700653,
"step": 715
},
{
"epoch": 0.18,
"learning_rate": 1.5189695737812152e-07,
"logits/chosen": -1.4897139072418213,
"logits/rejected": -1.437138319015503,
"logps/chosen": -5.6488165855407715,
"logps/rejected": -8.617653846740723,
"loss": 0.637,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.08085988461971283,
"rewards/margins": 0.1466083824634552,
"rewards/rejected": -0.22746825218200684,
"step": 720
},
{
"epoch": 0.18,
"learning_rate": 1.511885049089601e-07,
"logits/chosen": -1.4502151012420654,
"logits/rejected": -1.558260440826416,
"logps/chosen": -6.4499664306640625,
"logps/rejected": -8.321115493774414,
"loss": 0.6455,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.08257200568914413,
"rewards/margins": 0.10597743839025497,
"rewards/rejected": -0.1885494440793991,
"step": 725
},
{
"epoch": 0.18,
"learning_rate": 1.5047655377348439e-07,
"logits/chosen": -1.3676706552505493,
"logits/rejected": -1.3527071475982666,
"logps/chosen": -6.454686641693115,
"logps/rejected": -7.9275712966918945,
"loss": 0.6392,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.07613205164670944,
"rewards/margins": 0.11505208909511566,
"rewards/rejected": -0.1911841332912445,
"step": 730
},
{
"epoch": 0.18,
"learning_rate": 1.4976115263260874e-07,
"logits/chosen": -1.4929918050765991,
"logits/rejected": -1.4645825624465942,
"logps/chosen": -5.613523006439209,
"logps/rejected": -8.017206192016602,
"loss": 0.6462,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.043209511786699295,
"rewards/margins": 0.09339593350887299,
"rewards/rejected": -0.13660545647144318,
"step": 735
},
{
"epoch": 0.19,
"learning_rate": 1.4904235038305082e-07,
"logits/chosen": -1.430387258529663,
"logits/rejected": -1.5527921915054321,
"logps/chosen": -7.354534149169922,
"logps/rejected": -8.235260963439941,
"loss": 0.6458,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.07374344021081924,
"rewards/margins": 0.14879578351974487,
"rewards/rejected": -0.22253921627998352,
"step": 740
},
{
"epoch": 0.19,
"learning_rate": 1.483201961539896e-07,
"logits/chosen": -1.4932714700698853,
"logits/rejected": -1.4173743724822998,
"logps/chosen": -5.170498847961426,
"logps/rejected": -7.974796295166016,
"loss": 0.6307,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.04303674027323723,
"rewards/margins": 0.15367302298545837,
"rewards/rejected": -0.1967097669839859,
"step": 745
},
{
"epoch": 0.19,
"learning_rate": 1.4759473930370737e-07,
"logits/chosen": -1.7062429189682007,
"logits/rejected": -1.6499900817871094,
"logps/chosen": -6.257295608520508,
"logps/rejected": -7.387567043304443,
"loss": 0.6434,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.10747422277927399,
"rewards/margins": 0.08016838133335114,
"rewards/rejected": -0.18764260411262512,
"step": 750
},
{
"epoch": 0.19,
"learning_rate": 1.4686602941621615e-07,
"logits/chosen": -1.244971752166748,
"logits/rejected": -1.2606076002120972,
"logps/chosen": -6.677346706390381,
"logps/rejected": -9.851409912109375,
"loss": 0.6353,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.08402319997549057,
"rewards/margins": 0.10642798244953156,
"rewards/rejected": -0.19045117497444153,
"step": 755
},
{
"epoch": 0.19,
"learning_rate": 1.4613411629786877e-07,
"logits/chosen": -1.4713022708892822,
"logits/rejected": -1.5048084259033203,
"logps/chosen": -6.147872447967529,
"logps/rejected": -7.910445213317871,
"loss": 0.6417,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07068430632352829,
"rewards/margins": 0.16299991309642792,
"rewards/rejected": -0.2336842119693756,
"step": 760
},
{
"epoch": 0.19,
"learning_rate": 1.4539904997395468e-07,
"logits/chosen": -1.3301279544830322,
"logits/rejected": -1.3574144840240479,
"logps/chosen": -6.699339389801025,
"logps/rejected": -9.695663452148438,
"loss": 0.6279,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.06842740625143051,
"rewards/margins": 0.13677598536014557,
"rewards/rejected": -0.20520341396331787,
"step": 765
},
{
"epoch": 0.19,
"learning_rate": 1.4466088068528067e-07,
"logits/chosen": -1.51852548122406,
"logits/rejected": -1.4426764249801636,
"logps/chosen": -5.722136497497559,
"logps/rejected": -8.637394905090332,
"loss": 0.6389,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0995694026350975,
"rewards/margins": 0.11200227588415146,
"rewards/rejected": -0.21157169342041016,
"step": 770
},
{
"epoch": 0.19,
"learning_rate": 1.4391965888473702e-07,
"logits/chosen": -1.5839837789535522,
"logits/rejected": -1.5870234966278076,
"logps/chosen": -6.29650354385376,
"logps/rejected": -9.936391830444336,
"loss": 0.63,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.08398771286010742,
"rewards/margins": 0.11872847378253937,
"rewards/rejected": -0.20271620154380798,
"step": 775
},
{
"epoch": 0.2,
"learning_rate": 1.4317543523384928e-07,
"logits/chosen": -1.694641351699829,
"logits/rejected": -1.6382360458374023,
"logps/chosen": -6.944577217102051,
"logps/rejected": -8.599451065063477,
"loss": 0.6283,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.08465109765529633,
"rewards/margins": 0.16851410269737244,
"rewards/rejected": -0.25316524505615234,
"step": 780
},
{
"epoch": 0.2,
"learning_rate": 1.4242826059931536e-07,
"logits/chosen": -1.4301807880401611,
"logits/rejected": -1.3396793603897095,
"logps/chosen": -5.984529972076416,
"logps/rejected": -8.90467643737793,
"loss": 0.6422,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.11020038276910782,
"rewards/margins": 0.13162291049957275,
"rewards/rejected": -0.24182331562042236,
"step": 785
},
{
"epoch": 0.2,
"learning_rate": 1.4167818604952903e-07,
"logits/chosen": -1.8726218938827515,
"logits/rejected": -1.8315349817276,
"logps/chosen": -6.4224066734313965,
"logps/rejected": -9.21501350402832,
"loss": 0.6316,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.08100078999996185,
"rewards/margins": 0.16276691854000092,
"rewards/rejected": -0.24376770853996277,
"step": 790
},
{
"epoch": 0.2,
"learning_rate": 1.4092526285108939e-07,
"logits/chosen": -1.6488440036773682,
"logits/rejected": -1.6959072351455688,
"logps/chosen": -5.05312442779541,
"logps/rejected": -8.503290176391602,
"loss": 0.6354,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.025250781327486038,
"rewards/margins": 0.17517754435539246,
"rewards/rejected": -0.2004283368587494,
"step": 795
},
{
"epoch": 0.2,
"learning_rate": 1.4016954246529695e-07,
"logits/chosen": -1.5666801929473877,
"logits/rejected": -1.6438080072402954,
"logps/chosen": -6.046337127685547,
"logps/rejected": -8.245607376098633,
"loss": 0.6332,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.08281711488962173,
"rewards/margins": 0.1388954371213913,
"rewards/rejected": -0.22171254456043243,
"step": 800
},
{
"epoch": 0.2,
"eval_logits/chosen": -3.1937944889068604,
"eval_logits/rejected": -3.173306941986084,
"eval_logps/chosen": -7.060844421386719,
"eval_logps/rejected": -8.098358154296875,
"eval_loss": 0.6894627213478088,
"eval_rewards/accuracies": 0.5336538553237915,
"eval_rewards/chosen": -0.09202194213867188,
"eval_rewards/margins": 0.023272372782230377,
"eval_rewards/rejected": -0.11529432237148285,
"eval_runtime": 642.2741,
"eval_samples_per_second": 31.037,
"eval_steps_per_second": 0.486,
"step": 800
},
{
"epoch": 0.2,
"learning_rate": 1.3941107654463616e-07,
"logits/chosen": -1.6347227096557617,
"logits/rejected": -1.5565736293792725,
"logps/chosen": -6.210860252380371,
"logps/rejected": -8.191205978393555,
"loss": 0.633,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.0968286544084549,
"rewards/margins": 0.13983137905597687,
"rewards/rejected": -0.23666004836559296,
"step": 805
},
{
"epoch": 0.2,
"learning_rate": 1.3864991692924522e-07,
"logits/chosen": -1.6105167865753174,
"logits/rejected": -1.5960274934768677,
"logps/chosen": -6.203638553619385,
"logps/rejected": -9.020109176635742,
"loss": 0.6296,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.09015347063541412,
"rewards/margins": 0.13406077027320862,
"rewards/rejected": -0.22421424090862274,
"step": 810
},
{
"epoch": 0.2,
"learning_rate": 1.3788611564337276e-07,
"logits/chosen": -1.5526584386825562,
"logits/rejected": -1.5093594789505005,
"logps/chosen": -6.796191215515137,
"logps/rejected": -8.405149459838867,
"loss": 0.6223,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.08056505024433136,
"rewards/margins": 0.13686737418174744,
"rewards/rejected": -0.21743245422840118,
"step": 815
},
{
"epoch": 0.21,
"learning_rate": 1.3711972489182207e-07,
"logits/chosen": -1.311798334121704,
"logits/rejected": -1.2505333423614502,
"logps/chosen": -6.062074184417725,
"logps/rejected": -8.897860527038574,
"loss": 0.6306,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.030773481354117393,
"rewards/margins": 0.19692835211753845,
"rewards/rejected": -0.22770185768604279,
"step": 820
},
{
"epoch": 0.21,
"learning_rate": 1.3635079705638297e-07,
"logits/chosen": -1.5671780109405518,
"logits/rejected": -1.5879342555999756,
"logps/chosen": -6.3402533531188965,
"logps/rejected": -9.183328628540039,
"loss": 0.6338,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.06746585667133331,
"rewards/margins": 0.13011819124221802,
"rewards/rejected": -0.19758403301239014,
"step": 825
},
{
"epoch": 0.21,
"learning_rate": 1.3557938469225164e-07,
"logits/chosen": -1.423302412033081,
"logits/rejected": -1.4359217882156372,
"logps/chosen": -6.0072126388549805,
"logps/rejected": -9.093564987182617,
"loss": 0.6206,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.09784204512834549,
"rewards/margins": 0.13212862610816956,
"rewards/rejected": -0.22997066378593445,
"step": 830
},
{
"epoch": 0.21,
"learning_rate": 1.3480554052443843e-07,
"logits/chosen": -1.7213542461395264,
"logits/rejected": -1.6981548070907593,
"logps/chosen": -7.577986240386963,
"logps/rejected": -10.498396873474121,
"loss": 0.6315,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.10542915016412735,
"rewards/margins": 0.1352270096540451,
"rewards/rejected": -0.24065613746643066,
"step": 835
},
{
"epoch": 0.21,
"learning_rate": 1.340293174441643e-07,
"logits/chosen": -1.6847597360610962,
"logits/rejected": -1.6527271270751953,
"logps/chosen": -8.06293773651123,
"logps/rejected": -9.775976181030273,
"loss": 0.6238,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.11751914024353027,
"rewards/margins": 0.1262793093919754,
"rewards/rejected": -0.24379844963550568,
"step": 840
},
{
"epoch": 0.21,
"learning_rate": 1.332507685052457e-07,
"logits/chosen": -1.3307305574417114,
"logits/rejected": -1.296414852142334,
"logps/chosen": -5.506723880767822,
"logps/rejected": -9.845108032226562,
"loss": 0.6284,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.05931394174695015,
"rewards/margins": 0.17723853886127472,
"rewards/rejected": -0.23655244708061218,
"step": 845
},
{
"epoch": 0.21,
"learning_rate": 1.3246994692046836e-07,
"logits/chosen": -1.4784770011901855,
"logits/rejected": -1.4177029132843018,
"logps/chosen": -6.148890495300293,
"logps/rejected": -9.090556144714355,
"loss": 0.6283,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.07422871887683868,
"rewards/margins": 0.16514147818088531,
"rewards/rejected": -0.239370197057724,
"step": 850
},
{
"epoch": 0.21,
"learning_rate": 1.3168690605795043e-07,
"logits/chosen": -1.4130356311798096,
"logits/rejected": -1.453981637954712,
"logps/chosen": -5.568487167358398,
"logps/rejected": -9.608223915100098,
"loss": 0.6299,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.087576724588871,
"rewards/margins": 0.2175295650959015,
"rewards/rejected": -0.3051062524318695,
"step": 855
},
{
"epoch": 0.22,
"learning_rate": 1.3090169943749475e-07,
"logits/chosen": -1.4787486791610718,
"logits/rejected": -1.437145471572876,
"logps/chosen": -5.853695869445801,
"logps/rejected": -8.345501899719238,
"loss": 0.6359,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.05679728835821152,
"rewards/margins": 0.1354844868183136,
"rewards/rejected": -0.19228176772594452,
"step": 860
},
{
"epoch": 0.22,
"learning_rate": 1.3011438072693074e-07,
"logits/chosen": -1.5212247371673584,
"logits/rejected": -1.4433953762054443,
"logps/chosen": -6.458197593688965,
"logps/rejected": -9.791971206665039,
"loss": 0.6192,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.07254395633935928,
"rewards/margins": 0.23906107246875763,
"rewards/rejected": -0.3116050362586975,
"step": 865
},
{
"epoch": 0.22,
"learning_rate": 1.2932500373844649e-07,
"logits/chosen": -1.6338106393814087,
"logits/rejected": -1.4234752655029297,
"logps/chosen": -5.266790866851807,
"logps/rejected": -8.406009674072266,
"loss": 0.6266,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.09523673355579376,
"rewards/margins": 0.15846498310565948,
"rewards/rejected": -0.25370171666145325,
"step": 870
},
{
"epoch": 0.22,
"learning_rate": 1.2853362242491051e-07,
"logits/chosen": -1.4831264019012451,
"logits/rejected": -1.5002310276031494,
"logps/chosen": -5.224946022033691,
"logps/rejected": -7.2889556884765625,
"loss": 0.6128,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.06067908555269241,
"rewards/margins": 0.17062053084373474,
"rewards/rejected": -0.23129959404468536,
"step": 875
},
{
"epoch": 0.22,
"learning_rate": 1.2774029087618446e-07,
"logits/chosen": -1.758953332901001,
"logits/rejected": -1.6146351099014282,
"logps/chosen": -7.610304355621338,
"logps/rejected": -10.307147026062012,
"loss": 0.619,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.07071717083454132,
"rewards/margins": 0.20475205779075623,
"rewards/rejected": -0.27546921372413635,
"step": 880
},
{
"epoch": 0.22,
"learning_rate": 1.2694506331542577e-07,
"logits/chosen": -1.591770052909851,
"logits/rejected": -1.4535247087478638,
"logps/chosen": -5.625763893127441,
"logps/rejected": -9.85864543914795,
"loss": 0.6345,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.0788426622748375,
"rewards/margins": 0.15841318666934967,
"rewards/rejected": -0.23725584149360657,
"step": 885
},
{
"epoch": 0.22,
"learning_rate": 1.2614799409538198e-07,
"logits/chosen": -1.4659042358398438,
"logits/rejected": -1.4563062191009521,
"logps/chosen": -5.280413627624512,
"logps/rejected": -9.323397636413574,
"loss": 0.6226,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.09044213593006134,
"rewards/margins": 0.1876935213804245,
"rewards/rejected": -0.2781356871128082,
"step": 890
},
{
"epoch": 0.22,
"learning_rate": 1.253491376946754e-07,
"logits/chosen": -1.3718680143356323,
"logits/rejected": -1.3589755296707153,
"logps/chosen": -6.7582688331604,
"logps/rejected": -10.708890914916992,
"loss": 0.6275,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.06859103590250015,
"rewards/margins": 0.16634361445903778,
"rewards/rejected": -0.23493464291095734,
"step": 895
},
{
"epoch": 0.23,
"learning_rate": 1.2454854871407992e-07,
"logits/chosen": -1.3176133632659912,
"logits/rejected": -1.3136779069900513,
"logps/chosen": -6.8164777755737305,
"logps/rejected": -9.460234642028809,
"loss": 0.6217,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.1115039810538292,
"rewards/margins": 0.17541712522506714,
"rewards/rejected": -0.28692108392715454,
"step": 900
},
{
"epoch": 0.23,
"eval_logits/chosen": -3.18924617767334,
"eval_logits/rejected": -3.1685447692871094,
"eval_logps/chosen": -7.315045356750488,
"eval_logps/rejected": -8.408605575561523,
"eval_loss": 0.6886266469955444,
"eval_rewards/accuracies": 0.5352563858032227,
"eval_rewards/chosen": -0.11744209378957748,
"eval_rewards/margins": 0.0288768969476223,
"eval_rewards/rejected": -0.14631898701190948,
"eval_runtime": 646.5131,
"eval_samples_per_second": 30.833,
"eval_steps_per_second": 0.483,
"step": 900
},
{
"epoch": 0.23,
"learning_rate": 1.2374628187278885e-07,
"logits/chosen": -1.6750404834747314,
"logits/rejected": -1.6324043273925781,
"logps/chosen": -6.7752885818481445,
"logps/rejected": -9.884492874145508,
"loss": 0.6205,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.08316051214933395,
"rewards/margins": 0.19135987758636475,
"rewards/rejected": -0.2745203971862793,
"step": 905
},
{
"epoch": 0.23,
"learning_rate": 1.2294239200467515e-07,
"logits/chosen": -1.4121886491775513,
"logits/rejected": -1.4634597301483154,
"logps/chosen": -8.272059440612793,
"logps/rejected": -10.233227729797363,
"loss": 0.6219,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.1476745754480362,
"rewards/margins": 0.1988331377506256,
"rewards/rejected": -0.346507728099823,
"step": 910
},
{
"epoch": 0.23,
"learning_rate": 1.2213693405454345e-07,
"logits/chosen": -1.6356998682022095,
"logits/rejected": -1.6384683847427368,
"logps/chosen": -5.971181392669678,
"logps/rejected": -8.54847240447998,
"loss": 0.6212,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.13469815254211426,
"rewards/margins": 0.1848105490207672,
"rewards/rejected": -0.31950870156288147,
"step": 915
},
{
"epoch": 0.23,
"learning_rate": 1.213299630743747e-07,
"logits/chosen": -1.4081295728683472,
"logits/rejected": -1.333418846130371,
"logps/chosen": -7.040464878082275,
"logps/rejected": -8.731954574584961,
"loss": 0.6179,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.12728752195835114,
"rewards/margins": 0.17130446434020996,
"rewards/rejected": -0.2985920011997223,
"step": 920
},
{
"epoch": 0.23,
"learning_rate": 1.205215342195634e-07,
"logits/chosen": -1.5196300745010376,
"logits/rejected": -1.5172007083892822,
"logps/chosen": -5.7593536376953125,
"logps/rejected": -9.683046340942383,
"loss": 0.6175,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.10798660665750504,
"rewards/margins": 0.25386783480644226,
"rewards/rejected": -0.3618544340133667,
"step": 925
},
{
"epoch": 0.23,
"learning_rate": 1.1971170274514802e-07,
"logits/chosen": -1.6615116596221924,
"logits/rejected": -1.6914132833480835,
"logps/chosen": -7.840911865234375,
"logps/rejected": -9.82246208190918,
"loss": 0.6232,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.14061352610588074,
"rewards/margins": 0.14244748651981354,
"rewards/rejected": -0.2830609679222107,
"step": 930
},
{
"epoch": 0.23,
"learning_rate": 1.1890052400203402e-07,
"logits/chosen": -1.834905982017517,
"logits/rejected": -1.7889604568481445,
"logps/chosen": -7.288022041320801,
"logps/rejected": -10.376224517822266,
"loss": 0.6095,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.1694309115409851,
"rewards/margins": 0.1324268877506256,
"rewards/rejected": -0.3018577992916107,
"step": 935
},
{
"epoch": 0.24,
"learning_rate": 1.18088053433211e-07,
"logits/chosen": -1.5051194429397583,
"logits/rejected": -1.5041710138320923,
"logps/chosen": -6.950632572174072,
"logps/rejected": -9.823295593261719,
"loss": 0.6208,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.11773916333913803,
"rewards/margins": 0.17614324390888214,
"rewards/rejected": -0.29388242959976196,
"step": 940
},
{
"epoch": 0.24,
"learning_rate": 1.1727434656996305e-07,
"logits/chosen": -1.6864219903945923,
"logits/rejected": -1.7134393453598022,
"logps/chosen": -8.343110084533691,
"logps/rejected": -10.6605863571167,
"loss": 0.6269,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.1767118275165558,
"rewards/margins": 0.15801793336868286,
"rewards/rejected": -0.33472976088523865,
"step": 945
},
{
"epoch": 0.24,
"learning_rate": 1.1645945902807339e-07,
"logits/chosen": -1.521843671798706,
"logits/rejected": -1.4162412881851196,
"logps/chosen": -5.303705215454102,
"logps/rejected": -8.564371109008789,
"loss": 0.6195,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.08533243834972382,
"rewards/margins": 0.2330024540424347,
"rewards/rejected": -0.3183349072933197,
"step": 950
},
{
"epoch": 0.24,
"learning_rate": 1.1564344650402309e-07,
"logits/chosen": -1.4854631423950195,
"logits/rejected": -1.5085594654083252,
"logps/chosen": -6.938723087310791,
"logps/rejected": -9.417280197143555,
"loss": 0.6228,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.11080266535282135,
"rewards/margins": 0.18629567325115204,
"rewards/rejected": -0.2970983386039734,
"step": 955
},
{
"epoch": 0.24,
"learning_rate": 1.1482636477118419e-07,
"logits/chosen": -1.5579372644424438,
"logits/rejected": -1.5621235370635986,
"logps/chosen": -6.055428981781006,
"logps/rejected": -8.274986267089844,
"loss": 0.617,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.08702854812145233,
"rewards/margins": 0.19237728416919708,
"rewards/rejected": -0.2794058322906494,
"step": 960
},
{
"epoch": 0.24,
"learning_rate": 1.1400826967600779e-07,
"logits/chosen": -1.6652123928070068,
"logits/rejected": -1.5670098066329956,
"logps/chosen": -6.44259786605835,
"logps/rejected": -9.023821830749512,
"loss": 0.6279,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.12496472895145416,
"rewards/margins": 0.1399177610874176,
"rewards/rejected": -0.26488250494003296,
"step": 965
},
{
"epoch": 0.24,
"learning_rate": 1.131892171342069e-07,
"logits/chosen": -1.5863568782806396,
"logits/rejected": -1.4539623260498047,
"logps/chosen": -8.477279663085938,
"logps/rejected": -11.437823295593262,
"loss": 0.6105,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.12180640548467636,
"rewards/margins": 0.20829221606254578,
"rewards/rejected": -0.3300986588001251,
"step": 970
},
{
"epoch": 0.24,
"learning_rate": 1.1236926312693478e-07,
"logits/chosen": -1.4525238275527954,
"logits/rejected": -1.3738415241241455,
"logps/chosen": -6.627744197845459,
"logps/rejected": -10.264178276062012,
"loss": 0.6171,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.12857326865196228,
"rewards/margins": 0.23741576075553894,
"rewards/rejected": -0.365989089012146,
"step": 975
},
{
"epoch": 0.25,
"learning_rate": 1.1154846369695863e-07,
"logits/chosen": -1.2888273000717163,
"logits/rejected": -1.2684919834136963,
"logps/chosen": -7.151463985443115,
"logps/rejected": -10.000380516052246,
"loss": 0.6203,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.09203344583511353,
"rewards/margins": 0.21837130188941956,
"rewards/rejected": -0.3104047179222107,
"step": 980
},
{
"epoch": 0.25,
"learning_rate": 1.1072687494482918e-07,
"logits/chosen": -1.3414087295532227,
"logits/rejected": -1.3495652675628662,
"logps/chosen": -7.086977481842041,
"logps/rejected": -10.637145042419434,
"loss": 0.6076,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.11822090297937393,
"rewards/margins": 0.2522438168525696,
"rewards/rejected": -0.3704647123813629,
"step": 985
},
{
"epoch": 0.25,
"learning_rate": 1.0990455302504628e-07,
"logits/chosen": -1.2805410623550415,
"logits/rejected": -1.3479764461517334,
"logps/chosen": -6.615203857421875,
"logps/rejected": -9.906420707702637,
"loss": 0.6078,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.1480402648448944,
"rewards/margins": 0.21094676852226257,
"rewards/rejected": -0.358987033367157,
"step": 990
},
{
"epoch": 0.25,
"learning_rate": 1.0908155414222082e-07,
"logits/chosen": -1.3715662956237793,
"logits/rejected": -1.4605109691619873,
"logps/chosen": -6.958237648010254,
"logps/rejected": -8.883742332458496,
"loss": 0.6167,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.13871873915195465,
"rewards/margins": 0.16220495104789734,
"rewards/rejected": -0.3009237051010132,
"step": 995
},
{
"epoch": 0.25,
"learning_rate": 1.0825793454723325e-07,
"logits/chosen": -1.5984854698181152,
"logits/rejected": -1.618065595626831,
"logps/chosen": -7.258332252502441,
"logps/rejected": -9.4107666015625,
"loss": 0.6015,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.0792965292930603,
"rewards/margins": 0.28216832876205444,
"rewards/rejected": -0.36146482825279236,
"step": 1000
},
{
"epoch": 0.25,
"eval_logits/chosen": -3.169694662094116,
"eval_logits/rejected": -3.1489059925079346,
"eval_logps/chosen": -7.470149040222168,
"eval_logps/rejected": -8.596240043640137,
"eval_loss": 0.6881732940673828,
"eval_rewards/accuracies": 0.5396634340286255,
"eval_rewards/chosen": -0.13295257091522217,
"eval_rewards/margins": 0.032129984349012375,
"eval_rewards/rejected": -0.16508255898952484,
"eval_runtime": 650.2113,
"eval_samples_per_second": 30.658,
"eval_steps_per_second": 0.48,
"step": 1000
}
],
"logging_steps": 5,
"max_steps": 2000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}