{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996190476190476, "eval_steps": 500, "global_step": 656, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 7.575757575757576e-08, "logits/chosen": 0.07398031651973724, "logits/rejected": 0.059482574462890625, "logps/chosen": -279.7221984863281, "logps/rejected": -295.30865478515625, "loss": 2.4106, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 7.575757575757576e-07, "logits/chosen": 0.08179842680692673, "logits/rejected": 0.2137567102909088, "logps/chosen": -371.2894287109375, "logps/rejected": -378.87701416015625, "loss": 2.1369, "rewards/accuracies": 0.3958333432674408, "rewards/chosen": 0.000596717931330204, "rewards/margins": 0.0007703733863309026, "rewards/rejected": -0.00017365541134495288, "step": 10 }, { "epoch": 0.03, "learning_rate": 1.5151515151515152e-06, "logits/chosen": 0.13426382839679718, "logits/rejected": 0.17069879174232483, "logps/chosen": -337.7759704589844, "logps/rejected": -351.1375427246094, "loss": 2.1857, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0013722162693738937, "rewards/margins": -0.0006242281524464488, "rewards/rejected": -0.0007479880005121231, "step": 20 }, { "epoch": 0.05, "learning_rate": 2.2727272727272728e-06, "logits/chosen": 0.11453332751989365, "logits/rejected": 0.1672835648059845, "logps/chosen": -343.336181640625, "logps/rejected": -351.83966064453125, "loss": 2.2006, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0007104009273461998, "rewards/margins": 0.0031625095289200544, "rewards/rejected": -0.0024521087761968374, "step": 30 }, { "epoch": 0.06, "learning_rate": 3.0303030303030305e-06, "logits/chosen": 0.14377865195274353, "logits/rejected": 0.23349857330322266, "logps/chosen": -338.24847412109375, "logps/rejected": -321.5999450683594, "loss": 2.0523, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.00015345169231295586, "rewards/margins": 0.0033258639741688967, "rewards/rejected": -0.003479315433651209, "step": 40 }, { "epoch": 0.08, "learning_rate": 3.7878787878787882e-06, "logits/chosen": 0.12487177550792694, "logits/rejected": 0.23440325260162354, "logps/chosen": -385.6036682128906, "logps/rejected": -353.2607727050781, "loss": 2.0721, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0014706759247928858, "rewards/margins": 0.005332515574991703, "rewards/rejected": -0.006803191266953945, "step": 50 }, { "epoch": 0.09, "learning_rate": 4.5454545454545455e-06, "logits/chosen": 0.1113414317369461, "logits/rejected": 0.1453917920589447, "logps/chosen": -375.4793701171875, "logps/rejected": -355.79571533203125, "loss": 2.0509, "rewards/accuracies": 0.625, "rewards/chosen": -0.002245596144348383, "rewards/margins": 0.019777730107307434, "rewards/rejected": -0.02202332578599453, "step": 60 }, { "epoch": 0.11, "learning_rate": 4.999432965739786e-06, "logits/chosen": 0.13350918889045715, "logits/rejected": 0.1631946861743927, "logps/chosen": -323.2510986328125, "logps/rejected": -328.84039306640625, "loss": 2.0626, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0012016391847282648, "rewards/margins": 0.03956783190369606, "rewards/rejected": -0.040769465267658234, "step": 70 }, { "epoch": 0.12, "learning_rate": 4.9930567839810125e-06, "logits/chosen": 0.0718630701303482, "logits/rejected": 0.19179414212703705, "logps/chosen": -378.56396484375, "logps/rejected": -368.79949951171875, "loss": 2.0501, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.003595351707190275, "rewards/margins": 0.06342312693595886, "rewards/rejected": -0.0598277822136879, "step": 80 }, { "epoch": 0.14, "learning_rate": 4.979613761906212e-06, "logits/chosen": 0.10020647943019867, "logits/rejected": 0.21944165229797363, "logps/chosen": -358.9504089355469, "logps/rejected": -345.926513671875, "loss": 1.8966, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.010454339906573296, "rewards/margins": 0.09268515557050705, "rewards/rejected": -0.1031394973397255, "step": 90 }, { "epoch": 0.15, "learning_rate": 4.959142005221991e-06, "logits/chosen": 0.1388009488582611, "logits/rejected": 0.26329106092453003, "logps/chosen": -337.170166015625, "logps/rejected": -351.45172119140625, "loss": 1.9484, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.009544052183628082, "rewards/margins": 0.14409229159355164, "rewards/rejected": -0.13454824686050415, "step": 100 }, { "epoch": 0.17, "learning_rate": 4.931699543346854e-06, "logits/chosen": 0.10444238036870956, "logits/rejected": 0.20885030925273895, "logps/chosen": -329.65020751953125, "logps/rejected": -367.345458984375, "loss": 1.7584, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03416838496923447, "rewards/margins": 0.19879209995269775, "rewards/rejected": -0.23296049237251282, "step": 110 }, { "epoch": 0.18, "learning_rate": 4.897364164920515e-06, "logits/chosen": 0.10559381544589996, "logits/rejected": 0.1961037516593933, "logps/chosen": -354.2994079589844, "logps/rejected": -344.41925048828125, "loss": 1.7929, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.05705786868929863, "rewards/margins": 0.17845068871974945, "rewards/rejected": -0.23550856113433838, "step": 120 }, { "epoch": 0.2, "learning_rate": 4.8562331973035396e-06, "logits/chosen": 0.12288101017475128, "logits/rejected": 0.22050254046916962, "logps/chosen": -327.88311767578125, "logps/rejected": -356.74407958984375, "loss": 1.8191, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07013808190822601, "rewards/margins": 0.22160351276397705, "rewards/rejected": -0.29174157977104187, "step": 130 }, { "epoch": 0.21, "learning_rate": 4.808423230692374e-06, "logits/chosen": 0.1860750913619995, "logits/rejected": 0.18267032504081726, "logps/chosen": -338.1869812011719, "logps/rejected": -380.5123596191406, "loss": 1.7142, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.07972263544797897, "rewards/margins": 0.2994547486305237, "rewards/rejected": -0.37917739152908325, "step": 140 }, { "epoch": 0.23, "learning_rate": 4.754069787631761e-06, "logits/chosen": 0.15666987001895905, "logits/rejected": 0.24864494800567627, "logps/chosen": -409.4656677246094, "logps/rejected": -393.28466796875, "loss": 1.6771, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.06894813477993011, "rewards/margins": 0.3388601243495941, "rewards/rejected": -0.40780824422836304, "step": 150 }, { "epoch": 0.24, "learning_rate": 4.693326938861367e-06, "logits/chosen": 0.12029329687356949, "logits/rejected": 0.17621104419231415, "logps/chosen": -326.62701416015625, "logps/rejected": -357.37164306640625, "loss": 1.74, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.045461155474185944, "rewards/margins": 0.287504643201828, "rewards/rejected": -0.33296579122543335, "step": 160 }, { "epoch": 0.26, "learning_rate": 4.626366866585528e-06, "logits/chosen": 0.17087192833423615, "logits/rejected": 0.25556960701942444, "logps/chosen": -338.80230712890625, "logps/rejected": -350.237060546875, "loss": 1.6582, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.02211242914199829, "rewards/margins": 0.2152978479862213, "rewards/rejected": -0.237410306930542, "step": 170 }, { "epoch": 0.27, "learning_rate": 4.553379376404085e-06, "logits/chosen": 0.14991971850395203, "logits/rejected": 0.1636931598186493, "logps/chosen": -308.0624084472656, "logps/rejected": -344.2478942871094, "loss": 1.6719, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.004089848138391972, "rewards/margins": 0.2675096392631531, "rewards/rejected": -0.2634198069572449, "step": 180 }, { "epoch": 0.29, "learning_rate": 4.474571359287791e-06, "logits/chosen": 0.15207740664482117, "logits/rejected": 0.211543008685112, "logps/chosen": -336.4064636230469, "logps/rejected": -340.141357421875, "loss": 1.6014, "rewards/accuracies": 0.6875, "rewards/chosen": -0.01036889385432005, "rewards/margins": 0.2858952283859253, "rewards/rejected": -0.296264111995697, "step": 190 }, { "epoch": 0.3, "learning_rate": 4.3901662051233755e-06, "logits/chosen": 0.1840183436870575, "logits/rejected": 0.22152027487754822, "logps/chosen": -404.16021728515625, "logps/rejected": -356.364013671875, "loss": 1.7463, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.03463779762387276, "rewards/margins": 0.25136780738830566, "rewards/rejected": -0.21672996878623962, "step": 200 }, { "epoch": 0.32, "learning_rate": 4.30040316949064e-06, "logits/chosen": 0.1487782895565033, "logits/rejected": 0.20272579789161682, "logps/chosen": -347.5115966796875, "logps/rejected": -344.53753662109375, "loss": 1.6848, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.019732611253857613, "rewards/margins": 0.21993084251880646, "rewards/rejected": -0.2001982480287552, "step": 210 }, { "epoch": 0.34, "learning_rate": 4.205536695466524e-06, "logits/chosen": 0.11921755224466324, "logits/rejected": 0.16543138027191162, "logps/chosen": -302.01861572265625, "logps/rejected": -345.60491943359375, "loss": 1.7084, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0038991228211671114, "rewards/margins": 0.31049156188964844, "rewards/rejected": -0.3143906593322754, "step": 220 }, { "epoch": 0.35, "learning_rate": 4.105835692378557e-06, "logits/chosen": 0.13227376341819763, "logits/rejected": 0.17412447929382324, "logps/chosen": -337.3625793457031, "logps/rejected": -365.3255615234375, "loss": 1.6995, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05614470690488815, "rewards/margins": 0.28102201223373413, "rewards/rejected": -0.3371667265892029, "step": 230 }, { "epoch": 0.37, "learning_rate": 4.001582773552153e-06, "logits/chosen": 0.13456036150455475, "logits/rejected": 0.22667856514453888, "logps/chosen": -403.4490966796875, "logps/rejected": -408.80413818359375, "loss": 1.4862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.025508727878332138, "rewards/margins": 0.3425619602203369, "rewards/rejected": -0.36807072162628174, "step": 240 }, { "epoch": 0.38, "learning_rate": 3.893073455212438e-06, "logits/chosen": 0.13273295760154724, "logits/rejected": 0.21381357312202454, "logps/chosen": -335.8673095703125, "logps/rejected": -351.50103759765625, "loss": 1.5854, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09447745978832245, "rewards/margins": 0.27237147092819214, "rewards/rejected": -0.3668489158153534, "step": 250 }, { "epoch": 0.4, "learning_rate": 3.7806153188114027e-06, "logits/chosen": 0.18772640824317932, "logits/rejected": 0.20650401711463928, "logps/chosen": -320.12945556640625, "logps/rejected": -362.57281494140625, "loss": 1.7323, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.056845515966415405, "rewards/margins": 0.2855423092842102, "rewards/rejected": -0.342387855052948, "step": 260 }, { "epoch": 0.41, "learning_rate": 3.6645271391548542e-06, "logits/chosen": 0.154958575963974, "logits/rejected": 0.19303588569164276, "logps/chosen": -360.35443115234375, "logps/rejected": -359.91497802734375, "loss": 1.6388, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.005552726797759533, "rewards/margins": 0.27676287293434143, "rewards/rejected": -0.2712101340293884, "step": 270 }, { "epoch": 0.43, "learning_rate": 3.5451379808006014e-06, "logits/chosen": 0.1470947563648224, "logits/rejected": 0.19554203748703003, "logps/chosen": -343.96466064453125, "logps/rejected": -351.0820007324219, "loss": 1.6004, "rewards/accuracies": 0.75, "rewards/chosen": 0.018333502113819122, "rewards/margins": 0.3293381631374359, "rewards/rejected": -0.3110046684741974, "step": 280 }, { "epoch": 0.44, "learning_rate": 3.4227862652892106e-06, "logits/chosen": 0.18207962810993195, "logits/rejected": 0.25772327184677124, "logps/chosen": -379.73248291015625, "logps/rejected": -392.091552734375, "loss": 1.631, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0037728759925812483, "rewards/margins": 0.3134722113609314, "rewards/rejected": -0.3172450661659241, "step": 290 }, { "epoch": 0.46, "learning_rate": 3.2978188118513814e-06, "logits/chosen": 0.18880879878997803, "logits/rejected": 0.22835353016853333, "logps/chosen": -318.1977844238281, "logps/rejected": -360.68096923828125, "loss": 1.6509, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.051277369260787964, "rewards/margins": 0.28354349732398987, "rewards/rejected": -0.33482086658477783, "step": 300 }, { "epoch": 0.47, "learning_rate": 3.1705898543111576e-06, "logits/chosen": 0.1640356034040451, "logits/rejected": 0.2013256549835205, "logps/chosen": -345.88494873046875, "logps/rejected": -396.95489501953125, "loss": 1.5511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03583758696913719, "rewards/margins": 0.3143077492713928, "rewards/rejected": -0.35014528036117554, "step": 310 }, { "epoch": 0.49, "learning_rate": 3.041460036971664e-06, "logits/chosen": 0.10814084857702255, "logits/rejected": 0.17361339926719666, "logps/chosen": -331.90240478515625, "logps/rejected": -345.795166015625, "loss": 1.6032, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.06420620530843735, "rewards/margins": 0.19957861304283142, "rewards/rejected": -0.26378482580184937, "step": 320 }, { "epoch": 0.5, "learning_rate": 2.910795392329649e-06, "logits/chosen": 0.13951388001441956, "logits/rejected": 0.19447830319404602, "logps/chosen": -364.3550720214844, "logps/rejected": -359.9488830566406, "loss": 1.595, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.010126419365406036, "rewards/margins": 0.31530141830444336, "rewards/rejected": -0.3254278302192688, "step": 330 }, { "epoch": 0.52, "learning_rate": 2.7789663035166035e-06, "logits/chosen": 0.1637967824935913, "logits/rejected": 0.15295840799808502, "logps/chosen": -340.40386962890625, "logps/rejected": -370.7852478027344, "loss": 1.5882, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09541453421115875, "rewards/margins": 0.31104880571365356, "rewards/rejected": -0.4064633250236511, "step": 340 }, { "epoch": 0.53, "learning_rate": 2.6463464544075344e-06, "logits/chosen": 0.14287754893302917, "logits/rejected": 0.21446409821510315, "logps/chosen": -355.376220703125, "logps/rejected": -388.11944580078125, "loss": 1.4669, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.019069373607635498, "rewards/margins": 0.37106311321258545, "rewards/rejected": -0.39013251662254333, "step": 350 }, { "epoch": 0.55, "learning_rate": 2.513311770373421e-06, "logits/chosen": 0.13659325242042542, "logits/rejected": 0.22452709078788757, "logps/chosen": -303.7241516113281, "logps/rejected": -364.5718688964844, "loss": 1.5243, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.022251833230257034, "rewards/margins": 0.3609643578529358, "rewards/rejected": -0.38321617245674133, "step": 360 }, { "epoch": 0.56, "learning_rate": 2.380239352679908e-06, "logits/chosen": 0.13927368819713593, "logits/rejected": 0.2216307818889618, "logps/chosen": -325.48150634765625, "logps/rejected": -382.0296325683594, "loss": 1.5408, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0005920141702517867, "rewards/margins": 0.3364425599575043, "rewards/rejected": -0.3358505666255951, "step": 370 }, { "epoch": 0.58, "learning_rate": 2.247506409552795e-06, "logits/chosen": 0.15144166350364685, "logits/rejected": 0.20430748164653778, "logps/chosen": -369.327880859375, "logps/rejected": -383.3594055175781, "loss": 1.6408, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.05453786998987198, "rewards/margins": 0.41130223870277405, "rewards/rejected": -0.46584004163742065, "step": 380 }, { "epoch": 0.59, "learning_rate": 2.1154891869403436e-06, "logits/chosen": 0.11367179453372955, "logits/rejected": 0.20442676544189453, "logps/chosen": -361.3442077636719, "logps/rejected": -390.0636291503906, "loss": 1.641, "rewards/accuracies": 0.71875, "rewards/chosen": -0.013566520996391773, "rewards/margins": 0.45338043570518494, "rewards/rejected": -0.4669469892978668, "step": 390 }, { "epoch": 0.61, "learning_rate": 1.9845619020032552e-06, "logits/chosen": 0.15614674985408783, "logits/rejected": 0.20679621398448944, "logps/chosen": -328.5157165527344, "logps/rejected": -368.5887451171875, "loss": 1.6521, "rewards/accuracies": 0.6875, "rewards/chosen": -0.036636289209127426, "rewards/margins": 0.31485632061958313, "rewards/rejected": -0.35149258375167847, "step": 400 }, { "epoch": 0.62, "learning_rate": 1.8550956823554708e-06, "logits/chosen": 0.12708225846290588, "logits/rejected": 0.21543464064598083, "logps/chosen": -377.3260498046875, "logps/rejected": -364.48870849609375, "loss": 1.5909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0568900890648365, "rewards/margins": 0.30653566122055054, "rewards/rejected": -0.36342576146125793, "step": 410 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": 0.20352402329444885, "logits/rejected": 0.27957600355148315, "logps/chosen": -347.7279357910156, "logps/rejected": -355.5521545410156, "loss": 1.6226, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.06915120780467987, "rewards/margins": 0.2660463750362396, "rewards/rejected": -0.3351975977420807, "step": 420 }, { "epoch": 0.66, "learning_rate": 1.6020092013802002e-06, "logits/chosen": 0.14161694049835205, "logits/rejected": 0.22023169696331024, "logps/chosen": -323.6744689941406, "logps/rejected": -365.5609130859375, "loss": 1.5258, "rewards/accuracies": 0.71875, "rewards/chosen": -0.026104014366865158, "rewards/margins": 0.40274888277053833, "rewards/rejected": -0.4288528859615326, "step": 430 }, { "epoch": 0.67, "learning_rate": 1.4791063411799938e-06, "logits/chosen": 0.20196688175201416, "logits/rejected": 0.22374701499938965, "logps/chosen": -346.626220703125, "logps/rejected": -398.343994140625, "loss": 1.6026, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14315533638000488, "rewards/margins": 0.2796045243740082, "rewards/rejected": -0.42275986075401306, "step": 440 }, { "epoch": 0.69, "learning_rate": 1.3590973149722103e-06, "logits/chosen": 0.16043411195278168, "logits/rejected": 0.24400117993354797, "logps/chosen": -350.2712097167969, "logps/rejected": -377.76129150390625, "loss": 1.6442, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05976473540067673, "rewards/margins": 0.2989664673805237, "rewards/rejected": -0.3587311804294586, "step": 450 }, { "epoch": 0.7, "learning_rate": 1.2423223013801946e-06, "logits/chosen": 0.14352941513061523, "logits/rejected": 0.24110408127307892, "logps/chosen": -367.91851806640625, "logps/rejected": -397.81072998046875, "loss": 1.6837, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.08046244829893112, "rewards/margins": 0.36456337571144104, "rewards/rejected": -0.44502583146095276, "step": 460 }, { "epoch": 0.72, "learning_rate": 1.1291123118671665e-06, "logits/chosen": 0.0973966121673584, "logits/rejected": 0.18068069219589233, "logps/chosen": -339.65704345703125, "logps/rejected": -340.48236083984375, "loss": 1.6314, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.06265803426504135, "rewards/margins": 0.30512723326683044, "rewards/rejected": -0.3677853047847748, "step": 470 }, { "epoch": 0.73, "learning_rate": 1.019788252448267e-06, "logits/chosen": 0.17376969754695892, "logits/rejected": 0.21862807869911194, "logps/chosen": -355.0315856933594, "logps/rejected": -376.3943786621094, "loss": 1.5767, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.02587694302201271, "rewards/margins": 0.3213956654071808, "rewards/rejected": -0.3472725749015808, "step": 480 }, { "epoch": 0.75, "learning_rate": 9.146600140475945e-07, "logits/chosen": 0.1421867460012436, "logits/rejected": 0.23107881844043732, "logps/chosen": -391.0975341796875, "logps/rejected": -383.6336975097656, "loss": 1.7495, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10866842418909073, "rewards/margins": 0.2634437382221222, "rewards/rejected": -0.3721121549606323, "step": 490 }, { "epoch": 0.76, "learning_rate": 8.140255940787059e-07, "logits/chosen": 0.13602428138256073, "logits/rejected": 0.23974844813346863, "logps/chosen": -341.78582763671875, "logps/rejected": -399.82904052734375, "loss": 1.5854, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.04695357754826546, "rewards/margins": 0.3549908697605133, "rewards/rejected": -0.40194445848464966, "step": 500 }, { "epoch": 0.78, "learning_rate": 7.181702517385789e-07, "logits/chosen": 0.170148104429245, "logits/rejected": 0.21931186318397522, "logps/chosen": -323.8975524902344, "logps/rejected": -348.66766357421875, "loss": 1.7339, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07490874826908112, "rewards/margins": 0.3289056718349457, "rewards/rejected": -0.4038144052028656, "step": 510 }, { "epoch": 0.79, "learning_rate": 6.273656994094232e-07, "logits/chosen": 0.17631427943706512, "logits/rejected": 0.23277851939201355, "logps/chosen": -345.8653259277344, "logps/rejected": -342.0807800292969, "loss": 1.6504, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02272309735417366, "rewards/margins": 0.33489790558815, "rewards/rejected": -0.3576210141181946, "step": 520 }, { "epoch": 0.81, "learning_rate": 5.418693324604082e-07, "logits/chosen": 0.1863461136817932, "logits/rejected": 0.25381818413734436, "logps/chosen": -358.6033630371094, "logps/rejected": -392.04302978515625, "loss": 1.542, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.001642666757106781, "rewards/margins": 0.41695213317871094, "rewards/rejected": -0.41530942916870117, "step": 530 }, { "epoch": 0.82, "learning_rate": 4.619234996325314e-07, "logits/chosen": 0.11545145511627197, "logits/rejected": 0.20592764019966125, "logps/chosen": -349.9122619628906, "logps/rejected": -408.61590576171875, "loss": 1.5374, "rewards/accuracies": 0.75, "rewards/chosen": 8.928254101192579e-05, "rewards/margins": 0.40838712453842163, "rewards/rejected": -0.4082978367805481, "step": 540 }, { "epoch": 0.84, "learning_rate": 3.877548160747768e-07, "logits/chosen": 0.12814117968082428, "logits/rejected": 0.19134709239006042, "logps/chosen": -337.3287658691406, "logps/rejected": -354.94415283203125, "loss": 1.6835, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.006277731154114008, "rewards/margins": 0.3102794587612152, "rewards/rejected": -0.30400174856185913, "step": 550 }, { "epoch": 0.85, "learning_rate": 3.195735209788528e-07, "logits/chosen": 0.1329401135444641, "logits/rejected": 0.2162102907896042, "logps/chosen": -341.5213928222656, "logps/rejected": -338.03179931640625, "loss": 1.6469, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0008542388677597046, "rewards/margins": 0.3375299572944641, "rewards/rejected": -0.338384211063385, "step": 560 }, { "epoch": 0.87, "learning_rate": 2.5757288163336806e-07, "logits/chosen": 0.1493878811597824, "logits/rejected": 0.20596106350421906, "logps/chosen": -352.513916015625, "logps/rejected": -391.4380798339844, "loss": 1.6831, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.06621219962835312, "rewards/margins": 0.2802005708217621, "rewards/rejected": -0.3464128077030182, "step": 570 }, { "epoch": 0.88, "learning_rate": 2.019286455866981e-07, "logits/chosen": 0.1281604915857315, "logits/rejected": 0.19645507633686066, "logps/chosen": -302.35040283203125, "logps/rejected": -354.00372314453125, "loss": 1.6607, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.044999100267887115, "rewards/margins": 0.3108140826225281, "rewards/rejected": -0.3558131754398346, "step": 580 }, { "epoch": 0.9, "learning_rate": 1.5279854247146703e-07, "logits/chosen": 0.1600816547870636, "logits/rejected": 0.2620231509208679, "logps/chosen": -363.3172607421875, "logps/rejected": -383.7359619140625, "loss": 1.5077, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.035707950592041016, "rewards/margins": 0.3358200192451477, "rewards/rejected": -0.3715279698371887, "step": 590 }, { "epoch": 0.91, "learning_rate": 1.1032183690276754e-07, "logits/chosen": 0.1881883442401886, "logits/rejected": 0.23025290668010712, "logps/chosen": -348.2078552246094, "logps/rejected": -356.3308410644531, "loss": 1.4724, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.01992412842810154, "rewards/margins": 0.35621514916419983, "rewards/rejected": -0.37613925337791443, "step": 600 }, { "epoch": 0.93, "learning_rate": 7.46189337174788e-08, "logits/chosen": 0.16047361493110657, "logits/rejected": 0.21806029975414276, "logps/chosen": -338.9239196777344, "logps/rejected": -370.13238525390625, "loss": 1.5501, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.0008598908898420632, "rewards/margins": 0.35586023330688477, "rewards/rejected": -0.35672011971473694, "step": 610 }, { "epoch": 0.94, "learning_rate": 4.579103667367385e-08, "logits/chosen": 0.1737244576215744, "logits/rejected": 0.2040444165468216, "logps/chosen": -367.3244323730469, "logps/rejected": -375.1661071777344, "loss": 1.6325, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.08697754144668579, "rewards/margins": 0.260633647441864, "rewards/rejected": -0.3476111590862274, "step": 620 }, { "epoch": 0.96, "learning_rate": 2.3919861577572924e-08, "logits/chosen": 0.17082975804805756, "logits/rejected": 0.2609696090221405, "logps/chosen": -356.7315979003906, "logps/rejected": -364.6842041015625, "loss": 1.6992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03417596220970154, "rewards/margins": 0.30746400356292725, "rewards/rejected": -0.34163999557495117, "step": 630 }, { "epoch": 0.98, "learning_rate": 9.067404651211808e-09, "logits/chosen": 0.07360972464084625, "logits/rejected": 0.17394272983074188, "logps/chosen": -343.9101867675781, "logps/rejected": -367.1744079589844, "loss": 1.4701, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.053998060524463654, "rewards/margins": 0.355236679315567, "rewards/rejected": -0.4092347025871277, "step": 640 }, { "epoch": 0.99, "learning_rate": 1.2757667974155896e-09, "logits/chosen": 0.16294406354427338, "logits/rejected": 0.23806321620941162, "logps/chosen": -380.12554931640625, "logps/rejected": -385.9973449707031, "loss": 1.6559, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.054873187094926834, "rewards/margins": 0.2882222533226013, "rewards/rejected": -0.34309545159339905, "step": 650 }, { "epoch": 1.0, "step": 656, "total_flos": 0.0, "train_loss": 1.6983561014256827, "train_runtime": 7833.1099, "train_samples_per_second": 2.681, "train_steps_per_second": 0.084 } ], "logging_steps": 10, "max_steps": 656, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }