{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 8706, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00034458993797381116, "grad_norm": 1.5267287492752075, "learning_rate": 5.74052812858783e-11, "logits/chosen": -3.024087429046631, "logits/rejected": -2.988196611404419, "logps/chosen": -47.308799743652344, "logps/rejected": -44.131954193115234, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0034458993797381117, "grad_norm": 1.6058450937271118, "learning_rate": 5.74052812858783e-10, "logits/chosen": -3.0891964435577393, "logits/rejected": -3.0714855194091797, "logps/chosen": -51.68312454223633, "logps/rejected": -51.65239715576172, "loss": 0.6932, "rewards/accuracies": 0.4409722089767456, "rewards/chosen": -5.2873670938424766e-05, "rewards/margins": -1.918851376103703e-05, "rewards/rejected": -3.368516263435595e-05, "step": 10 }, { "epoch": 0.006891798759476223, "grad_norm": 1.4658929109573364, "learning_rate": 1.148105625717566e-09, "logits/chosen": -3.098442554473877, "logits/rejected": -3.0707199573516846, "logps/chosen": -56.02653884887695, "logps/rejected": -54.55193328857422, "loss": 0.6932, "rewards/accuracies": 0.515625, "rewards/chosen": 7.602188270539045e-05, "rewards/margins": -1.7312861018581316e-05, "rewards/rejected": 9.333473281003535e-05, "step": 20 }, { "epoch": 0.010337698139214336, "grad_norm": 1.8542293310165405, "learning_rate": 1.7221584385763488e-09, "logits/chosen": -3.127657175064087, "logits/rejected": -3.104579210281372, "logps/chosen": -55.351768493652344, "logps/rejected": -52.82017135620117, "loss": 0.6933, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -2.499646598153049e-06, "rewards/margins": -0.00020936639339197427, "rewards/rejected": 0.00020686673815362155, "step": 30 }, { "epoch": 0.013783597518952447, "grad_norm": 1.5636777877807617, "learning_rate": 2.296211251435132e-09, "logits/chosen": -3.1053519248962402, "logits/rejected": -3.07869029045105, "logps/chosen": -56.423561096191406, "logps/rejected": -53.66264724731445, "loss": 0.6931, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 9.274556214222685e-05, "rewards/margins": 0.00011847207497339696, "rewards/rejected": -2.5726534659042954e-05, "step": 40 }, { "epoch": 0.01722949689869056, "grad_norm": 1.743420958518982, "learning_rate": 2.870264064293915e-09, "logits/chosen": -3.0816597938537598, "logits/rejected": -3.043335437774658, "logps/chosen": -54.714881896972656, "logps/rejected": -51.24363327026367, "loss": 0.6931, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -6.899209984112531e-05, "rewards/margins": 8.803128366707824e-06, "rewards/rejected": -7.779523002682254e-05, "step": 50 }, { "epoch": 0.02067539627842867, "grad_norm": 1.7985620498657227, "learning_rate": 3.4443168771526976e-09, "logits/chosen": -3.0938124656677246, "logits/rejected": -3.0741233825683594, "logps/chosen": -54.38738250732422, "logps/rejected": -53.95930862426758, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -7.798677688697353e-05, "rewards/margins": -5.248068077889911e-07, "rewards/rejected": -7.746197661617771e-05, "step": 60 }, { "epoch": 0.024121295658166782, "grad_norm": 1.7512099742889404, "learning_rate": 4.018369690011481e-09, "logits/chosen": -3.111898899078369, "logits/rejected": -3.095719575881958, "logps/chosen": -54.67177200317383, "logps/rejected": -53.773719787597656, "loss": 0.693, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 0.00018034051754511893, "rewards/margins": 0.000329008384142071, "rewards/rejected": -0.0001486678229412064, "step": 70 }, { "epoch": 0.027567195037904894, "grad_norm": 1.5692384243011475, "learning_rate": 4.592422502870264e-09, "logits/chosen": -3.0583581924438477, "logits/rejected": -3.0390734672546387, "logps/chosen": -53.05257034301758, "logps/rejected": -53.555763244628906, "loss": 0.6931, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 4.451820132089779e-05, "rewards/margins": 7.756784179946408e-05, "rewards/rejected": -3.3049644116545096e-05, "step": 80 }, { "epoch": 0.031013094417643005, "grad_norm": 1.7685738801956177, "learning_rate": 5.166475315729047e-09, "logits/chosen": -3.0694994926452637, "logits/rejected": -3.0503792762756348, "logps/chosen": -56.342140197753906, "logps/rejected": -52.5681266784668, "loss": 0.693, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 2.987789957842324e-05, "rewards/margins": 0.0001999745873035863, "rewards/rejected": -0.0001700967113720253, "step": 90 }, { "epoch": 0.03445899379738112, "grad_norm": 1.7545464038848877, "learning_rate": 5.74052812858783e-09, "logits/chosen": -3.0917317867279053, "logits/rejected": -3.062188148498535, "logps/chosen": -56.145225524902344, "logps/rejected": -53.62772750854492, "loss": 0.6931, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 7.897531759226695e-05, "rewards/margins": 5.607344792224467e-05, "rewards/rejected": 2.290186057507526e-05, "step": 100 }, { "epoch": 0.03445899379738112, "eval_logits/chosen": -3.1631908416748047, "eval_logits/rejected": -3.1575169563293457, "eval_logps/chosen": -58.71397399902344, "eval_logps/rejected": -63.17209243774414, "eval_loss": 0.6931983828544617, "eval_rewards/accuracies": 0.4828066825866699, "eval_rewards/chosen": -2.0795272575924173e-05, "eval_rewards/margins": -0.00010116487828781828, "eval_rewards/rejected": 8.036960934987292e-05, "eval_runtime": 382.9886, "eval_samples_per_second": 11.238, "eval_steps_per_second": 1.405, "step": 100 }, { "epoch": 0.03790489317711923, "grad_norm": 1.6116759777069092, "learning_rate": 6.314580941446612e-09, "logits/chosen": -3.0325913429260254, "logits/rejected": -3.0126051902770996, "logps/chosen": -52.730247497558594, "logps/rejected": -54.299835205078125, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 1.509770481789019e-05, "rewards/margins": -1.9921517377952114e-05, "rewards/rejected": 3.501922401483171e-05, "step": 110 }, { "epoch": 0.04135079255685734, "grad_norm": 1.5366730690002441, "learning_rate": 6.888633754305395e-09, "logits/chosen": -3.037839889526367, "logits/rejected": -3.007537841796875, "logps/chosen": -52.4300537109375, "logps/rejected": -51.04934310913086, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 1.8376089428784326e-05, "rewards/margins": 1.4738890058652032e-05, "rewards/rejected": 3.6371864098327933e-06, "step": 120 }, { "epoch": 0.044796691936595454, "grad_norm": 1.714571475982666, "learning_rate": 7.462686567164179e-09, "logits/chosen": -3.1060352325439453, "logits/rejected": -3.0896782875061035, "logps/chosen": -53.569786071777344, "logps/rejected": -53.855125427246094, "loss": 0.6931, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 4.835512299905531e-05, "rewards/margins": 2.531794052629266e-05, "rewards/rejected": 2.303719156770967e-05, "step": 130 }, { "epoch": 0.048242591316333565, "grad_norm": 1.8579405546188354, "learning_rate": 8.036739380022962e-09, "logits/chosen": -3.0789341926574707, "logits/rejected": -3.0528130531311035, "logps/chosen": -55.37175369262695, "logps/rejected": -53.847442626953125, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -5.393805986386724e-05, "rewards/margins": -0.00017780621419660747, "rewards/rejected": 0.00012386815797071904, "step": 140 }, { "epoch": 0.051688490696071676, "grad_norm": 1.6249550580978394, "learning_rate": 8.610792192881745e-09, "logits/chosen": -3.024933338165283, "logits/rejected": -3.0134525299072266, "logps/chosen": -54.112518310546875, "logps/rejected": -54.11296463012695, "loss": 0.6932, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": -3.113759885309264e-05, "rewards/margins": -3.104945062659681e-05, "rewards/rejected": -8.81467698832239e-08, "step": 150 }, { "epoch": 0.05513439007580979, "grad_norm": 1.6739519834518433, "learning_rate": 9.184845005740529e-09, "logits/chosen": -3.0443265438079834, "logits/rejected": -3.0283732414245605, "logps/chosen": -54.01426315307617, "logps/rejected": -51.27742385864258, "loss": 0.6931, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -5.676864930137526e-06, "rewards/margins": 9.094272536458448e-05, "rewards/rejected": -9.661958029028028e-05, "step": 160 }, { "epoch": 0.0585802894555479, "grad_norm": 1.6702892780303955, "learning_rate": 9.758897818599312e-09, "logits/chosen": -3.044332981109619, "logits/rejected": -3.021695613861084, "logps/chosen": -53.786903381347656, "logps/rejected": -52.07037353515625, "loss": 0.6932, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": -7.057005859678611e-05, "rewards/margins": -9.588641114532948e-05, "rewards/rejected": 2.531635072955396e-05, "step": 170 }, { "epoch": 0.06202618883528601, "grad_norm": 1.665608286857605, "learning_rate": 1.0332950631458094e-08, "logits/chosen": -3.05308198928833, "logits/rejected": -3.021799325942993, "logps/chosen": -55.39881134033203, "logps/rejected": -52.055274963378906, "loss": 0.693, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -3.2503273814654676e-06, "rewards/margins": 0.0001963514368981123, "rewards/rejected": -0.00019960175268352032, "step": 180 }, { "epoch": 0.06547208821502412, "grad_norm": 1.5228668451309204, "learning_rate": 1.0907003444316877e-08, "logits/chosen": -3.1539382934570312, "logits/rejected": -3.126840591430664, "logps/chosen": -52.94257736206055, "logps/rejected": -51.77439498901367, "loss": 0.6931, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 1.8653812730917707e-05, "rewards/margins": 0.00010460461635375395, "rewards/rejected": -8.595079270889983e-05, "step": 190 }, { "epoch": 0.06891798759476224, "grad_norm": 1.7340269088745117, "learning_rate": 1.148105625717566e-08, "logits/chosen": -3.085669994354248, "logits/rejected": -3.065054178237915, "logps/chosen": -54.360809326171875, "logps/rejected": -53.9770622253418, "loss": 0.6932, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -8.438022632617503e-05, "rewards/margins": -0.0001881776552181691, "rewards/rejected": 0.00010379742889199406, "step": 200 }, { "epoch": 0.06891798759476224, "eval_logits/chosen": -3.163332939147949, "eval_logits/rejected": -3.157672643661499, "eval_logps/chosen": -58.71128463745117, "eval_logps/rejected": -63.170894622802734, "eval_loss": 0.6931909322738647, "eval_rewards/accuracies": 0.4693308472633362, "eval_rewards/chosen": 6.1291966630960815e-06, "eval_rewards/margins": -8.617867570137605e-05, "eval_rewards/rejected": 9.230787691194564e-05, "eval_runtime": 383.39, "eval_samples_per_second": 11.226, "eval_steps_per_second": 1.403, "step": 200 }, { "epoch": 0.07236388697450034, "grad_norm": 1.7234423160552979, "learning_rate": 1.2055109070034444e-08, "logits/chosen": -3.067368268966675, "logits/rejected": -3.0613369941711426, "logps/chosen": -52.45257568359375, "logps/rejected": -54.711036682128906, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 3.1624855182599276e-05, "rewards/margins": -6.70813606120646e-05, "rewards/rejected": 9.870620851870626e-05, "step": 210 }, { "epoch": 0.07580978635423846, "grad_norm": 1.7323672771453857, "learning_rate": 1.2629161882893224e-08, "logits/chosen": -3.1108169555664062, "logits/rejected": -3.086294651031494, "logps/chosen": -53.69478225708008, "logps/rejected": -53.714332580566406, "loss": 0.6931, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 3.151056444039568e-05, "rewards/margins": 0.00012868284829892218, "rewards/rejected": -9.717225475469604e-05, "step": 220 }, { "epoch": 0.07925568573397657, "grad_norm": 1.6582297086715698, "learning_rate": 1.3203214695752007e-08, "logits/chosen": -3.0402538776397705, "logits/rejected": -3.014230966567993, "logps/chosen": -56.13749313354492, "logps/rejected": -53.79499053955078, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00010608303273329511, "rewards/margins": 0.00011148623889312148, "rewards/rejected": -5.4032047955843154e-06, "step": 230 }, { "epoch": 0.08270158511371468, "grad_norm": 1.8092167377471924, "learning_rate": 1.377726750861079e-08, "logits/chosen": -3.0465779304504395, "logits/rejected": -3.028172731399536, "logps/chosen": -53.755638122558594, "logps/rejected": -55.17675018310547, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -3.250144800404087e-05, "rewards/margins": -6.0262616898398846e-05, "rewards/rejected": 2.776115979941096e-05, "step": 240 }, { "epoch": 0.08614748449345279, "grad_norm": 1.5629115104675293, "learning_rate": 1.4351320321469574e-08, "logits/chosen": -2.986212968826294, "logits/rejected": -2.9469199180603027, "logps/chosen": -57.78411865234375, "logps/rejected": -51.483917236328125, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.4326238670037128e-05, "rewards/margins": 0.00018755378550849855, "rewards/rejected": -0.00020188004418741912, "step": 250 }, { "epoch": 0.08959338387319091, "grad_norm": 1.581558108329773, "learning_rate": 1.4925373134328357e-08, "logits/chosen": -3.0338284969329834, "logits/rejected": -3.010404586791992, "logps/chosen": -57.34649658203125, "logps/rejected": -51.81713104248047, "loss": 0.6932, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": -5.560139470617287e-05, "rewards/margins": -7.648382052138913e-06, "rewards/rejected": -4.79530353914015e-05, "step": 260 }, { "epoch": 0.09303928325292901, "grad_norm": 1.5749309062957764, "learning_rate": 1.549942594718714e-08, "logits/chosen": -3.0476455688476562, "logits/rejected": -3.018409252166748, "logps/chosen": -54.27478790283203, "logps/rejected": -52.06067657470703, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 1.1507063391036354e-05, "rewards/margins": 9.997030429076403e-05, "rewards/rejected": -8.846323180478066e-05, "step": 270 }, { "epoch": 0.09648518263266713, "grad_norm": 1.7571849822998047, "learning_rate": 1.6073478760045924e-08, "logits/chosen": -3.086862802505493, "logits/rejected": -3.0739307403564453, "logps/chosen": -52.846221923828125, "logps/rejected": -53.468536376953125, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -5.004048580303788e-05, "rewards/margins": -2.9615650419145823e-05, "rewards/rejected": -2.0424817193998024e-05, "step": 280 }, { "epoch": 0.09993108201240523, "grad_norm": 1.4686821699142456, "learning_rate": 1.6647531572904707e-08, "logits/chosen": -3.0487747192382812, "logits/rejected": -3.041539430618286, "logps/chosen": -51.24309158325195, "logps/rejected": -53.52986526489258, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -7.97271859482862e-05, "rewards/margins": -0.00010071504220832139, "rewards/rejected": 2.098785807902459e-05, "step": 290 }, { "epoch": 0.10337698139214335, "grad_norm": 1.781776785850525, "learning_rate": 1.722158438576349e-08, "logits/chosen": -3.037935495376587, "logits/rejected": -3.01487135887146, "logps/chosen": -54.37398147583008, "logps/rejected": -55.74623489379883, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 7.359846495091915e-05, "rewards/margins": 0.0002631952229421586, "rewards/rejected": -0.00018959671433549374, "step": 300 }, { "epoch": 0.10337698139214335, "eval_logits/chosen": -3.163011074066162, "eval_logits/rejected": -3.1573681831359863, "eval_logps/chosen": -58.71122741699219, "eval_logps/rejected": -63.172950744628906, "eval_loss": 0.6931803822517395, "eval_rewards/accuracies": 0.47606876492500305, "eval_rewards/chosen": 6.681264494545758e-06, "eval_rewards/margins": -6.501571624539793e-05, "eval_rewards/rejected": 7.169699529185891e-05, "eval_runtime": 382.9433, "eval_samples_per_second": 11.239, "eval_steps_per_second": 1.405, "step": 300 }, { "epoch": 0.10682288077188146, "grad_norm": 1.6823004484176636, "learning_rate": 1.7795637198622274e-08, "logits/chosen": -3.0697779655456543, "logits/rejected": -3.055175304412842, "logps/chosen": -53.7920036315918, "logps/rejected": -53.2698974609375, "loss": 0.6931, "rewards/accuracies": 0.515625, "rewards/chosen": -1.945759777299827e-06, "rewards/margins": 5.7572568039176986e-05, "rewards/rejected": -5.9518333728192374e-05, "step": 310 }, { "epoch": 0.11026878015161957, "grad_norm": 1.8053383827209473, "learning_rate": 1.8369690011481057e-08, "logits/chosen": -3.116947650909424, "logits/rejected": -3.098017692565918, "logps/chosen": -53.384422302246094, "logps/rejected": -52.58729934692383, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 4.022657230962068e-05, "rewards/margins": 1.1503079804242589e-05, "rewards/rejected": 2.8723463401547633e-05, "step": 320 }, { "epoch": 0.11371467953135768, "grad_norm": 1.604368805885315, "learning_rate": 1.894374282433984e-08, "logits/chosen": -3.0587456226348877, "logits/rejected": -3.0564169883728027, "logps/chosen": -53.051727294921875, "logps/rejected": -53.55405807495117, "loss": 0.6931, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -2.6459397304279264e-06, "rewards/margins": 0.00011294658906990662, "rewards/rejected": -0.00011559253471205011, "step": 330 }, { "epoch": 0.1171605789110958, "grad_norm": 1.6719446182250977, "learning_rate": 1.9517795637198624e-08, "logits/chosen": -3.0025782585144043, "logits/rejected": -2.9876708984375, "logps/chosen": -53.480812072753906, "logps/rejected": -54.1949462890625, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.00011401448864489794, "rewards/margins": -9.441097790841013e-05, "rewards/rejected": -1.960352892638184e-05, "step": 340 }, { "epoch": 0.1206064782908339, "grad_norm": 1.603629231452942, "learning_rate": 2.0091848450057404e-08, "logits/chosen": -3.1061129570007324, "logits/rejected": -3.076988935470581, "logps/chosen": -57.39946365356445, "logps/rejected": -51.76738739013672, "loss": 0.6931, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 5.912123015150428e-05, "rewards/margins": 0.00012035444524371997, "rewards/rejected": -6.12332223681733e-05, "step": 350 }, { "epoch": 0.12405237767057202, "grad_norm": 1.689586877822876, "learning_rate": 2.0665901262916187e-08, "logits/chosen": -3.044360399246216, "logits/rejected": -3.0278127193450928, "logps/chosen": -53.98747634887695, "logps/rejected": -54.560401916503906, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -4.570212331600487e-05, "rewards/margins": 7.963169628055766e-05, "rewards/rejected": -0.00012533381232060492, "step": 360 }, { "epoch": 0.12749827705031014, "grad_norm": 1.6778897047042847, "learning_rate": 2.123995407577497e-08, "logits/chosen": -3.083096504211426, "logits/rejected": -3.057722568511963, "logps/chosen": -55.63151931762695, "logps/rejected": -53.124488830566406, "loss": 0.6931, "rewards/accuracies": 0.515625, "rewards/chosen": -6.956577999517322e-05, "rewards/margins": 7.417744200211018e-05, "rewards/rejected": -0.00014374320744536817, "step": 370 }, { "epoch": 0.13094417643004824, "grad_norm": 1.798614740371704, "learning_rate": 2.1814006888633754e-08, "logits/chosen": -3.1230146884918213, "logits/rejected": -3.08950138092041, "logps/chosen": -55.228919982910156, "logps/rejected": -51.76960372924805, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -4.4379041355568916e-05, "rewards/margins": 3.588591061998159e-05, "rewards/rejected": -8.02649519755505e-05, "step": 380 }, { "epoch": 0.13439007580978635, "grad_norm": 1.6894241571426392, "learning_rate": 2.2388059701492537e-08, "logits/chosen": -3.096428394317627, "logits/rejected": -3.0692429542541504, "logps/chosen": -53.0290412902832, "logps/rejected": -51.69932174682617, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 6.04914384894073e-06, "rewards/margins": 0.00010031403508037329, "rewards/rejected": -9.426489850739017e-05, "step": 390 }, { "epoch": 0.13783597518952448, "grad_norm": 1.7336101531982422, "learning_rate": 2.296211251435132e-08, "logits/chosen": -3.044656753540039, "logits/rejected": -3.014923095703125, "logps/chosen": -54.45336151123047, "logps/rejected": -54.00486373901367, "loss": 0.693, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2634811355383135e-05, "rewards/margins": 0.0002166276826756075, "rewards/rejected": -0.00022926248493604362, "step": 400 }, { "epoch": 0.13783597518952448, "eval_logits/chosen": -3.163149356842041, "eval_logits/rejected": -3.157475233078003, "eval_logps/chosen": -58.69733810424805, "eval_logps/rejected": -63.15825271606445, "eval_loss": 0.6931843757629395, "eval_rewards/accuracies": 0.48420074582099915, "eval_rewards/chosen": 0.00014556771202478558, "eval_rewards/margins": -7.311295485123992e-05, "eval_rewards/rejected": 0.00021868068142794073, "eval_runtime": 383.1255, "eval_samples_per_second": 11.234, "eval_steps_per_second": 1.404, "step": 400 }, { "epoch": 0.14128187456926258, "grad_norm": 1.5207126140594482, "learning_rate": 2.3536165327210104e-08, "logits/chosen": -3.0870676040649414, "logits/rejected": -3.0602707862854004, "logps/chosen": -54.1017951965332, "logps/rejected": -53.16627883911133, "loss": 0.6931, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -9.258401405531913e-05, "rewards/margins": 0.00011996408284176141, "rewards/rejected": -0.00021254811144899577, "step": 410 }, { "epoch": 0.1447277739490007, "grad_norm": 1.7107670307159424, "learning_rate": 2.4110218140068887e-08, "logits/chosen": -3.0917537212371826, "logits/rejected": -3.07071590423584, "logps/chosen": -54.345314025878906, "logps/rejected": -51.79109573364258, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": -7.97007669461891e-05, "rewards/margins": 0.00023882141977082938, "rewards/rejected": -0.0003185221867170185, "step": 420 }, { "epoch": 0.1481736733287388, "grad_norm": 1.7153894901275635, "learning_rate": 2.4684270952927668e-08, "logits/chosen": -3.0317769050598145, "logits/rejected": -3.016413688659668, "logps/chosen": -51.86307907104492, "logps/rejected": -53.631927490234375, "loss": 0.6932, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.00022911284759175032, "rewards/margins": -0.00012028827040921897, "rewards/rejected": -0.00010882456263061613, "step": 430 }, { "epoch": 0.15161957270847692, "grad_norm": 1.5204572677612305, "learning_rate": 2.5258323765786448e-08, "logits/chosen": -3.075502872467041, "logits/rejected": -3.0622317790985107, "logps/chosen": -51.52375411987305, "logps/rejected": -52.54096221923828, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -8.013546175789088e-05, "rewards/margins": 0.0001495412434451282, "rewards/rejected": -0.0002296767197549343, "step": 440 }, { "epoch": 0.15506547208821503, "grad_norm": 1.5638824701309204, "learning_rate": 2.583237657864523e-08, "logits/chosen": -3.071061849594116, "logits/rejected": -3.0471391677856445, "logps/chosen": -56.325172424316406, "logps/rejected": -53.29718780517578, "loss": 0.6931, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -0.00013258328544907272, "rewards/margins": 0.00011050030298065394, "rewards/rejected": -0.0002430836029816419, "step": 450 }, { "epoch": 0.15851137146795313, "grad_norm": 1.6237907409667969, "learning_rate": 2.6406429391504014e-08, "logits/chosen": -3.0728936195373535, "logits/rejected": -3.0471415519714355, "logps/chosen": -52.68574905395508, "logps/rejected": -50.77009963989258, "loss": 0.6931, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00011142897710669786, "rewards/margins": 3.6573685065377504e-05, "rewards/rejected": -0.00014800265489611775, "step": 460 }, { "epoch": 0.16195727084769124, "grad_norm": 1.541483998298645, "learning_rate": 2.6980482204362798e-08, "logits/chosen": -3.145155429840088, "logits/rejected": -3.1183369159698486, "logps/chosen": -56.34538650512695, "logps/rejected": -54.72991943359375, "loss": 0.6929, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -7.881497367634438e-06, "rewards/margins": 0.00041124955168925226, "rewards/rejected": -0.00041913107270374894, "step": 470 }, { "epoch": 0.16540317022742937, "grad_norm": 1.7185354232788086, "learning_rate": 2.755453501722158e-08, "logits/chosen": -2.9122116565704346, "logits/rejected": -2.9051220417022705, "logps/chosen": -53.04082107543945, "logps/rejected": -55.65662384033203, "loss": 0.6931, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.00034471676917746663, "rewards/margins": 6.586960807908326e-05, "rewards/rejected": -0.0004105864209122956, "step": 480 }, { "epoch": 0.16884906960716747, "grad_norm": 1.8825846910476685, "learning_rate": 2.8128587830080364e-08, "logits/chosen": -3.1184704303741455, "logits/rejected": -3.090733289718628, "logps/chosen": -58.371986389160156, "logps/rejected": -53.67230987548828, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -0.00015857252583373338, "rewards/margins": 0.00014086466399021447, "rewards/rejected": -0.0002994372043758631, "step": 490 }, { "epoch": 0.17229496898690558, "grad_norm": 1.5853806734085083, "learning_rate": 2.8702640642939148e-08, "logits/chosen": -3.01218318939209, "logits/rejected": -2.986639976501465, "logps/chosen": -55.746246337890625, "logps/rejected": -52.217247009277344, "loss": 0.6931, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.0002750814310275018, "rewards/margins": 9.051460074260831e-05, "rewards/rejected": -0.0003655959735624492, "step": 500 }, { "epoch": 0.17229496898690558, "eval_logits/chosen": -3.1630947589874268, "eval_logits/rejected": -3.1575026512145996, "eval_logps/chosen": -58.687747955322266, "eval_logps/rejected": -63.159366607666016, "eval_loss": 0.6931309103965759, "eval_rewards/accuracies": 0.49326208233833313, "eval_rewards/chosen": 0.0002414963091723621, "eval_rewards/margins": 3.386462776688859e-05, "eval_rewards/rejected": 0.00020763167412951589, "eval_runtime": 383.1279, "eval_samples_per_second": 11.234, "eval_steps_per_second": 1.404, "step": 500 }, { "epoch": 0.17574086836664368, "grad_norm": 1.7183098793029785, "learning_rate": 2.927669345579793e-08, "logits/chosen": -3.0154900550842285, "logits/rejected": -3.0029807090759277, "logps/chosen": -55.45313262939453, "logps/rejected": -56.1241340637207, "loss": 0.6929, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00022910613915883005, "rewards/margins": 0.00042603391921147704, "rewards/rejected": -0.0006551401456817985, "step": 510 }, { "epoch": 0.17918676774638181, "grad_norm": 1.529685378074646, "learning_rate": 2.9850746268656714e-08, "logits/chosen": -3.114351511001587, "logits/rejected": -3.095560073852539, "logps/chosen": -53.58135223388672, "logps/rejected": -53.77289962768555, "loss": 0.693, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.00022775967954657972, "rewards/margins": 0.0002510346530470997, "rewards/rejected": -0.00047879433259367943, "step": 520 }, { "epoch": 0.18263266712611992, "grad_norm": 1.6587562561035156, "learning_rate": 3.0424799081515494e-08, "logits/chosen": -2.997591495513916, "logits/rejected": -2.970189332962036, "logps/chosen": -56.84648513793945, "logps/rejected": -52.83030319213867, "loss": 0.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.00024324092373717576, "rewards/margins": 0.00031215764465741813, "rewards/rejected": -0.0005553985829465091, "step": 530 }, { "epoch": 0.18607856650585802, "grad_norm": 1.579174280166626, "learning_rate": 3.099885189437428e-08, "logits/chosen": -3.134850025177002, "logits/rejected": -3.106194496154785, "logps/chosen": -55.90632247924805, "logps/rejected": -52.05424880981445, "loss": 0.6931, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -0.00047803809866309166, "rewards/margins": 0.00017016839410644025, "rewards/rejected": -0.0006482064491137862, "step": 540 }, { "epoch": 0.18952446588559613, "grad_norm": 1.5796416997909546, "learning_rate": 3.157290470723307e-08, "logits/chosen": -3.0336523056030273, "logits/rejected": -3.027980327606201, "logps/chosen": -51.60371780395508, "logps/rejected": -53.66399002075195, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0003416317922528833, "rewards/margins": 9.073803812498227e-05, "rewards/rejected": -0.0004323698522057384, "step": 550 }, { "epoch": 0.19297036526533426, "grad_norm": 1.660001277923584, "learning_rate": 3.214695752009185e-08, "logits/chosen": -3.0794060230255127, "logits/rejected": -3.074620485305786, "logps/chosen": -54.669044494628906, "logps/rejected": -55.166542053222656, "loss": 0.693, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.0003561445919331163, "rewards/margins": 0.000328877562424168, "rewards/rejected": -0.0006850221543572843, "step": 560 }, { "epoch": 0.19641626464507236, "grad_norm": 1.7443041801452637, "learning_rate": 3.2721010332950634e-08, "logits/chosen": -3.114729881286621, "logits/rejected": -3.08553147315979, "logps/chosen": -54.493499755859375, "logps/rejected": -53.668670654296875, "loss": 0.6929, "rewards/accuracies": 0.546875, "rewards/chosen": -0.00036628678208217025, "rewards/margins": 0.000434595305705443, "rewards/rejected": -0.0008008821168914437, "step": 570 }, { "epoch": 0.19986216402481047, "grad_norm": 1.5333077907562256, "learning_rate": 3.3295063145809414e-08, "logits/chosen": -3.0559775829315186, "logits/rejected": -3.0420336723327637, "logps/chosen": -53.71303176879883, "logps/rejected": -54.594749450683594, "loss": 0.6929, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00022023660130798817, "rewards/margins": 0.0005593840032815933, "rewards/rejected": -0.0007796206045895815, "step": 580 }, { "epoch": 0.2033080634045486, "grad_norm": 1.7492039203643799, "learning_rate": 3.38691159586682e-08, "logits/chosen": -2.9624226093292236, "logits/rejected": -2.9403374195098877, "logps/chosen": -52.4870719909668, "logps/rejected": -52.93088912963867, "loss": 0.6929, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0004652452189475298, "rewards/margins": 0.00046647139242850244, "rewards/rejected": -0.0009317166986875236, "step": 590 }, { "epoch": 0.2067539627842867, "grad_norm": 1.478962779045105, "learning_rate": 3.444316877152698e-08, "logits/chosen": -3.062162399291992, "logits/rejected": -3.0369365215301514, "logps/chosen": -55.97605514526367, "logps/rejected": -50.92509078979492, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0004408372624311596, "rewards/margins": 0.0004519358335528523, "rewards/rejected": -0.0008927730959840119, "step": 600 }, { "epoch": 0.2067539627842867, "eval_logits/chosen": -3.162525177001953, "eval_logits/rejected": -3.156876802444458, "eval_logps/chosen": -58.66804885864258, "eval_logps/rejected": -63.146324157714844, "eval_loss": 0.6930976510047913, "eval_rewards/accuracies": 0.49883827567100525, "eval_rewards/chosen": 0.0004384896019473672, "eval_rewards/margins": 0.00010045692033600062, "eval_rewards/rejected": 0.00033803266705945134, "eval_runtime": 383.2598, "eval_samples_per_second": 11.23, "eval_steps_per_second": 1.404, "step": 600 }, { "epoch": 0.2101998621640248, "grad_norm": 1.5590308904647827, "learning_rate": 3.501722158438576e-08, "logits/chosen": -3.083258867263794, "logits/rejected": -3.0526123046875, "logps/chosen": -53.620948791503906, "logps/rejected": -52.74138259887695, "loss": 0.6929, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.00036167577491141856, "rewards/margins": 0.0005563817103393376, "rewards/rejected": -0.0009180573979392648, "step": 610 }, { "epoch": 0.2136457615437629, "grad_norm": 1.624756097793579, "learning_rate": 3.559127439724455e-08, "logits/chosen": -3.089564561843872, "logits/rejected": -3.060040235519409, "logps/chosen": -54.137107849121094, "logps/rejected": -52.6269416809082, "loss": 0.6928, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00043127205572091043, "rewards/margins": 0.0006135430303402245, "rewards/rejected": -0.0010448151733726263, "step": 620 }, { "epoch": 0.21709166092350105, "grad_norm": 1.5574851036071777, "learning_rate": 3.616532721010333e-08, "logits/chosen": -3.0648295879364014, "logits/rejected": -3.0309245586395264, "logps/chosen": -53.41007614135742, "logps/rejected": -50.83017349243164, "loss": 0.693, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.0007039115880616009, "rewards/margins": 0.0002391432353761047, "rewards/rejected": -0.0009430547943338752, "step": 630 }, { "epoch": 0.22053756030323915, "grad_norm": 1.6602699756622314, "learning_rate": 3.6739380022962115e-08, "logits/chosen": -3.0855560302734375, "logits/rejected": -3.0858309268951416, "logps/chosen": -51.580543518066406, "logps/rejected": -57.39887237548828, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0005685995565727353, "rewards/margins": 0.0003290728200227022, "rewards/rejected": -0.0008976723765954375, "step": 640 }, { "epoch": 0.22398345968297725, "grad_norm": 1.509259819984436, "learning_rate": 3.7313432835820895e-08, "logits/chosen": -3.061649799346924, "logits/rejected": -3.0394208431243896, "logps/chosen": -52.47649383544922, "logps/rejected": -52.17650604248047, "loss": 0.693, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0006856679101474583, "rewards/margins": 0.0003688350261654705, "rewards/rejected": -0.001054503140039742, "step": 650 }, { "epoch": 0.22742935906271536, "grad_norm": 1.7701866626739502, "learning_rate": 3.788748564867968e-08, "logits/chosen": -3.0886669158935547, "logits/rejected": -3.058096408843994, "logps/chosen": -54.43735885620117, "logps/rejected": -50.229915618896484, "loss": 0.6927, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.000572009535972029, "rewards/margins": 0.000946041545830667, "rewards/rejected": -0.0015180512564256787, "step": 660 }, { "epoch": 0.2308752584424535, "grad_norm": 1.6014260053634644, "learning_rate": 3.846153846153846e-08, "logits/chosen": -3.075366497039795, "logits/rejected": -3.0492441654205322, "logps/chosen": -55.22419357299805, "logps/rejected": -51.61750411987305, "loss": 0.6928, "rewards/accuracies": 0.609375, "rewards/chosen": -0.0004568920121528208, "rewards/margins": 0.000684737809933722, "rewards/rejected": -0.001141629763878882, "step": 670 }, { "epoch": 0.2343211578221916, "grad_norm": 1.6289467811584473, "learning_rate": 3.903559127439725e-08, "logits/chosen": -3.040513515472412, "logits/rejected": -3.022066593170166, "logps/chosen": -54.72515869140625, "logps/rejected": -54.75127029418945, "loss": 0.6929, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0007326538907364011, "rewards/margins": 0.0004857041931245476, "rewards/rejected": -0.0012183580547571182, "step": 680 }, { "epoch": 0.2377670572019297, "grad_norm": 1.5907139778137207, "learning_rate": 3.960964408725603e-08, "logits/chosen": -3.079125165939331, "logits/rejected": -3.0600507259368896, "logps/chosen": -53.56365203857422, "logps/rejected": -52.69280242919922, "loss": 0.6927, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.0005984751624055207, "rewards/margins": 0.0008061518892645836, "rewards/rejected": -0.0014046269934624434, "step": 690 }, { "epoch": 0.2412129565816678, "grad_norm": 1.582619547843933, "learning_rate": 4.018369690011481e-08, "logits/chosen": -3.084700107574463, "logits/rejected": -3.0581741333007812, "logps/chosen": -56.4050407409668, "logps/rejected": -53.29130172729492, "loss": 0.6926, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.000537283718585968, "rewards/margins": 0.0010676311794668436, "rewards/rejected": -0.0016049148980528116, "step": 700 }, { "epoch": 0.2412129565816678, "eval_logits/chosen": -3.161731004714966, "eval_logits/rejected": -3.1560850143432617, "eval_logps/chosen": -58.66014099121094, "eval_logps/rejected": -63.14492416381836, "eval_loss": 0.6930655241012573, "eval_rewards/accuracies": 0.5274163484573364, "eval_rewards/chosen": 0.0005175235564820468, "eval_rewards/margins": 0.0001655405794735998, "eval_rewards/rejected": 0.00035198291880078614, "eval_runtime": 383.1299, "eval_samples_per_second": 11.234, "eval_steps_per_second": 1.404, "step": 700 }, { "epoch": 0.24465885596140594, "grad_norm": 1.4900749921798706, "learning_rate": 4.0757749712973595e-08, "logits/chosen": -3.0182652473449707, "logits/rejected": -3.0152182579040527, "logps/chosen": -52.647216796875, "logps/rejected": -54.094764709472656, "loss": 0.6928, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.0008331532590091228, "rewards/margins": 0.0006963290506973863, "rewards/rejected": -0.0015294823097065091, "step": 710 }, { "epoch": 0.24810475534114404, "grad_norm": 1.5864753723144531, "learning_rate": 4.1331802525832375e-08, "logits/chosen": -3.0603785514831543, "logits/rejected": -3.038881301879883, "logps/chosen": -52.11266326904297, "logps/rejected": -52.61206817626953, "loss": 0.6927, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.0008909286698326468, "rewards/margins": 0.0009582725469954312, "rewards/rejected": -0.001849201275035739, "step": 720 }, { "epoch": 0.25155065472088217, "grad_norm": 1.658844232559204, "learning_rate": 4.190585533869116e-08, "logits/chosen": -3.039299488067627, "logits/rejected": -3.004225969314575, "logps/chosen": -53.66937255859375, "logps/rejected": -51.41010665893555, "loss": 0.6925, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.000820048910100013, "rewards/margins": 0.0013549254508689046, "rewards/rejected": -0.002174974186345935, "step": 730 }, { "epoch": 0.2549965541006203, "grad_norm": 1.4955841302871704, "learning_rate": 4.247990815154994e-08, "logits/chosen": -3.189666271209717, "logits/rejected": -3.153520345687866, "logps/chosen": -54.25288772583008, "logps/rejected": -53.40996170043945, "loss": 0.6926, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.0008569598430767655, "rewards/margins": 0.0011587422341108322, "rewards/rejected": -0.002015701960772276, "step": 740 }, { "epoch": 0.2584424534803584, "grad_norm": 1.6525216102600098, "learning_rate": 4.305396096440873e-08, "logits/chosen": -3.059471845626831, "logits/rejected": -3.045736074447632, "logps/chosen": -53.788604736328125, "logps/rejected": -53.38872146606445, "loss": 0.6926, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0010240455158054829, "rewards/margins": 0.001181575353257358, "rewards/rejected": -0.002205620752647519, "step": 750 }, { "epoch": 0.2618883528600965, "grad_norm": 1.575608730316162, "learning_rate": 4.362801377726751e-08, "logits/chosen": -3.0197482109069824, "logits/rejected": -2.9978528022766113, "logps/chosen": -55.5356330871582, "logps/rejected": -52.37388229370117, "loss": 0.6926, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.001073414459824562, "rewards/margins": 0.0010430815163999796, "rewards/rejected": -0.0021164959762245417, "step": 760 }, { "epoch": 0.2653342522398346, "grad_norm": 1.6197123527526855, "learning_rate": 4.420206659012629e-08, "logits/chosen": -3.0276763439178467, "logits/rejected": -3.0032382011413574, "logps/chosen": -53.92132568359375, "logps/rejected": -51.12883377075195, "loss": 0.6925, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0012427789624780416, "rewards/margins": 0.001257481286302209, "rewards/rejected": -0.0025002602487802505, "step": 770 }, { "epoch": 0.2687801516195727, "grad_norm": 1.6428828239440918, "learning_rate": 4.4776119402985075e-08, "logits/chosen": -3.0183887481689453, "logits/rejected": -2.9996588230133057, "logps/chosen": -52.25994873046875, "logps/rejected": -50.87326431274414, "loss": 0.6925, "rewards/accuracies": 0.578125, "rewards/chosen": -0.001513983472250402, "rewards/margins": 0.0012144233332946897, "rewards/rejected": -0.0027284070383757353, "step": 780 }, { "epoch": 0.2722260509993108, "grad_norm": 1.7458298206329346, "learning_rate": 4.5350172215843855e-08, "logits/chosen": -3.044975757598877, "logits/rejected": -3.0408968925476074, "logps/chosen": -52.57117462158203, "logps/rejected": -55.31633377075195, "loss": 0.6926, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.0015549950767308474, "rewards/margins": 0.0010043425718322396, "rewards/rejected": -0.0025593372993171215, "step": 790 }, { "epoch": 0.27567195037904896, "grad_norm": 1.5474584102630615, "learning_rate": 4.592422502870264e-08, "logits/chosen": -3.0545830726623535, "logits/rejected": -3.028029680252075, "logps/chosen": -53.2750358581543, "logps/rejected": -53.02095413208008, "loss": 0.6926, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.001201919512823224, "rewards/margins": 0.001046768738888204, "rewards/rejected": -0.00224868836812675, "step": 800 }, { "epoch": 0.27567195037904896, "eval_logits/chosen": -3.160796880722046, "eval_logits/rejected": -3.155186414718628, "eval_logps/chosen": -58.63302993774414, "eval_logps/rejected": -63.131134033203125, "eval_loss": 0.6929999589920044, "eval_rewards/accuracies": 0.5285780429840088, "eval_rewards/chosen": 0.0007886533858254552, "eval_rewards/margins": 0.00029877302586100996, "eval_rewards/rejected": 0.0004898803890682757, "eval_runtime": 383.0219, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 800 }, { "epoch": 0.27911784975878706, "grad_norm": 1.6502569913864136, "learning_rate": 4.649827784156142e-08, "logits/chosen": -3.0507452487945557, "logits/rejected": -3.0272066593170166, "logps/chosen": -53.829551696777344, "logps/rejected": -56.162513732910156, "loss": 0.6926, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.0016247531166300178, "rewards/margins": 0.0011813701130449772, "rewards/rejected": -0.0028061233460903168, "step": 810 }, { "epoch": 0.28256374913852517, "grad_norm": 1.7326775789260864, "learning_rate": 4.707233065442021e-08, "logits/chosen": -3.1037213802337646, "logits/rejected": -3.0718483924865723, "logps/chosen": -56.322601318359375, "logps/rejected": -51.860992431640625, "loss": 0.692, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0009053934481926262, "rewards/margins": 0.0023578822147101164, "rewards/rejected": -0.0032632756046950817, "step": 820 }, { "epoch": 0.28600964851826327, "grad_norm": 1.696651816368103, "learning_rate": 4.764638346727899e-08, "logits/chosen": -3.1127066612243652, "logits/rejected": -3.0752177238464355, "logps/chosen": -56.18726348876953, "logps/rejected": -52.6662483215332, "loss": 0.692, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0008326550014317036, "rewards/margins": 0.002323477528989315, "rewards/rejected": -0.0031561325304210186, "step": 830 }, { "epoch": 0.2894555478980014, "grad_norm": 1.6434106826782227, "learning_rate": 4.8220436280137775e-08, "logits/chosen": -3.0974814891815186, "logits/rejected": -3.0733208656311035, "logps/chosen": -55.1783447265625, "logps/rejected": -55.13411331176758, "loss": 0.6921, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.0011686356738209724, "rewards/margins": 0.0021039179991930723, "rewards/rejected": -0.003272553440183401, "step": 840 }, { "epoch": 0.2929014472777395, "grad_norm": 1.7480825185775757, "learning_rate": 4.8794489092996555e-08, "logits/chosen": -3.009683609008789, "logits/rejected": -3.0043578147888184, "logps/chosen": -52.76066207885742, "logps/rejected": -54.09674072265625, "loss": 0.6927, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.002064872533082962, "rewards/margins": 0.0009717949433252215, "rewards/rejected": -0.0030366675928235054, "step": 850 }, { "epoch": 0.2963473466574776, "grad_norm": 1.7978547811508179, "learning_rate": 4.9368541905855335e-08, "logits/chosen": -3.1012463569641113, "logits/rejected": -3.066155433654785, "logps/chosen": -57.55305099487305, "logps/rejected": -53.08917236328125, "loss": 0.6922, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0013521341606974602, "rewards/margins": 0.001842877478338778, "rewards/rejected": -0.0031950115226209164, "step": 860 }, { "epoch": 0.2997932460372157, "grad_norm": 1.6422313451766968, "learning_rate": 4.994259471871412e-08, "logits/chosen": -3.0753352642059326, "logits/rejected": -3.0616536140441895, "logps/chosen": -53.2499885559082, "logps/rejected": -53.61870193481445, "loss": 0.6926, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0023219056893140078, "rewards/margins": 0.0010298988781869411, "rewards/rejected": -0.003351804567500949, "step": 870 }, { "epoch": 0.30323914541695385, "grad_norm": 1.616622805595398, "learning_rate": 4.999983721428015e-08, "logits/chosen": -3.0881237983703613, "logits/rejected": -3.06410551071167, "logps/chosen": -55.7447395324707, "logps/rejected": -54.83588409423828, "loss": 0.6921, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.0014014923945069313, "rewards/margins": 0.002081379760056734, "rewards/rejected": -0.0034828719217330217, "step": 880 }, { "epoch": 0.30668504479669195, "grad_norm": 1.7424137592315674, "learning_rate": 4.99992745009332e-08, "logits/chosen": -3.066657543182373, "logits/rejected": -3.0491175651550293, "logps/chosen": -54.9911994934082, "logps/rejected": -54.118560791015625, "loss": 0.6923, "rewards/accuracies": 0.609375, "rewards/chosen": -0.0024106844794005156, "rewards/margins": 0.0017519593238830566, "rewards/rejected": -0.004162644036114216, "step": 890 }, { "epoch": 0.31013094417643006, "grad_norm": 1.664026141166687, "learning_rate": 4.999830985930383e-08, "logits/chosen": -3.1527099609375, "logits/rejected": -3.129481077194214, "logps/chosen": -52.08544921875, "logps/rejected": -52.6777229309082, "loss": 0.692, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.0018667599651962519, "rewards/margins": 0.0023035637568682432, "rewards/rejected": -0.004170323722064495, "step": 900 }, { "epoch": 0.31013094417643006, "eval_logits/chosen": -3.1592347621917725, "eval_logits/rejected": -3.1535661220550537, "eval_logps/chosen": -58.609867095947266, "eval_logps/rejected": -63.12839889526367, "eval_loss": 0.6928993463516235, "eval_rewards/accuracies": 0.5436803102493286, "eval_rewards/chosen": 0.0010202607372775674, "eval_rewards/margins": 0.0005029859603382647, "eval_rewards/rejected": 0.0005172747187316418, "eval_runtime": 383.2123, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.404, "step": 900 }, { "epoch": 0.31357684355616816, "grad_norm": 1.5931446552276611, "learning_rate": 4.9996943304901175e-08, "logits/chosen": -3.123072862625122, "logits/rejected": -3.0878140926361084, "logps/chosen": -56.0644416809082, "logps/rejected": -52.55695724487305, "loss": 0.6922, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0017481872346252203, "rewards/margins": 0.00195808382704854, "rewards/rejected": -0.003706271294504404, "step": 910 }, { "epoch": 0.31702274293590627, "grad_norm": 1.65394926071167, "learning_rate": 4.999517485969613e-08, "logits/chosen": -3.1080398559570312, "logits/rejected": -3.098968744277954, "logps/chosen": -55.4267692565918, "logps/rejected": -55.24713134765625, "loss": 0.6926, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.00245068222284317, "rewards/margins": 0.0011253143893554807, "rewards/rejected": -0.0035759967286139727, "step": 920 }, { "epoch": 0.32046864231564437, "grad_norm": 1.715622067451477, "learning_rate": 4.9993004552121054e-08, "logits/chosen": -3.044404983520508, "logits/rejected": -3.0262794494628906, "logps/chosen": -55.39171600341797, "logps/rejected": -54.67119598388672, "loss": 0.6916, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0014088581083342433, "rewards/margins": 0.0030882726423442364, "rewards/rejected": -0.0044971308670938015, "step": 930 }, { "epoch": 0.3239145416953825, "grad_norm": 1.6518906354904175, "learning_rate": 4.999043241706928e-08, "logits/chosen": -3.0806727409362793, "logits/rejected": -3.0488638877868652, "logps/chosen": -54.528785705566406, "logps/rejected": -53.192230224609375, "loss": 0.6913, "rewards/accuracies": 0.625, "rewards/chosen": -0.0019601555541157722, "rewards/margins": 0.0036666926462203264, "rewards/rejected": -0.005626848433166742, "step": 940 }, { "epoch": 0.32736044107512063, "grad_norm": 1.7741785049438477, "learning_rate": 4.9987458495894555e-08, "logits/chosen": -3.0929343700408936, "logits/rejected": -3.073413133621216, "logps/chosen": -55.4083366394043, "logps/rejected": -53.78661346435547, "loss": 0.6918, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0015956215793266892, "rewards/margins": 0.0027222721837460995, "rewards/rejected": -0.004317893646657467, "step": 950 }, { "epoch": 0.33080634045485874, "grad_norm": 1.6572853326797485, "learning_rate": 4.998408283641039e-08, "logits/chosen": -3.0153255462646484, "logits/rejected": -2.983853340148926, "logps/chosen": -55.172760009765625, "logps/rejected": -54.06475067138672, "loss": 0.6916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0025458005256950855, "rewards/margins": 0.0031041749753057957, "rewards/rejected": -0.005649975501000881, "step": 960 }, { "epoch": 0.33425223983459684, "grad_norm": 1.5741428136825562, "learning_rate": 4.998030549288928e-08, "logits/chosen": -3.032937526702881, "logits/rejected": -3.005075454711914, "logps/chosen": -52.7095832824707, "logps/rejected": -52.52618408203125, "loss": 0.6917, "rewards/accuracies": 0.625, "rewards/chosen": -0.002943779807537794, "rewards/margins": 0.002946648746728897, "rewards/rejected": -0.005890428088605404, "step": 970 }, { "epoch": 0.33769813921433495, "grad_norm": 1.4964897632598877, "learning_rate": 4.9976126526061846e-08, "logits/chosen": -3.0544424057006836, "logits/rejected": -3.044027328491211, "logps/chosen": -53.75004196166992, "logps/rejected": -55.165306091308594, "loss": 0.6921, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0031646634452044964, "rewards/margins": 0.0021168093662708998, "rewards/rejected": -0.005281473509967327, "step": 980 }, { "epoch": 0.34114403859407305, "grad_norm": 1.5912277698516846, "learning_rate": 4.997154600311582e-08, "logits/chosen": -3.055800676345825, "logits/rejected": -3.038321018218994, "logps/chosen": -56.14738082885742, "logps/rejected": -53.450096130371094, "loss": 0.692, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.002801768248900771, "rewards/margins": 0.002274418016895652, "rewards/rejected": -0.005076186265796423, "step": 990 }, { "epoch": 0.34458993797381116, "grad_norm": 1.6680179834365845, "learning_rate": 4.996656399769502e-08, "logits/chosen": -3.0986924171447754, "logits/rejected": -3.0843400955200195, "logps/chosen": -50.77027130126953, "logps/rejected": -53.58820343017578, "loss": 0.6915, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.003726143855601549, "rewards/margins": 0.0032459970097988844, "rewards/rejected": -0.006972140166908503, "step": 1000 }, { "epoch": 0.34458993797381116, "eval_logits/chosen": -3.1571621894836426, "eval_logits/rejected": -3.1515400409698486, "eval_logps/chosen": -58.56086349487305, "eval_logps/rejected": -63.10968780517578, "eval_loss": 0.6927512884140015, "eval_rewards/accuracies": 0.5497211813926697, "eval_rewards/chosen": 0.0015103232581168413, "eval_rewards/margins": 0.0008059804094955325, "eval_rewards/rejected": 0.0007043426739983261, "eval_runtime": 382.8964, "eval_samples_per_second": 11.241, "eval_steps_per_second": 1.405, "step": 1000 }, { "epoch": 0.34803583735354926, "grad_norm": 1.789591670036316, "learning_rate": 4.9961180589898114e-08, "logits/chosen": -3.059546947479248, "logits/rejected": -3.0342812538146973, "logps/chosen": -53.02336502075195, "logps/rejected": -53.93638229370117, "loss": 0.6912, "rewards/accuracies": 0.625, "rewards/chosen": -0.003077675122767687, "rewards/margins": 0.0038811513222754, "rewards/rejected": -0.0069588259793818, "step": 1010 }, { "epoch": 0.35148173673328736, "grad_norm": 1.6734188795089722, "learning_rate": 4.995539586627738e-08, "logits/chosen": -3.073002338409424, "logits/rejected": -3.055800199508667, "logps/chosen": -55.24200439453125, "logps/rejected": -55.18772506713867, "loss": 0.6919, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.0035789520479738712, "rewards/margins": 0.0026233140379190445, "rewards/rejected": -0.006202266085892916, "step": 1020 }, { "epoch": 0.3549276361130255, "grad_norm": 1.58321213722229, "learning_rate": 4.994920991983728e-08, "logits/chosen": -3.116424083709717, "logits/rejected": -3.0845885276794434, "logps/chosen": -53.51762008666992, "logps/rejected": -53.563209533691406, "loss": 0.6915, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.0024187860544770956, "rewards/margins": 0.0032498023938387632, "rewards/rejected": -0.0056685879826545715, "step": 1030 }, { "epoch": 0.35837353549276363, "grad_norm": 1.6085413694381714, "learning_rate": 4.994262285003296e-08, "logits/chosen": -3.097822666168213, "logits/rejected": -3.07124400138855, "logps/chosen": -53.94586181640625, "logps/rejected": -53.320213317871094, "loss": 0.6919, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.00365361082367599, "rewards/margins": 0.0024818789679557085, "rewards/rejected": -0.006135490257292986, "step": 1040 }, { "epoch": 0.36181943487250173, "grad_norm": 1.7765018939971924, "learning_rate": 4.9935634762768674e-08, "logits/chosen": -3.0671820640563965, "logits/rejected": -3.054347515106201, "logps/chosen": -55.204689025878906, "logps/rejected": -53.545379638671875, "loss": 0.6919, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.0033408261369913816, "rewards/margins": 0.0025750866625458, "rewards/rejected": -0.005915912799537182, "step": 1050 }, { "epoch": 0.36526533425223984, "grad_norm": 1.7263202667236328, "learning_rate": 4.9928245770396104e-08, "logits/chosen": -3.0302653312683105, "logits/rejected": -3.018087387084961, "logps/chosen": -51.57648849487305, "logps/rejected": -53.71232986450195, "loss": 0.6916, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.00338856503367424, "rewards/margins": 0.0031401391606777906, "rewards/rejected": -0.006528704427182674, "step": 1060 }, { "epoch": 0.36871123363197794, "grad_norm": 1.631087303161621, "learning_rate": 4.992045599171248e-08, "logits/chosen": -2.9889609813690186, "logits/rejected": -2.975574016571045, "logps/chosen": -52.14996337890625, "logps/rejected": -53.917015075683594, "loss": 0.6918, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.005872171837836504, "rewards/margins": 0.002851809374988079, "rewards/rejected": -0.008723980747163296, "step": 1070 }, { "epoch": 0.37215713301171605, "grad_norm": 1.4895782470703125, "learning_rate": 4.991226555195873e-08, "logits/chosen": -3.0126376152038574, "logits/rejected": -2.9817304611206055, "logps/chosen": -55.026512145996094, "logps/rejected": -49.54633712768555, "loss": 0.6909, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0030370699241757393, "rewards/margins": 0.004613037686794996, "rewards/rejected": -0.007650108076632023, "step": 1080 }, { "epoch": 0.37560303239145415, "grad_norm": 1.9056816101074219, "learning_rate": 4.990367458281747e-08, "logits/chosen": -3.1438345909118652, "logits/rejected": -3.116790771484375, "logps/chosen": -56.613868713378906, "logps/rejected": -52.95048904418945, "loss": 0.6912, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.004361911676824093, "rewards/margins": 0.004060900770127773, "rewards/rejected": -0.008422811515629292, "step": 1090 }, { "epoch": 0.37904893177119225, "grad_norm": 1.9068539142608643, "learning_rate": 4.989468322241083e-08, "logits/chosen": -3.146623134613037, "logits/rejected": -3.131809711456299, "logps/chosen": -52.995521545410156, "logps/rejected": -55.104713439941406, "loss": 0.6914, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.004202415235340595, "rewards/margins": 0.0035998572129756212, "rewards/rejected": -0.007802271284162998, "step": 1100 }, { "epoch": 0.37904893177119225, "eval_logits/chosen": -3.155352830886841, "eval_logits/rejected": -3.1496880054473877, "eval_logps/chosen": -58.527748107910156, "eval_logps/rejected": -63.10512161254883, "eval_loss": 0.6926132440567017, "eval_rewards/accuracies": 0.5601765513420105, "eval_rewards/chosen": 0.0018414849182590842, "eval_rewards/margins": 0.00109143799636513, "eval_rewards/rejected": 0.0007500468054786325, "eval_runtime": 382.8828, "eval_samples_per_second": 11.241, "eval_steps_per_second": 1.405, "step": 1100 }, { "epoch": 0.3824948311509304, "grad_norm": 1.7736352682113647, "learning_rate": 4.988529161529829e-08, "logits/chosen": -3.03230881690979, "logits/rejected": -3.020458698272705, "logps/chosen": -54.60709762573242, "logps/rejected": -54.489280700683594, "loss": 0.6912, "rewards/accuracies": 0.625, "rewards/chosen": -0.003049212507903576, "rewards/margins": 0.004059980157762766, "rewards/rejected": -0.007109192665666342, "step": 1110 }, { "epoch": 0.3859407305306685, "grad_norm": 1.7841767072677612, "learning_rate": 4.987549991247432e-08, "logits/chosen": -3.080092668533325, "logits/rejected": -3.045909881591797, "logps/chosen": -55.738037109375, "logps/rejected": -53.1776237487793, "loss": 0.6906, "rewards/accuracies": 0.609375, "rewards/chosen": -0.004944270011037588, "rewards/margins": 0.0051543209701776505, "rewards/rejected": -0.0100985923781991, "step": 1120 }, { "epoch": 0.3893866299104066, "grad_norm": 1.6271467208862305, "learning_rate": 4.9865308271366e-08, "logits/chosen": -2.993594169616699, "logits/rejected": -2.963174343109131, "logps/chosen": -58.512779235839844, "logps/rejected": -56.269554138183594, "loss": 0.6908, "rewards/accuracies": 0.625, "rewards/chosen": -0.00434028310701251, "rewards/margins": 0.004802732728421688, "rewards/rejected": -0.009143015369772911, "step": 1130 }, { "epoch": 0.3928325292901447, "grad_norm": 1.644989252090454, "learning_rate": 4.985471685583044e-08, "logits/chosen": -2.9966654777526855, "logits/rejected": -2.974935531616211, "logps/chosen": -54.28868865966797, "logps/rejected": -54.5621337890625, "loss": 0.6912, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.005362226627767086, "rewards/margins": 0.004026120528578758, "rewards/rejected": -0.00938834622502327, "step": 1140 }, { "epoch": 0.39627842866988283, "grad_norm": 1.7991400957107544, "learning_rate": 4.984372583615214e-08, "logits/chosen": -3.0232834815979004, "logits/rejected": -2.9918055534362793, "logps/chosen": -54.80377960205078, "logps/rejected": -54.71489715576172, "loss": 0.6901, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.0038980108220130205, "rewards/margins": 0.006170094478875399, "rewards/rejected": -0.010068105533719063, "step": 1150 }, { "epoch": 0.39972432804962094, "grad_norm": 1.6369379758834839, "learning_rate": 4.983233538904032e-08, "logits/chosen": -3.0659379959106445, "logits/rejected": -3.037196636199951, "logps/chosen": -54.79767608642578, "logps/rejected": -52.376670837402344, "loss": 0.6905, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.005379176698625088, "rewards/margins": 0.005478682927787304, "rewards/rejected": -0.010857859626412392, "step": 1160 }, { "epoch": 0.40317022742935904, "grad_norm": 1.5575422048568726, "learning_rate": 4.9820545697625974e-08, "logits/chosen": -3.0458505153656006, "logits/rejected": -3.0288150310516357, "logps/chosen": -54.64646530151367, "logps/rejected": -54.92894744873047, "loss": 0.6906, "rewards/accuracies": 0.609375, "rewards/chosen": -0.005909957457333803, "rewards/margins": 0.005214801989495754, "rewards/rejected": -0.011124758049845695, "step": 1170 }, { "epoch": 0.4066161268090972, "grad_norm": 1.6410942077636719, "learning_rate": 4.980835695145905e-08, "logits/chosen": -3.061549425125122, "logits/rejected": -3.0435023307800293, "logps/chosen": -54.84943771362305, "logps/rejected": -54.292213439941406, "loss": 0.6905, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.005377430934458971, "rewards/margins": 0.0053153508342802525, "rewards/rejected": -0.010692780837416649, "step": 1180 }, { "epoch": 0.4100620261888353, "grad_norm": 1.6093648672103882, "learning_rate": 4.97957693465053e-08, "logits/chosen": -3.0494790077209473, "logits/rejected": -3.0336313247680664, "logps/chosen": -54.64577102661133, "logps/rejected": -54.43803787231445, "loss": 0.6908, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.005937017500400543, "rewards/margins": 0.004796821624040604, "rewards/rejected": -0.010733840055763721, "step": 1190 }, { "epoch": 0.4135079255685734, "grad_norm": 1.6119037866592407, "learning_rate": 4.978278308514316e-08, "logits/chosen": -3.1219568252563477, "logits/rejected": -3.097869396209717, "logps/chosen": -54.946311950683594, "logps/rejected": -54.40472412109375, "loss": 0.6905, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.006790393032133579, "rewards/margins": 0.005468717776238918, "rewards/rejected": -0.012259109877049923, "step": 1200 }, { "epoch": 0.4135079255685734, "eval_logits/chosen": -3.15278959274292, "eval_logits/rejected": -3.1471242904663086, "eval_logps/chosen": -58.5269775390625, "eval_logps/rejected": -63.151371002197266, "eval_loss": 0.6923871040344238, "eval_rewards/accuracies": 0.5701673030853271, "eval_rewards/chosen": 0.001849168329499662, "eval_rewards/margins": 0.0015616599703207612, "eval_rewards/rejected": 0.0002875083882827312, "eval_runtime": 382.8635, "eval_samples_per_second": 11.242, "eval_steps_per_second": 1.405, "step": 1200 }, { "epoch": 0.4169538249483115, "grad_norm": 1.641495704650879, "learning_rate": 4.976939837616053e-08, "logits/chosen": -3.1185858249664307, "logits/rejected": -3.087258815765381, "logps/chosen": -55.43867874145508, "logps/rejected": -53.19841384887695, "loss": 0.6898, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.004224506206810474, "rewards/margins": 0.0069058439694345, "rewards/rejected": -0.011130350641906261, "step": 1210 }, { "epoch": 0.4203997243280496, "grad_norm": 1.7155033349990845, "learning_rate": 4.9755615434751385e-08, "logits/chosen": -3.07661509513855, "logits/rejected": -3.0587010383605957, "logps/chosen": -55.486061096191406, "logps/rejected": -54.20790481567383, "loss": 0.6904, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.005361359566450119, "rewards/margins": 0.005550517234951258, "rewards/rejected": -0.010911877267062664, "step": 1220 }, { "epoch": 0.4238456237077877, "grad_norm": 1.850020408630371, "learning_rate": 4.97414344825123e-08, "logits/chosen": -3.022465467453003, "logits/rejected": -2.9943459033966064, "logps/chosen": -55.37152099609375, "logps/rejected": -53.61724090576172, "loss": 0.691, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.007716395892202854, "rewards/margins": 0.004501350689679384, "rewards/rejected": -0.012217746116220951, "step": 1230 }, { "epoch": 0.4272915230875258, "grad_norm": 1.7823643684387207, "learning_rate": 4.9726855747438935e-08, "logits/chosen": -3.1238064765930176, "logits/rejected": -3.0974245071411133, "logps/chosen": -55.36341094970703, "logps/rejected": -52.74017333984375, "loss": 0.6906, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.006522178649902344, "rewards/margins": 0.005209864117205143, "rewards/rejected": -0.011732041835784912, "step": 1240 }, { "epoch": 0.43073742246726393, "grad_norm": 1.6708329916000366, "learning_rate": 4.971187946392232e-08, "logits/chosen": -3.0946736335754395, "logits/rejected": -3.078284740447998, "logps/chosen": -54.32783126831055, "logps/rejected": -55.1889762878418, "loss": 0.692, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.008630650117993355, "rewards/margins": 0.0024363049305975437, "rewards/rejected": -0.011066955514252186, "step": 1250 }, { "epoch": 0.4341833218470021, "grad_norm": 1.6142418384552002, "learning_rate": 4.9696505872745125e-08, "logits/chosen": -2.977407455444336, "logits/rejected": -2.958488941192627, "logps/chosen": -53.172752380371094, "logps/rejected": -51.704345703125, "loss": 0.6909, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.007458123378455639, "rewards/margins": 0.004668209236115217, "rewards/rejected": -0.012126332148909569, "step": 1260 }, { "epoch": 0.4376292212267402, "grad_norm": 1.6153451204299927, "learning_rate": 4.968073522107776e-08, "logits/chosen": -3.0809402465820312, "logits/rejected": -3.0733203887939453, "logps/chosen": -53.261199951171875, "logps/rejected": -55.31684494018555, "loss": 0.6912, "rewards/accuracies": 0.609375, "rewards/chosen": -0.009061941877007484, "rewards/margins": 0.004105345346033573, "rewards/rejected": -0.013167287223041058, "step": 1270 }, { "epoch": 0.4410751206064783, "grad_norm": 1.7609448432922363, "learning_rate": 4.9664567762474435e-08, "logits/chosen": -3.0890049934387207, "logits/rejected": -3.060281991958618, "logps/chosen": -57.43438720703125, "logps/rejected": -55.255027770996094, "loss": 0.6893, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.004626637790352106, "rewards/margins": 0.007965070195496082, "rewards/rejected": -0.012591707520186901, "step": 1280 }, { "epoch": 0.4445210199862164, "grad_norm": 1.714154601097107, "learning_rate": 4.9648003756869036e-08, "logits/chosen": -3.079932451248169, "logits/rejected": -3.061206102371216, "logps/chosen": -56.797264099121094, "logps/rejected": -56.032386779785156, "loss": 0.6913, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.008035015314817429, "rewards/margins": 0.0038569513708353043, "rewards/rejected": -0.011891965754330158, "step": 1290 }, { "epoch": 0.4479669193659545, "grad_norm": 1.6558353900909424, "learning_rate": 4.963104347057098e-08, "logits/chosen": -3.0565571784973145, "logits/rejected": -3.013684034347534, "logps/chosen": -57.629615783691406, "logps/rejected": -51.45759201049805, "loss": 0.6889, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007343019358813763, "rewards/margins": 0.008679641410708427, "rewards/rejected": -0.016022661700844765, "step": 1300 }, { "epoch": 0.4479669193659545, "eval_logits/chosen": -3.149733543395996, "eval_logits/rejected": -3.1441028118133545, "eval_logps/chosen": -58.515750885009766, "eval_logps/rejected": -63.1881103515625, "eval_loss": 0.6921555399894714, "eval_rewards/accuracies": 0.5720260143280029, "eval_rewards/chosen": 0.001961431000381708, "eval_rewards/margins": 0.0020412460435181856, "eval_rewards/rejected": -7.981513044796884e-05, "eval_runtime": 382.9703, "eval_samples_per_second": 11.238, "eval_steps_per_second": 1.405, "step": 1300 }, { "epoch": 0.4514128187456926, "grad_norm": 1.57466721534729, "learning_rate": 4.9613687176260945e-08, "logits/chosen": -3.0671749114990234, "logits/rejected": -3.032008409500122, "logps/chosen": -55.6620979309082, "logps/rejected": -52.671852111816406, "loss": 0.6894, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.008395962417125702, "rewards/margins": 0.007722176611423492, "rewards/rejected": -0.016118139028549194, "step": 1310 }, { "epoch": 0.4548587181254307, "grad_norm": 1.6889199018478394, "learning_rate": 4.959593515298644e-08, "logits/chosen": -3.005653142929077, "logits/rejected": -2.9786884784698486, "logps/chosen": -53.27629852294922, "logps/rejected": -52.618492126464844, "loss": 0.6896, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.008218175731599331, "rewards/margins": 0.0072518326342105865, "rewards/rejected": -0.015470008365809917, "step": 1320 }, { "epoch": 0.4583046175051689, "grad_norm": 1.766128420829773, "learning_rate": 4.957778768615736e-08, "logits/chosen": -3.0031771659851074, "logits/rejected": -2.9730334281921387, "logps/chosen": -56.095115661621094, "logps/rejected": -54.7608642578125, "loss": 0.69, "rewards/accuracies": 0.59375, "rewards/chosen": -0.007347867824137211, "rewards/margins": 0.006466665305197239, "rewards/rejected": -0.01381453312933445, "step": 1330 }, { "epoch": 0.461750516884907, "grad_norm": 1.8540762662887573, "learning_rate": 4.955924506754137e-08, "logits/chosen": -3.012941837310791, "logits/rejected": -2.9848413467407227, "logps/chosen": -56.86073684692383, "logps/rejected": -54.42765426635742, "loss": 0.6892, "rewards/accuracies": 0.640625, "rewards/chosen": -0.007639763411134481, "rewards/margins": 0.008015084080398083, "rewards/rejected": -0.015654848888516426, "step": 1340 }, { "epoch": 0.4651964162646451, "grad_norm": 1.6432679891586304, "learning_rate": 4.9540307595259254e-08, "logits/chosen": -3.024003744125366, "logits/rejected": -2.999232530593872, "logps/chosen": -52.834503173828125, "logps/rejected": -53.460899353027344, "loss": 0.6888, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.008985722437500954, "rewards/margins": 0.008808594197034836, "rewards/rejected": -0.01779431663453579, "step": 1350 }, { "epoch": 0.4686423156443832, "grad_norm": 1.7412396669387817, "learning_rate": 4.9520975573780065e-08, "logits/chosen": -3.0569210052490234, "logits/rejected": -3.0489988327026367, "logps/chosen": -56.62506103515625, "logps/rejected": -55.83217239379883, "loss": 0.6913, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.008781121112406254, "rewards/margins": 0.003973082173615694, "rewards/rejected": -0.012754203751683235, "step": 1360 }, { "epoch": 0.4720882150241213, "grad_norm": 1.8197298049926758, "learning_rate": 4.9501249313916264e-08, "logits/chosen": -3.0461108684539795, "logits/rejected": -3.0252792835235596, "logps/chosen": -56.53290557861328, "logps/rejected": -53.746803283691406, "loss": 0.6901, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.00819920003414154, "rewards/margins": 0.006229158956557512, "rewards/rejected": -0.01442835945636034, "step": 1370 }, { "epoch": 0.4755341144038594, "grad_norm": 1.7825852632522583, "learning_rate": 4.948112913281874e-08, "logits/chosen": -2.9965758323669434, "logits/rejected": -2.9852707386016846, "logps/chosen": -52.77811813354492, "logps/rejected": -55.78017044067383, "loss": 0.69, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.010087520815432072, "rewards/margins": 0.006550487130880356, "rewards/rejected": -0.016638008877635002, "step": 1380 }, { "epoch": 0.4789800137835975, "grad_norm": 1.6735769510269165, "learning_rate": 4.9460615353971656e-08, "logits/chosen": -3.0636024475097656, "logits/rejected": -3.0333611965179443, "logps/chosen": -54.9933967590332, "logps/rejected": -53.31294631958008, "loss": 0.6907, "rewards/accuracies": 0.578125, "rewards/chosen": -0.009715999476611614, "rewards/margins": 0.0051447683945298195, "rewards/rejected": -0.014860766939818859, "step": 1390 }, { "epoch": 0.4824259131633356, "grad_norm": 1.8178917169570923, "learning_rate": 4.943970830718732e-08, "logits/chosen": -3.081256866455078, "logits/rejected": -3.0518198013305664, "logps/chosen": -56.68550491333008, "logps/rejected": -54.99749755859375, "loss": 0.6896, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.007875410839915276, "rewards/margins": 0.007347362581640482, "rewards/rejected": -0.01522277481853962, "step": 1400 }, { "epoch": 0.4824259131633356, "eval_logits/chosen": -3.1466073989868164, "eval_logits/rejected": -3.1409943103790283, "eval_logps/chosen": -58.54644012451172, "eval_logps/rejected": -63.25551986694336, "eval_loss": 0.691982090473175, "eval_rewards/accuracies": 0.5685408711433411, "eval_rewards/chosen": 0.0016545829130336642, "eval_rewards/margins": 0.0024085063487291336, "eval_rewards/rejected": -0.0007539235521107912, "eval_runtime": 382.6116, "eval_samples_per_second": 11.249, "eval_steps_per_second": 1.406, "step": 1400 }, { "epoch": 0.48587181254307377, "grad_norm": 1.7290781736373901, "learning_rate": 4.941840832860081e-08, "logits/chosen": -3.037386655807495, "logits/rejected": -3.018889904022217, "logps/chosen": -55.7311897277832, "logps/rejected": -55.98353958129883, "loss": 0.6898, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.008996831253170967, "rewards/margins": 0.006921672727912664, "rewards/rejected": -0.01591850444674492, "step": 1410 }, { "epoch": 0.48931771192281187, "grad_norm": 1.580364465713501, "learning_rate": 4.939671576066461e-08, "logits/chosen": -3.0005571842193604, "logits/rejected": -2.987203598022461, "logps/chosen": -55.080833435058594, "logps/rejected": -54.877960205078125, "loss": 0.6898, "rewards/accuracies": 0.640625, "rewards/chosen": -0.010124399326741695, "rewards/margins": 0.006932587828487158, "rewards/rejected": -0.017056988552212715, "step": 1420 }, { "epoch": 0.49276361130255, "grad_norm": 1.6177619695663452, "learning_rate": 4.937463095214312e-08, "logits/chosen": -3.038665294647217, "logits/rejected": -3.0166945457458496, "logps/chosen": -54.52788162231445, "logps/rejected": -55.391517639160156, "loss": 0.688, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.009224084205925465, "rewards/margins": 0.010559674352407455, "rewards/rejected": -0.019783759489655495, "step": 1430 }, { "epoch": 0.4962095106822881, "grad_norm": 1.8677644729614258, "learning_rate": 4.935215425810699e-08, "logits/chosen": -3.1079931259155273, "logits/rejected": -3.082766056060791, "logps/chosen": -56.69069290161133, "logps/rejected": -53.928863525390625, "loss": 0.6884, "rewards/accuracies": 0.59375, "rewards/chosen": -0.009587946347892284, "rewards/margins": 0.009884225204586983, "rewards/rejected": -0.019472172483801842, "step": 1440 }, { "epoch": 0.4996554100620262, "grad_norm": 1.6572710275650024, "learning_rate": 4.9329286039927495e-08, "logits/chosen": -3.1142032146453857, "logits/rejected": -3.106661558151245, "logps/chosen": -53.715484619140625, "logps/rejected": -53.52259063720703, "loss": 0.6908, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.009926171973347664, "rewards/margins": 0.005030042491853237, "rewards/rejected": -0.014956213533878326, "step": 1450 }, { "epoch": 0.5031013094417643, "grad_norm": 1.644881010055542, "learning_rate": 4.930602666527063e-08, "logits/chosen": -3.0849814414978027, "logits/rejected": -3.076979160308838, "logps/chosen": -55.85205841064453, "logps/rejected": -57.56487274169922, "loss": 0.689, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.010134072974324226, "rewards/margins": 0.008481858298182487, "rewards/rejected": -0.018615933135151863, "step": 1460 }, { "epoch": 0.5065472088215024, "grad_norm": 1.5868796110153198, "learning_rate": 4.928237650809127e-08, "logits/chosen": -3.0454933643341064, "logits/rejected": -3.0128085613250732, "logps/chosen": -53.47612380981445, "logps/rejected": -53.11464309692383, "loss": 0.6889, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.01020341832190752, "rewards/margins": 0.008677304722368717, "rewards/rejected": -0.018880721181631088, "step": 1470 }, { "epoch": 0.5099931082012406, "grad_norm": 1.9091362953186035, "learning_rate": 4.925833594862714e-08, "logits/chosen": -3.075089454650879, "logits/rejected": -3.0520739555358887, "logps/chosen": -55.163734436035156, "logps/rejected": -55.09224319458008, "loss": 0.6891, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010291971266269684, "rewards/margins": 0.008484887890517712, "rewards/rejected": -0.01877685822546482, "step": 1480 }, { "epoch": 0.5134390075809786, "grad_norm": 1.6294002532958984, "learning_rate": 4.923390537339268e-08, "logits/chosen": -2.9856584072113037, "logits/rejected": -2.958491802215576, "logps/chosen": -55.59227752685547, "logps/rejected": -54.35834884643555, "loss": 0.6883, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.010039235465228558, "rewards/margins": 0.010070567950606346, "rewards/rejected": -0.02010980434715748, "step": 1490 }, { "epoch": 0.5168849069607168, "grad_norm": 1.5686553716659546, "learning_rate": 4.920908517517286e-08, "logits/chosen": -3.036954402923584, "logits/rejected": -3.0201165676116943, "logps/chosen": -55.33870315551758, "logps/rejected": -56.05976104736328, "loss": 0.6894, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0102760661393404, "rewards/margins": 0.007776142563670874, "rewards/rejected": -0.01805220916867256, "step": 1500 }, { "epoch": 0.5168849069607168, "eval_logits/chosen": -3.1431756019592285, "eval_logits/rejected": -3.1375293731689453, "eval_logps/chosen": -58.59446334838867, "eval_logps/rejected": -63.34099197387695, "eval_loss": 0.6918071508407593, "eval_rewards/accuracies": 0.5722583532333374, "eval_rewards/chosen": 0.0011742868227884173, "eval_rewards/margins": 0.002782951109111309, "eval_rewards/rejected": -0.0016086641699075699, "eval_runtime": 382.5631, "eval_samples_per_second": 11.25, "eval_steps_per_second": 1.406, "step": 1500 }, { "epoch": 0.5203308063404548, "grad_norm": 1.7908358573913574, "learning_rate": 4.918387575301683e-08, "logits/chosen": -3.111736297607422, "logits/rejected": -3.0841097831726074, "logps/chosen": -55.88440704345703, "logps/rejected": -53.73253631591797, "loss": 0.689, "rewards/accuracies": 0.59375, "rewards/chosen": -0.011713027954101562, "rewards/margins": 0.008561616763472557, "rewards/rejected": -0.02027464285492897, "step": 1510 }, { "epoch": 0.523776705720193, "grad_norm": 1.5355770587921143, "learning_rate": 4.915827751223157e-08, "logits/chosen": -3.107668399810791, "logits/rejected": -3.093592882156372, "logps/chosen": -54.899436950683594, "logps/rejected": -56.577857971191406, "loss": 0.6908, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.014680874533951283, "rewards/margins": 0.005072314292192459, "rewards/rejected": -0.019753187894821167, "step": 1520 }, { "epoch": 0.5272226050999311, "grad_norm": 1.701234221458435, "learning_rate": 4.913229086437528e-08, "logits/chosen": -3.036414861679077, "logits/rejected": -3.020360231399536, "logps/chosen": -55.82305145263672, "logps/rejected": -56.5231819152832, "loss": 0.691, "rewards/accuracies": 0.59375, "rewards/chosen": -0.01206111814826727, "rewards/margins": 0.004545740317553282, "rewards/rejected": -0.016606858000159264, "step": 1530 }, { "epoch": 0.5306685044796692, "grad_norm": 1.552851676940918, "learning_rate": 4.910591622725084e-08, "logits/chosen": -2.970153331756592, "logits/rejected": -2.948779582977295, "logps/chosen": -55.9726448059082, "logps/rejected": -56.580360412597656, "loss": 0.6901, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.015721255913376808, "rewards/margins": 0.006385567598044872, "rewards/rejected": -0.022106822580099106, "step": 1540 }, { "epoch": 0.5341144038594073, "grad_norm": 1.80674409866333, "learning_rate": 4.907915402489907e-08, "logits/chosen": -3.030345916748047, "logits/rejected": -3.0058493614196777, "logps/chosen": -54.95343780517578, "logps/rejected": -54.7541389465332, "loss": 0.6887, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.011871935799717903, "rewards/margins": 0.00929428543895483, "rewards/rejected": -0.021166222169995308, "step": 1550 }, { "epoch": 0.5375603032391454, "grad_norm": 1.7914681434631348, "learning_rate": 4.905200468759188e-08, "logits/chosen": -3.013397693634033, "logits/rejected": -3.0062289237976074, "logps/chosen": -54.798606872558594, "logps/rejected": -57.39946365356445, "loss": 0.6906, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.014869784004986286, "rewards/margins": 0.005359475500881672, "rewards/rejected": -0.020229259505867958, "step": 1560 }, { "epoch": 0.5410062026188835, "grad_norm": 1.6412312984466553, "learning_rate": 4.90244686518254e-08, "logits/chosen": -3.013441562652588, "logits/rejected": -3.0016720294952393, "logps/chosen": -53.26996994018555, "logps/rejected": -55.54787063598633, "loss": 0.6902, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.01380932331085205, "rewards/margins": 0.0062625827267766, "rewards/rejected": -0.020071903243660927, "step": 1570 }, { "epoch": 0.5444521019986216, "grad_norm": 1.6605091094970703, "learning_rate": 4.899654636031296e-08, "logits/chosen": -3.0434343814849854, "logits/rejected": -3.0141358375549316, "logps/chosen": -55.2335319519043, "logps/rejected": -55.4152946472168, "loss": 0.6867, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.009759004227817059, "rewards/margins": 0.013216088525950909, "rewards/rejected": -0.022975092753767967, "step": 1580 }, { "epoch": 0.5478980013783598, "grad_norm": 1.5235562324523926, "learning_rate": 4.896823826197791e-08, "logits/chosen": -3.065284252166748, "logits/rejected": -3.0299429893493652, "logps/chosen": -55.74760055541992, "logps/rejected": -52.83717727661133, "loss": 0.6873, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.011746985837817192, "rewards/margins": 0.012054530903697014, "rewards/rejected": -0.023801516741514206, "step": 1590 }, { "epoch": 0.5513439007580979, "grad_norm": 1.7952224016189575, "learning_rate": 4.8939544811946474e-08, "logits/chosen": -3.0523910522460938, "logits/rejected": -3.042415142059326, "logps/chosen": -55.09062957763672, "logps/rejected": -56.489952087402344, "loss": 0.6893, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.012774528935551643, "rewards/margins": 0.008137383498251438, "rewards/rejected": -0.020911911502480507, "step": 1600 }, { "epoch": 0.5513439007580979, "eval_logits/chosen": -3.1399619579315186, "eval_logits/rejected": -3.1342928409576416, "eval_logps/chosen": -58.628414154052734, "eval_logps/rejected": -63.430233001708984, "eval_loss": 0.6915410161018372, "eval_rewards/accuracies": 0.574117124080658, "eval_rewards/chosen": 0.000834810605738312, "eval_rewards/margins": 0.0033358593937009573, "eval_rewards/rejected": -0.00250104907900095, "eval_runtime": 383.1191, "eval_samples_per_second": 11.234, "eval_steps_per_second": 1.404, "step": 1600 }, { "epoch": 0.554789800137836, "grad_norm": 1.7195132970809937, "learning_rate": 4.8910466471540425e-08, "logits/chosen": -2.949537754058838, "logits/rejected": -2.934624433517456, "logps/chosen": -53.50822830200195, "logps/rejected": -54.822059631347656, "loss": 0.6895, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.013606156222522259, "rewards/margins": 0.007713697850704193, "rewards/rejected": -0.021319855004549026, "step": 1610 }, { "epoch": 0.5582356995175741, "grad_norm": 1.6432405710220337, "learning_rate": 4.88810037082696e-08, "logits/chosen": -3.0831003189086914, "logits/rejected": -3.062864303588867, "logps/chosen": -56.65190505981445, "logps/rejected": -56.72029495239258, "loss": 0.6894, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.014548885636031628, "rewards/margins": 0.008042259141802788, "rewards/rejected": -0.02259114384651184, "step": 1620 }, { "epoch": 0.5616815988973122, "grad_norm": 1.6284902095794678, "learning_rate": 4.885115699582447e-08, "logits/chosen": -3.043569326400757, "logits/rejected": -3.024181365966797, "logps/chosen": -53.45033645629883, "logps/rejected": -55.55952835083008, "loss": 0.6876, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.013963378965854645, "rewards/margins": 0.011394304223358631, "rewards/rejected": -0.0253576822578907, "step": 1630 }, { "epoch": 0.5651274982770503, "grad_norm": 1.6828316450119019, "learning_rate": 4.8820926814068483e-08, "logits/chosen": -3.109710693359375, "logits/rejected": -3.1024813652038574, "logps/chosen": -54.01237869262695, "logps/rejected": -57.6888427734375, "loss": 0.6893, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.01737230457365513, "rewards/margins": 0.008087287656962872, "rewards/rejected": -0.025459593161940575, "step": 1640 }, { "epoch": 0.5685733976567884, "grad_norm": 1.8313180208206177, "learning_rate": 4.879031364903033e-08, "logits/chosen": -3.0975849628448486, "logits/rejected": -3.070986270904541, "logps/chosen": -55.392799377441406, "logps/rejected": -55.93354415893555, "loss": 0.6877, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.013514397665858269, "rewards/margins": 0.01132169645279646, "rewards/rejected": -0.024836096912622452, "step": 1650 }, { "epoch": 0.5720192970365265, "grad_norm": 1.746750831604004, "learning_rate": 4.875931799289619e-08, "logits/chosen": -3.0053420066833496, "logits/rejected": -2.973118305206299, "logps/chosen": -54.811927795410156, "logps/rejected": -52.54011917114258, "loss": 0.6868, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.014087177813053131, "rewards/margins": 0.01309183519333601, "rewards/rejected": -0.027179012075066566, "step": 1660 }, { "epoch": 0.5754651964162646, "grad_norm": 1.7447357177734375, "learning_rate": 4.872794034400174e-08, "logits/chosen": -3.102569580078125, "logits/rejected": -3.077732563018799, "logps/chosen": -56.3289909362793, "logps/rejected": -56.41608810424805, "loss": 0.6868, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.012657213024795055, "rewards/margins": 0.01322676707059145, "rewards/rejected": -0.025883978232741356, "step": 1670 }, { "epoch": 0.5789110957960028, "grad_norm": 1.7729724645614624, "learning_rate": 4.8696181206824193e-08, "logits/chosen": -3.0259735584259033, "logits/rejected": -3.001051902770996, "logps/chosen": -56.8452033996582, "logps/rejected": -57.58673858642578, "loss": 0.6871, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.015482488088309765, "rewards/margins": 0.012606178410351276, "rewards/rejected": -0.028088664636015892, "step": 1680 }, { "epoch": 0.5823569951757409, "grad_norm": 1.6302522420883179, "learning_rate": 4.866404109197421e-08, "logits/chosen": -3.0682101249694824, "logits/rejected": -3.04693603515625, "logps/chosen": -54.665771484375, "logps/rejected": -54.72057342529297, "loss": 0.6874, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.014576872810721397, "rewards/margins": 0.011841942556202412, "rewards/rejected": -0.026418816298246384, "step": 1690 }, { "epoch": 0.585802894555479, "grad_norm": 1.7251896858215332, "learning_rate": 4.863152051618761e-08, "logits/chosen": -3.0659382343292236, "logits/rejected": -3.0330235958099365, "logps/chosen": -54.848663330078125, "logps/rejected": -54.21843338012695, "loss": 0.6871, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.015564699657261372, "rewards/margins": 0.012528747320175171, "rewards/rejected": -0.028093446046113968, "step": 1700 }, { "epoch": 0.585802894555479, "eval_logits/chosen": -3.1353001594543457, "eval_logits/rejected": -3.129624605178833, "eval_logps/chosen": -58.73965072631836, "eval_logps/rejected": -63.59201431274414, "eval_loss": 0.6913063526153564, "eval_rewards/accuracies": 0.5724906921386719, "eval_rewards/chosen": -0.0002775307802949101, "eval_rewards/margins": 0.0038414020091295242, "eval_rewards/rejected": -0.004118932876735926, "eval_runtime": 383.0111, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 1700 }, { "epoch": 0.5892487939352171, "grad_norm": 1.933193325996399, "learning_rate": 4.8598620002317147e-08, "logits/chosen": -2.9904685020446777, "logits/rejected": -2.9636688232421875, "logps/chosen": -52.75788497924805, "logps/rejected": -53.07261276245117, "loss": 0.6869, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.015876036137342453, "rewards/margins": 0.013014654628932476, "rewards/rejected": -0.028890687972307205, "step": 1710 }, { "epoch": 0.5926946933149552, "grad_norm": 1.7606897354125977, "learning_rate": 4.856534007932405e-08, "logits/chosen": -3.045719623565674, "logits/rejected": -3.0275542736053467, "logps/chosen": -53.83793258666992, "logps/rejected": -56.604957580566406, "loss": 0.6883, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.018810544162988663, "rewards/margins": 0.010075687430799007, "rewards/rejected": -0.028886232525110245, "step": 1720 }, { "epoch": 0.5961405926946933, "grad_norm": 1.613088846206665, "learning_rate": 4.853168128226953e-08, "logits/chosen": -3.1440420150756836, "logits/rejected": -3.1172289848327637, "logps/chosen": -57.25090408325195, "logps/rejected": -56.80400848388672, "loss": 0.6881, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.014714662916958332, "rewards/margins": 0.010589836165308952, "rewards/rejected": -0.02530449628829956, "step": 1730 }, { "epoch": 0.5995864920744314, "grad_norm": 1.762083649635315, "learning_rate": 4.8497644152306206e-08, "logits/chosen": -3.075286626815796, "logits/rejected": -3.051445484161377, "logps/chosen": -56.94948196411133, "logps/rejected": -55.8900032043457, "loss": 0.6869, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.014870844781398773, "rewards/margins": 0.012934574857354164, "rewards/rejected": -0.027805421501398087, "step": 1740 }, { "epoch": 0.6030323914541695, "grad_norm": 1.5474236011505127, "learning_rate": 4.8463229236669355e-08, "logits/chosen": -3.020326614379883, "logits/rejected": -3.0016889572143555, "logps/chosen": -54.87323760986328, "logps/rejected": -57.5669059753418, "loss": 0.6872, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0173720084130764, "rewards/margins": 0.01237417757511139, "rewards/rejected": -0.02974618598818779, "step": 1750 }, { "epoch": 0.6064782908339077, "grad_norm": 1.683550238609314, "learning_rate": 4.842843708866815e-08, "logits/chosen": -2.953479290008545, "logits/rejected": -2.930711507797241, "logps/chosen": -56.527130126953125, "logps/rejected": -54.67262649536133, "loss": 0.6884, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.020763549953699112, "rewards/margins": 0.009913107380270958, "rewards/rejected": -0.03067665919661522, "step": 1760 }, { "epoch": 0.6099241902136457, "grad_norm": 1.8445039987564087, "learning_rate": 4.8393268267676766e-08, "logits/chosen": -3.0489084720611572, "logits/rejected": -3.0148653984069824, "logps/chosen": -56.586753845214844, "logps/rejected": -55.32324981689453, "loss": 0.6863, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.016516460105776787, "rewards/margins": 0.014310793951153755, "rewards/rejected": -0.030827254056930542, "step": 1770 }, { "epoch": 0.6133700895933839, "grad_norm": 1.6412739753723145, "learning_rate": 4.835772333912535e-08, "logits/chosen": -3.015958070755005, "logits/rejected": -2.9972171783447266, "logps/chosen": -55.84075164794922, "logps/rejected": -55.65936279296875, "loss": 0.6884, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02109210193157196, "rewards/margins": 0.01000445056706667, "rewards/rejected": -0.031096553429961205, "step": 1780 }, { "epoch": 0.616815988973122, "grad_norm": 1.7817399501800537, "learning_rate": 4.832180287449098e-08, "logits/chosen": -3.095059633255005, "logits/rejected": -3.0659828186035156, "logps/chosen": -59.23529052734375, "logps/rejected": -58.51654052734375, "loss": 0.6861, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.017015349119901657, "rewards/margins": 0.01468147523701191, "rewards/rejected": -0.031696826219558716, "step": 1790 }, { "epoch": 0.6202618883528601, "grad_norm": 1.8000974655151367, "learning_rate": 4.8285507451288445e-08, "logits/chosen": -3.039083957672119, "logits/rejected": -3.0187718868255615, "logps/chosen": -56.724571228027344, "logps/rejected": -56.069847106933594, "loss": 0.6879, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02188548445701599, "rewards/margins": 0.011079026386141777, "rewards/rejected": -0.03296450898051262, "step": 1800 }, { "epoch": 0.6202618883528601, "eval_logits/chosen": -3.131194829940796, "eval_logits/rejected": -3.125533103942871, "eval_logps/chosen": -58.872982025146484, "eval_logps/rejected": -63.792110443115234, "eval_loss": 0.6909948587417603, "eval_rewards/accuracies": 0.5764405131340027, "eval_rewards/chosen": -0.0016108807176351547, "eval_rewards/margins": 0.004509021062403917, "eval_rewards/rejected": -0.006119901780039072, "eval_runtime": 382.7969, "eval_samples_per_second": 11.244, "eval_steps_per_second": 1.405, "step": 1800 }, { "epoch": 0.6237077877325982, "grad_norm": 1.7410837411880493, "learning_rate": 4.824883765306095e-08, "logits/chosen": -2.975238800048828, "logits/rejected": -2.9546613693237305, "logps/chosen": -55.50226593017578, "logps/rejected": -54.74309539794922, "loss": 0.6874, "rewards/accuracies": 0.609375, "rewards/chosen": -0.020017337054014206, "rewards/margins": 0.012218406423926353, "rewards/rejected": -0.03223574161529541, "step": 1810 }, { "epoch": 0.6271536871123363, "grad_norm": 1.7493706941604614, "learning_rate": 4.821179406937077e-08, "logits/chosen": -3.0868115425109863, "logits/rejected": -3.0760278701782227, "logps/chosen": -54.96649169921875, "logps/rejected": -59.240135192871094, "loss": 0.6863, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.018745791167020798, "rewards/margins": 0.014332296326756477, "rewards/rejected": -0.033078085631132126, "step": 1820 }, { "epoch": 0.6305995864920745, "grad_norm": 1.7735671997070312, "learning_rate": 4.817437729578975e-08, "logits/chosen": -3.0417940616607666, "logits/rejected": -3.029489040374756, "logps/chosen": -54.360626220703125, "logps/rejected": -57.020545959472656, "loss": 0.688, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.018904857337474823, "rewards/margins": 0.01093919575214386, "rewards/rejected": -0.029844054952263832, "step": 1830 }, { "epoch": 0.6340454858718125, "grad_norm": 1.6280043125152588, "learning_rate": 4.813658793388974e-08, "logits/chosen": -3.0368285179138184, "logits/rejected": -3.0022246837615967, "logps/chosen": -55.715660095214844, "logps/rejected": -54.705528259277344, "loss": 0.6863, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0195215605199337, "rewards/margins": 0.014291221275925636, "rewards/rejected": -0.033812783658504486, "step": 1840 }, { "epoch": 0.6374913852515507, "grad_norm": 1.6846120357513428, "learning_rate": 4.809842659123288e-08, "logits/chosen": -3.0029711723327637, "logits/rejected": -2.993957042694092, "logps/chosen": -56.678871154785156, "logps/rejected": -62.590484619140625, "loss": 0.6859, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.01833237335085869, "rewards/margins": 0.015166716650128365, "rewards/rejected": -0.0334990918636322, "step": 1850 }, { "epoch": 0.6409372846312887, "grad_norm": 1.7529977560043335, "learning_rate": 4.8059893881361906e-08, "logits/chosen": -3.0661416053771973, "logits/rejected": -3.0407485961914062, "logps/chosen": -54.817100524902344, "logps/rejected": -55.76042556762695, "loss": 0.6854, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.019343797117471695, "rewards/margins": 0.01616746559739113, "rewards/rejected": -0.035511262714862823, "step": 1860 }, { "epoch": 0.6443831840110269, "grad_norm": 1.8380669355392456, "learning_rate": 4.802099042379023e-08, "logits/chosen": -3.033001184463501, "logits/rejected": -3.0092084407806396, "logps/chosen": -56.14704513549805, "logps/rejected": -57.071266174316406, "loss": 0.6868, "rewards/accuracies": 0.609375, "rewards/chosen": -0.021707097068428993, "rewards/margins": 0.013430085964500904, "rewards/rejected": -0.03513718023896217, "step": 1870 }, { "epoch": 0.647829083390765, "grad_norm": 1.7067967653274536, "learning_rate": 4.7981716843992e-08, "logits/chosen": -3.081488847732544, "logits/rejected": -3.0656135082244873, "logps/chosen": -56.795921325683594, "logps/rejected": -58.1173095703125, "loss": 0.6868, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.021997634321451187, "rewards/margins": 0.013481196947395802, "rewards/rejected": -0.035478830337524414, "step": 1880 }, { "epoch": 0.6512749827705031, "grad_norm": 1.6388963460922241, "learning_rate": 4.794207377339204e-08, "logits/chosen": -3.050034284591675, "logits/rejected": -3.020045757293701, "logps/chosen": -55.989768981933594, "logps/rejected": -55.39397048950195, "loss": 0.6873, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.02179611101746559, "rewards/margins": 0.012388350442051888, "rewards/rejected": -0.03418446332216263, "step": 1890 }, { "epoch": 0.6547208821502413, "grad_norm": 1.9910722970962524, "learning_rate": 4.79020618493557e-08, "logits/chosen": -3.035548686981201, "logits/rejected": -3.004786729812622, "logps/chosen": -57.925025939941406, "logps/rejected": -55.33048629760742, "loss": 0.6869, "rewards/accuracies": 0.609375, "rewards/chosen": -0.02084331214427948, "rewards/margins": 0.013170194812119007, "rewards/rejected": -0.03401350602507591, "step": 1900 }, { "epoch": 0.6547208821502413, "eval_logits/chosen": -3.1266491413116455, "eval_logits/rejected": -3.121004819869995, "eval_logps/chosen": -59.04256820678711, "eval_logps/rejected": -64.01151275634766, "eval_loss": 0.6907696723937988, "eval_rewards/accuracies": 0.5803903341293335, "eval_rewards/chosen": -0.003306694095954299, "eval_rewards/margins": 0.005007210187613964, "eval_rewards/rejected": -0.008313903585076332, "eval_runtime": 382.5952, "eval_samples_per_second": 11.249, "eval_steps_per_second": 1.406, "step": 1900 }, { "epoch": 0.6581667815299793, "grad_norm": 1.7251228094100952, "learning_rate": 4.7861681715178594e-08, "logits/chosen": -3.05419921875, "logits/rejected": -3.0233535766601562, "logps/chosen": -57.729942321777344, "logps/rejected": -56.08539581298828, "loss": 0.6857, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.01946786418557167, "rewards/margins": 0.015628252178430557, "rewards/rejected": -0.03509611636400223, "step": 1910 }, { "epoch": 0.6616126809097175, "grad_norm": 1.6795231103897095, "learning_rate": 4.782093402007628e-08, "logits/chosen": -3.0454723834991455, "logits/rejected": -3.015894889831543, "logps/chosen": -57.969383239746094, "logps/rejected": -58.64679718017578, "loss": 0.6843, "rewards/accuracies": 0.640625, "rewards/chosen": -0.015876373276114464, "rewards/margins": 0.018631666898727417, "rewards/rejected": -0.03450804203748703, "step": 1920 }, { "epoch": 0.6650585802894555, "grad_norm": 1.8975529670715332, "learning_rate": 4.777981941917383e-08, "logits/chosen": -3.1397838592529297, "logits/rejected": -3.1188154220581055, "logps/chosen": -58.13710403442383, "logps/rejected": -56.817955017089844, "loss": 0.6882, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02396335080265999, "rewards/margins": 0.010672876611351967, "rewards/rejected": -0.034636225551366806, "step": 1930 }, { "epoch": 0.6685044796691937, "grad_norm": 1.7755709886550903, "learning_rate": 4.773833857349525e-08, "logits/chosen": -3.0498204231262207, "logits/rejected": -3.039801597595215, "logps/chosen": -57.210662841796875, "logps/rejected": -57.98060989379883, "loss": 0.6882, "rewards/accuracies": 0.59375, "rewards/chosen": -0.023447440937161446, "rewards/margins": 0.010580571368336678, "rewards/rejected": -0.03402801230549812, "step": 1940 }, { "epoch": 0.6719503790489317, "grad_norm": 1.766200065612793, "learning_rate": 4.7696492149952907e-08, "logits/chosen": -3.098604679107666, "logits/rejected": -3.0564684867858887, "logps/chosen": -57.542076110839844, "logps/rejected": -55.08195114135742, "loss": 0.6843, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.020345885306596756, "rewards/margins": 0.01842666044831276, "rewards/rejected": -0.038772545754909515, "step": 1950 }, { "epoch": 0.6753962784286699, "grad_norm": 1.7190169095993042, "learning_rate": 4.765428082133675e-08, "logits/chosen": -3.096452474594116, "logits/rejected": -3.0651845932006836, "logps/chosen": -59.2999267578125, "logps/rejected": -56.54218673706055, "loss": 0.6843, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.02144535258412361, "rewards/margins": 0.018707148730754852, "rewards/rejected": -0.040152497589588165, "step": 1960 }, { "epoch": 0.6788421778084079, "grad_norm": 1.9399358034133911, "learning_rate": 4.761170526630357e-08, "logits/chosen": -3.0438525676727295, "logits/rejected": -3.031733274459839, "logps/chosen": -54.611976623535156, "logps/rejected": -56.984405517578125, "loss": 0.686, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.022330697625875473, "rewards/margins": 0.015112390741705894, "rewards/rejected": -0.03744309023022652, "step": 1970 }, { "epoch": 0.6822880771881461, "grad_norm": 1.9661074876785278, "learning_rate": 4.756876616936601e-08, "logits/chosen": -3.0101959705352783, "logits/rejected": -2.987222909927368, "logps/chosen": -56.37892532348633, "logps/rejected": -59.14263916015625, "loss": 0.6851, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02290673367679119, "rewards/margins": 0.016811393201351166, "rewards/rejected": -0.039718128740787506, "step": 1980 }, { "epoch": 0.6857339765678843, "grad_norm": 1.6887887716293335, "learning_rate": 4.752546422088161e-08, "logits/chosen": -3.013328790664673, "logits/rejected": -2.9913642406463623, "logps/chosen": -56.39069747924805, "logps/rejected": -57.0994987487793, "loss": 0.687, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02684500813484192, "rewards/margins": 0.013143276795744896, "rewards/rejected": -0.039988286793231964, "step": 1990 }, { "epoch": 0.6891798759476223, "grad_norm": 1.8468791246414185, "learning_rate": 4.748180011704166e-08, "logits/chosen": -3.098635673522949, "logits/rejected": -3.0738401412963867, "logps/chosen": -57.67232131958008, "logps/rejected": -57.917266845703125, "loss": 0.6863, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.024618661031126976, "rewards/margins": 0.014407900162041187, "rewards/rejected": -0.03902656212449074, "step": 2000 }, { "epoch": 0.6891798759476223, "eval_logits/chosen": -3.1211769580841064, "eval_logits/rejected": -3.115485668182373, "eval_logps/chosen": -59.301353454589844, "eval_logps/rejected": -64.3387680053711, "eval_loss": 0.6904570460319519, "eval_rewards/accuracies": 0.5799256563186646, "eval_rewards/chosen": -0.005894585512578487, "eval_rewards/margins": 0.0056918649934232235, "eval_rewards/rejected": -0.011586450971662998, "eval_runtime": 383.4778, "eval_samples_per_second": 11.224, "eval_steps_per_second": 1.403, "step": 2000 }, { "epoch": 0.6926257753273605, "grad_norm": 1.900058627128601, "learning_rate": 4.7437774559860097e-08, "logits/chosen": -3.020214796066284, "logits/rejected": -3.008654832839966, "logps/chosen": -52.79386520385742, "logps/rejected": -57.43042755126953, "loss": 0.6852, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.026914075016975403, "rewards/margins": 0.01667412556707859, "rewards/rejected": -0.043588198721408844, "step": 2010 }, { "epoch": 0.6960716747070985, "grad_norm": 1.8659268617630005, "learning_rate": 4.7393388257162105e-08, "logits/chosen": -3.06660795211792, "logits/rejected": -3.0496487617492676, "logps/chosen": -58.541046142578125, "logps/rejected": -58.7953987121582, "loss": 0.6863, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.024179494008421898, "rewards/margins": 0.014569652266800404, "rewards/rejected": -0.038749147206544876, "step": 2020 }, { "epoch": 0.6995175740868367, "grad_norm": 1.732512354850769, "learning_rate": 4.7348641922572805e-08, "logits/chosen": -3.018547296524048, "logits/rejected": -2.993462562561035, "logps/chosen": -56.4245719909668, "logps/rejected": -57.091590881347656, "loss": 0.6854, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.029620209708809853, "rewards/margins": 0.01647775247693062, "rewards/rejected": -0.04609795659780502, "step": 2030 }, { "epoch": 0.7029634734665747, "grad_norm": 1.9546312093734741, "learning_rate": 4.730353627550579e-08, "logits/chosen": -3.0118885040283203, "logits/rejected": -2.9969117641448975, "logps/chosen": -55.87482833862305, "logps/rejected": -57.9645881652832, "loss": 0.6861, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.030089732259511948, "rewards/margins": 0.015055355615913868, "rewards/rejected": -0.04514508321881294, "step": 2040 }, { "epoch": 0.7064093728463129, "grad_norm": 1.6274358034133911, "learning_rate": 4.7258072041151496e-08, "logits/chosen": -3.0556788444519043, "logits/rejected": -3.0257270336151123, "logps/chosen": -56.53417205810547, "logps/rejected": -56.3917236328125, "loss": 0.683, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.023738224059343338, "rewards/margins": 0.021326405927538872, "rewards/rejected": -0.04506463557481766, "step": 2050 }, { "epoch": 0.709855272226051, "grad_norm": 1.8148305416107178, "learning_rate": 4.721224995046562e-08, "logits/chosen": -2.9818592071533203, "logits/rejected": -2.977569103240967, "logps/chosen": -56.40079879760742, "logps/rejected": -57.645782470703125, "loss": 0.6881, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.033871449530124664, "rewards/margins": 0.01113309245556593, "rewards/rejected": -0.04500454291701317, "step": 2060 }, { "epoch": 0.7133011716057891, "grad_norm": 1.704732060432434, "learning_rate": 4.716607074015729e-08, "logits/chosen": -3.063542366027832, "logits/rejected": -3.0377025604248047, "logps/chosen": -57.37458038330078, "logps/rejected": -56.005638122558594, "loss": 0.6848, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02627789042890072, "rewards/margins": 0.017597852274775505, "rewards/rejected": -0.04387574642896652, "step": 2070 }, { "epoch": 0.7167470709855273, "grad_norm": 1.6750351190567017, "learning_rate": 4.7119535152677293e-08, "logits/chosen": -3.0322868824005127, "logits/rejected": -3.0030274391174316, "logps/chosen": -57.85790252685547, "logps/rejected": -55.57404708862305, "loss": 0.6836, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.030979687348008156, "rewards/margins": 0.020082242786884308, "rewards/rejected": -0.05106193572282791, "step": 2080 }, { "epoch": 0.7201929703652653, "grad_norm": 1.719059705734253, "learning_rate": 4.707264393620608e-08, "logits/chosen": -3.030794858932495, "logits/rejected": -3.019486427307129, "logps/chosen": -56.23786163330078, "logps/rejected": -59.121315002441406, "loss": 0.687, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.028670620173215866, "rewards/margins": 0.013171094469726086, "rewards/rejected": -0.041841715574264526, "step": 2090 }, { "epoch": 0.7236388697450035, "grad_norm": 1.942506194114685, "learning_rate": 4.7025397844641777e-08, "logits/chosen": -3.0369648933410645, "logits/rejected": -3.0216054916381836, "logps/chosen": -56.1086311340332, "logps/rejected": -57.0100212097168, "loss": 0.685, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.02721325121819973, "rewards/margins": 0.017340566962957382, "rewards/rejected": -0.04455381631851196, "step": 2100 }, { "epoch": 0.7236388697450035, "eval_logits/chosen": -3.1153643131256104, "eval_logits/rejected": -3.109682321548462, "eval_logps/chosen": -59.5750732421875, "eval_logps/rejected": -64.68336486816406, "eval_loss": 0.6901406049728394, "eval_rewards/accuracies": 0.5915427803993225, "eval_rewards/chosen": -0.008631750009953976, "eval_rewards/margins": 0.00640072813257575, "eval_rewards/rejected": -0.015032477676868439, "eval_runtime": 382.9653, "eval_samples_per_second": 11.239, "eval_steps_per_second": 1.405, "step": 2100 }, { "epoch": 0.7270847691247415, "grad_norm": 1.7648887634277344, "learning_rate": 4.6977797637588054e-08, "logits/chosen": -2.970499038696289, "logits/rejected": -2.949388265609741, "logps/chosen": -56.93970489501953, "logps/rejected": -57.76972198486328, "loss": 0.6844, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.031819865107536316, "rewards/margins": 0.018476802855730057, "rewards/rejected": -0.050296664237976074, "step": 2110 }, { "epoch": 0.7305306685044797, "grad_norm": 1.7377597093582153, "learning_rate": 4.692984408034188e-08, "logits/chosen": -3.033653736114502, "logits/rejected": -3.010927677154541, "logps/chosen": -58.04410934448242, "logps/rejected": -60.161460876464844, "loss": 0.6831, "rewards/accuracies": 0.625, "rewards/chosen": -0.030190179124474525, "rewards/margins": 0.02115626633167267, "rewards/rejected": -0.051346443593502045, "step": 2120 }, { "epoch": 0.7339765678842178, "grad_norm": 1.7860560417175293, "learning_rate": 4.688153794388129e-08, "logits/chosen": -2.986889600753784, "logits/rejected": -2.970906972885132, "logps/chosen": -58.03997039794922, "logps/rejected": -58.46440887451172, "loss": 0.6865, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.03251488506793976, "rewards/margins": 0.01451830007135868, "rewards/rejected": -0.04703318327665329, "step": 2130 }, { "epoch": 0.7374224672639559, "grad_norm": 1.7312990427017212, "learning_rate": 4.6832880004852905e-08, "logits/chosen": -2.974548816680908, "logits/rejected": -2.9482405185699463, "logps/chosen": -54.5570068359375, "logps/rejected": -56.208251953125, "loss": 0.6836, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.028940200805664062, "rewards/margins": 0.020129632204771042, "rewards/rejected": -0.049069829285144806, "step": 2140 }, { "epoch": 0.740868366643694, "grad_norm": 1.7594743967056274, "learning_rate": 4.678387104555949e-08, "logits/chosen": -3.0208096504211426, "logits/rejected": -3.003213405609131, "logps/chosen": -57.1629638671875, "logps/rejected": -57.227821350097656, "loss": 0.6855, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.032648637890815735, "rewards/margins": 0.01623198576271534, "rewards/rejected": -0.048880621790885925, "step": 2150 }, { "epoch": 0.7443142660234321, "grad_norm": 1.6323630809783936, "learning_rate": 4.673451185394741e-08, "logits/chosen": -3.0157551765441895, "logits/rejected": -2.992008686065674, "logps/chosen": -59.365943908691406, "logps/rejected": -58.07090377807617, "loss": 0.684, "rewards/accuracies": 0.625, "rewards/chosen": -0.030189761891961098, "rewards/margins": 0.019196823239326477, "rewards/rejected": -0.049386583268642426, "step": 2160 }, { "epoch": 0.7477601654031703, "grad_norm": 1.6014976501464844, "learning_rate": 4.6684803223593885e-08, "logits/chosen": -2.9624719619750977, "logits/rejected": -2.9369637966156006, "logps/chosen": -56.25724411010742, "logps/rejected": -55.88273239135742, "loss": 0.6851, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.04013683646917343, "rewards/margins": 0.017235979437828064, "rewards/rejected": -0.057372815907001495, "step": 2170 }, { "epoch": 0.7512060647829083, "grad_norm": 1.7938283681869507, "learning_rate": 4.6634745953694276e-08, "logits/chosen": -3.0021328926086426, "logits/rejected": -2.9840962886810303, "logps/chosen": -55.67314529418945, "logps/rejected": -57.486183166503906, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": -0.03332405909895897, "rewards/margins": 0.017307622358202934, "rewards/rejected": -0.05063168331980705, "step": 2180 }, { "epoch": 0.7546519641626465, "grad_norm": 1.657092571258545, "learning_rate": 4.6584340849049254e-08, "logits/chosen": -3.0505223274230957, "logits/rejected": -3.0312113761901855, "logps/chosen": -57.89087677001953, "logps/rejected": -58.30964279174805, "loss": 0.6858, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.032597921788692474, "rewards/margins": 0.015607580542564392, "rewards/rejected": -0.048205506056547165, "step": 2190 }, { "epoch": 0.7580978635423845, "grad_norm": 1.6630610227584839, "learning_rate": 4.653358872005182e-08, "logits/chosen": -2.996549129486084, "logits/rejected": -2.980790138244629, "logps/chosen": -58.564979553222656, "logps/rejected": -59.381103515625, "loss": 0.6865, "rewards/accuracies": 0.609375, "rewards/chosen": -0.0349402017891407, "rewards/margins": 0.014279574155807495, "rewards/rejected": -0.049219775944948196, "step": 2200 }, { "epoch": 0.7580978635423845, "eval_logits/chosen": -3.110973834991455, "eval_logits/rejected": -3.105285167694092, "eval_logps/chosen": -59.876705169677734, "eval_logps/rejected": -65.0448226928711, "eval_loss": 0.6898779273033142, "eval_rewards/accuracies": 0.5829461216926575, "eval_rewards/chosen": -0.01164813619107008, "eval_rewards/margins": 0.006998822558671236, "eval_rewards/rejected": -0.018646959215402603, "eval_runtime": 382.8227, "eval_samples_per_second": 11.243, "eval_steps_per_second": 1.405, "step": 2200 }, { "epoch": 0.7615437629221227, "grad_norm": 1.6787549257278442, "learning_rate": 4.648249038267429e-08, "logits/chosen": -3.031440258026123, "logits/rejected": -3.0008792877197266, "logps/chosen": -58.2042121887207, "logps/rejected": -58.29167938232422, "loss": 0.6828, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.03350379690527916, "rewards/margins": 0.02191033586859703, "rewards/rejected": -0.05541413277387619, "step": 2210 }, { "epoch": 0.7649896623018608, "grad_norm": 1.7904587984085083, "learning_rate": 4.6431046658455185e-08, "logits/chosen": -3.0165257453918457, "logits/rejected": -2.984936237335205, "logps/chosen": -60.2543830871582, "logps/rejected": -58.5495719909668, "loss": 0.6851, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03662030026316643, "rewards/margins": 0.017155960202217102, "rewards/rejected": -0.05377625674009323, "step": 2220 }, { "epoch": 0.7684355616815989, "grad_norm": 1.7944724559783936, "learning_rate": 4.6379258374486015e-08, "logits/chosen": -2.9792890548706055, "logits/rejected": -2.968996047973633, "logps/chosen": -58.11162567138672, "logps/rejected": -58.761474609375, "loss": 0.6865, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.038880012929439545, "rewards/margins": 0.014500935561954975, "rewards/rejected": -0.05338094383478165, "step": 2230 }, { "epoch": 0.771881461061337, "grad_norm": 1.671658992767334, "learning_rate": 4.632712636339798e-08, "logits/chosen": -3.0855040550231934, "logits/rejected": -3.0671868324279785, "logps/chosen": -57.3691291809082, "logps/rejected": -56.94904708862305, "loss": 0.6847, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03902739658951759, "rewards/margins": 0.01803675666451454, "rewards/rejected": -0.057064153254032135, "step": 2240 }, { "epoch": 0.7753273604410751, "grad_norm": 1.8935986757278442, "learning_rate": 4.6274651463348615e-08, "logits/chosen": -2.9964747428894043, "logits/rejected": -2.9754271507263184, "logps/chosen": -57.55412673950195, "logps/rejected": -56.86481475830078, "loss": 0.6855, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.038160599768161774, "rewards/margins": 0.01648407243192196, "rewards/rejected": -0.05464467406272888, "step": 2250 }, { "epoch": 0.7787732598208132, "grad_norm": 1.7745702266693115, "learning_rate": 4.622183451800822e-08, "logits/chosen": -3.082556962966919, "logits/rejected": -3.0577738285064697, "logps/chosen": -56.99103927612305, "logps/rejected": -58.51776123046875, "loss": 0.6852, "rewards/accuracies": 0.578125, "rewards/chosen": -0.03975795954465866, "rewards/margins": 0.0169279333204031, "rewards/rejected": -0.05668588727712631, "step": 2260 }, { "epoch": 0.7822191592005513, "grad_norm": 1.9160572290420532, "learning_rate": 4.6168676376546436e-08, "logits/chosen": -2.996509552001953, "logits/rejected": -2.9861018657684326, "logps/chosen": -57.43719482421875, "logps/rejected": -57.882041931152344, "loss": 0.6862, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.04184093326330185, "rewards/margins": 0.01489521749317646, "rewards/rejected": -0.05673614889383316, "step": 2270 }, { "epoch": 0.7856650585802895, "grad_norm": 1.8574970960617065, "learning_rate": 4.6115177893618475e-08, "logits/chosen": -3.0045154094696045, "logits/rejected": -2.974029064178467, "logps/chosen": -59.52744674682617, "logps/rejected": -59.06389617919922, "loss": 0.6808, "rewards/accuracies": 0.671875, "rewards/chosen": -0.032396864145994186, "rewards/margins": 0.02583927847445011, "rewards/rejected": -0.05823614448308945, "step": 2280 }, { "epoch": 0.7891109579600276, "grad_norm": 1.7958654165267944, "learning_rate": 4.6061339929351424e-08, "logits/chosen": -2.9667840003967285, "logits/rejected": -2.94087290763855, "logps/chosen": -56.38531494140625, "logps/rejected": -57.443878173828125, "loss": 0.6837, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.037103235721588135, "rewards/margins": 0.01976102404296398, "rewards/rejected": -0.056864261627197266, "step": 2290 }, { "epoch": 0.7925568573397657, "grad_norm": 1.7183456420898438, "learning_rate": 4.600716334933043e-08, "logits/chosen": -2.9908032417297363, "logits/rejected": -2.966712236404419, "logps/chosen": -58.391075134277344, "logps/rejected": -59.49479293823242, "loss": 0.6841, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.04266386479139328, "rewards/margins": 0.01949688233435154, "rewards/rejected": -0.06216074898838997, "step": 2300 }, { "epoch": 0.7925568573397657, "eval_logits/chosen": -3.1065871715545654, "eval_logits/rejected": -3.10090970993042, "eval_logps/chosen": -60.260704040527344, "eval_logps/rejected": -65.50055694580078, "eval_loss": 0.6895676851272583, "eval_rewards/accuracies": 0.586663544178009, "eval_rewards/chosen": -0.015488112345337868, "eval_rewards/margins": 0.007716240826994181, "eval_rewards/rejected": -0.02320435456931591, "eval_runtime": 383.0409, "eval_samples_per_second": 11.236, "eval_steps_per_second": 1.405, "step": 2300 }, { "epoch": 0.7960027567195038, "grad_norm": 1.9360281229019165, "learning_rate": 4.595264902458476e-08, "logits/chosen": -2.9780056476593018, "logits/rejected": -2.960970401763916, "logps/chosen": -57.8055419921875, "logps/rejected": -59.35618209838867, "loss": 0.685, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.043772030621767044, "rewards/margins": 0.017492057755589485, "rewards/rejected": -0.06126408651471138, "step": 2310 }, { "epoch": 0.7994486560992419, "grad_norm": 1.8774693012237549, "learning_rate": 4.5897797831573805e-08, "logits/chosen": -3.0374228954315186, "logits/rejected": -3.012843132019043, "logps/chosen": -60.29777908325195, "logps/rejected": -62.17204666137695, "loss": 0.6846, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.04278632253408432, "rewards/margins": 0.018203191459178925, "rewards/rejected": -0.060989510267972946, "step": 2320 }, { "epoch": 0.80289455547898, "grad_norm": 1.8387089967727661, "learning_rate": 4.584261065217299e-08, "logits/chosen": -3.0314106941223145, "logits/rejected": -3.00540828704834, "logps/chosen": -58.178260803222656, "logps/rejected": -58.72043991088867, "loss": 0.6826, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.03949223458766937, "rewards/margins": 0.02242438867688179, "rewards/rejected": -0.061916619539260864, "step": 2330 }, { "epoch": 0.8063404548587181, "grad_norm": 1.6642886400222778, "learning_rate": 4.5787088373659585e-08, "logits/chosen": -3.0131068229675293, "logits/rejected": -2.984947919845581, "logps/chosen": -56.55591583251953, "logps/rejected": -56.63920974731445, "loss": 0.6831, "rewards/accuracies": 0.609375, "rewards/chosen": -0.040969397872686386, "rewards/margins": 0.02148524858057499, "rewards/rejected": -0.062454648315906525, "step": 2340 }, { "epoch": 0.8097863542384562, "grad_norm": 1.8320451974868774, "learning_rate": 4.5731231888698477e-08, "logits/chosen": -3.0359046459198, "logits/rejected": -3.021085739135742, "logps/chosen": -58.619239807128906, "logps/rejected": -61.617218017578125, "loss": 0.685, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.048705123364925385, "rewards/margins": 0.01751719042658806, "rewards/rejected": -0.06622230261564255, "step": 2350 }, { "epoch": 0.8132322536181944, "grad_norm": 1.8368377685546875, "learning_rate": 4.5675042095327745e-08, "logits/chosen": -3.010652780532837, "logits/rejected": -2.9759984016418457, "logps/chosen": -61.40837860107422, "logps/rejected": -59.10234832763672, "loss": 0.6807, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.0403360053896904, "rewards/margins": 0.02660386636853218, "rewards/rejected": -0.06693986803293228, "step": 2360 }, { "epoch": 0.8166781529979324, "grad_norm": 2.0085177421569824, "learning_rate": 4.56185198969443e-08, "logits/chosen": -2.9833760261535645, "logits/rejected": -2.9649837017059326, "logps/chosen": -58.53350830078125, "logps/rejected": -58.768898010253906, "loss": 0.6849, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.04439615085721016, "rewards/margins": 0.017823565751314163, "rewards/rejected": -0.06221971660852432, "step": 2370 }, { "epoch": 0.8201240523776706, "grad_norm": 2.010369062423706, "learning_rate": 4.556166620228933e-08, "logits/chosen": -2.9958512783050537, "logits/rejected": -2.975276231765747, "logps/chosen": -57.974403381347656, "logps/rejected": -59.36014938354492, "loss": 0.6826, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.04042334854602814, "rewards/margins": 0.022420626133680344, "rewards/rejected": -0.06284397095441818, "step": 2380 }, { "epoch": 0.8235699517574087, "grad_norm": 1.9562764167785645, "learning_rate": 4.5504481925433655e-08, "logits/chosen": -2.9959921836853027, "logits/rejected": -2.97601056098938, "logps/chosen": -60.0582160949707, "logps/rejected": -61.117103576660156, "loss": 0.6837, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.045281074941158295, "rewards/margins": 0.020175466313958168, "rewards/rejected": -0.06545653194189072, "step": 2390 }, { "epoch": 0.8270158511371468, "grad_norm": 1.9459636211395264, "learning_rate": 4.5446967985763096e-08, "logits/chosen": -2.996704578399658, "logits/rejected": -2.9758782386779785, "logps/chosen": -59.60308074951172, "logps/rejected": -58.083961486816406, "loss": 0.6847, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.04709639772772789, "rewards/margins": 0.01816444844007492, "rewards/rejected": -0.06526084989309311, "step": 2400 }, { "epoch": 0.8270158511371468, "eval_logits/chosen": -3.102332353591919, "eval_logits/rejected": -3.0965938568115234, "eval_logps/chosen": -60.76332473754883, "eval_logps/rejected": -66.08590698242188, "eval_loss": 0.6892156600952148, "eval_rewards/accuracies": 0.5829461216926575, "eval_rewards/chosen": -0.02051425538957119, "eval_rewards/margins": 0.008543580770492554, "eval_rewards/rejected": -0.029057836160063744, "eval_runtime": 382.8628, "eval_samples_per_second": 11.242, "eval_steps_per_second": 1.405, "step": 2400 }, { "epoch": 0.8304617505168849, "grad_norm": 1.8945268392562866, "learning_rate": 4.5389125307963644e-08, "logits/chosen": -2.9879133701324463, "logits/rejected": -2.9655041694641113, "logps/chosen": -58.931907653808594, "logps/rejected": -59.22075271606445, "loss": 0.6821, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.04745081439614296, "rewards/margins": 0.02379843220114708, "rewards/rejected": -0.07124923914670944, "step": 2410 }, { "epoch": 0.833907649896623, "grad_norm": 2.1146321296691895, "learning_rate": 4.533095482200661e-08, "logits/chosen": -2.993964910507202, "logits/rejected": -2.9673874378204346, "logps/chosen": -59.834556579589844, "logps/rejected": -59.58820724487305, "loss": 0.6834, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04960930719971657, "rewards/margins": 0.021068843081593513, "rewards/rejected": -0.07067814469337463, "step": 2420 }, { "epoch": 0.8373535492763611, "grad_norm": 1.9361426830291748, "learning_rate": 4.527245746313368e-08, "logits/chosen": -3.0526516437530518, "logits/rejected": -3.0269055366516113, "logps/chosen": -61.29005813598633, "logps/rejected": -59.2348747253418, "loss": 0.6832, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.04994454234838486, "rewards/margins": 0.02146236225962639, "rewards/rejected": -0.07140690833330154, "step": 2430 }, { "epoch": 0.8407994486560992, "grad_norm": 1.7236909866333008, "learning_rate": 4.521363417184186e-08, "logits/chosen": -3.0148699283599854, "logits/rejected": -2.984027624130249, "logps/chosen": -60.1817626953125, "logps/rejected": -58.4943962097168, "loss": 0.6809, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.04856560379266739, "rewards/margins": 0.025813397020101547, "rewards/rejected": -0.07437899708747864, "step": 2440 }, { "epoch": 0.8442453480358374, "grad_norm": 1.747627854347229, "learning_rate": 4.515448589386838e-08, "logits/chosen": -2.997276782989502, "logits/rejected": -2.969769239425659, "logps/chosen": -59.23713302612305, "logps/rejected": -59.42305374145508, "loss": 0.6811, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.04925640672445297, "rewards/margins": 0.026086727157235146, "rewards/rejected": -0.07534313201904297, "step": 2450 }, { "epoch": 0.8476912474155754, "grad_norm": 1.9418365955352783, "learning_rate": 4.509501358017546e-08, "logits/chosen": -3.0374348163604736, "logits/rejected": -3.012491226196289, "logps/chosen": -60.19707107543945, "logps/rejected": -60.899681091308594, "loss": 0.6798, "rewards/accuracies": 0.640625, "rewards/chosen": -0.05107522010803223, "rewards/margins": 0.028422167524695396, "rewards/rejected": -0.07949739694595337, "step": 2460 }, { "epoch": 0.8511371467953136, "grad_norm": 2.0027401447296143, "learning_rate": 4.5035218186935046e-08, "logits/chosen": -3.0924105644226074, "logits/rejected": -3.071659564971924, "logps/chosen": -60.37407684326172, "logps/rejected": -63.678924560546875, "loss": 0.6828, "rewards/accuracies": 0.609375, "rewards/chosen": -0.05841587856411934, "rewards/margins": 0.02222299389541149, "rewards/rejected": -0.08063887059688568, "step": 2470 }, { "epoch": 0.8545830461750517, "grad_norm": 1.7875603437423706, "learning_rate": 4.4975100675513424e-08, "logits/chosen": -2.9157721996307373, "logits/rejected": -2.909118175506592, "logps/chosen": -58.89693069458008, "logps/rejected": -61.99357223510742, "loss": 0.687, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.0535644106566906, "rewards/margins": 0.013684220612049103, "rewards/rejected": -0.06724863499403, "step": 2480 }, { "epoch": 0.8580289455547898, "grad_norm": 1.9273273944854736, "learning_rate": 4.491466201245577e-08, "logits/chosen": -2.9989733695983887, "logits/rejected": -2.9906697273254395, "logps/chosen": -57.8841667175293, "logps/rejected": -60.50758743286133, "loss": 0.687, "rewards/accuracies": 0.5625, "rewards/chosen": -0.057686787098646164, "rewards/margins": 0.013683912344276905, "rewards/rejected": -0.0713706985116005, "step": 2490 }, { "epoch": 0.8614748449345279, "grad_norm": 1.8793890476226807, "learning_rate": 4.485390316947061e-08, "logits/chosen": -2.96045184135437, "logits/rejected": -2.946359872817993, "logps/chosen": -59.032554626464844, "logps/rejected": -61.06984329223633, "loss": 0.6838, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.05238372087478638, "rewards/margins": 0.020352300256490707, "rewards/rejected": -0.07273602485656738, "step": 2500 }, { "epoch": 0.8614748449345279, "eval_logits/chosen": -3.096419095993042, "eval_logits/rejected": -3.090686559677124, "eval_logps/chosen": -61.28753662109375, "eval_logps/rejected": -66.70257568359375, "eval_loss": 0.6888163685798645, "eval_rewards/accuracies": 0.5968866348266602, "eval_rewards/chosen": -0.025756467133760452, "eval_rewards/margins": 0.009468138217926025, "eval_rewards/rejected": -0.035224609076976776, "eval_runtime": 382.6445, "eval_samples_per_second": 11.248, "eval_steps_per_second": 1.406, "step": 2500 }, { "epoch": 0.864920744314266, "grad_norm": 1.919879674911499, "learning_rate": 4.479282512341418e-08, "logits/chosen": -2.971209764480591, "logits/rejected": -2.94486403465271, "logps/chosen": -58.94304656982422, "logps/rejected": -59.53107833862305, "loss": 0.6826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05667944625020027, "rewards/margins": 0.0230792798101902, "rewards/rejected": -0.07975871860980988, "step": 2510 }, { "epoch": 0.8683666436940042, "grad_norm": 1.8515247106552124, "learning_rate": 4.473142885627474e-08, "logits/chosen": -3.032197952270508, "logits/rejected": -3.0064892768859863, "logps/chosen": -60.551979064941406, "logps/rejected": -59.5928955078125, "loss": 0.6853, "rewards/accuracies": 0.578125, "rewards/chosen": -0.05664666369557381, "rewards/margins": 0.01725495606660843, "rewards/rejected": -0.07390162348747253, "step": 2520 }, { "epoch": 0.8718125430737422, "grad_norm": 1.7445335388183594, "learning_rate": 4.466971535515679e-08, "logits/chosen": -2.946706771850586, "logits/rejected": -2.931774616241455, "logps/chosen": -58.84998321533203, "logps/rejected": -61.25663375854492, "loss": 0.6836, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.05811877176165581, "rewards/margins": 0.020737625658512115, "rewards/rejected": -0.07885640114545822, "step": 2530 }, { "epoch": 0.8752584424534804, "grad_norm": 1.9827686548233032, "learning_rate": 4.4607685612265186e-08, "logits/chosen": -3.083529472351074, "logits/rejected": -3.05134654045105, "logps/chosen": -60.16617965698242, "logps/rejected": -57.46630859375, "loss": 0.6794, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.050398267805576324, "rewards/margins": 0.02916848659515381, "rewards/rejected": -0.07956675440073013, "step": 2540 }, { "epoch": 0.8787043418332184, "grad_norm": 1.8980780839920044, "learning_rate": 4.4545340624889195e-08, "logits/chosen": -2.963822841644287, "logits/rejected": -2.9299607276916504, "logps/chosen": -60.31853485107422, "logps/rejected": -60.282203674316406, "loss": 0.6805, "rewards/accuracies": 0.625, "rewards/chosen": -0.05639735981822014, "rewards/margins": 0.027309810742735863, "rewards/rejected": -0.08370716869831085, "step": 2550 }, { "epoch": 0.8821502412129566, "grad_norm": 1.9544316530227661, "learning_rate": 4.448268139538643e-08, "logits/chosen": -3.0540387630462646, "logits/rejected": -3.0203042030334473, "logps/chosen": -61.7867546081543, "logps/rejected": -59.989013671875, "loss": 0.6838, "rewards/accuracies": 0.609375, "rewards/chosen": -0.059136342257261276, "rewards/margins": 0.020124230533838272, "rewards/rejected": -0.07926057279109955, "step": 2560 }, { "epoch": 0.8855961405926946, "grad_norm": 1.8652408123016357, "learning_rate": 4.4419708931166814e-08, "logits/chosen": -3.0557119846343994, "logits/rejected": -3.020301342010498, "logps/chosen": -60.8636360168457, "logps/rejected": -59.45096969604492, "loss": 0.68, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.05171442776918411, "rewards/margins": 0.028054600581526756, "rewards/rejected": -0.07976902276277542, "step": 2570 }, { "epoch": 0.8890420399724328, "grad_norm": 1.9375498294830322, "learning_rate": 4.435642424467628e-08, "logits/chosen": -3.0368781089782715, "logits/rejected": -3.0144054889678955, "logps/chosen": -58.2252082824707, "logps/rejected": -59.93499755859375, "loss": 0.6812, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.05951894074678421, "rewards/margins": 0.02540314756333828, "rewards/rejected": -0.08492208272218704, "step": 2580 }, { "epoch": 0.892487939352171, "grad_norm": 2.004504680633545, "learning_rate": 4.429282835338059e-08, "logits/chosen": -3.0588455200195312, "logits/rejected": -3.042233943939209, "logps/chosen": -60.89204025268555, "logps/rejected": -61.184104919433594, "loss": 0.6817, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.05542243272066116, "rewards/margins": 0.024638965725898743, "rewards/rejected": -0.08006139099597931, "step": 2590 }, { "epoch": 0.895933838731909, "grad_norm": 1.8711588382720947, "learning_rate": 4.422892227974889e-08, "logits/chosen": -3.0132105350494385, "logits/rejected": -2.9964966773986816, "logps/chosen": -58.610748291015625, "logps/rejected": -62.134613037109375, "loss": 0.6839, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06151349097490311, "rewards/margins": 0.02006571553647518, "rewards/rejected": -0.08157920837402344, "step": 2600 }, { "epoch": 0.895933838731909, "eval_logits/chosen": -3.092498779296875, "eval_logits/rejected": -3.086803674697876, "eval_logps/chosen": -61.75394821166992, "eval_logps/rejected": -67.25653839111328, "eval_loss": 0.6884291768074036, "eval_rewards/accuracies": 0.5924721360206604, "eval_rewards/chosen": -0.030420558527112007, "eval_rewards/margins": 0.010343527421355247, "eval_rewards/rejected": -0.040764082223176956, "eval_runtime": 383.1738, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 2600 }, { "epoch": 0.8993797381116472, "grad_norm": 2.0665197372436523, "learning_rate": 4.416470705123735e-08, "logits/chosen": -3.023836135864258, "logits/rejected": -3.010112762451172, "logps/chosen": -57.9189453125, "logps/rejected": -60.74187469482422, "loss": 0.6841, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0579378679394722, "rewards/margins": 0.019573111087083817, "rewards/rejected": -0.07751097530126572, "step": 2610 }, { "epoch": 0.9028256374913852, "grad_norm": 2.056452989578247, "learning_rate": 4.410018370027258e-08, "logits/chosen": -2.977409839630127, "logits/rejected": -2.960664987564087, "logps/chosen": -59.6572151184082, "logps/rejected": -63.372642517089844, "loss": 0.6791, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.05639306828379631, "rewards/margins": 0.03003428503870964, "rewards/rejected": -0.08642735332250595, "step": 2620 }, { "epoch": 0.9062715368711234, "grad_norm": 2.083853244781494, "learning_rate": 4.403535326423507e-08, "logits/chosen": -2.968996524810791, "logits/rejected": -2.9410641193389893, "logps/chosen": -60.10988235473633, "logps/rejected": -59.775108337402344, "loss": 0.6822, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.06228336691856384, "rewards/margins": 0.023516478016972542, "rewards/rejected": -0.08579983562231064, "step": 2630 }, { "epoch": 0.9097174362508614, "grad_norm": 1.997853398323059, "learning_rate": 4.397021678544251e-08, "logits/chosen": -3.0640952587127686, "logits/rejected": -3.041077136993408, "logps/chosen": -59.771812438964844, "logps/rejected": -60.32032012939453, "loss": 0.6811, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.06809981167316437, "rewards/margins": 0.026022519916296005, "rewards/rejected": -0.09412233531475067, "step": 2640 }, { "epoch": 0.9131633356305996, "grad_norm": 1.9340122938156128, "learning_rate": 4.390477531113299e-08, "logits/chosen": -2.9635839462280273, "logits/rejected": -2.9340882301330566, "logps/chosen": -61.02925491333008, "logps/rejected": -61.8735466003418, "loss": 0.6805, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.05552306026220322, "rewards/margins": 0.027277979999780655, "rewards/rejected": -0.08280102908611298, "step": 2650 }, { "epoch": 0.9166092350103378, "grad_norm": 1.8188270330429077, "learning_rate": 4.3839029893448254e-08, "logits/chosen": -3.0547945499420166, "logits/rejected": -3.021871328353882, "logps/chosen": -63.97841262817383, "logps/rejected": -60.14704132080078, "loss": 0.6806, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06134616211056709, "rewards/margins": 0.026847366243600845, "rewards/rejected": -0.08819353580474854, "step": 2660 }, { "epoch": 0.9200551343900758, "grad_norm": 2.0354738235473633, "learning_rate": 4.377298158941666e-08, "logits/chosen": -2.9683034420013428, "logits/rejected": -2.9436066150665283, "logps/chosen": -60.2904052734375, "logps/rejected": -61.375, "loss": 0.6826, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.06682584434747696, "rewards/margins": 0.022921878844499588, "rewards/rejected": -0.08974771201610565, "step": 2670 }, { "epoch": 0.923501033769814, "grad_norm": 1.9245256185531616, "learning_rate": 4.3706631460936285e-08, "logits/chosen": -2.9982452392578125, "logits/rejected": -2.9764859676361084, "logps/chosen": -59.65437698364258, "logps/rejected": -63.409706115722656, "loss": 0.6819, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.06726102530956268, "rewards/margins": 0.0245712548494339, "rewards/rejected": -0.09183228015899658, "step": 2680 }, { "epoch": 0.926946933149552, "grad_norm": 1.9293922185897827, "learning_rate": 4.363998057475783e-08, "logits/chosen": -3.011679172515869, "logits/rejected": -2.982901096343994, "logps/chosen": -60.681610107421875, "logps/rejected": -61.31280517578125, "loss": 0.6802, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06020185351371765, "rewards/margins": 0.027754079550504684, "rewards/rejected": -0.08795593678951263, "step": 2690 }, { "epoch": 0.9303928325292902, "grad_norm": 1.8934297561645508, "learning_rate": 4.357303000246741e-08, "logits/chosen": -2.9961764812469482, "logits/rejected": -2.9662201404571533, "logps/chosen": -61.43010330200195, "logps/rejected": -61.719635009765625, "loss": 0.6822, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.06593282520771027, "rewards/margins": 0.023738160729408264, "rewards/rejected": -0.08967099338769913, "step": 2700 }, { "epoch": 0.9303928325292902, "eval_logits/chosen": -3.0876576900482178, "eval_logits/rejected": -3.081939458847046, "eval_logps/chosen": -62.2427978515625, "eval_logps/rejected": -67.84044647216797, "eval_loss": 0.6880134344100952, "eval_rewards/accuracies": 0.5931691527366638, "eval_rewards/chosen": -0.035309046506881714, "eval_rewards/margins": 0.011294194497168064, "eval_rewards/rejected": -0.0466032400727272, "eval_runtime": 382.7476, "eval_samples_per_second": 11.245, "eval_steps_per_second": 1.406, "step": 2700 }, { "epoch": 0.9338387319090282, "grad_norm": 1.9505693912506104, "learning_rate": 4.350578082046944e-08, "logits/chosen": -3.048022508621216, "logits/rejected": -3.0309932231903076, "logps/chosen": -62.11763381958008, "logps/rejected": -62.5243034362793, "loss": 0.6858, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.06938820332288742, "rewards/margins": 0.016655951738357544, "rewards/rejected": -0.08604415506124496, "step": 2710 }, { "epoch": 0.9372846312887664, "grad_norm": 1.9431886672973633, "learning_rate": 4.34382341099692e-08, "logits/chosen": -3.00071382522583, "logits/rejected": -2.9866724014282227, "logps/chosen": -62.027076721191406, "logps/rejected": -63.376365661621094, "loss": 0.6803, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.07127322256565094, "rewards/margins": 0.027515124529600143, "rewards/rejected": -0.09878835827112198, "step": 2720 }, { "epoch": 0.9407305306685044, "grad_norm": 2.023892879486084, "learning_rate": 4.337039095695554e-08, "logits/chosen": -2.9774184226989746, "logits/rejected": -2.963737964630127, "logps/chosen": -61.70005416870117, "logps/rejected": -61.73517990112305, "loss": 0.6841, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06988800317049026, "rewards/margins": 0.020021487027406693, "rewards/rejected": -0.08990950137376785, "step": 2730 }, { "epoch": 0.9441764300482426, "grad_norm": 2.0286433696746826, "learning_rate": 4.33022524521834e-08, "logits/chosen": -2.986729383468628, "logits/rejected": -2.9642789363861084, "logps/chosen": -59.382606506347656, "logps/rejected": -60.792091369628906, "loss": 0.6817, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.07231686264276505, "rewards/margins": 0.02496473304927349, "rewards/rejected": -0.09728158265352249, "step": 2740 }, { "epoch": 0.9476223294279807, "grad_norm": 1.9931776523590088, "learning_rate": 4.323381969115626e-08, "logits/chosen": -3.0188794136047363, "logits/rejected": -2.9968786239624023, "logps/chosen": -61.60298538208008, "logps/rejected": -62.50029754638672, "loss": 0.6834, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0691499263048172, "rewards/margins": 0.02146395482122898, "rewards/rejected": -0.09061389416456223, "step": 2750 }, { "epoch": 0.9510682288077188, "grad_norm": 2.105567693710327, "learning_rate": 4.316509377410852e-08, "logits/chosen": -2.961287021636963, "logits/rejected": -2.940154552459717, "logps/chosen": -61.21519088745117, "logps/rejected": -61.087059020996094, "loss": 0.6828, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.06689263880252838, "rewards/margins": 0.02259758487343788, "rewards/rejected": -0.08949021995067596, "step": 2760 }, { "epoch": 0.954514128187457, "grad_norm": 1.896525263786316, "learning_rate": 4.309607580598785e-08, "logits/chosen": -2.9971914291381836, "logits/rejected": -2.9825148582458496, "logps/chosen": -62.52891159057617, "logps/rejected": -63.73039627075195, "loss": 0.6819, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06401952356100082, "rewards/margins": 0.024410869926214218, "rewards/rejected": -0.08843038976192474, "step": 2770 }, { "epoch": 0.957960027567195, "grad_norm": 2.1041810512542725, "learning_rate": 4.302676689643739e-08, "logits/chosen": -2.923506736755371, "logits/rejected": -2.9021189212799072, "logps/chosen": -59.46391677856445, "logps/rejected": -61.263282775878906, "loss": 0.683, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.07211627066135406, "rewards/margins": 0.02268272079527378, "rewards/rejected": -0.0947989970445633, "step": 2780 }, { "epoch": 0.9614059269469332, "grad_norm": 1.849665641784668, "learning_rate": 4.295716815977791e-08, "logits/chosen": -2.9656927585601807, "logits/rejected": -2.9542365074157715, "logps/chosen": -59.235679626464844, "logps/rejected": -64.57613372802734, "loss": 0.6847, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.07313653081655502, "rewards/margins": 0.019011162221431732, "rewards/rejected": -0.09214769303798676, "step": 2790 }, { "epoch": 0.9648518263266712, "grad_norm": 1.8608778715133667, "learning_rate": 4.28872807149899e-08, "logits/chosen": -3.0552752017974854, "logits/rejected": -3.0319559574127197, "logps/chosen": -61.3916130065918, "logps/rejected": -65.34817504882812, "loss": 0.6821, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.06724081188440323, "rewards/margins": 0.023939348757267, "rewards/rejected": -0.09118016809225082, "step": 2800 }, { "epoch": 0.9648518263266712, "eval_logits/chosen": -3.083217144012451, "eval_logits/rejected": -3.0774669647216797, "eval_logps/chosen": -62.41398239135742, "eval_logps/rejected": -68.07656860351562, "eval_loss": 0.6877134442329407, "eval_rewards/accuracies": 0.5961896181106567, "eval_rewards/chosen": -0.03702089190483093, "eval_rewards/margins": 0.011943526566028595, "eval_rewards/rejected": -0.04896441847085953, "eval_runtime": 383.0736, "eval_samples_per_second": 11.235, "eval_steps_per_second": 1.404, "step": 2800 }, { "epoch": 0.9682977257064094, "grad_norm": 2.0240118503570557, "learning_rate": 4.281710568569561e-08, "logits/chosen": -3.010549545288086, "logits/rejected": -2.99056339263916, "logps/chosen": -60.9427375793457, "logps/rejected": -63.515716552734375, "loss": 0.6804, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.07232636213302612, "rewards/margins": 0.027413805946707726, "rewards/rejected": -0.099740169942379, "step": 2810 }, { "epoch": 0.9717436250861475, "grad_norm": 2.1519691944122314, "learning_rate": 4.274664420014093e-08, "logits/chosen": -3.0124893188476562, "logits/rejected": -2.9908063411712646, "logps/chosen": -61.38493728637695, "logps/rejected": -62.30036163330078, "loss": 0.683, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.06458134204149246, "rewards/margins": 0.022180432453751564, "rewards/rejected": -0.08676177263259888, "step": 2820 }, { "epoch": 0.9751895244658856, "grad_norm": 1.9782400131225586, "learning_rate": 4.2675897391177306e-08, "logits/chosen": -2.9120445251464844, "logits/rejected": -2.9073710441589355, "logps/chosen": -59.32768630981445, "logps/rejected": -61.179168701171875, "loss": 0.6832, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07390494644641876, "rewards/margins": 0.02186496928334236, "rewards/rejected": -0.09576991200447083, "step": 2830 }, { "epoch": 0.9786354238456237, "grad_norm": 2.003605842590332, "learning_rate": 4.260486639624347e-08, "logits/chosen": -2.989738702774048, "logits/rejected": -2.958439350128174, "logps/chosen": -61.297950744628906, "logps/rejected": -62.249229431152344, "loss": 0.6796, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0708748996257782, "rewards/margins": 0.02920188568532467, "rewards/rejected": -0.10007677972316742, "step": 2840 }, { "epoch": 0.9820813232253618, "grad_norm": 2.036134719848633, "learning_rate": 4.253355235734719e-08, "logits/chosen": -2.944457530975342, "logits/rejected": -2.9197475910186768, "logps/chosen": -64.59095764160156, "logps/rejected": -62.16461181640625, "loss": 0.6792, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.06698767840862274, "rewards/margins": 0.02979203686118126, "rewards/rejected": -0.0967797189950943, "step": 2850 }, { "epoch": 0.9855272226051, "grad_norm": 2.0528111457824707, "learning_rate": 4.2461956421046915e-08, "logits/chosen": -3.018885374069214, "logits/rejected": -2.992501735687256, "logps/chosen": -60.8172492980957, "logps/rejected": -61.980613708496094, "loss": 0.6767, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07254646718502045, "rewards/margins": 0.03516736626625061, "rewards/rejected": -0.10771384090185165, "step": 2860 }, { "epoch": 0.988973121984838, "grad_norm": 2.017298460006714, "learning_rate": 4.239007973843332e-08, "logits/chosen": -3.009936571121216, "logits/rejected": -2.9883313179016113, "logps/chosen": -62.779754638671875, "logps/rejected": -63.250762939453125, "loss": 0.6802, "rewards/accuracies": 0.625, "rewards/chosen": -0.07313181459903717, "rewards/margins": 0.027741700410842896, "rewards/rejected": -0.10087350755929947, "step": 2870 }, { "epoch": 0.9924190213645762, "grad_norm": 2.128848075866699, "learning_rate": 4.231792346511078e-08, "logits/chosen": -3.0316288471221924, "logits/rejected": -3.0034053325653076, "logps/chosen": -63.786529541015625, "logps/rejected": -63.05548858642578, "loss": 0.6798, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07249516248703003, "rewards/margins": 0.028859594836831093, "rewards/rejected": -0.10135475546121597, "step": 2880 }, { "epoch": 0.9958649207443143, "grad_norm": 2.1330654621124268, "learning_rate": 4.224548876117888e-08, "logits/chosen": -2.9470300674438477, "logits/rejected": -2.9275383949279785, "logps/chosen": -58.948097229003906, "logps/rejected": -63.259849548339844, "loss": 0.6817, "rewards/accuracies": 0.625, "rewards/chosen": -0.06788979470729828, "rewards/margins": 0.024633031338453293, "rewards/rejected": -0.09252282977104187, "step": 2890 }, { "epoch": 0.9993108201240524, "grad_norm": 1.9306037425994873, "learning_rate": 4.217277679121364e-08, "logits/chosen": -2.9437849521636963, "logits/rejected": -2.91251277923584, "logps/chosen": -58.18503952026367, "logps/rejected": -62.898475646972656, "loss": 0.6805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07072552293539047, "rewards/margins": 0.02765659987926483, "rewards/rejected": -0.0983821302652359, "step": 2900 }, { "epoch": 0.9993108201240524, "eval_logits/chosen": -3.078448534011841, "eval_logits/rejected": -3.072715997695923, "eval_logps/chosen": -62.82825469970703, "eval_logps/rejected": -68.55442810058594, "eval_loss": 0.6874446272850037, "eval_rewards/accuracies": 0.589684009552002, "eval_rewards/chosen": -0.04116356372833252, "eval_rewards/margins": 0.0125794792547822, "eval_rewards/rejected": -0.053743038326501846, "eval_runtime": 382.7425, "eval_samples_per_second": 11.245, "eval_steps_per_second": 1.406, "step": 2900 }, { "epoch": 1.0027567195037905, "grad_norm": 1.9332603216171265, "learning_rate": 4.209978872424886e-08, "logits/chosen": -3.0481903553009033, "logits/rejected": -3.0231127738952637, "logps/chosen": -61.15643310546875, "logps/rejected": -62.160919189453125, "loss": 0.6787, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07607889920473099, "rewards/margins": 0.03089074417948723, "rewards/rejected": -0.10696963965892792, "step": 2910 }, { "epoch": 1.0062026188835287, "grad_norm": 1.955428957939148, "learning_rate": 4.202652573375736e-08, "logits/chosen": -2.9499881267547607, "logits/rejected": -2.923348903656006, "logps/chosen": -60.11320877075195, "logps/rejected": -64.00416564941406, "loss": 0.6787, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.07170912623405457, "rewards/margins": 0.031156618148088455, "rewards/rejected": -0.10286574065685272, "step": 2920 }, { "epoch": 1.0096485182632666, "grad_norm": 2.3318111896514893, "learning_rate": 4.195298899763202e-08, "logits/chosen": -2.9664363861083984, "logits/rejected": -2.9510273933410645, "logps/chosen": -58.969688415527344, "logps/rejected": -64.38948059082031, "loss": 0.6798, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07625620067119598, "rewards/margins": 0.028971320018172264, "rewards/rejected": -0.1052275076508522, "step": 2930 }, { "epoch": 1.0130944176430048, "grad_norm": 2.0234904289245605, "learning_rate": 4.187917969816692e-08, "logits/chosen": -2.969914674758911, "logits/rejected": -2.9529500007629395, "logps/chosen": -62.38588333129883, "logps/rejected": -63.943702697753906, "loss": 0.6796, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.06924493610858917, "rewards/margins": 0.029406771063804626, "rewards/rejected": -0.0986516997218132, "step": 2940 }, { "epoch": 1.016540317022743, "grad_norm": 1.9675633907318115, "learning_rate": 4.1805099022038286e-08, "logits/chosen": -2.9865658283233643, "logits/rejected": -2.9694154262542725, "logps/chosen": -62.565948486328125, "logps/rejected": -66.47245025634766, "loss": 0.6799, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07150250673294067, "rewards/margins": 0.028829354792833328, "rewards/rejected": -0.1003318578004837, "step": 2950 }, { "epoch": 1.019986216402481, "grad_norm": 2.224327564239502, "learning_rate": 4.1730748160285445e-08, "logits/chosen": -2.960991382598877, "logits/rejected": -2.953686475753784, "logps/chosen": -59.5008430480957, "logps/rejected": -65.13726806640625, "loss": 0.6842, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.07502765208482742, "rewards/margins": 0.020224645733833313, "rewards/rejected": -0.09525229781866074, "step": 2960 }, { "epoch": 1.0234321157822193, "grad_norm": 1.9539470672607422, "learning_rate": 4.165612830829166e-08, "logits/chosen": -2.9927141666412354, "logits/rejected": -2.9691245555877686, "logps/chosen": -59.87060546875, "logps/rejected": -62.82917404174805, "loss": 0.6788, "rewards/accuracies": 0.625, "rewards/chosen": -0.07637914270162582, "rewards/margins": 0.030744856223464012, "rewards/rejected": -0.10712400823831558, "step": 2970 }, { "epoch": 1.0268780151619572, "grad_norm": 2.0243091583251953, "learning_rate": 4.1581240665764896e-08, "logits/chosen": -2.891476631164551, "logits/rejected": -2.8637735843658447, "logps/chosen": -62.929527282714844, "logps/rejected": -62.7051887512207, "loss": 0.6796, "rewards/accuracies": 0.640625, "rewards/chosen": -0.06915273517370224, "rewards/margins": 0.02903824672102928, "rewards/rejected": -0.09819098562002182, "step": 2980 }, { "epoch": 1.0303239145416954, "grad_norm": 1.98366379737854, "learning_rate": 4.150608643671854e-08, "logits/chosen": -2.9171640872955322, "logits/rejected": -2.9082531929016113, "logps/chosen": -60.1267204284668, "logps/rejected": -63.3505973815918, "loss": 0.6829, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07893037796020508, "rewards/margins": 0.022428948432207108, "rewards/rejected": -0.10135932266712189, "step": 2990 }, { "epoch": 1.0337698139214335, "grad_norm": 1.824174404144287, "learning_rate": 4.1430666829452105e-08, "logits/chosen": -2.979609966278076, "logits/rejected": -2.953718662261963, "logps/chosen": -61.28566360473633, "logps/rejected": -62.47938919067383, "loss": 0.6809, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07697228342294693, "rewards/margins": 0.026477236300706863, "rewards/rejected": -0.1034495085477829, "step": 3000 }, { "epoch": 1.0337698139214335, "eval_logits/chosen": -3.0725417137145996, "eval_logits/rejected": -3.0667710304260254, "eval_logps/chosen": -62.928524017333984, "eval_logps/rejected": -68.7141342163086, "eval_loss": 0.6871683597564697, "eval_rewards/accuracies": 0.5945631861686707, "eval_rewards/chosen": -0.04216630756855011, "eval_rewards/margins": 0.013173781335353851, "eval_rewards/rejected": -0.05534008517861366, "eval_runtime": 382.924, "eval_samples_per_second": 11.24, "eval_steps_per_second": 1.405, "step": 3000 }, { "epoch": 1.0372157133011717, "grad_norm": 1.9345695972442627, "learning_rate": 4.1354983056531674e-08, "logits/chosen": -3.0547025203704834, "logits/rejected": -3.0293545722961426, "logps/chosen": -61.43259811401367, "logps/rejected": -60.869873046875, "loss": 0.6818, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.0772133618593216, "rewards/margins": 0.02495790645480156, "rewards/rejected": -0.10217127948999405, "step": 3010 }, { "epoch": 1.0406616126809096, "grad_norm": 1.774355173110962, "learning_rate": 4.127903633477052e-08, "logits/chosen": -2.8931214809417725, "logits/rejected": -2.8779661655426025, "logps/chosen": -60.047607421875, "logps/rejected": -61.54701614379883, "loss": 0.6829, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.08274487406015396, "rewards/margins": 0.022762062028050423, "rewards/rejected": -0.10550693422555923, "step": 3020 }, { "epoch": 1.0441075120606478, "grad_norm": 1.8555364608764648, "learning_rate": 4.12028278852095e-08, "logits/chosen": -2.9261889457702637, "logits/rejected": -2.901170253753662, "logps/chosen": -64.0744400024414, "logps/rejected": -63.515655517578125, "loss": 0.6791, "rewards/accuracies": 0.640625, "rewards/chosen": -0.07655985653400421, "rewards/margins": 0.030276939272880554, "rewards/rejected": -0.10683679580688477, "step": 3030 }, { "epoch": 1.047553411440386, "grad_norm": 2.0417416095733643, "learning_rate": 4.1126358933097426e-08, "logits/chosen": -2.9539737701416016, "logits/rejected": -2.9326446056365967, "logps/chosen": -64.11217498779297, "logps/rejected": -63.56646728515625, "loss": 0.6777, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07603207230567932, "rewards/margins": 0.03313380107283592, "rewards/rejected": -0.10916588455438614, "step": 3040 }, { "epoch": 1.050999310820124, "grad_norm": 2.0376813411712646, "learning_rate": 4.104963070787134e-08, "logits/chosen": -2.990377187728882, "logits/rejected": -2.9798829555511475, "logps/chosen": -61.56812286376953, "logps/rejected": -63.4566535949707, "loss": 0.6775, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07472465187311172, "rewards/margins": 0.03373125195503235, "rewards/rejected": -0.10845591127872467, "step": 3050 }, { "epoch": 1.0544452101998623, "grad_norm": 2.0591254234313965, "learning_rate": 4.0972644443136805e-08, "logits/chosen": -3.03212833404541, "logits/rejected": -3.000431537628174, "logps/chosen": -60.247825622558594, "logps/rejected": -64.26817321777344, "loss": 0.6766, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07237504422664642, "rewards/margins": 0.0354764387011528, "rewards/rejected": -0.10785149037837982, "step": 3060 }, { "epoch": 1.0578911095796002, "grad_norm": 2.166980743408203, "learning_rate": 4.089540137664803e-08, "logits/chosen": -2.999711513519287, "logits/rejected": -2.980905055999756, "logps/chosen": -61.864479064941406, "logps/rejected": -63.898468017578125, "loss": 0.6815, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08129169046878815, "rewards/margins": 0.025567909702658653, "rewards/rejected": -0.10685960203409195, "step": 3070 }, { "epoch": 1.0613370089593384, "grad_norm": 2.047234296798706, "learning_rate": 4.081790275028798e-08, "logits/chosen": -2.899109125137329, "logits/rejected": -2.8863024711608887, "logps/chosen": -59.81492233276367, "logps/rejected": -61.828826904296875, "loss": 0.6805, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.07823071628808975, "rewards/margins": 0.02736820839345455, "rewards/rejected": -0.10559892654418945, "step": 3080 }, { "epoch": 1.0647829083390765, "grad_norm": 2.0655131340026855, "learning_rate": 4.074014981004838e-08, "logits/chosen": -3.0099358558654785, "logits/rejected": -2.989426374435425, "logps/chosen": -66.57350158691406, "logps/rejected": -65.46236419677734, "loss": 0.6796, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0776560828089714, "rewards/margins": 0.029710572212934494, "rewards/rejected": -0.1073666587471962, "step": 3090 }, { "epoch": 1.0682288077188147, "grad_norm": 2.036759376525879, "learning_rate": 4.0662143806009765e-08, "logits/chosen": -2.9257102012634277, "logits/rejected": -2.9036128520965576, "logps/chosen": -63.916656494140625, "logps/rejected": -64.88423156738281, "loss": 0.6785, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0757143646478653, "rewards/margins": 0.031929027289152145, "rewards/rejected": -0.10764340311288834, "step": 3100 }, { "epoch": 1.0682288077188147, "eval_logits/chosen": -3.0668141841888428, "eval_logits/rejected": -3.061049699783325, "eval_logps/chosen": -63.22004318237305, "eval_logps/rejected": -69.07476043701172, "eval_loss": 0.6868610978126526, "eval_rewards/accuracies": 0.5968866348266602, "eval_rewards/chosen": -0.04508143290877342, "eval_rewards/margins": 0.013864929787814617, "eval_rewards/rejected": -0.05894635617733002, "eval_runtime": 383.2856, "eval_samples_per_second": 11.229, "eval_steps_per_second": 1.404, "step": 3100 }, { "epoch": 1.0716747070985528, "grad_norm": 1.8537838459014893, "learning_rate": 4.058388599232129e-08, "logits/chosen": -2.978543758392334, "logits/rejected": -2.9534053802490234, "logps/chosen": -61.615081787109375, "logps/rejected": -63.93159103393555, "loss": 0.6789, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.081329345703125, "rewards/margins": 0.03077167272567749, "rewards/rejected": -0.11210101842880249, "step": 3110 }, { "epoch": 1.0751206064782908, "grad_norm": 2.199716329574585, "learning_rate": 4.0505377627180605e-08, "logits/chosen": -3.0166029930114746, "logits/rejected": -2.9912753105163574, "logps/chosen": -64.92037200927734, "logps/rejected": -66.14192962646484, "loss": 0.6761, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.07310604304075241, "rewards/margins": 0.037038955837488174, "rewards/rejected": -0.11014499515295029, "step": 3120 }, { "epoch": 1.078566505858029, "grad_norm": 2.2322821617126465, "learning_rate": 4.042661997281364e-08, "logits/chosen": -2.9484448432922363, "logits/rejected": -2.926734447479248, "logps/chosen": -62.90381622314453, "logps/rejected": -63.069923400878906, "loss": 0.6802, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.08077716082334518, "rewards/margins": 0.028544697910547256, "rewards/rejected": -0.10932186990976334, "step": 3130 }, { "epoch": 1.082012405237767, "grad_norm": 2.055997133255005, "learning_rate": 4.034761429545428e-08, "logits/chosen": -2.9946491718292236, "logits/rejected": -2.9817347526550293, "logps/chosen": -61.41887664794922, "logps/rejected": -64.48870086669922, "loss": 0.6789, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07762418687343597, "rewards/margins": 0.030855854973196983, "rewards/rejected": -0.10848003625869751, "step": 3140 }, { "epoch": 1.0854583046175053, "grad_norm": 1.9650964736938477, "learning_rate": 4.0268361865324054e-08, "logits/chosen": -2.953033447265625, "logits/rejected": -2.923536777496338, "logps/chosen": -60.37060546875, "logps/rejected": -63.7410774230957, "loss": 0.6789, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0846327394247055, "rewards/margins": 0.03067139908671379, "rewards/rejected": -0.115304134786129, "step": 3150 }, { "epoch": 1.0889042039972432, "grad_norm": 2.0286407470703125, "learning_rate": 4.018886395661166e-08, "logits/chosen": -2.9489102363586426, "logits/rejected": -2.9226269721984863, "logps/chosen": -61.07158660888672, "logps/rejected": -63.1686897277832, "loss": 0.683, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.08482153713703156, "rewards/margins": 0.0224880613386631, "rewards/rejected": -0.10730959475040436, "step": 3160 }, { "epoch": 1.0923501033769814, "grad_norm": 2.2252089977264404, "learning_rate": 4.01091218474525e-08, "logits/chosen": -2.938819169998169, "logits/rejected": -2.9176087379455566, "logps/chosen": -62.47895431518555, "logps/rejected": -64.67933654785156, "loss": 0.6774, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.08040270954370499, "rewards/margins": 0.03404662013053894, "rewards/rejected": -0.11444933712482452, "step": 3170 }, { "epoch": 1.0957960027567195, "grad_norm": 2.006601572036743, "learning_rate": 4.002913681990813e-08, "logits/chosen": -2.938154697418213, "logits/rejected": -2.9333245754241943, "logps/chosen": -62.572357177734375, "logps/rejected": -66.15119934082031, "loss": 0.6811, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08080647885799408, "rewards/margins": 0.02628178335726261, "rewards/rejected": -0.10708826780319214, "step": 3180 }, { "epoch": 1.0992419021364577, "grad_norm": 2.195202589035034, "learning_rate": 3.9948910159945675e-08, "logits/chosen": -2.9939653873443604, "logits/rejected": -2.9880919456481934, "logps/chosen": -61.3933219909668, "logps/rejected": -66.06436920166016, "loss": 0.6836, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08453895896673203, "rewards/margins": 0.02129252813756466, "rewards/rejected": -0.10583148896694183, "step": 3190 }, { "epoch": 1.1026878015161956, "grad_norm": 2.2813525199890137, "learning_rate": 3.9868443157417105e-08, "logits/chosen": -3.014244556427002, "logits/rejected": -2.9842770099639893, "logps/chosen": -64.39260864257812, "logps/rejected": -64.87760925292969, "loss": 0.6763, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07504025846719742, "rewards/margins": 0.03602679818868637, "rewards/rejected": -0.1110670417547226, "step": 3200 }, { "epoch": 1.1026878015161956, "eval_logits/chosen": -3.0625712871551514, "eval_logits/rejected": -3.056849718093872, "eval_logps/chosen": -63.55337142944336, "eval_logps/rejected": -69.46443939208984, "eval_loss": 0.6866227388381958, "eval_rewards/accuracies": 0.5924721360206604, "eval_rewards/chosen": -0.04841482266783714, "eval_rewards/margins": 0.014428352005779743, "eval_rewards/rejected": -0.06284316629171371, "eval_runtime": 383.3894, "eval_samples_per_second": 11.226, "eval_steps_per_second": 1.403, "step": 3200 }, { "epoch": 1.1061337008959338, "grad_norm": 2.0350968837738037, "learning_rate": 3.9787737106038524e-08, "logits/chosen": -3.050877332687378, "logits/rejected": -3.02876353263855, "logps/chosen": -62.74021530151367, "logps/rejected": -64.1576919555664, "loss": 0.6786, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08162778615951538, "rewards/margins": 0.03162616863846779, "rewards/rejected": -0.11325393617153168, "step": 3210 }, { "epoch": 1.109579600275672, "grad_norm": 2.097778797149658, "learning_rate": 3.970679330336938e-08, "logits/chosen": -2.9291749000549316, "logits/rejected": -2.9129531383514404, "logps/chosen": -60.304771423339844, "logps/rejected": -61.454620361328125, "loss": 0.6832, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08479462563991547, "rewards/margins": 0.02261229418218136, "rewards/rejected": -0.10740692913532257, "step": 3220 }, { "epoch": 1.11302549965541, "grad_norm": 2.164839029312134, "learning_rate": 3.9625613050791576e-08, "logits/chosen": -2.932922124862671, "logits/rejected": -2.91778302192688, "logps/chosen": -62.301429748535156, "logps/rejected": -66.73863983154297, "loss": 0.677, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.08201193064451218, "rewards/margins": 0.034618277102708817, "rewards/rejected": -0.11663021147251129, "step": 3230 }, { "epoch": 1.1164713990351482, "grad_norm": 2.105262041091919, "learning_rate": 3.9544197653488566e-08, "logits/chosen": -2.919203519821167, "logits/rejected": -2.8894002437591553, "logps/chosen": -62.48259353637695, "logps/rejected": -62.6922721862793, "loss": 0.6756, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.07816579192876816, "rewards/margins": 0.03766994550824165, "rewards/rejected": -0.11583574116230011, "step": 3240 }, { "epoch": 1.1199172984148862, "grad_norm": 2.1137328147888184, "learning_rate": 3.946254842042437e-08, "logits/chosen": -3.060471534729004, "logits/rejected": -3.0507054328918457, "logps/chosen": -64.10884857177734, "logps/rejected": -64.92398071289062, "loss": 0.6829, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.09195342659950256, "rewards/margins": 0.023079639300704002, "rewards/rejected": -0.11503306776285172, "step": 3250 }, { "epoch": 1.1233631977946243, "grad_norm": 2.1686809062957764, "learning_rate": 3.938066666432253e-08, "logits/chosen": -2.971972942352295, "logits/rejected": -2.9358713626861572, "logps/chosen": -62.91139602661133, "logps/rejected": -63.77324676513672, "loss": 0.6752, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.07713530957698822, "rewards/margins": 0.03826489299535751, "rewards/rejected": -0.11540021002292633, "step": 3260 }, { "epoch": 1.1268090971743625, "grad_norm": 2.2415354251861572, "learning_rate": 3.929855370164499e-08, "logits/chosen": -2.9439709186553955, "logits/rejected": -2.9158260822296143, "logps/chosen": -63.88291549682617, "logps/rejected": -65.68302917480469, "loss": 0.6772, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.08383753895759583, "rewards/margins": 0.03424730896949768, "rewards/rejected": -0.1180848479270935, "step": 3270 }, { "epoch": 1.1302549965541007, "grad_norm": 2.0241596698760986, "learning_rate": 3.9216210852570934e-08, "logits/chosen": -2.936072826385498, "logits/rejected": -2.903352737426758, "logps/chosen": -62.170372009277344, "logps/rejected": -64.32319641113281, "loss": 0.6736, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08395300805568695, "rewards/margins": 0.04147539287805557, "rewards/rejected": -0.12542840838432312, "step": 3280 }, { "epoch": 1.1337008959338388, "grad_norm": 1.9375991821289062, "learning_rate": 3.913363944097559e-08, "logits/chosen": -2.9910669326782227, "logits/rejected": -2.9566142559051514, "logps/chosen": -62.86028289794922, "logps/rejected": -63.057891845703125, "loss": 0.6787, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08564107865095139, "rewards/margins": 0.031442124396562576, "rewards/rejected": -0.11708319187164307, "step": 3290 }, { "epoch": 1.1371467953135768, "grad_norm": 1.9416190385818481, "learning_rate": 3.90508407944089e-08, "logits/chosen": -2.992605209350586, "logits/rejected": -2.973278284072876, "logps/chosen": -63.114585876464844, "logps/rejected": -65.09963989257812, "loss": 0.681, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08878443390130997, "rewards/margins": 0.02686789073050022, "rewards/rejected": -0.11565233767032623, "step": 3300 }, { "epoch": 1.1371467953135768, "eval_logits/chosen": -3.057607412338257, "eval_logits/rejected": -3.0518221855163574, "eval_logps/chosen": -63.967044830322266, "eval_logps/rejected": -69.97113800048828, "eval_loss": 0.6862225532531738, "eval_rewards/accuracies": 0.5922397971153259, "eval_rewards/chosen": -0.05255148932337761, "eval_rewards/margins": 0.015358657576143742, "eval_rewards/rejected": -0.06791014969348907, "eval_runtime": 383.0271, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 3300 }, { "epoch": 1.140592694693315, "grad_norm": 2.484438180923462, "learning_rate": 3.8967816244074214e-08, "logits/chosen": -2.977100372314453, "logits/rejected": -2.9560484886169434, "logps/chosen": -61.56267547607422, "logps/rejected": -64.21984100341797, "loss": 0.6798, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09384143352508545, "rewards/margins": 0.02879248932003975, "rewards/rejected": -0.1226339116692543, "step": 3310 }, { "epoch": 1.144038594073053, "grad_norm": 1.9081083536148071, "learning_rate": 3.888456712480687e-08, "logits/chosen": -2.965791940689087, "logits/rejected": -2.933101177215576, "logps/chosen": -62.242881774902344, "logps/rejected": -61.938819885253906, "loss": 0.6768, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.09034211933612823, "rewards/margins": 0.035093702375888824, "rewards/rejected": -0.12543582916259766, "step": 3320 }, { "epoch": 1.1474844934527912, "grad_norm": 2.088493585586548, "learning_rate": 3.880109477505271e-08, "logits/chosen": -3.0419416427612305, "logits/rejected": -3.0212514400482178, "logps/chosen": -60.615745544433594, "logps/rejected": -66.1209716796875, "loss": 0.6761, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.08678627759218216, "rewards/margins": 0.03683622181415558, "rewards/rejected": -0.12362249195575714, "step": 3330 }, { "epoch": 1.1509303928325294, "grad_norm": 2.1750423908233643, "learning_rate": 3.871740053684662e-08, "logits/chosen": -2.9504973888397217, "logits/rejected": -2.9371280670166016, "logps/chosen": -61.08563232421875, "logps/rejected": -66.26634216308594, "loss": 0.6776, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0921957939863205, "rewards/margins": 0.03368087857961655, "rewards/rejected": -0.12587666511535645, "step": 3340 }, { "epoch": 1.1543762922122673, "grad_norm": 2.184797763824463, "learning_rate": 3.8633485755790914e-08, "logits/chosen": -2.9558186531066895, "logits/rejected": -2.9394261837005615, "logps/chosen": -65.19346618652344, "logps/rejected": -68.24827575683594, "loss": 0.6755, "rewards/accuracies": 0.65625, "rewards/chosen": -0.08549506962299347, "rewards/margins": 0.03832552954554558, "rewards/rejected": -0.12382060289382935, "step": 3350 }, { "epoch": 1.1578221915920055, "grad_norm": 2.067305564880371, "learning_rate": 3.854935178103368e-08, "logits/chosen": -2.952755928039551, "logits/rejected": -2.9339444637298584, "logps/chosen": -62.5098991394043, "logps/rejected": -64.93201446533203, "loss": 0.6745, "rewards/accuracies": 0.640625, "rewards/chosen": -0.08339858055114746, "rewards/margins": 0.04043407738208771, "rewards/rejected": -0.12383266538381577, "step": 3360 }, { "epoch": 1.1612680909717437, "grad_norm": 1.9327855110168457, "learning_rate": 3.8464999965247145e-08, "logits/chosen": -2.937671184539795, "logits/rejected": -2.9255480766296387, "logps/chosen": -64.2508773803711, "logps/rejected": -67.32499694824219, "loss": 0.6797, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08683866262435913, "rewards/margins": 0.029572222381830215, "rewards/rejected": -0.11641087383031845, "step": 3370 }, { "epoch": 1.1647139903514818, "grad_norm": 2.0421977043151855, "learning_rate": 3.838043166460588e-08, "logits/chosen": -2.984952926635742, "logits/rejected": -2.9637486934661865, "logps/chosen": -62.15680694580078, "logps/rejected": -64.10198974609375, "loss": 0.6796, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.09901008754968643, "rewards/margins": 0.029887374490499496, "rewards/rejected": -0.12889744341373444, "step": 3380 }, { "epoch": 1.1681598897312198, "grad_norm": 2.2439939975738525, "learning_rate": 3.829564823876501e-08, "logits/chosen": -2.969491720199585, "logits/rejected": -2.95082950592041, "logps/chosen": -64.22187042236328, "logps/rejected": -66.8917236328125, "loss": 0.6777, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09053460508584976, "rewards/margins": 0.033598922193050385, "rewards/rejected": -0.12413352727890015, "step": 3390 }, { "epoch": 1.171605789110958, "grad_norm": 2.121293544769287, "learning_rate": 3.8210651050838363e-08, "logits/chosen": -2.9345383644104004, "logits/rejected": -2.9132275581359863, "logps/chosen": -61.2266960144043, "logps/rejected": -64.92522430419922, "loss": 0.6767, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.09755545109510422, "rewards/margins": 0.03562026470899582, "rewards/rejected": -0.13317571580410004, "step": 3400 }, { "epoch": 1.171605789110958, "eval_logits/chosen": -3.0521657466888428, "eval_logits/rejected": -3.046401262283325, "eval_logps/chosen": -64.4254379272461, "eval_logps/rejected": -70.50476837158203, "eval_loss": 0.6859034895896912, "eval_rewards/accuracies": 0.5938661694526672, "eval_rewards/chosen": -0.057135455310344696, "eval_rewards/margins": 0.01611100137233734, "eval_rewards/rejected": -0.07324645668268204, "eval_runtime": 383.0748, "eval_samples_per_second": 11.235, "eval_steps_per_second": 1.404, "step": 3400 }, { "epoch": 1.175051688490696, "grad_norm": 2.10575270652771, "learning_rate": 3.812544146737654e-08, "logits/chosen": -2.99576997756958, "logits/rejected": -2.9718897342681885, "logps/chosen": -63.611839294433594, "logps/rejected": -65.30491638183594, "loss": 0.6712, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.08248257637023926, "rewards/margins": 0.046745531260967255, "rewards/rejected": -0.12922810018062592, "step": 3410 }, { "epoch": 1.1784975878704342, "grad_norm": 2.289477586746216, "learning_rate": 3.804002085834497e-08, "logits/chosen": -3.0325703620910645, "logits/rejected": -3.0163891315460205, "logps/chosen": -64.69010162353516, "logps/rejected": -65.92488861083984, "loss": 0.6824, "rewards/accuracies": 0.625, "rewards/chosen": -0.09273076057434082, "rewards/margins": 0.024460792541503906, "rewards/rejected": -0.11719155311584473, "step": 3420 }, { "epoch": 1.1819434872501722, "grad_norm": 2.048081398010254, "learning_rate": 3.795439059710185e-08, "logits/chosen": -2.916071653366089, "logits/rejected": -2.881584405899048, "logps/chosen": -65.62455749511719, "logps/rejected": -66.17389678955078, "loss": 0.6737, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.08985026925802231, "rewards/margins": 0.04207572713494301, "rewards/rejected": -0.13192598521709442, "step": 3430 }, { "epoch": 1.1853893866299103, "grad_norm": 2.2209901809692383, "learning_rate": 3.786855206037609e-08, "logits/chosen": -2.95088529586792, "logits/rejected": -2.930111885070801, "logps/chosen": -62.94878005981445, "logps/rejected": -64.75407409667969, "loss": 0.6795, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.10274597257375717, "rewards/margins": 0.030139794573187828, "rewards/rejected": -0.13288576900959015, "step": 3440 }, { "epoch": 1.1888352860096485, "grad_norm": 2.173154354095459, "learning_rate": 3.7782506628245154e-08, "logits/chosen": -2.9507505893707275, "logits/rejected": -2.926759958267212, "logps/chosen": -62.045631408691406, "logps/rejected": -65.30491638183594, "loss": 0.6782, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.09113176167011261, "rewards/margins": 0.03278777748346329, "rewards/rejected": -0.12391956150531769, "step": 3450 }, { "epoch": 1.1922811853893867, "grad_norm": 2.2070460319519043, "learning_rate": 3.769625568411291e-08, "logits/chosen": -3.048619508743286, "logits/rejected": -3.0207581520080566, "logps/chosen": -65.68795776367188, "logps/rejected": -67.32627868652344, "loss": 0.6763, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.09192951768636703, "rewards/margins": 0.0364498570561409, "rewards/rejected": -0.12837937474250793, "step": 3460 }, { "epoch": 1.1957270847691248, "grad_norm": 2.1387979984283447, "learning_rate": 3.760980061468734e-08, "logits/chosen": -2.88305926322937, "logits/rejected": -2.846295118331909, "logps/chosen": -66.08439636230469, "logps/rejected": -65.39725494384766, "loss": 0.6713, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.08285989612340927, "rewards/margins": 0.04689788073301315, "rewards/rejected": -0.12975779175758362, "step": 3470 }, { "epoch": 1.1991729841488628, "grad_norm": 2.0541937351226807, "learning_rate": 3.7523142809958305e-08, "logits/chosen": -3.0288517475128174, "logits/rejected": -2.999641180038452, "logps/chosen": -64.0830078125, "logps/rejected": -63.95012283325195, "loss": 0.6742, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.09495411813259125, "rewards/margins": 0.040952228009700775, "rewards/rejected": -0.13590635359287262, "step": 3480 }, { "epoch": 1.202618883528601, "grad_norm": 2.072749137878418, "learning_rate": 3.743628366317512e-08, "logits/chosen": -2.9743731021881104, "logits/rejected": -2.9422459602355957, "logps/chosen": -63.385276794433594, "logps/rejected": -65.08555603027344, "loss": 0.6727, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.09001701325178146, "rewards/margins": 0.043853361159563065, "rewards/rejected": -0.13387037813663483, "step": 3490 }, { "epoch": 1.206064782908339, "grad_norm": 2.181532144546509, "learning_rate": 3.7349224570824236e-08, "logits/chosen": -2.9924323558807373, "logits/rejected": -2.9736294746398926, "logps/chosen": -66.96834564208984, "logps/rejected": -68.12389373779297, "loss": 0.6781, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10329464823007584, "rewards/margins": 0.03317927196621895, "rewards/rejected": -0.13647392392158508, "step": 3500 }, { "epoch": 1.206064782908339, "eval_logits/chosen": -3.047136068344116, "eval_logits/rejected": -3.041323184967041, "eval_logps/chosen": -64.83796691894531, "eval_logps/rejected": -70.9827651977539, "eval_loss": 0.6856338381767273, "eval_rewards/accuracies": 0.5964219570159912, "eval_rewards/chosen": -0.06126071885228157, "eval_rewards/margins": 0.016765711829066277, "eval_rewards/rejected": -0.0780264362692833, "eval_runtime": 382.9297, "eval_samples_per_second": 11.24, "eval_steps_per_second": 1.405, "step": 3500 }, { "epoch": 1.2095106822880772, "grad_norm": 2.140570640563965, "learning_rate": 3.72619669326067e-08, "logits/chosen": -2.9457008838653564, "logits/rejected": -2.919124126434326, "logps/chosen": -63.64116287231445, "logps/rejected": -66.61964416503906, "loss": 0.6735, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.09761128574609756, "rewards/margins": 0.042480140924453735, "rewards/rejected": -0.1400914490222931, "step": 3510 }, { "epoch": 1.2129565816678154, "grad_norm": 2.260218620300293, "learning_rate": 3.717451215141577e-08, "logits/chosen": -2.9485726356506348, "logits/rejected": -2.9302449226379395, "logps/chosen": -64.31620025634766, "logps/rejected": -68.99227905273438, "loss": 0.6774, "rewards/accuracies": 0.625, "rewards/chosen": -0.09727726131677628, "rewards/margins": 0.034588806331157684, "rewards/rejected": -0.13186606764793396, "step": 3520 }, { "epoch": 1.2164024810475533, "grad_norm": 2.2776901721954346, "learning_rate": 3.7086861633314225e-08, "logits/chosen": -2.9505953788757324, "logits/rejected": -2.932459592819214, "logps/chosen": -63.000953674316406, "logps/rejected": -67.3364028930664, "loss": 0.677, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.10667930543422699, "rewards/margins": 0.0352301262319088, "rewards/rejected": -0.14190945029258728, "step": 3530 }, { "epoch": 1.2198483804272915, "grad_norm": 2.3444952964782715, "learning_rate": 3.699901678751186e-08, "logits/chosen": -3.0000805854797363, "logits/rejected": -2.9713356494903564, "logps/chosen": -63.927162170410156, "logps/rejected": -66.87178039550781, "loss": 0.6776, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.09891566634178162, "rewards/margins": 0.03373757004737854, "rewards/rejected": -0.13265323638916016, "step": 3540 }, { "epoch": 1.2232942798070296, "grad_norm": 2.1782407760620117, "learning_rate": 3.691097902634277e-08, "logits/chosen": -2.991338014602661, "logits/rejected": -2.9753661155700684, "logps/chosen": -65.58499908447266, "logps/rejected": -68.34015655517578, "loss": 0.6771, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.10289422422647476, "rewards/margins": 0.03521553426980972, "rewards/rejected": -0.13810977339744568, "step": 3550 }, { "epoch": 1.2267401791867678, "grad_norm": 2.2369847297668457, "learning_rate": 3.6822749765242686e-08, "logits/chosen": -2.9704623222351074, "logits/rejected": -2.929980754852295, "logps/chosen": -66.93081665039062, "logps/rejected": -64.67981719970703, "loss": 0.6758, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.09201280027627945, "rewards/margins": 0.03707309439778328, "rewards/rejected": -0.12908589839935303, "step": 3560 }, { "epoch": 1.230186078566506, "grad_norm": 2.112086534500122, "learning_rate": 3.673433042272618e-08, "logits/chosen": -2.900036334991455, "logits/rejected": -2.884396553039551, "logps/chosen": -63.47931671142578, "logps/rejected": -65.76783752441406, "loss": 0.677, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.09649856388568878, "rewards/margins": 0.03556473180651665, "rewards/rejected": -0.13206328451633453, "step": 3570 }, { "epoch": 1.233631977946244, "grad_norm": 2.1829700469970703, "learning_rate": 3.664572242036389e-08, "logits/chosen": -2.986011028289795, "logits/rejected": -2.9552488327026367, "logps/chosen": -65.58158111572266, "logps/rejected": -69.02853393554688, "loss": 0.6754, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.10556940734386444, "rewards/margins": 0.038878049701452255, "rewards/rejected": -0.144447460770607, "step": 3580 }, { "epoch": 1.237077877325982, "grad_norm": 2.2640624046325684, "learning_rate": 3.655692718275964e-08, "logits/chosen": -2.9690423011779785, "logits/rejected": -2.94875431060791, "logps/chosen": -65.73578643798828, "logps/rejected": -67.7518081665039, "loss": 0.6781, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.10301530361175537, "rewards/margins": 0.033097293227910995, "rewards/rejected": -0.13611260056495667, "step": 3590 }, { "epoch": 1.2405237767057202, "grad_norm": 2.431368350982666, "learning_rate": 3.6467946137527554e-08, "logits/chosen": -2.891173839569092, "logits/rejected": -2.8703651428222656, "logps/chosen": -63.62683868408203, "logps/rejected": -69.13282775878906, "loss": 0.6774, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.1074288934469223, "rewards/margins": 0.034920308738946915, "rewards/rejected": -0.1423492133617401, "step": 3600 }, { "epoch": 1.2405237767057202, "eval_logits/chosen": -3.0416650772094727, "eval_logits/rejected": -3.0358312129974365, "eval_logps/chosen": -65.13961791992188, "eval_logps/rejected": -71.34996032714844, "eval_loss": 0.6853528618812561, "eval_rewards/accuracies": 0.598280668258667, "eval_rewards/chosen": -0.06427720189094543, "eval_rewards/margins": 0.01742119900882244, "eval_rewards/rejected": -0.08169841021299362, "eval_runtime": 383.0704, "eval_samples_per_second": 11.236, "eval_steps_per_second": 1.404, "step": 3600 }, { "epoch": 1.2439696760854584, "grad_norm": 2.1995999813079834, "learning_rate": 3.6378780715269105e-08, "logits/chosen": -2.951119899749756, "logits/rejected": -2.922717332839966, "logps/chosen": -65.8984603881836, "logps/rejected": -65.83570861816406, "loss": 0.6779, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.10119110345840454, "rewards/margins": 0.033696286380290985, "rewards/rejected": -0.13488736748695374, "step": 3610 }, { "epoch": 1.2474155754651963, "grad_norm": 2.140760898590088, "learning_rate": 3.628943234955009e-08, "logits/chosen": -2.9350576400756836, "logits/rejected": -2.919267177581787, "logps/chosen": -66.7107925415039, "logps/rejected": -67.46166229248047, "loss": 0.681, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.10876727104187012, "rewards/margins": 0.027091091498732567, "rewards/rejected": -0.13585837185382843, "step": 3620 }, { "epoch": 1.2508614748449345, "grad_norm": 2.1931087970733643, "learning_rate": 3.619990247687759e-08, "logits/chosen": -2.9988818168640137, "logits/rejected": -2.9668526649475098, "logps/chosen": -64.88333892822266, "logps/rejected": -66.91313934326172, "loss": 0.6769, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.107566237449646, "rewards/margins": 0.03542996197938919, "rewards/rejected": -0.1429961919784546, "step": 3630 }, { "epoch": 1.2543073742246726, "grad_norm": 2.1610770225524902, "learning_rate": 3.611019253667692e-08, "logits/chosen": -3.0256993770599365, "logits/rejected": -2.993114471435547, "logps/chosen": -65.4500503540039, "logps/rejected": -67.24403381347656, "loss": 0.6723, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10007349401712418, "rewards/margins": 0.0446397066116333, "rewards/rejected": -0.14471320807933807, "step": 3640 }, { "epoch": 1.2577532736044108, "grad_norm": 2.129770517349243, "learning_rate": 3.6020303971268396e-08, "logits/chosen": -2.8045787811279297, "logits/rejected": -2.7895967960357666, "logps/chosen": -63.28857421875, "logps/rejected": -66.97554016113281, "loss": 0.683, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1076253205537796, "rewards/margins": 0.022777412086725235, "rewards/rejected": -0.13040272891521454, "step": 3650 }, { "epoch": 1.2611991729841487, "grad_norm": 2.331608295440674, "learning_rate": 3.5930238225844244e-08, "logits/chosen": -2.992966890335083, "logits/rejected": -2.968365430831909, "logps/chosen": -65.44712829589844, "logps/rejected": -65.48210144042969, "loss": 0.679, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10740844905376434, "rewards/margins": 0.03220432624220848, "rewards/rejected": -0.13961276412010193, "step": 3660 }, { "epoch": 1.264645072363887, "grad_norm": 2.2650468349456787, "learning_rate": 3.583999674844531e-08, "logits/chosen": -2.9433140754699707, "logits/rejected": -2.9163331985473633, "logps/chosen": -64.33101654052734, "logps/rejected": -67.10047912597656, "loss": 0.6737, "rewards/accuracies": 0.640625, "rewards/chosen": -0.10109184682369232, "rewards/margins": 0.04247751086950302, "rewards/rejected": -0.14356933534145355, "step": 3670 }, { "epoch": 1.268090971743625, "grad_norm": 2.2816858291625977, "learning_rate": 3.574958098993775e-08, "logits/chosen": -2.977121353149414, "logits/rejected": -2.9523537158966064, "logps/chosen": -66.01762390136719, "logps/rejected": -66.09962463378906, "loss": 0.6762, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.10408999025821686, "rewards/margins": 0.03713550418615341, "rewards/rejected": -0.14122548699378967, "step": 3680 }, { "epoch": 1.2715368711233632, "grad_norm": 2.187746286392212, "learning_rate": 3.565899240398978e-08, "logits/chosen": -2.8768622875213623, "logits/rejected": -2.8557772636413574, "logps/chosen": -63.966148376464844, "logps/rejected": -66.40437316894531, "loss": 0.6799, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.10979002714157104, "rewards/margins": 0.02961345948278904, "rewards/rejected": -0.13940349221229553, "step": 3690 }, { "epoch": 1.2749827705031014, "grad_norm": 2.2547101974487305, "learning_rate": 3.556823244704827e-08, "logits/chosen": -2.8993475437164307, "logits/rejected": -2.8698716163635254, "logps/chosen": -64.64362335205078, "logps/rejected": -66.32600402832031, "loss": 0.676, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10811223089694977, "rewards/margins": 0.037198375910520554, "rewards/rejected": -0.14531061053276062, "step": 3700 }, { "epoch": 1.2749827705031014, "eval_logits/chosen": -3.0372047424316406, "eval_logits/rejected": -3.031385898590088, "eval_logps/chosen": -65.41413879394531, "eval_logps/rejected": -71.68794250488281, "eval_loss": 0.6850755214691162, "eval_rewards/accuracies": 0.5989776849746704, "eval_rewards/chosen": -0.0670224279165268, "eval_rewards/margins": 0.018055759370326996, "eval_rewards/rejected": -0.08507818728685379, "eval_runtime": 382.8945, "eval_samples_per_second": 11.241, "eval_steps_per_second": 1.405, "step": 3700 }, { "epoch": 1.2784286698828393, "grad_norm": 2.215773105621338, "learning_rate": 3.54773025783153e-08, "logits/chosen": -2.8403139114379883, "logits/rejected": -2.8119125366210938, "logps/chosen": -64.80567932128906, "logps/rejected": -66.58682250976562, "loss": 0.6763, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.09843523800373077, "rewards/margins": 0.036986224353313446, "rewards/rejected": -0.13542146980762482, "step": 3710 }, { "epoch": 1.2818745692625775, "grad_norm": 2.30126953125, "learning_rate": 3.538620425972475e-08, "logits/chosen": -2.923154354095459, "logits/rejected": -2.894347667694092, "logps/chosen": -66.34468078613281, "logps/rejected": -68.36544799804688, "loss": 0.674, "rewards/accuracies": 0.640625, "rewards/chosen": -0.1164015680551529, "rewards/margins": 0.041045404970645905, "rewards/rejected": -0.1574469655752182, "step": 3720 }, { "epoch": 1.2853204686423156, "grad_norm": 2.1403231620788574, "learning_rate": 3.529493895591872e-08, "logits/chosen": -2.9460062980651855, "logits/rejected": -2.934063673019409, "logps/chosen": -63.490509033203125, "logps/rejected": -66.83492279052734, "loss": 0.6768, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.10446493327617645, "rewards/margins": 0.03590547293424606, "rewards/rejected": -0.14037039875984192, "step": 3730 }, { "epoch": 1.2887663680220538, "grad_norm": 2.1130125522613525, "learning_rate": 3.5203508134224094e-08, "logits/chosen": -2.918806791305542, "logits/rejected": -2.896461009979248, "logps/chosen": -65.25930786132812, "logps/rejected": -63.81455612182617, "loss": 0.6801, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.10615845024585724, "rewards/margins": 0.029431451112031937, "rewards/rejected": -0.13558988273143768, "step": 3740 }, { "epoch": 1.292212267401792, "grad_norm": 1.9978524446487427, "learning_rate": 3.511191326462883e-08, "logits/chosen": -2.8778882026672363, "logits/rejected": -2.8588128089904785, "logps/chosen": -63.047523498535156, "logps/rejected": -67.1456298828125, "loss": 0.6768, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.11030145734548569, "rewards/margins": 0.03552541136741638, "rewards/rejected": -0.14582687616348267, "step": 3750 }, { "epoch": 1.29565816678153, "grad_norm": 2.478665828704834, "learning_rate": 3.502015581975843e-08, "logits/chosen": -2.909358501434326, "logits/rejected": -2.8826870918273926, "logps/chosen": -65.5045166015625, "logps/rejected": -66.89601135253906, "loss": 0.6752, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.1041010171175003, "rewards/margins": 0.03946654871106148, "rewards/rejected": -0.14356756210327148, "step": 3760 }, { "epoch": 1.299104066161268, "grad_norm": 2.0279932022094727, "learning_rate": 3.492823727485218e-08, "logits/chosen": -2.9260127544403076, "logits/rejected": -2.921710968017578, "logps/chosen": -62.47440719604492, "logps/rejected": -66.68681335449219, "loss": 0.6795, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10595481097698212, "rewards/margins": 0.0303569994866848, "rewards/rejected": -0.13631181418895721, "step": 3770 }, { "epoch": 1.3025499655410062, "grad_norm": 2.361196756362915, "learning_rate": 3.48361591077395e-08, "logits/chosen": -2.96722674369812, "logits/rejected": -2.947263240814209, "logps/chosen": -61.790000915527344, "logps/rejected": -68.04402923583984, "loss": 0.6706, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09607146680355072, "rewards/margins": 0.04848823323845863, "rewards/rejected": -0.14455969631671906, "step": 3780 }, { "epoch": 1.3059958649207444, "grad_norm": 2.2241787910461426, "learning_rate": 3.4743922798816105e-08, "logits/chosen": -2.973975658416748, "logits/rejected": -2.9557018280029297, "logps/chosen": -66.19273376464844, "logps/rejected": -67.77729034423828, "loss": 0.6722, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.09787340462207794, "rewards/margins": 0.04533924162387848, "rewards/rejected": -0.14321264624595642, "step": 3790 }, { "epoch": 1.3094417643004825, "grad_norm": 2.1267189979553223, "learning_rate": 3.4651529831020296e-08, "logits/chosen": -2.9703516960144043, "logits/rejected": -2.9332830905914307, "logps/chosen": -66.95878601074219, "logps/rejected": -65.53253173828125, "loss": 0.675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11528768390417099, "rewards/margins": 0.039995353668928146, "rewards/rejected": -0.15528300404548645, "step": 3800 }, { "epoch": 1.3094417643004825, "eval_logits/chosen": -3.0321154594421387, "eval_logits/rejected": -3.026305913925171, "eval_logps/chosen": -65.62596130371094, "eval_logps/rejected": -71.93761444091797, "eval_loss": 0.684913694858551, "eval_rewards/accuracies": 0.5968866348266602, "eval_rewards/chosen": -0.06914062052965164, "eval_rewards/margins": 0.018434301018714905, "eval_rewards/rejected": -0.08757492154836655, "eval_runtime": 382.8583, "eval_samples_per_second": 11.242, "eval_steps_per_second": 1.405, "step": 3800 }, { "epoch": 1.3128876636802205, "grad_norm": 2.335982322692871, "learning_rate": 3.455898168980906e-08, "logits/chosen": -2.926928758621216, "logits/rejected": -2.897160768508911, "logps/chosen": -65.6293716430664, "logps/rejected": -66.26640319824219, "loss": 0.6754, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.10417020320892334, "rewards/margins": 0.03841516003012657, "rewards/rejected": -0.1425853669643402, "step": 3810 }, { "epoch": 1.3163335630599586, "grad_norm": 2.395035982131958, "learning_rate": 3.446627986313419e-08, "logits/chosen": -2.8648242950439453, "logits/rejected": -2.8598926067352295, "logps/chosen": -64.78601837158203, "logps/rejected": -67.5855712890625, "loss": 0.6774, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.10910700261592865, "rewards/margins": 0.03452148288488388, "rewards/rejected": -0.14362849295139313, "step": 3820 }, { "epoch": 1.3197794624396968, "grad_norm": 2.674165964126587, "learning_rate": 3.4373425841418375e-08, "logits/chosen": -2.9660162925720215, "logits/rejected": -2.9340415000915527, "logps/chosen": -69.0353012084961, "logps/rejected": -70.70301818847656, "loss": 0.6745, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.11321979761123657, "rewards/margins": 0.04104456678032875, "rewards/rejected": -0.15426437556743622, "step": 3830 }, { "epoch": 1.323225361819435, "grad_norm": 2.1819868087768555, "learning_rate": 3.428042111753123e-08, "logits/chosen": -2.9421393871307373, "logits/rejected": -2.918872117996216, "logps/chosen": -64.65965270996094, "logps/rejected": -65.97190856933594, "loss": 0.6744, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.10569307953119278, "rewards/margins": 0.04079803451895714, "rewards/rejected": -0.146491140127182, "step": 3840 }, { "epoch": 1.3266712611991731, "grad_norm": 2.4271602630615234, "learning_rate": 3.418726718676532e-08, "logits/chosen": -2.9934165477752686, "logits/rejected": -2.954824447631836, "logps/chosen": -64.25479125976562, "logps/rejected": -65.62818145751953, "loss": 0.6732, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.09907349199056625, "rewards/margins": 0.04342139512300491, "rewards/rejected": -0.14249490201473236, "step": 3850 }, { "epoch": 1.330117160578911, "grad_norm": 2.4139678478240967, "learning_rate": 3.409396554681208e-08, "logits/chosen": -2.9541690349578857, "logits/rejected": -2.928776502609253, "logps/chosen": -66.13783264160156, "logps/rejected": -66.45333862304688, "loss": 0.6785, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.12280281633138657, "rewards/margins": 0.03304052725434303, "rewards/rejected": -0.1558433473110199, "step": 3860 }, { "epoch": 1.3335630599586492, "grad_norm": 2.1542351245880127, "learning_rate": 3.400051769773774e-08, "logits/chosen": -2.895602226257324, "logits/rejected": -2.8764548301696777, "logps/chosen": -64.3131332397461, "logps/rejected": -68.65336608886719, "loss": 0.6741, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11499504745006561, "rewards/margins": 0.041288357228040695, "rewards/rejected": -0.1562834084033966, "step": 3870 }, { "epoch": 1.3370089593383874, "grad_norm": 2.3994975090026855, "learning_rate": 3.3906925141959254e-08, "logits/chosen": -2.8607685565948486, "logits/rejected": -2.8357174396514893, "logps/chosen": -63.11100387573242, "logps/rejected": -67.97138214111328, "loss": 0.6693, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.10886244475841522, "rewards/margins": 0.05142400413751602, "rewards/rejected": -0.16028645634651184, "step": 3880 }, { "epoch": 1.3404548587181253, "grad_norm": 2.253732204437256, "learning_rate": 3.3813189384220104e-08, "logits/chosen": -2.940135955810547, "logits/rejected": -2.919588565826416, "logps/chosen": -63.794090270996094, "logps/rejected": -68.53995513916016, "loss": 0.678, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.11966504156589508, "rewards/margins": 0.03373992443084717, "rewards/rejected": -0.15340495109558105, "step": 3890 }, { "epoch": 1.3439007580978635, "grad_norm": 2.3711891174316406, "learning_rate": 3.3719311931566096e-08, "logits/chosen": -2.9669857025146484, "logits/rejected": -2.956693172454834, "logps/chosen": -64.63912963867188, "logps/rejected": -69.16084289550781, "loss": 0.6748, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11863680928945541, "rewards/margins": 0.03982996568083763, "rewards/rejected": -0.15846678614616394, "step": 3900 }, { "epoch": 1.3439007580978635, "eval_logits/chosen": -3.027390480041504, "eval_logits/rejected": -3.02156138420105, "eval_logps/chosen": -66.04216003417969, "eval_logps/rejected": -72.45966339111328, "eval_loss": 0.684454083442688, "eval_rewards/accuracies": 0.6036245226860046, "eval_rewards/chosen": -0.07330266386270523, "eval_rewards/margins": 0.019492758437991142, "eval_rewards/rejected": -0.09279542416334152, "eval_runtime": 383.0939, "eval_samples_per_second": 11.235, "eval_steps_per_second": 1.404, "step": 3900 }, { "epoch": 1.3473466574776016, "grad_norm": 2.16987681388855, "learning_rate": 3.362529429332117e-08, "logits/chosen": -2.982234477996826, "logits/rejected": -2.9523534774780273, "logps/chosen": -67.91022491455078, "logps/rejected": -69.66044616699219, "loss": 0.6729, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.10218687355518341, "rewards/margins": 0.04407411813735962, "rewards/rejected": -0.14626097679138184, "step": 3910 }, { "epoch": 1.3507925568573398, "grad_norm": 2.1134376525878906, "learning_rate": 3.35311379810631e-08, "logits/chosen": -2.865996837615967, "logits/rejected": -2.8505825996398926, "logps/chosen": -63.66343307495117, "logps/rejected": -69.72282409667969, "loss": 0.6753, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.11979659646749496, "rewards/margins": 0.03912854567170143, "rewards/rejected": -0.1589251458644867, "step": 3920 }, { "epoch": 1.354238456237078, "grad_norm": 2.196824550628662, "learning_rate": 3.343684450859921e-08, "logits/chosen": -2.8734333515167236, "logits/rejected": -2.8389110565185547, "logps/chosen": -65.61233520507812, "logps/rejected": -66.38185119628906, "loss": 0.6725, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.11375585943460464, "rewards/margins": 0.044916026294231415, "rewards/rejected": -0.15867188572883606, "step": 3930 }, { "epoch": 1.3576843556168159, "grad_norm": 2.4774861335754395, "learning_rate": 3.3342415391942055e-08, "logits/chosen": -2.943521499633789, "logits/rejected": -2.927100419998169, "logps/chosen": -66.07579040527344, "logps/rejected": -69.1430892944336, "loss": 0.6747, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11682011932134628, "rewards/margins": 0.04022994637489319, "rewards/rejected": -0.15705007314682007, "step": 3940 }, { "epoch": 1.361130254996554, "grad_norm": 2.4586751461029053, "learning_rate": 3.324785214928496e-08, "logits/chosen": -2.908015727996826, "logits/rejected": -2.8969714641571045, "logps/chosen": -64.98516082763672, "logps/rejected": -69.56060791015625, "loss": 0.6781, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.11709042638540268, "rewards/margins": 0.034015871584415436, "rewards/rejected": -0.15110629796981812, "step": 3950 }, { "epoch": 1.3645761543762922, "grad_norm": 2.2545900344848633, "learning_rate": 3.315315630097774e-08, "logits/chosen": -2.906679391860962, "logits/rejected": -2.8831567764282227, "logps/chosen": -67.37789916992188, "logps/rejected": -67.87293243408203, "loss": 0.676, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.11555828899145126, "rewards/margins": 0.03761471062898636, "rewards/rejected": -0.15317301452159882, "step": 3960 }, { "epoch": 1.3680220537560304, "grad_norm": 2.426804780960083, "learning_rate": 3.305832936950217e-08, "logits/chosen": -2.913311243057251, "logits/rejected": -2.8912980556488037, "logps/chosen": -69.86399841308594, "logps/rejected": -70.14974212646484, "loss": 0.6706, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.11416562646627426, "rewards/margins": 0.04923606663942337, "rewards/rejected": -0.16340167820453644, "step": 3970 }, { "epoch": 1.3714679531357685, "grad_norm": 2.3300795555114746, "learning_rate": 3.29633728794475e-08, "logits/chosen": -2.8284902572631836, "logits/rejected": -2.8102269172668457, "logps/chosen": -62.919471740722656, "logps/rejected": -68.32115173339844, "loss": 0.6773, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.11926928907632828, "rewards/margins": 0.035174883902072906, "rewards/rejected": -0.15444417297840118, "step": 3980 }, { "epoch": 1.3749138525155065, "grad_norm": 2.3381571769714355, "learning_rate": 3.2868288357486e-08, "logits/chosen": -2.95910906791687, "logits/rejected": -2.936715602874756, "logps/chosen": -68.27220153808594, "logps/rejected": -70.48789978027344, "loss": 0.6738, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.12133724987506866, "rewards/margins": 0.04265391081571579, "rewards/rejected": -0.16399115324020386, "step": 3990 }, { "epoch": 1.3783597518952446, "grad_norm": 2.5500247478485107, "learning_rate": 3.2773077332348385e-08, "logits/chosen": -2.9376237392425537, "logits/rejected": -2.9266417026519775, "logps/chosen": -64.38090515136719, "logps/rejected": -69.34027862548828, "loss": 0.6769, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.11788547039031982, "rewards/margins": 0.03601060435175896, "rewards/rejected": -0.15389606356620789, "step": 4000 }, { "epoch": 1.3783597518952446, "eval_logits/chosen": -3.023224353790283, "eval_logits/rejected": -3.0174360275268555, "eval_logps/chosen": -66.48844146728516, "eval_logps/rejected": -72.96649932861328, "eval_loss": 0.6842078566551208, "eval_rewards/accuracies": 0.6050186157226562, "eval_rewards/chosen": -0.07776538282632828, "eval_rewards/margins": 0.02009827084839344, "eval_rewards/rejected": -0.09786365926265717, "eval_runtime": 383.1323, "eval_samples_per_second": 11.234, "eval_steps_per_second": 1.404, "step": 4000 }, { "epoch": 1.3818056512749828, "grad_norm": 2.3130435943603516, "learning_rate": 3.2677741334799225e-08, "logits/chosen": -2.9026710987091064, "logits/rejected": -2.8721413612365723, "logps/chosen": -70.75444793701172, "logps/rejected": -69.38482666015625, "loss": 0.6762, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12055616080760956, "rewards/margins": 0.037461262196302414, "rewards/rejected": -0.15801741182804108, "step": 4010 }, { "epoch": 1.385251550654721, "grad_norm": 2.2472779750823975, "learning_rate": 3.258228189761234e-08, "logits/chosen": -2.8868110179901123, "logits/rejected": -2.86332106590271, "logps/chosen": -65.00550079345703, "logps/rejected": -69.08967590332031, "loss": 0.6743, "rewards/accuracies": 0.625, "rewards/chosen": -0.12220504134893417, "rewards/margins": 0.04146517440676689, "rewards/rejected": -0.16367022693157196, "step": 4020 }, { "epoch": 1.388697450034459, "grad_norm": 2.4094691276550293, "learning_rate": 3.2486700555546194e-08, "logits/chosen": -2.9557504653930664, "logits/rejected": -2.9275245666503906, "logps/chosen": -66.57365417480469, "logps/rejected": -69.04373168945312, "loss": 0.6714, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.11350591480731964, "rewards/margins": 0.04726409167051315, "rewards/rejected": -0.1607700139284134, "step": 4030 }, { "epoch": 1.392143349414197, "grad_norm": 2.4791882038116455, "learning_rate": 3.239099884531916e-08, "logits/chosen": -2.8767454624176025, "logits/rejected": -2.8590261936187744, "logps/chosen": -65.06785583496094, "logps/rejected": -69.59109497070312, "loss": 0.676, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.11845485121011734, "rewards/margins": 0.037820715457201004, "rewards/rejected": -0.15627558529376984, "step": 4040 }, { "epoch": 1.3955892487939352, "grad_norm": 2.426947832107544, "learning_rate": 3.2295178305584834e-08, "logits/chosen": -2.8861827850341797, "logits/rejected": -2.868537425994873, "logps/chosen": -65.69752502441406, "logps/rejected": -70.70096588134766, "loss": 0.6757, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12097401916980743, "rewards/margins": 0.038305263966321945, "rewards/rejected": -0.15927928686141968, "step": 4050 }, { "epoch": 1.3990351481736734, "grad_norm": 2.4238293170928955, "learning_rate": 3.219924047690735e-08, "logits/chosen": -2.910115957260132, "logits/rejected": -2.881876230239868, "logps/chosen": -67.06953430175781, "logps/rejected": -67.77577209472656, "loss": 0.6758, "rewards/accuracies": 0.609375, "rewards/chosen": -0.11987552791833878, "rewards/margins": 0.03824597969651222, "rewards/rejected": -0.1581214964389801, "step": 4060 }, { "epoch": 1.4024810475534115, "grad_norm": 2.429631233215332, "learning_rate": 3.210318690173652e-08, "logits/chosen": -2.8835904598236084, "logits/rejected": -2.8620238304138184, "logps/chosen": -65.8174057006836, "logps/rejected": -67.42288970947266, "loss": 0.6767, "rewards/accuracies": 0.59375, "rewards/chosen": -0.12043891102075577, "rewards/margins": 0.036741141229867935, "rewards/rejected": -0.1571800261735916, "step": 4070 }, { "epoch": 1.4059269469331497, "grad_norm": 2.495227336883545, "learning_rate": 3.20070191243831e-08, "logits/chosen": -2.913691282272339, "logits/rejected": -2.8935675621032715, "logps/chosen": -66.15760803222656, "logps/rejected": -69.89808654785156, "loss": 0.6695, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.12077184021472931, "rewards/margins": 0.05139942094683647, "rewards/rejected": -0.17217126488685608, "step": 4080 }, { "epoch": 1.4093728463128876, "grad_norm": 2.6889750957489014, "learning_rate": 3.191073869099395e-08, "logits/chosen": -2.9390571117401123, "logits/rejected": -2.916032314300537, "logps/chosen": -69.81867980957031, "logps/rejected": -68.8246841430664, "loss": 0.6773, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.1181713119149208, "rewards/margins": 0.035258520394563675, "rewards/rejected": -0.15342983603477478, "step": 4090 }, { "epoch": 1.4128187456926258, "grad_norm": 2.348496437072754, "learning_rate": 3.1814347149527154e-08, "logits/chosen": -2.9100403785705566, "logits/rejected": -2.8815054893493652, "logps/chosen": -68.82015228271484, "logps/rejected": -69.77921295166016, "loss": 0.6739, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12333276122808456, "rewards/margins": 0.04239136725664139, "rewards/rejected": -0.16572412848472595, "step": 4100 }, { "epoch": 1.4128187456926258, "eval_logits/chosen": -3.0187315940856934, "eval_logits/rejected": -3.0129482746124268, "eval_logps/chosen": -66.939208984375, "eval_logps/rejected": -73.48934936523438, "eval_loss": 0.6839022040367126, "eval_rewards/accuracies": 0.6057156324386597, "eval_rewards/chosen": -0.08227317035198212, "eval_rewards/margins": 0.02081906795501709, "eval_rewards/rejected": -0.1030922457575798, "eval_runtime": 383.0589, "eval_samples_per_second": 11.236, "eval_steps_per_second": 1.404, "step": 4100 }, { "epoch": 1.416264645072364, "grad_norm": 2.2617549896240234, "learning_rate": 3.171784604972716e-08, "logits/chosen": -2.867497205734253, "logits/rejected": -2.845815420150757, "logps/chosen": -66.6474380493164, "logps/rejected": -70.89347076416016, "loss": 0.6762, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12548427283763885, "rewards/margins": 0.037441056221723557, "rewards/rejected": -0.1629253327846527, "step": 4110 }, { "epoch": 1.4197105444521019, "grad_norm": 2.3029067516326904, "learning_rate": 3.1621236943099836e-08, "logits/chosen": -2.944361686706543, "logits/rejected": -2.92055344581604, "logps/chosen": -68.30097961425781, "logps/rejected": -68.2918701171875, "loss": 0.6745, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12589865922927856, "rewards/margins": 0.04111361503601074, "rewards/rejected": -0.1670122742652893, "step": 4120 }, { "epoch": 1.42315644383184, "grad_norm": 2.5202653408050537, "learning_rate": 3.152452138288755e-08, "logits/chosen": -2.972409725189209, "logits/rejected": -2.950347423553467, "logps/chosen": -69.09842681884766, "logps/rejected": -71.52424621582031, "loss": 0.672, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.12066445499658585, "rewards/margins": 0.04655041545629501, "rewards/rejected": -0.16721487045288086, "step": 4130 }, { "epoch": 1.4266023432115782, "grad_norm": 2.379849433898926, "learning_rate": 3.142770092404418e-08, "logits/chosen": -2.938502788543701, "logits/rejected": -2.9276320934295654, "logps/chosen": -67.15705871582031, "logps/rejected": -70.82493591308594, "loss": 0.6828, "rewards/accuracies": 0.5625, "rewards/chosen": -0.14270718395709991, "rewards/margins": 0.024325937032699585, "rewards/rejected": -0.1670331209897995, "step": 4140 }, { "epoch": 1.4300482425913164, "grad_norm": 2.4067325592041016, "learning_rate": 3.133077712321015e-08, "logits/chosen": -2.9617185592651367, "logits/rejected": -2.9257190227508545, "logps/chosen": -70.95372009277344, "logps/rejected": -70.95451354980469, "loss": 0.6732, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.1219591349363327, "rewards/margins": 0.043968718498945236, "rewards/rejected": -0.16592784225940704, "step": 4150 }, { "epoch": 1.4334941419710545, "grad_norm": 2.2059571743011475, "learning_rate": 3.123375153868734e-08, "logits/chosen": -2.8638856410980225, "logits/rejected": -2.840353488922119, "logps/chosen": -66.0275650024414, "logps/rejected": -68.04927062988281, "loss": 0.6758, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.11636275053024292, "rewards/margins": 0.038091760128736496, "rewards/rejected": -0.15445451438426971, "step": 4160 }, { "epoch": 1.4369400413507925, "grad_norm": 2.2737934589385986, "learning_rate": 3.1136625730414085e-08, "logits/chosen": -2.934387445449829, "logits/rejected": -2.9220004081726074, "logps/chosen": -64.80223083496094, "logps/rejected": -70.0802001953125, "loss": 0.675, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.13273027539253235, "rewards/margins": 0.04044685140252113, "rewards/rejected": -0.17317715287208557, "step": 4170 }, { "epoch": 1.4403859407305306, "grad_norm": 2.382845163345337, "learning_rate": 3.103940125994007e-08, "logits/chosen": -2.8102498054504395, "logits/rejected": -2.7833516597747803, "logps/chosen": -67.23863220214844, "logps/rejected": -69.78515625, "loss": 0.6743, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13376067578792572, "rewards/margins": 0.041612375527620316, "rewards/rejected": -0.17537304759025574, "step": 4180 }, { "epoch": 1.4438318401102688, "grad_norm": 2.2182083129882812, "learning_rate": 3.094207969040123e-08, "logits/chosen": -2.944507122039795, "logits/rejected": -2.9260945320129395, "logps/chosen": -64.43937683105469, "logps/rejected": -70.73786163330078, "loss": 0.677, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.1329062283039093, "rewards/margins": 0.03567344322800636, "rewards/rejected": -0.16857966780662537, "step": 4190 }, { "epoch": 1.447277739490007, "grad_norm": 2.472024440765381, "learning_rate": 3.084466258649463e-08, "logits/chosen": -2.942840814590454, "logits/rejected": -2.915674924850464, "logps/chosen": -65.9464111328125, "logps/rejected": -71.5289077758789, "loss": 0.6668, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.12445267289876938, "rewards/margins": 0.0563867911696434, "rewards/rejected": -0.1808394491672516, "step": 4200 }, { "epoch": 1.447277739490007, "eval_logits/chosen": -3.015002489089966, "eval_logits/rejected": -3.0092456340789795, "eval_logps/chosen": -67.33753204345703, "eval_logps/rejected": -73.96842956542969, "eval_loss": 0.6835517883300781, "eval_rewards/accuracies": 0.6033921837806702, "eval_rewards/chosen": -0.08625634014606476, "eval_rewards/margins": 0.02162673883140087, "eval_rewards/rejected": -0.10788308084011078, "eval_runtime": 383.3294, "eval_samples_per_second": 11.228, "eval_steps_per_second": 1.403, "step": 4200 }, { "epoch": 1.450723638869745, "grad_norm": 2.309657335281372, "learning_rate": 3.074715151445329e-08, "logits/chosen": -2.8289742469787598, "logits/rejected": -2.8054184913635254, "logps/chosen": -67.65290069580078, "logps/rejected": -69.8817367553711, "loss": 0.6686, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1299678087234497, "rewards/margins": 0.05345992371439934, "rewards/rejected": -0.18342772126197815, "step": 4210 }, { "epoch": 1.454169538249483, "grad_norm": 2.733665704727173, "learning_rate": 3.0649548042021015e-08, "logits/chosen": -2.916252374649048, "logits/rejected": -2.8998372554779053, "logps/chosen": -67.19755554199219, "logps/rejected": -71.05458068847656, "loss": 0.6756, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.13055549561977386, "rewards/margins": 0.03906089812517166, "rewards/rejected": -0.16961640119552612, "step": 4220 }, { "epoch": 1.4576154376292212, "grad_norm": 2.376802921295166, "learning_rate": 3.055185373842719e-08, "logits/chosen": -2.8743534088134766, "logits/rejected": -2.858088970184326, "logps/chosen": -63.31803512573242, "logps/rejected": -71.1214370727539, "loss": 0.6706, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.12908780574798584, "rewards/margins": 0.04894782230257988, "rewards/rejected": -0.1780356466770172, "step": 4230 }, { "epoch": 1.4610613370089593, "grad_norm": 2.3690285682678223, "learning_rate": 3.045407017436153e-08, "logits/chosen": -2.9034533500671387, "logits/rejected": -2.8743841648101807, "logps/chosen": -68.18960571289062, "logps/rejected": -70.26652526855469, "loss": 0.6739, "rewards/accuracies": 0.65625, "rewards/chosen": -0.13637776672840118, "rewards/margins": 0.0426180437207222, "rewards/rejected": -0.17899580299854279, "step": 4240 }, { "epoch": 1.4645072363886975, "grad_norm": 2.3128628730773926, "learning_rate": 3.035619892194886e-08, "logits/chosen": -2.8859875202178955, "logits/rejected": -2.8748340606689453, "logps/chosen": -67.58362579345703, "logps/rejected": -72.99158477783203, "loss": 0.6723, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.13060235977172852, "rewards/margins": 0.046077366918325424, "rewards/rejected": -0.17667973041534424, "step": 4250 }, { "epoch": 1.4679531357684357, "grad_norm": 2.400481939315796, "learning_rate": 3.025824155472383e-08, "logits/chosen": -2.8886466026306152, "logits/rejected": -2.8534767627716064, "logps/chosen": -68.84166717529297, "logps/rejected": -68.58280944824219, "loss": 0.6689, "rewards/accuracies": 0.65625, "rewards/chosen": -0.13132067024707794, "rewards/margins": 0.053396809846162796, "rewards/rejected": -0.18471749126911163, "step": 4260 }, { "epoch": 1.4713990351481736, "grad_norm": 2.4101381301879883, "learning_rate": 3.016019964760559e-08, "logits/chosen": -2.93318247795105, "logits/rejected": -2.9163897037506104, "logps/chosen": -67.59861755371094, "logps/rejected": -69.44223022460938, "loss": 0.6792, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1330803781747818, "rewards/margins": 0.032037150114774704, "rewards/rejected": -0.165117546916008, "step": 4270 }, { "epoch": 1.4748449345279118, "grad_norm": 2.305143117904663, "learning_rate": 3.00620747768725e-08, "logits/chosen": -2.961884021759033, "logits/rejected": -2.946578025817871, "logps/chosen": -67.22900390625, "logps/rejected": -71.36090850830078, "loss": 0.6736, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1320640742778778, "rewards/margins": 0.04322432354092598, "rewards/rejected": -0.1752883940935135, "step": 4280 }, { "epoch": 1.47829083390765, "grad_norm": 2.4769349098205566, "learning_rate": 2.996386852013677e-08, "logits/chosen": -2.9055652618408203, "logits/rejected": -2.8823864459991455, "logps/chosen": -66.78739929199219, "logps/rejected": -71.45848846435547, "loss": 0.6671, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.1254073679447174, "rewards/margins": 0.05627509206533432, "rewards/rejected": -0.18168243765830994, "step": 4290 }, { "epoch": 1.481736733287388, "grad_norm": 2.492809772491455, "learning_rate": 2.986558245631909e-08, "logits/chosen": -2.917680025100708, "logits/rejected": -2.8875133991241455, "logps/chosen": -67.38093566894531, "logps/rejected": -69.20143127441406, "loss": 0.6729, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.12164533138275146, "rewards/margins": 0.04471977427601814, "rewards/rejected": -0.1663651168346405, "step": 4300 }, { "epoch": 1.481736733287388, "eval_logits/chosen": -3.0096850395202637, "eval_logits/rejected": -3.003866195678711, "eval_logps/chosen": -67.49187469482422, "eval_logps/rejected": -74.16020202636719, "eval_loss": 0.6833958625793457, "eval_rewards/accuracies": 0.6038568615913391, "eval_rewards/chosen": -0.08779981732368469, "eval_rewards/margins": 0.022000951692461967, "eval_rewards/rejected": -0.10980076342821121, "eval_runtime": 383.0932, "eval_samples_per_second": 11.235, "eval_steps_per_second": 1.404, "step": 4300 }, { "epoch": 1.4851826326671262, "grad_norm": 2.447634696960449, "learning_rate": 2.976721816562329e-08, "logits/chosen": -2.8635993003845215, "logits/rejected": -2.8432509899139404, "logps/chosen": -68.04258728027344, "logps/rejected": -68.48338317871094, "loss": 0.6753, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.13583067059516907, "rewards/margins": 0.03996856510639191, "rewards/rejected": -0.17579922080039978, "step": 4310 }, { "epoch": 1.4886285320468642, "grad_norm": 2.4271655082702637, "learning_rate": 2.9668777229510884e-08, "logits/chosen": -2.8751580715179443, "logits/rejected": -2.854083299636841, "logps/chosen": -66.30339813232422, "logps/rejected": -68.55867004394531, "loss": 0.6773, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.12580087780952454, "rewards/margins": 0.03559477999806404, "rewards/rejected": -0.16139563918113708, "step": 4320 }, { "epoch": 1.4920744314266023, "grad_norm": 2.2913057804107666, "learning_rate": 2.9570261230675636e-08, "logits/chosen": -2.86710786819458, "logits/rejected": -2.8493614196777344, "logps/chosen": -66.80902862548828, "logps/rejected": -70.53271484375, "loss": 0.6706, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.1265142261981964, "rewards/margins": 0.04983477294445038, "rewards/rejected": -0.17634901404380798, "step": 4330 }, { "epoch": 1.4955203308063405, "grad_norm": 2.5450940132141113, "learning_rate": 2.9471671753018185e-08, "logits/chosen": -2.9403929710388184, "logits/rejected": -2.928295135498047, "logps/chosen": -65.49287414550781, "logps/rejected": -70.91960906982422, "loss": 0.6753, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.141557976603508, "rewards/margins": 0.039782989770174026, "rewards/rejected": -0.18134096264839172, "step": 4340 }, { "epoch": 1.4989662301860784, "grad_norm": 2.591998338699341, "learning_rate": 2.9373010381620478e-08, "logits/chosen": -2.930431842803955, "logits/rejected": -2.89919114112854, "logps/chosen": -69.33406829833984, "logps/rejected": -67.90383911132812, "loss": 0.6745, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.13317172229290009, "rewards/margins": 0.04111158102750778, "rewards/rejected": -0.17428329586982727, "step": 4350 }, { "epoch": 1.5024121295658168, "grad_norm": 2.3971872329711914, "learning_rate": 2.927427870272039e-08, "logits/chosen": -2.8952383995056152, "logits/rejected": -2.87570858001709, "logps/chosen": -67.6709976196289, "logps/rejected": -70.61396789550781, "loss": 0.6764, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.1307940036058426, "rewards/margins": 0.037335820496082306, "rewards/rejected": -0.1681298315525055, "step": 4360 }, { "epoch": 1.5058580289455548, "grad_norm": 2.4003310203552246, "learning_rate": 2.9175478303686142e-08, "logits/chosen": -2.9019107818603516, "logits/rejected": -2.8804497718811035, "logps/chosen": -66.1182632446289, "logps/rejected": -70.70848083496094, "loss": 0.6752, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.1428057849407196, "rewards/margins": 0.04003838077187538, "rewards/rejected": -0.1828441619873047, "step": 4370 }, { "epoch": 1.509303928325293, "grad_norm": 2.3031885623931885, "learning_rate": 2.9076610772990817e-08, "logits/chosen": -2.8914647102355957, "logits/rejected": -2.8741812705993652, "logps/chosen": -65.17286682128906, "logps/rejected": -71.88270568847656, "loss": 0.6696, "rewards/accuracies": 0.671875, "rewards/chosen": -0.12575367093086243, "rewards/margins": 0.05120586231350899, "rewards/rejected": -0.1769595444202423, "step": 4380 }, { "epoch": 1.512749827705031, "grad_norm": 2.7572474479675293, "learning_rate": 2.897767770018682e-08, "logits/chosen": -2.9245927333831787, "logits/rejected": -2.9104371070861816, "logps/chosen": -65.59696960449219, "logps/rejected": -70.18229675292969, "loss": 0.6701, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1351243406534195, "rewards/margins": 0.05065058544278145, "rewards/rejected": -0.18577492237091064, "step": 4390 }, { "epoch": 1.516195727084769, "grad_norm": 2.420945644378662, "learning_rate": 2.8878680675880312e-08, "logits/chosen": -2.8908398151397705, "logits/rejected": -2.860151767730713, "logps/chosen": -68.41038513183594, "logps/rejected": -69.84535217285156, "loss": 0.6748, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.13448934257030487, "rewards/margins": 0.04134248197078705, "rewards/rejected": -0.17583182454109192, "step": 4400 }, { "epoch": 1.516195727084769, "eval_logits/chosen": -3.0065345764160156, "eval_logits/rejected": -3.0007359981536865, "eval_logps/chosen": -67.6110610961914, "eval_logps/rejected": -74.30791473388672, "eval_loss": 0.683279275894165, "eval_rewards/accuracies": 0.6045538783073425, "eval_rewards/chosen": -0.08899164199829102, "eval_rewards/margins": 0.02228626050055027, "eval_rewards/rejected": -0.11127790063619614, "eval_runtime": 382.9218, "eval_samples_per_second": 11.24, "eval_steps_per_second": 1.405, "step": 4400 }, { "epoch": 1.5196416264645074, "grad_norm": 2.4271833896636963, "learning_rate": 2.8779621291705642e-08, "logits/chosen": -2.9563918113708496, "logits/rejected": -2.9349350929260254, "logps/chosen": -64.04507446289062, "logps/rejected": -69.17042541503906, "loss": 0.6723, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.13352784514427185, "rewards/margins": 0.04623373597860336, "rewards/rejected": -0.17976155877113342, "step": 4410 }, { "epoch": 1.5230875258442453, "grad_norm": 2.512819766998291, "learning_rate": 2.8680501140299752e-08, "logits/chosen": -2.8031907081604004, "logits/rejected": -2.787065029144287, "logps/chosen": -66.20631408691406, "logps/rejected": -70.47114562988281, "loss": 0.6728, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.12734147906303406, "rewards/margins": 0.04467567056417465, "rewards/rejected": -0.1720171421766281, "step": 4420 }, { "epoch": 1.5265334252239835, "grad_norm": 2.409242630004883, "learning_rate": 2.858132181527657e-08, "logits/chosen": -2.9562344551086426, "logits/rejected": -2.9362080097198486, "logps/chosen": -65.13412475585938, "logps/rejected": -70.729248046875, "loss": 0.6714, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1241837739944458, "rewards/margins": 0.047337111085653305, "rewards/rejected": -0.1715208739042282, "step": 4430 }, { "epoch": 1.5299793246037217, "grad_norm": 2.377758502960205, "learning_rate": 2.848208491120141e-08, "logits/chosen": -2.9487199783325195, "logits/rejected": -2.918727159500122, "logps/chosen": -69.02839660644531, "logps/rejected": -71.60032653808594, "loss": 0.6734, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.13917836546897888, "rewards/margins": 0.04395196959376335, "rewards/rejected": -0.18313033878803253, "step": 4440 }, { "epoch": 1.5334252239834596, "grad_norm": 2.6081643104553223, "learning_rate": 2.8382792023565304e-08, "logits/chosen": -2.9071240425109863, "logits/rejected": -2.878571033477783, "logps/chosen": -68.27693176269531, "logps/rejected": -69.92921447753906, "loss": 0.6728, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1358669549226761, "rewards/margins": 0.0446004793047905, "rewards/rejected": -0.18046744167804718, "step": 4450 }, { "epoch": 1.5368711233631978, "grad_norm": 2.575587272644043, "learning_rate": 2.8283444748759376e-08, "logits/chosen": -2.9364612102508545, "logits/rejected": -2.914050579071045, "logps/chosen": -70.49769592285156, "logps/rejected": -72.06725311279297, "loss": 0.6689, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.14137059450149536, "rewards/margins": 0.05345061421394348, "rewards/rejected": -0.19482120871543884, "step": 4460 }, { "epoch": 1.540317022742936, "grad_norm": 2.5107154846191406, "learning_rate": 2.8184044684049157e-08, "logits/chosen": -2.918118715286255, "logits/rejected": -2.893716812133789, "logps/chosen": -68.78949737548828, "logps/rejected": -70.99267578125, "loss": 0.6744, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.1362750232219696, "rewards/margins": 0.041847459971904755, "rewards/rejected": -0.17812249064445496, "step": 4470 }, { "epoch": 1.5437629221226739, "grad_norm": 2.535024642944336, "learning_rate": 2.8084593427548915e-08, "logits/chosen": -2.8767764568328857, "logits/rejected": -2.8640594482421875, "logps/chosen": -68.28077697753906, "logps/rejected": -72.9585952758789, "loss": 0.6733, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1402858942747116, "rewards/margins": 0.04479040205478668, "rewards/rejected": -0.1850762814283371, "step": 4480 }, { "epoch": 1.5472088215024122, "grad_norm": 2.4002034664154053, "learning_rate": 2.7985092578195944e-08, "logits/chosen": -2.9322452545166016, "logits/rejected": -2.9030165672302246, "logps/chosen": -68.30240631103516, "logps/rejected": -70.11109924316406, "loss": 0.6763, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.14420081675052643, "rewards/margins": 0.037918709218502045, "rewards/rejected": -0.18211950361728668, "step": 4490 }, { "epoch": 1.5506547208821502, "grad_norm": 2.471027135848999, "learning_rate": 2.7885543735724916e-08, "logits/chosen": -2.899827241897583, "logits/rejected": -2.876711845397949, "logps/chosen": -69.09613037109375, "logps/rejected": -71.68231964111328, "loss": 0.6678, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.12914066016674042, "rewards/margins": 0.054671771824359894, "rewards/rejected": -0.18381242454051971, "step": 4500 }, { "epoch": 1.5506547208821502, "eval_logits/chosen": -3.001643657684326, "eval_logits/rejected": -2.995819568634033, "eval_logps/chosen": -68.13470458984375, "eval_logps/rejected": -74.93878936767578, "eval_loss": 0.6828100085258484, "eval_rewards/accuracies": 0.6019981503486633, "eval_rewards/chosen": -0.09422814846038818, "eval_rewards/margins": 0.02335856668651104, "eval_rewards/rejected": -0.11758670210838318, "eval_runtime": 383.2834, "eval_samples_per_second": 11.229, "eval_steps_per_second": 1.404, "step": 4500 }, { "epoch": 1.5541006202618883, "grad_norm": 2.533214569091797, "learning_rate": 2.778594850064207e-08, "logits/chosen": -2.8632161617279053, "logits/rejected": -2.8420379161834717, "logps/chosen": -68.59354400634766, "logps/rejected": -70.82925415039062, "loss": 0.6695, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.13811880350112915, "rewards/margins": 0.051301032304763794, "rewards/rejected": -0.18941982090473175, "step": 4510 }, { "epoch": 1.5575465196416265, "grad_norm": 2.483794927597046, "learning_rate": 2.768630847419955e-08, "logits/chosen": -2.8536620140075684, "logits/rejected": -2.8456475734710693, "logps/chosen": -65.48823547363281, "logps/rejected": -71.29315948486328, "loss": 0.6748, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.14327779412269592, "rewards/margins": 0.040574587881565094, "rewards/rejected": -0.1838523894548416, "step": 4520 }, { "epoch": 1.5609924190213644, "grad_norm": 2.331225872039795, "learning_rate": 2.758662525836964e-08, "logits/chosen": -2.9114956855773926, "logits/rejected": -2.889056921005249, "logps/chosen": -69.080078125, "logps/rejected": -71.46721649169922, "loss": 0.6761, "rewards/accuracies": 0.625, "rewards/chosen": -0.1484481394290924, "rewards/margins": 0.03799723461270332, "rewards/rejected": -0.18644535541534424, "step": 4530 }, { "epoch": 1.5644383184011028, "grad_norm": 2.3480257987976074, "learning_rate": 2.7486900455818983e-08, "logits/chosen": -2.873544216156006, "logits/rejected": -2.862929344177246, "logps/chosen": -66.64854431152344, "logps/rejected": -73.42185974121094, "loss": 0.6707, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.1337542086839676, "rewards/margins": 0.049350786954164505, "rewards/rejected": -0.183105006814003, "step": 4540 }, { "epoch": 1.5678842177808407, "grad_norm": 2.707644462585449, "learning_rate": 2.7387135669882864e-08, "logits/chosen": -2.880035161972046, "logits/rejected": -2.8493285179138184, "logps/chosen": -69.90792083740234, "logps/rejected": -70.4625015258789, "loss": 0.6718, "rewards/accuracies": 0.640625, "rewards/chosen": -0.1387501060962677, "rewards/margins": 0.04624714329838753, "rewards/rejected": -0.18499724566936493, "step": 4550 }, { "epoch": 1.571330117160579, "grad_norm": 2.673945665359497, "learning_rate": 2.7287332504539385e-08, "logits/chosen": -2.9278173446655273, "logits/rejected": -2.896785020828247, "logps/chosen": -70.78034210205078, "logps/rejected": -72.39275360107422, "loss": 0.6678, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.13018302619457245, "rewards/margins": 0.05526752024888992, "rewards/rejected": -0.18545055389404297, "step": 4560 }, { "epoch": 1.574776016540317, "grad_norm": 2.4811689853668213, "learning_rate": 2.7187492564383706e-08, "logits/chosen": -2.889941692352295, "logits/rejected": -2.8523342609405518, "logps/chosen": -68.69287109375, "logps/rejected": -70.44648742675781, "loss": 0.6647, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.12542645633220673, "rewards/margins": 0.06266696751117706, "rewards/rejected": -0.1880934238433838, "step": 4570 }, { "epoch": 1.578221915920055, "grad_norm": 2.2270328998565674, "learning_rate": 2.708761745460224e-08, "logits/chosen": -2.9182770252227783, "logits/rejected": -2.9024040699005127, "logps/chosen": -67.9704360961914, "logps/rejected": -73.34428405761719, "loss": 0.6688, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.14232484996318817, "rewards/margins": 0.053309548646211624, "rewards/rejected": -0.19563442468643188, "step": 4580 }, { "epoch": 1.5816678152997934, "grad_norm": 2.6900172233581543, "learning_rate": 2.6987708780946844e-08, "logits/chosen": -2.916078567504883, "logits/rejected": -2.8897485733032227, "logps/chosen": -70.13990783691406, "logps/rejected": -70.71611022949219, "loss": 0.6726, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.15353693068027496, "rewards/margins": 0.045441217720508575, "rewards/rejected": -0.19897815585136414, "step": 4590 }, { "epoch": 1.5851137146795313, "grad_norm": 2.5652849674224854, "learning_rate": 2.6887768149708978e-08, "logits/chosen": -2.9708092212677, "logits/rejected": -2.957068920135498, "logps/chosen": -68.81321716308594, "logps/rejected": -73.9534683227539, "loss": 0.6735, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15308627486228943, "rewards/margins": 0.04375685006380081, "rewards/rejected": -0.19684311747550964, "step": 4600 }, { "epoch": 1.5851137146795313, "eval_logits/chosen": -2.9975171089172363, "eval_logits/rejected": -2.9916865825653076, "eval_logps/chosen": -68.4876480102539, "eval_logps/rejected": -75.33289337158203, "eval_loss": 0.6826581954956055, "eval_rewards/accuracies": 0.6015334725379944, "eval_rewards/chosen": -0.09775754809379578, "eval_rewards/margins": 0.023770207539200783, "eval_rewards/rejected": -0.121527761220932, "eval_runtime": 383.5419, "eval_samples_per_second": 11.222, "eval_steps_per_second": 1.403, "step": 4600 }, { "epoch": 1.5885596140592695, "grad_norm": 2.6045384407043457, "learning_rate": 2.6787797167693938e-08, "logits/chosen": -2.8478686809539795, "logits/rejected": -2.823045015335083, "logps/chosen": -67.64823913574219, "logps/rejected": -69.96019744873047, "loss": 0.6726, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1471799910068512, "rewards/margins": 0.04572419449687004, "rewards/rejected": -0.19290418922901154, "step": 4610 }, { "epoch": 1.5920055134390076, "grad_norm": 2.4777708053588867, "learning_rate": 2.668779744219497e-08, "logits/chosen": -2.8321468830108643, "logits/rejected": -2.822711944580078, "logps/chosen": -66.84947204589844, "logps/rejected": -72.2445068359375, "loss": 0.6734, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.14121028780937195, "rewards/margins": 0.04399310052394867, "rewards/rejected": -0.18520338833332062, "step": 4620 }, { "epoch": 1.5954514128187456, "grad_norm": 2.534954309463501, "learning_rate": 2.658777058096744e-08, "logits/chosen": -2.8557753562927246, "logits/rejected": -2.8340537548065186, "logps/chosen": -65.99424743652344, "logps/rejected": -70.30352020263672, "loss": 0.6755, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1456986963748932, "rewards/margins": 0.03934315964579582, "rewards/rejected": -0.1850418597459793, "step": 4630 }, { "epoch": 1.598897312198484, "grad_norm": 2.467085599899292, "learning_rate": 2.6487718192203e-08, "logits/chosen": -2.941689968109131, "logits/rejected": -2.9171595573425293, "logps/chosen": -68.81016540527344, "logps/rejected": -72.12666320800781, "loss": 0.672, "rewards/accuracies": 0.640625, "rewards/chosen": -0.14578399062156677, "rewards/margins": 0.04744340851902962, "rewards/rejected": -0.1932273805141449, "step": 4640 }, { "epoch": 1.602343211578222, "grad_norm": 2.5921545028686523, "learning_rate": 2.6387641884503732e-08, "logits/chosen": -2.8367621898651123, "logits/rejected": -2.8232483863830566, "logps/chosen": -69.15013122558594, "logps/rejected": -72.40425109863281, "loss": 0.675, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.1498098522424698, "rewards/margins": 0.04056263715028763, "rewards/rejected": -0.19037246704101562, "step": 4650 }, { "epoch": 1.60578911095796, "grad_norm": 2.4442951679229736, "learning_rate": 2.628754326685626e-08, "logits/chosen": -2.929861545562744, "logits/rejected": -2.9053759574890137, "logps/chosen": -67.65922546386719, "logps/rejected": -72.57878112792969, "loss": 0.6712, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15370339155197144, "rewards/margins": 0.04814000055193901, "rewards/rejected": -0.20184338092803955, "step": 4660 }, { "epoch": 1.6092350103376982, "grad_norm": 2.6314620971679688, "learning_rate": 2.618742394860589e-08, "logits/chosen": -2.875800848007202, "logits/rejected": -2.856609344482422, "logps/chosen": -69.58560180664062, "logps/rejected": -73.97750854492188, "loss": 0.6695, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.15391986072063446, "rewards/margins": 0.052610911428928375, "rewards/rejected": -0.20653076469898224, "step": 4670 }, { "epoch": 1.6126809097174362, "grad_norm": 2.345186233520508, "learning_rate": 2.6087285539430794e-08, "logits/chosen": -2.9313607215881348, "logits/rejected": -2.89660382270813, "logps/chosen": -68.57977294921875, "logps/rejected": -72.10173034667969, "loss": 0.6663, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1396355926990509, "rewards/margins": 0.05890543386340141, "rewards/rejected": -0.1985410451889038, "step": 4680 }, { "epoch": 1.6161268090971743, "grad_norm": 2.5449588298797607, "learning_rate": 2.598712964931602e-08, "logits/chosen": -2.9787368774414062, "logits/rejected": -2.95357084274292, "logps/chosen": -68.18052673339844, "logps/rejected": -70.33088684082031, "loss": 0.6768, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1405651867389679, "rewards/margins": 0.03695882856845856, "rewards/rejected": -0.17752400040626526, "step": 4690 }, { "epoch": 1.6195727084769125, "grad_norm": 2.576258659362793, "learning_rate": 2.58869578885277e-08, "logits/chosen": -2.7999775409698486, "logits/rejected": -2.783830404281616, "logps/chosen": -68.49214935302734, "logps/rejected": -73.67041778564453, "loss": 0.6742, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.14908184111118317, "rewards/margins": 0.04220367223024368, "rewards/rejected": -0.19128549098968506, "step": 4700 }, { "epoch": 1.6195727084769125, "eval_logits/chosen": -2.992418050765991, "eval_logits/rejected": -2.9866204261779785, "eval_logps/chosen": -68.57611083984375, "eval_logps/rejected": -75.46302032470703, "eval_loss": 0.6824790239334106, "eval_rewards/accuracies": 0.6050186157226562, "eval_rewards/chosen": -0.09864220768213272, "eval_rewards/margins": 0.024186739698052406, "eval_rewards/rejected": -0.12282893806695938, "eval_runtime": 383.3455, "eval_samples_per_second": 11.227, "eval_steps_per_second": 1.403, "step": 4700 }, { "epoch": 1.6230186078566504, "grad_norm": 2.8505454063415527, "learning_rate": 2.5786771867587126e-08, "logits/chosen": -2.888709545135498, "logits/rejected": -2.8711140155792236, "logps/chosen": -69.5829849243164, "logps/rejected": -71.7170181274414, "loss": 0.6731, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.1512664556503296, "rewards/margins": 0.04430264234542847, "rewards/rejected": -0.19556909799575806, "step": 4710 }, { "epoch": 1.6264645072363888, "grad_norm": 2.5008065700531006, "learning_rate": 2.5686573197244853e-08, "logits/chosen": -2.870361328125, "logits/rejected": -2.8614063262939453, "logps/chosen": -66.88444519042969, "logps/rejected": -70.23690032958984, "loss": 0.6772, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.15851029753684998, "rewards/margins": 0.03688473626971245, "rewards/rejected": -0.19539503753185272, "step": 4720 }, { "epoch": 1.6299104066161267, "grad_norm": 3.1432342529296875, "learning_rate": 2.5586363488454805e-08, "logits/chosen": -2.93123197555542, "logits/rejected": -2.9248974323272705, "logps/chosen": -68.57245635986328, "logps/rejected": -73.31425476074219, "loss": 0.6698, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.15752527117729187, "rewards/margins": 0.0522991418838501, "rewards/rejected": -0.20982441306114197, "step": 4730 }, { "epoch": 1.633356305995865, "grad_norm": 2.483896017074585, "learning_rate": 2.5486144352348382e-08, "logits/chosen": -2.7911648750305176, "logits/rejected": -2.7593071460723877, "logps/chosen": -69.03587341308594, "logps/rejected": -71.77375793457031, "loss": 0.6731, "rewards/accuracies": 0.625, "rewards/chosen": -0.15717056393623352, "rewards/margins": 0.044942013919353485, "rewards/rejected": -0.2021125853061676, "step": 4740 }, { "epoch": 1.636802205375603, "grad_norm": 2.3143157958984375, "learning_rate": 2.5385917400208555e-08, "logits/chosen": -2.873028516769409, "logits/rejected": -2.856887102127075, "logps/chosen": -68.4826889038086, "logps/rejected": -73.09519958496094, "loss": 0.6711, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1460324227809906, "rewards/margins": 0.04817270487546921, "rewards/rejected": -0.1942051351070404, "step": 4750 }, { "epoch": 1.640248104755341, "grad_norm": 2.503166675567627, "learning_rate": 2.5285684243443957e-08, "logits/chosen": -2.910024642944336, "logits/rejected": -2.894676685333252, "logps/chosen": -66.54312896728516, "logps/rejected": -76.27405548095703, "loss": 0.6685, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.1493271291255951, "rewards/margins": 0.0544515959918499, "rewards/rejected": -0.2037787139415741, "step": 4760 }, { "epoch": 1.6436940041350794, "grad_norm": 2.5782196521759033, "learning_rate": 2.5185446493562985e-08, "logits/chosen": -2.8879051208496094, "logits/rejected": -2.86506986618042, "logps/chosen": -68.90760803222656, "logps/rejected": -73.1121597290039, "loss": 0.668, "rewards/accuracies": 0.640625, "rewards/chosen": -0.15400338172912598, "rewards/margins": 0.05566145107150078, "rewards/rejected": -0.20966482162475586, "step": 4770 }, { "epoch": 1.6471399035148173, "grad_norm": 3.1357321739196777, "learning_rate": 2.508520576214786e-08, "logits/chosen": -2.931192398071289, "logits/rejected": -2.9286227226257324, "logps/chosen": -68.24048614501953, "logps/rejected": -74.48942565917969, "loss": 0.672, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.14817427098751068, "rewards/margins": 0.047458209097385406, "rewards/rejected": -0.19563248753547668, "step": 4780 }, { "epoch": 1.6505858028945555, "grad_norm": 2.48701548576355, "learning_rate": 2.498496366082875e-08, "logits/chosen": -2.908775806427002, "logits/rejected": -2.8925576210021973, "logps/chosen": -67.57919311523438, "logps/rejected": -73.95806121826172, "loss": 0.6707, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.15209241211414337, "rewards/margins": 0.04946356639266014, "rewards/rejected": -0.201555997133255, "step": 4790 }, { "epoch": 1.6540317022742936, "grad_norm": 2.524120807647705, "learning_rate": 2.4884721801257862e-08, "logits/chosen": -2.937943935394287, "logits/rejected": -2.912193775177002, "logps/chosen": -70.30247497558594, "logps/rejected": -73.45845031738281, "loss": 0.6741, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.14723876118659973, "rewards/margins": 0.04397964105010033, "rewards/rejected": -0.19121840596199036, "step": 4800 }, { "epoch": 1.6540317022742936, "eval_logits/chosen": -2.987703323364258, "eval_logits/rejected": -2.98188853263855, "eval_logps/chosen": -68.89502716064453, "eval_logps/rejected": -75.83085632324219, "eval_loss": 0.6822713017463684, "eval_rewards/accuracies": 0.6017658114433289, "eval_rewards/chosen": -0.10183130949735641, "eval_rewards/margins": 0.02467595785856247, "eval_rewards/rejected": -0.1265072524547577, "eval_runtime": 383.4585, "eval_samples_per_second": 11.224, "eval_steps_per_second": 1.403, "step": 4800 }, { "epoch": 1.6574776016540316, "grad_norm": 2.657543182373047, "learning_rate": 2.478448179508349e-08, "logits/chosen": -2.9349207878112793, "logits/rejected": -2.921032428741455, "logps/chosen": -69.62551879882812, "logps/rejected": -73.3028564453125, "loss": 0.6755, "rewards/accuracies": 0.625, "rewards/chosen": -0.14178159832954407, "rewards/margins": 0.03944983705878258, "rewards/rejected": -0.18123146891593933, "step": 4810 }, { "epoch": 1.66092350103377, "grad_norm": 2.4823145866394043, "learning_rate": 2.4684245253924142e-08, "logits/chosen": -2.935817003250122, "logits/rejected": -2.907031536102295, "logps/chosen": -71.59526062011719, "logps/rejected": -72.44599914550781, "loss": 0.6713, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.15867696702480316, "rewards/margins": 0.04795921966433525, "rewards/rejected": -0.2066361904144287, "step": 4820 }, { "epoch": 1.664369400413508, "grad_norm": 3.0742642879486084, "learning_rate": 2.4584013789342646e-08, "logits/chosen": -2.8447859287261963, "logits/rejected": -2.8314640522003174, "logps/chosen": -68.52263641357422, "logps/rejected": -73.1375503540039, "loss": 0.6765, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15491469204425812, "rewards/margins": 0.03825114294886589, "rewards/rejected": -0.1931658536195755, "step": 4830 }, { "epoch": 1.667815299793246, "grad_norm": 2.607393503189087, "learning_rate": 2.448378901282015e-08, "logits/chosen": -2.9253602027893066, "logits/rejected": -2.9030656814575195, "logps/chosen": -69.81770324707031, "logps/rejected": -71.98381042480469, "loss": 0.6724, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.14524929225444794, "rewards/margins": 0.046894971281290054, "rewards/rejected": -0.19214428961277008, "step": 4840 }, { "epoch": 1.6712611991729842, "grad_norm": 2.5191705226898193, "learning_rate": 2.438357253573033e-08, "logits/chosen": -2.8299560546875, "logits/rejected": -2.8045754432678223, "logps/chosen": -69.85077667236328, "logps/rejected": -73.82840728759766, "loss": 0.6718, "rewards/accuracies": 0.625, "rewards/chosen": -0.15583452582359314, "rewards/margins": 0.048025451600551605, "rewards/rejected": -0.20385996997356415, "step": 4850 }, { "epoch": 1.6747070985527222, "grad_norm": 2.75595760345459, "learning_rate": 2.4283365969313387e-08, "logits/chosen": -2.8155770301818848, "logits/rejected": -2.8013315200805664, "logps/chosen": -69.04396057128906, "logps/rejected": -71.60387420654297, "loss": 0.6738, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.15550412237644196, "rewards/margins": 0.04418494924902916, "rewards/rejected": -0.1996890753507614, "step": 4860 }, { "epoch": 1.6781529979324605, "grad_norm": 2.3286099433898926, "learning_rate": 2.4183170924650216e-08, "logits/chosen": -2.8884427547454834, "logits/rejected": -2.87241530418396, "logps/chosen": -69.3580551147461, "logps/rejected": -73.58790588378906, "loss": 0.6733, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.15252681076526642, "rewards/margins": 0.04483538866043091, "rewards/rejected": -0.19736221432685852, "step": 4870 }, { "epoch": 1.6815988973121985, "grad_norm": 2.415541887283325, "learning_rate": 2.4082989012636432e-08, "logits/chosen": -2.910686492919922, "logits/rejected": -2.8777689933776855, "logps/chosen": -69.32978057861328, "logps/rejected": -70.67265319824219, "loss": 0.6697, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.15174154937267303, "rewards/margins": 0.05180941894650459, "rewards/rejected": -0.20355097949504852, "step": 4880 }, { "epoch": 1.6850447966919366, "grad_norm": 2.4086358547210693, "learning_rate": 2.3982821843956556e-08, "logits/chosen": -2.868699789047241, "logits/rejected": -2.835576295852661, "logps/chosen": -72.3636245727539, "logps/rejected": -73.48369598388672, "loss": 0.6668, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1486833393573761, "rewards/margins": 0.05835698917508125, "rewards/rejected": -0.20704033970832825, "step": 4890 }, { "epoch": 1.6884906960716748, "grad_norm": 2.660745143890381, "learning_rate": 2.3882671029058028e-08, "logits/chosen": -2.892490863800049, "logits/rejected": -2.8746538162231445, "logps/chosen": -69.42784118652344, "logps/rejected": -76.51860046386719, "loss": 0.6637, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.1432090550661087, "rewards/margins": 0.06555785983800888, "rewards/rejected": -0.2087668925523758, "step": 4900 }, { "epoch": 1.6884906960716748, "eval_logits/chosen": -2.9839489459991455, "eval_logits/rejected": -2.9781594276428223, "eval_logps/chosen": -69.24857330322266, "eval_logps/rejected": -76.26242065429688, "eval_loss": 0.6819381713867188, "eval_rewards/accuracies": 0.6038568615913391, "eval_rewards/chosen": -0.10536674410104752, "eval_rewards/margins": 0.0254562646150589, "eval_rewards/rejected": -0.130823016166687, "eval_runtime": 383.125, "eval_samples_per_second": 11.234, "eval_steps_per_second": 1.404, "step": 4900 }, { "epoch": 1.6919365954514127, "grad_norm": 2.668877601623535, "learning_rate": 2.3782538178125374e-08, "logits/chosen": -2.8696465492248535, "logits/rejected": -2.840348720550537, "logps/chosen": -69.57804107666016, "logps/rejected": -75.3145523071289, "loss": 0.6631, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.14807581901550293, "rewards/margins": 0.06579416245222092, "rewards/rejected": -0.21386997401714325, "step": 4910 }, { "epoch": 1.6953824948311509, "grad_norm": 2.6423861980438232, "learning_rate": 2.3682424901054326e-08, "logits/chosen": -2.8980164527893066, "logits/rejected": -2.879913330078125, "logps/chosen": -73.24769592285156, "logps/rejected": -76.11488342285156, "loss": 0.6756, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.16314809024333954, "rewards/margins": 0.0410863496363163, "rewards/rejected": -0.20423445105552673, "step": 4920 }, { "epoch": 1.698828394210889, "grad_norm": 2.7229273319244385, "learning_rate": 2.3582332807425868e-08, "logits/chosen": -2.915639638900757, "logits/rejected": -2.9048633575439453, "logps/chosen": -71.25807189941406, "logps/rejected": -72.16004943847656, "loss": 0.6776, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15794092416763306, "rewards/margins": 0.03591596335172653, "rewards/rejected": -0.1938568651676178, "step": 4930 }, { "epoch": 1.7022742935906272, "grad_norm": 2.543386697769165, "learning_rate": 2.348226350648045e-08, "logits/chosen": -2.867797374725342, "logits/rejected": -2.852616786956787, "logps/chosen": -67.76649475097656, "logps/rejected": -73.57878112792969, "loss": 0.6702, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1536814123392105, "rewards/margins": 0.05042048543691635, "rewards/rejected": -0.20410189032554626, "step": 4940 }, { "epoch": 1.7057201929703654, "grad_norm": 2.523470878601074, "learning_rate": 2.338221860709204e-08, "logits/chosen": -2.829023838043213, "logits/rejected": -2.811434507369995, "logps/chosen": -68.92974853515625, "logps/rejected": -73.25288391113281, "loss": 0.6634, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.14867661893367767, "rewards/margins": 0.06547947227954865, "rewards/rejected": -0.2141561061143875, "step": 4950 }, { "epoch": 1.7091660923501033, "grad_norm": 2.450138807296753, "learning_rate": 2.3282199717742308e-08, "logits/chosen": -2.945568799972534, "logits/rejected": -2.9093716144561768, "logps/chosen": -70.55841064453125, "logps/rejected": -71.09516143798828, "loss": 0.6674, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.154298335313797, "rewards/margins": 0.05675143748521805, "rewards/rejected": -0.21104976534843445, "step": 4960 }, { "epoch": 1.7126119917298415, "grad_norm": 2.459618091583252, "learning_rate": 2.3182208446494726e-08, "logits/chosen": -2.9220290184020996, "logits/rejected": -2.9182627201080322, "logps/chosen": -68.618408203125, "logps/rejected": -77.17385864257812, "loss": 0.6744, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1513422578573227, "rewards/margins": 0.042391858994960785, "rewards/rejected": -0.19373410940170288, "step": 4970 }, { "epoch": 1.7160578911095796, "grad_norm": 2.569059371948242, "learning_rate": 2.3082246400968758e-08, "logits/chosen": -2.93679141998291, "logits/rejected": -2.9106948375701904, "logps/chosen": -70.22674560546875, "logps/rejected": -73.23624420166016, "loss": 0.6772, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.1652604043483734, "rewards/margins": 0.03740895166993141, "rewards/rejected": -0.20266935229301453, "step": 4980 }, { "epoch": 1.7195037904893176, "grad_norm": 2.6091148853302, "learning_rate": 2.298231518831395e-08, "logits/chosen": -2.8679404258728027, "logits/rejected": -2.851804256439209, "logps/chosen": -68.08195495605469, "logps/rejected": -73.60820007324219, "loss": 0.6699, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.16003942489624023, "rewards/margins": 0.051316119730472565, "rewards/rejected": -0.2113555371761322, "step": 4990 }, { "epoch": 1.722949689869056, "grad_norm": 2.602182149887085, "learning_rate": 2.2882416415184174e-08, "logits/chosen": -2.934497833251953, "logits/rejected": -2.900315284729004, "logps/chosen": -69.95307159423828, "logps/rejected": -73.72386169433594, "loss": 0.6702, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.15212446451187134, "rewards/margins": 0.05069457367062569, "rewards/rejected": -0.20281903445720673, "step": 5000 }, { "epoch": 1.722949689869056, "eval_logits/chosen": -2.9805562496185303, "eval_logits/rejected": -2.9747941493988037, "eval_logps/chosen": -69.45015716552734, "eval_logps/rejected": -76.49998474121094, "eval_loss": 0.681788444519043, "eval_rewards/accuracies": 0.6045538783073425, "eval_rewards/chosen": -0.1073826402425766, "eval_rewards/margins": 0.025816014036536217, "eval_rewards/rejected": -0.13319866359233856, "eval_runtime": 383.1904, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 5000 }, { "epoch": 1.7263955892487939, "grad_norm": 2.358308792114258, "learning_rate": 2.2782551687711733e-08, "logits/chosen": -2.883704662322998, "logits/rejected": -2.8575568199157715, "logps/chosen": -68.92476654052734, "logps/rejected": -73.47639465332031, "loss": 0.6672, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.15898780524730682, "rewards/margins": 0.05699171870946884, "rewards/rejected": -0.21597954630851746, "step": 5010 }, { "epoch": 1.729841488628532, "grad_norm": 2.789893865585327, "learning_rate": 2.2682722611481548e-08, "logits/chosen": -2.961850166320801, "logits/rejected": -2.9385809898376465, "logps/chosen": -71.18653869628906, "logps/rejected": -72.99732971191406, "loss": 0.6723, "rewards/accuracies": 0.640625, "rewards/chosen": -0.15070433914661407, "rewards/margins": 0.04643486067652702, "rewards/rejected": -0.1971392184495926, "step": 5020 }, { "epoch": 1.7332873880082702, "grad_norm": 2.3874168395996094, "learning_rate": 2.258293079150537e-08, "logits/chosen": -2.848935127258301, "logits/rejected": -2.844420909881592, "logps/chosen": -68.67393493652344, "logps/rejected": -73.88865661621094, "loss": 0.6843, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.17319481074810028, "rewards/margins": 0.022579029202461243, "rewards/rejected": -0.19577382504940033, "step": 5030 }, { "epoch": 1.7367332873880081, "grad_norm": 2.923815965652466, "learning_rate": 2.2483177832195928e-08, "logits/chosen": -2.865600109100342, "logits/rejected": -2.852930784225464, "logps/chosen": -70.68048858642578, "logps/rejected": -74.65128326416016, "loss": 0.6727, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.16184066236019135, "rewards/margins": 0.04684358462691307, "rewards/rejected": -0.20868425071239471, "step": 5040 }, { "epoch": 1.7401791867677465, "grad_norm": 2.867833375930786, "learning_rate": 2.2383465337341223e-08, "logits/chosen": -2.9334335327148438, "logits/rejected": -2.918736219406128, "logps/chosen": -70.02317810058594, "logps/rejected": -76.82987976074219, "loss": 0.6689, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.1534358561038971, "rewards/margins": 0.0537557527422905, "rewards/rejected": -0.2071916162967682, "step": 5050 }, { "epoch": 1.7436250861474845, "grad_norm": 2.6318929195404053, "learning_rate": 2.228379491007862e-08, "logits/chosen": -2.928920269012451, "logits/rejected": -2.9219136238098145, "logps/chosen": -68.65642547607422, "logps/rejected": -73.80455017089844, "loss": 0.6723, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.16508011519908905, "rewards/margins": 0.04670392721891403, "rewards/rejected": -0.21178404986858368, "step": 5060 }, { "epoch": 1.7470709855272226, "grad_norm": 2.9904770851135254, "learning_rate": 2.2184168152869184e-08, "logits/chosen": -2.924872636795044, "logits/rejected": -2.8935725688934326, "logps/chosen": -70.9407730102539, "logps/rejected": -74.31452941894531, "loss": 0.6639, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.14054693281650543, "rewards/margins": 0.06451435387134552, "rewards/rejected": -0.20506128668785095, "step": 5070 }, { "epoch": 1.7505168849069608, "grad_norm": 2.7165467739105225, "learning_rate": 2.2084586667471845e-08, "logits/chosen": -2.864814281463623, "logits/rejected": -2.847811698913574, "logps/chosen": -69.76703643798828, "logps/rejected": -75.40263366699219, "loss": 0.6683, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16044360399246216, "rewards/margins": 0.05554025247693062, "rewards/rejected": -0.21598383784294128, "step": 5080 }, { "epoch": 1.7539627842866987, "grad_norm": 2.5998380184173584, "learning_rate": 2.1985052054917687e-08, "logits/chosen": -2.8491740226745605, "logits/rejected": -2.8302299976348877, "logps/chosen": -69.0057601928711, "logps/rejected": -74.46393585205078, "loss": 0.6694, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15329837799072266, "rewards/margins": 0.05276813358068466, "rewards/rejected": -0.20606651902198792, "step": 5090 }, { "epoch": 1.757408683666437, "grad_norm": 2.5868284702301025, "learning_rate": 2.1885565915484193e-08, "logits/chosen": -2.8915534019470215, "logits/rejected": -2.8696448802948, "logps/chosen": -67.51110076904297, "logps/rejected": -72.25811767578125, "loss": 0.6694, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.15821656584739685, "rewards/margins": 0.05269990488886833, "rewards/rejected": -0.21091647446155548, "step": 5100 }, { "epoch": 1.757408683666437, "eval_logits/chosen": -2.9760875701904297, "eval_logits/rejected": -2.970331907272339, "eval_logps/chosen": -69.78114318847656, "eval_logps/rejected": -76.88988494873047, "eval_loss": 0.6815394163131714, "eval_rewards/accuracies": 0.6031598448753357, "eval_rewards/chosen": -0.11069244146347046, "eval_rewards/margins": 0.0264052115380764, "eval_rewards/rejected": -0.13709765672683716, "eval_runtime": 383.1896, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 5100 }, { "epoch": 1.760854583046175, "grad_norm": 2.5706427097320557, "learning_rate": 2.1786129848669483e-08, "logits/chosen": -2.8978376388549805, "logits/rejected": -2.8931891918182373, "logps/chosen": -67.79803466796875, "logps/rejected": -75.92095184326172, "loss": 0.6698, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.16598822176456451, "rewards/margins": 0.05197644978761673, "rewards/rejected": -0.21796467900276184, "step": 5110 }, { "epoch": 1.7643004824259132, "grad_norm": 2.7690415382385254, "learning_rate": 2.1686745453166676e-08, "logits/chosen": -2.8238472938537598, "logits/rejected": -2.798102378845215, "logps/chosen": -68.95014953613281, "logps/rejected": -73.85153198242188, "loss": 0.6699, "rewards/accuracies": 0.59375, "rewards/chosen": -0.16501165926456451, "rewards/margins": 0.05237952992320061, "rewards/rejected": -0.21739117801189423, "step": 5120 }, { "epoch": 1.7677463818056514, "grad_norm": 2.5694692134857178, "learning_rate": 2.1587414326838095e-08, "logits/chosen": -2.8898651599884033, "logits/rejected": -2.871417284011841, "logps/chosen": -71.06920623779297, "logps/rejected": -74.75550842285156, "loss": 0.6685, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.16049064695835114, "rewards/margins": 0.05430537462234497, "rewards/rejected": -0.21479599177837372, "step": 5130 }, { "epoch": 1.7711922811853893, "grad_norm": 2.753486394882202, "learning_rate": 2.1488138066689668e-08, "logits/chosen": -2.8540873527526855, "logits/rejected": -2.8399171829223633, "logps/chosen": -71.9068374633789, "logps/rejected": -76.37146759033203, "loss": 0.6708, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.1702689826488495, "rewards/margins": 0.05002786964178085, "rewards/rejected": -0.22029681503772736, "step": 5140 }, { "epoch": 1.7746381805651275, "grad_norm": 2.8333446979522705, "learning_rate": 2.1388918268845168e-08, "logits/chosen": -2.840865135192871, "logits/rejected": -2.8308463096618652, "logps/chosen": -69.06111145019531, "logps/rejected": -76.5210189819336, "loss": 0.6688, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.16019055247306824, "rewards/margins": 0.055166494101285934, "rewards/rejected": -0.21535703539848328, "step": 5150 }, { "epoch": 1.7780840799448656, "grad_norm": 2.544689416885376, "learning_rate": 2.1289756528520613e-08, "logits/chosen": -2.839569568634033, "logits/rejected": -2.8112070560455322, "logps/chosen": -73.0462875366211, "logps/rejected": -74.88614654541016, "loss": 0.6666, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.16534116864204407, "rewards/margins": 0.058647722005844116, "rewards/rejected": -0.22398889064788818, "step": 5160 }, { "epoch": 1.7815299793246038, "grad_norm": 2.907992362976074, "learning_rate": 2.119065443999858e-08, "logits/chosen": -2.8962020874023438, "logits/rejected": -2.8772637844085693, "logps/chosen": -70.54364776611328, "logps/rejected": -76.35348510742188, "loss": 0.6688, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.16427725553512573, "rewards/margins": 0.05383526161313057, "rewards/rejected": -0.218112513422966, "step": 5170 }, { "epoch": 1.784975878704342, "grad_norm": 2.5953314304351807, "learning_rate": 2.1091613596602594e-08, "logits/chosen": -2.9131650924682617, "logits/rejected": -2.8833131790161133, "logps/chosen": -70.49974060058594, "logps/rejected": -72.7633285522461, "loss": 0.669, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1548534631729126, "rewards/margins": 0.05347966030240059, "rewards/rejected": -0.2083331048488617, "step": 5180 }, { "epoch": 1.7884217780840799, "grad_norm": 2.8002374172210693, "learning_rate": 2.0992635590671528e-08, "logits/chosen": -2.7871344089508057, "logits/rejected": -2.765974760055542, "logps/chosen": -68.91281127929688, "logps/rejected": -74.27314758300781, "loss": 0.6723, "rewards/accuracies": 0.625, "rewards/chosen": -0.16984659433364868, "rewards/margins": 0.046960391104221344, "rewards/rejected": -0.21680697798728943, "step": 5190 }, { "epoch": 1.791867677463818, "grad_norm": 2.8456871509552, "learning_rate": 2.089372201353394e-08, "logits/chosen": -2.8801169395446777, "logits/rejected": -2.8567395210266113, "logps/chosen": -70.15199279785156, "logps/rejected": -76.99073791503906, "loss": 0.6654, "rewards/accuracies": 0.671875, "rewards/chosen": -0.1617727279663086, "rewards/margins": 0.06113101914525032, "rewards/rejected": -0.2229037582874298, "step": 5200 }, { "epoch": 1.791867677463818, "eval_logits/chosen": -2.971893310546875, "eval_logits/rejected": -2.966071844100952, "eval_logps/chosen": -70.03202056884766, "eval_logps/rejected": -77.19258117675781, "eval_loss": 0.6813249588012695, "eval_rewards/accuracies": 0.604786217212677, "eval_rewards/chosen": -0.11320130527019501, "eval_rewards/margins": 0.026923339813947678, "eval_rewards/rejected": -0.1401246339082718, "eval_runtime": 383.1316, "eval_samples_per_second": 11.234, "eval_steps_per_second": 1.404, "step": 5200 }, { "epoch": 1.7953135768435562, "grad_norm": 2.585279703140259, "learning_rate": 2.0794874455482552e-08, "logits/chosen": -2.871621608734131, "logits/rejected": -2.838993787765503, "logps/chosen": -71.43910217285156, "logps/rejected": -75.10260009765625, "loss": 0.6687, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.16120050847530365, "rewards/margins": 0.053817398846149445, "rewards/rejected": -0.2150179147720337, "step": 5210 }, { "epoch": 1.7987594762232941, "grad_norm": 2.684385299682617, "learning_rate": 2.0696094505748655e-08, "logits/chosen": -2.9440083503723145, "logits/rejected": -2.9305977821350098, "logps/chosen": -72.88982391357422, "logps/rejected": -75.67564392089844, "loss": 0.6757, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.1651952862739563, "rewards/margins": 0.039820533245801926, "rewards/rejected": -0.20501580834388733, "step": 5220 }, { "epoch": 1.8022053756030325, "grad_norm": 2.7644712924957275, "learning_rate": 2.059738375247656e-08, "logits/chosen": -2.8968605995178223, "logits/rejected": -2.881497859954834, "logps/chosen": -70.36112976074219, "logps/rejected": -77.09461212158203, "loss": 0.6682, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16262635588645935, "rewards/margins": 0.0549902506172657, "rewards/rejected": -0.21761660277843475, "step": 5230 }, { "epoch": 1.8056512749827704, "grad_norm": 2.7018826007843018, "learning_rate": 2.0498743782698047e-08, "logits/chosen": -2.818573474884033, "logits/rejected": -2.7887961864471436, "logps/chosen": -69.94970703125, "logps/rejected": -73.11033630371094, "loss": 0.6683, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.16011056303977966, "rewards/margins": 0.05508071184158325, "rewards/rejected": -0.2151912897825241, "step": 5240 }, { "epoch": 1.8090971743625086, "grad_norm": 2.7184810638427734, "learning_rate": 2.04001761823069e-08, "logits/chosen": -2.8813183307647705, "logits/rejected": -2.8594295978546143, "logps/chosen": -68.76884460449219, "logps/rejected": -75.37754821777344, "loss": 0.666, "rewards/accuracies": 0.671875, "rewards/chosen": -0.15513893961906433, "rewards/margins": 0.06031592935323715, "rewards/rejected": -0.2154548615217209, "step": 5250 }, { "epoch": 1.8125430737422468, "grad_norm": 2.6583566665649414, "learning_rate": 2.0301682536033366e-08, "logits/chosen": -2.8979711532592773, "logits/rejected": -2.887810230255127, "logps/chosen": -69.4297103881836, "logps/rejected": -76.50968933105469, "loss": 0.6645, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.1639290302991867, "rewards/margins": 0.06259413808584213, "rewards/rejected": -0.22652316093444824, "step": 5260 }, { "epoch": 1.8159889731219847, "grad_norm": 2.729187250137329, "learning_rate": 2.0203264427418666e-08, "logits/chosen": -2.8294336795806885, "logits/rejected": -2.8056840896606445, "logps/chosen": -69.6157455444336, "logps/rejected": -72.59150695800781, "loss": 0.6707, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.16779953241348267, "rewards/margins": 0.05051570385694504, "rewards/rejected": -0.2183152437210083, "step": 5270 }, { "epoch": 1.819434872501723, "grad_norm": 2.71742844581604, "learning_rate": 2.010492343878959e-08, "logits/chosen": -2.906381845474243, "logits/rejected": -2.8961892127990723, "logps/chosen": -71.85508728027344, "logps/rejected": -77.47413635253906, "loss": 0.6706, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1716351956129074, "rewards/margins": 0.051093690097332, "rewards/rejected": -0.22272889316082, "step": 5280 }, { "epoch": 1.822880771881461, "grad_norm": 2.600576400756836, "learning_rate": 2.0006661151232984e-08, "logits/chosen": -2.91498064994812, "logits/rejected": -2.8979058265686035, "logps/chosen": -71.20735168457031, "logps/rejected": -77.16497039794922, "loss": 0.6667, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16552945971488953, "rewards/margins": 0.05823059752583504, "rewards/rejected": -0.22376005351543427, "step": 5290 }, { "epoch": 1.8263266712611992, "grad_norm": 2.586590528488159, "learning_rate": 1.9908479144570418e-08, "logits/chosen": -2.905637741088867, "logits/rejected": -2.8802638053894043, "logps/chosen": -71.35521697998047, "logps/rejected": -74.51483917236328, "loss": 0.6698, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.16961674392223358, "rewards/margins": 0.05241764709353447, "rewards/rejected": -0.22203440964221954, "step": 5300 }, { "epoch": 1.8263266712611992, "eval_logits/chosen": -2.968419075012207, "eval_logits/rejected": -2.962567090988159, "eval_logps/chosen": -70.36827087402344, "eval_logps/rejected": -77.58527374267578, "eval_loss": 0.6810991168022156, "eval_rewards/accuracies": 0.6066449880599976, "eval_rewards/chosen": -0.11656372994184494, "eval_rewards/margins": 0.027487870305776596, "eval_rewards/rejected": -0.14405162632465363, "eval_runtime": 384.7907, "eval_samples_per_second": 11.185, "eval_steps_per_second": 1.398, "step": 5300 }, { "epoch": 1.8297725706409373, "grad_norm": 2.916703224182129, "learning_rate": 1.9810378997332692e-08, "logits/chosen": -2.82668137550354, "logits/rejected": -2.8102364540100098, "logps/chosen": -69.68516540527344, "logps/rejected": -74.15269470214844, "loss": 0.6717, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1725122034549713, "rewards/margins": 0.048040445894002914, "rewards/rejected": -0.22055263817310333, "step": 5310 }, { "epoch": 1.8332184700206753, "grad_norm": 2.818455457687378, "learning_rate": 1.9712362286734547e-08, "logits/chosen": -2.869730234146118, "logits/rejected": -2.8491005897521973, "logps/chosen": -71.40693664550781, "logps/rejected": -75.9687728881836, "loss": 0.6673, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16744691133499146, "rewards/margins": 0.05850174278020859, "rewards/rejected": -0.22594866156578064, "step": 5320 }, { "epoch": 1.8366643694004137, "grad_norm": 2.5827646255493164, "learning_rate": 1.9614430588649217e-08, "logits/chosen": -2.8839173316955566, "logits/rejected": -2.8669357299804688, "logps/chosen": -71.69801330566406, "logps/rejected": -77.46391296386719, "loss": 0.6688, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.17538206279277802, "rewards/margins": 0.054268717765808105, "rewards/rejected": -0.2296508103609085, "step": 5330 }, { "epoch": 1.8401102687801516, "grad_norm": 2.6504569053649902, "learning_rate": 1.951658547758317e-08, "logits/chosen": -2.8742682933807373, "logits/rejected": -2.8431947231292725, "logps/chosen": -71.82453155517578, "logps/rejected": -74.79241943359375, "loss": 0.6664, "rewards/accuracies": 0.640625, "rewards/chosen": -0.1761089414358139, "rewards/margins": 0.06009141728281975, "rewards/rejected": -0.23620033264160156, "step": 5340 }, { "epoch": 1.8435561681598898, "grad_norm": 2.4286372661590576, "learning_rate": 1.9418828526650767e-08, "logits/chosen": -2.902813673019409, "logits/rejected": -2.8797168731689453, "logps/chosen": -71.55126190185547, "logps/rejected": -75.11259460449219, "loss": 0.6701, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.16592486202716827, "rewards/margins": 0.05162210389971733, "rewards/rejected": -0.2175469845533371, "step": 5350 }, { "epoch": 1.847002067539628, "grad_norm": 2.765981912612915, "learning_rate": 1.9321161307548935e-08, "logits/chosen": -2.8550877571105957, "logits/rejected": -2.82519268989563, "logps/chosen": -70.42249298095703, "logps/rejected": -73.4386215209961, "loss": 0.6702, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.17406678199768066, "rewards/margins": 0.05172518640756607, "rewards/rejected": -0.22579196095466614, "step": 5360 }, { "epoch": 1.8504479669193659, "grad_norm": 3.3246870040893555, "learning_rate": 1.922358539053197e-08, "logits/chosen": -2.890129327774048, "logits/rejected": -2.86722993850708, "logps/chosen": -71.7930908203125, "logps/rejected": -74.7721939086914, "loss": 0.6718, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.17468366026878357, "rewards/margins": 0.047895219177007675, "rewards/rejected": -0.22257891297340393, "step": 5370 }, { "epoch": 1.853893866299104, "grad_norm": 2.6430766582489014, "learning_rate": 1.91261023443862e-08, "logits/chosen": -2.8341400623321533, "logits/rejected": -2.8099443912506104, "logps/chosen": -69.27935791015625, "logps/rejected": -75.3008804321289, "loss": 0.6651, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.16409826278686523, "rewards/margins": 0.0616566464304924, "rewards/rejected": -0.22575490176677704, "step": 5380 }, { "epoch": 1.8573397656788422, "grad_norm": 2.7408976554870605, "learning_rate": 1.9028713736404868e-08, "logits/chosen": -2.887418746948242, "logits/rejected": -2.870863437652588, "logps/chosen": -71.47615051269531, "logps/rejected": -76.49659729003906, "loss": 0.6695, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.16355492174625397, "rewards/margins": 0.05303896591067314, "rewards/rejected": -0.2165938913822174, "step": 5390 }, { "epoch": 1.8607856650585803, "grad_norm": 2.8003110885620117, "learning_rate": 1.8931421132362826e-08, "logits/chosen": -2.960994243621826, "logits/rejected": -2.9350266456604004, "logps/chosen": -68.39563751220703, "logps/rejected": -76.294677734375, "loss": 0.6644, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16245418787002563, "rewards/margins": 0.06386339664459229, "rewards/rejected": -0.2263175994157791, "step": 5400 }, { "epoch": 1.8607856650585803, "eval_logits/chosen": -2.9650156497955322, "eval_logits/rejected": -2.959172010421753, "eval_logps/chosen": -70.68416595458984, "eval_logps/rejected": -77.96033477783203, "eval_loss": 0.6808489561080933, "eval_rewards/accuracies": 0.6036245226860046, "eval_rewards/chosen": -0.11972276866436005, "eval_rewards/margins": 0.02807936631143093, "eval_rewards/rejected": -0.14780212938785553, "eval_runtime": 384.7367, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 5400 }, { "epoch": 1.8642315644383185, "grad_norm": 2.5483036041259766, "learning_rate": 1.8834226096491457e-08, "logits/chosen": -2.8595709800720215, "logits/rejected": -2.8264570236206055, "logps/chosen": -71.37809753417969, "logps/rejected": -72.77625274658203, "loss": 0.6666, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.16140702366828918, "rewards/margins": 0.05878330022096634, "rewards/rejected": -0.22019031643867493, "step": 5410 }, { "epoch": 1.8676774638180564, "grad_norm": 2.5755534172058105, "learning_rate": 1.8737130191453444e-08, "logits/chosen": -2.8737831115722656, "logits/rejected": -2.8454034328460693, "logps/chosen": -69.20013427734375, "logps/rejected": -73.87606048583984, "loss": 0.6712, "rewards/accuracies": 0.640625, "rewards/chosen": -0.16796642541885376, "rewards/margins": 0.049731411039829254, "rewards/rejected": -0.21769782900810242, "step": 5420 }, { "epoch": 1.8711233631977946, "grad_norm": 2.7243731021881104, "learning_rate": 1.8640134978317707e-08, "logits/chosen": -2.8814034461975098, "logits/rejected": -2.8573858737945557, "logps/chosen": -69.18235778808594, "logps/rejected": -75.73823547363281, "loss": 0.667, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.16472555696964264, "rewards/margins": 0.058313000947237015, "rewards/rejected": -0.22303852438926697, "step": 5430 }, { "epoch": 1.8745692625775328, "grad_norm": 2.8108060359954834, "learning_rate": 1.8543242016534295e-08, "logits/chosen": -2.8223328590393066, "logits/rejected": -2.809980869293213, "logps/chosen": -68.95293426513672, "logps/rejected": -77.0837631225586, "loss": 0.6653, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1660550832748413, "rewards/margins": 0.061001889407634735, "rewards/rejected": -0.22705698013305664, "step": 5440 }, { "epoch": 1.8780151619572707, "grad_norm": 2.4889485836029053, "learning_rate": 1.8446452863909272e-08, "logits/chosen": -2.9422874450683594, "logits/rejected": -2.917186737060547, "logps/chosen": -70.5916976928711, "logps/rejected": -74.60118103027344, "loss": 0.6687, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16515326499938965, "rewards/margins": 0.05371558666229248, "rewards/rejected": -0.21886885166168213, "step": 5450 }, { "epoch": 1.881461061337009, "grad_norm": 2.659416437149048, "learning_rate": 1.8349769076579714e-08, "logits/chosen": -2.8489508628845215, "logits/rejected": -2.8205370903015137, "logps/chosen": -72.37272644042969, "logps/rejected": -76.41153717041016, "loss": 0.6646, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16620686650276184, "rewards/margins": 0.06240493804216385, "rewards/rejected": -0.2286117970943451, "step": 5460 }, { "epoch": 1.884906960716747, "grad_norm": 2.9940054416656494, "learning_rate": 1.8253192208988657e-08, "logits/chosen": -2.7865242958068848, "logits/rejected": -2.761887311935425, "logps/chosen": -72.81736755371094, "logps/rejected": -75.52337646484375, "loss": 0.6723, "rewards/accuracies": 0.609375, "rewards/chosen": -0.17481762170791626, "rewards/margins": 0.0471196249127388, "rewards/rejected": -0.22193722426891327, "step": 5470 }, { "epoch": 1.8883528600964852, "grad_norm": 2.8586626052856445, "learning_rate": 1.815672381386017e-08, "logits/chosen": -2.853214979171753, "logits/rejected": -2.8347599506378174, "logps/chosen": -72.21398162841797, "logps/rejected": -77.33732604980469, "loss": 0.6679, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1800553947687149, "rewards/margins": 0.056976526975631714, "rewards/rejected": -0.2370319366455078, "step": 5480 }, { "epoch": 1.8917987594762233, "grad_norm": 2.9663612842559814, "learning_rate": 1.806036544217429e-08, "logits/chosen": -2.8946380615234375, "logits/rejected": -2.8724074363708496, "logps/chosen": -72.75650787353516, "logps/rejected": -78.33419036865234, "loss": 0.6663, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.17038476467132568, "rewards/margins": 0.06150870397686958, "rewards/rejected": -0.23189349472522736, "step": 5490 }, { "epoch": 1.8952446588559613, "grad_norm": 3.114943027496338, "learning_rate": 1.7964118643142196e-08, "logits/chosen": -2.9016942977905273, "logits/rejected": -2.8939430713653564, "logps/chosen": -71.15440368652344, "logps/rejected": -77.6981430053711, "loss": 0.6735, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.18466083705425262, "rewards/margins": 0.04492039233446121, "rewards/rejected": -0.22958120703697205, "step": 5500 }, { "epoch": 1.8952446588559613, "eval_logits/chosen": -2.9619333744049072, "eval_logits/rejected": -2.956106662750244, "eval_logps/chosen": -70.8988265991211, "eval_logps/rejected": -78.21332550048828, "eval_loss": 0.6806997060775757, "eval_rewards/accuracies": 0.6017658114433289, "eval_rewards/chosen": -0.12186926603317261, "eval_rewards/margins": 0.028462767601013184, "eval_rewards/rejected": -0.1503320336341858, "eval_runtime": 384.7541, "eval_samples_per_second": 11.186, "eval_steps_per_second": 1.398, "step": 5500 }, { "epoch": 1.8986905582356997, "grad_norm": 2.8946828842163086, "learning_rate": 1.7867984964181194e-08, "logits/chosen": -2.9266982078552246, "logits/rejected": -2.901700735092163, "logps/chosen": -70.2459487915039, "logps/rejected": -74.8499526977539, "loss": 0.6638, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16140781342983246, "rewards/margins": 0.06453949958086014, "rewards/rejected": -0.225947305560112, "step": 5510 }, { "epoch": 1.9021364576154376, "grad_norm": 2.641909599304199, "learning_rate": 1.7771965950889927e-08, "logits/chosen": -2.916412353515625, "logits/rejected": -2.9049272537231445, "logps/chosen": -73.4239501953125, "logps/rejected": -78.68853759765625, "loss": 0.6669, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.1779644787311554, "rewards/margins": 0.05876699090003967, "rewards/rejected": -0.23673145473003387, "step": 5520 }, { "epoch": 1.9055823569951758, "grad_norm": 2.8171238899230957, "learning_rate": 1.7676063147023485e-08, "logits/chosen": -2.8761868476867676, "logits/rejected": -2.845132350921631, "logps/chosen": -69.6806411743164, "logps/rejected": -76.5590591430664, "loss": 0.6639, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.16561131179332733, "rewards/margins": 0.06496408581733704, "rewards/rejected": -0.23057539761066437, "step": 5530 }, { "epoch": 1.909028256374914, "grad_norm": 2.788418769836426, "learning_rate": 1.758027809446856e-08, "logits/chosen": -2.9127445220947266, "logits/rejected": -2.876011371612549, "logps/chosen": -71.16133117675781, "logps/rejected": -75.27438354492188, "loss": 0.6593, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.16821877658367157, "rewards/margins": 0.07422787696123123, "rewards/rejected": -0.2424466609954834, "step": 5540 }, { "epoch": 1.9124741557546519, "grad_norm": 2.511777639389038, "learning_rate": 1.748461233321874e-08, "logits/chosen": -2.815739154815674, "logits/rejected": -2.7984278202056885, "logps/chosen": -71.97752380371094, "logps/rejected": -76.06411743164062, "loss": 0.6706, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.17626744508743286, "rewards/margins": 0.05062605068087578, "rewards/rejected": -0.22689349949359894, "step": 5550 }, { "epoch": 1.9159200551343902, "grad_norm": 2.6722583770751953, "learning_rate": 1.7389067401349637e-08, "logits/chosen": -2.9440550804138184, "logits/rejected": -2.917231798171997, "logps/chosen": -75.46846771240234, "logps/rejected": -78.74665832519531, "loss": 0.6646, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.18760187923908234, "rewards/margins": 0.06388412415981293, "rewards/rejected": -0.25148600339889526, "step": 5560 }, { "epoch": 1.9193659545141282, "grad_norm": 2.7216358184814453, "learning_rate": 1.7293644834994265e-08, "logits/chosen": -2.8047280311584473, "logits/rejected": -2.7836663722991943, "logps/chosen": -73.18403625488281, "logps/rejected": -75.3668441772461, "loss": 0.6726, "rewards/accuracies": 0.640625, "rewards/chosen": -0.1833932101726532, "rewards/margins": 0.046460576355457306, "rewards/rejected": -0.2298537939786911, "step": 5570 }, { "epoch": 1.9228118538938663, "grad_norm": 2.5434470176696777, "learning_rate": 1.7198346168318255e-08, "logits/chosen": -2.9130733013153076, "logits/rejected": -2.8974547386169434, "logps/chosen": -72.49492645263672, "logps/rejected": -75.53585815429688, "loss": 0.6741, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.17388814687728882, "rewards/margins": 0.04384138435125351, "rewards/rejected": -0.21772952377796173, "step": 5580 }, { "epoch": 1.9262577532736045, "grad_norm": 2.7205216884613037, "learning_rate": 1.7103172933495268e-08, "logits/chosen": -2.851735830307007, "logits/rejected": -2.8260715007781982, "logps/chosen": -73.14872741699219, "logps/rejected": -72.71751403808594, "loss": 0.67, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.17584729194641113, "rewards/margins": 0.052096374332904816, "rewards/rejected": -0.22794368863105774, "step": 5590 }, { "epoch": 1.9297036526533424, "grad_norm": 2.8131136894226074, "learning_rate": 1.7008126660682274e-08, "logits/chosen": -2.9163663387298584, "logits/rejected": -2.891171455383301, "logps/chosen": -70.73484802246094, "logps/rejected": -76.10734558105469, "loss": 0.662, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.16637368500232697, "rewards/margins": 0.06777597963809967, "rewards/rejected": -0.23414966464042664, "step": 5600 }, { "epoch": 1.9297036526533424, "eval_logits/chosen": -2.9584717750549316, "eval_logits/rejected": -2.952648401260376, "eval_logps/chosen": -71.29200744628906, "eval_logps/rejected": -78.66410827636719, "eval_loss": 0.680462121963501, "eval_rewards/accuracies": 0.6031598448753357, "eval_rewards/chosen": -0.12580114603042603, "eval_rewards/margins": 0.029038695618510246, "eval_rewards/rejected": -0.15483984351158142, "eval_runtime": 384.9923, "eval_samples_per_second": 11.179, "eval_steps_per_second": 1.397, "step": 5600 }, { "epoch": 1.9331495520330806, "grad_norm": 2.831731081008911, "learning_rate": 1.691320887799504e-08, "logits/chosen": -2.822838544845581, "logits/rejected": -2.807856798171997, "logps/chosen": -68.80204010009766, "logps/rejected": -78.33151245117188, "loss": 0.6669, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.17571809887886047, "rewards/margins": 0.05944720655679703, "rewards/rejected": -0.2351652830839157, "step": 5610 }, { "epoch": 1.9365954514128187, "grad_norm": 2.6682093143463135, "learning_rate": 1.681842111148352e-08, "logits/chosen": -2.8356566429138184, "logits/rejected": -2.8215272426605225, "logps/chosen": -74.12593841552734, "logps/rejected": -75.23628997802734, "loss": 0.6767, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.1891641467809677, "rewards/margins": 0.03835584595799446, "rewards/rejected": -0.22751998901367188, "step": 5620 }, { "epoch": 1.940041350792557, "grad_norm": 2.92366886138916, "learning_rate": 1.6723764885107282e-08, "logits/chosen": -2.834421157836914, "logits/rejected": -2.804478883743286, "logps/chosen": -73.24942779541016, "logps/rejected": -75.69530487060547, "loss": 0.6664, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17188575863838196, "rewards/margins": 0.05913590267300606, "rewards/rejected": -0.2310216724872589, "step": 5630 }, { "epoch": 1.943487250172295, "grad_norm": 2.8467464447021484, "learning_rate": 1.6629241720711096e-08, "logits/chosen": -2.887014865875244, "logits/rejected": -2.857152223587036, "logps/chosen": -73.31623077392578, "logps/rejected": -74.98347473144531, "loss": 0.6665, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.17376892268657684, "rewards/margins": 0.058810073882341385, "rewards/rejected": -0.23257899284362793, "step": 5640 }, { "epoch": 1.946933149552033, "grad_norm": 3.013774871826172, "learning_rate": 1.6534853138000366e-08, "logits/chosen": -2.858309268951416, "logits/rejected": -2.8377978801727295, "logps/chosen": -76.77154541015625, "logps/rejected": -77.50053405761719, "loss": 0.6701, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.1733008623123169, "rewards/margins": 0.05244135111570358, "rewards/rejected": -0.22574222087860107, "step": 5650 }, { "epoch": 1.9503790489317712, "grad_norm": 2.980992555618286, "learning_rate": 1.6440600654516777e-08, "logits/chosen": -2.913217067718506, "logits/rejected": -2.8954920768737793, "logps/chosen": -69.61531829833984, "logps/rejected": -76.81146240234375, "loss": 0.6607, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1692298799753189, "rewards/margins": 0.07172516733407974, "rewards/rejected": -0.24095503985881805, "step": 5660 }, { "epoch": 1.9538249483115093, "grad_norm": 2.8797342777252197, "learning_rate": 1.6346485785613852e-08, "logits/chosen": -2.8300557136535645, "logits/rejected": -2.8022449016571045, "logps/chosen": -72.28825378417969, "logps/rejected": -76.40351867675781, "loss": 0.6633, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18319183588027954, "rewards/margins": 0.06637506186962128, "rewards/rejected": -0.249566912651062, "step": 5670 }, { "epoch": 1.9572708476912473, "grad_norm": 2.7297043800354004, "learning_rate": 1.625251004443262e-08, "logits/chosen": -2.9041075706481934, "logits/rejected": -2.883746385574341, "logps/chosen": -73.53236389160156, "logps/rejected": -76.66496276855469, "loss": 0.6691, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1895168274641037, "rewards/margins": 0.05409818887710571, "rewards/rejected": -0.2436150312423706, "step": 5680 }, { "epoch": 1.9607167470709856, "grad_norm": 2.725618362426758, "learning_rate": 1.6158674941877236e-08, "logits/chosen": -2.8295116424560547, "logits/rejected": -2.8105344772338867, "logps/chosen": -72.64253234863281, "logps/rejected": -76.02645111083984, "loss": 0.6684, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.1858823299407959, "rewards/margins": 0.05590387433767319, "rewards/rejected": -0.2417861968278885, "step": 5690 }, { "epoch": 1.9641626464507236, "grad_norm": 2.804849147796631, "learning_rate": 1.6064981986590763e-08, "logits/chosen": -2.9085612297058105, "logits/rejected": -2.881753921508789, "logps/chosen": -70.80479431152344, "logps/rejected": -77.58302307128906, "loss": 0.6634, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.16686227917671204, "rewards/margins": 0.06583373993635178, "rewards/rejected": -0.2326960265636444, "step": 5700 }, { "epoch": 1.9641626464507236, "eval_logits/chosen": -2.955355405807495, "eval_logits/rejected": -2.9495320320129395, "eval_logps/chosen": -71.450439453125, "eval_logps/rejected": -78.85834503173828, "eval_loss": 0.6803197860717773, "eval_rewards/accuracies": 0.6050186157226562, "eval_rewards/chosen": -0.1273854672908783, "eval_rewards/margins": 0.029396746307611465, "eval_rewards/rejected": -0.15678218007087708, "eval_runtime": 384.6759, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 5700 }, { "epoch": 1.9676085458304617, "grad_norm": 2.878384590148926, "learning_rate": 1.5971432684930852e-08, "logits/chosen": -2.8811278343200684, "logits/rejected": -2.8517918586730957, "logps/chosen": -71.73265075683594, "logps/rejected": -75.1236343383789, "loss": 0.6691, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1704731285572052, "rewards/margins": 0.05412403494119644, "rewards/rejected": -0.22459714114665985, "step": 5710 }, { "epoch": 1.9710544452102, "grad_norm": 2.6991753578186035, "learning_rate": 1.5878028540945552e-08, "logits/chosen": -2.8333468437194824, "logits/rejected": -2.814579725265503, "logps/chosen": -72.25865173339844, "logps/rejected": -75.1773910522461, "loss": 0.6756, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.18498411774635315, "rewards/margins": 0.04039698839187622, "rewards/rejected": -0.22538113594055176, "step": 5720 }, { "epoch": 1.9745003445899378, "grad_norm": 2.8288464546203613, "learning_rate": 1.578477105634914e-08, "logits/chosen": -2.8175792694091797, "logits/rejected": -2.796377658843994, "logps/chosen": -71.75520324707031, "logps/rejected": -77.10679626464844, "loss": 0.6641, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.1763220727443695, "rewards/margins": 0.06456269323825836, "rewards/rejected": -0.24088478088378906, "step": 5730 }, { "epoch": 1.9779462439696762, "grad_norm": 2.8403077125549316, "learning_rate": 1.5691661730497936e-08, "logits/chosen": -2.905531644821167, "logits/rejected": -2.8741893768310547, "logps/chosen": -74.14823150634766, "logps/rejected": -75.32475280761719, "loss": 0.6625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17679746448993683, "rewards/margins": 0.06940910220146179, "rewards/rejected": -0.24620656669139862, "step": 5740 }, { "epoch": 1.9813921433494142, "grad_norm": 2.375809907913208, "learning_rate": 1.559870206036626e-08, "logits/chosen": -2.8683509826660156, "logits/rejected": -2.843994140625, "logps/chosen": -71.11871337890625, "logps/rejected": -73.41519165039062, "loss": 0.6767, "rewards/accuracies": 0.625, "rewards/chosen": -0.18628919124603271, "rewards/margins": 0.03832828253507614, "rewards/rejected": -0.22461748123168945, "step": 5750 }, { "epoch": 1.9848380427291523, "grad_norm": 2.7703638076782227, "learning_rate": 1.550589354052228e-08, "logits/chosen": -2.8484389781951904, "logits/rejected": -2.816235065460205, "logps/chosen": -74.5155258178711, "logps/rejected": -75.35545349121094, "loss": 0.6626, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.1687597930431366, "rewards/margins": 0.06856951862573624, "rewards/rejected": -0.23732931911945343, "step": 5760 }, { "epoch": 1.9882839421088905, "grad_norm": 2.634413480758667, "learning_rate": 1.5413237663104084e-08, "logits/chosen": -2.886798143386841, "logits/rejected": -2.8637537956237793, "logps/chosen": -69.6534652709961, "logps/rejected": -75.93453979492188, "loss": 0.6647, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.17548868060112, "rewards/margins": 0.06267096847295761, "rewards/rejected": -0.2381596565246582, "step": 5770 }, { "epoch": 1.9917298414886284, "grad_norm": 2.8247203826904297, "learning_rate": 1.5320735917795592e-08, "logits/chosen": -2.8517236709594727, "logits/rejected": -2.827307939529419, "logps/chosen": -71.51679992675781, "logps/rejected": -76.0186538696289, "loss": 0.6696, "rewards/accuracies": 0.640625, "rewards/chosen": -0.18360595405101776, "rewards/margins": 0.05353239178657532, "rewards/rejected": -0.23713836073875427, "step": 5780 }, { "epoch": 1.9951757408683668, "grad_norm": 2.9096503257751465, "learning_rate": 1.522838979180266e-08, "logits/chosen": -2.884448289871216, "logits/rejected": -2.86497163772583, "logps/chosen": -74.96343231201172, "logps/rejected": -79.83631134033203, "loss": 0.6742, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.197788268327713, "rewards/margins": 0.04370398074388504, "rewards/rejected": -0.24149227142333984, "step": 5790 }, { "epoch": 1.9986216402481047, "grad_norm": 2.763305902481079, "learning_rate": 1.5136200769829194e-08, "logits/chosen": -2.8206562995910645, "logits/rejected": -2.79538893699646, "logps/chosen": -72.14488983154297, "logps/rejected": -75.0128173828125, "loss": 0.6685, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.17169512808322906, "rewards/margins": 0.05441085249185562, "rewards/rejected": -0.2261059731245041, "step": 5800 }, { "epoch": 1.9986216402481047, "eval_logits/chosen": -2.953160285949707, "eval_logits/rejected": -2.947273015975952, "eval_logps/chosen": -71.64476776123047, "eval_logps/rejected": -79.09119415283203, "eval_loss": 0.680160403251648, "eval_rewards/accuracies": 0.6031598448753357, "eval_rewards/chosen": -0.12932877242565155, "eval_rewards/margins": 0.029781876131892204, "eval_rewards/rejected": -0.1591106504201889, "eval_runtime": 384.9069, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 5800 }, { "epoch": 2.0020675396278427, "grad_norm": 3.070322275161743, "learning_rate": 1.5044170334053188e-08, "logits/chosen": -2.8400168418884277, "logits/rejected": -2.822423219680786, "logps/chosen": -72.52458190917969, "logps/rejected": -77.5846939086914, "loss": 0.6712, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.18587848544120789, "rewards/margins": 0.05025704577565193, "rewards/rejected": -0.23613551259040833, "step": 5810 }, { "epoch": 2.005513439007581, "grad_norm": 3.0915560722351074, "learning_rate": 1.4952299964103005e-08, "logits/chosen": -2.9020092487335205, "logits/rejected": -2.8757405281066895, "logps/chosen": -72.9637451171875, "logps/rejected": -75.72364044189453, "loss": 0.6639, "rewards/accuracies": 0.671875, "rewards/chosen": -0.1771032214164734, "rewards/margins": 0.06516268849372864, "rewards/rejected": -0.24226590991020203, "step": 5820 }, { "epoch": 2.008959338387319, "grad_norm": 2.6780731678009033, "learning_rate": 1.486059113703349e-08, "logits/chosen": -2.864835739135742, "logits/rejected": -2.8269145488739014, "logps/chosen": -72.051025390625, "logps/rejected": -75.1775131225586, "loss": 0.6603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.17258547246456146, "rewards/margins": 0.07212905585765839, "rewards/rejected": -0.24471454322338104, "step": 5830 }, { "epoch": 2.0124052377670574, "grad_norm": 2.7480978965759277, "learning_rate": 1.4769045327302298e-08, "logits/chosen": -2.8165197372436523, "logits/rejected": -2.790295362472534, "logps/chosen": -72.93586730957031, "logps/rejected": -80.86524963378906, "loss": 0.6553, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17015625536441803, "rewards/margins": 0.08368439972400665, "rewards/rejected": -0.2538406252861023, "step": 5840 }, { "epoch": 2.0158511371467953, "grad_norm": 2.6187939643859863, "learning_rate": 1.4677664006746127e-08, "logits/chosen": -2.915573835372925, "logits/rejected": -2.883148193359375, "logps/chosen": -75.87751007080078, "logps/rejected": -79.40313720703125, "loss": 0.6649, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.17452451586723328, "rewards/margins": 0.06326372921466827, "rewards/rejected": -0.23778827488422394, "step": 5850 }, { "epoch": 2.0192970365265333, "grad_norm": 2.888774871826172, "learning_rate": 1.458644864455712e-08, "logits/chosen": -2.904392719268799, "logits/rejected": -2.8851654529571533, "logps/chosen": -72.34233093261719, "logps/rejected": -76.63909149169922, "loss": 0.6674, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.1772407740354538, "rewards/margins": 0.05985737591981888, "rewards/rejected": -0.23709814250469208, "step": 5860 }, { "epoch": 2.0227429359062716, "grad_norm": 2.6644673347473145, "learning_rate": 1.449540070725918e-08, "logits/chosen": -2.8865315914154053, "logits/rejected": -2.8711280822753906, "logps/chosen": -72.6664047241211, "logps/rejected": -79.73627471923828, "loss": 0.6681, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1860467940568924, "rewards/margins": 0.05664234608411789, "rewards/rejected": -0.2426891326904297, "step": 5870 }, { "epoch": 2.0261888352860096, "grad_norm": 3.0654804706573486, "learning_rate": 1.4404521658684438e-08, "logits/chosen": -2.9004409313201904, "logits/rejected": -2.891820192337036, "logps/chosen": -69.09294128417969, "logps/rejected": -78.7652359008789, "loss": 0.6619, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.17560425400733948, "rewards/margins": 0.06888742744922638, "rewards/rejected": -0.24449166655540466, "step": 5880 }, { "epoch": 2.029634734665748, "grad_norm": 3.1742727756500244, "learning_rate": 1.4313812959949682e-08, "logits/chosen": -2.8075931072235107, "logits/rejected": -2.784196138381958, "logps/chosen": -71.74871063232422, "logps/rejected": -75.79220581054688, "loss": 0.6679, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.18168126046657562, "rewards/margins": 0.05658834055066109, "rewards/rejected": -0.2382695972919464, "step": 5890 }, { "epoch": 2.033080634045486, "grad_norm": 2.6430060863494873, "learning_rate": 1.4223276069432898e-08, "logits/chosen": -2.8478686809539795, "logits/rejected": -2.839992046356201, "logps/chosen": -72.51114654541016, "logps/rejected": -78.14730072021484, "loss": 0.6698, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.19261565804481506, "rewards/margins": 0.05215558409690857, "rewards/rejected": -0.24477121233940125, "step": 5900 }, { "epoch": 2.033080634045486, "eval_logits/chosen": -2.950310230255127, "eval_logits/rejected": -2.9444398880004883, "eval_logps/chosen": -71.94589233398438, "eval_logps/rejected": -79.44258117675781, "eval_loss": 0.679954469203949, "eval_rewards/accuracies": 0.6038568615913391, "eval_rewards/chosen": -0.1323399692773819, "eval_rewards/margins": 0.030284566804766655, "eval_rewards/rejected": -0.1626245379447937, "eval_runtime": 384.8932, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 5900 }, { "epoch": 2.036526533425224, "grad_norm": 3.215297222137451, "learning_rate": 1.4132912442749803e-08, "logits/chosen": -2.8545494079589844, "logits/rejected": -2.8399388790130615, "logps/chosen": -72.28363037109375, "logps/rejected": -77.0155258178711, "loss": 0.6689, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1934853494167328, "rewards/margins": 0.05461903288960457, "rewards/rejected": -0.24810437858104706, "step": 5910 }, { "epoch": 2.039972432804962, "grad_norm": 2.812729835510254, "learning_rate": 1.404272353273045e-08, "logits/chosen": -2.8346471786499023, "logits/rejected": -2.8205513954162598, "logps/chosen": -69.64672088623047, "logps/rejected": -76.75395965576172, "loss": 0.6684, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.1801648586988449, "rewards/margins": 0.05517042800784111, "rewards/rejected": -0.2353353053331375, "step": 5920 }, { "epoch": 2.0434183321847, "grad_norm": 2.8067190647125244, "learning_rate": 1.3952710789395877e-08, "logits/chosen": -2.8325607776641846, "logits/rejected": -2.804713010787964, "logps/chosen": -75.24555206298828, "logps/rejected": -78.1937484741211, "loss": 0.6736, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2042478621006012, "rewards/margins": 0.04533013328909874, "rewards/rejected": -0.24957799911499023, "step": 5930 }, { "epoch": 2.0468642315644385, "grad_norm": 3.0256736278533936, "learning_rate": 1.3862875659934743e-08, "logits/chosen": -2.824462413787842, "logits/rejected": -2.8075549602508545, "logps/chosen": -70.057861328125, "logps/rejected": -78.82840728759766, "loss": 0.6613, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.17371094226837158, "rewards/margins": 0.07016728073358536, "rewards/rejected": -0.24387821555137634, "step": 5940 }, { "epoch": 2.0503101309441765, "grad_norm": 2.9061193466186523, "learning_rate": 1.3773219588680168e-08, "logits/chosen": -2.828366279602051, "logits/rejected": -2.8039188385009766, "logps/chosen": -72.87462615966797, "logps/rejected": -77.2781982421875, "loss": 0.6628, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18222375214099884, "rewards/margins": 0.06857915967702866, "rewards/rejected": -0.2508028745651245, "step": 5950 }, { "epoch": 2.0537560303239144, "grad_norm": 3.245572566986084, "learning_rate": 1.3683744017086385e-08, "logits/chosen": -2.8829081058502197, "logits/rejected": -2.8768887519836426, "logps/chosen": -74.02415466308594, "logps/rejected": -81.490966796875, "loss": 0.6704, "rewards/accuracies": 0.625, "rewards/chosen": -0.18862801790237427, "rewards/margins": 0.05201061815023422, "rewards/rejected": -0.24063865840435028, "step": 5960 }, { "epoch": 2.057201929703653, "grad_norm": 2.971945285797119, "learning_rate": 1.3594450383705669e-08, "logits/chosen": -2.8463821411132812, "logits/rejected": -2.8309214115142822, "logps/chosen": -71.5243911743164, "logps/rejected": -77.91888427734375, "loss": 0.6695, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.18059362471103668, "rewards/margins": 0.05352305248379707, "rewards/rejected": -0.23411667346954346, "step": 5970 }, { "epoch": 2.0606478290833907, "grad_norm": 2.889366626739502, "learning_rate": 1.3505340124165154e-08, "logits/chosen": -2.876772403717041, "logits/rejected": -2.854177951812744, "logps/chosen": -73.63459014892578, "logps/rejected": -78.09153747558594, "loss": 0.6718, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.18617992103099823, "rewards/margins": 0.048811398446559906, "rewards/rejected": -0.23499131202697754, "step": 5980 }, { "epoch": 2.0640937284631287, "grad_norm": 2.9183542728424072, "learning_rate": 1.3416414671143744e-08, "logits/chosen": -2.867488145828247, "logits/rejected": -2.8466312885284424, "logps/chosen": -73.60958099365234, "logps/rejected": -78.07908630371094, "loss": 0.6666, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.18132875859737396, "rewards/margins": 0.05995461344718933, "rewards/rejected": -0.2412833869457245, "step": 5990 }, { "epoch": 2.067539627842867, "grad_norm": 2.8540546894073486, "learning_rate": 1.3327675454349136e-08, "logits/chosen": -2.882275104522705, "logits/rejected": -2.8465981483459473, "logps/chosen": -72.7598876953125, "logps/rejected": -78.54497528076172, "loss": 0.6627, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.18601307272911072, "rewards/margins": 0.06720709800720215, "rewards/rejected": -0.25322017073631287, "step": 6000 }, { "epoch": 2.067539627842867, "eval_logits/chosen": -2.947733163833618, "eval_logits/rejected": -2.9418952465057373, "eval_logps/chosen": -72.1328353881836, "eval_logps/rejected": -79.67119598388672, "eval_loss": 0.679771900177002, "eval_rewards/accuracies": 0.6064126491546631, "eval_rewards/chosen": -0.13420936465263367, "eval_rewards/margins": 0.030701296404004097, "eval_rewards/rejected": -0.16491064429283142, "eval_runtime": 384.8866, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 6000 }, { "epoch": 2.070985527222605, "grad_norm": 2.6914260387420654, "learning_rate": 1.3239123900494737e-08, "logits/chosen": -2.8692572116851807, "logits/rejected": -2.8503878116607666, "logps/chosen": -71.97503662109375, "logps/rejected": -78.16046142578125, "loss": 0.6675, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.18146854639053345, "rewards/margins": 0.057741619646549225, "rewards/rejected": -0.23921015858650208, "step": 6010 }, { "epoch": 2.0744314266023434, "grad_norm": 2.7999613285064697, "learning_rate": 1.3150761433276857e-08, "logits/chosen": -2.904170036315918, "logits/rejected": -2.8822875022888184, "logps/chosen": -70.21324157714844, "logps/rejected": -74.87745666503906, "loss": 0.6673, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.17861947417259216, "rewards/margins": 0.05857741832733154, "rewards/rejected": -0.2371968775987625, "step": 6020 }, { "epoch": 2.0778773259820813, "grad_norm": 2.935558319091797, "learning_rate": 1.3062589473351676e-08, "logits/chosen": -2.871791362762451, "logits/rejected": -2.839686870574951, "logps/chosen": -73.01780700683594, "logps/rejected": -77.13985443115234, "loss": 0.6639, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.17967572808265686, "rewards/margins": 0.06501632183790207, "rewards/rejected": -0.24469204246997833, "step": 6030 }, { "epoch": 2.0813232253618192, "grad_norm": 2.847379446029663, "learning_rate": 1.2974609438312544e-08, "logits/chosen": -2.807041645050049, "logits/rejected": -2.7772576808929443, "logps/chosen": -72.95405578613281, "logps/rejected": -76.84339904785156, "loss": 0.6629, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.17835943400859833, "rewards/margins": 0.06789851188659668, "rewards/rejected": -0.246257945895195, "step": 6040 }, { "epoch": 2.0847691247415576, "grad_norm": 2.9994969367980957, "learning_rate": 1.2886822742667061e-08, "logits/chosen": -2.850872039794922, "logits/rejected": -2.825368642807007, "logps/chosen": -75.16609191894531, "logps/rejected": -75.98664855957031, "loss": 0.6688, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.17855563759803772, "rewards/margins": 0.05494005233049393, "rewards/rejected": -0.23349566757678986, "step": 6050 }, { "epoch": 2.0882150241212956, "grad_norm": 2.7447428703308105, "learning_rate": 1.2799230797814414e-08, "logits/chosen": -2.859102487564087, "logits/rejected": -2.833218812942505, "logps/chosen": -75.78932189941406, "logps/rejected": -78.23182678222656, "loss": 0.6665, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.1931246817111969, "rewards/margins": 0.06041465327143669, "rewards/rejected": -0.2535393536090851, "step": 6060 }, { "epoch": 2.091660923501034, "grad_norm": 2.711883306503296, "learning_rate": 1.2711835012022698e-08, "logits/chosen": -2.8491129875183105, "logits/rejected": -2.8296196460723877, "logps/chosen": -71.62013244628906, "logps/rejected": -78.51918029785156, "loss": 0.6657, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1842469722032547, "rewards/margins": 0.060956645756959915, "rewards/rejected": -0.24520361423492432, "step": 6070 }, { "epoch": 2.095106822880772, "grad_norm": 2.8179895877838135, "learning_rate": 1.2624636790406173e-08, "logits/chosen": -2.8050475120544434, "logits/rejected": -2.783385992050171, "logps/chosen": -72.73182678222656, "logps/rejected": -77.1137924194336, "loss": 0.6672, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1803685426712036, "rewards/margins": 0.05829042196273804, "rewards/rejected": -0.23865894973278046, "step": 6080 }, { "epoch": 2.09855272226051, "grad_norm": 2.9454994201660156, "learning_rate": 1.2537637534902823e-08, "logits/chosen": -2.7882180213928223, "logits/rejected": -2.7828052043914795, "logps/chosen": -70.67366027832031, "logps/rejected": -82.29959106445312, "loss": 0.6661, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.19123628735542297, "rewards/margins": 0.062281377613544464, "rewards/rejected": -0.25351768732070923, "step": 6090 }, { "epoch": 2.101998621640248, "grad_norm": 3.004014730453491, "learning_rate": 1.2450838644251663e-08, "logits/chosen": -2.8539185523986816, "logits/rejected": -2.8254921436309814, "logps/chosen": -75.35298156738281, "logps/rejected": -79.0703353881836, "loss": 0.6631, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.1849779188632965, "rewards/margins": 0.06630324572324753, "rewards/rejected": -0.25128117203712463, "step": 6100 }, { "epoch": 2.101998621640248, "eval_logits/chosen": -2.9455599784851074, "eval_logits/rejected": -2.939725875854492, "eval_logps/chosen": -72.23080444335938, "eval_logps/rejected": -79.79864501953125, "eval_loss": 0.679646909236908, "eval_rewards/accuracies": 0.606877326965332, "eval_rewards/chosen": -0.13518904149532318, "eval_rewards/margins": 0.03099626675248146, "eval_rewards/rejected": -0.16618531942367554, "eval_runtime": 384.7472, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 6100 }, { "epoch": 2.105444521019986, "grad_norm": 2.8541877269744873, "learning_rate": 1.2364241513970358e-08, "logits/chosen": -2.81101655960083, "logits/rejected": -2.788217782974243, "logps/chosen": -73.13142395019531, "logps/rejected": -78.5287094116211, "loss": 0.6658, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1952894926071167, "rewards/margins": 0.06093985587358475, "rewards/rejected": -0.25622934103012085, "step": 6110 }, { "epoch": 2.1088904203997245, "grad_norm": 2.7225706577301025, "learning_rate": 1.2277847536332747e-08, "logits/chosen": -2.7820587158203125, "logits/rejected": -2.765590190887451, "logps/chosen": -72.20928955078125, "logps/rejected": -77.2238998413086, "loss": 0.6696, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.19913873076438904, "rewards/margins": 0.053004465997219086, "rewards/rejected": -0.2521432042121887, "step": 6120 }, { "epoch": 2.1123363197794625, "grad_norm": 2.7418386936187744, "learning_rate": 1.2191658100346464e-08, "logits/chosen": -2.822899103164673, "logits/rejected": -2.8043737411499023, "logps/chosen": -71.71107482910156, "logps/rejected": -78.16561889648438, "loss": 0.6707, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.19211336970329285, "rewards/margins": 0.05128926783800125, "rewards/rejected": -0.2434026002883911, "step": 6130 }, { "epoch": 2.1157822191592004, "grad_norm": 2.8674774169921875, "learning_rate": 1.2105674591730598e-08, "logits/chosen": -2.8589792251586914, "logits/rejected": -2.832695960998535, "logps/chosen": -74.89783477783203, "logps/rejected": -79.83828735351562, "loss": 0.6625, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.19233883917331696, "rewards/margins": 0.06969405710697174, "rewards/rejected": -0.2620328664779663, "step": 6140 }, { "epoch": 2.1192281185389388, "grad_norm": 2.939145803451538, "learning_rate": 1.2019898392893414e-08, "logits/chosen": -2.8057284355163574, "logits/rejected": -2.778944492340088, "logps/chosen": -72.93653869628906, "logps/rejected": -78.07707214355469, "loss": 0.6631, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.18705889582633972, "rewards/margins": 0.06675699353218079, "rewards/rejected": -0.2538158893585205, "step": 6150 }, { "epoch": 2.1226740179186767, "grad_norm": 2.9440038204193115, "learning_rate": 1.1934330882910173e-08, "logits/chosen": -2.8382813930511475, "logits/rejected": -2.8174939155578613, "logps/chosen": -72.0179672241211, "logps/rejected": -77.95629119873047, "loss": 0.6615, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.17419029772281647, "rewards/margins": 0.06997786462306976, "rewards/rejected": -0.24416819214820862, "step": 6160 }, { "epoch": 2.126119917298415, "grad_norm": 2.9518470764160156, "learning_rate": 1.1848973437500862e-08, "logits/chosen": -2.8576152324676514, "logits/rejected": -2.8404765129089355, "logps/chosen": -74.33116912841797, "logps/rejected": -78.85844421386719, "loss": 0.6709, "rewards/accuracies": 0.625, "rewards/chosen": -0.1940416842699051, "rewards/margins": 0.05094757676124573, "rewards/rejected": -0.244989275932312, "step": 6170 }, { "epoch": 2.129565816678153, "grad_norm": 3.1728286743164062, "learning_rate": 1.1763827429008175e-08, "logits/chosen": -2.8093442916870117, "logits/rejected": -2.7787070274353027, "logps/chosen": -73.55226135253906, "logps/rejected": -76.96476745605469, "loss": 0.6621, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1875348538160324, "rewards/margins": 0.06974972784519196, "rewards/rejected": -0.25728458166122437, "step": 6180 }, { "epoch": 2.133011716057891, "grad_norm": 2.847642421722412, "learning_rate": 1.1678894226375394e-08, "logits/chosen": -2.832021474838257, "logits/rejected": -2.8195154666900635, "logps/chosen": -70.27241516113281, "logps/rejected": -78.251953125, "loss": 0.6668, "rewards/accuracies": 0.6875, "rewards/chosen": -0.18781164288520813, "rewards/margins": 0.059385575354099274, "rewards/rejected": -0.247197225689888, "step": 6190 }, { "epoch": 2.1364576154376294, "grad_norm": 3.077493906021118, "learning_rate": 1.1594175195124398e-08, "logits/chosen": -2.9076902866363525, "logits/rejected": -2.9005513191223145, "logps/chosen": -73.25450897216797, "logps/rejected": -81.53761291503906, "loss": 0.6629, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.19050666689872742, "rewards/margins": 0.06864413619041443, "rewards/rejected": -0.25915080308914185, "step": 6200 }, { "epoch": 2.1364576154376294, "eval_logits/chosen": -2.9433274269104004, "eval_logits/rejected": -2.937446355819702, "eval_logps/chosen": -72.43737030029297, "eval_logps/rejected": -80.02812957763672, "eval_loss": 0.6795607209205627, "eval_rewards/accuracies": 0.6085036993026733, "eval_rewards/chosen": -0.1372547298669815, "eval_rewards/margins": 0.031225377693772316, "eval_rewards/rejected": -0.16848011314868927, "eval_runtime": 384.6608, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 6200 }, { "epoch": 2.1399035148173673, "grad_norm": 2.836514949798584, "learning_rate": 1.15096716973337e-08, "logits/chosen": -2.7542121410369873, "logits/rejected": -2.724360704421997, "logps/chosen": -72.6585693359375, "logps/rejected": -76.76480102539062, "loss": 0.6652, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1799730360507965, "rewards/margins": 0.06262731552124023, "rewards/rejected": -0.24260032176971436, "step": 6210 }, { "epoch": 2.1433494141971057, "grad_norm": 2.7303783893585205, "learning_rate": 1.1425385091616563e-08, "logits/chosen": -2.801254987716675, "logits/rejected": -2.786839246749878, "logps/chosen": -73.23737335205078, "logps/rejected": -79.03593444824219, "loss": 0.6659, "rewards/accuracies": 0.671875, "rewards/chosen": -0.191168874502182, "rewards/margins": 0.06283432245254517, "rewards/rejected": -0.2540031969547272, "step": 6220 }, { "epoch": 2.1467953135768436, "grad_norm": 2.853400468826294, "learning_rate": 1.1341316733099131e-08, "logits/chosen": -2.859867572784424, "logits/rejected": -2.8382954597473145, "logps/chosen": -71.71000671386719, "logps/rejected": -77.24999237060547, "loss": 0.6654, "rewards/accuracies": 0.671875, "rewards/chosen": -0.19146578013896942, "rewards/margins": 0.0619155652821064, "rewards/rejected": -0.2533813416957855, "step": 6230 }, { "epoch": 2.1502412129565815, "grad_norm": 2.8453168869018555, "learning_rate": 1.1257467973398674e-08, "logits/chosen": -2.918832302093506, "logits/rejected": -2.9026529788970947, "logps/chosen": -74.5626220703125, "logps/rejected": -81.85762023925781, "loss": 0.6623, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.19708040356636047, "rewards/margins": 0.06956573575735092, "rewards/rejected": -0.266646146774292, "step": 6240 }, { "epoch": 2.15368711233632, "grad_norm": 2.9064645767211914, "learning_rate": 1.1173840160601828e-08, "logits/chosen": -2.848163604736328, "logits/rejected": -2.8247714042663574, "logps/chosen": -73.07435607910156, "logps/rejected": -79.97650146484375, "loss": 0.6604, "rewards/accuracies": 0.6875, "rewards/chosen": -0.19149038195610046, "rewards/margins": 0.0730765089392662, "rewards/rejected": -0.26456689834594727, "step": 6250 }, { "epoch": 2.157133011716058, "grad_norm": 2.908900260925293, "learning_rate": 1.1090434639242935e-08, "logits/chosen": -2.829350233078003, "logits/rejected": -2.808617353439331, "logps/chosen": -72.5802993774414, "logps/rejected": -79.20438385009766, "loss": 0.6649, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.19321544468402863, "rewards/margins": 0.06296241283416748, "rewards/rejected": -0.2561778426170349, "step": 6260 }, { "epoch": 2.160578911095796, "grad_norm": 2.90704083442688, "learning_rate": 1.100725275028243e-08, "logits/chosen": -2.8384509086608887, "logits/rejected": -2.8228604793548584, "logps/chosen": -73.39671325683594, "logps/rejected": -80.55579376220703, "loss": 0.6642, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.1932399868965149, "rewards/margins": 0.06481561809778214, "rewards/rejected": -0.2580556273460388, "step": 6270 }, { "epoch": 2.164024810475534, "grad_norm": 2.853450059890747, "learning_rate": 1.0924295831085273e-08, "logits/chosen": -2.9191622734069824, "logits/rejected": -2.892040967941284, "logps/chosen": -73.12138366699219, "logps/rejected": -75.91929626464844, "loss": 0.6663, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.18977297842502594, "rewards/margins": 0.06074298545718193, "rewards/rejected": -0.25051596760749817, "step": 6280 }, { "epoch": 2.167470709855272, "grad_norm": 2.7717442512512207, "learning_rate": 1.0841565215399453e-08, "logits/chosen": -2.8475899696350098, "logits/rejected": -2.8125226497650146, "logps/chosen": -74.5064697265625, "logps/rejected": -77.3388900756836, "loss": 0.6585, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.18283092975616455, "rewards/margins": 0.07614856213331223, "rewards/rejected": -0.2589794993400574, "step": 6290 }, { "epoch": 2.1709166092350105, "grad_norm": 2.930222988128662, "learning_rate": 1.075906223333454e-08, "logits/chosen": -2.817661762237549, "logits/rejected": -2.7878212928771973, "logps/chosen": -76.17271423339844, "logps/rejected": -78.36231994628906, "loss": 0.6672, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.1928362399339676, "rewards/margins": 0.05870527774095535, "rewards/rejected": -0.25154152512550354, "step": 6300 }, { "epoch": 2.1709166092350105, "eval_logits/chosen": -2.940539598464966, "eval_logits/rejected": -2.934702157974243, "eval_logps/chosen": -72.63883209228516, "eval_logps/rejected": -80.2660903930664, "eval_loss": 0.679415762424469, "eval_rewards/accuracies": 0.6075743436813354, "eval_rewards/chosen": -0.13926927745342255, "eval_rewards/margins": 0.03159034997224808, "eval_rewards/rejected": -0.17085963487625122, "eval_runtime": 384.9468, "eval_samples_per_second": 11.181, "eval_steps_per_second": 1.398, "step": 6300 }, { "epoch": 2.1743625086147484, "grad_norm": 3.022702217102051, "learning_rate": 1.067678821134031e-08, "logits/chosen": -2.9255900382995605, "logits/rejected": -2.912877321243286, "logps/chosen": -74.55247497558594, "logps/rejected": -79.90052032470703, "loss": 0.6666, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.19838905334472656, "rewards/margins": 0.05970954895019531, "rewards/rejected": -0.2580986022949219, "step": 6310 }, { "epoch": 2.1778084079944864, "grad_norm": 3.1013293266296387, "learning_rate": 1.0594744472185376e-08, "logits/chosen": -2.88043212890625, "logits/rejected": -2.868788957595825, "logps/chosen": -70.76589965820312, "logps/rejected": -78.20405578613281, "loss": 0.6628, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.18525739014148712, "rewards/margins": 0.06922597438097, "rewards/rejected": -0.2544833719730377, "step": 6320 }, { "epoch": 2.1812543073742248, "grad_norm": 2.913210153579712, "learning_rate": 1.0512932334936014e-08, "logits/chosen": -2.7977688312530518, "logits/rejected": -2.7784111499786377, "logps/chosen": -71.53343963623047, "logps/rejected": -76.45532989501953, "loss": 0.6714, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.18970337510108948, "rewards/margins": 0.050395965576171875, "rewards/rejected": -0.24009934067726135, "step": 6330 }, { "epoch": 2.1847002067539627, "grad_norm": 2.755401372909546, "learning_rate": 1.0431353114934858e-08, "logits/chosen": -2.9117612838745117, "logits/rejected": -2.881978988647461, "logps/chosen": -70.91378021240234, "logps/rejected": -77.30023193359375, "loss": 0.6611, "rewards/accuracies": 0.6875, "rewards/chosen": -0.18551412224769592, "rewards/margins": 0.07095842063426971, "rewards/rejected": -0.25647255778312683, "step": 6340 }, { "epoch": 2.188146106133701, "grad_norm": 2.8766865730285645, "learning_rate": 1.0350008123779796e-08, "logits/chosen": -2.811021327972412, "logits/rejected": -2.793273448944092, "logps/chosen": -73.39643096923828, "logps/rejected": -78.3592300415039, "loss": 0.6692, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.19075675308704376, "rewards/margins": 0.055299412459135056, "rewards/rejected": -0.24605616927146912, "step": 6350 }, { "epoch": 2.191592005513439, "grad_norm": 2.969682455062866, "learning_rate": 1.02688986693029e-08, "logits/chosen": -2.899294376373291, "logits/rejected": -2.87791109085083, "logps/chosen": -74.26057434082031, "logps/rejected": -80.1431884765625, "loss": 0.6643, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1914677917957306, "rewards/margins": 0.06429991871118546, "rewards/rejected": -0.25576773285865784, "step": 6360 }, { "epoch": 2.195037904893177, "grad_norm": 2.9559319019317627, "learning_rate": 1.018802605554933e-08, "logits/chosen": -2.8084044456481934, "logits/rejected": -2.7824862003326416, "logps/chosen": -71.33440399169922, "logps/rejected": -77.79242706298828, "loss": 0.6632, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.1981213241815567, "rewards/margins": 0.06635818630456924, "rewards/rejected": -0.26447948813438416, "step": 6370 }, { "epoch": 2.1984838042729153, "grad_norm": 2.9525883197784424, "learning_rate": 1.0107391582756492e-08, "logits/chosen": -2.806623697280884, "logits/rejected": -2.786754608154297, "logps/chosen": -70.97440338134766, "logps/rejected": -77.8625259399414, "loss": 0.6668, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.18143963813781738, "rewards/margins": 0.060033511370420456, "rewards/rejected": -0.24147315323352814, "step": 6380 }, { "epoch": 2.2019297036526533, "grad_norm": 3.1429362297058105, "learning_rate": 1.0026996547332969e-08, "logits/chosen": -2.8720028400421143, "logits/rejected": -2.8449950218200684, "logps/chosen": -75.2216567993164, "logps/rejected": -78.14983367919922, "loss": 0.6724, "rewards/accuracies": 0.640625, "rewards/chosen": -0.20274245738983154, "rewards/margins": 0.04755076393485069, "rewards/rejected": -0.25029319524765015, "step": 6390 }, { "epoch": 2.205375603032391, "grad_norm": 2.823739528656006, "learning_rate": 9.946842241837853e-09, "logits/chosen": -2.827728509902954, "logits/rejected": -2.8105125427246094, "logps/chosen": -75.67320251464844, "logps/rejected": -80.54769134521484, "loss": 0.6687, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2044738084077835, "rewards/margins": 0.05544687435030937, "rewards/rejected": -0.2599206864833832, "step": 6400 }, { "epoch": 2.205375603032391, "eval_logits/chosen": -2.937972068786621, "eval_logits/rejected": -2.932161569595337, "eval_logps/chosen": -72.72406768798828, "eval_logps/rejected": -80.36534118652344, "eval_loss": 0.6793599724769592, "eval_rewards/accuracies": 0.6085036993026733, "eval_rewards/chosen": -0.14012175798416138, "eval_rewards/margins": 0.03173041716217995, "eval_rewards/rejected": -0.17185218632221222, "eval_runtime": 384.6828, "eval_samples_per_second": 11.188, "eval_steps_per_second": 1.399, "step": 6400 }, { "epoch": 2.2088215024121296, "grad_norm": 3.143953561782837, "learning_rate": 9.866929954959796e-09, "logits/chosen": -2.8348395824432373, "logits/rejected": -2.825678586959839, "logps/chosen": -71.58697509765625, "logps/rejected": -79.75975036621094, "loss": 0.6711, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.1976335346698761, "rewards/margins": 0.05073421448469162, "rewards/rejected": -0.2483677864074707, "step": 6410 }, { "epoch": 2.2122674017918675, "grad_norm": 3.0812461376190186, "learning_rate": 9.787260971496442e-09, "logits/chosen": -2.8326029777526855, "logits/rejected": -2.81501841545105, "logps/chosen": -73.20677185058594, "logps/rejected": -79.30074310302734, "loss": 0.6692, "rewards/accuracies": 0.625, "rewards/chosen": -0.203878253698349, "rewards/margins": 0.05465240031480789, "rewards/rejected": -0.2585306465625763, "step": 6420 }, { "epoch": 2.215713301171606, "grad_norm": 3.0068674087524414, "learning_rate": 9.707836572333663e-09, "logits/chosen": -2.8269410133361816, "logits/rejected": -2.804957866668701, "logps/chosen": -74.88557434082031, "logps/rejected": -78.3868408203125, "loss": 0.6631, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.20228293538093567, "rewards/margins": 0.06738170236349106, "rewards/rejected": -0.2696646749973297, "step": 6430 }, { "epoch": 2.219159200551344, "grad_norm": 2.698864698410034, "learning_rate": 9.62865803442499e-09, "logits/chosen": -2.848231077194214, "logits/rejected": -2.832853078842163, "logps/chosen": -72.02949523925781, "logps/rejected": -78.27039337158203, "loss": 0.6671, "rewards/accuracies": 0.640625, "rewards/chosen": -0.19244711101055145, "rewards/margins": 0.058957718312740326, "rewards/rejected": -0.25140485167503357, "step": 6440 }, { "epoch": 2.222605099931082, "grad_norm": 2.9465391635894775, "learning_rate": 9.549726630771149e-09, "logits/chosen": -2.870114326477051, "logits/rejected": -2.847660541534424, "logps/chosen": -75.42669677734375, "logps/rejected": -80.46903991699219, "loss": 0.6657, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20035307109355927, "rewards/margins": 0.062261391431093216, "rewards/rejected": -0.26261448860168457, "step": 6450 }, { "epoch": 2.22605099931082, "grad_norm": 2.8095502853393555, "learning_rate": 9.471043630399466e-09, "logits/chosen": -2.877028465270996, "logits/rejected": -2.864351511001587, "logps/chosen": -70.01892852783203, "logps/rejected": -80.4472427368164, "loss": 0.6581, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.18370530009269714, "rewards/margins": 0.07779251784086227, "rewards/rejected": -0.26149782538414, "step": 6460 }, { "epoch": 2.229496898690558, "grad_norm": 3.235055685043335, "learning_rate": 9.392610298343622e-09, "logits/chosen": -2.8332557678222656, "logits/rejected": -2.8132190704345703, "logps/chosen": -69.69593811035156, "logps/rejected": -75.2501220703125, "loss": 0.6697, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.18931730091571808, "rewards/margins": 0.05274263024330139, "rewards/rejected": -0.24205991625785828, "step": 6470 }, { "epoch": 2.2329427980702965, "grad_norm": 2.873202323913574, "learning_rate": 9.314427895623161e-09, "logits/chosen": -2.7937698364257812, "logits/rejected": -2.769622325897217, "logps/chosen": -75.4273681640625, "logps/rejected": -79.83731079101562, "loss": 0.6607, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1910162717103958, "rewards/margins": 0.07164834439754486, "rewards/rejected": -0.2626646161079407, "step": 6480 }, { "epoch": 2.2363886974500344, "grad_norm": 3.053175449371338, "learning_rate": 9.236497679223323e-09, "logits/chosen": -2.764616012573242, "logits/rejected": -2.7418341636657715, "logps/chosen": -71.33344268798828, "logps/rejected": -78.18122100830078, "loss": 0.659, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.19584837555885315, "rewards/margins": 0.07489541172981262, "rewards/rejected": -0.2707437574863434, "step": 6490 }, { "epoch": 2.2398345968297724, "grad_norm": 3.2312283515930176, "learning_rate": 9.158820902074788e-09, "logits/chosen": -2.8689606189727783, "logits/rejected": -2.8464503288269043, "logps/chosen": -76.46820068359375, "logps/rejected": -78.84970092773438, "loss": 0.6662, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.20383891463279724, "rewards/margins": 0.06056356430053711, "rewards/rejected": -0.26440250873565674, "step": 6500 }, { "epoch": 2.2398345968297724, "eval_logits/chosen": -2.936417818069458, "eval_logits/rejected": -2.930582284927368, "eval_logps/chosen": -72.8570327758789, "eval_logps/rejected": -80.52567291259766, "eval_loss": 0.6792523860931396, "eval_rewards/accuracies": 0.6087360382080078, "eval_rewards/chosen": -0.14145129919052124, "eval_rewards/margins": 0.03200420364737511, "eval_rewards/rejected": -0.17345550656318665, "eval_runtime": 384.8443, "eval_samples_per_second": 11.184, "eval_steps_per_second": 1.398, "step": 6500 }, { "epoch": 2.2432804962095108, "grad_norm": 2.813805341720581, "learning_rate": 9.081398813033536e-09, "logits/chosen": -2.7846837043762207, "logits/rejected": -2.7650232315063477, "logps/chosen": -71.4330825805664, "logps/rejected": -76.6882553100586, "loss": 0.6646, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2042521983385086, "rewards/margins": 0.06442488729953766, "rewards/rejected": -0.2686770558357239, "step": 6510 }, { "epoch": 2.2467263955892487, "grad_norm": 3.2393975257873535, "learning_rate": 9.004232656860804e-09, "logits/chosen": -2.907459020614624, "logits/rejected": -2.8817758560180664, "logps/chosen": -73.28174591064453, "logps/rejected": -79.92520904541016, "loss": 0.6638, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.19188448786735535, "rewards/margins": 0.066767618060112, "rewards/rejected": -0.25865209102630615, "step": 6520 }, { "epoch": 2.250172294968987, "grad_norm": 3.052621841430664, "learning_rate": 8.927323674202996e-09, "logits/chosen": -2.882232666015625, "logits/rejected": -2.8775315284729004, "logps/chosen": -71.92259979248047, "logps/rejected": -78.81254577636719, "loss": 0.6719, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.19625094532966614, "rewards/margins": 0.04872672259807587, "rewards/rejected": -0.244977667927742, "step": 6530 }, { "epoch": 2.253618194348725, "grad_norm": 2.835733413696289, "learning_rate": 8.850673101571815e-09, "logits/chosen": -2.7827701568603516, "logits/rejected": -2.7627553939819336, "logps/chosen": -73.01153564453125, "logps/rejected": -76.7647933959961, "loss": 0.67, "rewards/accuracies": 0.625, "rewards/chosen": -0.18928982317447662, "rewards/margins": 0.05259045958518982, "rewards/rejected": -0.24188026785850525, "step": 6540 }, { "epoch": 2.257064093728463, "grad_norm": 2.7526662349700928, "learning_rate": 8.774282171324345e-09, "logits/chosen": -2.919456958770752, "logits/rejected": -2.887068271636963, "logps/chosen": -73.70796203613281, "logps/rejected": -77.02224731445312, "loss": 0.6658, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.19566510617733002, "rewards/margins": 0.060904134064912796, "rewards/rejected": -0.2565692365169525, "step": 6550 }, { "epoch": 2.2605099931082013, "grad_norm": 3.0060200691223145, "learning_rate": 8.698152111643242e-09, "logits/chosen": -2.8509039878845215, "logits/rejected": -2.8322460651397705, "logps/chosen": -74.21089172363281, "logps/rejected": -78.41842651367188, "loss": 0.6663, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.2004033625125885, "rewards/margins": 0.06080017611384392, "rewards/rejected": -0.2612035274505615, "step": 6560 }, { "epoch": 2.2639558924879393, "grad_norm": 2.967320680618286, "learning_rate": 8.622284146516995e-09, "logits/chosen": -2.8543925285339355, "logits/rejected": -2.8362247943878174, "logps/chosen": -72.62348175048828, "logps/rejected": -82.3823471069336, "loss": 0.6608, "rewards/accuracies": 0.671875, "rewards/chosen": -0.18806661665439606, "rewards/margins": 0.07189898937940598, "rewards/rejected": -0.25996559858322144, "step": 6570 }, { "epoch": 2.2674017918676777, "grad_norm": 3.081692695617676, "learning_rate": 8.546679495720233e-09, "logits/chosen": -2.7989463806152344, "logits/rejected": -2.771743059158325, "logps/chosen": -75.41520690917969, "logps/rejected": -80.87393951416016, "loss": 0.6595, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.19556981325149536, "rewards/margins": 0.0760122612118721, "rewards/rejected": -0.27158206701278687, "step": 6580 }, { "epoch": 2.2708476912474156, "grad_norm": 2.7911300659179688, "learning_rate": 8.471339374794132e-09, "logits/chosen": -2.8368725776672363, "logits/rejected": -2.820103883743286, "logps/chosen": -73.36140441894531, "logps/rejected": -77.35289001464844, "loss": 0.6677, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1978413462638855, "rewards/margins": 0.05708049610257149, "rewards/rejected": -0.2549218535423279, "step": 6590 }, { "epoch": 2.2742935906271535, "grad_norm": 2.932063102722168, "learning_rate": 8.396264995026857e-09, "logits/chosen": -2.837392807006836, "logits/rejected": -2.8130462169647217, "logps/chosen": -74.26287841796875, "logps/rejected": -76.72357177734375, "loss": 0.6701, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.18978814780712128, "rewards/margins": 0.05212768167257309, "rewards/rejected": -0.24191585183143616, "step": 6600 }, { "epoch": 2.2742935906271535, "eval_logits/chosen": -2.9345366954803467, "eval_logits/rejected": -2.9286646842956543, "eval_logps/chosen": -72.9458236694336, "eval_logps/rejected": -80.62232208251953, "eval_loss": 0.6792166829109192, "eval_rewards/accuracies": 0.6096654534339905, "eval_rewards/chosen": -0.14233915507793427, "eval_rewards/margins": 0.032082799822092056, "eval_rewards/rejected": -0.17442195117473602, "eval_runtime": 384.6471, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 6600 }, { "epoch": 2.277739490006892, "grad_norm": 3.14896821975708, "learning_rate": 8.321457563434101e-09, "logits/chosen": -2.8635406494140625, "logits/rejected": -2.851743221282959, "logps/chosen": -72.97914123535156, "logps/rejected": -78.85169982910156, "loss": 0.6721, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1941734403371811, "rewards/margins": 0.04868479073047638, "rewards/rejected": -0.24285821616649628, "step": 6610 }, { "epoch": 2.28118538938663, "grad_norm": 2.881791830062866, "learning_rate": 8.246918282739662e-09, "logits/chosen": -2.83842396736145, "logits/rejected": -2.8174965381622314, "logps/chosen": -69.40675354003906, "logps/rejected": -78.28099060058594, "loss": 0.6619, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.18288998305797577, "rewards/margins": 0.069371297955513, "rewards/rejected": -0.25226131081581116, "step": 6620 }, { "epoch": 2.2846312887663682, "grad_norm": 2.7638931274414062, "learning_rate": 8.17264835135612e-09, "logits/chosen": -2.877408027648926, "logits/rejected": -2.8723177909851074, "logps/chosen": -70.49653625488281, "logps/rejected": -80.97590637207031, "loss": 0.6661, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.20640122890472412, "rewards/margins": 0.06047845631837845, "rewards/rejected": -0.26687973737716675, "step": 6630 }, { "epoch": 2.288077188146106, "grad_norm": 2.9644789695739746, "learning_rate": 8.09864896336557e-09, "logits/chosen": -2.7981972694396973, "logits/rejected": -2.7807703018188477, "logps/chosen": -72.94361114501953, "logps/rejected": -78.8845443725586, "loss": 0.6622, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1882120668888092, "rewards/margins": 0.07037904113531113, "rewards/rejected": -0.2585911154747009, "step": 6640 }, { "epoch": 2.291523087525844, "grad_norm": 2.9004316329956055, "learning_rate": 8.024921308500413e-09, "logits/chosen": -2.8424270153045654, "logits/rejected": -2.8245530128479004, "logps/chosen": -72.79402923583984, "logps/rejected": -78.49622344970703, "loss": 0.6669, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2029307633638382, "rewards/margins": 0.05925045162439346, "rewards/rejected": -0.26218122243881226, "step": 6650 }, { "epoch": 2.2949689869055825, "grad_norm": 2.8440451622009277, "learning_rate": 7.95146657212423e-09, "logits/chosen": -2.86430287361145, "logits/rejected": -2.836003541946411, "logps/chosen": -77.50093078613281, "logps/rejected": -76.80472564697266, "loss": 0.6726, "rewards/accuracies": 0.59375, "rewards/chosen": -0.20996472239494324, "rewards/margins": 0.04897299408912659, "rewards/rejected": -0.2589377164840698, "step": 6660 }, { "epoch": 2.2984148862853204, "grad_norm": 3.2602100372314453, "learning_rate": 7.878285935212742e-09, "logits/chosen": -2.8455986976623535, "logits/rejected": -2.8280088901519775, "logps/chosen": -75.61815643310547, "logps/rejected": -77.79137420654297, "loss": 0.6696, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.2027847021818161, "rewards/margins": 0.05452219769358635, "rewards/rejected": -0.25730693340301514, "step": 6670 }, { "epoch": 2.301860785665059, "grad_norm": 2.9945316314697266, "learning_rate": 7.805380574334794e-09, "logits/chosen": -2.8512001037597656, "logits/rejected": -2.823992967605591, "logps/chosen": -77.82210540771484, "logps/rejected": -78.67521667480469, "loss": 0.6699, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.20708855986595154, "rewards/margins": 0.053487490862607956, "rewards/rejected": -0.260576069355011, "step": 6680 }, { "epoch": 2.3053066850447967, "grad_norm": 2.9436490535736084, "learning_rate": 7.732751661633466e-09, "logits/chosen": -2.7900009155273438, "logits/rejected": -2.759897470474243, "logps/chosen": -75.78814697265625, "logps/rejected": -79.66416931152344, "loss": 0.6657, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2022581547498703, "rewards/margins": 0.06291095912456512, "rewards/rejected": -0.2651691436767578, "step": 6690 }, { "epoch": 2.3087525844245347, "grad_norm": 2.8492109775543213, "learning_rate": 7.66040036480721e-09, "logits/chosen": -2.7683489322662354, "logits/rejected": -2.737278699874878, "logps/chosen": -74.63603210449219, "logps/rejected": -77.3808822631836, "loss": 0.6592, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.18580102920532227, "rewards/margins": 0.07513437420129776, "rewards/rejected": -0.26093539595603943, "step": 6700 }, { "epoch": 2.3087525844245347, "eval_logits/chosen": -2.933316469192505, "eval_logits/rejected": -2.9274494647979736, "eval_logps/chosen": -73.00688171386719, "eval_logps/rejected": -80.70835876464844, "eval_loss": 0.6791099309921265, "eval_rewards/accuracies": 0.6075743436813354, "eval_rewards/chosen": -0.1429498940706253, "eval_rewards/margins": 0.03233249858021736, "eval_rewards/rejected": -0.17528240382671356, "eval_runtime": 384.8738, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 6700 }, { "epoch": 2.312198483804273, "grad_norm": 3.2617263793945312, "learning_rate": 7.588327847091078e-09, "logits/chosen": -2.829902172088623, "logits/rejected": -2.817091941833496, "logps/chosen": -75.34056091308594, "logps/rejected": -81.42596435546875, "loss": 0.6673, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.21541886031627655, "rewards/margins": 0.06066742539405823, "rewards/rejected": -0.2760862708091736, "step": 6710 }, { "epoch": 2.315644383184011, "grad_norm": 2.6608829498291016, "learning_rate": 7.516535267238027e-09, "logits/chosen": -2.844090461730957, "logits/rejected": -2.824551820755005, "logps/chosen": -72.43873596191406, "logps/rejected": -77.8552017211914, "loss": 0.6719, "rewards/accuracies": 0.640625, "rewards/chosen": -0.18574808537960052, "rewards/margins": 0.048894040286540985, "rewards/rejected": -0.2346421480178833, "step": 6720 }, { "epoch": 2.3190902825637494, "grad_norm": 3.0138895511627197, "learning_rate": 7.44502377950029e-09, "logits/chosen": -2.8302483558654785, "logits/rejected": -2.8032493591308594, "logps/chosen": -70.27099609375, "logps/rejected": -76.91096496582031, "loss": 0.6606, "rewards/accuracies": 0.671875, "rewards/chosen": -0.18054182827472687, "rewards/margins": 0.0722096711397171, "rewards/rejected": -0.25275152921676636, "step": 6730 }, { "epoch": 2.3225361819434873, "grad_norm": 3.3245744705200195, "learning_rate": 7.3737945336108135e-09, "logits/chosen": -2.7983157634735107, "logits/rejected": -2.7656636238098145, "logps/chosen": -73.24220275878906, "logps/rejected": -77.26773834228516, "loss": 0.6619, "rewards/accuracies": 0.671875, "rewards/chosen": -0.19299200177192688, "rewards/margins": 0.06960491091012955, "rewards/rejected": -0.26259690523147583, "step": 6740 }, { "epoch": 2.3259820813232253, "grad_norm": 2.892019510269165, "learning_rate": 7.302848674764747e-09, "logits/chosen": -2.8497538566589355, "logits/rejected": -2.82230806350708, "logps/chosen": -75.72679138183594, "logps/rejected": -81.4393081665039, "loss": 0.6588, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.20929181575775146, "rewards/margins": 0.0777427926659584, "rewards/rejected": -0.2870346009731293, "step": 6750 }, { "epoch": 2.3294279807029636, "grad_norm": 2.9910216331481934, "learning_rate": 7.232187343601112e-09, "logits/chosen": -2.7459378242492676, "logits/rejected": -2.739776611328125, "logps/chosen": -70.7459945678711, "logps/rejected": -78.71460723876953, "loss": 0.6701, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.20652897655963898, "rewards/margins": 0.053986918181180954, "rewards/rejected": -0.26051589846611023, "step": 6760 }, { "epoch": 2.3328738800827016, "grad_norm": 2.843125820159912, "learning_rate": 7.1618116761843454e-09, "logits/chosen": -2.794175386428833, "logits/rejected": -2.7684130668640137, "logps/chosen": -76.53829193115234, "logps/rejected": -76.85578155517578, "loss": 0.6735, "rewards/accuracies": 0.65625, "rewards/chosen": -0.21193759143352509, "rewards/margins": 0.04486025124788284, "rewards/rejected": -0.2567978501319885, "step": 6770 }, { "epoch": 2.3363197794624395, "grad_norm": 3.024794578552246, "learning_rate": 7.0917228039861495e-09, "logits/chosen": -2.837920904159546, "logits/rejected": -2.8221476078033447, "logps/chosen": -71.71517181396484, "logps/rejected": -77.46109008789062, "loss": 0.6695, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.2031993865966797, "rewards/margins": 0.05470992997288704, "rewards/rejected": -0.25790929794311523, "step": 6780 }, { "epoch": 2.339765678842178, "grad_norm": 2.9613075256347656, "learning_rate": 7.021921853867224e-09, "logits/chosen": -2.776054620742798, "logits/rejected": -2.7627484798431396, "logps/chosen": -71.4914779663086, "logps/rejected": -78.49641418457031, "loss": 0.6674, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.20226594805717468, "rewards/margins": 0.05967609956860542, "rewards/rejected": -0.2619420289993286, "step": 6790 }, { "epoch": 2.343211578221916, "grad_norm": 3.1597957611083984, "learning_rate": 6.952409948059157e-09, "logits/chosen": -2.7840054035186768, "logits/rejected": -2.775805950164795, "logps/chosen": -76.02079772949219, "logps/rejected": -81.79857635498047, "loss": 0.668, "rewards/accuracies": 0.671875, "rewards/chosen": -0.20447516441345215, "rewards/margins": 0.05776340514421463, "rewards/rejected": -0.2622385621070862, "step": 6800 }, { "epoch": 2.343211578221916, "eval_logits/chosen": -2.9325602054595947, "eval_logits/rejected": -2.9266765117645264, "eval_logps/chosen": -73.11541748046875, "eval_logps/rejected": -80.83464050292969, "eval_loss": 0.6790278553962708, "eval_rewards/accuracies": 0.6080390214920044, "eval_rewards/chosen": -0.1440352201461792, "eval_rewards/margins": 0.03250991180539131, "eval_rewards/rejected": -0.1765451431274414, "eval_runtime": 384.6626, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 6800 }, { "epoch": 2.346657477601654, "grad_norm": 2.9884908199310303, "learning_rate": 6.883188204146445e-09, "logits/chosen": -2.9092586040496826, "logits/rejected": -2.87811017036438, "logps/chosen": -72.9873275756836, "logps/rejected": -77.67689514160156, "loss": 0.6637, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.18514366447925568, "rewards/margins": 0.06694239377975464, "rewards/rejected": -0.2520860433578491, "step": 6810 }, { "epoch": 2.350103376981392, "grad_norm": 3.0769546031951904, "learning_rate": 6.81425773504842e-09, "logits/chosen": -2.7738914489746094, "logits/rejected": -2.745915174484253, "logps/chosen": -75.17833709716797, "logps/rejected": -79.51817321777344, "loss": 0.6629, "rewards/accuracies": 0.65625, "rewards/chosen": -0.203973650932312, "rewards/margins": 0.06809861958026886, "rewards/rejected": -0.2720722556114197, "step": 6820 }, { "epoch": 2.35354927636113, "grad_norm": 2.9340219497680664, "learning_rate": 6.745619649001477e-09, "logits/chosen": -2.8019449710845947, "logits/rejected": -2.778942346572876, "logps/chosen": -73.75199890136719, "logps/rejected": -78.28003692626953, "loss": 0.6705, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.2022874802350998, "rewards/margins": 0.05254838615655899, "rewards/rejected": -0.2548358738422394, "step": 6830 }, { "epoch": 2.3569951757408685, "grad_norm": 2.799448251724243, "learning_rate": 6.677275049541128e-09, "logits/chosen": -2.8605258464813232, "logits/rejected": -2.8355460166931152, "logps/chosen": -74.54423522949219, "logps/rejected": -78.68629455566406, "loss": 0.6664, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.19858643412590027, "rewards/margins": 0.06091517210006714, "rewards/rejected": -0.259501576423645, "step": 6840 }, { "epoch": 2.3604410751206064, "grad_norm": 3.0726187229156494, "learning_rate": 6.609225035484395e-09, "logits/chosen": -2.836742877960205, "logits/rejected": -2.8063669204711914, "logps/chosen": -75.58624267578125, "logps/rejected": -78.98460388183594, "loss": 0.6675, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.19170625507831573, "rewards/margins": 0.05845893546938896, "rewards/rejected": -0.2501651644706726, "step": 6850 }, { "epoch": 2.3638869745003444, "grad_norm": 2.9660751819610596, "learning_rate": 6.541470700912014e-09, "logits/chosen": -2.7775285243988037, "logits/rejected": -2.7445595264434814, "logps/chosen": -72.46418762207031, "logps/rejected": -78.74679565429688, "loss": 0.6614, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.18516193330287933, "rewards/margins": 0.07057294994592667, "rewards/rejected": -0.2557348608970642, "step": 6860 }, { "epoch": 2.3673328738800827, "grad_norm": 2.8617970943450928, "learning_rate": 6.474013135150927e-09, "logits/chosen": -2.9082653522491455, "logits/rejected": -2.882812738418579, "logps/chosen": -74.62200927734375, "logps/rejected": -81.91954803466797, "loss": 0.6587, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.18413953483104706, "rewards/margins": 0.07611306756734848, "rewards/rejected": -0.26025262475013733, "step": 6870 }, { "epoch": 2.3707787732598207, "grad_norm": 3.307215929031372, "learning_rate": 6.406853422756778e-09, "logits/chosen": -2.8913216590881348, "logits/rejected": -2.8640782833099365, "logps/chosen": -75.2558364868164, "logps/rejected": -78.92206573486328, "loss": 0.6656, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.199386328458786, "rewards/margins": 0.06303433328866959, "rewards/rejected": -0.262420654296875, "step": 6880 }, { "epoch": 2.374224672639559, "grad_norm": 3.0470967292785645, "learning_rate": 6.339992643496381e-09, "logits/chosen": -2.94490385055542, "logits/rejected": -2.9193694591522217, "logps/chosen": -73.72622680664062, "logps/rejected": -79.48007202148438, "loss": 0.6605, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18935003876686096, "rewards/margins": 0.07349582016468048, "rewards/rejected": -0.26284584403038025, "step": 6890 }, { "epoch": 2.377670572019297, "grad_norm": 3.120114803314209, "learning_rate": 6.273431872330487e-09, "logits/chosen": -2.88698673248291, "logits/rejected": -2.8568246364593506, "logps/chosen": -77.24230194091797, "logps/rejected": -79.74833679199219, "loss": 0.6637, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.20365147292613983, "rewards/margins": 0.06559376418590546, "rewards/rejected": -0.2692452669143677, "step": 6900 }, { "epoch": 2.377670572019297, "eval_logits/chosen": -2.9310030937194824, "eval_logits/rejected": -2.9251255989074707, "eval_logps/chosen": -73.22893524169922, "eval_logps/rejected": -80.9638671875, "eval_loss": 0.6789751648902893, "eval_rewards/accuracies": 0.6064126491546631, "eval_rewards/chosen": -0.1451704204082489, "eval_rewards/margins": 0.03266707807779312, "eval_rewards/rejected": -0.17783750593662262, "eval_runtime": 384.6532, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 6900 }, { "epoch": 2.381116471399035, "grad_norm": 3.3634748458862305, "learning_rate": 6.207172179396392e-09, "logits/chosen": -2.853071451187134, "logits/rejected": -2.827911853790283, "logps/chosen": -75.31288146972656, "logps/rejected": -79.6338882446289, "loss": 0.6627, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.18924039602279663, "rewards/margins": 0.06921472400426865, "rewards/rejected": -0.2584551274776459, "step": 6910 }, { "epoch": 2.3845623707787733, "grad_norm": 2.8288156986236572, "learning_rate": 6.141214629990799e-09, "logits/chosen": -2.8890914916992188, "logits/rejected": -2.867971181869507, "logps/chosen": -75.27203369140625, "logps/rejected": -78.17911529541016, "loss": 0.6631, "rewards/accuracies": 0.671875, "rewards/chosen": -0.194017693400383, "rewards/margins": 0.06690562516450882, "rewards/rejected": -0.2609233260154724, "step": 6920 }, { "epoch": 2.3880082701585112, "grad_norm": 2.9177701473236084, "learning_rate": 6.075560284552658e-09, "logits/chosen": -2.790680170059204, "logits/rejected": -2.7666659355163574, "logps/chosen": -73.41400146484375, "logps/rejected": -77.69325256347656, "loss": 0.6658, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.2058548629283905, "rewards/margins": 0.062140632420778275, "rewards/rejected": -0.2679955065250397, "step": 6930 }, { "epoch": 2.3914541695382496, "grad_norm": 3.118807315826416, "learning_rate": 6.0102101986461425e-09, "logits/chosen": -2.8359973430633545, "logits/rejected": -2.8073039054870605, "logps/chosen": -76.55292510986328, "logps/rejected": -81.13348388671875, "loss": 0.6649, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.21906523406505585, "rewards/margins": 0.06718254834413528, "rewards/rejected": -0.28624778985977173, "step": 6940 }, { "epoch": 2.3949000689179876, "grad_norm": 3.054338216781616, "learning_rate": 5.945165422943646e-09, "logits/chosen": -2.7907369136810303, "logits/rejected": -2.7695584297180176, "logps/chosen": -75.50694274902344, "logps/rejected": -77.88689422607422, "loss": 0.671, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.20970694720745087, "rewards/margins": 0.05168859288096428, "rewards/rejected": -0.26139554381370544, "step": 6950 }, { "epoch": 2.3983459682977255, "grad_norm": 3.344757080078125, "learning_rate": 5.880427003208924e-09, "logits/chosen": -2.825366497039795, "logits/rejected": -2.7932753562927246, "logps/chosen": -77.42554473876953, "logps/rejected": -78.44639587402344, "loss": 0.6608, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.1897098571062088, "rewards/margins": 0.07129661738872528, "rewards/rejected": -0.2610064744949341, "step": 6960 }, { "epoch": 2.401791867677464, "grad_norm": 2.93522047996521, "learning_rate": 5.815995980280247e-09, "logits/chosen": -2.7872939109802246, "logits/rejected": -2.766758680343628, "logps/chosen": -70.60647583007812, "logps/rejected": -79.06925964355469, "loss": 0.6607, "rewards/accuracies": 0.6875, "rewards/chosen": -0.18260036408901215, "rewards/margins": 0.07249243557453156, "rewards/rejected": -0.2550927996635437, "step": 6970 }, { "epoch": 2.405237767057202, "grad_norm": 2.7519149780273438, "learning_rate": 5.751873390053697e-09, "logits/chosen": -2.7728610038757324, "logits/rejected": -2.756472110748291, "logps/chosen": -73.69827270507812, "logps/rejected": -78.36803436279297, "loss": 0.6679, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.19948041439056396, "rewards/margins": 0.05798675864934921, "rewards/rejected": -0.2574671804904938, "step": 6980 }, { "epoch": 2.40868366643694, "grad_norm": 3.2839996814727783, "learning_rate": 5.688060263466493e-09, "logits/chosen": -2.80692720413208, "logits/rejected": -2.789949893951416, "logps/chosen": -76.3621597290039, "logps/rejected": -80.12962341308594, "loss": 0.6653, "rewards/accuracies": 0.640625, "rewards/chosen": -0.19801384210586548, "rewards/margins": 0.06326314061880112, "rewards/rejected": -0.2612769901752472, "step": 6990 }, { "epoch": 2.412129565816678, "grad_norm": 2.897106647491455, "learning_rate": 5.6245576264804224e-09, "logits/chosen": -2.7816810607910156, "logits/rejected": -2.7585110664367676, "logps/chosen": -74.5242691040039, "logps/rejected": -80.26558685302734, "loss": 0.6645, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20455551147460938, "rewards/margins": 0.064387746155262, "rewards/rejected": -0.26894325017929077, "step": 7000 }, { "epoch": 2.412129565816678, "eval_logits/chosen": -2.930096387863159, "eval_logits/rejected": -2.924251079559326, "eval_logps/chosen": -73.30198669433594, "eval_logps/rejected": -81.05806732177734, "eval_loss": 0.6788771152496338, "eval_rewards/accuracies": 0.6089683771133423, "eval_rewards/chosen": -0.1459009051322937, "eval_rewards/margins": 0.03287860378623009, "eval_rewards/rejected": -0.1787794977426529, "eval_runtime": 384.675, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 7000 }, { "epoch": 2.415575465196416, "grad_norm": 3.1949822902679443, "learning_rate": 5.5613665000653484e-09, "logits/chosen": -2.7867696285247803, "logits/rejected": -2.772585868835449, "logps/chosen": -75.81459045410156, "logps/rejected": -77.5947494506836, "loss": 0.6731, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2085818350315094, "rewards/margins": 0.04690108448266983, "rewards/rejected": -0.25548291206359863, "step": 7010 }, { "epoch": 2.4190213645761545, "grad_norm": 3.2058663368225098, "learning_rate": 5.4984879001827886e-09, "logits/chosen": -2.785252094268799, "logits/rejected": -2.7515876293182373, "logps/chosen": -75.16913604736328, "logps/rejected": -76.70052337646484, "loss": 0.661, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.19872048497200012, "rewards/margins": 0.07230913639068604, "rewards/rejected": -0.27102965116500854, "step": 7020 }, { "epoch": 2.4224672639558924, "grad_norm": 2.8839571475982666, "learning_rate": 5.435922837769583e-09, "logits/chosen": -2.8598036766052246, "logits/rejected": -2.8262715339660645, "logps/chosen": -74.22901153564453, "logps/rejected": -77.08287048339844, "loss": 0.6625, "rewards/accuracies": 0.671875, "rewards/chosen": -0.1911129355430603, "rewards/margins": 0.06827744841575623, "rewards/rejected": -0.2593904137611389, "step": 7030 }, { "epoch": 2.425913163335631, "grad_norm": 3.0537242889404297, "learning_rate": 5.37367231872165e-09, "logits/chosen": -2.7912654876708984, "logits/rejected": -2.7635159492492676, "logps/chosen": -77.57463836669922, "logps/rejected": -81.10298156738281, "loss": 0.6679, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.2095755636692047, "rewards/margins": 0.057374708354473114, "rewards/rejected": -0.2669502794742584, "step": 7040 }, { "epoch": 2.4293590627153687, "grad_norm": 2.9226977825164795, "learning_rate": 5.3117373438778035e-09, "logits/chosen": -2.8336963653564453, "logits/rejected": -2.7984254360198975, "logps/chosen": -76.75789642333984, "logps/rejected": -79.33134460449219, "loss": 0.665, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.19564273953437805, "rewards/margins": 0.06509944051504135, "rewards/rejected": -0.2607421576976776, "step": 7050 }, { "epoch": 2.4328049620951067, "grad_norm": 2.8860878944396973, "learning_rate": 5.250118909003659e-09, "logits/chosen": -2.8313279151916504, "logits/rejected": -2.81327748298645, "logps/chosen": -70.6441650390625, "logps/rejected": -76.98272705078125, "loss": 0.6712, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.19963473081588745, "rewards/margins": 0.05034155771136284, "rewards/rejected": -0.24997630715370178, "step": 7060 }, { "epoch": 2.436250861474845, "grad_norm": 2.9371960163116455, "learning_rate": 5.1888180047756355e-09, "logits/chosen": -2.8849520683288574, "logits/rejected": -2.8640384674072266, "logps/chosen": -74.20401763916016, "logps/rejected": -77.95927429199219, "loss": 0.6689, "rewards/accuracies": 0.640625, "rewards/chosen": -0.20794710516929626, "rewards/margins": 0.05524333193898201, "rewards/rejected": -0.26319044828414917, "step": 7070 }, { "epoch": 2.439696760854583, "grad_norm": 2.9890997409820557, "learning_rate": 5.12783561676502e-09, "logits/chosen": -2.8459792137145996, "logits/rejected": -2.8320841789245605, "logps/chosen": -73.34442901611328, "logps/rejected": -82.1419906616211, "loss": 0.6649, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1971440613269806, "rewards/margins": 0.06351295113563538, "rewards/rejected": -0.26065701246261597, "step": 7080 }, { "epoch": 2.4431426602343214, "grad_norm": 3.065767288208008, "learning_rate": 5.067172725422123e-09, "logits/chosen": -2.777703285217285, "logits/rejected": -2.7452573776245117, "logps/chosen": -77.12316131591797, "logps/rejected": -78.41402435302734, "loss": 0.6703, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.20606210827827454, "rewards/margins": 0.053220026195049286, "rewards/rejected": -0.2592821419239044, "step": 7090 }, { "epoch": 2.4465885596140593, "grad_norm": 2.884192943572998, "learning_rate": 5.006830306060517e-09, "logits/chosen": -2.793834924697876, "logits/rejected": -2.772796154022217, "logps/chosen": -76.0117416381836, "logps/rejected": -81.75495910644531, "loss": 0.6589, "rewards/accuracies": 0.671875, "rewards/chosen": -0.19764237105846405, "rewards/margins": 0.07749302685260773, "rewards/rejected": -0.2751353979110718, "step": 7100 }, { "epoch": 2.4465885596140593, "eval_logits/chosen": -2.9292759895324707, "eval_logits/rejected": -2.923396110534668, "eval_logps/chosen": -73.35259246826172, "eval_logps/rejected": -81.12709045410156, "eval_loss": 0.6787919402122498, "eval_rewards/accuracies": 0.609897792339325, "eval_rewards/chosen": -0.146406888961792, "eval_rewards/margins": 0.033062852919101715, "eval_rewards/rejected": -0.1794697493314743, "eval_runtime": 384.6226, "eval_samples_per_second": 11.19, "eval_steps_per_second": 1.399, "step": 7100 }, { "epoch": 2.4500344589937972, "grad_norm": 2.8249142169952393, "learning_rate": 4.946809328841356e-09, "logits/chosen": -2.8955156803131104, "logits/rejected": -2.877920627593994, "logps/chosen": -76.56686401367188, "logps/rejected": -79.42098236083984, "loss": 0.6734, "rewards/accuracies": 0.59375, "rewards/chosen": -0.21016404032707214, "rewards/margins": 0.0465286485850811, "rewards/rejected": -0.25669270753860474, "step": 7110 }, { "epoch": 2.4534803583735356, "grad_norm": 3.0992178916931152, "learning_rate": 4.887110758757781e-09, "logits/chosen": -2.7646467685699463, "logits/rejected": -2.745619773864746, "logps/chosen": -74.1895751953125, "logps/rejected": -80.88856506347656, "loss": 0.6568, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.19314591586589813, "rewards/margins": 0.0820588618516922, "rewards/rejected": -0.27520477771759033, "step": 7120 }, { "epoch": 2.4569262577532736, "grad_norm": 3.150029420852661, "learning_rate": 4.827735555619375e-09, "logits/chosen": -2.8117587566375732, "logits/rejected": -2.7805752754211426, "logps/chosen": -76.26405334472656, "logps/rejected": -77.24273681640625, "loss": 0.6697, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.20275354385375977, "rewards/margins": 0.05396211892366409, "rewards/rejected": -0.25671568512916565, "step": 7130 }, { "epoch": 2.460372157133012, "grad_norm": 3.0173354148864746, "learning_rate": 4.768684674036799e-09, "logits/chosen": -2.784952163696289, "logits/rejected": -2.7648305892944336, "logps/chosen": -73.42536163330078, "logps/rejected": -79.0429458618164, "loss": 0.6638, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.2039499282836914, "rewards/margins": 0.06605829298496246, "rewards/rejected": -0.2700082063674927, "step": 7140 }, { "epoch": 2.46381805651275, "grad_norm": 3.1032605171203613, "learning_rate": 4.709959063406374e-09, "logits/chosen": -2.8271021842956543, "logits/rejected": -2.807617425918579, "logps/chosen": -76.30509948730469, "logps/rejected": -80.57902526855469, "loss": 0.6612, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.1917501986026764, "rewards/margins": 0.07202035933732986, "rewards/rejected": -0.26377058029174805, "step": 7150 }, { "epoch": 2.467263955892488, "grad_norm": 2.7967400550842285, "learning_rate": 4.651559667894853e-09, "logits/chosen": -2.768265962600708, "logits/rejected": -2.742154121398926, "logps/chosen": -73.25530242919922, "logps/rejected": -78.80184173583984, "loss": 0.6623, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.1812301129102707, "rewards/margins": 0.06913981586694717, "rewards/rejected": -0.25036993622779846, "step": 7160 }, { "epoch": 2.470709855272226, "grad_norm": 3.1149227619171143, "learning_rate": 4.593487426424233e-09, "logits/chosen": -2.883150577545166, "logits/rejected": -2.873790740966797, "logps/chosen": -75.01385498046875, "logps/rejected": -84.44725036621094, "loss": 0.6527, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2007102519273758, "rewards/margins": 0.0915181040763855, "rewards/rejected": -0.2922283709049225, "step": 7170 }, { "epoch": 2.474155754651964, "grad_norm": 3.2805941104888916, "learning_rate": 4.53574327265665e-09, "logits/chosen": -2.827605724334717, "logits/rejected": -2.810629367828369, "logps/chosen": -75.50403594970703, "logps/rejected": -79.51150512695312, "loss": 0.6685, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.20283977687358856, "rewards/margins": 0.05667983740568161, "rewards/rejected": -0.25951963663101196, "step": 7180 }, { "epoch": 2.4776016540317025, "grad_norm": 3.1896822452545166, "learning_rate": 4.478328134979406e-09, "logits/chosen": -2.8776450157165527, "logits/rejected": -2.8499927520751953, "logps/chosen": -74.177001953125, "logps/rejected": -77.23027038574219, "loss": 0.6686, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.2047462910413742, "rewards/margins": 0.05561001971364021, "rewards/rejected": -0.2603563368320465, "step": 7190 }, { "epoch": 2.4810475534114405, "grad_norm": 3.07416033744812, "learning_rate": 4.421242936489972e-09, "logits/chosen": -2.8358047008514404, "logits/rejected": -2.8095009326934814, "logps/chosen": -74.22608947753906, "logps/rejected": -76.84759521484375, "loss": 0.6636, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1980791687965393, "rewards/margins": 0.06637752801179886, "rewards/rejected": -0.26445668935775757, "step": 7200 }, { "epoch": 2.4810475534114405, "eval_logits/chosen": -2.9281628131866455, "eval_logits/rejected": -2.9222700595855713, "eval_logps/chosen": -73.48018646240234, "eval_logps/rejected": -81.27429962158203, "eval_loss": 0.678713858127594, "eval_rewards/accuracies": 0.6087360382080078, "eval_rewards/chosen": -0.14768294990062714, "eval_rewards/margins": 0.03325879946351051, "eval_rewards/rejected": -0.18094174563884735, "eval_runtime": 384.7262, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 7200 }, { "epoch": 2.4844934527911784, "grad_norm": 3.0005810260772705, "learning_rate": 4.364488594981242e-09, "logits/chosen": -2.8784918785095215, "logits/rejected": -2.8603434562683105, "logps/chosen": -71.34646606445312, "logps/rejected": -78.35809326171875, "loss": 0.6672, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1992645263671875, "rewards/margins": 0.058445774018764496, "rewards/rejected": -0.2577102780342102, "step": 7210 }, { "epoch": 2.4879393521709168, "grad_norm": 3.0718331336975098, "learning_rate": 4.3080660229266705e-09, "logits/chosen": -2.8081634044647217, "logits/rejected": -2.7848963737487793, "logps/chosen": -73.06819152832031, "logps/rejected": -79.35301971435547, "loss": 0.662, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.19690005481243134, "rewards/margins": 0.07033663988113403, "rewards/rejected": -0.2672366797924042, "step": 7220 }, { "epoch": 2.4913852515506547, "grad_norm": 3.277806520462036, "learning_rate": 4.25197612746569e-09, "logits/chosen": -2.8041481971740723, "logits/rejected": -2.7863705158233643, "logps/chosen": -75.71656799316406, "logps/rejected": -79.11249542236328, "loss": 0.6695, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.209635928273201, "rewards/margins": 0.05336422473192215, "rewards/rejected": -0.26300013065338135, "step": 7230 }, { "epoch": 2.4948311509303926, "grad_norm": 3.1139039993286133, "learning_rate": 4.196219810389099e-09, "logits/chosen": -2.8967912197113037, "logits/rejected": -2.8768467903137207, "logps/chosen": -75.59066772460938, "logps/rejected": -79.74969482421875, "loss": 0.6587, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.19420555233955383, "rewards/margins": 0.07623039931058884, "rewards/rejected": -0.2704359292984009, "step": 7240 }, { "epoch": 2.498277050310131, "grad_norm": 2.972391128540039, "learning_rate": 4.1407979681245155e-09, "logits/chosen": -2.8111519813537598, "logits/rejected": -2.782468318939209, "logps/chosen": -73.0444107055664, "logps/rejected": -79.65174102783203, "loss": 0.6589, "rewards/accuracies": 0.703125, "rewards/chosen": -0.19300583004951477, "rewards/margins": 0.07582889497280121, "rewards/rejected": -0.2688347101211548, "step": 7250 }, { "epoch": 2.501722949689869, "grad_norm": 3.1058664321899414, "learning_rate": 4.085711491722058e-09, "logits/chosen": -2.8487160205841064, "logits/rejected": -2.821274757385254, "logps/chosen": -75.83345031738281, "logps/rejected": -80.06922912597656, "loss": 0.665, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.20267176628112793, "rewards/margins": 0.0636163204908371, "rewards/rejected": -0.26628807187080383, "step": 7260 }, { "epoch": 2.505168849069607, "grad_norm": 3.104827404022217, "learning_rate": 4.030961266839919e-09, "logits/chosen": -2.834934711456299, "logits/rejected": -2.8135828971862793, "logps/chosen": -76.01072692871094, "logps/rejected": -80.75750732421875, "loss": 0.663, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.20193436741828918, "rewards/margins": 0.06776344031095505, "rewards/rejected": -0.2696978449821472, "step": 7270 }, { "epoch": 2.5086147484493453, "grad_norm": 2.863002300262451, "learning_rate": 3.976548173730221e-09, "logits/chosen": -2.809178113937378, "logits/rejected": -2.7803077697753906, "logps/chosen": -74.27993774414062, "logps/rejected": -80.78609466552734, "loss": 0.6601, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.19415071606636047, "rewards/margins": 0.07396674156188965, "rewards/rejected": -0.2681174874305725, "step": 7280 }, { "epoch": 2.5120606478290832, "grad_norm": 2.8512284755706787, "learning_rate": 3.922473087224776e-09, "logits/chosen": -2.7803237438201904, "logits/rejected": -2.7747929096221924, "logps/chosen": -72.02703857421875, "logps/rejected": -78.68260192871094, "loss": 0.6706, "rewards/accuracies": 0.640625, "rewards/chosen": -0.21125459671020508, "rewards/margins": 0.05147852376103401, "rewards/rejected": -0.26273313164711, "step": 7290 }, { "epoch": 2.5155065472088216, "grad_norm": 2.5867412090301514, "learning_rate": 3.868736876721087e-09, "logits/chosen": -2.7969918251037598, "logits/rejected": -2.774527072906494, "logps/chosen": -74.00245666503906, "logps/rejected": -77.838623046875, "loss": 0.6679, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.20409516990184784, "rewards/margins": 0.05744461342692375, "rewards/rejected": -0.2615398168563843, "step": 7300 }, { "epoch": 2.5155065472088216, "eval_logits/chosen": -2.927854299545288, "eval_logits/rejected": -2.9219651222229004, "eval_logps/chosen": -73.55626678466797, "eval_logps/rejected": -81.34708404541016, "eval_loss": 0.678745448589325, "eval_rewards/accuracies": 0.6101301312446594, "eval_rewards/chosen": -0.14844365417957306, "eval_rewards/margins": 0.0332258939743042, "eval_rewards/rejected": -0.18166953325271606, "eval_runtime": 384.8313, "eval_samples_per_second": 11.184, "eval_steps_per_second": 1.398, "step": 7300 }, { "epoch": 2.5189524465885595, "grad_norm": 2.936593770980835, "learning_rate": 3.815340406168333e-09, "logits/chosen": -2.7792415618896484, "logits/rejected": -2.758457899093628, "logps/chosen": -73.82521057128906, "logps/rejected": -77.83888244628906, "loss": 0.6648, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.20047025382518768, "rewards/margins": 0.0641724094748497, "rewards/rejected": -0.2646426558494568, "step": 7310 }, { "epoch": 2.5223983459682975, "grad_norm": 2.8303260803222656, "learning_rate": 3.762284534053492e-09, "logits/chosen": -2.748037815093994, "logits/rejected": -2.736145496368408, "logps/chosen": -72.69297790527344, "logps/rejected": -79.75902557373047, "loss": 0.6634, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.20238709449768066, "rewards/margins": 0.06648044288158417, "rewards/rejected": -0.268867552280426, "step": 7320 }, { "epoch": 2.525844245348036, "grad_norm": 2.8986549377441406, "learning_rate": 3.7095701133875585e-09, "logits/chosen": -2.8247933387756348, "logits/rejected": -2.7931108474731445, "logps/chosen": -74.48722076416016, "logps/rejected": -78.02268981933594, "loss": 0.6658, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2090875804424286, "rewards/margins": 0.06215309351682663, "rewards/rejected": -0.2712406516075134, "step": 7330 }, { "epoch": 2.529290144727774, "grad_norm": 2.98786997795105, "learning_rate": 3.657197991691774e-09, "logits/chosen": -2.7856554985046387, "logits/rejected": -2.7675652503967285, "logps/chosen": -69.34353637695312, "logps/rejected": -77.05681610107422, "loss": 0.6701, "rewards/accuracies": 0.640625, "rewards/chosen": -0.19962826371192932, "rewards/margins": 0.05234195664525032, "rewards/rejected": -0.25197023153305054, "step": 7340 }, { "epoch": 2.532736044107512, "grad_norm": 3.2573559284210205, "learning_rate": 3.6051690109840494e-09, "logits/chosen": -2.878065586090088, "logits/rejected": -2.864863395690918, "logps/chosen": -73.37214660644531, "logps/rejected": -82.72701263427734, "loss": 0.66, "rewards/accuracies": 0.671875, "rewards/chosen": -0.20315006375312805, "rewards/margins": 0.07522037625312805, "rewards/rejected": -0.2783704400062561, "step": 7350 }, { "epoch": 2.53618194348725, "grad_norm": 2.9621572494506836, "learning_rate": 3.5534840077654227e-09, "logits/chosen": -2.81292724609375, "logits/rejected": -2.790576457977295, "logps/chosen": -75.97547912597656, "logps/rejected": -80.47693634033203, "loss": 0.6641, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2040446251630783, "rewards/margins": 0.06569458544254303, "rewards/rejected": -0.26973921060562134, "step": 7360 }, { "epoch": 2.539627842866988, "grad_norm": 2.7631373405456543, "learning_rate": 3.5021438130065834e-09, "logits/chosen": -2.829509735107422, "logits/rejected": -2.8180649280548096, "logps/chosen": -74.30305480957031, "logps/rejected": -80.3935317993164, "loss": 0.6611, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2016134262084961, "rewards/margins": 0.07378591597080231, "rewards/rejected": -0.2753993570804596, "step": 7370 }, { "epoch": 2.5430737422467264, "grad_norm": 3.048893451690674, "learning_rate": 3.4511492521345443e-09, "logits/chosen": -2.8367865085601807, "logits/rejected": -2.8088583946228027, "logps/chosen": -71.83967590332031, "logps/rejected": -77.95903015136719, "loss": 0.6594, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.19038712978363037, "rewards/margins": 0.07551152259111404, "rewards/rejected": -0.2658986449241638, "step": 7380 }, { "epoch": 2.5465196416264644, "grad_norm": 2.8188202381134033, "learning_rate": 3.4005011450193436e-09, "logits/chosen": -2.8250107765197754, "logits/rejected": -2.814131259918213, "logps/chosen": -72.77637481689453, "logps/rejected": -78.44098663330078, "loss": 0.6677, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2007887065410614, "rewards/margins": 0.057915978133678436, "rewards/rejected": -0.25870469212532043, "step": 7390 }, { "epoch": 2.5499655410062028, "grad_norm": 3.1946489810943604, "learning_rate": 3.3502003059608847e-09, "logits/chosen": -2.8203511238098145, "logits/rejected": -2.8072240352630615, "logps/chosen": -76.36048889160156, "logps/rejected": -82.04139709472656, "loss": 0.6679, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.22048643231391907, "rewards/margins": 0.05740383267402649, "rewards/rejected": -0.27789026498794556, "step": 7400 }, { "epoch": 2.5499655410062028, "eval_logits/chosen": -2.9272968769073486, "eval_logits/rejected": -2.921482563018799, "eval_logps/chosen": -73.6218032836914, "eval_logps/rejected": -81.42633819580078, "eval_loss": 0.6786887049674988, "eval_rewards/accuracies": 0.609433114528656, "eval_rewards/chosen": -0.14909912645816803, "eval_rewards/margins": 0.03336302563548088, "eval_rewards/rejected": -0.18246212601661682, "eval_runtime": 384.9967, "eval_samples_per_second": 11.179, "eval_steps_per_second": 1.397, "step": 7400 }, { "epoch": 2.5534114403859407, "grad_norm": 3.0059900283813477, "learning_rate": 3.300247543675827e-09, "logits/chosen": -2.9239370822906494, "logits/rejected": -2.886399745941162, "logps/chosen": -77.12906646728516, "logps/rejected": -78.69457244873047, "loss": 0.6611, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.21024811267852783, "rewards/margins": 0.07142722606658936, "rewards/rejected": -0.2816753387451172, "step": 7410 }, { "epoch": 2.5568573397656786, "grad_norm": 3.1475119590759277, "learning_rate": 3.2506436612845937e-09, "logits/chosen": -2.8529903888702393, "logits/rejected": -2.8277475833892822, "logps/chosen": -75.9848403930664, "logps/rejected": -81.09859466552734, "loss": 0.6634, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.20545276999473572, "rewards/margins": 0.06720688194036484, "rewards/rejected": -0.27265965938568115, "step": 7420 }, { "epoch": 2.560303239145417, "grad_norm": 2.888164758682251, "learning_rate": 3.2013894562984643e-09, "logits/chosen": -2.7869720458984375, "logits/rejected": -2.766547203063965, "logps/chosen": -74.35009765625, "logps/rejected": -80.58966064453125, "loss": 0.6631, "rewards/accuracies": 0.671875, "rewards/chosen": -0.20564734935760498, "rewards/margins": 0.06839480251073837, "rewards/rejected": -0.27404215931892395, "step": 7430 }, { "epoch": 2.563749138525155, "grad_norm": 2.7628254890441895, "learning_rate": 3.1524857206067346e-09, "logits/chosen": -2.8000998497009277, "logits/rejected": -2.775285005569458, "logps/chosen": -75.18690490722656, "logps/rejected": -80.17982482910156, "loss": 0.6605, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.19598865509033203, "rewards/margins": 0.07316666841506958, "rewards/rejected": -0.269155353307724, "step": 7440 }, { "epoch": 2.5671950379048933, "grad_norm": 3.069775342941284, "learning_rate": 3.1039332404640016e-09, "logits/chosen": -2.722813844680786, "logits/rejected": -2.6986989974975586, "logps/chosen": -76.45578002929688, "logps/rejected": -80.69409942626953, "loss": 0.673, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.21405068039894104, "rewards/margins": 0.048891831189394, "rewards/rejected": -0.26294249296188354, "step": 7450 }, { "epoch": 2.5706409372846313, "grad_norm": 2.9569754600524902, "learning_rate": 3.05573279647752e-09, "logits/chosen": -2.7951464653015137, "logits/rejected": -2.7828805446624756, "logps/chosen": -71.73380279541016, "logps/rejected": -80.99790954589844, "loss": 0.6649, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2004125863313675, "rewards/margins": 0.06363936513662338, "rewards/rejected": -0.2640519142150879, "step": 7460 }, { "epoch": 2.574086836664369, "grad_norm": 3.7347359657287598, "learning_rate": 3.0078851635946424e-09, "logits/chosen": -2.8934829235076904, "logits/rejected": -2.875368595123291, "logps/chosen": -71.91404724121094, "logps/rejected": -78.40968322753906, "loss": 0.6578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.19142022728919983, "rewards/margins": 0.07889686524868011, "rewards/rejected": -0.27031707763671875, "step": 7470 }, { "epoch": 2.5775327360441076, "grad_norm": 3.1747958660125732, "learning_rate": 2.960391111090374e-09, "logits/chosen": -2.8428940773010254, "logits/rejected": -2.8111205101013184, "logps/chosen": -75.44371032714844, "logps/rejected": -79.03288269042969, "loss": 0.6639, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.19604380428791046, "rewards/margins": 0.06772056221961975, "rewards/rejected": -0.2637643814086914, "step": 7480 }, { "epoch": 2.5809786354238455, "grad_norm": 3.1292686462402344, "learning_rate": 2.913251402554978e-09, "logits/chosen": -2.8647892475128174, "logits/rejected": -2.833148956298828, "logps/chosen": -74.54780578613281, "logps/rejected": -79.8340072631836, "loss": 0.6641, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.20531466603279114, "rewards/margins": 0.06498364359140396, "rewards/rejected": -0.2702983021736145, "step": 7490 }, { "epoch": 2.584424534803584, "grad_norm": 3.226863384246826, "learning_rate": 2.8664667958817475e-09, "logits/chosen": -2.873723268508911, "logits/rejected": -2.862431526184082, "logps/chosen": -74.18633270263672, "logps/rejected": -79.91875457763672, "loss": 0.6657, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2045147866010666, "rewards/margins": 0.06179727241396904, "rewards/rejected": -0.2663120627403259, "step": 7500 }, { "epoch": 2.584424534803584, "eval_logits/chosen": -2.9269816875457764, "eval_logits/rejected": -2.921128988265991, "eval_logps/chosen": -73.67274475097656, "eval_logps/rejected": -81.48828125, "eval_loss": 0.6786367297172546, "eval_rewards/accuracies": 0.6080390214920044, "eval_rewards/chosen": -0.1496085673570633, "eval_rewards/margins": 0.033472951501607895, "eval_rewards/rejected": -0.18308153748512268, "eval_runtime": 384.6905, "eval_samples_per_second": 11.188, "eval_steps_per_second": 1.399, "step": 7500 }, { "epoch": 2.587870434183322, "grad_norm": 3.0301599502563477, "learning_rate": 2.820038043254769e-09, "logits/chosen": -2.841358184814453, "logits/rejected": -2.8345835208892822, "logps/chosen": -74.02638244628906, "logps/rejected": -79.9226303100586, "loss": 0.6703, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2135336846113205, "rewards/margins": 0.053957004100084305, "rewards/rejected": -0.2674906849861145, "step": 7510 }, { "epoch": 2.59131633356306, "grad_norm": 3.1768720149993896, "learning_rate": 2.773965891136859e-09, "logits/chosen": -2.804445743560791, "logits/rejected": -2.7838993072509766, "logps/chosen": -74.34261322021484, "logps/rejected": -80.16661071777344, "loss": 0.6666, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.20510026812553406, "rewards/margins": 0.06139761209487915, "rewards/rejected": -0.2664979100227356, "step": 7520 }, { "epoch": 2.594762232942798, "grad_norm": 2.7789816856384277, "learning_rate": 2.7282510802575487e-09, "logits/chosen": -2.8340415954589844, "logits/rejected": -2.806692600250244, "logps/chosen": -74.09795379638672, "logps/rejected": -78.34556579589844, "loss": 0.6632, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1984129399061203, "rewards/margins": 0.0679444894194603, "rewards/rejected": -0.2663574516773224, "step": 7530 }, { "epoch": 2.598208132322536, "grad_norm": 2.9909815788269043, "learning_rate": 2.682894345601186e-09, "logits/chosen": -2.8831825256347656, "logits/rejected": -2.850769519805908, "logps/chosen": -74.82855987548828, "logps/rejected": -78.21894836425781, "loss": 0.6676, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.1973542869091034, "rewards/margins": 0.058047063648700714, "rewards/rejected": -0.2554013431072235, "step": 7540 }, { "epoch": 2.6016540317022745, "grad_norm": 2.7855241298675537, "learning_rate": 2.6378964163951162e-09, "logits/chosen": -2.8237948417663574, "logits/rejected": -2.8027873039245605, "logps/chosen": -73.11653137207031, "logps/rejected": -77.89794158935547, "loss": 0.6684, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.20274615287780762, "rewards/margins": 0.056128352880477905, "rewards/rejected": -0.2588745057582855, "step": 7550 }, { "epoch": 2.6050999310820124, "grad_norm": 3.097872734069824, "learning_rate": 2.5932580160979323e-09, "logits/chosen": -2.8130440711975098, "logits/rejected": -2.7946929931640625, "logps/chosen": -77.16216278076172, "logps/rejected": -80.65824890136719, "loss": 0.6713, "rewards/accuracies": 0.65625, "rewards/chosen": -0.21544364094734192, "rewards/margins": 0.05053433030843735, "rewards/rejected": -0.2659779489040375, "step": 7560 }, { "epoch": 2.6085458304617504, "grad_norm": 3.1403050422668457, "learning_rate": 2.5489798623879094e-09, "logits/chosen": -2.78678297996521, "logits/rejected": -2.763037919998169, "logps/chosen": -75.012939453125, "logps/rejected": -80.80729675292969, "loss": 0.6626, "rewards/accuracies": 0.625, "rewards/chosen": -0.20696601271629333, "rewards/margins": 0.07109804451465607, "rewards/rejected": -0.2780640423297882, "step": 7570 }, { "epoch": 2.6119917298414888, "grad_norm": 3.1326699256896973, "learning_rate": 2.5050626671513725e-09, "logits/chosen": -2.877891778945923, "logits/rejected": -2.856247901916504, "logps/chosen": -73.21952819824219, "logps/rejected": -80.94234466552734, "loss": 0.6591, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.1931820511817932, "rewards/margins": 0.07609923928976059, "rewards/rejected": -0.269281268119812, "step": 7580 }, { "epoch": 2.6154376292212267, "grad_norm": 3.1336936950683594, "learning_rate": 2.461507136471344e-09, "logits/chosen": -2.8177061080932617, "logits/rejected": -2.810600519180298, "logps/chosen": -75.12174987792969, "logps/rejected": -81.65655517578125, "loss": 0.6678, "rewards/accuracies": 0.59375, "rewards/chosen": -0.20702321827411652, "rewards/margins": 0.060899682343006134, "rewards/rejected": -0.26792293787002563, "step": 7590 }, { "epoch": 2.618883528600965, "grad_norm": 3.0580458641052246, "learning_rate": 2.418313970616126e-09, "logits/chosen": -2.8153727054595947, "logits/rejected": -2.7971465587615967, "logps/chosen": -74.92732238769531, "logps/rejected": -80.164306640625, "loss": 0.6638, "rewards/accuracies": 0.671875, "rewards/chosen": -0.20816096663475037, "rewards/margins": 0.06470120698213577, "rewards/rejected": -0.27286219596862793, "step": 7600 }, { "epoch": 2.618883528600965, "eval_logits/chosen": -2.9263126850128174, "eval_logits/rejected": -2.9204790592193604, "eval_logps/chosen": -73.72270202636719, "eval_logps/rejected": -81.5289306640625, "eval_loss": 0.6786933541297913, "eval_rewards/accuracies": 0.6078066825866699, "eval_rewards/chosen": -0.15010805428028107, "eval_rewards/margins": 0.033379942178726196, "eval_rewards/rejected": -0.18348799645900726, "eval_runtime": 385.0338, "eval_samples_per_second": 11.178, "eval_steps_per_second": 1.397, "step": 7600 }, { "epoch": 2.622329427980703, "grad_norm": 3.0515081882476807, "learning_rate": 2.3754838640280635e-09, "logits/chosen": -2.774482488632202, "logits/rejected": -2.7405154705047607, "logps/chosen": -74.89452362060547, "logps/rejected": -78.25789642333984, "loss": 0.6639, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.20920777320861816, "rewards/margins": 0.06651514023542404, "rewards/rejected": -0.2757229208946228, "step": 7610 }, { "epoch": 2.625775327360441, "grad_norm": 3.229125499725342, "learning_rate": 2.3330175053123986e-09, "logits/chosen": -2.7771217823028564, "logits/rejected": -2.7549872398376465, "logps/chosen": -77.79759216308594, "logps/rejected": -81.6274185180664, "loss": 0.6624, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.2079116851091385, "rewards/margins": 0.06936520338058472, "rewards/rejected": -0.2772769033908844, "step": 7620 }, { "epoch": 2.6292212267401793, "grad_norm": 3.316720962524414, "learning_rate": 2.290915577226152e-09, "logits/chosen": -2.8076014518737793, "logits/rejected": -2.7848591804504395, "logps/chosen": -72.82865905761719, "logps/rejected": -78.71394348144531, "loss": 0.6603, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.19348780810832977, "rewards/margins": 0.07338859885931015, "rewards/rejected": -0.2668763995170593, "step": 7630 }, { "epoch": 2.6326671261199173, "grad_norm": 3.231029510498047, "learning_rate": 2.2491787566672167e-09, "logits/chosen": -2.8938686847686768, "logits/rejected": -2.8632521629333496, "logps/chosen": -75.23637390136719, "logps/rejected": -81.21224212646484, "loss": 0.6615, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.21629247069358826, "rewards/margins": 0.07112295180559158, "rewards/rejected": -0.28741541504859924, "step": 7640 }, { "epoch": 2.6361130254996556, "grad_norm": 2.7676174640655518, "learning_rate": 2.2078077146633845e-09, "logits/chosen": -2.7665352821350098, "logits/rejected": -2.7346112728118896, "logps/chosen": -74.17662048339844, "logps/rejected": -79.28196716308594, "loss": 0.6584, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.20812615752220154, "rewards/margins": 0.08001205325126648, "rewards/rejected": -0.288138210773468, "step": 7650 }, { "epoch": 2.6395589248793936, "grad_norm": 3.3405444622039795, "learning_rate": 2.166803116361654e-09, "logits/chosen": -2.8530850410461426, "logits/rejected": -2.833200454711914, "logps/chosen": -73.19547271728516, "logps/rejected": -80.88066101074219, "loss": 0.6618, "rewards/accuracies": 0.671875, "rewards/chosen": -0.20517858862876892, "rewards/margins": 0.07204946875572205, "rewards/rejected": -0.27722805738449097, "step": 7660 }, { "epoch": 2.6430048242591315, "grad_norm": 3.0809807777404785, "learning_rate": 2.126165621017456e-09, "logits/chosen": -2.7972121238708496, "logits/rejected": -2.778132677078247, "logps/chosen": -76.18596649169922, "logps/rejected": -82.34400177001953, "loss": 0.664, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.2082051783800125, "rewards/margins": 0.06539927423000336, "rewards/rejected": -0.27360445261001587, "step": 7670 }, { "epoch": 2.64645072363887, "grad_norm": 3.1275718212127686, "learning_rate": 2.0858958819840955e-09, "logits/chosen": -2.7645487785339355, "logits/rejected": -2.7558534145355225, "logps/chosen": -73.58595275878906, "logps/rejected": -82.28010559082031, "loss": 0.6638, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2056526243686676, "rewards/margins": 0.06633436679840088, "rewards/rejected": -0.27198702096939087, "step": 7680 }, { "epoch": 2.649896623018608, "grad_norm": 2.981877326965332, "learning_rate": 2.0459945467022564e-09, "logits/chosen": -2.863377332687378, "logits/rejected": -2.8364272117614746, "logps/chosen": -76.25320434570312, "logps/rejected": -79.8304214477539, "loss": 0.6632, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.21095123887062073, "rewards/margins": 0.0675932988524437, "rewards/rejected": -0.278544545173645, "step": 7690 }, { "epoch": 2.6533425223983462, "grad_norm": 2.9926531314849854, "learning_rate": 2.006462256689545e-09, "logits/chosen": -2.883608341217041, "logits/rejected": -2.853994846343994, "logps/chosen": -78.49991607666016, "logps/rejected": -81.63729095458984, "loss": 0.6638, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.2101484090089798, "rewards/margins": 0.06828924268484116, "rewards/rejected": -0.27843764424324036, "step": 7700 }, { "epoch": 2.6533425223983462, "eval_logits/chosen": -2.9260611534118652, "eval_logits/rejected": -2.92020320892334, "eval_logps/chosen": -73.70887756347656, "eval_logps/rejected": -81.52108764648438, "eval_loss": 0.6786604523658752, "eval_rewards/accuracies": 0.6105948090553284, "eval_rewards/chosen": -0.14996981620788574, "eval_rewards/margins": 0.03343977779150009, "eval_rewards/rejected": -0.18340960144996643, "eval_runtime": 385.0953, "eval_samples_per_second": 11.176, "eval_steps_per_second": 1.397, "step": 7700 }, { "epoch": 2.656788421778084, "grad_norm": 2.9839422702789307, "learning_rate": 1.9672996475302404e-09, "logits/chosen": -2.8186190128326416, "logits/rejected": -2.7886860370635986, "logps/chosen": -76.7218246459961, "logps/rejected": -80.06423950195312, "loss": 0.6633, "rewards/accuracies": 0.671875, "rewards/chosen": -0.20709805190563202, "rewards/margins": 0.06771091371774673, "rewards/rejected": -0.27480894327163696, "step": 7710 }, { "epoch": 2.660234321157822, "grad_norm": 2.9466066360473633, "learning_rate": 1.9285073488650135e-09, "logits/chosen": -2.942584276199341, "logits/rejected": -2.913252592086792, "logps/chosen": -76.68403625488281, "logps/rejected": -80.53271484375, "loss": 0.6628, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.2035132646560669, "rewards/margins": 0.06678745895624161, "rewards/rejected": -0.2703007459640503, "step": 7720 }, { "epoch": 2.66368022053756, "grad_norm": 3.0535659790039062, "learning_rate": 1.8900859843808403e-09, "logits/chosen": -2.7687828540802, "logits/rejected": -2.7519726753234863, "logps/chosen": -76.04930114746094, "logps/rejected": -81.59408569335938, "loss": 0.6646, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.19750520586967468, "rewards/margins": 0.06474888324737549, "rewards/rejected": -0.26225408911705017, "step": 7730 }, { "epoch": 2.6671261199172984, "grad_norm": 2.8972673416137695, "learning_rate": 1.8520361718009776e-09, "logits/chosen": -2.8628134727478027, "logits/rejected": -2.8364129066467285, "logps/chosen": -78.28489685058594, "logps/rejected": -80.29054260253906, "loss": 0.667, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.20889762043952942, "rewards/margins": 0.06034914776682854, "rewards/rejected": -0.26924675703048706, "step": 7740 }, { "epoch": 2.670572019297037, "grad_norm": 3.006944179534912, "learning_rate": 1.8143585228750035e-09, "logits/chosen": -2.838576078414917, "logits/rejected": -2.8247971534729004, "logps/chosen": -76.03779602050781, "logps/rejected": -79.47288513183594, "loss": 0.6736, "rewards/accuracies": 0.59375, "rewards/chosen": -0.21770253777503967, "rewards/margins": 0.04604431241750717, "rewards/rejected": -0.26374685764312744, "step": 7750 }, { "epoch": 2.6740179186767747, "grad_norm": 3.033519744873047, "learning_rate": 1.7770536433690087e-09, "logits/chosen": -2.7235705852508545, "logits/rejected": -2.695596218109131, "logps/chosen": -76.21092224121094, "logps/rejected": -78.67154693603516, "loss": 0.6606, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.19899126887321472, "rewards/margins": 0.07271669059991837, "rewards/rejected": -0.2717079520225525, "step": 7760 }, { "epoch": 2.6774638180565127, "grad_norm": 3.1475305557250977, "learning_rate": 1.7401221330558436e-09, "logits/chosen": -2.7682201862335205, "logits/rejected": -2.7441859245300293, "logps/chosen": -73.18707275390625, "logps/rejected": -79.66898345947266, "loss": 0.6618, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.20043106377124786, "rewards/margins": 0.0698840469121933, "rewards/rejected": -0.27031511068344116, "step": 7770 }, { "epoch": 2.6809097174362506, "grad_norm": 2.9892404079437256, "learning_rate": 1.703564585705483e-09, "logits/chosen": -2.858635425567627, "logits/rejected": -2.8397774696350098, "logps/chosen": -74.53509521484375, "logps/rejected": -81.00166320800781, "loss": 0.6643, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.21388328075408936, "rewards/margins": 0.06483346968889236, "rewards/rejected": -0.2787167429924011, "step": 7780 }, { "epoch": 2.684355616815989, "grad_norm": 3.001133680343628, "learning_rate": 1.6673815890754672e-09, "logits/chosen": -2.8553643226623535, "logits/rejected": -2.841541051864624, "logps/chosen": -74.88123321533203, "logps/rejected": -81.55390167236328, "loss": 0.6651, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20828452706336975, "rewards/margins": 0.06349913775920868, "rewards/rejected": -0.27178364992141724, "step": 7790 }, { "epoch": 2.687801516195727, "grad_norm": 3.0135772228240967, "learning_rate": 1.6315737249014695e-09, "logits/chosen": -2.821977376937866, "logits/rejected": -2.806702136993408, "logps/chosen": -73.7355728149414, "logps/rejected": -79.48221588134766, "loss": 0.6664, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.20533959567546844, "rewards/margins": 0.06135411188006401, "rewards/rejected": -0.26669368147850037, "step": 7800 }, { "epoch": 2.687801516195727, "eval_logits/chosen": -2.925576686859131, "eval_logits/rejected": -2.9197590351104736, "eval_logps/chosen": -73.74092864990234, "eval_logps/rejected": -81.56622314453125, "eval_loss": 0.6786039471626282, "eval_rewards/accuracies": 0.6089683771133423, "eval_rewards/chosen": -0.1502903550863266, "eval_rewards/margins": 0.033570654690265656, "eval_rewards/rejected": -0.18386100232601166, "eval_runtime": 384.8528, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 7800 }, { "epoch": 2.6912474155754653, "grad_norm": 2.7619662284851074, "learning_rate": 1.5961415688879293e-09, "logits/chosen": -2.843717575073242, "logits/rejected": -2.834810972213745, "logps/chosen": -73.8877182006836, "logps/rejected": -80.44827270507812, "loss": 0.6652, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19360974431037903, "rewards/margins": 0.06272809952497482, "rewards/rejected": -0.25633782148361206, "step": 7810 }, { "epoch": 2.6946933149552033, "grad_norm": 2.8655476570129395, "learning_rate": 1.5610856906988018e-09, "logits/chosen": -2.8215110301971436, "logits/rejected": -2.7916667461395264, "logps/chosen": -74.8424072265625, "logps/rejected": -80.0567626953125, "loss": 0.6588, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.20539546012878418, "rewards/margins": 0.07711796462535858, "rewards/rejected": -0.28251343965530396, "step": 7820 }, { "epoch": 2.698139214334941, "grad_norm": 3.3269481658935547, "learning_rate": 1.5264066539484005e-09, "logits/chosen": -2.876765727996826, "logits/rejected": -2.8655307292938232, "logps/chosen": -72.62867736816406, "logps/rejected": -80.24038696289062, "loss": 0.6627, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.19633261859416962, "rewards/margins": 0.06905725598335266, "rewards/rejected": -0.26538988947868347, "step": 7830 }, { "epoch": 2.7015851137146796, "grad_norm": 3.157557725906372, "learning_rate": 1.4921050161923355e-09, "logits/chosen": -2.7604708671569824, "logits/rejected": -2.734009265899658, "logps/chosen": -76.15579986572266, "logps/rejected": -81.95283508300781, "loss": 0.658, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21348616480827332, "rewards/margins": 0.07732900977134705, "rewards/rejected": -0.29081517457962036, "step": 7840 }, { "epoch": 2.7050310130944175, "grad_norm": 2.924194812774658, "learning_rate": 1.4581813289185368e-09, "logits/chosen": -2.840101718902588, "logits/rejected": -2.8231797218322754, "logps/chosen": -73.50285339355469, "logps/rejected": -81.42240905761719, "loss": 0.6638, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.21257443726062775, "rewards/margins": 0.06771445274353027, "rewards/rejected": -0.2802888751029968, "step": 7850 }, { "epoch": 2.708476912474156, "grad_norm": 2.9922053813934326, "learning_rate": 1.4246361375384153e-09, "logits/chosen": -2.8335814476013184, "logits/rejected": -2.8177263736724854, "logps/chosen": -77.03657531738281, "logps/rejected": -81.59843444824219, "loss": 0.669, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.2188374251127243, "rewards/margins": 0.055102430284023285, "rewards/rejected": -0.273939847946167, "step": 7860 }, { "epoch": 2.711922811853894, "grad_norm": 3.1429929733276367, "learning_rate": 1.391469981378063e-09, "logits/chosen": -2.8928942680358887, "logits/rejected": -2.8670597076416016, "logps/chosen": -73.8523178100586, "logps/rejected": -78.66716003417969, "loss": 0.6663, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.21104225516319275, "rewards/margins": 0.061517082154750824, "rewards/rejected": -0.27255937457084656, "step": 7870 }, { "epoch": 2.7153687112336318, "grad_norm": 3.0322632789611816, "learning_rate": 1.3586833936696045e-09, "logits/chosen": -2.7872555255889893, "logits/rejected": -2.7731995582580566, "logps/chosen": -73.65172576904297, "logps/rejected": -80.22820281982422, "loss": 0.67, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.22093363106250763, "rewards/margins": 0.053128112107515335, "rewards/rejected": -0.27406173944473267, "step": 7880 }, { "epoch": 2.71881461061337, "grad_norm": 3.138367176055908, "learning_rate": 1.3262769015426111e-09, "logits/chosen": -2.885283946990967, "logits/rejected": -2.868257999420166, "logps/chosen": -73.71112823486328, "logps/rejected": -79.2211685180664, "loss": 0.6675, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20631082355976105, "rewards/margins": 0.05770787596702576, "rewards/rejected": -0.264018714427948, "step": 7890 }, { "epoch": 2.722260509993108, "grad_norm": 2.902265787124634, "learning_rate": 1.2942510260156302e-09, "logits/chosen": -2.8135218620300293, "logits/rejected": -2.7821240425109863, "logps/chosen": -75.91639709472656, "logps/rejected": -78.18832397460938, "loss": 0.6631, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.19772398471832275, "rewards/margins": 0.06734729558229446, "rewards/rejected": -0.265071302652359, "step": 7900 }, { "epoch": 2.722260509993108, "eval_logits/chosen": -2.9253664016723633, "eval_logits/rejected": -2.9195282459259033, "eval_logps/chosen": -73.73696899414062, "eval_logps/rejected": -81.57855224609375, "eval_loss": 0.6785289645195007, "eval_rewards/accuracies": 0.6080390214920044, "eval_rewards/chosen": -0.15025074779987335, "eval_rewards/margins": 0.03373360633850098, "eval_rewards/rejected": -0.18398433923721313, "eval_runtime": 385.1835, "eval_samples_per_second": 11.174, "eval_steps_per_second": 1.397, "step": 7900 }, { "epoch": 2.7257064093728465, "grad_norm": 2.8847227096557617, "learning_rate": 1.2626062819878104e-09, "logits/chosen": -2.812169313430786, "logits/rejected": -2.7991702556610107, "logps/chosen": -72.23898315429688, "logps/rejected": -79.7685775756836, "loss": 0.6685, "rewards/accuracies": 0.640625, "rewards/chosen": -0.2202928066253662, "rewards/margins": 0.05701180547475815, "rewards/rejected": -0.27730461955070496, "step": 7910 }, { "epoch": 2.7291523087525844, "grad_norm": 3.214412212371826, "learning_rate": 1.2313431782306234e-09, "logits/chosen": -2.8863492012023926, "logits/rejected": -2.8790600299835205, "logps/chosen": -73.56205749511719, "logps/rejected": -80.91376495361328, "loss": 0.6693, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2201816588640213, "rewards/margins": 0.05611049383878708, "rewards/rejected": -0.2762921452522278, "step": 7920 }, { "epoch": 2.7325982081323223, "grad_norm": 3.252439260482788, "learning_rate": 1.2004622173796802e-09, "logits/chosen": -2.850803852081299, "logits/rejected": -2.825124979019165, "logps/chosen": -73.05252838134766, "logps/rejected": -78.29252624511719, "loss": 0.6669, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.19854137301445007, "rewards/margins": 0.05922982841730118, "rewards/rejected": -0.25777122378349304, "step": 7930 }, { "epoch": 2.7360441075120607, "grad_norm": 2.9394636154174805, "learning_rate": 1.1699638959266427e-09, "logits/chosen": -2.8230245113372803, "logits/rejected": -2.8012354373931885, "logps/chosen": -74.95918273925781, "logps/rejected": -80.56278991699219, "loss": 0.6609, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.21407315135002136, "rewards/margins": 0.07152987271547318, "rewards/rejected": -0.28560304641723633, "step": 7940 }, { "epoch": 2.7394900068917987, "grad_norm": 2.9502594470977783, "learning_rate": 1.1398487042112687e-09, "logits/chosen": -2.8475682735443115, "logits/rejected": -2.8071656227111816, "logps/chosen": -76.10076904296875, "logps/rejected": -78.57460021972656, "loss": 0.6544, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.1987118422985077, "rewards/margins": 0.08543185889720917, "rewards/rejected": -0.28414368629455566, "step": 7950 }, { "epoch": 2.742935906271537, "grad_norm": 2.9502437114715576, "learning_rate": 1.1101171264134955e-09, "logits/chosen": -2.7727394104003906, "logits/rejected": -2.7549283504486084, "logps/chosen": -73.73404693603516, "logps/rejected": -78.82189178466797, "loss": 0.6673, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.20958586037158966, "rewards/margins": 0.05917984992265701, "rewards/rejected": -0.2687656879425049, "step": 7960 }, { "epoch": 2.746381805651275, "grad_norm": 3.156480073928833, "learning_rate": 1.0807696405456756e-09, "logits/chosen": -2.7660205364227295, "logits/rejected": -2.7393124103546143, "logps/chosen": -76.06897735595703, "logps/rejected": -81.09082794189453, "loss": 0.6578, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.18885715305805206, "rewards/margins": 0.07937277853488922, "rewards/rejected": -0.2682299017906189, "step": 7970 }, { "epoch": 2.749827705031013, "grad_norm": 2.746273994445801, "learning_rate": 1.0518067184448893e-09, "logits/chosen": -2.8735687732696533, "logits/rejected": -2.8486602306365967, "logps/chosen": -76.75370025634766, "logps/rejected": -78.97061157226562, "loss": 0.6652, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.195907324552536, "rewards/margins": 0.06388511508703232, "rewards/rejected": -0.25979241728782654, "step": 7980 }, { "epoch": 2.7532736044107513, "grad_norm": 3.4030685424804688, "learning_rate": 1.023228825765343e-09, "logits/chosen": -2.801901340484619, "logits/rejected": -2.7810540199279785, "logps/chosen": -76.72042083740234, "logps/rejected": -83.08061218261719, "loss": 0.6638, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.20836582779884338, "rewards/margins": 0.06544304639101028, "rewards/rejected": -0.27380886673927307, "step": 7990 }, { "epoch": 2.7567195037904892, "grad_norm": 3.051384925842285, "learning_rate": 9.9503642197091e-10, "logits/chosen": -2.8817572593688965, "logits/rejected": -2.8682847023010254, "logps/chosen": -74.94681549072266, "logps/rejected": -80.64703369140625, "loss": 0.666, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.20687516033649445, "rewards/margins": 0.06138332933187485, "rewards/rejected": -0.2682585120201111, "step": 8000 }, { "epoch": 2.7567195037904892, "eval_logits/chosen": -2.9250006675720215, "eval_logits/rejected": -2.919144868850708, "eval_logps/chosen": -73.77140808105469, "eval_logps/rejected": -81.6061782836914, "eval_loss": 0.6785618662834167, "eval_rewards/accuracies": 0.606877326965332, "eval_rewards/chosen": -0.15059514343738556, "eval_rewards/margins": 0.03366539999842644, "eval_rewards/rejected": -0.1842605471611023, "eval_runtime": 385.0521, "eval_samples_per_second": 11.178, "eval_steps_per_second": 1.397, "step": 8000 }, { "epoch": 2.7601654031702276, "grad_norm": 3.2257680892944336, "learning_rate": 9.672299603277145e-10, "logits/chosen": -2.9246973991394043, "logits/rejected": -2.9099814891815186, "logps/chosen": -75.38167572021484, "logps/rejected": -80.27973937988281, "loss": 0.6646, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.21209347248077393, "rewards/margins": 0.06647959351539612, "rewards/rejected": -0.27857303619384766, "step": 8010 }, { "epoch": 2.7636113025499656, "grad_norm": 2.7653374671936035, "learning_rate": 9.39809887896878e-10, "logits/chosen": -2.8898653984069824, "logits/rejected": -2.874114751815796, "logps/chosen": -73.39210510253906, "logps/rejected": -79.42277526855469, "loss": 0.6629, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.19987304508686066, "rewards/margins": 0.06897140294313431, "rewards/rejected": -0.26884445548057556, "step": 8020 }, { "epoch": 2.7670572019297035, "grad_norm": 3.033780574798584, "learning_rate": 9.127766455272828e-10, "logits/chosen": -2.840846538543701, "logits/rejected": -2.8221616744995117, "logps/chosen": -74.61019134521484, "logps/rejected": -81.89695739746094, "loss": 0.6575, "rewards/accuracies": 0.71875, "rewards/chosen": -0.19553305208683014, "rewards/margins": 0.07905575633049011, "rewards/rejected": -0.27458879351615906, "step": 8030 }, { "epoch": 2.770503101309442, "grad_norm": 2.9897148609161377, "learning_rate": 8.861306678485364e-10, "logits/chosen": -2.80135178565979, "logits/rejected": -2.7688608169555664, "logps/chosen": -77.63356018066406, "logps/rejected": -81.58967590332031, "loss": 0.6567, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.2054605931043625, "rewards/margins": 0.08159399032592773, "rewards/rejected": -0.28705453872680664, "step": 8040 }, { "epoch": 2.77394900068918, "grad_norm": 3.0009284019470215, "learning_rate": 8.598723832639571e-10, "logits/chosen": -2.8436341285705566, "logits/rejected": -2.8278112411499023, "logps/chosen": -73.47724914550781, "logps/rejected": -81.10176086425781, "loss": 0.6598, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.20919208228588104, "rewards/margins": 0.07506237924098969, "rewards/rejected": -0.2842544913291931, "step": 8050 }, { "epoch": 2.777394900068918, "grad_norm": 2.9819178581237793, "learning_rate": 8.340022139436714e-10, "logits/chosen": -2.7565393447875977, "logits/rejected": -2.733949661254883, "logps/chosen": -72.91341400146484, "logps/rejected": -78.9596939086914, "loss": 0.6642, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.19067752361297607, "rewards/margins": 0.06563916802406311, "rewards/rejected": -0.2563166618347168, "step": 8060 }, { "epoch": 2.780840799448656, "grad_norm": 2.9695065021514893, "learning_rate": 8.085205758178781e-10, "logits/chosen": -2.870241641998291, "logits/rejected": -2.8463797569274902, "logps/chosen": -73.7506103515625, "logps/rejected": -78.36888885498047, "loss": 0.6705, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.21164241433143616, "rewards/margins": 0.05253661423921585, "rewards/rejected": -0.264178991317749, "step": 8070 }, { "epoch": 2.784286698828394, "grad_norm": 3.0695478916168213, "learning_rate": 7.834278785700893e-10, "logits/chosen": -2.8941903114318848, "logits/rejected": -2.8697493076324463, "logps/chosen": -74.40611267089844, "logps/rejected": -81.62747955322266, "loss": 0.6556, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.2034609019756317, "rewards/margins": 0.08321623504161835, "rewards/rejected": -0.28667712211608887, "step": 8080 }, { "epoch": 2.7877325982081325, "grad_norm": 2.8457517623901367, "learning_rate": 7.587245256306135e-10, "logits/chosen": -2.793593645095825, "logits/rejected": -2.7596046924591064, "logps/chosen": -77.24102020263672, "logps/rejected": -80.56849670410156, "loss": 0.6633, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.20842310786247253, "rewards/margins": 0.06696037948131561, "rewards/rejected": -0.27538353204727173, "step": 8090 }, { "epoch": 2.7911784975878704, "grad_norm": 3.257617235183716, "learning_rate": 7.344109141700167e-10, "logits/chosen": -2.78893780708313, "logits/rejected": -2.7529454231262207, "logps/chosen": -74.35671997070312, "logps/rejected": -80.74169158935547, "loss": 0.6577, "rewards/accuracies": 0.671875, "rewards/chosen": -0.20287461578845978, "rewards/margins": 0.07928146421909332, "rewards/rejected": -0.2821560800075531, "step": 8100 }, { "epoch": 2.7911784975878704, "eval_logits/chosen": -2.9251549243927, "eval_logits/rejected": -2.9193291664123535, "eval_logps/chosen": -73.78260803222656, "eval_logps/rejected": -81.61182403564453, "eval_loss": 0.6785925030708313, "eval_rewards/accuracies": 0.6075743436813354, "eval_rewards/chosen": -0.15070703625679016, "eval_rewards/margins": 0.03360989689826965, "eval_rewards/rejected": -0.184316948056221, "eval_runtime": 384.7316, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 8100 }, { "epoch": 2.794624396967609, "grad_norm": 3.0377540588378906, "learning_rate": 7.104874350927715e-10, "logits/chosen": -2.7490034103393555, "logits/rejected": -2.7311787605285645, "logps/chosen": -73.51597595214844, "logps/rejected": -81.4391860961914, "loss": 0.659, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.19332146644592285, "rewards/margins": 0.07653947174549103, "rewards/rejected": -0.2698608934879303, "step": 8110 }, { "epoch": 2.7980702963473467, "grad_norm": 3.035641670227051, "learning_rate": 6.86954473030954e-10, "logits/chosen": -2.8335814476013184, "logits/rejected": -2.8145697116851807, "logps/chosen": -72.57453155517578, "logps/rejected": -78.99360656738281, "loss": 0.6681, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.21016351878643036, "rewards/margins": 0.05725828558206558, "rewards/rejected": -0.26742178201675415, "step": 8120 }, { "epoch": 2.8015161957270847, "grad_norm": 3.1044979095458984, "learning_rate": 6.638124063380629e-10, "logits/chosen": -2.834780693054199, "logits/rejected": -2.813063859939575, "logps/chosen": -76.1025390625, "logps/rejected": -82.8951187133789, "loss": 0.6642, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2102293223142624, "rewards/margins": 0.06742466986179352, "rewards/rejected": -0.2776539921760559, "step": 8130 }, { "epoch": 2.804962095106823, "grad_norm": 3.1006083488464355, "learning_rate": 6.410616070829433e-10, "logits/chosen": -2.9075138568878174, "logits/rejected": -2.8843631744384766, "logps/chosen": -76.33185577392578, "logps/rejected": -81.07740020751953, "loss": 0.6609, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.20844149589538574, "rewards/margins": 0.07246918976306915, "rewards/rejected": -0.2809106707572937, "step": 8140 }, { "epoch": 2.808407994486561, "grad_norm": 3.108548402786255, "learning_rate": 6.187024410437947e-10, "logits/chosen": -2.8894360065460205, "logits/rejected": -2.873110294342041, "logps/chosen": -73.07804870605469, "logps/rejected": -79.3492431640625, "loss": 0.6661, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20236873626708984, "rewards/margins": 0.06146268919110298, "rewards/rejected": -0.2638314366340637, "step": 8150 }, { "epoch": 2.8118538938662994, "grad_norm": 3.0752692222595215, "learning_rate": 5.967352677022947e-10, "logits/chosen": -2.806727170944214, "logits/rejected": -2.7932605743408203, "logps/chosen": -74.14737701416016, "logps/rejected": -80.87196350097656, "loss": 0.6687, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.20520082116127014, "rewards/margins": 0.056270383298397064, "rewards/rejected": -0.2614712119102478, "step": 8160 }, { "epoch": 2.8152997932460373, "grad_norm": 2.771949052810669, "learning_rate": 5.751604402378263e-10, "logits/chosen": -2.8021903038024902, "logits/rejected": -2.778395652770996, "logps/chosen": -74.8586654663086, "logps/rejected": -79.89605712890625, "loss": 0.6604, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.187066912651062, "rewards/margins": 0.07510276138782501, "rewards/rejected": -0.26216965913772583, "step": 8170 }, { "epoch": 2.8187456926257752, "grad_norm": 3.0852625370025635, "learning_rate": 5.539783055217906e-10, "logits/chosen": -2.875746011734009, "logits/rejected": -2.850543737411499, "logps/chosen": -73.80335235595703, "logps/rejected": -79.35490417480469, "loss": 0.6612, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.19688871502876282, "rewards/margins": 0.07104488462209702, "rewards/rejected": -0.26793360710144043, "step": 8180 }, { "epoch": 2.822191592005513, "grad_norm": 3.5065574645996094, "learning_rate": 5.331892041120278e-10, "logits/chosen": -2.8204991817474365, "logits/rejected": -2.804705858230591, "logps/chosen": -73.89649963378906, "logps/rejected": -78.33135986328125, "loss": 0.6668, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.19775326550006866, "rewards/margins": 0.05953418090939522, "rewards/rejected": -0.2572874426841736, "step": 8190 }, { "epoch": 2.8256374913852516, "grad_norm": 3.1395673751831055, "learning_rate": 5.127934702473524e-10, "logits/chosen": -2.768110513687134, "logits/rejected": -2.748587131500244, "logps/chosen": -73.3841781616211, "logps/rejected": -80.75636291503906, "loss": 0.6608, "rewards/accuracies": 0.671875, "rewards/chosen": -0.1992691457271576, "rewards/margins": 0.07353170216083527, "rewards/rejected": -0.27280086278915405, "step": 8200 }, { "epoch": 2.8256374913852516, "eval_logits/chosen": -2.9250049591064453, "eval_logits/rejected": -2.919085741043091, "eval_logps/chosen": -73.78492736816406, "eval_logps/rejected": -81.62398529052734, "eval_loss": 0.6785528063774109, "eval_rewards/accuracies": 0.607342004776001, "eval_rewards/chosen": -0.15073026716709137, "eval_rewards/margins": 0.03370826691389084, "eval_rewards/rejected": -0.1844385415315628, "eval_runtime": 385.2094, "eval_samples_per_second": 11.173, "eval_steps_per_second": 1.397, "step": 8200 }, { "epoch": 2.82908339076499, "grad_norm": 3.1844325065612793, "learning_rate": 4.927914318421711e-10, "logits/chosen": -2.782346725463867, "logits/rejected": -2.759028434753418, "logps/chosen": -75.99636840820312, "logps/rejected": -79.94261169433594, "loss": 0.6675, "rewards/accuracies": 0.640625, "rewards/chosen": -0.21251539885997772, "rewards/margins": 0.05866488069295883, "rewards/rejected": -0.27118030190467834, "step": 8210 }, { "epoch": 2.832529290144728, "grad_norm": 2.767206907272339, "learning_rate": 4.731834104812149e-10, "logits/chosen": -2.7711310386657715, "logits/rejected": -2.758197546005249, "logps/chosen": -74.13941955566406, "logps/rejected": -79.72126770019531, "loss": 0.6699, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.21543319523334503, "rewards/margins": 0.05427657440304756, "rewards/rejected": -0.2697097659111023, "step": 8220 }, { "epoch": 2.835975189524466, "grad_norm": 2.7497756481170654, "learning_rate": 4.539697214143656e-10, "logits/chosen": -2.8009250164031982, "logits/rejected": -2.7856788635253906, "logps/chosen": -72.73230743408203, "logps/rejected": -81.3805160522461, "loss": 0.6553, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.19867649674415588, "rewards/margins": 0.08552376925945282, "rewards/rejected": -0.2842002511024475, "step": 8230 }, { "epoch": 2.8394210889042037, "grad_norm": 3.169144630432129, "learning_rate": 4.351506735515875e-10, "logits/chosen": -2.7634286880493164, "logits/rejected": -2.746793508529663, "logps/chosen": -71.97180938720703, "logps/rejected": -81.16720581054688, "loss": 0.6582, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2011670619249344, "rewards/margins": 0.07838908582925797, "rewards/rejected": -0.27955612540245056, "step": 8240 }, { "epoch": 2.842866988283942, "grad_norm": 2.940222978591919, "learning_rate": 4.1672656945796746e-10, "logits/chosen": -2.8429880142211914, "logits/rejected": -2.8110785484313965, "logps/chosen": -74.89208221435547, "logps/rejected": -75.99726867675781, "loss": 0.6686, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.19833175837993622, "rewards/margins": 0.05717344954609871, "rewards/rejected": -0.25550517439842224, "step": 8250 }, { "epoch": 2.84631288766368, "grad_norm": 3.257936716079712, "learning_rate": 3.986977053488438e-10, "logits/chosen": -2.9054017066955566, "logits/rejected": -2.8815624713897705, "logps/chosen": -77.75546264648438, "logps/rejected": -82.44725799560547, "loss": 0.6643, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.2127995789051056, "rewards/margins": 0.06575660407543182, "rewards/rejected": -0.2785561680793762, "step": 8260 }, { "epoch": 2.8497587870434185, "grad_norm": 2.912531614303589, "learning_rate": 3.810643710850381e-10, "logits/chosen": -2.8028767108917236, "logits/rejected": -2.7801384925842285, "logps/chosen": -75.77508544921875, "logps/rejected": -77.29663848876953, "loss": 0.6705, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2116108387708664, "rewards/margins": 0.05116669088602066, "rewards/rejected": -0.26277750730514526, "step": 8270 }, { "epoch": 2.8532046864231564, "grad_norm": 2.881582736968994, "learning_rate": 3.6382685016821126e-10, "logits/chosen": -2.7941391468048096, "logits/rejected": -2.7742578983306885, "logps/chosen": -75.33322143554688, "logps/rejected": -84.64946746826172, "loss": 0.6605, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2139880657196045, "rewards/margins": 0.07356895506381989, "rewards/rejected": -0.28755703568458557, "step": 8280 }, { "epoch": 2.8566505858028943, "grad_norm": 2.9310665130615234, "learning_rate": 3.4698541973629536e-10, "logits/chosen": -2.856480360031128, "logits/rejected": -2.835907459259033, "logps/chosen": -72.40876007080078, "logps/rejected": -78.96958923339844, "loss": 0.6639, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.20296049118041992, "rewards/margins": 0.06605638563632965, "rewards/rejected": -0.26901689171791077, "step": 8290 }, { "epoch": 2.8600964851826327, "grad_norm": 2.946805715560913, "learning_rate": 3.305403505590276e-10, "logits/chosen": -2.7859432697296143, "logits/rejected": -2.7681732177734375, "logps/chosen": -77.09591674804688, "logps/rejected": -79.38593292236328, "loss": 0.6736, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2156323939561844, "rewards/margins": 0.04642469808459282, "rewards/rejected": -0.2620570957660675, "step": 8300 }, { "epoch": 2.8600964851826327, "eval_logits/chosen": -2.9249799251556396, "eval_logits/rejected": -2.919144630432129, "eval_logps/chosen": -73.76573181152344, "eval_logps/rejected": -81.61539459228516, "eval_loss": 0.6784944534301758, "eval_rewards/accuracies": 0.6080390214920044, "eval_rewards/chosen": -0.15053831040859222, "eval_rewards/margins": 0.03381437808275223, "eval_rewards/rejected": -0.18435269594192505, "eval_runtime": 385.0422, "eval_samples_per_second": 11.178, "eval_steps_per_second": 1.397, "step": 8300 }, { "epoch": 2.8635423845623706, "grad_norm": 3.108680009841919, "learning_rate": 3.1449190703362604e-10, "logits/chosen": -2.7763113975524902, "logits/rejected": -2.753483295440674, "logps/chosen": -76.78858184814453, "logps/rejected": -80.63233184814453, "loss": 0.6685, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.20948398113250732, "rewards/margins": 0.057111162692308426, "rewards/rejected": -0.26659509539604187, "step": 8310 }, { "epoch": 2.866988283942109, "grad_norm": 3.1631922721862793, "learning_rate": 2.988403471805095e-10, "logits/chosen": -2.7882938385009766, "logits/rejected": -2.750797748565674, "logps/chosen": -79.38661193847656, "logps/rejected": -79.74537658691406, "loss": 0.6572, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.19960026443004608, "rewards/margins": 0.08091472834348679, "rewards/rejected": -0.28051498532295227, "step": 8320 }, { "epoch": 2.870434183321847, "grad_norm": 3.4014647006988525, "learning_rate": 2.8358592263916237e-10, "logits/chosen": -2.792026996612549, "logits/rejected": -2.7789974212646484, "logps/chosen": -73.66891479492188, "logps/rejected": -78.63441467285156, "loss": 0.6669, "rewards/accuracies": 0.609375, "rewards/chosen": -0.19467243552207947, "rewards/margins": 0.059616707265377045, "rewards/rejected": -0.2542891204357147, "step": 8330 }, { "epoch": 2.873880082701585, "grad_norm": 2.9850170612335205, "learning_rate": 2.687288786640873e-10, "logits/chosen": -2.762876272201538, "logits/rejected": -2.75291109085083, "logps/chosen": -71.81262969970703, "logps/rejected": -78.47230529785156, "loss": 0.6693, "rewards/accuracies": 0.609375, "rewards/chosen": -0.21638062596321106, "rewards/margins": 0.05744846910238266, "rewards/rejected": -0.27382907271385193, "step": 8340 }, { "epoch": 2.8773259820813233, "grad_norm": 3.059152126312256, "learning_rate": 2.5426945412086453e-10, "logits/chosen": -2.8993163108825684, "logits/rejected": -2.8781580924987793, "logps/chosen": -73.96505737304688, "logps/rejected": -79.07539367675781, "loss": 0.6685, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1998750865459442, "rewards/margins": 0.05597395822405815, "rewards/rejected": -0.25584906339645386, "step": 8350 }, { "epoch": 2.8807718814610612, "grad_norm": 2.8647665977478027, "learning_rate": 2.402078814823072e-10, "logits/chosen": -2.827730417251587, "logits/rejected": -2.7995123863220215, "logps/chosen": -73.15206146240234, "logps/rejected": -77.20793914794922, "loss": 0.6638, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.20556721091270447, "rewards/margins": 0.06660813093185425, "rewards/rejected": -0.2721753418445587, "step": 8360 }, { "epoch": 2.8842177808407996, "grad_norm": 3.4023728370666504, "learning_rate": 2.26544386824723e-10, "logits/chosen": -2.758392810821533, "logits/rejected": -2.7359821796417236, "logps/chosen": -75.03350830078125, "logps/rejected": -79.03582763671875, "loss": 0.6709, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.21311607956886292, "rewards/margins": 0.05189105123281479, "rewards/rejected": -0.2650070786476135, "step": 8370 }, { "epoch": 2.8876636802205375, "grad_norm": 2.956857681274414, "learning_rate": 2.1327918982428915e-10, "logits/chosen": -2.793687343597412, "logits/rejected": -2.777982473373413, "logps/chosen": -75.14671325683594, "logps/rejected": -83.21397399902344, "loss": 0.6609, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.20601341128349304, "rewards/margins": 0.07270172238349915, "rewards/rejected": -0.2787151336669922, "step": 8380 }, { "epoch": 2.8911095796002755, "grad_norm": 3.5116777420043945, "learning_rate": 2.0041250375350538e-10, "logits/chosen": -2.8458657264709473, "logits/rejected": -2.8204007148742676, "logps/chosen": -75.76696014404297, "logps/rejected": -81.17318725585938, "loss": 0.662, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.20350489020347595, "rewards/margins": 0.06952239573001862, "rewards/rejected": -0.27302730083465576, "step": 8390 }, { "epoch": 2.894555478980014, "grad_norm": 3.6376869678497314, "learning_rate": 1.87944535477777e-10, "logits/chosen": -2.8323798179626465, "logits/rejected": -2.8076651096343994, "logps/chosen": -73.94133758544922, "logps/rejected": -77.5760269165039, "loss": 0.6687, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.20638296008110046, "rewards/margins": 0.05851083993911743, "rewards/rejected": -0.2648937702178955, "step": 8400 }, { "epoch": 2.894555478980014, "eval_logits/chosen": -2.92505145072937, "eval_logits/rejected": -2.9191689491271973, "eval_logps/chosen": -73.7841567993164, "eval_logps/rejected": -81.62508392333984, "eval_loss": 0.6785398721694946, "eval_rewards/accuracies": 0.609433114528656, "eval_rewards/chosen": -0.15072257816791534, "eval_rewards/margins": 0.033727142959833145, "eval_rewards/rejected": -0.18444973230361938, "eval_runtime": 385.0399, "eval_samples_per_second": 11.178, "eval_steps_per_second": 1.397, "step": 8400 }, { "epoch": 2.898001378359752, "grad_norm": 3.0362184047698975, "learning_rate": 1.758754854520844e-10, "logits/chosen": -2.8287675380706787, "logits/rejected": -2.802426338195801, "logps/chosen": -73.71202087402344, "logps/rejected": -76.5075454711914, "loss": 0.6662, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.19436898827552795, "rewards/margins": 0.062545046210289, "rewards/rejected": -0.25691404938697815, "step": 8410 }, { "epoch": 2.90144727773949, "grad_norm": 3.1035408973693848, "learning_rate": 1.6420554771775784e-10, "logits/chosen": -2.846205472946167, "logits/rejected": -2.821800708770752, "logps/chosen": -75.31559753417969, "logps/rejected": -82.06736755371094, "loss": 0.662, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.2175147533416748, "rewards/margins": 0.0706000104546547, "rewards/rejected": -0.2881147563457489, "step": 8420 }, { "epoch": 2.904893177119228, "grad_norm": 2.8897104263305664, "learning_rate": 1.5293490989936873e-10, "logits/chosen": -2.840233087539673, "logits/rejected": -2.801994562149048, "logps/chosen": -76.46429443359375, "logps/rejected": -76.47848510742188, "loss": 0.6703, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.21195289492607117, "rewards/margins": 0.05338698625564575, "rewards/rejected": -0.2653399109840393, "step": 8430 }, { "epoch": 2.908339076498966, "grad_norm": 3.093263626098633, "learning_rate": 1.4206375320169327e-10, "logits/chosen": -2.9038126468658447, "logits/rejected": -2.8824398517608643, "logps/chosen": -71.90079498291016, "logps/rejected": -78.90889739990234, "loss": 0.6652, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.20876248180866241, "rewards/margins": 0.06549503654241562, "rewards/rejected": -0.2742575705051422, "step": 8440 }, { "epoch": 2.9117849758787044, "grad_norm": 3.051363229751587, "learning_rate": 1.3159225240682582e-10, "logits/chosen": -2.83693265914917, "logits/rejected": -2.8210654258728027, "logps/chosen": -76.00609588623047, "logps/rejected": -78.53899383544922, "loss": 0.6739, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.21936480700969696, "rewards/margins": 0.04506075382232666, "rewards/rejected": -0.2644255757331848, "step": 8450 }, { "epoch": 2.9152308752584424, "grad_norm": 2.7372143268585205, "learning_rate": 1.215205758713367e-10, "logits/chosen": -2.846186399459839, "logits/rejected": -2.82513689994812, "logps/chosen": -73.82601165771484, "logps/rejected": -80.60832214355469, "loss": 0.666, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.20659160614013672, "rewards/margins": 0.06164426729083061, "rewards/rejected": -0.26823586225509644, "step": 8460 }, { "epoch": 2.9186767746381808, "grad_norm": 3.124793767929077, "learning_rate": 1.1184888552359384e-10, "logits/chosen": -2.8728432655334473, "logits/rejected": -2.851874589920044, "logps/chosen": -76.2994155883789, "logps/rejected": -79.36776733398438, "loss": 0.6623, "rewards/accuracies": 0.671875, "rewards/chosen": -0.19177719950675964, "rewards/margins": 0.06831403821706772, "rewards/rejected": -0.26009124517440796, "step": 8470 }, { "epoch": 2.9221226740179187, "grad_norm": 2.9773621559143066, "learning_rate": 1.0257733686114545e-10, "logits/chosen": -2.8122098445892334, "logits/rejected": -2.8002538681030273, "logps/chosen": -72.68424224853516, "logps/rejected": -82.39579772949219, "loss": 0.6617, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.20119774341583252, "rewards/margins": 0.07170656323432922, "rewards/rejected": -0.27290430665016174, "step": 8480 }, { "epoch": 2.9255685733976566, "grad_norm": 3.015909194946289, "learning_rate": 9.370607894822468e-11, "logits/chosen": -2.8903708457946777, "logits/rejected": -2.865344285964966, "logps/chosen": -75.3279800415039, "logps/rejected": -80.90769958496094, "loss": 0.6618, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.20650510489940643, "rewards/margins": 0.07003673911094666, "rewards/rejected": -0.2765418589115143, "step": 8490 }, { "epoch": 2.929014472777395, "grad_norm": 2.8695192337036133, "learning_rate": 8.523525441334611e-11, "logits/chosen": -2.912292003631592, "logits/rejected": -2.8887391090393066, "logps/chosen": -73.92333221435547, "logps/rejected": -78.88671112060547, "loss": 0.6637, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.19588249921798706, "rewards/margins": 0.0661255493760109, "rewards/rejected": -0.26200801134109497, "step": 8500 }, { "epoch": 2.929014472777395, "eval_logits/chosen": -2.9250717163085938, "eval_logits/rejected": -2.9191627502441406, "eval_logps/chosen": -73.76414489746094, "eval_logps/rejected": -81.60913848876953, "eval_loss": 0.6785100102424622, "eval_rewards/accuracies": 0.6089683771133423, "eval_rewards/chosen": -0.15052250027656555, "eval_rewards/margins": 0.03376760333776474, "eval_rewards/rejected": -0.1842900961637497, "eval_runtime": 385.0179, "eval_samples_per_second": 11.179, "eval_steps_per_second": 1.397, "step": 8500 }, { "epoch": 2.932460372157133, "grad_norm": 3.0175492763519287, "learning_rate": 7.716499944702137e-11, "logits/chosen": -2.7975122928619385, "logits/rejected": -2.7899200916290283, "logps/chosen": -75.40348052978516, "logps/rejected": -81.5361557006836, "loss": 0.6721, "rewards/accuracies": 0.671875, "rewards/chosen": -0.2224903553724289, "rewards/margins": 0.05113212391734123, "rewards/rejected": -0.2736225128173828, "step": 8510 }, { "epoch": 2.9359062715368713, "grad_norm": 3.1740522384643555, "learning_rate": 6.949544379956651e-11, "logits/chosen": -2.8778023719787598, "logits/rejected": -2.854410171508789, "logps/chosen": -77.89419555664062, "logps/rejected": -79.05435180664062, "loss": 0.6702, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.21721093356609344, "rewards/margins": 0.05263395234942436, "rewards/rejected": -0.2698448598384857, "step": 8520 }, { "epoch": 2.9393521709166093, "grad_norm": 2.921412944793701, "learning_rate": 6.222671077900921e-11, "logits/chosen": -2.8722891807556152, "logits/rejected": -2.846217632293701, "logps/chosen": -76.21599578857422, "logps/rejected": -80.65084838867188, "loss": 0.6711, "rewards/accuracies": 0.625, "rewards/chosen": -0.21297021210193634, "rewards/margins": 0.05137593299150467, "rewards/rejected": -0.2643461525440216, "step": 8530 }, { "epoch": 2.942798070296347, "grad_norm": 2.9245152473449707, "learning_rate": 5.535891724911812e-11, "logits/chosen": -2.7714693546295166, "logits/rejected": -2.7439846992492676, "logps/chosen": -74.5972900390625, "logps/rejected": -81.04167175292969, "loss": 0.6581, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.20181472599506378, "rewards/margins": 0.07860523462295532, "rewards/rejected": -0.2804199457168579, "step": 8540 }, { "epoch": 2.9462439696760856, "grad_norm": 3.0245354175567627, "learning_rate": 4.889217362751552e-11, "logits/chosen": -2.7904419898986816, "logits/rejected": -2.7672250270843506, "logps/chosen": -75.63033294677734, "logps/rejected": -81.00315856933594, "loss": 0.6612, "rewards/accuracies": 0.703125, "rewards/chosen": -0.20973272621631622, "rewards/margins": 0.07170229405164719, "rewards/rejected": -0.2814350426197052, "step": 8550 }, { "epoch": 2.9496898690558235, "grad_norm": 2.8795149326324463, "learning_rate": 4.2826583883903697e-11, "logits/chosen": -2.8456568717956543, "logits/rejected": -2.826765775680542, "logps/chosen": -75.73876190185547, "logps/rejected": -79.46434783935547, "loss": 0.6684, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.22310082614421844, "rewards/margins": 0.05659005045890808, "rewards/rejected": -0.2796908915042877, "step": 8560 }, { "epoch": 2.953135768435562, "grad_norm": 3.0075953006744385, "learning_rate": 3.716224553839964e-11, "logits/chosen": -2.78086519241333, "logits/rejected": -2.7558305263519287, "logps/chosen": -73.81675720214844, "logps/rejected": -79.77852630615234, "loss": 0.6604, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.19690962135791779, "rewards/margins": 0.07262058556079865, "rewards/rejected": -0.2695302367210388, "step": 8570 }, { "epoch": 2.9565816678153, "grad_norm": 2.969672679901123, "learning_rate": 3.189924965995017e-11, "logits/chosen": -2.9200000762939453, "logits/rejected": -2.8983757495880127, "logps/chosen": -72.23191833496094, "logps/rejected": -80.37834167480469, "loss": 0.6566, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.20110949873924255, "rewards/margins": 0.08245866000652313, "rewards/rejected": -0.28356820344924927, "step": 8580 }, { "epoch": 2.960027567195038, "grad_norm": 3.058807373046875, "learning_rate": 2.703768086489422e-11, "logits/chosen": -2.921848773956299, "logits/rejected": -2.9014699459075928, "logps/chosen": -77.3508071899414, "logps/rejected": -81.32893371582031, "loss": 0.6725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.22241735458374023, "rewards/margins": 0.049599092453718185, "rewards/rejected": -0.2720164358615875, "step": 8590 }, { "epoch": 2.963473466574776, "grad_norm": 3.082368850708008, "learning_rate": 2.257761731557506e-11, "logits/chosen": -2.8179478645324707, "logits/rejected": -2.811583995819092, "logps/chosen": -72.83722686767578, "logps/rejected": -78.99812316894531, "loss": 0.6689, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.1994284838438034, "rewards/margins": 0.05565127730369568, "rewards/rejected": -0.25507980585098267, "step": 8600 }, { "epoch": 2.963473466574776, "eval_logits/chosen": -2.924795150756836, "eval_logits/rejected": -2.9189281463623047, "eval_logps/chosen": -73.79267883300781, "eval_logps/rejected": -81.61965942382812, "eval_loss": 0.6786068081855774, "eval_rewards/accuracies": 0.6078066825866699, "eval_rewards/chosen": -0.15080790221691132, "eval_rewards/margins": 0.03358744457364082, "eval_rewards/rejected": -0.18439534306526184, "eval_runtime": 385.1688, "eval_samples_per_second": 11.174, "eval_steps_per_second": 1.397, "step": 8600 }, { "epoch": 2.966919365954514, "grad_norm": 3.209893226623535, "learning_rate": 1.8519130719102382e-11, "logits/chosen": -2.7989935874938965, "logits/rejected": -2.795987367630005, "logps/chosen": -75.49506378173828, "logps/rejected": -84.47955322265625, "loss": 0.6616, "rewards/accuracies": 0.640625, "rewards/chosen": -0.21646857261657715, "rewards/margins": 0.07354557514190674, "rewards/rejected": -0.2900141775608063, "step": 8610 }, { "epoch": 2.9703652653342525, "grad_norm": 3.2650809288024902, "learning_rate": 1.4862286326189355e-11, "logits/chosen": -2.838510274887085, "logits/rejected": -2.8364574909210205, "logps/chosen": -73.19190979003906, "logps/rejected": -80.28457641601562, "loss": 0.6708, "rewards/accuracies": 0.625, "rewards/chosen": -0.20072920620441437, "rewards/margins": 0.05174810439348221, "rewards/rejected": -0.2524773180484772, "step": 8620 }, { "epoch": 2.9738111647139904, "grad_norm": 3.103325128555298, "learning_rate": 1.1607142930114555e-11, "logits/chosen": -2.768782615661621, "logits/rejected": -2.7490038871765137, "logps/chosen": -74.37593841552734, "logps/rejected": -80.69816589355469, "loss": 0.659, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2060295045375824, "rewards/margins": 0.07509645819664001, "rewards/rejected": -0.2811259627342224, "step": 8630 }, { "epoch": 2.9772570640937284, "grad_norm": 3.215789556503296, "learning_rate": 8.753752865761633e-12, "logits/chosen": -2.814493179321289, "logits/rejected": -2.787137508392334, "logps/chosen": -75.7542953491211, "logps/rejected": -79.59574890136719, "loss": 0.6677, "rewards/accuracies": 0.625, "rewards/chosen": -0.20659641921520233, "rewards/margins": 0.058165986090898514, "rewards/rejected": -0.26476243138313293, "step": 8640 }, { "epoch": 2.9807029634734663, "grad_norm": 3.210848093032837, "learning_rate": 6.302162008786638e-12, "logits/chosen": -2.856536865234375, "logits/rejected": -2.8403079509735107, "logps/chosen": -78.23812103271484, "logps/rejected": -81.70077514648438, "loss": 0.6678, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.2107788622379303, "rewards/margins": 0.058105625212192535, "rewards/rejected": -0.2688845098018646, "step": 8650 }, { "epoch": 2.9841488628532047, "grad_norm": 3.22208309173584, "learning_rate": 4.2524097748852795e-12, "logits/chosen": -2.839494228363037, "logits/rejected": -2.818178653717041, "logps/chosen": -75.95003509521484, "logps/rejected": -79.2614517211914, "loss": 0.6679, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.21408987045288086, "rewards/margins": 0.05723186582326889, "rewards/rejected": -0.27132174372673035, "step": 8660 }, { "epoch": 2.987594762232943, "grad_norm": 2.9011948108673096, "learning_rate": 2.6045291191462148e-12, "logits/chosen": -2.8478007316589355, "logits/rejected": -2.823422908782959, "logps/chosen": -73.57682037353516, "logps/rejected": -78.84309387207031, "loss": 0.6593, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.19709068536758423, "rewards/margins": 0.07512352615594864, "rewards/rejected": -0.27221423387527466, "step": 8670 }, { "epoch": 2.991040661612681, "grad_norm": 2.882768154144287, "learning_rate": 1.3585465355347991e-12, "logits/chosen": -2.873730421066284, "logits/rejected": -2.8558573722839355, "logps/chosen": -74.82108306884766, "logps/rejected": -80.60095977783203, "loss": 0.6664, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.20831748843193054, "rewards/margins": 0.06118513271212578, "rewards/rejected": -0.2695026397705078, "step": 8680 }, { "epoch": 2.994486560992419, "grad_norm": 3.4320223331451416, "learning_rate": 5.144820564573216e-13, "logits/chosen": -2.828159809112549, "logits/rejected": -2.8172225952148438, "logps/chosen": -74.29943084716797, "logps/rejected": -81.4353256225586, "loss": 0.6729, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.21716761589050293, "rewards/margins": 0.04703906923532486, "rewards/rejected": -0.2642067074775696, "step": 8690 }, { "epoch": 2.997932460372157, "grad_norm": 3.144468069076538, "learning_rate": 7.234925244459234e-14, "logits/chosen": -2.8449816703796387, "logits/rejected": -2.821136236190796, "logps/chosen": -74.33763122558594, "logps/rejected": -78.94417572021484, "loss": 0.6585, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1975519061088562, "rewards/margins": 0.07710181921720505, "rewards/rejected": -0.27465370297431946, "step": 8700 }, { "epoch": 2.997932460372157, "eval_logits/chosen": -2.924863338470459, "eval_logits/rejected": -2.9190196990966797, "eval_logps/chosen": -73.79141235351562, "eval_logps/rejected": -81.6350326538086, "eval_loss": 0.6785153150558472, "eval_rewards/accuracies": 0.6085036993026733, "eval_rewards/chosen": -0.15079528093338013, "eval_rewards/margins": 0.03375381976366043, "eval_rewards/rejected": -0.18454909324645996, "eval_runtime": 385.1253, "eval_samples_per_second": 11.176, "eval_steps_per_second": 1.397, "step": 8700 }, { "epoch": 3.0, "step": 8706, "total_flos": 0.0, "train_loss": 0.6755671014244629, "train_runtime": 103988.7813, "train_samples_per_second": 2.679, "train_steps_per_second": 0.084 } ], "logging_steps": 10, "max_steps": 8706, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }