{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.9011406844106466e-08, "logits/chosen": -1.309581995010376, "logits/rejected": -0.5872728228569031, "logps/chosen": -734.6773071289062, "logps/rejected": -1525.39306640625, "loss": 0.4075, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 1.9011406844106465e-07, "logits/chosen": -1.22140371799469, "logits/rejected": -0.6799044609069824, "logps/chosen": -646.4432373046875, "logps/rejected": -1440.77001953125, "loss": 0.3195, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.0001053886444424279, "rewards/margins": 2.4901755750761367e-06, "rewards/rejected": 0.00010289846977684647, "step": 10 }, { "epoch": 0.01, "learning_rate": 3.802281368821293e-07, "logits/chosen": -1.461050271987915, "logits/rejected": -1.0473058223724365, "logps/chosen": -597.43408203125, "logps/rejected": -1460.790771484375, "loss": 0.3201, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0002944885636679828, "rewards/margins": 0.0006216522306203842, "rewards/rejected": -0.0009161407360807061, "step": 20 }, { "epoch": 0.01, "learning_rate": 5.70342205323194e-07, "logits/chosen": -1.630396842956543, "logits/rejected": -0.6520310640335083, "logps/chosen": -645.5565795898438, "logps/rejected": -1319.029541015625, "loss": 0.396, "rewards/accuracies": 0.75, "rewards/chosen": -0.0005385800031945109, "rewards/margins": 0.003348312806338072, "rewards/rejected": -0.003886892693117261, "step": 30 }, { "epoch": 0.02, "learning_rate": 7.604562737642586e-07, "logits/chosen": -1.62783682346344, "logits/rejected": -0.8065999746322632, "logps/chosen": -586.908935546875, "logps/rejected": -1148.939697265625, "loss": 0.321, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0036350872833281755, "rewards/margins": 0.003113765735179186, "rewards/rejected": -0.006748852785676718, "step": 40 }, { "epoch": 0.02, "learning_rate": 9.505703422053232e-07, "logits/chosen": -1.7062771320343018, "logits/rejected": -0.7769229412078857, "logps/chosen": -663.8600463867188, "logps/rejected": -1293.67626953125, "loss": 0.3153, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0018195791635662317, "rewards/margins": 0.023479169234633446, "rewards/rejected": -0.021659590303897858, "step": 50 }, { "epoch": 0.02, "learning_rate": 1.140684410646388e-06, "logits/chosen": -1.7021366357803345, "logits/rejected": -0.937055766582489, "logps/chosen": -640.6766357421875, "logps/rejected": -1203.092529296875, "loss": 0.327, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00542110251262784, "rewards/margins": 0.016934162005782127, "rewards/rejected": -0.02235526219010353, "step": 60 }, { "epoch": 0.03, "learning_rate": 1.3307984790874527e-06, "logits/chosen": -1.5004761219024658, "logits/rejected": -0.5878639221191406, "logps/chosen": -659.1041870117188, "logps/rejected": -1361.371337890625, "loss": 0.3052, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.027993163093924522, "rewards/margins": 0.046672653406858444, "rewards/rejected": -0.07466582208871841, "step": 70 }, { "epoch": 0.03, "learning_rate": 1.5209125475285172e-06, "logits/chosen": -1.265582799911499, "logits/rejected": -0.5200140476226807, "logps/chosen": -597.455810546875, "logps/rejected": -1530.325439453125, "loss": 0.232, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08238484710454941, "rewards/margins": 0.10074327141046524, "rewards/rejected": -0.18312808871269226, "step": 80 }, { "epoch": 0.03, "learning_rate": 1.7110266159695818e-06, "logits/chosen": -0.9624673128128052, "logits/rejected": -0.46925705671310425, "logps/chosen": -962.2857666015625, "logps/rejected": -1880.2685546875, "loss": 0.2054, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2484085112810135, "rewards/margins": 0.25825539231300354, "rewards/rejected": -0.5066639184951782, "step": 90 }, { "epoch": 0.04, "learning_rate": 1.9011406844106463e-06, "logits/chosen": -1.1727972030639648, "logits/rejected": -0.44827398657798767, "logps/chosen": -761.7739868164062, "logps/rejected": -1599.9483642578125, "loss": 0.2418, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.22190134227275848, "rewards/margins": 0.22281675040721893, "rewards/rejected": -0.44471806287765503, "step": 100 }, { "epoch": 0.04, "learning_rate": 2.0912547528517115e-06, "logits/chosen": -1.3008177280426025, "logits/rejected": -0.053379178047180176, "logps/chosen": -989.0183715820312, "logps/rejected": -1746.445068359375, "loss": 0.2609, "rewards/accuracies": 0.625, "rewards/chosen": -0.3034511208534241, "rewards/margins": 0.17612631618976593, "rewards/rejected": -0.4795774519443512, "step": 110 }, { "epoch": 0.05, "learning_rate": 2.281368821292776e-06, "logits/chosen": -1.5831632614135742, "logits/rejected": -0.8108356595039368, "logps/chosen": -715.2606201171875, "logps/rejected": -1611.300048828125, "loss": 0.2247, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15237729251384735, "rewards/margins": 0.22697433829307556, "rewards/rejected": -0.3793516755104065, "step": 120 }, { "epoch": 0.05, "learning_rate": 2.4714828897338406e-06, "logits/chosen": -1.4963654279708862, "logits/rejected": -0.4917267858982086, "logps/chosen": -656.2247924804688, "logps/rejected": -1643.033447265625, "loss": 0.1827, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.15522368252277374, "rewards/margins": 0.2442297488451004, "rewards/rejected": -0.39945346117019653, "step": 130 }, { "epoch": 0.05, "learning_rate": 2.6615969581749054e-06, "logits/chosen": -1.3148021697998047, "logits/rejected": -0.7181310653686523, "logps/chosen": -791.9319458007812, "logps/rejected": -1512.953857421875, "loss": 0.2621, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.22255554795265198, "rewards/margins": 0.2340855598449707, "rewards/rejected": -0.4566411077976227, "step": 140 }, { "epoch": 0.06, "learning_rate": 2.8517110266159697e-06, "logits/chosen": -1.3970264196395874, "logits/rejected": 0.18021254241466522, "logps/chosen": -710.4620971679688, "logps/rejected": -1632.2252197265625, "loss": 0.1709, "rewards/accuracies": 0.75, "rewards/chosen": -0.12118466943502426, "rewards/margins": 0.267572820186615, "rewards/rejected": -0.38875746726989746, "step": 150 }, { "epoch": 0.06, "learning_rate": 3.0418250950570345e-06, "logits/chosen": -1.3218777179718018, "logits/rejected": -0.4862894117832184, "logps/chosen": -715.0960693359375, "logps/rejected": -1668.695556640625, "loss": 0.1901, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10209158807992935, "rewards/margins": 0.21048147976398468, "rewards/rejected": -0.31257307529449463, "step": 160 }, { "epoch": 0.06, "learning_rate": 3.2319391634980988e-06, "logits/chosen": -1.6319293975830078, "logits/rejected": -0.5137763023376465, "logps/chosen": -865.77880859375, "logps/rejected": -1757.321533203125, "loss": 0.2244, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.20110686123371124, "rewards/margins": 0.1988024115562439, "rewards/rejected": -0.39990928769111633, "step": 170 }, { "epoch": 0.07, "learning_rate": 3.4220532319391635e-06, "logits/chosen": -1.4142566919326782, "logits/rejected": -0.7955026626586914, "logps/chosen": -657.78662109375, "logps/rejected": -1524.0322265625, "loss": 0.1877, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1512230634689331, "rewards/margins": 0.18753719329833984, "rewards/rejected": -0.33876022696495056, "step": 180 }, { "epoch": 0.07, "learning_rate": 3.6121673003802283e-06, "logits/chosen": -1.5043981075286865, "logits/rejected": -0.5569364428520203, "logps/chosen": -749.2457885742188, "logps/rejected": -1676.224365234375, "loss": 0.2303, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1593555510044098, "rewards/margins": 0.23125717043876648, "rewards/rejected": -0.39061275124549866, "step": 190 }, { "epoch": 0.08, "learning_rate": 3.8022813688212926e-06, "logits/chosen": -1.4530147314071655, "logits/rejected": -0.1936938464641571, "logps/chosen": -941.8811645507812, "logps/rejected": -1538.266845703125, "loss": 0.277, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22557249665260315, "rewards/margins": 0.1226031631231308, "rewards/rejected": -0.34817570447921753, "step": 200 }, { "epoch": 0.08, "learning_rate": 3.992395437262358e-06, "logits/chosen": -1.3884700536727905, "logits/rejected": 0.2985416054725647, "logps/chosen": -727.7371826171875, "logps/rejected": -1549.333251953125, "loss": 0.2163, "rewards/accuracies": 0.75, "rewards/chosen": -0.11415885388851166, "rewards/margins": 0.1606224626302719, "rewards/rejected": -0.27478131651878357, "step": 210 }, { "epoch": 0.08, "learning_rate": 4.182509505703423e-06, "logits/chosen": -1.4721076488494873, "logits/rejected": -0.5550671815872192, "logps/chosen": -665.4345703125, "logps/rejected": -1226.3505859375, "loss": 0.2751, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.12580981850624084, "rewards/margins": 0.07592322677373886, "rewards/rejected": -0.2017330378293991, "step": 220 }, { "epoch": 0.09, "learning_rate": 4.372623574144487e-06, "logits/chosen": -1.3315961360931396, "logits/rejected": -0.7412258386611938, "logps/chosen": -798.4100341796875, "logps/rejected": -1797.867431640625, "loss": 0.228, "rewards/accuracies": 0.75, "rewards/chosen": -0.2196497619152069, "rewards/margins": 0.31383997201919556, "rewards/rejected": -0.5334897637367249, "step": 230 }, { "epoch": 0.09, "learning_rate": 4.562737642585552e-06, "logits/chosen": -1.2657647132873535, "logits/rejected": 0.19807374477386475, "logps/chosen": -884.892578125, "logps/rejected": -1730.600341796875, "loss": 0.2077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.30446261167526245, "rewards/margins": 0.2605041563510895, "rewards/rejected": -0.5649667978286743, "step": 240 }, { "epoch": 0.1, "learning_rate": 4.752851711026617e-06, "logits/chosen": -1.3215892314910889, "logits/rejected": -0.1221291646361351, "logps/chosen": -868.1724853515625, "logps/rejected": -1894.5892333984375, "loss": 0.1899, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2540385127067566, "rewards/margins": 0.26371657848358154, "rewards/rejected": -0.5177550911903381, "step": 250 }, { "epoch": 0.1, "learning_rate": 4.942965779467681e-06, "logits/chosen": -1.5402283668518066, "logits/rejected": -0.10687409341335297, "logps/chosen": -840.9368286132812, "logps/rejected": -1507.033935546875, "loss": 0.2313, "rewards/accuracies": 0.75, "rewards/chosen": -0.19984489679336548, "rewards/margins": 0.16107426583766937, "rewards/rejected": -0.36091917753219604, "step": 260 }, { "epoch": 0.1, "learning_rate": 4.999891646507394e-06, "logits/chosen": -1.1328164339065552, "logits/rejected": -0.6200036406517029, "logps/chosen": -871.7901611328125, "logps/rejected": -1817.0308837890625, "loss": 0.1701, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2524244487285614, "rewards/margins": 0.3093523681163788, "rewards/rejected": -0.5617768168449402, "step": 270 }, { "epoch": 0.11, "learning_rate": 4.999360958133643e-06, "logits/chosen": -1.244642972946167, "logits/rejected": -0.35955825448036194, "logps/chosen": -1192.7857666015625, "logps/rejected": -2177.05126953125, "loss": 0.2568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.49829235672950745, "rewards/margins": 0.27310460805892944, "rewards/rejected": -0.7713969945907593, "step": 280 }, { "epoch": 0.11, "learning_rate": 4.998388126979494e-06, "logits/chosen": -1.4707214832305908, "logits/rejected": -0.5473088026046753, "logps/chosen": -905.7005004882812, "logps/rejected": -1648.120849609375, "loss": 0.2355, "rewards/accuracies": 0.75, "rewards/chosen": -0.3049623370170593, "rewards/margins": 0.17379149794578552, "rewards/rejected": -0.47875380516052246, "step": 290 }, { "epoch": 0.11, "learning_rate": 4.9969733251410465e-06, "logits/chosen": -1.322038173675537, "logits/rejected": -0.5637993216514587, "logps/chosen": -694.6287841796875, "logps/rejected": -1592.732421875, "loss": 0.2149, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.18798497319221497, "rewards/margins": 0.2423991858959198, "rewards/rejected": -0.43038409948349, "step": 300 }, { "epoch": 0.12, "learning_rate": 4.995116802900044e-06, "logits/chosen": -1.6272156238555908, "logits/rejected": -0.5717583894729614, "logps/chosen": -614.6407470703125, "logps/rejected": -1641.684326171875, "loss": 0.1696, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.06946410238742828, "rewards/margins": 0.29210174083709717, "rewards/rejected": -0.36156582832336426, "step": 310 }, { "epoch": 0.12, "learning_rate": 4.992818888679599e-06, "logits/chosen": -1.5178866386413574, "logits/rejected": -0.4450433850288391, "logps/chosen": -621.1731567382812, "logps/rejected": -1573.9354248046875, "loss": 0.2163, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10414925962686539, "rewards/margins": 0.3151122033596039, "rewards/rejected": -0.41926145553588867, "step": 320 }, { "epoch": 0.13, "learning_rate": 4.990079988986094e-06, "logits/chosen": -1.4997282028198242, "logits/rejected": -0.6227926015853882, "logps/chosen": -749.7415771484375, "logps/rejected": -1688.022705078125, "loss": 0.2327, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11727683246135712, "rewards/margins": 0.28126558661460876, "rewards/rejected": -0.39854246377944946, "step": 330 }, { "epoch": 0.13, "learning_rate": 4.98690058833727e-06, "logits/chosen": -1.5571765899658203, "logits/rejected": -0.8521603345870972, "logps/chosen": -601.8826904296875, "logps/rejected": -1498.2889404296875, "loss": 0.2486, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08230418711900711, "rewards/margins": 0.23202486336231232, "rewards/rejected": -0.3143290877342224, "step": 340 }, { "epoch": 0.13, "learning_rate": 4.983281249176515e-06, "logits/chosen": -1.3627010583877563, "logits/rejected": -0.6490218639373779, "logps/chosen": -681.435302734375, "logps/rejected": -1366.2730712890625, "loss": 0.2731, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.13615843653678894, "rewards/margins": 0.17348505556583405, "rewards/rejected": -0.3096435070037842, "step": 350 }, { "epoch": 0.14, "learning_rate": 4.9792226117733645e-06, "logits/chosen": -1.6986331939697266, "logits/rejected": -1.1046206951141357, "logps/chosen": -523.299072265625, "logps/rejected": -1489.0263671875, "loss": 0.1982, "rewards/accuracies": 0.875, "rewards/chosen": -0.010974712669849396, "rewards/margins": 0.26038092374801636, "rewards/rejected": -0.27135562896728516, "step": 360 }, { "epoch": 0.14, "learning_rate": 4.974725394110236e-06, "logits/chosen": -1.546090841293335, "logits/rejected": -0.9158832430839539, "logps/chosen": -653.3963012695312, "logps/rejected": -1584.8653564453125, "loss": 0.2056, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.050740789622068405, "rewards/margins": 0.2886130213737488, "rewards/rejected": -0.23787224292755127, "step": 370 }, { "epoch": 0.14, "learning_rate": 4.969790391755419e-06, "logits/chosen": -1.548245906829834, "logits/rejected": -0.41839033365249634, "logps/chosen": -631.1019287109375, "logps/rejected": -1485.03662109375, "loss": 0.3121, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.015809131786227226, "rewards/margins": 0.2246825248003006, "rewards/rejected": -0.24049165844917297, "step": 380 }, { "epoch": 0.15, "learning_rate": 4.964418477722337e-06, "logits/chosen": -1.2504632472991943, "logits/rejected": -0.9116919636726379, "logps/chosen": -643.5406494140625, "logps/rejected": -1442.185302734375, "loss": 0.186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06941981613636017, "rewards/margins": 0.18849459290504456, "rewards/rejected": -0.2579144239425659, "step": 390 }, { "epoch": 0.15, "learning_rate": 4.958610602315105e-06, "logits/chosen": -1.5482969284057617, "logits/rejected": -0.4484465718269348, "logps/chosen": -791.5938720703125, "logps/rejected": -1489.133056640625, "loss": 0.2643, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.16398222744464874, "rewards/margins": 0.20837199687957764, "rewards/rejected": -0.3723542094230652, "step": 400 }, { "epoch": 0.16, "learning_rate": 4.952367792960423e-06, "logits/chosen": -1.442470669746399, "logits/rejected": -0.5165648460388184, "logps/chosen": -834.3720703125, "logps/rejected": -1960.627685546875, "loss": 0.1825, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12983514368534088, "rewards/margins": 0.37547969818115234, "rewards/rejected": -0.5053148865699768, "step": 410 }, { "epoch": 0.16, "learning_rate": 4.94569115402582e-06, "logits/chosen": -1.5044573545455933, "logits/rejected": -0.2479744851589203, "logps/chosen": -727.1597290039062, "logps/rejected": -1593.1517333984375, "loss": 0.1617, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.12083429098129272, "rewards/margins": 0.22341656684875488, "rewards/rejected": -0.3442508280277252, "step": 420 }, { "epoch": 0.16, "learning_rate": 4.938581866624288e-06, "logits/chosen": -1.526505708694458, "logits/rejected": -0.6968569159507751, "logps/chosen": -748.1736450195312, "logps/rejected": -1730.7857666015625, "loss": 0.1916, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.15270407497882843, "rewards/margins": 0.29999324679374695, "rewards/rejected": -0.45269736647605896, "step": 430 }, { "epoch": 0.17, "learning_rate": 4.931041188405346e-06, "logits/chosen": -1.4287599325180054, "logits/rejected": -0.02651187777519226, "logps/chosen": -769.5696411132812, "logps/rejected": -1608.8892822265625, "loss": 0.1572, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1605798304080963, "rewards/margins": 0.2679330110549927, "rewards/rejected": -0.4285128712654114, "step": 440 }, { "epoch": 0.17, "learning_rate": 4.923070453332547e-06, "logits/chosen": -1.6807445287704468, "logits/rejected": -0.6135926246643066, "logps/chosen": -782.5677490234375, "logps/rejected": -1665.75390625, "loss": 0.2031, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12514187395572662, "rewards/margins": 0.2597144544124603, "rewards/rejected": -0.38485628366470337, "step": 450 }, { "epoch": 0.18, "learning_rate": 4.91467107144751e-06, "logits/chosen": -1.4267165660858154, "logits/rejected": -0.7928562164306641, "logps/chosen": -571.6627807617188, "logps/rejected": -1345.6600341796875, "loss": 0.2316, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.060534317046403885, "rewards/margins": 0.1647827923297882, "rewards/rejected": -0.2253170758485794, "step": 460 }, { "epoch": 0.18, "learning_rate": 4.905844528620472e-06, "logits/chosen": -1.5388805866241455, "logits/rejected": -0.7093125581741333, "logps/chosen": -704.4991455078125, "logps/rejected": -1389.565185546875, "loss": 0.2392, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0523810088634491, "rewards/margins": 0.18069866299629211, "rewards/rejected": -0.2330796718597412, "step": 470 }, { "epoch": 0.18, "learning_rate": 4.896592386287437e-06, "logits/chosen": -1.385908603668213, "logits/rejected": -0.45796504616737366, "logps/chosen": -656.732177734375, "logps/rejected": -1562.862060546875, "loss": 0.2431, "rewards/accuracies": 0.75, "rewards/chosen": -0.1251785010099411, "rewards/margins": 0.2701167166233063, "rewards/rejected": -0.39529523253440857, "step": 480 }, { "epoch": 0.19, "learning_rate": 4.886916281173954e-06, "logits/chosen": -1.1565008163452148, "logits/rejected": -0.25456371903419495, "logps/chosen": -817.7601318359375, "logps/rejected": -1859.864501953125, "loss": 0.2027, "rewards/accuracies": 0.875, "rewards/chosen": -0.17359836399555206, "rewards/margins": 0.23825760185718536, "rewards/rejected": -0.41185593605041504, "step": 490 }, { "epoch": 0.19, "learning_rate": 4.876817925005577e-06, "logits/chosen": -1.3309084177017212, "logits/rejected": 0.2755126953125, "logps/chosen": -758.0354614257812, "logps/rejected": -1587.595458984375, "loss": 0.1959, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13779202103614807, "rewards/margins": 0.27691641449928284, "rewards/rejected": -0.4147084355354309, "step": 500 }, { "epoch": 0.19, "learning_rate": 4.866299104205053e-06, "logits/chosen": -1.431330919265747, "logits/rejected": -0.14378157258033752, "logps/chosen": -756.1514892578125, "logps/rejected": -1630.8369140625, "loss": 0.1805, "rewards/accuracies": 0.875, "rewards/chosen": -0.14098593592643738, "rewards/margins": 0.27901023626327515, "rewards/rejected": -0.4199961721897125, "step": 510 }, { "epoch": 0.2, "learning_rate": 4.855361679576306e-06, "logits/chosen": -1.1147464513778687, "logits/rejected": 0.01641288958489895, "logps/chosen": -743.7813110351562, "logps/rejected": -1726.9251708984375, "loss": 0.2112, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14749287068843842, "rewards/margins": 0.29514187574386597, "rewards/rejected": -0.4426347315311432, "step": 520 }, { "epoch": 0.2, "learning_rate": 4.844007585975251e-06, "logits/chosen": -1.296064019203186, "logits/rejected": -0.6896405220031738, "logps/chosen": -704.9949951171875, "logps/rejected": -1606.681396484375, "loss": 0.1979, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.14679738879203796, "rewards/margins": 0.2554212510585785, "rewards/rejected": -0.40221863985061646, "step": 530 }, { "epoch": 0.21, "learning_rate": 4.8322388319675175e-06, "logits/chosen": -1.165326476097107, "logits/rejected": -0.7254418134689331, "logps/chosen": -693.1873779296875, "logps/rejected": -1572.689208984375, "loss": 0.2381, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10777801275253296, "rewards/margins": 0.2233743667602539, "rewards/rejected": -0.33115237951278687, "step": 540 }, { "epoch": 0.21, "learning_rate": 4.820057499473126e-06, "logits/chosen": -1.5993174314498901, "logits/rejected": -0.3147927224636078, "logps/chosen": -671.328125, "logps/rejected": -1688.6597900390625, "loss": 0.1652, "rewards/accuracies": 0.875, "rewards/chosen": -0.010269267484545708, "rewards/margins": 0.3295990228652954, "rewards/rejected": -0.33986830711364746, "step": 550 }, { "epoch": 0.21, "learning_rate": 4.8074657433981945e-06, "logits/chosen": -1.4064615964889526, "logits/rejected": -0.36807772517204285, "logps/chosen": -713.1765747070312, "logps/rejected": -1756.349609375, "loss": 0.2574, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.052180755883455276, "rewards/margins": 0.3084072470664978, "rewards/rejected": -0.3605879843235016, "step": 560 }, { "epoch": 0.22, "learning_rate": 4.7944657912537276e-06, "logits/chosen": -1.4590526819229126, "logits/rejected": -0.4386933743953705, "logps/chosen": -853.4212036132812, "logps/rejected": -1716.999267578125, "loss": 0.1963, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1896963119506836, "rewards/margins": 0.2503826916217804, "rewards/rejected": -0.4400790333747864, "step": 570 }, { "epoch": 0.22, "learning_rate": 4.781059942761568e-06, "logits/chosen": -1.247014045715332, "logits/rejected": -0.04012506455183029, "logps/chosen": -904.2608642578125, "logps/rejected": -1889.211181640625, "loss": 0.1797, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.27497318387031555, "rewards/margins": 0.2862357795238495, "rewards/rejected": -0.5612090229988098, "step": 580 }, { "epoch": 0.22, "learning_rate": 4.767250569447567e-06, "logits/chosen": -1.4142177104949951, "logits/rejected": 0.06862497329711914, "logps/chosen": -824.2003173828125, "logps/rejected": -1700.429931640625, "loss": 0.2265, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12286008894443512, "rewards/margins": 0.32222193479537964, "rewards/rejected": -0.44508203864097595, "step": 590 }, { "epoch": 0.23, "learning_rate": 4.753040114222059e-06, "logits/chosen": -1.3935539722442627, "logits/rejected": -0.6971178650856018, "logps/chosen": -558.4041137695312, "logps/rejected": -1443.4300537109375, "loss": 0.2382, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.014678947627544403, "rewards/margins": 0.19151851534843445, "rewards/rejected": -0.20619745552539825, "step": 600 }, { "epoch": 0.23, "learning_rate": 4.738431090947703e-06, "logits/chosen": -1.5321964025497437, "logits/rejected": -0.1490766704082489, "logps/chosen": -643.0662841796875, "logps/rejected": -1438.817626953125, "loss": 0.191, "rewards/accuracies": 0.75, "rewards/chosen": -0.059288300573825836, "rewards/margins": 0.20326800644397736, "rewards/rejected": -0.262556254863739, "step": 610 }, { "epoch": 0.24, "learning_rate": 4.723426083994774e-06, "logits/chosen": -1.482052206993103, "logits/rejected": -0.34388789534568787, "logps/chosen": -868.4851684570312, "logps/rejected": -1728.1343994140625, "loss": 0.1932, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1511087864637375, "rewards/margins": 0.2767433524131775, "rewards/rejected": -0.42785215377807617, "step": 620 }, { "epoch": 0.24, "learning_rate": 4.708027747783986e-06, "logits/chosen": -1.6188995838165283, "logits/rejected": -0.2503194808959961, "logps/chosen": -639.2178955078125, "logps/rejected": -1499.210205078125, "loss": 0.1301, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.07993890345096588, "rewards/margins": 0.24828875064849854, "rewards/rejected": -0.3282276391983032, "step": 630 }, { "epoch": 0.24, "learning_rate": 4.692238806316913e-06, "logits/chosen": -1.3904335498809814, "logits/rejected": -0.6569021344184875, "logps/chosen": -695.43701171875, "logps/rejected": -1746.911376953125, "loss": 0.1591, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.10653958469629288, "rewards/margins": 0.3157615065574646, "rewards/rejected": -0.4223010540008545, "step": 640 }, { "epoch": 0.25, "learning_rate": 4.6760620526941105e-06, "logits/chosen": -1.4406391382217407, "logits/rejected": -0.5928043127059937, "logps/chosen": -762.9915771484375, "logps/rejected": -1816.375, "loss": 0.2113, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16144251823425293, "rewards/margins": 0.31130173802375793, "rewards/rejected": -0.47274428606033325, "step": 650 }, { "epoch": 0.25, "learning_rate": 4.6595003486210065e-06, "logits/chosen": -1.3454574346542358, "logits/rejected": -0.26248273253440857, "logps/chosen": -805.0185546875, "logps/rejected": -1648.8382568359375, "loss": 0.2108, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.15398888289928436, "rewards/margins": 0.2925569415092468, "rewards/rejected": -0.4465457797050476, "step": 660 }, { "epoch": 0.26, "learning_rate": 4.642556623901664e-06, "logits/chosen": -1.4065353870391846, "logits/rejected": 0.598714292049408, "logps/chosen": -780.4371337890625, "logps/rejected": -1702.613525390625, "loss": 0.2046, "rewards/accuracies": 0.75, "rewards/chosen": -0.12835724651813507, "rewards/margins": 0.245024636387825, "rewards/rejected": -0.3733818531036377, "step": 670 }, { "epoch": 0.26, "learning_rate": 4.625233875920487e-06, "logits/chosen": -1.582171082496643, "logits/rejected": -0.7945160269737244, "logps/chosen": -738.0201416015625, "logps/rejected": -1607.751708984375, "loss": 0.2161, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.04244168847799301, "rewards/margins": 0.2502944767475128, "rewards/rejected": -0.2927362024784088, "step": 680 }, { "epoch": 0.26, "learning_rate": 4.607535169111981e-06, "logits/chosen": -1.2722053527832031, "logits/rejected": -0.0793805867433548, "logps/chosen": -617.8064575195312, "logps/rejected": -1496.67041015625, "loss": 0.2024, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0301450677216053, "rewards/margins": 0.26098376512527466, "rewards/rejected": -0.29112881422042847, "step": 690 }, { "epoch": 0.27, "learning_rate": 4.589463634418639e-06, "logits/chosen": -1.3151203393936157, "logits/rejected": -0.8953059315681458, "logps/chosen": -768.4039306640625, "logps/rejected": -1633.479248046875, "loss": 0.1688, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12600573897361755, "rewards/margins": 0.2763189375400543, "rewards/rejected": -0.4023246765136719, "step": 700 }, { "epoch": 0.27, "learning_rate": 4.571022468737083e-06, "logits/chosen": -1.442660927772522, "logits/rejected": -0.5813604593276978, "logps/chosen": -644.8008422851562, "logps/rejected": -1580.753173828125, "loss": 0.1593, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.08881621807813644, "rewards/margins": 0.2825973629951477, "rewards/rejected": -0.37141355872154236, "step": 710 }, { "epoch": 0.27, "learning_rate": 4.552214934352513e-06, "logits/chosen": -1.4404734373092651, "logits/rejected": -0.8436611890792847, "logps/chosen": -739.8285522460938, "logps/rejected": -1773.526123046875, "loss": 0.1518, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09408523142337799, "rewards/margins": 0.3753548264503479, "rewards/rejected": -0.4694400429725647, "step": 720 }, { "epoch": 0.28, "learning_rate": 4.533044358361609e-06, "logits/chosen": -1.7495180368423462, "logits/rejected": -0.4551807940006256, "logps/chosen": -752.6351318359375, "logps/rejected": -1469.6446533203125, "loss": 0.2085, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06651552766561508, "rewards/margins": 0.21766388416290283, "rewards/rejected": -0.2841794192790985, "step": 730 }, { "epoch": 0.28, "learning_rate": 4.513514132083958e-06, "logits/chosen": -1.3756463527679443, "logits/rejected": -0.2475576400756836, "logps/chosen": -572.2022094726562, "logps/rejected": -1350.266845703125, "loss": 0.2147, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.01980910822749138, "rewards/margins": 0.2021811306476593, "rewards/rejected": -0.22199022769927979, "step": 740 }, { "epoch": 0.29, "learning_rate": 4.493627710462119e-06, "logits/chosen": -1.399147629737854, "logits/rejected": -0.25146716833114624, "logps/chosen": -630.6769409179688, "logps/rejected": -1520.7139892578125, "loss": 0.1709, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.01403821725398302, "rewards/margins": 0.21889618039131165, "rewards/rejected": -0.2329343855381012, "step": 750 }, { "epoch": 0.29, "learning_rate": 4.473388611450441e-06, "logits/chosen": -1.622859239578247, "logits/rejected": -0.13336774706840515, "logps/chosen": -611.0005493164062, "logps/rejected": -1637.197021484375, "loss": 0.1764, "rewards/accuracies": 0.875, "rewards/chosen": -0.023980099707841873, "rewards/margins": 0.27093058824539185, "rewards/rejected": -0.2949106693267822, "step": 760 }, { "epoch": 0.29, "learning_rate": 4.452800415392722e-06, "logits/chosen": -1.4647607803344727, "logits/rejected": -0.3352859914302826, "logps/chosen": -785.999267578125, "logps/rejected": -1838.8935546875, "loss": 0.2232, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.18775556981563568, "rewards/margins": 0.292813777923584, "rewards/rejected": -0.48056936264038086, "step": 770 }, { "epoch": 0.3, "learning_rate": 4.431866764388844e-06, "logits/chosen": -1.585782766342163, "logits/rejected": -0.8955543637275696, "logps/chosen": -768.5866088867188, "logps/rejected": -1672.092529296875, "loss": 0.1899, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13700775802135468, "rewards/margins": 0.2583485245704651, "rewards/rejected": -0.39535626769065857, "step": 780 }, { "epoch": 0.3, "learning_rate": 4.410591361650474e-06, "logits/chosen": -1.582663655281067, "logits/rejected": 0.2276397943496704, "logps/chosen": -720.8452758789062, "logps/rejected": -1423.891845703125, "loss": 0.2557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10702049732208252, "rewards/margins": 0.20522770285606384, "rewards/rejected": -0.312248170375824, "step": 790 }, { "epoch": 0.3, "learning_rate": 4.388977970845956e-06, "logits/chosen": -1.3418039083480835, "logits/rejected": 0.316244512796402, "logps/chosen": -689.212890625, "logps/rejected": -1685.4466552734375, "loss": 0.2173, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05374826863408089, "rewards/margins": 0.2638680040836334, "rewards/rejected": -0.3176162838935852, "step": 800 }, { "epoch": 0.31, "learning_rate": 4.3670304154345116e-06, "logits/chosen": -1.3105695247650146, "logits/rejected": -0.4537012577056885, "logps/chosen": -540.65771484375, "logps/rejected": -1366.0250244140625, "loss": 0.1576, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.025857895612716675, "rewards/margins": 0.19634315371513367, "rewards/rejected": -0.22220103442668915, "step": 810 }, { "epoch": 0.31, "learning_rate": 4.344752577989862e-06, "logits/chosen": -1.1346259117126465, "logits/rejected": -0.565026581287384, "logps/chosen": -820.3753662109375, "logps/rejected": -1898.3541259765625, "loss": 0.1848, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.19666633009910583, "rewards/margins": 0.30824899673461914, "rewards/rejected": -0.5049153566360474, "step": 820 }, { "epoch": 0.32, "learning_rate": 4.322148399513387e-06, "logits/chosen": -1.422300100326538, "logits/rejected": -0.8067476153373718, "logps/chosen": -744.7098388671875, "logps/rejected": -1772.1265869140625, "loss": 0.2093, "rewards/accuracies": 0.75, "rewards/chosen": -0.18467073142528534, "rewards/margins": 0.3054446578025818, "rewards/rejected": -0.4901154637336731, "step": 830 }, { "epoch": 0.32, "learning_rate": 4.2992218787369595e-06, "logits/chosen": -1.3477489948272705, "logits/rejected": -0.2981131672859192, "logps/chosen": -741.5479736328125, "logps/rejected": -1808.893798828125, "loss": 0.1249, "rewards/accuracies": 0.875, "rewards/chosen": -0.11807835102081299, "rewards/margins": 0.2913779318332672, "rewards/rejected": -0.4094562530517578, "step": 840 }, { "epoch": 0.32, "learning_rate": 4.275977071415554e-06, "logits/chosen": -1.5264980792999268, "logits/rejected": -0.47445183992385864, "logps/chosen": -657.248046875, "logps/rejected": -1744.3697509765625, "loss": 0.1601, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08889902383089066, "rewards/margins": 0.3612448275089264, "rewards/rejected": -0.45014387369155884, "step": 850 }, { "epoch": 0.33, "learning_rate": 4.252418089609777e-06, "logits/chosen": -1.5788004398345947, "logits/rejected": -0.4188787043094635, "logps/chosen": -781.0071411132812, "logps/rejected": -1724.9921875, "loss": 0.1893, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14888271689414978, "rewards/margins": 0.3237541615962982, "rewards/rejected": -0.472636878490448, "step": 860 }, { "epoch": 0.33, "learning_rate": 4.228549100958435e-06, "logits/chosen": -1.461836338043213, "logits/rejected": -0.1806771457195282, "logps/chosen": -820.3692626953125, "logps/rejected": -1787.0689697265625, "loss": 0.1426, "rewards/accuracies": 0.875, "rewards/chosen": -0.12280750274658203, "rewards/margins": 0.3464260697364807, "rewards/rejected": -0.46923360228538513, "step": 870 }, { "epoch": 0.34, "learning_rate": 4.2043743279412676e-06, "logits/chosen": -1.4641296863555908, "logits/rejected": -0.7659576535224915, "logps/chosen": -746.824462890625, "logps/rejected": -1713.807861328125, "loss": 0.1634, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12080003321170807, "rewards/margins": 0.32014593482017517, "rewards/rejected": -0.44094595313072205, "step": 880 }, { "epoch": 0.34, "learning_rate": 4.17989804713198e-06, "logits/chosen": -1.1899983882904053, "logits/rejected": -0.48646849393844604, "logps/chosen": -663.8396606445312, "logps/rejected": -1579.03076171875, "loss": 0.1892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.059574615210294724, "rewards/margins": 0.23942884802818298, "rewards/rejected": -0.2990034520626068, "step": 890 }, { "epoch": 0.34, "learning_rate": 4.155124588441713e-06, "logits/chosen": -1.3385188579559326, "logits/rejected": -0.27449753880500793, "logps/chosen": -800.9932861328125, "logps/rejected": -1861.008544921875, "loss": 0.1363, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0702655091881752, "rewards/margins": 0.40770387649536133, "rewards/rejected": -0.47796934843063354, "step": 900 }, { "epoch": 0.35, "learning_rate": 4.130058334353063e-06, "logits/chosen": -1.3284122943878174, "logits/rejected": -0.28693780303001404, "logps/chosen": -626.9280395507812, "logps/rejected": -1514.298828125, "loss": 0.1639, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0630309209227562, "rewards/margins": 0.3169647455215454, "rewards/rejected": -0.3799956738948822, "step": 910 }, { "epoch": 0.35, "learning_rate": 4.1047037191448175e-06, "logits/chosen": -1.4023081064224243, "logits/rejected": -0.6370285153388977, "logps/chosen": -712.2186889648438, "logps/rejected": -1769.7369384765625, "loss": 0.1192, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08080119639635086, "rewards/margins": 0.3465597629547119, "rewards/rejected": -0.4273609519004822, "step": 920 }, { "epoch": 0.35, "learning_rate": 4.0790652281075166e-06, "logits/chosen": -1.463370680809021, "logits/rejected": -0.5760782361030579, "logps/chosen": -721.4664916992188, "logps/rejected": -1616.5904541015625, "loss": 0.1516, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.05719956010580063, "rewards/margins": 0.2681477963924408, "rewards/rejected": -0.325347363948822, "step": 930 }, { "epoch": 0.36, "learning_rate": 4.0531473967499975e-06, "logits/chosen": -1.4497969150543213, "logits/rejected": -0.3674037754535675, "logps/chosen": -723.52490234375, "logps/rejected": -1600.862548828125, "loss": 0.1626, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05653859302401543, "rewards/margins": 0.26881542801856995, "rewards/rejected": -0.3253540098667145, "step": 940 }, { "epoch": 0.36, "learning_rate": 4.026954809997051e-06, "logits/chosen": -1.3658907413482666, "logits/rejected": -0.4854092597961426, "logps/chosen": -604.777099609375, "logps/rejected": -1575.8779296875, "loss": 0.1322, "rewards/accuracies": 0.875, "rewards/chosen": -0.04493629187345505, "rewards/margins": 0.2772950530052185, "rewards/rejected": -0.32223135232925415, "step": 950 }, { "epoch": 0.37, "learning_rate": 4.000492101378339e-06, "logits/chosen": -1.0287444591522217, "logits/rejected": -0.4628511071205139, "logps/chosen": -731.2538452148438, "logps/rejected": -1616.0206298828125, "loss": 0.277, "rewards/accuracies": 0.75, "rewards/chosen": -0.11523719131946564, "rewards/margins": 0.24555626511573792, "rewards/rejected": -0.36079344153404236, "step": 960 }, { "epoch": 0.37, "learning_rate": 3.97376395220871e-06, "logits/chosen": -1.3218439817428589, "logits/rejected": 0.45043665170669556, "logps/chosen": -829.5682373046875, "logps/rejected": -1803.4990234375, "loss": 0.2206, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15352213382720947, "rewards/margins": 0.2729176878929138, "rewards/rejected": -0.4264398515224457, "step": 970 }, { "epoch": 0.37, "learning_rate": 3.946775090760064e-06, "logits/chosen": -1.237532377243042, "logits/rejected": -0.15372852981090546, "logps/chosen": -677.3379516601562, "logps/rejected": -1583.31640625, "loss": 0.1403, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.08809792995452881, "rewards/margins": 0.3064519166946411, "rewards/rejected": -0.3945498466491699, "step": 980 }, { "epoch": 0.38, "learning_rate": 3.919530291424913e-06, "logits/chosen": -1.4484431743621826, "logits/rejected": -0.18236878514289856, "logps/chosen": -668.5921630859375, "logps/rejected": -1779.682373046875, "loss": 0.1723, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.052390240132808685, "rewards/margins": 0.3086095452308655, "rewards/rejected": -0.3609997630119324, "step": 990 }, { "epoch": 0.38, "learning_rate": 3.892034373871775e-06, "logits/chosen": -1.4016462564468384, "logits/rejected": -0.3728039562702179, "logps/chosen": -540.1661376953125, "logps/rejected": -1441.755859375, "loss": 0.1964, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.03943265229463577, "rewards/margins": 0.22541245818138123, "rewards/rejected": -0.2648451030254364, "step": 1000 }, { "epoch": 0.38, "learning_rate": 3.8642922021925694e-06, "logits/chosen": -1.4675654172897339, "logits/rejected": -0.6075834035873413, "logps/chosen": -699.6939086914062, "logps/rejected": -1622.045166015625, "loss": 0.1948, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0991956889629364, "rewards/margins": 0.2816307544708252, "rewards/rejected": -0.3808264434337616, "step": 1010 }, { "epoch": 0.39, "learning_rate": 3.836308684042143e-06, "logits/chosen": -1.3532975912094116, "logits/rejected": -0.34337225556373596, "logps/chosen": -714.5763549804688, "logps/rejected": -1901.723388671875, "loss": 0.1273, "rewards/accuracies": 0.875, "rewards/chosen": -0.1728592813014984, "rewards/margins": 0.3996080458164215, "rewards/rejected": -0.5724672675132751, "step": 1020 }, { "epoch": 0.39, "learning_rate": 3.8080887697700963e-06, "logits/chosen": -0.9772112965583801, "logits/rejected": 0.18120181560516357, "logps/chosen": -643.1815185546875, "logps/rejected": -1652.1116943359375, "loss": 0.1912, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.12446007877588272, "rewards/margins": 0.30697235465049744, "rewards/rejected": -0.4314323961734772, "step": 1030 }, { "epoch": 0.4, "learning_rate": 3.7796374515450545e-06, "logits/chosen": -0.9035671353340149, "logits/rejected": -0.06956671178340912, "logps/chosen": -612.6896362304688, "logps/rejected": -1553.434326171875, "loss": 0.1859, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11754368245601654, "rewards/margins": 0.3055102229118347, "rewards/rejected": -0.42305389046669006, "step": 1040 }, { "epoch": 0.4, "learning_rate": 3.750959762471542e-06, "logits/chosen": -1.5567461252212524, "logits/rejected": -0.7666939496994019, "logps/chosen": -637.8421020507812, "logps/rejected": -1684.463134765625, "loss": 0.1342, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.07764432579278946, "rewards/margins": 0.31340324878692627, "rewards/rejected": -0.39104756712913513, "step": 1050 }, { "epoch": 0.4, "learning_rate": 3.7220607756996137e-06, "logits/chosen": -1.0380656719207764, "logits/rejected": 0.18184402585029602, "logps/chosen": -665.8292846679688, "logps/rejected": -1527.458740234375, "loss": 0.1849, "rewards/accuracies": 0.75, "rewards/chosen": -0.07235832512378693, "rewards/margins": 0.2235848605632782, "rewards/rejected": -0.2959432005882263, "step": 1060 }, { "epoch": 0.41, "learning_rate": 3.6929456035274036e-06, "logits/chosen": -1.5165836811065674, "logits/rejected": -0.8381372690200806, "logps/chosen": -554.2527465820312, "logps/rejected": -1489.83740234375, "loss": 0.17, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.05788055807352066, "rewards/margins": 0.24253058433532715, "rewards/rejected": -0.3004111349582672, "step": 1070 }, { "epoch": 0.41, "learning_rate": 3.663619396496746e-06, "logits/chosen": -1.385711431503296, "logits/rejected": -0.424043744802475, "logps/chosen": -681.849853515625, "logps/rejected": -1655.5491943359375, "loss": 0.158, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10035707801580429, "rewards/margins": 0.351849228143692, "rewards/rejected": -0.4522063136100769, "step": 1080 }, { "epoch": 0.42, "learning_rate": 3.6340873424820355e-06, "logits/chosen": -1.0970741510391235, "logits/rejected": -0.5975388884544373, "logps/chosen": -675.5836181640625, "logps/rejected": -1579.396240234375, "loss": 0.1912, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.14220666885375977, "rewards/margins": 0.3026614487171173, "rewards/rejected": -0.44486814737319946, "step": 1090 }, { "epoch": 0.42, "learning_rate": 3.604354665772477e-06, "logits/chosen": -1.496716022491455, "logits/rejected": -0.051234085112810135, "logps/chosen": -862.8018798828125, "logps/rejected": -1910.465087890625, "loss": 0.1672, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.18626311421394348, "rewards/margins": 0.3734392523765564, "rewards/rejected": -0.5597023367881775, "step": 1100 }, { "epoch": 0.42, "learning_rate": 3.574426626147898e-06, "logits/chosen": -1.1676304340362549, "logits/rejected": -0.09432904422283173, "logps/chosen": -775.7552490234375, "logps/rejected": -1548.7041015625, "loss": 0.2185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1587100625038147, "rewards/margins": 0.239671990275383, "rewards/rejected": -0.3983820080757141, "step": 1110 }, { "epoch": 0.43, "learning_rate": 3.5443085179482805e-06, "logits/chosen": -1.202911138534546, "logits/rejected": -0.15514256060123444, "logps/chosen": -688.68701171875, "logps/rejected": -1669.8521728515625, "loss": 0.1406, "rewards/accuracies": 0.875, "rewards/chosen": -0.0728377178311348, "rewards/margins": 0.2820171117782593, "rewards/rejected": -0.3548548221588135, "step": 1120 }, { "epoch": 0.43, "learning_rate": 3.5140056691371815e-06, "logits/chosen": -1.38773775100708, "logits/rejected": -0.2697985768318176, "logps/chosen": -623.3013916015625, "logps/rejected": -1472.1282958984375, "loss": 0.2088, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09578903019428253, "rewards/margins": 0.2587495446205139, "rewards/rejected": -0.3545385003089905, "step": 1130 }, { "epoch": 0.43, "learning_rate": 3.4835234403592018e-06, "logits/chosen": -1.4611411094665527, "logits/rejected": -0.7454389333724976, "logps/chosen": -629.8526611328125, "logps/rejected": -1794.346923828125, "loss": 0.1042, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09019435197114944, "rewards/margins": 0.4334816336631775, "rewards/rejected": -0.5236759781837463, "step": 1140 }, { "epoch": 0.44, "learning_rate": 3.4528672239916776e-06, "logits/chosen": -1.191080093383789, "logits/rejected": 0.01841779612004757, "logps/chosen": -734.9410400390625, "logps/rejected": -1738.0103759765625, "loss": 0.1239, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10190373659133911, "rewards/margins": 0.37488192319869995, "rewards/rejected": -0.47678565979003906, "step": 1150 }, { "epoch": 0.44, "learning_rate": 3.4220424431907556e-06, "logits/chosen": -1.0855127573013306, "logits/rejected": -0.08657385408878326, "logps/chosen": -727.0004272460938, "logps/rejected": -1627.780029296875, "loss": 0.2036, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13039125502109528, "rewards/margins": 0.2706908881664276, "rewards/rejected": -0.40108218789100647, "step": 1160 }, { "epoch": 0.45, "learning_rate": 3.3910545509320227e-06, "logits/chosen": -1.2731348276138306, "logits/rejected": -0.13660672307014465, "logps/chosen": -758.6027221679688, "logps/rejected": -1823.713134765625, "loss": 0.1682, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16975989937782288, "rewards/margins": 0.3468281924724579, "rewards/rejected": -0.5165880918502808, "step": 1170 }, { "epoch": 0.45, "learning_rate": 3.3599090290458624e-06, "logits/chosen": -1.2526190280914307, "logits/rejected": -0.37779682874679565, "logps/chosen": -642.5097045898438, "logps/rejected": -1594.8338623046875, "loss": 0.1651, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09089592844247818, "rewards/margins": 0.3324505686759949, "rewards/rejected": -0.42334645986557007, "step": 1180 }, { "epoch": 0.45, "learning_rate": 3.328611387247709e-06, "logits/chosen": -1.2133219242095947, "logits/rejected": -0.03537008911371231, "logps/chosen": -676.9924926757812, "logps/rejected": -1744.3121337890625, "loss": 0.1374, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09459026157855988, "rewards/margins": 0.3284613788127899, "rewards/rejected": -0.423051655292511, "step": 1190 }, { "epoch": 0.46, "learning_rate": 3.297167162163362e-06, "logits/chosen": -1.396259069442749, "logits/rejected": -0.7233491539955139, "logps/chosen": -651.3782958984375, "logps/rejected": -1565.4619140625, "loss": 0.2218, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07348114252090454, "rewards/margins": 0.28616175055503845, "rewards/rejected": -0.359642893075943, "step": 1200 }, { "epoch": 0.46, "learning_rate": 3.265581916349546e-06, "logits/chosen": -1.1477538347244263, "logits/rejected": -0.6080346703529358, "logps/chosen": -664.5640869140625, "logps/rejected": -1713.1175537109375, "loss": 0.1719, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.13420571386814117, "rewards/margins": 0.31483012437820435, "rewards/rejected": -0.4490358233451843, "step": 1210 }, { "epoch": 0.46, "learning_rate": 3.2338612373098827e-06, "logits/chosen": -1.2792946100234985, "logits/rejected": -0.1775778830051422, "logps/chosen": -683.7018432617188, "logps/rejected": -1671.712646484375, "loss": 0.179, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09147936850786209, "rewards/margins": 0.32306018471717834, "rewards/rejected": -0.4145395755767822, "step": 1220 }, { "epoch": 0.47, "learning_rate": 3.202010736506447e-06, "logits/chosen": -1.4471147060394287, "logits/rejected": -0.19848565757274628, "logps/chosen": -663.111572265625, "logps/rejected": -1640.63671875, "loss": 0.1357, "rewards/accuracies": 0.875, "rewards/chosen": -0.03837443143129349, "rewards/margins": 0.2893136143684387, "rewards/rejected": -0.3276880383491516, "step": 1230 }, { "epoch": 0.47, "learning_rate": 3.1700360483670846e-06, "logits/chosen": -1.457140564918518, "logits/rejected": 0.09290559589862823, "logps/chosen": -592.1260375976562, "logps/rejected": -1410.0513916015625, "loss": 0.1862, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.013573979958891869, "rewards/margins": 0.23258109390735626, "rewards/rejected": -0.24615509808063507, "step": 1240 }, { "epoch": 0.48, "learning_rate": 3.1379428292886717e-06, "logits/chosen": -0.9757415652275085, "logits/rejected": -0.4595464766025543, "logps/chosen": -843.9957275390625, "logps/rejected": -1754.7308349609375, "loss": 0.1976, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.18087983131408691, "rewards/margins": 0.2553021013736725, "rewards/rejected": -0.4361819326877594, "step": 1250 }, { "epoch": 0.48, "learning_rate": 3.1057367566364857e-06, "logits/chosen": -1.5113880634307861, "logits/rejected": 0.19634008407592773, "logps/chosen": -799.1556396484375, "logps/rejected": -1759.4019775390625, "loss": 0.1768, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1623387187719345, "rewards/margins": 0.3321741819381714, "rewards/rejected": -0.4945129454135895, "step": 1260 }, { "epoch": 0.48, "learning_rate": 3.073423527739862e-06, "logits/chosen": -1.3205205202102661, "logits/rejected": -0.07886432111263275, "logps/chosen": -624.7160034179688, "logps/rejected": -1630.43212890625, "loss": 0.1308, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13049890100955963, "rewards/margins": 0.34211310744285583, "rewards/rejected": -0.47261205315589905, "step": 1270 }, { "epoch": 0.49, "learning_rate": 3.0410088588843256e-06, "logits/chosen": -1.3378050327301025, "logits/rejected": -1.1143252849578857, "logps/chosen": -587.5389404296875, "logps/rejected": -1599.2294921875, "loss": 0.1629, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12640805542469025, "rewards/margins": 0.3628181219100952, "rewards/rejected": -0.48922616243362427, "step": 1280 }, { "epoch": 0.49, "learning_rate": 3.0084984843003673e-06, "logits/chosen": -1.3381378650665283, "logits/rejected": -0.5265523195266724, "logps/chosen": -723.5072021484375, "logps/rejected": -1706.4879150390625, "loss": 0.226, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1913474053144455, "rewards/margins": 0.2739107310771942, "rewards/rejected": -0.4652581214904785, "step": 1290 }, { "epoch": 0.5, "learning_rate": 2.975898155149044e-06, "logits/chosen": -1.42616605758667, "logits/rejected": -0.5659226179122925, "logps/chosen": -808.23681640625, "logps/rejected": -1759.300048828125, "loss": 0.1548, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.18762359023094177, "rewards/margins": 0.27777546644210815, "rewards/rejected": -0.46539902687072754, "step": 1300 }, { "epoch": 0.5, "learning_rate": 2.943213638504586e-06, "logits/chosen": -1.283149242401123, "logits/rejected": -0.29603224992752075, "logps/chosen": -659.40087890625, "logps/rejected": -1833.444091796875, "loss": 0.1537, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12946954369544983, "rewards/margins": 0.33892717957496643, "rewards/rejected": -0.46839675307273865, "step": 1310 }, { "epoch": 0.5, "learning_rate": 2.910450716334188e-06, "logits/chosen": -1.4800167083740234, "logits/rejected": -0.18855571746826172, "logps/chosen": -684.8777465820312, "logps/rejected": -1908.510986328125, "loss": 0.1421, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11518852412700653, "rewards/margins": 0.39856284856796265, "rewards/rejected": -0.5137513875961304, "step": 1320 }, { "epoch": 0.51, "learning_rate": 2.8776151844751633e-06, "logits/chosen": -1.28926420211792, "logits/rejected": -0.5672773718833923, "logps/chosen": -674.1381225585938, "logps/rejected": -1578.6949462890625, "loss": 0.1436, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08385200798511505, "rewards/margins": 0.26598161458969116, "rewards/rejected": -0.3498336672782898, "step": 1330 }, { "epoch": 0.51, "learning_rate": 2.844712851609648e-06, "logits/chosen": -1.4831870794296265, "logits/rejected": 0.5175789594650269, "logps/chosen": -621.2581176757812, "logps/rejected": -1680.9501953125, "loss": 0.1496, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.033650465309619904, "rewards/margins": 0.32731929421424866, "rewards/rejected": -0.36096978187561035, "step": 1340 }, { "epoch": 0.51, "learning_rate": 2.811749538237034e-06, "logits/chosen": -1.1817986965179443, "logits/rejected": -0.7003868222236633, "logps/chosen": -514.7432861328125, "logps/rejected": -1461.7861328125, "loss": 0.1739, "rewards/accuracies": 0.75, "rewards/chosen": -0.07194347679615021, "rewards/margins": 0.28546491265296936, "rewards/rejected": -0.3574083745479584, "step": 1350 }, { "epoch": 0.52, "learning_rate": 2.7787310756443066e-06, "logits/chosen": -1.3616505861282349, "logits/rejected": -0.5461617112159729, "logps/chosen": -670.5218505859375, "logps/rejected": -1585.295166015625, "loss": 0.1635, "rewards/accuracies": 0.875, "rewards/chosen": -0.09263516962528229, "rewards/margins": 0.2952800989151001, "rewards/rejected": -0.3879152834415436, "step": 1360 }, { "epoch": 0.52, "learning_rate": 2.7456633048744752e-06, "logits/chosen": -1.318484902381897, "logits/rejected": -0.3666155934333801, "logps/chosen": -709.5248413085938, "logps/rejected": -1748.4302978515625, "loss": 0.1562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10941965878009796, "rewards/margins": 0.3524025082588196, "rewards/rejected": -0.46182212233543396, "step": 1370 }, { "epoch": 0.53, "learning_rate": 2.7125520756932833e-06, "logits/chosen": -1.5628801584243774, "logits/rejected": -0.46112918853759766, "logps/chosen": -728.0285034179688, "logps/rejected": -1538.0501708984375, "loss": 0.2535, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.07505785673856735, "rewards/margins": 0.2690279185771942, "rewards/rejected": -0.34408581256866455, "step": 1380 }, { "epoch": 0.53, "learning_rate": 2.679403245554366e-06, "logits/chosen": -1.3699464797973633, "logits/rejected": -0.8164985775947571, "logps/chosen": -667.3021240234375, "logps/rejected": -1651.1578369140625, "loss": 0.1686, "rewards/accuracies": 0.875, "rewards/chosen": -0.04789549112319946, "rewards/margins": 0.28630372881889343, "rewards/rejected": -0.3341991901397705, "step": 1390 }, { "epoch": 0.53, "learning_rate": 2.646222678563057e-06, "logits/chosen": -1.5558125972747803, "logits/rejected": 0.3179488778114319, "logps/chosen": -730.3712158203125, "logps/rejected": -1707.983642578125, "loss": 0.127, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08006377518177032, "rewards/margins": 0.3021152913570404, "rewards/rejected": -0.3821790814399719, "step": 1400 }, { "epoch": 0.54, "learning_rate": 2.613016244439013e-06, "logits/chosen": -1.4453147649765015, "logits/rejected": -0.6497384905815125, "logps/chosen": -670.798095703125, "logps/rejected": -1444.1165771484375, "loss": 0.2594, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.08068539202213287, "rewards/margins": 0.2475142925977707, "rewards/rejected": -0.32819968461990356, "step": 1410 }, { "epoch": 0.54, "learning_rate": 2.579789817477846e-06, "logits/chosen": -1.4187909364700317, "logits/rejected": 0.15132123231887817, "logps/chosen": -694.3150024414062, "logps/rejected": -1665.516845703125, "loss": 0.1486, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.05452340096235275, "rewards/margins": 0.288423091173172, "rewards/rejected": -0.34294646978378296, "step": 1420 }, { "epoch": 0.54, "learning_rate": 2.54654927551195e-06, "logits/chosen": -1.162477970123291, "logits/rejected": -0.5448390245437622, "logps/chosen": -576.2252197265625, "logps/rejected": -1469.6390380859375, "loss": 0.1902, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0797332376241684, "rewards/margins": 0.2568422257900238, "rewards/rejected": -0.3365754783153534, "step": 1430 }, { "epoch": 0.55, "learning_rate": 2.5133004988706915e-06, "logits/chosen": -1.3508780002593994, "logits/rejected": -0.2575222849845886, "logps/chosen": -711.7572021484375, "logps/rejected": -1815.9508056640625, "loss": 0.1221, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11810201406478882, "rewards/margins": 0.3641887605190277, "rewards/rejected": -0.4822908043861389, "step": 1440 }, { "epoch": 0.55, "learning_rate": 2.4800493693401754e-06, "logits/chosen": -1.2100986242294312, "logits/rejected": -0.16991616785526276, "logps/chosen": -742.0198364257812, "logps/rejected": -1741.579833984375, "loss": 0.1322, "rewards/accuracies": 0.875, "rewards/chosen": -0.15465518832206726, "rewards/margins": 0.3196510374546051, "rewards/rejected": -0.47430619597435, "step": 1450 }, { "epoch": 0.56, "learning_rate": 2.446801769122734e-06, "logits/chosen": -1.2211730480194092, "logits/rejected": 0.38833457231521606, "logps/chosen": -654.7449951171875, "logps/rejected": -1712.729248046875, "loss": 0.1407, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11765112727880478, "rewards/margins": 0.3435496389865875, "rewards/rejected": -0.4612007141113281, "step": 1460 }, { "epoch": 0.56, "learning_rate": 2.4135635797963594e-06, "logits/chosen": -1.5404462814331055, "logits/rejected": -0.052362293004989624, "logps/chosen": -763.3060913085938, "logps/rejected": -1810.395751953125, "loss": 0.1019, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.10972901433706284, "rewards/margins": 0.37772834300994873, "rewards/rejected": -0.48745736479759216, "step": 1470 }, { "epoch": 0.56, "learning_rate": 2.3803406812742332e-06, "logits/chosen": -1.3496084213256836, "logits/rejected": 0.013677084818482399, "logps/chosen": -786.0018310546875, "logps/rejected": -1808.4107666015625, "loss": 0.1378, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.12588255107402802, "rewards/margins": 0.36181893944740295, "rewards/rejected": -0.4877014756202698, "step": 1480 }, { "epoch": 0.57, "learning_rate": 2.347138950764558e-06, "logits/chosen": -1.3037679195404053, "logits/rejected": -0.5650864839553833, "logps/chosen": -703.7874145507812, "logps/rejected": -1737.054931640625, "loss": 0.1645, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08934299647808075, "rewards/margins": 0.3699977993965149, "rewards/rejected": -0.45934081077575684, "step": 1490 }, { "epoch": 0.57, "learning_rate": 2.3139642617308667e-06, "logits/chosen": -1.4848829507827759, "logits/rejected": -0.7195728421211243, "logps/chosen": -734.0852661132812, "logps/rejected": -1579.29150390625, "loss": 0.1878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07708898931741714, "rewards/margins": 0.2716900110244751, "rewards/rejected": -0.34877896308898926, "step": 1500 }, { "epoch": 0.58, "learning_rate": 2.2808224828529913e-06, "logits/chosen": -1.3574950695037842, "logits/rejected": -0.41164666414260864, "logps/chosen": -685.7807006835938, "logps/rejected": -1724.122802734375, "loss": 0.1612, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08044632524251938, "rewards/margins": 0.31947189569473267, "rewards/rejected": -0.39991822838783264, "step": 1510 }, { "epoch": 0.58, "learning_rate": 2.24771947698888e-06, "logits/chosen": -1.4472054243087769, "logits/rejected": -0.6798117756843567, "logps/chosen": -663.0281982421875, "logps/rejected": -1746.3392333984375, "loss": 0.1531, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09853185713291168, "rewards/margins": 0.388458788394928, "rewards/rejected": -0.48699063062667847, "step": 1520 }, { "epoch": 0.58, "learning_rate": 2.214661100137445e-06, "logits/chosen": -1.3750660419464111, "logits/rejected": -0.6010938286781311, "logps/chosen": -742.4952392578125, "logps/rejected": -1725.106689453125, "loss": 0.1603, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1297096461057663, "rewards/margins": 0.327436625957489, "rewards/rejected": -0.4571463167667389, "step": 1530 }, { "epoch": 0.59, "learning_rate": 2.1816532004026234e-06, "logits/chosen": -1.0516839027404785, "logits/rejected": 0.1618213802576065, "logps/chosen": -727.3853759765625, "logps/rejected": -1619.0814208984375, "loss": 0.1646, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11544345319271088, "rewards/margins": 0.2913517355918884, "rewards/rejected": -0.4067951738834381, "step": 1540 }, { "epoch": 0.59, "learning_rate": 2.1487016169588305e-06, "logits/chosen": -1.34139883518219, "logits/rejected": -0.5810695886611938, "logps/chosen": -854.8800659179688, "logps/rejected": -1915.371337890625, "loss": 0.105, "rewards/accuracies": 0.875, "rewards/chosen": -0.12121065706014633, "rewards/margins": 0.3828410506248474, "rewards/rejected": -0.504051685333252, "step": 1550 }, { "epoch": 0.59, "learning_rate": 2.115812179018e-06, "logits/chosen": -1.4813458919525146, "logits/rejected": -0.22126176953315735, "logps/chosen": -745.7259521484375, "logps/rejected": -1674.12109375, "loss": 0.1873, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09153054654598236, "rewards/margins": 0.3230289816856384, "rewards/rejected": -0.4145595133304596, "step": 1560 }, { "epoch": 0.6, "learning_rate": 2.08299070479838e-06, "logits/chosen": -1.4628018140792847, "logits/rejected": -0.11371966451406479, "logps/chosen": -713.2190551757812, "logps/rejected": -1602.038330078125, "loss": 0.1749, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.08934558928012848, "rewards/margins": 0.2611418068408966, "rewards/rejected": -0.3504873514175415, "step": 1570 }, { "epoch": 0.6, "learning_rate": 2.0502430004952796e-06, "logits/chosen": -1.543895959854126, "logits/rejected": -0.3139607012271881, "logps/chosen": -829.5701293945312, "logps/rejected": -1755.7701416015625, "loss": 0.1463, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.09140854328870773, "rewards/margins": 0.3310515582561493, "rewards/rejected": -0.42246007919311523, "step": 1580 }, { "epoch": 0.61, "learning_rate": 2.0175748592539353e-06, "logits/chosen": -1.2287932634353638, "logits/rejected": 0.2602895200252533, "logps/chosen": -756.0363159179688, "logps/rejected": -1771.6744384765625, "loss": 0.1395, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.07528454065322876, "rewards/margins": 0.3804745376110077, "rewards/rejected": -0.45575910806655884, "step": 1590 }, { "epoch": 0.61, "learning_rate": 1.9849920601446945e-06, "logits/chosen": -1.2234537601470947, "logits/rejected": -0.34282106161117554, "logps/chosen": -701.86962890625, "logps/rejected": -1703.814453125, "loss": 0.1506, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.07677377760410309, "rewards/margins": 0.3593098819255829, "rewards/rejected": -0.4360836446285248, "step": 1600 }, { "epoch": 0.61, "learning_rate": 1.952500367140682e-06, "logits/chosen": -1.2301862239837646, "logits/rejected": -0.09375803172588348, "logps/chosen": -598.0748291015625, "logps/rejected": -1759.71875, "loss": 0.1046, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.06393888592720032, "rewards/margins": 0.43143805861473083, "rewards/rejected": -0.49537691473960876, "step": 1610 }, { "epoch": 0.62, "learning_rate": 1.92010552809814e-06, "logits/chosen": -1.3494677543640137, "logits/rejected": 0.08366771787405014, "logps/chosen": -675.3621826171875, "logps/rejected": -1539.6778564453125, "loss": 0.1346, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08343581110239029, "rewards/margins": 0.2974902093410492, "rewards/rejected": -0.3809260129928589, "step": 1620 }, { "epoch": 0.62, "learning_rate": 1.8878132737396183e-06, "logits/chosen": -1.2662420272827148, "logits/rejected": -0.46429747343063354, "logps/chosen": -592.8118896484375, "logps/rejected": -1795.30859375, "loss": 0.1126, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.08287428319454193, "rewards/margins": 0.4036792814731598, "rewards/rejected": -0.48655351996421814, "step": 1630 }, { "epoch": 0.62, "learning_rate": 1.855629316640199e-06, "logits/chosen": -1.1289461851119995, "logits/rejected": -0.4695679247379303, "logps/chosen": -728.015625, "logps/rejected": -1607.047607421875, "loss": 0.13, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09498603641986847, "rewards/margins": 0.29786354303359985, "rewards/rejected": -0.3928496241569519, "step": 1640 }, { "epoch": 0.63, "learning_rate": 1.823559350216924e-06, "logits/chosen": -1.2158567905426025, "logits/rejected": 0.4376288950443268, "logps/chosen": -793.0543823242188, "logps/rejected": -1689.7529296875, "loss": 0.1543, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10590417683124542, "rewards/margins": 0.29491281509399414, "rewards/rejected": -0.40081700682640076, "step": 1650 }, { "epoch": 0.63, "learning_rate": 1.7916090477216208e-06, "logits/chosen": -1.0943143367767334, "logits/rejected": -0.005443835165351629, "logps/chosen": -680.1480712890625, "logps/rejected": -1681.1956787109375, "loss": 0.1292, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.0522083044052124, "rewards/margins": 0.33797913789749146, "rewards/rejected": -0.39018744230270386, "step": 1660 }, { "epoch": 0.64, "learning_rate": 1.7597840612372882e-06, "logits/chosen": -1.2401732206344604, "logits/rejected": -0.34577831625938416, "logps/chosen": -615.2108764648438, "logps/rejected": -1627.213623046875, "loss": 0.1195, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05474289506673813, "rewards/margins": 0.3130968511104584, "rewards/rejected": -0.3678397536277771, "step": 1670 }, { "epoch": 0.64, "learning_rate": 1.7280900206782307e-06, "logits/chosen": -1.5008598566055298, "logits/rejected": -0.43634098768234253, "logps/chosen": -723.1873168945312, "logps/rejected": -1771.3841552734375, "loss": 0.15, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08210828900337219, "rewards/margins": 0.3466632068157196, "rewards/rejected": -0.4287714958190918, "step": 1680 }, { "epoch": 0.64, "learning_rate": 1.696532532794113e-06, "logits/chosen": -1.3174114227294922, "logits/rejected": -0.3240479528903961, "logps/chosen": -700.016845703125, "logps/rejected": -1568.8504638671875, "loss": 0.2067, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12464163452386856, "rewards/margins": 0.25538191199302673, "rewards/rejected": -0.3800235390663147, "step": 1690 }, { "epoch": 0.65, "learning_rate": 1.6651171801781158e-06, "logits/chosen": -1.3929417133331299, "logits/rejected": -0.04801623150706291, "logps/chosen": -696.4129638671875, "logps/rejected": -1759.968017578125, "loss": 0.1405, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.10133171081542969, "rewards/margins": 0.35232028365135193, "rewards/rejected": -0.4536519944667816, "step": 1700 }, { "epoch": 0.65, "learning_rate": 1.6338495202793542e-06, "logits/chosen": -1.298825979232788, "logits/rejected": -0.4632337987422943, "logps/chosen": -552.8858642578125, "logps/rejected": -1541.509033203125, "loss": 0.1239, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.06905023008584976, "rewards/margins": 0.3324562609195709, "rewards/rejected": -0.4015064835548401, "step": 1710 }, { "epoch": 0.66, "learning_rate": 1.6027350844197615e-06, "logits/chosen": -1.4130520820617676, "logits/rejected": -0.06178497150540352, "logps/chosen": -737.1648559570312, "logps/rejected": -1813.991943359375, "loss": 0.1269, "rewards/accuracies": 0.875, "rewards/chosen": -0.11935298144817352, "rewards/margins": 0.3370525538921356, "rewards/rejected": -0.45640549063682556, "step": 1720 }, { "epoch": 0.66, "learning_rate": 1.5717793768155804e-06, "logits/chosen": -1.5521743297576904, "logits/rejected": -0.474079430103302, "logps/chosen": -760.5835571289062, "logps/rejected": -1911.4273681640625, "loss": 0.1314, "rewards/accuracies": 0.875, "rewards/chosen": -0.12647734582424164, "rewards/margins": 0.4650444984436035, "rewards/rejected": -0.5915218591690063, "step": 1730 }, { "epoch": 0.66, "learning_rate": 1.5409878736036546e-06, "logits/chosen": -1.3719254732131958, "logits/rejected": -0.2794768214225769, "logps/chosen": -827.5120239257812, "logps/rejected": -1812.239013671875, "loss": 0.1639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1621624082326889, "rewards/margins": 0.3491363227367401, "rewards/rejected": -0.5112987756729126, "step": 1740 }, { "epoch": 0.67, "learning_rate": 1.510366021872689e-06, "logits/chosen": -1.2999706268310547, "logits/rejected": -0.11817393451929092, "logps/chosen": -799.0403442382812, "logps/rejected": -1788.6002197265625, "loss": 0.1892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13252076506614685, "rewards/margins": 0.3233005702495575, "rewards/rejected": -0.4558214247226715, "step": 1750 }, { "epoch": 0.67, "learning_rate": 1.4799192386996457e-06, "logits/chosen": -1.417991280555725, "logits/rejected": -0.3066923916339874, "logps/chosen": -860.0372924804688, "logps/rejected": -1447.767333984375, "loss": 0.2838, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.20034471154212952, "rewards/margins": 0.1224360466003418, "rewards/rejected": -0.3227807283401489, "step": 1760 }, { "epoch": 0.67, "learning_rate": 1.449652910191448e-06, "logits/chosen": -1.3324776887893677, "logits/rejected": -0.2286255806684494, "logps/chosen": -782.1688232421875, "logps/rejected": -1766.2386474609375, "loss": 0.143, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12544932961463928, "rewards/margins": 0.32188212871551514, "rewards/rejected": -0.4473314881324768, "step": 1770 }, { "epoch": 0.68, "learning_rate": 1.4195723905321725e-06, "logits/chosen": -1.3904447555541992, "logits/rejected": -0.3608115613460541, "logps/chosen": -664.779296875, "logps/rejected": -1656.0230712890625, "loss": 0.162, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1018267497420311, "rewards/margins": 0.3426135778427124, "rewards/rejected": -0.4444403052330017, "step": 1780 }, { "epoch": 0.68, "learning_rate": 1.3896830010358753e-06, "logits/chosen": -1.2271074056625366, "logits/rejected": 0.2730652093887329, "logps/chosen": -660.5557250976562, "logps/rejected": -1640.987060546875, "loss": 0.1648, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10987261682748795, "rewards/margins": 0.3438589572906494, "rewards/rejected": -0.45373159646987915, "step": 1790 }, { "epoch": 0.69, "learning_rate": 1.3599900292052404e-06, "logits/chosen": -1.2404191493988037, "logits/rejected": 0.0616680271923542, "logps/chosen": -671.2255249023438, "logps/rejected": -1757.7509765625, "loss": 0.1213, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09178225696086884, "rewards/margins": 0.35961416363716125, "rewards/rejected": -0.45139652490615845, "step": 1800 }, { "epoch": 0.69, "learning_rate": 1.3304987277962105e-06, "logits/chosen": -1.599121332168579, "logits/rejected": -0.7104395031929016, "logps/chosen": -720.9259643554688, "logps/rejected": -1798.143310546875, "loss": 0.1367, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09743434190750122, "rewards/margins": 0.30916815996170044, "rewards/rejected": -0.4066024720668793, "step": 1810 }, { "epoch": 0.69, "learning_rate": 1.301214313888759e-06, "logits/chosen": -1.525014877319336, "logits/rejected": -0.5128569602966309, "logps/chosen": -699.74267578125, "logps/rejected": -1553.73095703125, "loss": 0.1462, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10087595880031586, "rewards/margins": 0.24198803305625916, "rewards/rejected": -0.3428639769554138, "step": 1820 }, { "epoch": 0.7, "learning_rate": 1.2721419679639793e-06, "logits/chosen": -1.4727911949157715, "logits/rejected": -0.21553997695446014, "logps/chosen": -925.6624755859375, "logps/rejected": -1859.183349609375, "loss": 0.1318, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.12055756151676178, "rewards/margins": 0.3322068750858307, "rewards/rejected": -0.45276445150375366, "step": 1830 }, { "epoch": 0.7, "learning_rate": 1.2432868329876424e-06, "logits/chosen": -1.5464954376220703, "logits/rejected": -0.5984674692153931, "logps/chosen": -797.6492919921875, "logps/rejected": -1948.8232421875, "loss": 0.1467, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.10304896533489227, "rewards/margins": 0.44831156730651855, "rewards/rejected": -0.5513604283332825, "step": 1840 }, { "epoch": 0.7, "learning_rate": 1.2146540135003976e-06, "logits/chosen": -1.273262858390808, "logits/rejected": 0.07696258276700974, "logps/chosen": -714.8951416015625, "logps/rejected": -1766.770751953125, "loss": 0.1596, "rewards/accuracies": 0.875, "rewards/chosen": -0.10553844273090363, "rewards/margins": 0.35988712310791016, "rewards/rejected": -0.4654255509376526, "step": 1850 }, { "epoch": 0.71, "learning_rate": 1.1862485747147656e-06, "logits/chosen": -1.245619297027588, "logits/rejected": -0.08681797981262207, "logps/chosen": -638.5645751953125, "logps/rejected": -1674.7666015625, "loss": 0.1116, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.054060857743024826, "rewards/margins": 0.32587510347366333, "rewards/rejected": -0.37993595004081726, "step": 1860 }, { "epoch": 0.71, "learning_rate": 1.1580755416190912e-06, "logits/chosen": -1.2976939678192139, "logits/rejected": -0.8462039828300476, "logps/chosen": -693.8250732421875, "logps/rejected": -1577.976806640625, "loss": 0.1657, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1271156370639801, "rewards/margins": 0.28418809175491333, "rewards/rejected": -0.4113037586212158, "step": 1870 }, { "epoch": 0.72, "learning_rate": 1.130139898088609e-06, "logits/chosen": -1.2483781576156616, "logits/rejected": -0.4588032364845276, "logps/chosen": -699.0128173828125, "logps/rejected": -1630.91064453125, "loss": 0.2088, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11796565353870392, "rewards/margins": 0.2625008523464203, "rewards/rejected": -0.380466490983963, "step": 1880 }, { "epoch": 0.72, "learning_rate": 1.102446586003791e-06, "logits/chosen": -1.3382247686386108, "logits/rejected": -0.6118718981742859, "logps/chosen": -832.4485473632812, "logps/rejected": -1541.2559814453125, "loss": 0.2167, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12457825988531113, "rewards/margins": 0.25009018182754517, "rewards/rejected": -0.3746684491634369, "step": 1890 }, { "epoch": 0.72, "learning_rate": 1.0750005043761042e-06, "logits/chosen": -1.2273459434509277, "logits/rejected": -0.06240634247660637, "logps/chosen": -607.1456298828125, "logps/rejected": -1608.713623046875, "loss": 0.1233, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09009061753749847, "rewards/margins": 0.3294326066970825, "rewards/rejected": -0.4195232391357422, "step": 1900 }, { "epoch": 0.73, "learning_rate": 1.0478065084813771e-06, "logits/chosen": -1.2952836751937866, "logits/rejected": -0.0880967229604721, "logps/chosen": -740.3468627929688, "logps/rejected": -1820.2017822265625, "loss": 0.1184, "rewards/accuracies": 0.875, "rewards/chosen": -0.09452497214078903, "rewards/margins": 0.3751065731048584, "rewards/rejected": -0.4696315824985504, "step": 1910 }, { "epoch": 0.73, "learning_rate": 1.0208694090008817e-06, "logits/chosen": -1.4395984411239624, "logits/rejected": 0.10438177734613419, "logps/chosen": -668.413330078125, "logps/rejected": -1532.82421875, "loss": 0.1844, "rewards/accuracies": 0.75, "rewards/chosen": -0.09522219747304916, "rewards/margins": 0.2659499943256378, "rewards/rejected": -0.3611721396446228, "step": 1920 }, { "epoch": 0.74, "learning_rate": 9.941939711703167e-07, "logits/chosen": -1.4851906299591064, "logits/rejected": -0.6851671934127808, "logps/chosen": -698.2586669921875, "logps/rejected": -1720.084716796875, "loss": 0.1619, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09314031898975372, "rewards/margins": 0.3323098123073578, "rewards/rejected": -0.4254501461982727, "step": 1930 }, { "epoch": 0.74, "learning_rate": 9.677849139368236e-07, "logits/chosen": -1.1770188808441162, "logits/rejected": -0.25014448165893555, "logps/chosen": -718.9888916015625, "logps/rejected": -1563.19140625, "loss": 0.1685, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10421274602413177, "rewards/margins": 0.2857014238834381, "rewards/rejected": -0.3899141848087311, "step": 1940 }, { "epoch": 0.74, "learning_rate": 9.416469091241967e-07, "logits/chosen": -1.304079294204712, "logits/rejected": 0.7498459815979004, "logps/chosen": -823.6525268554688, "logps/rejected": -1516.979736328125, "loss": 0.2101, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13554687798023224, "rewards/margins": 0.20679816603660583, "rewards/rejected": -0.3423450291156769, "step": 1950 }, { "epoch": 0.75, "learning_rate": 9.15784580606425e-07, "logits/chosen": -1.5646919012069702, "logits/rejected": -0.30903416872024536, "logps/chosen": -689.1824340820312, "logps/rejected": -1723.592041015625, "loss": 0.1436, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.07560104131698608, "rewards/margins": 0.35716474056243896, "rewards/rejected": -0.43276581168174744, "step": 1960 }, { "epoch": 0.75, "learning_rate": 8.902025034897157e-07, "logits/chosen": -1.6203832626342773, "logits/rejected": 0.05432802438735962, "logps/chosen": -712.8575439453125, "logps/rejected": -1714.422119140625, "loss": 0.1354, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.07794491201639175, "rewards/margins": 0.33776524662971497, "rewards/rejected": -0.41571006178855896, "step": 1970 }, { "epoch": 0.75, "learning_rate": 8.649052033031518e-07, "logits/chosen": -1.2619088888168335, "logits/rejected": -0.714330792427063, "logps/chosen": -768.6756591796875, "logps/rejected": -1666.509521484375, "loss": 0.1316, "rewards/accuracies": 0.875, "rewards/chosen": -0.10662130266427994, "rewards/margins": 0.2979509234428406, "rewards/rejected": -0.4045723080635071, "step": 1980 }, { "epoch": 0.76, "learning_rate": 8.398971551981152e-07, "logits/chosen": -1.53328537940979, "logits/rejected": -0.6478430032730103, "logps/chosen": -692.9175415039062, "logps/rejected": -1607.8665771484375, "loss": 0.1591, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08122928440570831, "rewards/margins": 0.3541563153266907, "rewards/rejected": -0.4353856146335602, "step": 1990 }, { "epoch": 0.76, "learning_rate": 8.151827831566173e-07, "logits/chosen": -1.1904656887054443, "logits/rejected": -0.07360811531543732, "logps/chosen": -824.7193603515625, "logps/rejected": -1856.769775390625, "loss": 0.1494, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11931683868169785, "rewards/margins": 0.3319847285747528, "rewards/rejected": -0.45130157470703125, "step": 2000 }, { "epoch": 0.77, "learning_rate": 7.907664592086906e-07, "logits/chosen": -1.2746717929840088, "logits/rejected": 0.19167876243591309, "logps/chosen": -826.2185668945312, "logps/rejected": -1656.9251708984375, "loss": 0.1466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12366266548633575, "rewards/margins": 0.26790809631347656, "rewards/rejected": -0.3915707468986511, "step": 2010 }, { "epoch": 0.77, "learning_rate": 7.666525026589652e-07, "logits/chosen": -1.4633296728134155, "logits/rejected": -0.4709087014198303, "logps/chosen": -727.7723388671875, "logps/rejected": -1795.431884765625, "loss": 0.1492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1102936714887619, "rewards/margins": 0.35756900906562805, "rewards/rejected": -0.46786266565322876, "step": 2020 }, { "epoch": 0.77, "learning_rate": 7.428451793225716e-07, "logits/chosen": -1.209649682044983, "logits/rejected": 0.16932383179664612, "logps/chosen": -666.8624267578125, "logps/rejected": -1579.373291015625, "loss": 0.2177, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05102403834462166, "rewards/margins": 0.28134503960609436, "rewards/rejected": -0.3323690891265869, "step": 2030 }, { "epoch": 0.78, "learning_rate": 7.193487007705122e-07, "logits/chosen": -1.6273008584976196, "logits/rejected": -0.493377149105072, "logps/chosen": -667.7854614257812, "logps/rejected": -1685.8043212890625, "loss": 0.1025, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.050732649862766266, "rewards/margins": 0.339358389377594, "rewards/rejected": -0.3900910019874573, "step": 2040 }, { "epoch": 0.78, "learning_rate": 6.961672235846212e-07, "logits/chosen": -1.3306572437286377, "logits/rejected": -0.7527714967727661, "logps/chosen": -668.8829345703125, "logps/rejected": -1745.853515625, "loss": 0.1378, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.06819947063922882, "rewards/margins": 0.3691011369228363, "rewards/rejected": -0.4373006224632263, "step": 2050 }, { "epoch": 0.78, "learning_rate": 6.733048486222574e-07, "logits/chosen": -1.4040008783340454, "logits/rejected": -0.3465643525123596, "logps/chosen": -818.2564086914062, "logps/rejected": -1872.59375, "loss": 0.1623, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.1196279376745224, "rewards/margins": 0.3104092478752136, "rewards/rejected": -0.4300372004508972, "step": 2060 }, { "epoch": 0.79, "learning_rate": 6.507656202908522e-07, "logits/chosen": -1.3825243711471558, "logits/rejected": -0.33207207918167114, "logps/chosen": -671.7288208007812, "logps/rejected": -1452.2998046875, "loss": 0.1691, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08563695847988129, "rewards/margins": 0.2635264992713928, "rewards/rejected": -0.3491634428501129, "step": 2070 }, { "epoch": 0.79, "learning_rate": 6.285535258324437e-07, "logits/chosen": -1.1362348794937134, "logits/rejected": -0.5011839866638184, "logps/chosen": -658.7559814453125, "logps/rejected": -1833.154052734375, "loss": 0.1181, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09890180081129074, "rewards/margins": 0.41942349076271057, "rewards/rejected": -0.5183252692222595, "step": 2080 }, { "epoch": 0.8, "learning_rate": 6.06672494618327e-07, "logits/chosen": -1.1703777313232422, "logits/rejected": -0.40541720390319824, "logps/chosen": -655.1098022460938, "logps/rejected": -1618.4405517578125, "loss": 0.1685, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0858469307422638, "rewards/margins": 0.3096432387828827, "rewards/rejected": -0.3954901695251465, "step": 2090 }, { "epoch": 0.8, "learning_rate": 5.851263974539354e-07, "logits/chosen": -1.3247920274734497, "logits/rejected": -0.17334061861038208, "logps/chosen": -699.2870483398438, "logps/rejected": -1762.783203125, "loss": 0.126, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09240012615919113, "rewards/margins": 0.35216832160949707, "rewards/rejected": -0.4445684552192688, "step": 2100 }, { "epoch": 0.8, "learning_rate": 5.639190458940894e-07, "logits/chosen": -1.4283260107040405, "logits/rejected": -0.08018004894256592, "logps/chosen": -779.4390869140625, "logps/rejected": -1646.017578125, "loss": 0.2144, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10010604560375214, "rewards/margins": 0.24653327465057373, "rewards/rejected": -0.34663933515548706, "step": 2110 }, { "epoch": 0.81, "learning_rate": 5.430541915687218e-07, "logits/chosen": -1.437163233757019, "logits/rejected": -0.5125199556350708, "logps/chosen": -709.0470581054688, "logps/rejected": -1737.5172119140625, "loss": 0.1266, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1090354323387146, "rewards/margins": 0.35091733932495117, "rewards/rejected": -0.4599527418613434, "step": 2120 }, { "epoch": 0.81, "learning_rate": 5.225355255192063e-07, "logits/chosen": -1.3520411252975464, "logits/rejected": -0.6261115074157715, "logps/chosen": -615.5046997070312, "logps/rejected": -1778.3017578125, "loss": 0.1262, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.08450282365083694, "rewards/margins": 0.3983950912952423, "rewards/rejected": -0.4828978478908539, "step": 2130 }, { "epoch": 0.82, "learning_rate": 5.023666775454033e-07, "logits/chosen": -1.2711622714996338, "logits/rejected": 0.205433651804924, "logps/chosen": -646.5723266601562, "logps/rejected": -1755.7763671875, "loss": 0.1269, "rewards/accuracies": 0.875, "rewards/chosen": -0.10668406635522842, "rewards/margins": 0.36221179366111755, "rewards/rejected": -0.468895822763443, "step": 2140 }, { "epoch": 0.82, "learning_rate": 4.825512155635409e-07, "logits/chosen": -1.5237255096435547, "logits/rejected": -0.41017183661460876, "logps/chosen": -753.2335205078125, "logps/rejected": -1794.0836181640625, "loss": 0.1488, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11266312748193741, "rewards/margins": 0.3188976049423218, "rewards/rejected": -0.4315606951713562, "step": 2150 }, { "epoch": 0.82, "learning_rate": 4.630926449750389e-07, "logits/chosen": -1.321341633796692, "logits/rejected": -0.45827093720436096, "logps/chosen": -621.6727294921875, "logps/rejected": -1710.5660400390625, "loss": 0.1609, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10735125839710236, "rewards/margins": 0.3593485653400421, "rewards/rejected": -0.4666997790336609, "step": 2160 }, { "epoch": 0.83, "learning_rate": 4.4399440804640036e-07, "logits/chosen": -1.3837645053863525, "logits/rejected": -0.12423048168420792, "logps/chosen": -702.0316162109375, "logps/rejected": -1616.111083984375, "loss": 0.1853, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10689805448055267, "rewards/margins": 0.3030398488044739, "rewards/rejected": -0.40993791818618774, "step": 2170 }, { "epoch": 0.83, "learning_rate": 4.25259883300263e-07, "logits/chosen": -1.4211256504058838, "logits/rejected": -0.5185990333557129, "logps/chosen": -739.6668090820312, "logps/rejected": -1575.1163330078125, "loss": 0.1892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10943174362182617, "rewards/margins": 0.26164135336875916, "rewards/rejected": -0.3710730969905853, "step": 2180 }, { "epoch": 0.83, "learning_rate": 4.068923849177306e-07, "logits/chosen": -1.4977080821990967, "logits/rejected": -0.7932590842247009, "logps/chosen": -715.7740478515625, "logps/rejected": -1722.7689208984375, "loss": 0.0896, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09469622373580933, "rewards/margins": 0.33459311723709106, "rewards/rejected": -0.4292893409729004, "step": 2190 }, { "epoch": 0.84, "learning_rate": 3.8889516215208897e-07, "logits/chosen": -1.2984775304794312, "logits/rejected": -0.6099029779434204, "logps/chosen": -672.9491577148438, "logps/rejected": -1491.906005859375, "loss": 0.1689, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07857345044612885, "rewards/margins": 0.3144298791885376, "rewards/rejected": -0.39300331473350525, "step": 2200 }, { "epoch": 0.84, "learning_rate": 3.712713987540026e-07, "logits/chosen": -1.3680397272109985, "logits/rejected": -0.05657276511192322, "logps/chosen": -681.3847045898438, "logps/rejected": -1717.935302734375, "loss": 0.1384, "rewards/accuracies": 0.875, "rewards/chosen": -0.0924479216337204, "rewards/margins": 0.3233557939529419, "rewards/rejected": -0.4158037304878235, "step": 2210 }, { "epoch": 0.85, "learning_rate": 3.540242124083046e-07, "logits/chosen": -1.396579623222351, "logits/rejected": -0.6319032907485962, "logps/chosen": -647.7759399414062, "logps/rejected": -1605.53125, "loss": 0.1358, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08058805763721466, "rewards/margins": 0.32926681637763977, "rewards/rejected": -0.40985482931137085, "step": 2220 }, { "epoch": 0.85, "learning_rate": 3.3715665418246834e-07, "logits/chosen": -1.2585084438323975, "logits/rejected": 0.1263083815574646, "logps/chosen": -674.4474487304688, "logps/rejected": -1586.1773681640625, "loss": 0.1933, "rewards/accuracies": 0.75, "rewards/chosen": -0.11296755075454712, "rewards/margins": 0.2808358371257782, "rewards/rejected": -0.3938034176826477, "step": 2230 }, { "epoch": 0.85, "learning_rate": 3.206717079868685e-07, "logits/chosen": -1.3417532444000244, "logits/rejected": 0.2919345796108246, "logps/chosen": -643.0567626953125, "logps/rejected": -1609.2138671875, "loss": 0.1436, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1068982258439064, "rewards/margins": 0.30355769395828247, "rewards/rejected": -0.4104558825492859, "step": 2240 }, { "epoch": 0.86, "learning_rate": 3.0457229004691946e-07, "logits/chosen": -1.2822325229644775, "logits/rejected": -0.5269815325737, "logps/chosen": -677.7099609375, "logps/rejected": -1543.5213623046875, "loss": 0.1599, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.07473982870578766, "rewards/margins": 0.300297349691391, "rewards/rejected": -0.37503722310066223, "step": 2250 }, { "epoch": 0.86, "learning_rate": 2.888612483871883e-07, "logits/chosen": -1.3761008977890015, "logits/rejected": -0.011561155319213867, "logps/chosen": -770.3639526367188, "logps/rejected": -1677.126220703125, "loss": 0.1477, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09979093074798584, "rewards/margins": 0.3001348376274109, "rewards/rejected": -0.39992576837539673, "step": 2260 }, { "epoch": 0.86, "learning_rate": 2.7354136232757233e-07, "logits/chosen": -1.518425464630127, "logits/rejected": -0.8384590148925781, "logps/chosen": -685.4517822265625, "logps/rejected": -1764.8919677734375, "loss": 0.1998, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08911526948213577, "rewards/margins": 0.35367995500564575, "rewards/rejected": -0.4427952170372009, "step": 2270 }, { "epoch": 0.87, "learning_rate": 2.5861534199163265e-07, "logits/chosen": -1.5102428197860718, "logits/rejected": -0.9423762559890747, "logps/chosen": -692.1110229492188, "logps/rejected": -1931.6302490234375, "loss": 0.085, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.10240081697702408, "rewards/margins": 0.40418750047683716, "rewards/rejected": -0.506588339805603, "step": 2280 }, { "epoch": 0.87, "learning_rate": 2.440858278271632e-07, "logits/chosen": -1.3964751958847046, "logits/rejected": -0.5981974601745605, "logps/chosen": -689.185546875, "logps/rejected": -1516.942138671875, "loss": 0.1605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07960820198059082, "rewards/margins": 0.3220427632331848, "rewards/rejected": -0.401650995016098, "step": 2290 }, { "epoch": 0.88, "learning_rate": 2.299553901390933e-07, "logits/chosen": -1.3468966484069824, "logits/rejected": -0.36982113122940063, "logps/chosen": -729.3480224609375, "logps/rejected": -1747.8248291015625, "loss": 0.1434, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.12135223299264908, "rewards/margins": 0.33952102065086365, "rewards/rejected": -0.46087321639060974, "step": 2300 }, { "epoch": 0.88, "learning_rate": 2.1622652863479394e-07, "logits/chosen": -1.196449637413025, "logits/rejected": -0.22112736105918884, "logps/chosen": -734.4334716796875, "logps/rejected": -1707.4638671875, "loss": 0.1248, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09437798708677292, "rewards/margins": 0.3716830611228943, "rewards/rejected": -0.4660610258579254, "step": 2310 }, { "epoch": 0.88, "learning_rate": 2.0290167198187188e-07, "logits/chosen": -1.4252989292144775, "logits/rejected": -0.07000579684972763, "logps/chosen": -687.1531982421875, "logps/rejected": -1504.78076171875, "loss": 0.1923, "rewards/accuracies": 0.75, "rewards/chosen": -0.11418702453374863, "rewards/margins": 0.2642050087451935, "rewards/rejected": -0.3783920407295227, "step": 2320 }, { "epoch": 0.89, "learning_rate": 1.8998317737853407e-07, "logits/chosen": -1.3490269184112549, "logits/rejected": -0.5536440014839172, "logps/chosen": -738.6914672851562, "logps/rejected": -1782.6478271484375, "loss": 0.1594, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1122765988111496, "rewards/margins": 0.36197370290756226, "rewards/rejected": -0.47425025701522827, "step": 2330 }, { "epoch": 0.89, "learning_rate": 1.7747333013659356e-07, "logits/chosen": -1.4544187784194946, "logits/rejected": -0.5661396980285645, "logps/chosen": -614.9838256835938, "logps/rejected": -1560.7384033203125, "loss": 0.1562, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09549121558666229, "rewards/margins": 0.34782588481903076, "rewards/rejected": -0.44331711530685425, "step": 2340 }, { "epoch": 0.9, "learning_rate": 1.653743432771915e-07, "logits/chosen": -1.3903831243515015, "logits/rejected": -0.644793689250946, "logps/chosen": -683.1171264648438, "logps/rejected": -1818.9449462890625, "loss": 0.1654, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09501717984676361, "rewards/margins": 0.38627177476882935, "rewards/rejected": -0.48128899931907654, "step": 2350 }, { "epoch": 0.9, "learning_rate": 1.5368835713931108e-07, "logits/chosen": -1.4099493026733398, "logits/rejected": -0.46979403495788574, "logps/chosen": -693.54638671875, "logps/rejected": -1541.280517578125, "loss": 0.2439, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.09285475313663483, "rewards/margins": 0.28271645307540894, "rewards/rejected": -0.37557122111320496, "step": 2360 }, { "epoch": 0.9, "learning_rate": 1.424174390011443e-07, "logits/chosen": -1.4735386371612549, "logits/rejected": -0.1706770956516266, "logps/chosen": -698.6603393554688, "logps/rejected": -1731.127197265625, "loss": 0.1727, "rewards/accuracies": 0.875, "rewards/chosen": -0.10361242294311523, "rewards/margins": 0.3313453495502472, "rewards/rejected": -0.43495768308639526, "step": 2370 }, { "epoch": 0.91, "learning_rate": 1.3156358271438628e-07, "logits/chosen": -1.3660482168197632, "logits/rejected": -0.44238418340682983, "logps/chosen": -660.2940673828125, "logps/rejected": -1715.768310546875, "loss": 0.148, "rewards/accuracies": 0.875, "rewards/chosen": -0.07836081832647324, "rewards/margins": 0.3487169146537781, "rewards/rejected": -0.4270777702331543, "step": 2380 }, { "epoch": 0.91, "learning_rate": 1.2112870835151835e-07, "logits/chosen": -1.3392059803009033, "logits/rejected": -0.4184719920158386, "logps/chosen": -838.8780517578125, "logps/rejected": -1943.736083984375, "loss": 0.1807, "rewards/accuracies": 0.875, "rewards/chosen": -0.14092907309532166, "rewards/margins": 0.37054315209388733, "rewards/rejected": -0.5114721655845642, "step": 2390 }, { "epoch": 0.91, "learning_rate": 1.1111466186614267e-07, "logits/chosen": -1.2360316514968872, "logits/rejected": -0.45300132036209106, "logps/chosen": -712.4830322265625, "logps/rejected": -1735.195556640625, "loss": 0.1245, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11973945796489716, "rewards/margins": 0.3529644012451172, "rewards/rejected": -0.47270384430885315, "step": 2400 }, { "epoch": 0.92, "learning_rate": 1.0152321476642884e-07, "logits/chosen": -1.094175934791565, "logits/rejected": 0.11562313139438629, "logps/chosen": -604.6788330078125, "logps/rejected": -1585.505126953125, "loss": 0.1234, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09517119824886322, "rewards/margins": 0.3041767477989197, "rewards/rejected": -0.3993479311466217, "step": 2410 }, { "epoch": 0.92, "learning_rate": 9.235606380173012e-08, "logits/chosen": -1.4821711778640747, "logits/rejected": -1.0250943899154663, "logps/chosen": -609.3489990234375, "logps/rejected": -1536.6357421875, "loss": 0.1941, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0961153581738472, "rewards/margins": 0.297118604183197, "rewards/rejected": -0.3932339549064636, "step": 2420 }, { "epoch": 0.93, "learning_rate": 8.361483066242376e-08, "logits/chosen": -1.1447381973266602, "logits/rejected": -0.2417713701725006, "logps/chosen": -683.2552490234375, "logps/rejected": -1674.5484619140625, "loss": 0.1773, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08651186525821686, "rewards/margins": 0.33491700887680054, "rewards/rejected": -0.421428918838501, "step": 2430 }, { "epoch": 0.93, "learning_rate": 7.530106169303108e-08, "logits/chosen": -1.262356162071228, "logits/rejected": -0.4814915060997009, "logps/chosen": -625.0184326171875, "logps/rejected": -1854.253173828125, "loss": 0.1321, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0827106162905693, "rewards/margins": 0.40019288659095764, "rewards/rejected": -0.4829035699367523, "step": 2440 }, { "epoch": 0.93, "learning_rate": 6.741622761866518e-08, "logits/chosen": -1.3366048336029053, "logits/rejected": -0.41937917470932007, "logps/chosen": -799.596923828125, "logps/rejected": -1796.3837890625, "loss": 0.1553, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.10500969737768173, "rewards/margins": 0.32893863320350647, "rewards/rejected": -0.4339483678340912, "step": 2450 }, { "epoch": 0.94, "learning_rate": 5.996172328485622e-08, "logits/chosen": -1.133802056312561, "logits/rejected": -0.39687299728393555, "logps/chosen": -587.1231689453125, "logps/rejected": -1569.2593994140625, "loss": 0.1466, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.07388588041067123, "rewards/margins": 0.32578206062316895, "rewards/rejected": -0.3996679484844208, "step": 2460 }, { "epoch": 0.94, "learning_rate": 5.2938867410800566e-08, "logits/chosen": -1.3924256563186646, "logits/rejected": 0.1944352090358734, "logps/chosen": -832.2745971679688, "logps/rejected": -1727.2838134765625, "loss": 0.216, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.17324545979499817, "rewards/margins": 0.26337963342666626, "rewards/rejected": -0.43662509322166443, "step": 2470 }, { "epoch": 0.94, "learning_rate": 4.634890235607398e-08, "logits/chosen": -1.4834661483764648, "logits/rejected": -0.18076911568641663, "logps/chosen": -670.6461181640625, "logps/rejected": -1735.8082275390625, "loss": 0.1099, "rewards/accuracies": 0.875, "rewards/chosen": -0.08105865120887756, "rewards/margins": 0.3617044687271118, "rewards/rejected": -0.442763090133667, "step": 2480 }, { "epoch": 0.95, "learning_rate": 4.019299390085884e-08, "logits/chosen": -1.4342833757400513, "logits/rejected": -0.7094882130622864, "logps/chosen": -672.8631591796875, "logps/rejected": -1710.8316650390625, "loss": 0.129, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09844377636909485, "rewards/margins": 0.3631378412246704, "rewards/rejected": -0.46158164739608765, "step": 2490 }, { "epoch": 0.95, "learning_rate": 3.4472231039712203e-08, "logits/chosen": -1.5594841241836548, "logits/rejected": 0.3898252546787262, "logps/chosen": -667.2887573242188, "logps/rejected": -1757.8695068359375, "loss": 0.1287, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.082678884267807, "rewards/margins": 0.37307044863700867, "rewards/rejected": -0.45574936270713806, "step": 2500 }, { "epoch": 0.96, "learning_rate": 2.9187625788921758e-08, "logits/chosen": -1.3999525308609009, "logits/rejected": -0.20501630008220673, "logps/chosen": -804.59814453125, "logps/rejected": -1903.7210693359375, "loss": 0.1318, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.154169499874115, "rewards/margins": 0.3609152138233185, "rewards/rejected": -0.5150847434997559, "step": 2510 }, { "epoch": 0.96, "learning_rate": 2.434011300747663e-08, "logits/chosen": -1.5042387247085571, "logits/rejected": -0.5580551624298096, "logps/chosen": -680.4427490234375, "logps/rejected": -1800.3922119140625, "loss": 0.1144, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.08121950179338455, "rewards/margins": 0.3951423466205597, "rewards/rejected": -0.47636181116104126, "step": 2520 }, { "epoch": 0.96, "learning_rate": 1.9930550231688774e-08, "logits/chosen": -1.3505208492279053, "logits/rejected": 0.30468448996543884, "logps/chosen": -829.3043212890625, "logps/rejected": -1658.1771240234375, "loss": 0.172, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11715574562549591, "rewards/margins": 0.28796306252479553, "rewards/rejected": -0.40511879324913025, "step": 2530 }, { "epoch": 0.97, "learning_rate": 1.595971752349379e-08, "logits/chosen": -1.0691452026367188, "logits/rejected": -0.030401384457945824, "logps/chosen": -730.8147583007812, "logps/rejected": -1791.3951416015625, "loss": 0.1143, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11044508218765259, "rewards/margins": 0.3494996428489685, "rewards/rejected": -0.45994478464126587, "step": 2540 }, { "epoch": 0.97, "learning_rate": 1.242831733245492e-08, "logits/chosen": -1.5310267210006714, "logits/rejected": -0.3011860251426697, "logps/chosen": -824.4607543945312, "logps/rejected": -1836.755615234375, "loss": 0.1154, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13207051157951355, "rewards/margins": 0.34391117095947266, "rewards/rejected": -0.4759817123413086, "step": 2550 }, { "epoch": 0.98, "learning_rate": 9.336974371499386e-09, "logits/chosen": -1.3519163131713867, "logits/rejected": -0.5477306246757507, "logps/chosen": -668.1945190429688, "logps/rejected": -1465.209716796875, "loss": 0.1526, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09252701699733734, "rewards/margins": 0.27862733602523804, "rewards/rejected": -0.3711543679237366, "step": 2560 }, { "epoch": 0.98, "learning_rate": 6.68623550640346e-09, "logits/chosen": -1.270180106163025, "logits/rejected": -0.30761486291885376, "logps/chosen": -670.3209228515625, "logps/rejected": -1696.2796630859375, "loss": 0.0996, "rewards/accuracies": 0.875, "rewards/chosen": -0.06959416717290878, "rewards/margins": 0.36404961347579956, "rewards/rejected": -0.43364381790161133, "step": 2570 }, { "epoch": 0.98, "learning_rate": 4.476569659052632e-09, "logits/chosen": -1.5455292463302612, "logits/rejected": -0.18862822651863098, "logps/chosen": -751.3689575195312, "logps/rejected": -1753.3134765625, "loss": 0.1059, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.10283096134662628, "rewards/margins": 0.3394131660461426, "rewards/rejected": -0.44224414229393005, "step": 2580 }, { "epoch": 0.99, "learning_rate": 2.708367724486849e-09, "logits/chosen": -1.3475897312164307, "logits/rejected": -0.23612749576568604, "logps/chosen": -714.1107177734375, "logps/rejected": -1568.1207275390625, "loss": 0.189, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11587677896022797, "rewards/margins": 0.2607864737510681, "rewards/rejected": -0.3766632676124573, "step": 2590 }, { "epoch": 0.99, "learning_rate": 1.3819425017502774e-09, "logits/chosen": -1.533098816871643, "logits/rejected": -0.6434544920921326, "logps/chosen": -564.0545043945312, "logps/rejected": -1495.6220703125, "loss": 0.1588, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05225463956594467, "rewards/margins": 0.33574843406677246, "rewards/rejected": -0.38800305128097534, "step": 2600 }, { "epoch": 0.99, "learning_rate": 4.97528638557232e-10, "logits/chosen": -1.2899420261383057, "logits/rejected": -0.09048126637935638, "logps/chosen": -679.8358764648438, "logps/rejected": -1596.41650390625, "loss": 0.2151, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10204477608203888, "rewards/margins": 0.3431271016597748, "rewards/rejected": -0.44517189264297485, "step": 2610 }, { "epoch": 1.0, "learning_rate": 5.5282589782323923e-11, "logits/chosen": -1.3265888690948486, "logits/rejected": -0.3179568350315094, "logps/chosen": -737.0650634765625, "logps/rejected": -1814.740478515625, "loss": 0.1602, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.10084042698144913, "rewards/margins": 0.35281193256378174, "rewards/rejected": -0.4536523222923279, "step": 2620 }, { "epoch": 1.0, "step": 2625, "total_flos": 0.0, "train_loss": 0.17747254354613168, "train_runtime": 12401.2463, "train_samples_per_second": 0.847, "train_steps_per_second": 0.212 } ], "logging_steps": 10, "max_steps": 2625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }