{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.7993779160186625, "eval_steps": 500, "global_step": 4500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006220839813374805, "grad_norm": 52.52096939086914, "learning_rate": 1.0351966873706006e-08, "logits/chosen": 0.012532182969152927, "logits/rejected": 5.3814802169799805, "logps/chosen": -373.7783203125, "logps/rejected": -592.5435791015625, "loss": 2.1252, "rewards/accuracies": 0.25, "rewards/chosen": 3.039964437484741, "rewards/margins": -1.1029200553894043, "rewards/rejected": 4.142884254455566, "step": 1 }, { "epoch": 0.001244167962674961, "grad_norm": 48.9936408996582, "learning_rate": 2.0703933747412012e-08, "logits/chosen": 1.9091384410858154, "logits/rejected": 5.514098644256592, "logps/chosen": -337.4912414550781, "logps/rejected": -521.2832641601562, "loss": 5.0175, "rewards/accuracies": 0.125, "rewards/chosen": 5.3760600090026855, "rewards/margins": -4.835138320922852, "rewards/rejected": 10.211198806762695, "step": 2 }, { "epoch": 0.0018662519440124418, "grad_norm": 45.977542877197266, "learning_rate": 3.1055900621118015e-08, "logits/chosen": 2.1885063648223877, "logits/rejected": 4.337547302246094, "logps/chosen": -459.13153076171875, "logps/rejected": -506.27825927734375, "loss": 2.7651, "rewards/accuracies": 0.25, "rewards/chosen": 5.090634346008301, "rewards/margins": -1.6695140600204468, "rewards/rejected": 6.760149002075195, "step": 3 }, { "epoch": 0.002488335925349922, "grad_norm": 46.56956481933594, "learning_rate": 4.1407867494824025e-08, "logits/chosen": -0.5925924181938171, "logits/rejected": 3.8525352478027344, "logps/chosen": -306.25311279296875, "logps/rejected": -507.11968994140625, "loss": 4.915, "rewards/accuracies": 0.125, "rewards/chosen": 3.0717453956604004, "rewards/margins": -4.032550811767578, "rewards/rejected": 7.10429573059082, "step": 4 }, { "epoch": 0.003110419906687403, "grad_norm": 28.329368591308594, "learning_rate": 5.175983436853002e-08, "logits/chosen": 4.241677284240723, "logits/rejected": 4.568050384521484, "logps/chosen": -507.6634826660156, "logps/rejected": -522.225341796875, "loss": 1.2785, "rewards/accuracies": 0.5, "rewards/chosen": 9.437142372131348, "rewards/margins": 1.2293858528137207, "rewards/rejected": 8.207756042480469, "step": 5 }, { "epoch": 0.0037325038880248835, "grad_norm": 44.399723052978516, "learning_rate": 6.211180124223603e-08, "logits/chosen": 0.9154919385910034, "logits/rejected": 2.570462703704834, "logps/chosen": -378.0952453613281, "logps/rejected": -446.9964599609375, "loss": 2.6541, "rewards/accuracies": 0.375, "rewards/chosen": 4.882189750671387, "rewards/margins": -1.601928949356079, "rewards/rejected": 6.484118461608887, "step": 6 }, { "epoch": 0.004354587869362364, "grad_norm": 56.556884765625, "learning_rate": 7.246376811594204e-08, "logits/chosen": -0.5533325672149658, "logits/rejected": 4.104916095733643, "logps/chosen": -329.94818115234375, "logps/rejected": -621.577880859375, "loss": 4.5922, "rewards/accuracies": 0.25, "rewards/chosen": 3.9307994842529297, "rewards/margins": -3.158440113067627, "rewards/rejected": 7.089239597320557, "step": 7 }, { "epoch": 0.004976671850699844, "grad_norm": 44.0192985534668, "learning_rate": 8.281573498964805e-08, "logits/chosen": 4.58944034576416, "logits/rejected": 4.53434944152832, "logps/chosen": -509.0722961425781, "logps/rejected": -480.3599853515625, "loss": 3.6859, "rewards/accuracies": 0.25, "rewards/chosen": 4.1546311378479, "rewards/margins": -3.303114891052246, "rewards/rejected": 7.4577460289001465, "step": 8 }, { "epoch": 0.005598755832037325, "grad_norm": 50.199031829833984, "learning_rate": 9.316770186335405e-08, "logits/chosen": -0.12182983756065369, "logits/rejected": 5.243905067443848, "logps/chosen": -297.5303039550781, "logps/rejected": -551.4981079101562, "loss": 5.1511, "rewards/accuracies": 0.125, "rewards/chosen": 3.3871593475341797, "rewards/margins": -4.763964653015137, "rewards/rejected": 8.151124000549316, "step": 9 }, { "epoch": 0.006220839813374806, "grad_norm": 44.1707763671875, "learning_rate": 1.0351966873706004e-07, "logits/chosen": 1.1508839130401611, "logits/rejected": 3.511411428451538, "logps/chosen": -398.50701904296875, "logps/rejected": -559.7413330078125, "loss": 3.7766, "rewards/accuracies": 0.375, "rewards/chosen": 5.0946455001831055, "rewards/margins": -2.76721453666687, "rewards/rejected": 7.861859321594238, "step": 10 }, { "epoch": 0.006842923794712286, "grad_norm": 56.588111877441406, "learning_rate": 1.1387163561076605e-07, "logits/chosen": 2.96075177192688, "logits/rejected": 5.463801383972168, "logps/chosen": -471.24652099609375, "logps/rejected": -518.7703857421875, "loss": 2.7771, "rewards/accuracies": 0.25, "rewards/chosen": 3.3338894844055176, "rewards/margins": -1.9073762893676758, "rewards/rejected": 5.241266250610352, "step": 11 }, { "epoch": 0.007465007776049767, "grad_norm": 44.33781814575195, "learning_rate": 1.2422360248447206e-07, "logits/chosen": 2.9660511016845703, "logits/rejected": 5.102863788604736, "logps/chosen": -435.560546875, "logps/rejected": -541.2033081054688, "loss": 3.1998, "rewards/accuracies": 0.25, "rewards/chosen": 7.141000270843506, "rewards/margins": -1.1307448148727417, "rewards/rejected": 8.271744728088379, "step": 12 }, { "epoch": 0.008087091757387248, "grad_norm": 40.28484344482422, "learning_rate": 1.3457556935817807e-07, "logits/chosen": 1.7922760248184204, "logits/rejected": 6.533666610717773, "logps/chosen": -422.5932312011719, "logps/rejected": -602.603515625, "loss": 3.6227, "rewards/accuracies": 0.375, "rewards/chosen": 5.170872688293457, "rewards/margins": -2.765868663787842, "rewards/rejected": 7.936741828918457, "step": 13 }, { "epoch": 0.008709175738724729, "grad_norm": 49.1853141784668, "learning_rate": 1.4492753623188408e-07, "logits/chosen": -1.2458416223526, "logits/rejected": 3.827310562133789, "logps/chosen": -276.527587890625, "logps/rejected": -613.375732421875, "loss": 4.5431, "rewards/accuracies": 0.25, "rewards/chosen": 2.5047080516815186, "rewards/margins": -3.598675012588501, "rewards/rejected": 6.1033830642700195, "step": 14 }, { "epoch": 0.00933125972006221, "grad_norm": 40.00694274902344, "learning_rate": 1.5527950310559006e-07, "logits/chosen": 0.6321654915809631, "logits/rejected": 4.0691609382629395, "logps/chosen": -410.6964111328125, "logps/rejected": -517.6644287109375, "loss": 4.6035, "rewards/accuracies": 0.375, "rewards/chosen": 5.570537567138672, "rewards/margins": -2.5490994453430176, "rewards/rejected": 8.119636535644531, "step": 15 }, { "epoch": 0.009953343701399688, "grad_norm": 38.782012939453125, "learning_rate": 1.656314699792961e-07, "logits/chosen": 0.7363018989562988, "logits/rejected": 4.3214006423950195, "logps/chosen": -343.54168701171875, "logps/rejected": -488.6893310546875, "loss": 3.467, "rewards/accuracies": 0.25, "rewards/chosen": 7.064784049987793, "rewards/margins": -1.6931116580963135, "rewards/rejected": 8.757895469665527, "step": 16 }, { "epoch": 0.010575427682737169, "grad_norm": 32.41596603393555, "learning_rate": 1.7598343685300208e-07, "logits/chosen": -0.19795677065849304, "logits/rejected": 1.080691933631897, "logps/chosen": -396.13726806640625, "logps/rejected": -448.459228515625, "loss": 3.3021, "rewards/accuracies": 0.375, "rewards/chosen": 4.552862167358398, "rewards/margins": -1.6756930351257324, "rewards/rejected": 6.228555202484131, "step": 17 }, { "epoch": 0.01119751166407465, "grad_norm": 46.2690544128418, "learning_rate": 1.863354037267081e-07, "logits/chosen": 2.7062017917633057, "logits/rejected": 3.694239377975464, "logps/chosen": -446.1850280761719, "logps/rejected": -549.3088989257812, "loss": 4.0624, "rewards/accuracies": 0.25, "rewards/chosen": 6.74229621887207, "rewards/margins": -3.0481457710266113, "rewards/rejected": 9.79044246673584, "step": 18 }, { "epoch": 0.01181959564541213, "grad_norm": 43.01369094848633, "learning_rate": 1.966873706004141e-07, "logits/chosen": 0.34469613432884216, "logits/rejected": 4.731164932250977, "logps/chosen": -413.55377197265625, "logps/rejected": -558.8274536132812, "loss": 1.8663, "rewards/accuracies": 0.25, "rewards/chosen": 3.901057243347168, "rewards/margins": -1.076063632965088, "rewards/rejected": 4.977120876312256, "step": 19 }, { "epoch": 0.012441679626749611, "grad_norm": 43.26823043823242, "learning_rate": 2.0703933747412008e-07, "logits/chosen": 0.7995167970657349, "logits/rejected": 4.243210792541504, "logps/chosen": -403.41650390625, "logps/rejected": -514.751953125, "loss": 3.9075, "rewards/accuracies": 0.375, "rewards/chosen": 4.657955646514893, "rewards/margins": -2.819042682647705, "rewards/rejected": 7.476998329162598, "step": 20 }, { "epoch": 0.013063763608087092, "grad_norm": 40.63854217529297, "learning_rate": 2.173913043478261e-07, "logits/chosen": 0.8253070712089539, "logits/rejected": 4.052617073059082, "logps/chosen": -392.7893371582031, "logps/rejected": -536.6062622070312, "loss": 4.2509, "rewards/accuracies": 0.375, "rewards/chosen": 5.524932861328125, "rewards/margins": -2.6818959712982178, "rewards/rejected": 8.206829071044922, "step": 21 }, { "epoch": 0.013685847589424573, "grad_norm": 46.58070755004883, "learning_rate": 2.277432712215321e-07, "logits/chosen": -0.05951416492462158, "logits/rejected": 3.8601527214050293, "logps/chosen": -323.16619873046875, "logps/rejected": -515.654052734375, "loss": 4.1162, "rewards/accuracies": 0.125, "rewards/chosen": 3.4312305450439453, "rewards/margins": -3.677948236465454, "rewards/rejected": 7.10917854309082, "step": 22 }, { "epoch": 0.014307931570762053, "grad_norm": 46.004512786865234, "learning_rate": 2.3809523809523811e-07, "logits/chosen": 1.3055967092514038, "logits/rejected": 4.365591526031494, "logps/chosen": -403.9496765136719, "logps/rejected": -589.9659423828125, "loss": 5.3046, "rewards/accuracies": 0.125, "rewards/chosen": 6.964463710784912, "rewards/margins": -5.141401290893555, "rewards/rejected": 12.105865478515625, "step": 23 }, { "epoch": 0.014930015552099534, "grad_norm": 55.458885192871094, "learning_rate": 2.484472049689441e-07, "logits/chosen": 1.6353414058685303, "logits/rejected": 5.698489665985107, "logps/chosen": -337.55078125, "logps/rejected": -551.3826904296875, "loss": 5.5208, "rewards/accuracies": 0.0, "rewards/chosen": 5.996336936950684, "rewards/margins": -5.433783531188965, "rewards/rejected": 11.430120468139648, "step": 24 }, { "epoch": 0.015552099533437015, "grad_norm": 38.34725570678711, "learning_rate": 2.5879917184265016e-07, "logits/chosen": -0.7651683688163757, "logits/rejected": 2.5544052124023438, "logps/chosen": -235.06356811523438, "logps/rejected": -407.9915771484375, "loss": 3.0314, "rewards/accuracies": 0.125, "rewards/chosen": 4.645867347717285, "rewards/margins": -2.6488218307495117, "rewards/rejected": 7.294688701629639, "step": 25 }, { "epoch": 0.016174183514774496, "grad_norm": 26.11163330078125, "learning_rate": 2.6915113871635614e-07, "logits/chosen": 0.4576059579849243, "logits/rejected": 3.3899879455566406, "logps/chosen": -265.80413818359375, "logps/rejected": -426.92108154296875, "loss": 3.0763, "rewards/accuracies": 0.75, "rewards/chosen": 4.355762481689453, "rewards/margins": -1.404404640197754, "rewards/rejected": 5.760167121887207, "step": 26 }, { "epoch": 0.016796267496111975, "grad_norm": 50.16153335571289, "learning_rate": 2.795031055900621e-07, "logits/chosen": 0.9520180225372314, "logits/rejected": 1.3030765056610107, "logps/chosen": -424.3631591796875, "logps/rejected": -459.70550537109375, "loss": 3.3118, "rewards/accuracies": 0.125, "rewards/chosen": 3.551614999771118, "rewards/margins": -2.787909746170044, "rewards/rejected": 6.339524745941162, "step": 27 }, { "epoch": 0.017418351477449457, "grad_norm": 47.12639236450195, "learning_rate": 2.8985507246376816e-07, "logits/chosen": -0.052807122468948364, "logits/rejected": 4.841619968414307, "logps/chosen": -374.9623718261719, "logps/rejected": -599.8634643554688, "loss": 5.1175, "rewards/accuracies": 0.375, "rewards/chosen": 4.661255836486816, "rewards/margins": -4.2213335037231445, "rewards/rejected": 8.882589340209961, "step": 28 }, { "epoch": 0.018040435458786936, "grad_norm": 55.17001724243164, "learning_rate": 3.0020703933747414e-07, "logits/chosen": 2.173837661743164, "logits/rejected": 4.256175518035889, "logps/chosen": -514.0777587890625, "logps/rejected": -582.631591796875, "loss": 4.6282, "rewards/accuracies": 0.25, "rewards/chosen": 7.686736106872559, "rewards/margins": -2.3255183696746826, "rewards/rejected": 10.01225471496582, "step": 29 }, { "epoch": 0.01866251944012442, "grad_norm": 45.248477935791016, "learning_rate": 3.1055900621118013e-07, "logits/chosen": 3.1304564476013184, "logits/rejected": 5.465931415557861, "logps/chosen": -467.85833740234375, "logps/rejected": -526.0656127929688, "loss": 4.2446, "rewards/accuracies": 0.25, "rewards/chosen": 7.557483673095703, "rewards/margins": -3.491199016571045, "rewards/rejected": 11.04868221282959, "step": 30 }, { "epoch": 0.019284603421461897, "grad_norm": 22.855022430419922, "learning_rate": 3.2091097308488616e-07, "logits/chosen": 3.430361270904541, "logits/rejected": 5.7235107421875, "logps/chosen": -460.7084655761719, "logps/rejected": -496.1153564453125, "loss": 1.6007, "rewards/accuracies": 0.75, "rewards/chosen": 7.377945899963379, "rewards/margins": 1.0376758575439453, "rewards/rejected": 6.340270519256592, "step": 31 }, { "epoch": 0.019906687402799376, "grad_norm": 53.90958023071289, "learning_rate": 3.312629399585922e-07, "logits/chosen": 0.0030750036239624023, "logits/rejected": 4.562582015991211, "logps/chosen": -328.0555419921875, "logps/rejected": -523.3200073242188, "loss": 4.207, "rewards/accuracies": 0.125, "rewards/chosen": 3.139477252960205, "rewards/margins": -3.903869152069092, "rewards/rejected": 7.043346881866455, "step": 32 }, { "epoch": 0.02052877138413686, "grad_norm": 46.680335998535156, "learning_rate": 3.416149068322982e-07, "logits/chosen": 0.6541320085525513, "logits/rejected": 4.543554306030273, "logps/chosen": -410.05511474609375, "logps/rejected": -555.1267700195312, "loss": 2.788, "rewards/accuracies": 0.25, "rewards/chosen": 4.895612716674805, "rewards/margins": -2.0638134479522705, "rewards/rejected": 6.959425926208496, "step": 33 }, { "epoch": 0.021150855365474338, "grad_norm": 45.052101135253906, "learning_rate": 3.5196687370600417e-07, "logits/chosen": 2.8638980388641357, "logits/rejected": 4.418205738067627, "logps/chosen": -467.8529357910156, "logps/rejected": -529.4423828125, "loss": 1.9395, "rewards/accuracies": 0.25, "rewards/chosen": 8.37755298614502, "rewards/margins": -0.4131333827972412, "rewards/rejected": 8.79068660736084, "step": 34 }, { "epoch": 0.02177293934681182, "grad_norm": 46.419036865234375, "learning_rate": 3.623188405797102e-07, "logits/chosen": 1.6391205787658691, "logits/rejected": 3.039005756378174, "logps/chosen": -437.704833984375, "logps/rejected": -481.65179443359375, "loss": 3.9945, "rewards/accuracies": 0.25, "rewards/chosen": 8.587377548217773, "rewards/margins": -2.558168649673462, "rewards/rejected": 11.145545959472656, "step": 35 }, { "epoch": 0.0223950233281493, "grad_norm": 49.24765396118164, "learning_rate": 3.726708074534162e-07, "logits/chosen": 0.6626954078674316, "logits/rejected": 5.522377967834473, "logps/chosen": -249.82749938964844, "logps/rejected": -480.0360412597656, "loss": 5.1576, "rewards/accuracies": 0.0, "rewards/chosen": 3.22590708732605, "rewards/margins": -5.005549430847168, "rewards/rejected": 8.231456756591797, "step": 36 }, { "epoch": 0.023017107309486782, "grad_norm": 40.403690338134766, "learning_rate": 3.8302277432712217e-07, "logits/chosen": 3.179089069366455, "logits/rejected": 5.580604076385498, "logps/chosen": -484.9145812988281, "logps/rejected": -545.426513671875, "loss": 2.1843, "rewards/accuracies": 0.375, "rewards/chosen": 8.788908004760742, "rewards/margins": 1.674599528312683, "rewards/rejected": 7.114309310913086, "step": 37 }, { "epoch": 0.02363919129082426, "grad_norm": 45.154029846191406, "learning_rate": 3.933747412008282e-07, "logits/chosen": 1.6743485927581787, "logits/rejected": 4.098839282989502, "logps/chosen": -391.55462646484375, "logps/rejected": -487.47802734375, "loss": 3.8492, "rewards/accuracies": 0.375, "rewards/chosen": 4.612683296203613, "rewards/margins": -3.17740535736084, "rewards/rejected": 7.7900896072387695, "step": 38 }, { "epoch": 0.024261275272161743, "grad_norm": 34.88768768310547, "learning_rate": 4.037267080745342e-07, "logits/chosen": 2.6441800594329834, "logits/rejected": 3.981311321258545, "logps/chosen": -326.20489501953125, "logps/rejected": -379.9033203125, "loss": 1.7066, "rewards/accuracies": 0.375, "rewards/chosen": 2.9094905853271484, "rewards/margins": -0.5736005902290344, "rewards/rejected": 3.483090877532959, "step": 39 }, { "epoch": 0.024883359253499222, "grad_norm": 53.884212493896484, "learning_rate": 4.1407867494824017e-07, "logits/chosen": 1.5335716009140015, "logits/rejected": 4.578222751617432, "logps/chosen": -436.3438720703125, "logps/rejected": -596.6395263671875, "loss": 7.6457, "rewards/accuracies": 0.25, "rewards/chosen": 4.73098087310791, "rewards/margins": -7.282605171203613, "rewards/rejected": 12.013585090637207, "step": 40 }, { "epoch": 0.0255054432348367, "grad_norm": 47.64874267578125, "learning_rate": 4.244306418219462e-07, "logits/chosen": -0.8001726865768433, "logits/rejected": 4.6293535232543945, "logps/chosen": -284.55389404296875, "logps/rejected": -473.02423095703125, "loss": 3.6999, "rewards/accuracies": 0.25, "rewards/chosen": 3.107661247253418, "rewards/margins": -3.117319107055664, "rewards/rejected": 6.224980354309082, "step": 41 }, { "epoch": 0.026127527216174184, "grad_norm": 39.684940338134766, "learning_rate": 4.347826086956522e-07, "logits/chosen": 2.798779010772705, "logits/rejected": 3.0875256061553955, "logps/chosen": -509.02490234375, "logps/rejected": -545.913330078125, "loss": 1.6709, "rewards/accuracies": 0.5, "rewards/chosen": 10.082112312316895, "rewards/margins": 2.398789405822754, "rewards/rejected": 7.683322906494141, "step": 42 }, { "epoch": 0.026749611197511663, "grad_norm": 32.33852767944336, "learning_rate": 4.451345755693582e-07, "logits/chosen": 1.85959792137146, "logits/rejected": 1.575959324836731, "logps/chosen": -412.39202880859375, "logps/rejected": -416.99822998046875, "loss": 2.4367, "rewards/accuracies": 0.375, "rewards/chosen": 7.020496368408203, "rewards/margins": -1.3365765810012817, "rewards/rejected": 8.357072830200195, "step": 43 }, { "epoch": 0.027371695178849145, "grad_norm": 51.07847595214844, "learning_rate": 4.554865424430642e-07, "logits/chosen": -0.3712843060493469, "logits/rejected": 5.780962944030762, "logps/chosen": -346.7430725097656, "logps/rejected": -668.7047729492188, "loss": 6.9333, "rewards/accuracies": 0.125, "rewards/chosen": 1.6845252513885498, "rewards/margins": -6.724910736083984, "rewards/rejected": 8.409436225891113, "step": 44 }, { "epoch": 0.027993779160186624, "grad_norm": 49.54875183105469, "learning_rate": 4.658385093167702e-07, "logits/chosen": 0.5838897228240967, "logits/rejected": 4.9215898513793945, "logps/chosen": -403.30804443359375, "logps/rejected": -589.4797973632812, "loss": 3.4521, "rewards/accuracies": 0.0, "rewards/chosen": 9.050199508666992, "rewards/margins": -3.2908575534820557, "rewards/rejected": 12.341056823730469, "step": 45 }, { "epoch": 0.028615863141524107, "grad_norm": 34.638187408447266, "learning_rate": 4.7619047619047623e-07, "logits/chosen": -0.016638919711112976, "logits/rejected": 3.867025375366211, "logps/chosen": -291.69720458984375, "logps/rejected": -462.72760009765625, "loss": 3.2514, "rewards/accuracies": 0.375, "rewards/chosen": 3.6876978874206543, "rewards/margins": -1.1836978197097778, "rewards/rejected": 4.871395587921143, "step": 46 }, { "epoch": 0.029237947122861586, "grad_norm": 43.84613800048828, "learning_rate": 4.865424430641822e-07, "logits/chosen": 1.2583420276641846, "logits/rejected": 5.132736682891846, "logps/chosen": -379.2347412109375, "logps/rejected": -541.4534912109375, "loss": 2.7044, "rewards/accuracies": 0.375, "rewards/chosen": 4.057747840881348, "rewards/margins": -2.267834186553955, "rewards/rejected": 6.325582027435303, "step": 47 }, { "epoch": 0.029860031104199068, "grad_norm": 47.614315032958984, "learning_rate": 4.968944099378882e-07, "logits/chosen": -1.1907007694244385, "logits/rejected": 3.4339210987091064, "logps/chosen": -319.9333190917969, "logps/rejected": -497.4118957519531, "loss": 3.9817, "rewards/accuracies": 0.25, "rewards/chosen": 5.15675687789917, "rewards/margins": -2.3444647789001465, "rewards/rejected": 7.501220703125, "step": 48 }, { "epoch": 0.030482115085536547, "grad_norm": 39.3867301940918, "learning_rate": 5.072463768115942e-07, "logits/chosen": -3.0998148918151855, "logits/rejected": 4.021823406219482, "logps/chosen": -238.55032348632812, "logps/rejected": -542.6666870117188, "loss": 3.4389, "rewards/accuracies": 0.375, "rewards/chosen": 3.568326234817505, "rewards/margins": -2.0750913619995117, "rewards/rejected": 5.6434173583984375, "step": 49 }, { "epoch": 0.03110419906687403, "grad_norm": 44.98670196533203, "learning_rate": 5.175983436853003e-07, "logits/chosen": -1.0378568172454834, "logits/rejected": 6.316718101501465, "logps/chosen": -254.8461151123047, "logps/rejected": -560.372802734375, "loss": 5.8883, "rewards/accuracies": 0.25, "rewards/chosen": 3.8307912349700928, "rewards/margins": -5.595851898193359, "rewards/rejected": 9.426643371582031, "step": 50 }, { "epoch": 0.031726283048211505, "grad_norm": 57.26844787597656, "learning_rate": 5.279503105590063e-07, "logits/chosen": 1.2163293361663818, "logits/rejected": 4.85459566116333, "logps/chosen": -395.74951171875, "logps/rejected": -541.177734375, "loss": 7.3597, "rewards/accuracies": 0.0, "rewards/chosen": 3.2945306301116943, "rewards/margins": -7.298341751098633, "rewards/rejected": 10.59287166595459, "step": 51 }, { "epoch": 0.03234836702954899, "grad_norm": 64.28971862792969, "learning_rate": 5.383022774327123e-07, "logits/chosen": 3.2448387145996094, "logits/rejected": 5.297354221343994, "logps/chosen": -508.2118835449219, "logps/rejected": -576.161376953125, "loss": 4.2396, "rewards/accuracies": 0.25, "rewards/chosen": 8.044355392456055, "rewards/margins": -2.649853467941284, "rewards/rejected": 10.694210052490234, "step": 52 }, { "epoch": 0.03297045101088647, "grad_norm": 45.40407180786133, "learning_rate": 5.486542443064183e-07, "logits/chosen": 4.2648606300354, "logits/rejected": 6.613371849060059, "logps/chosen": -471.30206298828125, "logps/rejected": -523.7880859375, "loss": 2.9062, "rewards/accuracies": 0.25, "rewards/chosen": 4.817241191864014, "rewards/margins": -1.9571592807769775, "rewards/rejected": 6.77440071105957, "step": 53 }, { "epoch": 0.03359253499222395, "grad_norm": 44.306114196777344, "learning_rate": 5.590062111801243e-07, "logits/chosen": -2.1772749423980713, "logits/rejected": 4.385883808135986, "logps/chosen": -280.7019958496094, "logps/rejected": -626.658935546875, "loss": 5.2982, "rewards/accuracies": 0.125, "rewards/chosen": 4.835158348083496, "rewards/margins": -4.899688720703125, "rewards/rejected": 9.734847068786621, "step": 54 }, { "epoch": 0.03421461897356143, "grad_norm": 52.895057678222656, "learning_rate": 5.693581780538302e-07, "logits/chosen": 3.337453603744507, "logits/rejected": 4.959819793701172, "logps/chosen": -557.9818115234375, "logps/rejected": -590.6300659179688, "loss": 4.2118, "rewards/accuracies": 0.25, "rewards/chosen": 8.753548622131348, "rewards/margins": -3.3140532970428467, "rewards/rejected": 12.067602157592773, "step": 55 }, { "epoch": 0.034836702954898914, "grad_norm": 55.78615951538086, "learning_rate": 5.797101449275363e-07, "logits/chosen": 1.4191863536834717, "logits/rejected": 3.9965124130249023, "logps/chosen": -516.8236083984375, "logps/rejected": -617.3018798828125, "loss": 4.5912, "rewards/accuracies": 0.25, "rewards/chosen": 6.091235160827637, "rewards/margins": -4.235839366912842, "rewards/rejected": 10.32707405090332, "step": 56 }, { "epoch": 0.03545878693623639, "grad_norm": 43.840721130371094, "learning_rate": 5.900621118012423e-07, "logits/chosen": 0.7551851272583008, "logits/rejected": 3.4136557579040527, "logps/chosen": -383.36932373046875, "logps/rejected": -520.3485107421875, "loss": 2.3896, "rewards/accuracies": 0.25, "rewards/chosen": 6.259200096130371, "rewards/margins": -1.5887733697891235, "rewards/rejected": 7.847973346710205, "step": 57 }, { "epoch": 0.03608087091757387, "grad_norm": 53.005619049072266, "learning_rate": 6.004140786749483e-07, "logits/chosen": 0.8960199356079102, "logits/rejected": 5.212392807006836, "logps/chosen": -324.52838134765625, "logps/rejected": -570.8604736328125, "loss": 5.8641, "rewards/accuracies": 0.125, "rewards/chosen": 4.302431583404541, "rewards/margins": -5.693329811096191, "rewards/rejected": 9.99576187133789, "step": 58 }, { "epoch": 0.03670295489891135, "grad_norm": 39.89027404785156, "learning_rate": 6.107660455486543e-07, "logits/chosen": 0.9366174340248108, "logits/rejected": 3.5945024490356445, "logps/chosen": -388.9140625, "logps/rejected": -512.8837890625, "loss": 4.197, "rewards/accuracies": 0.25, "rewards/chosen": 6.309109210968018, "rewards/margins": -3.4562954902648926, "rewards/rejected": 9.76540470123291, "step": 59 }, { "epoch": 0.03732503888024884, "grad_norm": 50.484832763671875, "learning_rate": 6.211180124223603e-07, "logits/chosen": 1.6095322370529175, "logits/rejected": 5.206839561462402, "logps/chosen": -287.35223388671875, "logps/rejected": -461.3832092285156, "loss": 4.2601, "rewards/accuracies": 0.125, "rewards/chosen": 3.1970438957214355, "rewards/margins": -3.5525975227355957, "rewards/rejected": 6.749641418457031, "step": 60 }, { "epoch": 0.037947122861586316, "grad_norm": 49.992767333984375, "learning_rate": 6.314699792960663e-07, "logits/chosen": -1.0890344381332397, "logits/rejected": 3.729297161102295, "logps/chosen": -355.4293212890625, "logps/rejected": -582.5045166015625, "loss": 4.0015, "rewards/accuracies": 0.25, "rewards/chosen": 5.588550567626953, "rewards/margins": -2.822916030883789, "rewards/rejected": 8.411466598510742, "step": 61 }, { "epoch": 0.038569206842923795, "grad_norm": 32.55772018432617, "learning_rate": 6.418219461697723e-07, "logits/chosen": 1.50239098072052, "logits/rejected": 3.742825984954834, "logps/chosen": -340.8537902832031, "logps/rejected": -463.9370422363281, "loss": 3.6689, "rewards/accuracies": 0.375, "rewards/chosen": 3.409723997116089, "rewards/margins": -2.145049571990967, "rewards/rejected": 5.554773807525635, "step": 62 }, { "epoch": 0.039191290824261274, "grad_norm": 47.97010803222656, "learning_rate": 6.521739130434783e-07, "logits/chosen": -0.5774695873260498, "logits/rejected": 3.452863931655884, "logps/chosen": -466.74688720703125, "logps/rejected": -540.4033813476562, "loss": 3.878, "rewards/accuracies": 0.375, "rewards/chosen": 5.0257568359375, "rewards/margins": -3.079911231994629, "rewards/rejected": 8.105668067932129, "step": 63 }, { "epoch": 0.03981337480559875, "grad_norm": 60.673484802246094, "learning_rate": 6.625258799171844e-07, "logits/chosen": 1.5208394527435303, "logits/rejected": 3.985078811645508, "logps/chosen": -520.3128662109375, "logps/rejected": -586.0016479492188, "loss": 4.4295, "rewards/accuracies": 0.25, "rewards/chosen": 4.203519344329834, "rewards/margins": -3.1812143325805664, "rewards/rejected": 7.3847336769104, "step": 64 }, { "epoch": 0.04043545878693624, "grad_norm": 37.80503845214844, "learning_rate": 6.728778467908903e-07, "logits/chosen": 3.377659320831299, "logits/rejected": 4.628144264221191, "logps/chosen": -503.3712463378906, "logps/rejected": -563.0233154296875, "loss": 1.7255, "rewards/accuracies": 0.5, "rewards/chosen": 7.828451156616211, "rewards/margins": 0.11194157600402832, "rewards/rejected": 7.716508865356445, "step": 65 }, { "epoch": 0.04105754276827372, "grad_norm": 49.729862213134766, "learning_rate": 6.832298136645964e-07, "logits/chosen": 1.4419440031051636, "logits/rejected": 5.238779544830322, "logps/chosen": -408.07769775390625, "logps/rejected": -551.1817626953125, "loss": 4.5682, "rewards/accuracies": 0.375, "rewards/chosen": 6.3790388107299805, "rewards/margins": -2.325056552886963, "rewards/rejected": 8.704095840454102, "step": 66 }, { "epoch": 0.0416796267496112, "grad_norm": 56.857330322265625, "learning_rate": 6.935817805383023e-07, "logits/chosen": 1.8640213012695312, "logits/rejected": 5.679266929626465, "logps/chosen": -432.50390625, "logps/rejected": -636.1224365234375, "loss": 3.0801, "rewards/accuracies": 0.25, "rewards/chosen": 4.258503437042236, "rewards/margins": -2.0767855644226074, "rewards/rejected": 6.335289001464844, "step": 67 }, { "epoch": 0.042301710730948676, "grad_norm": 46.245445251464844, "learning_rate": 7.039337474120083e-07, "logits/chosen": 0.9165676832199097, "logits/rejected": 2.82378888130188, "logps/chosen": -407.091552734375, "logps/rejected": -470.09381103515625, "loss": 2.5734, "rewards/accuracies": 0.375, "rewards/chosen": 2.471022605895996, "rewards/margins": -1.7493443489074707, "rewards/rejected": 4.220366954803467, "step": 68 }, { "epoch": 0.04292379471228616, "grad_norm": 43.912864685058594, "learning_rate": 7.142857142857143e-07, "logits/chosen": 2.735879421234131, "logits/rejected": 3.7518153190612793, "logps/chosen": -509.6024169921875, "logps/rejected": -566.545654296875, "loss": 4.0801, "rewards/accuracies": 0.375, "rewards/chosen": 8.47749137878418, "rewards/margins": -2.4810967445373535, "rewards/rejected": 10.958588600158691, "step": 69 }, { "epoch": 0.04354587869362364, "grad_norm": 36.06388854980469, "learning_rate": 7.246376811594204e-07, "logits/chosen": -0.6139513254165649, "logits/rejected": 3.7651548385620117, "logps/chosen": -205.31744384765625, "logps/rejected": -337.3735046386719, "loss": 2.4479, "rewards/accuracies": 0.125, "rewards/chosen": 1.2144393920898438, "rewards/margins": -2.109691619873047, "rewards/rejected": 3.3241305351257324, "step": 70 }, { "epoch": 0.04416796267496112, "grad_norm": 48.27947998046875, "learning_rate": 7.349896480331263e-07, "logits/chosen": 0.38291358947753906, "logits/rejected": 4.874111652374268, "logps/chosen": -282.7892761230469, "logps/rejected": -501.6195373535156, "loss": 3.3797, "rewards/accuracies": 0.25, "rewards/chosen": 2.928769111633301, "rewards/margins": -2.763104200363159, "rewards/rejected": 5.691873550415039, "step": 71 }, { "epoch": 0.0447900466562986, "grad_norm": 36.64083480834961, "learning_rate": 7.453416149068324e-07, "logits/chosen": 3.216303586959839, "logits/rejected": 5.372792720794678, "logps/chosen": -454.7572021484375, "logps/rejected": -530.1778564453125, "loss": 1.9955, "rewards/accuracies": 0.625, "rewards/chosen": 3.4445557594299316, "rewards/margins": 0.10334785282611847, "rewards/rejected": 3.341207981109619, "step": 72 }, { "epoch": 0.04541213063763608, "grad_norm": 68.5392074584961, "learning_rate": 7.556935817805384e-07, "logits/chosen": 1.791405439376831, "logits/rejected": 4.570212364196777, "logps/chosen": -487.98828125, "logps/rejected": -584.217529296875, "loss": 4.2218, "rewards/accuracies": 0.125, "rewards/chosen": 4.641778469085693, "rewards/margins": -3.9613802433013916, "rewards/rejected": 8.603158950805664, "step": 73 }, { "epoch": 0.046034214618973564, "grad_norm": 35.1021614074707, "learning_rate": 7.660455486542443e-07, "logits/chosen": 2.474278450012207, "logits/rejected": 3.269683599472046, "logps/chosen": -424.99627685546875, "logps/rejected": -421.07513427734375, "loss": 1.4964, "rewards/accuracies": 0.375, "rewards/chosen": 6.019632339477539, "rewards/margins": 1.7979152202606201, "rewards/rejected": 4.221717357635498, "step": 74 }, { "epoch": 0.04665629860031104, "grad_norm": 45.31480026245117, "learning_rate": 7.763975155279503e-07, "logits/chosen": 3.982232093811035, "logits/rejected": 4.152582168579102, "logps/chosen": -462.2098083496094, "logps/rejected": -437.01776123046875, "loss": 1.4921, "rewards/accuracies": 0.375, "rewards/chosen": 3.4199776649475098, "rewards/margins": -0.8454870581626892, "rewards/rejected": 4.265464782714844, "step": 75 }, { "epoch": 0.04727838258164852, "grad_norm": 45.84056091308594, "learning_rate": 7.867494824016564e-07, "logits/chosen": 1.7973527908325195, "logits/rejected": 4.244307994842529, "logps/chosen": -511.0018005371094, "logps/rejected": -612.5986328125, "loss": 3.7604, "rewards/accuracies": 0.375, "rewards/chosen": 5.5702314376831055, "rewards/margins": -2.9728171825408936, "rewards/rejected": 8.543049812316895, "step": 76 }, { "epoch": 0.047900466562986, "grad_norm": 49.20193862915039, "learning_rate": 7.971014492753623e-07, "logits/chosen": 1.614759922027588, "logits/rejected": 4.329085350036621, "logps/chosen": -505.11370849609375, "logps/rejected": -607.37451171875, "loss": 4.4378, "rewards/accuracies": 0.375, "rewards/chosen": 6.242237091064453, "rewards/margins": -2.3279433250427246, "rewards/rejected": 8.570180892944336, "step": 77 }, { "epoch": 0.04852255054432349, "grad_norm": 33.60881805419922, "learning_rate": 8.074534161490684e-07, "logits/chosen": 3.6060690879821777, "logits/rejected": 6.220172882080078, "logps/chosen": -486.65313720703125, "logps/rejected": -591.926025390625, "loss": 2.7688, "rewards/accuracies": 0.625, "rewards/chosen": 9.750151634216309, "rewards/margins": -0.9578385353088379, "rewards/rejected": 10.707990646362305, "step": 78 }, { "epoch": 0.049144634525660966, "grad_norm": 42.57831954956055, "learning_rate": 8.178053830227745e-07, "logits/chosen": 0.6833969354629517, "logits/rejected": 4.242366790771484, "logps/chosen": -285.3023376464844, "logps/rejected": -455.5042419433594, "loss": 4.457, "rewards/accuracies": 0.25, "rewards/chosen": 3.0746004581451416, "rewards/margins": -3.957090377807617, "rewards/rejected": 7.031691074371338, "step": 79 }, { "epoch": 0.049766718506998445, "grad_norm": 53.40254592895508, "learning_rate": 8.281573498964803e-07, "logits/chosen": 2.4628918170928955, "logits/rejected": 4.680422306060791, "logps/chosen": -472.9252624511719, "logps/rejected": -564.3480834960938, "loss": 3.7476, "rewards/accuracies": 0.125, "rewards/chosen": 7.595523357391357, "rewards/margins": -3.3378608226776123, "rewards/rejected": 10.933384895324707, "step": 80 }, { "epoch": 0.050388802488335924, "grad_norm": 47.84553527832031, "learning_rate": 8.385093167701864e-07, "logits/chosen": -0.8003718852996826, "logits/rejected": 4.039190769195557, "logps/chosen": -320.61370849609375, "logps/rejected": -488.978759765625, "loss": 1.7874, "rewards/accuracies": 0.125, "rewards/chosen": 2.1538021564483643, "rewards/margins": -1.0411781072616577, "rewards/rejected": 3.1949803829193115, "step": 81 }, { "epoch": 0.0510108864696734, "grad_norm": 51.101417541503906, "learning_rate": 8.488612836438924e-07, "logits/chosen": 0.9290800094604492, "logits/rejected": 3.8441295623779297, "logps/chosen": -422.1509704589844, "logps/rejected": -547.141357421875, "loss": 3.4353, "rewards/accuracies": 0.125, "rewards/chosen": 6.441579818725586, "rewards/margins": -2.543184280395508, "rewards/rejected": 8.984764099121094, "step": 82 }, { "epoch": 0.05163297045101089, "grad_norm": 52.027313232421875, "learning_rate": 8.592132505175985e-07, "logits/chosen": 0.9031997919082642, "logits/rejected": 6.393537521362305, "logps/chosen": -402.1159362792969, "logps/rejected": -635.5774536132812, "loss": 4.7445, "rewards/accuracies": 0.25, "rewards/chosen": 5.350074291229248, "rewards/margins": -3.6923584938049316, "rewards/rejected": 9.04243278503418, "step": 83 }, { "epoch": 0.05225505443234837, "grad_norm": 49.580406188964844, "learning_rate": 8.695652173913044e-07, "logits/chosen": -1.1733779907226562, "logits/rejected": 3.845046281814575, "logps/chosen": -309.610595703125, "logps/rejected": -534.025634765625, "loss": 5.856, "rewards/accuracies": 0.25, "rewards/chosen": 3.196354866027832, "rewards/margins": -4.153650283813477, "rewards/rejected": 7.350005626678467, "step": 84 }, { "epoch": 0.05287713841368585, "grad_norm": 42.38566970825195, "learning_rate": 8.799171842650105e-07, "logits/chosen": -0.9000750780105591, "logits/rejected": 3.985197067260742, "logps/chosen": -332.3162841796875, "logps/rejected": -567.3726806640625, "loss": 4.3539, "rewards/accuracies": 0.25, "rewards/chosen": 6.791793346405029, "rewards/margins": -3.3670926094055176, "rewards/rejected": 10.158885955810547, "step": 85 }, { "epoch": 0.053499222395023326, "grad_norm": 54.692100524902344, "learning_rate": 8.902691511387164e-07, "logits/chosen": 0.4778970181941986, "logits/rejected": 4.724665641784668, "logps/chosen": -448.77117919921875, "logps/rejected": -606.9342041015625, "loss": 5.7404, "rewards/accuracies": 0.125, "rewards/chosen": 6.051287651062012, "rewards/margins": -4.640262126922607, "rewards/rejected": 10.691549301147461, "step": 86 }, { "epoch": 0.05412130637636081, "grad_norm": 56.3837890625, "learning_rate": 9.006211180124224e-07, "logits/chosen": -1.7641148567199707, "logits/rejected": 4.488871097564697, "logps/chosen": -210.34579467773438, "logps/rejected": -489.9498291015625, "loss": 3.2886, "rewards/accuracies": 0.25, "rewards/chosen": 3.801412343978882, "rewards/margins": -2.549666404724121, "rewards/rejected": 6.351078987121582, "step": 87 }, { "epoch": 0.05474339035769829, "grad_norm": 37.51689147949219, "learning_rate": 9.109730848861284e-07, "logits/chosen": 0.7698231935501099, "logits/rejected": 3.520718574523926, "logps/chosen": -422.2262268066406, "logps/rejected": -491.434814453125, "loss": 2.6793, "rewards/accuracies": 0.5, "rewards/chosen": 4.767912864685059, "rewards/margins": 0.8315317630767822, "rewards/rejected": 3.9363808631896973, "step": 88 }, { "epoch": 0.05536547433903577, "grad_norm": 43.7822380065918, "learning_rate": 9.213250517598345e-07, "logits/chosen": -0.11454379558563232, "logits/rejected": 3.4940781593322754, "logps/chosen": -329.4980163574219, "logps/rejected": -485.48779296875, "loss": 2.8667, "rewards/accuracies": 0.25, "rewards/chosen": 3.1187541484832764, "rewards/margins": -1.1736748218536377, "rewards/rejected": 4.292428970336914, "step": 89 }, { "epoch": 0.05598755832037325, "grad_norm": 52.58222961425781, "learning_rate": 9.316770186335404e-07, "logits/chosen": 1.3086739778518677, "logits/rejected": 5.073335647583008, "logps/chosen": -393.6297912597656, "logps/rejected": -569.0318603515625, "loss": 3.7375, "rewards/accuracies": 0.125, "rewards/chosen": 6.170962810516357, "rewards/margins": -3.5130481719970703, "rewards/rejected": 9.684011459350586, "step": 90 }, { "epoch": 0.05660964230171073, "grad_norm": 31.68039894104004, "learning_rate": 9.420289855072465e-07, "logits/chosen": 0.22252234816551208, "logits/rejected": 2.2449254989624023, "logps/chosen": -322.2689208984375, "logps/rejected": -423.7203063964844, "loss": 2.0681, "rewards/accuracies": 0.25, "rewards/chosen": 4.651986122131348, "rewards/margins": -0.39517509937286377, "rewards/rejected": 5.047161102294922, "step": 91 }, { "epoch": 0.05723172628304821, "grad_norm": 41.90281295776367, "learning_rate": 9.523809523809525e-07, "logits/chosen": 1.7530864477157593, "logits/rejected": 5.830805778503418, "logps/chosen": -420.7454833984375, "logps/rejected": -628.6724853515625, "loss": 3.2832, "rewards/accuracies": 0.5, "rewards/chosen": 9.385915756225586, "rewards/margins": -1.5105422735214233, "rewards/rejected": 10.896458625793457, "step": 92 }, { "epoch": 0.05785381026438569, "grad_norm": 24.812217712402344, "learning_rate": 9.627329192546585e-07, "logits/chosen": 1.9809218645095825, "logits/rejected": 4.16044807434082, "logps/chosen": -430.45709228515625, "logps/rejected": -506.4284362792969, "loss": 1.266, "rewards/accuracies": 0.625, "rewards/chosen": 8.652717590332031, "rewards/margins": 2.367443561553955, "rewards/rejected": 6.285274982452393, "step": 93 }, { "epoch": 0.05847589424572317, "grad_norm": 52.60491180419922, "learning_rate": 9.730848861283643e-07, "logits/chosen": 0.5998326539993286, "logits/rejected": 4.901703834533691, "logps/chosen": -406.40472412109375, "logps/rejected": -620.4981689453125, "loss": 4.0579, "rewards/accuracies": 0.125, "rewards/chosen": 4.539392471313477, "rewards/margins": -3.4850521087646484, "rewards/rejected": 8.024444580078125, "step": 94 }, { "epoch": 0.05909797822706065, "grad_norm": 44.85295867919922, "learning_rate": 9.834368530020705e-07, "logits/chosen": -2.097593307495117, "logits/rejected": 3.1550774574279785, "logps/chosen": -391.03338623046875, "logps/rejected": -641.2415771484375, "loss": 3.0355, "rewards/accuracies": 0.375, "rewards/chosen": 5.09652042388916, "rewards/margins": -2.0533454418182373, "rewards/rejected": 7.149866104125977, "step": 95 }, { "epoch": 0.059720062208398136, "grad_norm": 34.27623748779297, "learning_rate": 9.937888198757765e-07, "logits/chosen": 2.9386537075042725, "logits/rejected": 3.68379545211792, "logps/chosen": -584.134521484375, "logps/rejected": -609.106201171875, "loss": 1.5705, "rewards/accuracies": 0.75, "rewards/chosen": 5.0029168128967285, "rewards/margins": 0.11842000484466553, "rewards/rejected": 4.884496688842773, "step": 96 }, { "epoch": 0.060342146189735615, "grad_norm": 44.750606536865234, "learning_rate": 1.0041407867494825e-06, "logits/chosen": 2.349898099899292, "logits/rejected": 6.011685371398926, "logps/chosen": -319.87628173828125, "logps/rejected": -486.68499755859375, "loss": 2.4576, "rewards/accuracies": 0.25, "rewards/chosen": 4.2724761962890625, "rewards/margins": -1.7754751443862915, "rewards/rejected": 6.047951698303223, "step": 97 }, { "epoch": 0.060964230171073094, "grad_norm": 48.21299362182617, "learning_rate": 1.0144927536231885e-06, "logits/chosen": 0.6218348145484924, "logits/rejected": 4.305963516235352, "logps/chosen": -326.71710205078125, "logps/rejected": -464.59320068359375, "loss": 4.0852, "rewards/accuracies": 0.125, "rewards/chosen": 4.030767440795898, "rewards/margins": -3.5390217304229736, "rewards/rejected": 7.569788932800293, "step": 98 }, { "epoch": 0.06158631415241057, "grad_norm": 43.5555419921875, "learning_rate": 1.0248447204968944e-06, "logits/chosen": 2.1077942848205566, "logits/rejected": 4.135837078094482, "logps/chosen": -475.6229248046875, "logps/rejected": -545.8417358398438, "loss": 3.2673, "rewards/accuracies": 0.375, "rewards/chosen": 5.936098098754883, "rewards/margins": -0.997451663017273, "rewards/rejected": 6.933549880981445, "step": 99 }, { "epoch": 0.06220839813374806, "grad_norm": 60.58588790893555, "learning_rate": 1.0351966873706006e-06, "logits/chosen": 2.1906070709228516, "logits/rejected": 4.7897725105285645, "logps/chosen": -492.7605285644531, "logps/rejected": -564.39208984375, "loss": 4.7329, "rewards/accuracies": 0.125, "rewards/chosen": 5.5719757080078125, "rewards/margins": -4.470346450805664, "rewards/rejected": 10.042322158813477, "step": 100 }, { "epoch": 0.06283048211508553, "grad_norm": 48.09543228149414, "learning_rate": 1.0455486542443064e-06, "logits/chosen": 0.6987656354904175, "logits/rejected": 4.166409969329834, "logps/chosen": -367.01678466796875, "logps/rejected": -541.5540771484375, "loss": 4.6407, "rewards/accuracies": 0.25, "rewards/chosen": 4.648255348205566, "rewards/margins": -4.1871418952941895, "rewards/rejected": 8.835397720336914, "step": 101 }, { "epoch": 0.06345256609642301, "grad_norm": 57.54613494873047, "learning_rate": 1.0559006211180126e-06, "logits/chosen": -2.2215631008148193, "logits/rejected": 5.639945983886719, "logps/chosen": -292.84765625, "logps/rejected": -641.1038818359375, "loss": 5.1023, "rewards/accuracies": 0.25, "rewards/chosen": 3.9426522254943848, "rewards/margins": -4.445166110992432, "rewards/rejected": 8.387818336486816, "step": 102 }, { "epoch": 0.0640746500777605, "grad_norm": 46.35042190551758, "learning_rate": 1.0662525879917186e-06, "logits/chosen": 0.6544018387794495, "logits/rejected": 4.883418560028076, "logps/chosen": -323.3349914550781, "logps/rejected": -573.3865356445312, "loss": 3.4786, "rewards/accuracies": 0.25, "rewards/chosen": 6.362491130828857, "rewards/margins": -2.575894832611084, "rewards/rejected": 8.938385009765625, "step": 103 }, { "epoch": 0.06469673405909798, "grad_norm": 53.28401565551758, "learning_rate": 1.0766045548654246e-06, "logits/chosen": 2.349595069885254, "logits/rejected": 4.094552516937256, "logps/chosen": -496.2256774902344, "logps/rejected": -590.922119140625, "loss": 3.2819, "rewards/accuracies": 0.375, "rewards/chosen": 2.4656195640563965, "rewards/margins": -2.7044270038604736, "rewards/rejected": 5.170046806335449, "step": 104 }, { "epoch": 0.06531881804043546, "grad_norm": 41.93247985839844, "learning_rate": 1.0869565217391306e-06, "logits/chosen": 1.0588008165359497, "logits/rejected": 2.653573989868164, "logps/chosen": -380.8188781738281, "logps/rejected": -459.8533630371094, "loss": 1.8891, "rewards/accuracies": 0.375, "rewards/chosen": 6.556657791137695, "rewards/margins": -0.8650388717651367, "rewards/rejected": 7.421696662902832, "step": 105 }, { "epoch": 0.06594090202177294, "grad_norm": 59.02732467651367, "learning_rate": 1.0973084886128365e-06, "logits/chosen": 1.3528294563293457, "logits/rejected": 5.272040843963623, "logps/chosen": -232.5532684326172, "logps/rejected": -436.3426818847656, "loss": 3.2053, "rewards/accuracies": 0.0, "rewards/chosen": 0.45379501581192017, "rewards/margins": -3.016813278198242, "rewards/rejected": 3.4706084728240967, "step": 106 }, { "epoch": 0.06656298600311042, "grad_norm": 56.47092819213867, "learning_rate": 1.1076604554865425e-06, "logits/chosen": 0.5967641472816467, "logits/rejected": 1.849827527999878, "logps/chosen": -502.0181884765625, "logps/rejected": -542.710205078125, "loss": 5.3509, "rewards/accuracies": 0.125, "rewards/chosen": 4.310145854949951, "rewards/margins": -5.1838178634643555, "rewards/rejected": 9.493963241577148, "step": 107 }, { "epoch": 0.0671850699844479, "grad_norm": 29.14185905456543, "learning_rate": 1.1180124223602485e-06, "logits/chosen": 3.0689427852630615, "logits/rejected": 3.7151615619659424, "logps/chosen": -515.645263671875, "logps/rejected": -523.316650390625, "loss": 1.6792, "rewards/accuracies": 0.5, "rewards/chosen": 8.409460067749023, "rewards/margins": 1.357513189315796, "rewards/rejected": 7.051946640014648, "step": 108 }, { "epoch": 0.06780715396578538, "grad_norm": 41.52821731567383, "learning_rate": 1.1283643892339545e-06, "logits/chosen": 3.8132739067077637, "logits/rejected": 3.7674436569213867, "logps/chosen": -589.3350219726562, "logps/rejected": -575.3363037109375, "loss": 2.2836, "rewards/accuracies": 0.5, "rewards/chosen": 8.319855690002441, "rewards/margins": -0.695635974407196, "rewards/rejected": 9.015491485595703, "step": 109 }, { "epoch": 0.06842923794712286, "grad_norm": 32.027217864990234, "learning_rate": 1.1387163561076605e-06, "logits/chosen": 1.6463077068328857, "logits/rejected": 4.320730686187744, "logps/chosen": -380.8627014160156, "logps/rejected": -503.88323974609375, "loss": 1.8091, "rewards/accuracies": 0.5, "rewards/chosen": 7.096940040588379, "rewards/margins": 0.04935610294342041, "rewards/rejected": 7.047583103179932, "step": 110 }, { "epoch": 0.06905132192846034, "grad_norm": 48.38652038574219, "learning_rate": 1.1490683229813664e-06, "logits/chosen": 0.4883633255958557, "logits/rejected": 4.920735836029053, "logps/chosen": -403.420166015625, "logps/rejected": -578.6812133789062, "loss": 1.5954, "rewards/accuracies": 0.25, "rewards/chosen": 3.8030200004577637, "rewards/margins": 1.088423490524292, "rewards/rejected": 2.714596748352051, "step": 111 }, { "epoch": 0.06967340590979783, "grad_norm": 39.91889953613281, "learning_rate": 1.1594202898550726e-06, "logits/chosen": 2.2552788257598877, "logits/rejected": 4.922609329223633, "logps/chosen": -424.36431884765625, "logps/rejected": -576.1085205078125, "loss": 2.2137, "rewards/accuracies": 0.5, "rewards/chosen": 6.894935607910156, "rewards/margins": 0.45326733589172363, "rewards/rejected": 6.441668510437012, "step": 112 }, { "epoch": 0.07029548989113531, "grad_norm": 36.67121124267578, "learning_rate": 1.1697722567287784e-06, "logits/chosen": -3.018321990966797, "logits/rejected": 3.101332187652588, "logps/chosen": -302.11090087890625, "logps/rejected": -540.5518798828125, "loss": 3.3978, "rewards/accuracies": 0.375, "rewards/chosen": 3.1350018978118896, "rewards/margins": -2.6119015216827393, "rewards/rejected": 5.746903419494629, "step": 113 }, { "epoch": 0.07091757387247279, "grad_norm": 37.47822952270508, "learning_rate": 1.1801242236024846e-06, "logits/chosen": 0.36539775133132935, "logits/rejected": 2.5840260982513428, "logps/chosen": -302.42681884765625, "logps/rejected": -385.96075439453125, "loss": 2.6045, "rewards/accuracies": 0.375, "rewards/chosen": 2.869319438934326, "rewards/margins": -1.7174735069274902, "rewards/rejected": 4.586793422698975, "step": 114 }, { "epoch": 0.07153965785381027, "grad_norm": 61.6951904296875, "learning_rate": 1.1904761904761906e-06, "logits/chosen": 0.23400786519050598, "logits/rejected": 4.336287975311279, "logps/chosen": -468.264892578125, "logps/rejected": -585.5806884765625, "loss": 5.9258, "rewards/accuracies": 0.25, "rewards/chosen": 2.550875425338745, "rewards/margins": -5.330711841583252, "rewards/rejected": 7.881587028503418, "step": 115 }, { "epoch": 0.07216174183514774, "grad_norm": 42.67302703857422, "learning_rate": 1.2008281573498966e-06, "logits/chosen": -1.4592227935791016, "logits/rejected": 1.4536776542663574, "logps/chosen": -272.0027160644531, "logps/rejected": -449.3828125, "loss": 2.3933, "rewards/accuracies": 0.375, "rewards/chosen": 4.253082275390625, "rewards/margins": -0.5537877082824707, "rewards/rejected": 4.806870460510254, "step": 116 }, { "epoch": 0.07278382581648522, "grad_norm": 37.58478927612305, "learning_rate": 1.2111801242236026e-06, "logits/chosen": 0.2646111845970154, "logits/rejected": 3.028031826019287, "logps/chosen": -422.3641662597656, "logps/rejected": -519.9530029296875, "loss": 3.1383, "rewards/accuracies": 0.5, "rewards/chosen": 5.61953592300415, "rewards/margins": -1.9047925472259521, "rewards/rejected": 7.524328708648682, "step": 117 }, { "epoch": 0.0734059097978227, "grad_norm": 46.9578971862793, "learning_rate": 1.2215320910973085e-06, "logits/chosen": -1.7312917709350586, "logits/rejected": 4.160816192626953, "logps/chosen": -374.05706787109375, "logps/rejected": -589.3055419921875, "loss": 3.5799, "rewards/accuracies": 0.125, "rewards/chosen": 6.456633567810059, "rewards/margins": -2.8331234455108643, "rewards/rejected": 9.289756774902344, "step": 118 }, { "epoch": 0.07402799377916018, "grad_norm": 48.71473693847656, "learning_rate": 1.2318840579710147e-06, "logits/chosen": -0.895279049873352, "logits/rejected": 4.6400885581970215, "logps/chosen": -271.0947265625, "logps/rejected": -575.875244140625, "loss": 3.2339, "rewards/accuracies": 0.25, "rewards/chosen": 1.412948489189148, "rewards/margins": -1.9902526140213013, "rewards/rejected": 3.403201103210449, "step": 119 }, { "epoch": 0.07465007776049767, "grad_norm": 53.11747360229492, "learning_rate": 1.2422360248447205e-06, "logits/chosen": -1.3114756345748901, "logits/rejected": 3.2517542839050293, "logps/chosen": -284.1141357421875, "logps/rejected": -538.9307250976562, "loss": 2.6922, "rewards/accuracies": 0.25, "rewards/chosen": 2.876573085784912, "rewards/margins": -2.385010004043579, "rewards/rejected": 5.26158332824707, "step": 120 }, { "epoch": 0.07527216174183515, "grad_norm": 32.17045974731445, "learning_rate": 1.2525879917184267e-06, "logits/chosen": 3.688138961791992, "logits/rejected": 4.196073055267334, "logps/chosen": -517.5601196289062, "logps/rejected": -592.2069091796875, "loss": 1.1033, "rewards/accuracies": 0.75, "rewards/chosen": 7.916095733642578, "rewards/margins": 0.7442711591720581, "rewards/rejected": 7.1718244552612305, "step": 121 }, { "epoch": 0.07589424572317263, "grad_norm": 41.10165786743164, "learning_rate": 1.2629399585921327e-06, "logits/chosen": 1.2101017236709595, "logits/rejected": 2.4238104820251465, "logps/chosen": -386.5546569824219, "logps/rejected": -454.49859619140625, "loss": 1.9363, "rewards/accuracies": 0.375, "rewards/chosen": 5.121078968048096, "rewards/margins": 0.4273524284362793, "rewards/rejected": 4.693726539611816, "step": 122 }, { "epoch": 0.07651632970451011, "grad_norm": 40.01031494140625, "learning_rate": 1.2732919254658385e-06, "logits/chosen": 2.7430291175842285, "logits/rejected": 4.701658248901367, "logps/chosen": -423.6866455078125, "logps/rejected": -474.761962890625, "loss": 2.2463, "rewards/accuracies": 0.5, "rewards/chosen": 4.223697185516357, "rewards/margins": -1.0474417209625244, "rewards/rejected": 5.271138668060303, "step": 123 }, { "epoch": 0.07713841368584759, "grad_norm": 47.88119125366211, "learning_rate": 1.2836438923395447e-06, "logits/chosen": -1.60335111618042, "logits/rejected": 3.7535877227783203, "logps/chosen": -201.6946258544922, "logps/rejected": -488.87969970703125, "loss": 3.3213, "rewards/accuracies": 0.375, "rewards/chosen": 2.568941593170166, "rewards/margins": -1.890028715133667, "rewards/rejected": 4.458970069885254, "step": 124 }, { "epoch": 0.07776049766718507, "grad_norm": 41.96632385253906, "learning_rate": 1.2939958592132506e-06, "logits/chosen": 0.8452179431915283, "logits/rejected": 2.755337715148926, "logps/chosen": -344.67462158203125, "logps/rejected": -452.2144775390625, "loss": 1.6188, "rewards/accuracies": 0.375, "rewards/chosen": 4.238457679748535, "rewards/margins": -1.0320303440093994, "rewards/rejected": 5.270488262176514, "step": 125 }, { "epoch": 0.07838258164852255, "grad_norm": 45.34007263183594, "learning_rate": 1.3043478260869566e-06, "logits/chosen": 1.386866569519043, "logits/rejected": 4.8691511154174805, "logps/chosen": -366.2882995605469, "logps/rejected": -543.7435302734375, "loss": 2.9963, "rewards/accuracies": 0.25, "rewards/chosen": 3.628605842590332, "rewards/margins": -2.35605525970459, "rewards/rejected": 5.984661102294922, "step": 126 }, { "epoch": 0.07900466562986003, "grad_norm": 31.151798248291016, "learning_rate": 1.3146997929606626e-06, "logits/chosen": 3.52664852142334, "logits/rejected": 4.250565528869629, "logps/chosen": -458.6521911621094, "logps/rejected": -533.302490234375, "loss": 2.3654, "rewards/accuracies": 0.625, "rewards/chosen": 7.884675025939941, "rewards/margins": 1.5698320865631104, "rewards/rejected": 6.314842700958252, "step": 127 }, { "epoch": 0.0796267496111975, "grad_norm": 47.57011032104492, "learning_rate": 1.3250517598343688e-06, "logits/chosen": 3.230344533920288, "logits/rejected": 6.1636643409729, "logps/chosen": -435.8682861328125, "logps/rejected": -526.7186889648438, "loss": 3.2785, "rewards/accuracies": 0.25, "rewards/chosen": 5.82703971862793, "rewards/margins": -2.0120530128479004, "rewards/rejected": 7.839093208312988, "step": 128 }, { "epoch": 0.080248833592535, "grad_norm": 41.736793518066406, "learning_rate": 1.3354037267080746e-06, "logits/chosen": 1.6107079982757568, "logits/rejected": 4.2225165367126465, "logps/chosen": -449.3865966796875, "logps/rejected": -570.679931640625, "loss": 3.5626, "rewards/accuracies": 0.375, "rewards/chosen": 6.096354007720947, "rewards/margins": -1.7619743347167969, "rewards/rejected": 7.858328819274902, "step": 129 }, { "epoch": 0.08087091757387248, "grad_norm": 28.421558380126953, "learning_rate": 1.3457556935817806e-06, "logits/chosen": -2.727404832839966, "logits/rejected": 4.425556659698486, "logps/chosen": -231.41461181640625, "logps/rejected": -580.169921875, "loss": 2.6388, "rewards/accuracies": 0.625, "rewards/chosen": 4.525047779083252, "rewards/margins": -0.4674373269081116, "rewards/rejected": 4.992485046386719, "step": 130 }, { "epoch": 0.08149300155520996, "grad_norm": 34.796287536621094, "learning_rate": 1.3561076604554865e-06, "logits/chosen": 1.1071691513061523, "logits/rejected": 3.6799988746643066, "logps/chosen": -383.92962646484375, "logps/rejected": -528.1698608398438, "loss": 3.2315, "rewards/accuracies": 0.5, "rewards/chosen": 6.100879669189453, "rewards/margins": -1.634765863418579, "rewards/rejected": 7.7356462478637695, "step": 131 }, { "epoch": 0.08211508553654744, "grad_norm": 49.1640625, "learning_rate": 1.3664596273291927e-06, "logits/chosen": 0.9404339790344238, "logits/rejected": 5.779233455657959, "logps/chosen": -365.7359313964844, "logps/rejected": -657.7894287109375, "loss": 4.144, "rewards/accuracies": 0.375, "rewards/chosen": 3.550569534301758, "rewards/margins": -2.898592948913574, "rewards/rejected": 6.449162483215332, "step": 132 }, { "epoch": 0.08273716951788491, "grad_norm": 33.71388626098633, "learning_rate": 1.3768115942028987e-06, "logits/chosen": -1.5326220989227295, "logits/rejected": 4.281208038330078, "logps/chosen": -145.20849609375, "logps/rejected": -438.5946350097656, "loss": 2.0325, "rewards/accuracies": 0.5, "rewards/chosen": 2.4517431259155273, "rewards/margins": -0.3625298738479614, "rewards/rejected": 2.8142733573913574, "step": 133 }, { "epoch": 0.0833592534992224, "grad_norm": 47.125850677490234, "learning_rate": 1.3871635610766047e-06, "logits/chosen": 1.5233802795410156, "logits/rejected": 4.672469615936279, "logps/chosen": -411.7640380859375, "logps/rejected": -596.0722045898438, "loss": 4.3379, "rewards/accuracies": 0.5, "rewards/chosen": 4.727325439453125, "rewards/margins": -1.5561026334762573, "rewards/rejected": 6.283427715301514, "step": 134 }, { "epoch": 0.08398133748055987, "grad_norm": 40.958187103271484, "learning_rate": 1.3975155279503105e-06, "logits/chosen": 1.8504540920257568, "logits/rejected": 5.391268730163574, "logps/chosen": -445.0400085449219, "logps/rejected": -602.6244506835938, "loss": 1.6739, "rewards/accuracies": 0.625, "rewards/chosen": 4.19703483581543, "rewards/margins": -0.15884339809417725, "rewards/rejected": 4.3558783531188965, "step": 135 }, { "epoch": 0.08460342146189735, "grad_norm": 55.949459075927734, "learning_rate": 1.4078674948240167e-06, "logits/chosen": 1.1511344909667969, "logits/rejected": 4.316564559936523, "logps/chosen": -389.3515319824219, "logps/rejected": -509.3475036621094, "loss": 2.3386, "rewards/accuracies": 0.25, "rewards/chosen": 3.0395596027374268, "rewards/margins": -0.5732651352882385, "rewards/rejected": 3.6128249168395996, "step": 136 }, { "epoch": 0.08522550544323483, "grad_norm": 39.164981842041016, "learning_rate": 1.4182194616977226e-06, "logits/chosen": 1.738755226135254, "logits/rejected": 4.727439880371094, "logps/chosen": -352.5355529785156, "logps/rejected": -513.2650146484375, "loss": 2.3329, "rewards/accuracies": 0.625, "rewards/chosen": 4.571519374847412, "rewards/margins": 0.1314365565776825, "rewards/rejected": 4.440083026885986, "step": 137 }, { "epoch": 0.08584758942457232, "grad_norm": 42.70882034301758, "learning_rate": 1.4285714285714286e-06, "logits/chosen": 0.34659343957901, "logits/rejected": 3.817269802093506, "logps/chosen": -473.480224609375, "logps/rejected": -662.4795532226562, "loss": 2.6746, "rewards/accuracies": 0.625, "rewards/chosen": 5.974919319152832, "rewards/margins": -0.8572205305099487, "rewards/rejected": 6.832139492034912, "step": 138 }, { "epoch": 0.0864696734059098, "grad_norm": 40.75260543823242, "learning_rate": 1.4389233954451348e-06, "logits/chosen": -0.08259952068328857, "logits/rejected": 3.267246723175049, "logps/chosen": -385.9140319824219, "logps/rejected": -604.5545654296875, "loss": 2.6701, "rewards/accuracies": 0.5, "rewards/chosen": 3.4592254161834717, "rewards/margins": -0.001993894577026367, "rewards/rejected": 3.461219072341919, "step": 139 }, { "epoch": 0.08709175738724728, "grad_norm": 43.442569732666016, "learning_rate": 1.4492753623188408e-06, "logits/chosen": 0.29093819856643677, "logits/rejected": 4.218543529510498, "logps/chosen": -445.10443115234375, "logps/rejected": -656.73046875, "loss": 4.0458, "rewards/accuracies": 0.5, "rewards/chosen": 6.9249982833862305, "rewards/margins": -2.191218852996826, "rewards/rejected": 9.116217613220215, "step": 140 }, { "epoch": 0.08771384136858476, "grad_norm": 39.78031921386719, "learning_rate": 1.4596273291925466e-06, "logits/chosen": 0.6515867710113525, "logits/rejected": 4.114278316497803, "logps/chosen": -373.6202697753906, "logps/rejected": -598.4118041992188, "loss": 0.8933, "rewards/accuracies": 0.5, "rewards/chosen": 4.184488773345947, "rewards/margins": 1.4812554121017456, "rewards/rejected": 2.703233242034912, "step": 141 }, { "epoch": 0.08833592534992224, "grad_norm": 43.94890594482422, "learning_rate": 1.4699792960662526e-06, "logits/chosen": 1.9376635551452637, "logits/rejected": 5.742636680603027, "logps/chosen": -368.154541015625, "logps/rejected": -523.9427490234375, "loss": 3.4754, "rewards/accuracies": 0.375, "rewards/chosen": 2.491053581237793, "rewards/margins": -2.008192300796509, "rewards/rejected": 4.499245643615723, "step": 142 }, { "epoch": 0.08895800933125972, "grad_norm": 49.16591262817383, "learning_rate": 1.4803312629399588e-06, "logits/chosen": 0.4437107741832733, "logits/rejected": 2.9630513191223145, "logps/chosen": -416.73028564453125, "logps/rejected": -574.045654296875, "loss": 2.5663, "rewards/accuracies": 0.375, "rewards/chosen": 6.391658306121826, "rewards/margins": -1.0516711473464966, "rewards/rejected": 7.443329334259033, "step": 143 }, { "epoch": 0.0895800933125972, "grad_norm": 25.008758544921875, "learning_rate": 1.4906832298136647e-06, "logits/chosen": -0.1724388599395752, "logits/rejected": 3.9658737182617188, "logps/chosen": -400.5678405761719, "logps/rejected": -579.6800537109375, "loss": 0.6586, "rewards/accuracies": 0.625, "rewards/chosen": 0.8572346568107605, "rewards/margins": 1.3512787818908691, "rewards/rejected": -0.49404406547546387, "step": 144 }, { "epoch": 0.09020217729393468, "grad_norm": 47.504005432128906, "learning_rate": 1.5010351966873707e-06, "logits/chosen": -1.509745717048645, "logits/rejected": 5.010717391967773, "logps/chosen": -196.65380859375, "logps/rejected": -495.7618408203125, "loss": 3.8704, "rewards/accuracies": 0.25, "rewards/chosen": 2.165783643722534, "rewards/margins": -2.8671915531158447, "rewards/rejected": 5.032975196838379, "step": 145 }, { "epoch": 0.09082426127527216, "grad_norm": 37.12325668334961, "learning_rate": 1.5113871635610767e-06, "logits/chosen": 0.7813265919685364, "logits/rejected": 2.518862724304199, "logps/chosen": -364.07196044921875, "logps/rejected": -468.7448425292969, "loss": 1.7235, "rewards/accuracies": 0.5, "rewards/chosen": 1.2074331045150757, "rewards/margins": 0.04681295156478882, "rewards/rejected": 1.1606202125549316, "step": 146 }, { "epoch": 0.09144634525660965, "grad_norm": 38.50062942504883, "learning_rate": 1.521739130434783e-06, "logits/chosen": -0.4486757516860962, "logits/rejected": 3.2172610759735107, "logps/chosen": -288.96588134765625, "logps/rejected": -525.9904174804688, "loss": 2.485, "rewards/accuracies": 0.5, "rewards/chosen": -0.029601097106933594, "rewards/margins": -1.5874814987182617, "rewards/rejected": 1.5578804016113281, "step": 147 }, { "epoch": 0.09206842923794713, "grad_norm": 45.27934265136719, "learning_rate": 1.5320910973084887e-06, "logits/chosen": 0.4484668970108032, "logits/rejected": 4.418857097625732, "logps/chosen": -344.88507080078125, "logps/rejected": -540.1593017578125, "loss": 3.5602, "rewards/accuracies": 0.25, "rewards/chosen": 4.541347503662109, "rewards/margins": -2.862128257751465, "rewards/rejected": 7.403475761413574, "step": 148 }, { "epoch": 0.0926905132192846, "grad_norm": 48.022586822509766, "learning_rate": 1.5424430641821947e-06, "logits/chosen": -0.05514061450958252, "logits/rejected": 4.1575236320495605, "logps/chosen": -391.945556640625, "logps/rejected": -546.39453125, "loss": 3.2204, "rewards/accuracies": 0.25, "rewards/chosen": 3.5273935794830322, "rewards/margins": -0.9070781469345093, "rewards/rejected": 4.43447208404541, "step": 149 }, { "epoch": 0.09331259720062209, "grad_norm": 51.76631164550781, "learning_rate": 1.5527950310559006e-06, "logits/chosen": 0.8921202421188354, "logits/rejected": 4.766198635101318, "logps/chosen": -428.14654541015625, "logps/rejected": -611.3369140625, "loss": 1.9622, "rewards/accuracies": 0.125, "rewards/chosen": 6.118347644805908, "rewards/margins": -0.13962364196777344, "rewards/rejected": 6.257971286773682, "step": 150 }, { "epoch": 0.09393468118195956, "grad_norm": 37.54130935668945, "learning_rate": 1.5631469979296068e-06, "logits/chosen": 1.5896421670913696, "logits/rejected": 2.176250457763672, "logps/chosen": -477.7599792480469, "logps/rejected": -511.2159423828125, "loss": 2.5037, "rewards/accuracies": 0.625, "rewards/chosen": 6.544580459594727, "rewards/margins": 2.958336591720581, "rewards/rejected": 3.5862441062927246, "step": 151 }, { "epoch": 0.09455676516329704, "grad_norm": 36.237796783447266, "learning_rate": 1.5734989648033128e-06, "logits/chosen": 1.5189558267593384, "logits/rejected": 4.844598770141602, "logps/chosen": -324.1607360839844, "logps/rejected": -502.5072326660156, "loss": 1.4235, "rewards/accuracies": 0.625, "rewards/chosen": 5.578876495361328, "rewards/margins": 1.3157144784927368, "rewards/rejected": 4.263161659240723, "step": 152 }, { "epoch": 0.09517884914463452, "grad_norm": 41.82114791870117, "learning_rate": 1.5838509316770188e-06, "logits/chosen": -0.6398723125457764, "logits/rejected": 2.4939658641815186, "logps/chosen": -355.7408752441406, "logps/rejected": -513.5419311523438, "loss": 1.737, "rewards/accuracies": 0.5, "rewards/chosen": 2.5448246002197266, "rewards/margins": -0.07166877388954163, "rewards/rejected": 2.6164932250976562, "step": 153 }, { "epoch": 0.095800933125972, "grad_norm": 37.8273811340332, "learning_rate": 1.5942028985507246e-06, "logits/chosen": 3.02529239654541, "logits/rejected": 4.839844226837158, "logps/chosen": -499.8304748535156, "logps/rejected": -633.421875, "loss": 2.5997, "rewards/accuracies": 0.625, "rewards/chosen": 8.184986114501953, "rewards/margins": 0.4161781072616577, "rewards/rejected": 7.768807411193848, "step": 154 }, { "epoch": 0.09642301710730948, "grad_norm": 50.805259704589844, "learning_rate": 1.6045548654244308e-06, "logits/chosen": -1.5919575691223145, "logits/rejected": 2.0901906490325928, "logps/chosen": -286.54669189453125, "logps/rejected": -470.29364013671875, "loss": 1.8801, "rewards/accuracies": 0.375, "rewards/chosen": -0.2827115058898926, "rewards/margins": -1.2794893980026245, "rewards/rejected": 0.9967778921127319, "step": 155 }, { "epoch": 0.09704510108864697, "grad_norm": 38.15058135986328, "learning_rate": 1.6149068322981367e-06, "logits/chosen": 1.0436800718307495, "logits/rejected": 5.138459205627441, "logps/chosen": -467.70269775390625, "logps/rejected": -661.43115234375, "loss": 2.0, "rewards/accuracies": 0.625, "rewards/chosen": 7.959029674530029, "rewards/margins": 1.6471713781356812, "rewards/rejected": 6.311858654022217, "step": 156 }, { "epoch": 0.09766718506998445, "grad_norm": 53.46415710449219, "learning_rate": 1.6252587991718427e-06, "logits/chosen": -0.7396149039268494, "logits/rejected": 4.500385284423828, "logps/chosen": -277.4739990234375, "logps/rejected": -541.1393432617188, "loss": 4.5397, "rewards/accuracies": 0.125, "rewards/chosen": 4.4420905113220215, "rewards/margins": -4.118373394012451, "rewards/rejected": 8.560463905334473, "step": 157 }, { "epoch": 0.09828926905132193, "grad_norm": 56.35954666137695, "learning_rate": 1.635610766045549e-06, "logits/chosen": -0.8244016170501709, "logits/rejected": 2.7302236557006836, "logps/chosen": -430.4935302734375, "logps/rejected": -605.2032470703125, "loss": 7.1248, "rewards/accuracies": 0.25, "rewards/chosen": 2.587373971939087, "rewards/margins": -6.728273391723633, "rewards/rejected": 9.31564712524414, "step": 158 }, { "epoch": 0.09891135303265941, "grad_norm": 35.76237106323242, "learning_rate": 1.645962732919255e-06, "logits/chosen": 0.3610566258430481, "logits/rejected": 4.236985206604004, "logps/chosen": -247.51239013671875, "logps/rejected": -494.441650390625, "loss": 1.5019, "rewards/accuracies": 0.625, "rewards/chosen": 2.1166188716888428, "rewards/margins": 0.9880322217941284, "rewards/rejected": 1.1285866498947144, "step": 159 }, { "epoch": 0.09953343701399689, "grad_norm": 33.57570266723633, "learning_rate": 1.6563146997929607e-06, "logits/chosen": 0.2717881500720978, "logits/rejected": 3.3716044425964355, "logps/chosen": -386.5144348144531, "logps/rejected": -545.7966918945312, "loss": 0.6394, "rewards/accuracies": 0.625, "rewards/chosen": 3.3963658809661865, "rewards/margins": 1.2866623401641846, "rewards/rejected": 2.109703779220581, "step": 160 }, { "epoch": 0.10015552099533437, "grad_norm": 41.08350372314453, "learning_rate": 1.6666666666666667e-06, "logits/chosen": 0.4015045762062073, "logits/rejected": 2.754533052444458, "logps/chosen": -514.9000244140625, "logps/rejected": -614.388916015625, "loss": 1.4105, "rewards/accuracies": 0.25, "rewards/chosen": 4.394747257232666, "rewards/margins": -0.1185079962015152, "rewards/rejected": 4.5132551193237305, "step": 161 }, { "epoch": 0.10077760497667185, "grad_norm": 30.24134635925293, "learning_rate": 1.6770186335403729e-06, "logits/chosen": 3.017308235168457, "logits/rejected": 5.260052680969238, "logps/chosen": -510.5013427734375, "logps/rejected": -667.1070556640625, "loss": 1.3398, "rewards/accuracies": 0.625, "rewards/chosen": 7.266972541809082, "rewards/margins": 4.063735485076904, "rewards/rejected": 3.2032370567321777, "step": 162 }, { "epoch": 0.10139968895800933, "grad_norm": 53.645294189453125, "learning_rate": 1.6873706004140788e-06, "logits/chosen": 1.5133259296417236, "logits/rejected": 3.539081573486328, "logps/chosen": -507.55078125, "logps/rejected": -652.8729248046875, "loss": 2.9854, "rewards/accuracies": 0.5, "rewards/chosen": 6.406628608703613, "rewards/margins": -1.5444564819335938, "rewards/rejected": 7.951085567474365, "step": 163 }, { "epoch": 0.1020217729393468, "grad_norm": 18.941761016845703, "learning_rate": 1.6977225672877848e-06, "logits/chosen": 1.6392831802368164, "logits/rejected": 3.4583234786987305, "logps/chosen": -351.8995666503906, "logps/rejected": -475.34564208984375, "loss": 0.6281, "rewards/accuracies": 0.875, "rewards/chosen": 5.116458892822266, "rewards/margins": 4.187784671783447, "rewards/rejected": 0.9286739826202393, "step": 164 }, { "epoch": 0.1026438569206843, "grad_norm": 31.994827270507812, "learning_rate": 1.7080745341614908e-06, "logits/chosen": -1.536903738975525, "logits/rejected": 1.0030691623687744, "logps/chosen": -365.28912353515625, "logps/rejected": -476.5758361816406, "loss": 0.6919, "rewards/accuracies": 0.625, "rewards/chosen": 2.6238882541656494, "rewards/margins": 2.114072799682617, "rewards/rejected": 0.5098155736923218, "step": 165 }, { "epoch": 0.10326594090202178, "grad_norm": 54.416507720947266, "learning_rate": 1.718426501035197e-06, "logits/chosen": 0.15935613214969635, "logits/rejected": 5.428621292114258, "logps/chosen": -346.4788818359375, "logps/rejected": -621.9774169921875, "loss": 3.0793, "rewards/accuracies": 0.25, "rewards/chosen": 5.3431501388549805, "rewards/margins": -1.2562695741653442, "rewards/rejected": 6.599419593811035, "step": 166 }, { "epoch": 0.10388802488335926, "grad_norm": 56.326568603515625, "learning_rate": 1.7287784679089028e-06, "logits/chosen": 0.3369348645210266, "logits/rejected": 4.614073276519775, "logps/chosen": -442.0284423828125, "logps/rejected": -650.9850463867188, "loss": 5.0816, "rewards/accuracies": 0.25, "rewards/chosen": 3.0569558143615723, "rewards/margins": -3.6081981658935547, "rewards/rejected": 6.665153503417969, "step": 167 }, { "epoch": 0.10451010886469674, "grad_norm": 43.598472595214844, "learning_rate": 1.7391304347826088e-06, "logits/chosen": 1.4294483661651611, "logits/rejected": 3.848559617996216, "logps/chosen": -428.77825927734375, "logps/rejected": -571.7686157226562, "loss": 1.8859, "rewards/accuracies": 0.5, "rewards/chosen": 2.6971874237060547, "rewards/margins": 0.7028264403343201, "rewards/rejected": 1.9943610429763794, "step": 168 }, { "epoch": 0.10513219284603421, "grad_norm": 33.3597412109375, "learning_rate": 1.7494824016563147e-06, "logits/chosen": 0.0816267803311348, "logits/rejected": 3.4725728034973145, "logps/chosen": -398.22998046875, "logps/rejected": -560.8500366210938, "loss": 2.7753, "rewards/accuracies": 0.625, "rewards/chosen": 2.8912577629089355, "rewards/margins": 0.11649256944656372, "rewards/rejected": 2.7747652530670166, "step": 169 }, { "epoch": 0.1057542768273717, "grad_norm": 22.159027099609375, "learning_rate": 1.759834368530021e-06, "logits/chosen": 1.665263056755066, "logits/rejected": 1.8744968175888062, "logps/chosen": -511.35595703125, "logps/rejected": -522.4755249023438, "loss": 0.3666, "rewards/accuracies": 0.875, "rewards/chosen": 5.932845115661621, "rewards/margins": 3.839456081390381, "rewards/rejected": 2.093388557434082, "step": 170 }, { "epoch": 0.10637636080870917, "grad_norm": 32.081825256347656, "learning_rate": 1.770186335403727e-06, "logits/chosen": -2.3348941802978516, "logits/rejected": 3.1992275714874268, "logps/chosen": -373.0357666015625, "logps/rejected": -653.9948120117188, "loss": 1.5876, "rewards/accuracies": 0.75, "rewards/chosen": 3.7468457221984863, "rewards/margins": 1.3742190599441528, "rewards/rejected": 2.372626781463623, "step": 171 }, { "epoch": 0.10699844479004665, "grad_norm": 34.0040397644043, "learning_rate": 1.780538302277433e-06, "logits/chosen": 1.7993035316467285, "logits/rejected": 4.404562950134277, "logps/chosen": -417.0467529296875, "logps/rejected": -575.447021484375, "loss": 2.1066, "rewards/accuracies": 0.75, "rewards/chosen": 2.4859135150909424, "rewards/margins": 2.4089438915252686, "rewards/rejected": 0.07696938514709473, "step": 172 }, { "epoch": 0.10762052877138413, "grad_norm": 24.10268783569336, "learning_rate": 1.7908902691511387e-06, "logits/chosen": 2.498624801635742, "logits/rejected": 3.5662894248962402, "logps/chosen": -599.74169921875, "logps/rejected": -655.7711181640625, "loss": 0.2948, "rewards/accuracies": 0.75, "rewards/chosen": 4.151212692260742, "rewards/margins": 4.251218795776367, "rewards/rejected": -0.10000598430633545, "step": 173 }, { "epoch": 0.10824261275272162, "grad_norm": 51.85417556762695, "learning_rate": 1.8012422360248449e-06, "logits/chosen": -1.7430806159973145, "logits/rejected": 3.780117988586426, "logps/chosen": -350.8195495605469, "logps/rejected": -647.4478149414062, "loss": 3.9011, "rewards/accuracies": 0.625, "rewards/chosen": 2.512604236602783, "rewards/margins": -1.8627538681030273, "rewards/rejected": 4.375357627868652, "step": 174 }, { "epoch": 0.1088646967340591, "grad_norm": 41.917449951171875, "learning_rate": 1.8115942028985508e-06, "logits/chosen": -1.1673665046691895, "logits/rejected": 3.871638298034668, "logps/chosen": -189.32254028320312, "logps/rejected": -495.197998046875, "loss": 4.4235, "rewards/accuracies": 0.5, "rewards/chosen": 1.9024066925048828, "rewards/margins": -1.6625474691390991, "rewards/rejected": 3.5649542808532715, "step": 175 }, { "epoch": 0.10948678071539658, "grad_norm": 50.83918762207031, "learning_rate": 1.8219461697722568e-06, "logits/chosen": 0.5478426814079285, "logits/rejected": 1.6672499179840088, "logps/chosen": -499.48370361328125, "logps/rejected": -573.6226806640625, "loss": 3.8286, "rewards/accuracies": 0.375, "rewards/chosen": 3.3958325386047363, "rewards/margins": -2.7034497261047363, "rewards/rejected": 6.0992817878723145, "step": 176 }, { "epoch": 0.11010886469673406, "grad_norm": 34.604827880859375, "learning_rate": 1.832298136645963e-06, "logits/chosen": 0.9112688302993774, "logits/rejected": 4.5739922523498535, "logps/chosen": -432.5416259765625, "logps/rejected": -530.7010498046875, "loss": 1.156, "rewards/accuracies": 0.625, "rewards/chosen": 3.8393948078155518, "rewards/margins": 1.3490967750549316, "rewards/rejected": 2.490297794342041, "step": 177 }, { "epoch": 0.11073094867807154, "grad_norm": 39.49983215332031, "learning_rate": 1.842650103519669e-06, "logits/chosen": -0.8858106136322021, "logits/rejected": 5.029402732849121, "logps/chosen": -205.45779418945312, "logps/rejected": -511.8585205078125, "loss": 2.0445, "rewards/accuracies": 0.625, "rewards/chosen": 0.839600682258606, "rewards/margins": 1.14154851436615, "rewards/rejected": -0.30194801092147827, "step": 178 }, { "epoch": 0.11135303265940902, "grad_norm": 43.34393310546875, "learning_rate": 1.8530020703933748e-06, "logits/chosen": 2.5536022186279297, "logits/rejected": 5.413421630859375, "logps/chosen": -391.5511474609375, "logps/rejected": -573.38818359375, "loss": 1.7285, "rewards/accuracies": 0.5, "rewards/chosen": 4.927746295928955, "rewards/margins": 0.048822566866874695, "rewards/rejected": 4.878923416137695, "step": 179 }, { "epoch": 0.1119751166407465, "grad_norm": 51.95783615112305, "learning_rate": 1.8633540372670808e-06, "logits/chosen": -1.940110206604004, "logits/rejected": 2.500943183898926, "logps/chosen": -327.7534484863281, "logps/rejected": -551.9661865234375, "loss": 5.6805, "rewards/accuracies": 0.375, "rewards/chosen": 5.075728416442871, "rewards/margins": -2.9619293212890625, "rewards/rejected": 8.037657737731934, "step": 180 }, { "epoch": 0.11259720062208398, "grad_norm": 28.379240036010742, "learning_rate": 1.873706004140787e-06, "logits/chosen": 1.9032138586044312, "logits/rejected": 4.713849067687988, "logps/chosen": -477.2460632324219, "logps/rejected": -636.798095703125, "loss": 1.2699, "rewards/accuracies": 0.75, "rewards/chosen": 5.134786605834961, "rewards/margins": 1.3110426664352417, "rewards/rejected": 3.8237438201904297, "step": 181 }, { "epoch": 0.11321928460342146, "grad_norm": 54.019630432128906, "learning_rate": 1.884057971014493e-06, "logits/chosen": -0.569671094417572, "logits/rejected": 3.2655553817749023, "logps/chosen": -367.45550537109375, "logps/rejected": -637.9801635742188, "loss": 1.8045, "rewards/accuracies": 0.5, "rewards/chosen": -0.29579782485961914, "rewards/margins": -0.03955581784248352, "rewards/rejected": -0.25624197721481323, "step": 182 }, { "epoch": 0.11384136858475895, "grad_norm": 50.58377456665039, "learning_rate": 1.894409937888199e-06, "logits/chosen": 0.5268037915229797, "logits/rejected": 3.4074790477752686, "logps/chosen": -357.352783203125, "logps/rejected": -576.5067749023438, "loss": 1.7159, "rewards/accuracies": 0.625, "rewards/chosen": 4.699305534362793, "rewards/margins": 0.25702714920043945, "rewards/rejected": 4.4422783851623535, "step": 183 }, { "epoch": 0.11446345256609643, "grad_norm": 43.4469108581543, "learning_rate": 1.904761904761905e-06, "logits/chosen": 2.0469088554382324, "logits/rejected": 4.830729007720947, "logps/chosen": -409.3948669433594, "logps/rejected": -563.2650146484375, "loss": 1.6257, "rewards/accuracies": 0.5, "rewards/chosen": 3.6548261642456055, "rewards/margins": 1.6007968187332153, "rewards/rejected": 2.0540289878845215, "step": 184 }, { "epoch": 0.1150855365474339, "grad_norm": 49.39980697631836, "learning_rate": 1.915113871635611e-06, "logits/chosen": 1.1279759407043457, "logits/rejected": 3.980091094970703, "logps/chosen": -389.1132507324219, "logps/rejected": -566.9024658203125, "loss": 2.6029, "rewards/accuracies": 0.25, "rewards/chosen": 4.677840232849121, "rewards/margins": -1.6894056797027588, "rewards/rejected": 6.367246150970459, "step": 185 }, { "epoch": 0.11570762052877138, "grad_norm": 30.314416885375977, "learning_rate": 1.925465838509317e-06, "logits/chosen": 1.9638819694519043, "logits/rejected": 4.244125843048096, "logps/chosen": -455.41754150390625, "logps/rejected": -591.694580078125, "loss": 1.458, "rewards/accuracies": 0.75, "rewards/chosen": 3.3089189529418945, "rewards/margins": 1.6026175022125244, "rewards/rejected": 1.7063013315200806, "step": 186 }, { "epoch": 0.11632970451010886, "grad_norm": 36.3062858581543, "learning_rate": 1.935817805383023e-06, "logits/chosen": -0.3030480146408081, "logits/rejected": 3.8223390579223633, "logps/chosen": -343.2451171875, "logps/rejected": -557.1092529296875, "loss": 1.0118, "rewards/accuracies": 0.75, "rewards/chosen": 2.1318135261535645, "rewards/margins": 2.583085536956787, "rewards/rejected": -0.45127207040786743, "step": 187 }, { "epoch": 0.11695178849144634, "grad_norm": 34.68586349487305, "learning_rate": 1.9461697722567286e-06, "logits/chosen": -0.018244266510009766, "logits/rejected": 3.894641876220703, "logps/chosen": -376.0664978027344, "logps/rejected": -571.3930053710938, "loss": 1.505, "rewards/accuracies": 0.5, "rewards/chosen": 5.088592052459717, "rewards/margins": 0.9989587664604187, "rewards/rejected": 4.089632987976074, "step": 188 }, { "epoch": 0.11757387247278382, "grad_norm": 41.42873764038086, "learning_rate": 1.956521739130435e-06, "logits/chosen": 3.0823469161987305, "logits/rejected": 4.817437171936035, "logps/chosen": -481.16162109375, "logps/rejected": -586.5604858398438, "loss": 0.8623, "rewards/accuracies": 0.625, "rewards/chosen": -1.9342479705810547, "rewards/margins": 0.728072464466095, "rewards/rejected": -2.662320375442505, "step": 189 }, { "epoch": 0.1181959564541213, "grad_norm": 70.27243041992188, "learning_rate": 1.966873706004141e-06, "logits/chosen": 2.949324369430542, "logits/rejected": 3.5138602256774902, "logps/chosen": -627.6104125976562, "logps/rejected": -615.1279907226562, "loss": 2.0358, "rewards/accuracies": 0.625, "rewards/chosen": 2.9750401973724365, "rewards/margins": 3.873018264770508, "rewards/rejected": -0.8979783654212952, "step": 190 }, { "epoch": 0.1188180404354588, "grad_norm": 49.11233901977539, "learning_rate": 1.977225672877847e-06, "logits/chosen": -0.8565497994422913, "logits/rejected": 4.336215972900391, "logps/chosen": -288.052978515625, "logps/rejected": -581.88330078125, "loss": 4.2047, "rewards/accuracies": 0.5, "rewards/chosen": 3.6924588680267334, "rewards/margins": -1.4131442308425903, "rewards/rejected": 5.105603218078613, "step": 191 }, { "epoch": 0.11944012441679627, "grad_norm": 54.367671966552734, "learning_rate": 1.987577639751553e-06, "logits/chosen": -1.6013593673706055, "logits/rejected": 3.676778554916382, "logps/chosen": -427.30792236328125, "logps/rejected": -734.0960083007812, "loss": 1.9657, "rewards/accuracies": 0.25, "rewards/chosen": 3.4305613040924072, "rewards/margins": -1.0722615718841553, "rewards/rejected": 4.5028228759765625, "step": 192 }, { "epoch": 0.12006220839813375, "grad_norm": 47.42571258544922, "learning_rate": 1.997929606625259e-06, "logits/chosen": 0.7500506639480591, "logits/rejected": 4.990383625030518, "logps/chosen": -360.7723388671875, "logps/rejected": -608.65625, "loss": 3.3685, "rewards/accuracies": 0.5, "rewards/chosen": 2.2373976707458496, "rewards/margins": -0.7982468605041504, "rewards/rejected": 3.035644769668579, "step": 193 }, { "epoch": 0.12068429237947123, "grad_norm": 33.1822395324707, "learning_rate": 2.008281573498965e-06, "logits/chosen": 0.6219146847724915, "logits/rejected": 2.929879665374756, "logps/chosen": -460.13238525390625, "logps/rejected": -706.8247680664062, "loss": 2.046, "rewards/accuracies": 0.75, "rewards/chosen": 3.59297513961792, "rewards/margins": 1.675492286682129, "rewards/rejected": 1.9174827337265015, "step": 194 }, { "epoch": 0.12130637636080871, "grad_norm": 51.21638488769531, "learning_rate": 2.018633540372671e-06, "logits/chosen": -1.0155105590820312, "logits/rejected": 1.4087202548980713, "logps/chosen": -385.1896667480469, "logps/rejected": -559.8251953125, "loss": 2.8099, "rewards/accuracies": 0.375, "rewards/chosen": 3.893887996673584, "rewards/margins": -0.482464075088501, "rewards/rejected": 4.376351833343506, "step": 195 }, { "epoch": 0.12192846034214619, "grad_norm": 46.816688537597656, "learning_rate": 2.028985507246377e-06, "logits/chosen": 1.821613073348999, "logits/rejected": 3.8777379989624023, "logps/chosen": -552.5363159179688, "logps/rejected": -630.4986572265625, "loss": 1.2842, "rewards/accuracies": 0.5, "rewards/chosen": 1.8579941987991333, "rewards/margins": 0.636164128780365, "rewards/rejected": 1.221830129623413, "step": 196 }, { "epoch": 0.12255054432348367, "grad_norm": 3.2378439903259277, "learning_rate": 2.039337474120083e-06, "logits/chosen": 1.5373815298080444, "logits/rejected": 5.254292964935303, "logps/chosen": -366.4618835449219, "logps/rejected": -528.86865234375, "loss": 0.1328, "rewards/accuracies": 0.875, "rewards/chosen": 3.522764205932617, "rewards/margins": 3.405061721801758, "rewards/rejected": 0.11770275235176086, "step": 197 }, { "epoch": 0.12317262830482115, "grad_norm": 34.82770538330078, "learning_rate": 2.049689440993789e-06, "logits/chosen": 3.3856725692749023, "logits/rejected": 1.8890368938446045, "logps/chosen": -570.4473876953125, "logps/rejected": -554.4201049804688, "loss": 0.7851, "rewards/accuracies": 0.625, "rewards/chosen": 3.8009650707244873, "rewards/margins": 3.108883857727051, "rewards/rejected": 0.692081093788147, "step": 198 }, { "epoch": 0.12379471228615863, "grad_norm": 50.189727783203125, "learning_rate": 2.060041407867495e-06, "logits/chosen": 1.0152267217636108, "logits/rejected": 3.6783008575439453, "logps/chosen": -494.78271484375, "logps/rejected": -646.5352783203125, "loss": 4.7003, "rewards/accuracies": 0.5, "rewards/chosen": 5.196945667266846, "rewards/margins": -2.769010543823242, "rewards/rejected": 7.965957164764404, "step": 199 }, { "epoch": 0.12441679626749612, "grad_norm": 30.09671974182129, "learning_rate": 2.0703933747412013e-06, "logits/chosen": 0.6837999820709229, "logits/rejected": 3.201636552810669, "logps/chosen": -420.2381591796875, "logps/rejected": -556.3150634765625, "loss": 0.4249, "rewards/accuracies": 0.75, "rewards/chosen": 2.953427791595459, "rewards/margins": 3.2480907440185547, "rewards/rejected": -0.2946627736091614, "step": 200 }, { "epoch": 0.12503888024883358, "grad_norm": 45.81303405761719, "learning_rate": 2.0807453416149073e-06, "logits/chosen": 1.7944955825805664, "logits/rejected": 5.481308460235596, "logps/chosen": -430.08514404296875, "logps/rejected": -581.4287109375, "loss": 2.692, "rewards/accuracies": 0.5, "rewards/chosen": 3.2093420028686523, "rewards/margins": 0.14378416538238525, "rewards/rejected": 3.0655577182769775, "step": 201 }, { "epoch": 0.12566096423017106, "grad_norm": 43.14478302001953, "learning_rate": 2.091097308488613e-06, "logits/chosen": -1.7328505516052246, "logits/rejected": 2.6943445205688477, "logps/chosen": -396.7474670410156, "logps/rejected": -666.2418212890625, "loss": 1.991, "rewards/accuracies": 0.625, "rewards/chosen": 0.7151708602905273, "rewards/margins": 0.4704517722129822, "rewards/rejected": 0.2447190284729004, "step": 202 }, { "epoch": 0.12628304821150854, "grad_norm": 31.574993133544922, "learning_rate": 2.101449275362319e-06, "logits/chosen": 1.9355977773666382, "logits/rejected": 3.36230206489563, "logps/chosen": -495.40789794921875, "logps/rejected": -570.6536865234375, "loss": 0.9764, "rewards/accuracies": 0.75, "rewards/chosen": 3.350783586502075, "rewards/margins": 3.221501111984253, "rewards/rejected": 0.12928247451782227, "step": 203 }, { "epoch": 0.12690513219284602, "grad_norm": 63.032718658447266, "learning_rate": 2.111801242236025e-06, "logits/chosen": -1.5227994918823242, "logits/rejected": 4.453179359436035, "logps/chosen": -373.2676086425781, "logps/rejected": -745.5072021484375, "loss": 4.4336, "rewards/accuracies": 0.375, "rewards/chosen": 4.488953590393066, "rewards/margins": -2.0308337211608887, "rewards/rejected": 6.519787788391113, "step": 204 }, { "epoch": 0.12752721617418353, "grad_norm": 22.824756622314453, "learning_rate": 2.122153209109731e-06, "logits/chosen": 0.824635922908783, "logits/rejected": 2.8972008228302, "logps/chosen": -390.86981201171875, "logps/rejected": -515.5609130859375, "loss": 0.2428, "rewards/accuracies": 0.875, "rewards/chosen": 0.06970125436782837, "rewards/margins": 2.518465518951416, "rewards/rejected": -2.4487640857696533, "step": 205 }, { "epoch": 0.128149300155521, "grad_norm": 24.64143180847168, "learning_rate": 2.132505175983437e-06, "logits/chosen": 0.700970470905304, "logits/rejected": 2.4702115058898926, "logps/chosen": -401.08013916015625, "logps/rejected": -505.01715087890625, "loss": 0.3995, "rewards/accuracies": 0.75, "rewards/chosen": 2.6803672313690186, "rewards/margins": 3.9768378734588623, "rewards/rejected": -1.2964705228805542, "step": 206 }, { "epoch": 0.12877138413685849, "grad_norm": 52.84967803955078, "learning_rate": 2.1428571428571427e-06, "logits/chosen": 2.6100564002990723, "logits/rejected": 5.6528754234313965, "logps/chosen": -531.5084228515625, "logps/rejected": -727.681640625, "loss": 1.454, "rewards/accuracies": 0.625, "rewards/chosen": -1.112928032875061, "rewards/margins": 1.359449863433838, "rewards/rejected": -2.4723777770996094, "step": 207 }, { "epoch": 0.12939346811819596, "grad_norm": 39.599334716796875, "learning_rate": 2.153209109730849e-06, "logits/chosen": 1.1986247301101685, "logits/rejected": 5.218608856201172, "logps/chosen": -325.86175537109375, "logps/rejected": -480.5209655761719, "loss": 2.2942, "rewards/accuracies": 0.5, "rewards/chosen": 0.030095696449279785, "rewards/margins": 0.45811617374420166, "rewards/rejected": -0.42802050709724426, "step": 208 }, { "epoch": 0.13001555209953344, "grad_norm": 37.675941467285156, "learning_rate": 2.163561076604555e-06, "logits/chosen": -0.7307783365249634, "logits/rejected": 2.09501576423645, "logps/chosen": -386.4396667480469, "logps/rejected": -580.3024291992188, "loss": 1.6707, "rewards/accuracies": 0.75, "rewards/chosen": 0.41714441776275635, "rewards/margins": 2.4524335861206055, "rewards/rejected": -2.0352892875671387, "step": 209 }, { "epoch": 0.13063763608087092, "grad_norm": 20.327880859375, "learning_rate": 2.173913043478261e-06, "logits/chosen": 0.5417582988739014, "logits/rejected": 2.157532215118408, "logps/chosen": -552.3806762695312, "logps/rejected": -637.8341064453125, "loss": 0.2936, "rewards/accuracies": 0.875, "rewards/chosen": 3.288459300994873, "rewards/margins": 6.135288715362549, "rewards/rejected": -2.846829652786255, "step": 210 }, { "epoch": 0.1312597200622084, "grad_norm": 32.202301025390625, "learning_rate": 2.184265010351967e-06, "logits/chosen": 1.5024635791778564, "logits/rejected": 4.253437519073486, "logps/chosen": -503.56805419921875, "logps/rejected": -695.8643798828125, "loss": 1.1783, "rewards/accuracies": 0.75, "rewards/chosen": 0.677678644657135, "rewards/margins": 3.9930052757263184, "rewards/rejected": -3.315326690673828, "step": 211 }, { "epoch": 0.13188180404354588, "grad_norm": 31.536731719970703, "learning_rate": 2.194616977225673e-06, "logits/chosen": 1.5302878618240356, "logits/rejected": 2.419198513031006, "logps/chosen": -523.5919799804688, "logps/rejected": -623.6431884765625, "loss": 0.4977, "rewards/accuracies": 0.75, "rewards/chosen": 5.382325649261475, "rewards/margins": 2.3979134559631348, "rewards/rejected": 2.984412670135498, "step": 212 }, { "epoch": 0.13250388802488336, "grad_norm": 47.228065490722656, "learning_rate": 2.204968944099379e-06, "logits/chosen": 1.0171109437942505, "logits/rejected": 1.333390712738037, "logps/chosen": -481.23822021484375, "logps/rejected": -489.55426025390625, "loss": 2.4562, "rewards/accuracies": 0.5, "rewards/chosen": 2.261629581451416, "rewards/margins": 2.093576669692993, "rewards/rejected": 0.16805295646190643, "step": 213 }, { "epoch": 0.13312597200622084, "grad_norm": 47.21223831176758, "learning_rate": 2.215320910973085e-06, "logits/chosen": 1.9000352621078491, "logits/rejected": 4.789727687835693, "logps/chosen": -478.86102294921875, "logps/rejected": -640.8873901367188, "loss": 1.6637, "rewards/accuracies": 0.375, "rewards/chosen": 2.010389804840088, "rewards/margins": 0.16780626773834229, "rewards/rejected": 1.842583417892456, "step": 214 }, { "epoch": 0.13374805598755832, "grad_norm": 41.62903594970703, "learning_rate": 2.225672877846791e-06, "logits/chosen": -0.524249792098999, "logits/rejected": 3.3108348846435547, "logps/chosen": -439.0601501464844, "logps/rejected": -683.864013671875, "loss": 2.1058, "rewards/accuracies": 0.625, "rewards/chosen": 2.8111445903778076, "rewards/margins": 2.1104533672332764, "rewards/rejected": 0.7006913423538208, "step": 215 }, { "epoch": 0.1343701399688958, "grad_norm": 37.27708435058594, "learning_rate": 2.236024844720497e-06, "logits/chosen": -1.3696495294570923, "logits/rejected": 0.9015498161315918, "logps/chosen": -392.2905578613281, "logps/rejected": -514.0640258789062, "loss": 1.8766, "rewards/accuracies": 0.5, "rewards/chosen": 1.9762678146362305, "rewards/margins": 3.288834810256958, "rewards/rejected": -1.3125672340393066, "step": 216 }, { "epoch": 0.13499222395023328, "grad_norm": 46.841278076171875, "learning_rate": 2.246376811594203e-06, "logits/chosen": -1.1116505861282349, "logits/rejected": 3.4078965187072754, "logps/chosen": -318.01605224609375, "logps/rejected": -635.120361328125, "loss": 2.2848, "rewards/accuracies": 0.5, "rewards/chosen": 2.1768715381622314, "rewards/margins": -0.572177529335022, "rewards/rejected": 2.749049186706543, "step": 217 }, { "epoch": 0.13561430793157075, "grad_norm": 39.24924087524414, "learning_rate": 2.256728778467909e-06, "logits/chosen": 1.5476105213165283, "logits/rejected": 3.4906458854675293, "logps/chosen": -322.1094055175781, "logps/rejected": -462.31683349609375, "loss": 0.973, "rewards/accuracies": 0.75, "rewards/chosen": -0.4564497470855713, "rewards/margins": 3.5915491580963135, "rewards/rejected": -4.047998905181885, "step": 218 }, { "epoch": 0.13623639191290823, "grad_norm": 60.18535614013672, "learning_rate": 2.2670807453416154e-06, "logits/chosen": 0.6108388900756836, "logits/rejected": 2.967494249343872, "logps/chosen": -531.0944213867188, "logps/rejected": -751.517822265625, "loss": 2.4948, "rewards/accuracies": 0.375, "rewards/chosen": -1.0347089767456055, "rewards/margins": 1.0422743558883667, "rewards/rejected": -2.0769832134246826, "step": 219 }, { "epoch": 0.1368584758942457, "grad_norm": 24.164648056030273, "learning_rate": 2.277432712215321e-06, "logits/chosen": -0.36401891708374023, "logits/rejected": 1.6827431917190552, "logps/chosen": -440.4677429199219, "logps/rejected": -560.8984985351562, "loss": 0.5672, "rewards/accuracies": 0.875, "rewards/chosen": 3.0978987216949463, "rewards/margins": 3.5617518424987793, "rewards/rejected": -0.463853120803833, "step": 220 }, { "epoch": 0.1374805598755832, "grad_norm": 49.610164642333984, "learning_rate": 2.287784679089027e-06, "logits/chosen": 0.010624885559082031, "logits/rejected": 3.5907387733459473, "logps/chosen": -405.98388671875, "logps/rejected": -642.1790771484375, "loss": 2.0901, "rewards/accuracies": 0.5, "rewards/chosen": 3.899686336517334, "rewards/margins": 0.45490002632141113, "rewards/rejected": 3.4447860717773438, "step": 221 }, { "epoch": 0.13810264385692067, "grad_norm": 17.924177169799805, "learning_rate": 2.298136645962733e-06, "logits/chosen": -0.9403358697891235, "logits/rejected": 3.576866388320923, "logps/chosen": -347.7789001464844, "logps/rejected": -721.7254028320312, "loss": 0.3152, "rewards/accuracies": 0.875, "rewards/chosen": 2.649275779724121, "rewards/margins": 7.130861282348633, "rewards/rejected": -4.4815850257873535, "step": 222 }, { "epoch": 0.13872472783825818, "grad_norm": 22.09856414794922, "learning_rate": 2.3084886128364393e-06, "logits/chosen": 0.05965060740709305, "logits/rejected": 3.7791969776153564, "logps/chosen": -300.701171875, "logps/rejected": -541.962890625, "loss": 0.4578, "rewards/accuracies": 0.75, "rewards/chosen": 2.1633074283599854, "rewards/margins": 2.8631277084350586, "rewards/rejected": -0.699820339679718, "step": 223 }, { "epoch": 0.13934681181959566, "grad_norm": 31.209260940551758, "learning_rate": 2.3188405797101453e-06, "logits/chosen": -5.668010711669922, "logits/rejected": 1.2728283405303955, "logps/chosen": -199.50497436523438, "logps/rejected": -599.9317016601562, "loss": 0.5324, "rewards/accuracies": 0.75, "rewards/chosen": 1.009107232093811, "rewards/margins": 5.485748291015625, "rewards/rejected": -4.476640701293945, "step": 224 }, { "epoch": 0.13996889580093314, "grad_norm": 40.985450744628906, "learning_rate": 2.3291925465838513e-06, "logits/chosen": -1.2980353832244873, "logits/rejected": 5.021801471710205, "logps/chosen": -314.845703125, "logps/rejected": -743.640869140625, "loss": 1.9684, "rewards/accuracies": 0.625, "rewards/chosen": 0.9610698223114014, "rewards/margins": 2.9930801391601562, "rewards/rejected": -2.032010316848755, "step": 225 }, { "epoch": 0.14059097978227061, "grad_norm": 40.860530853271484, "learning_rate": 2.339544513457557e-06, "logits/chosen": 1.379894495010376, "logits/rejected": 2.6520724296569824, "logps/chosen": -518.1561279296875, "logps/rejected": -594.7689208984375, "loss": 1.633, "rewards/accuracies": 0.5, "rewards/chosen": 1.6457796096801758, "rewards/margins": 1.1583093404769897, "rewards/rejected": 0.4874701499938965, "step": 226 }, { "epoch": 0.1412130637636081, "grad_norm": 30.166248321533203, "learning_rate": 2.3498964803312632e-06, "logits/chosen": -0.5353786945343018, "logits/rejected": 3.058781862258911, "logps/chosen": -381.04534912109375, "logps/rejected": -579.377685546875, "loss": 0.4811, "rewards/accuracies": 0.75, "rewards/chosen": -0.24283432960510254, "rewards/margins": 3.317432165145874, "rewards/rejected": -3.5602664947509766, "step": 227 }, { "epoch": 0.14183514774494557, "grad_norm": 14.29391098022461, "learning_rate": 2.3602484472049692e-06, "logits/chosen": -0.5758196115493774, "logits/rejected": 1.405975341796875, "logps/chosen": -263.87591552734375, "logps/rejected": -447.11566162109375, "loss": 0.1026, "rewards/accuracies": 1.0, "rewards/chosen": 0.6491126418113708, "rewards/margins": 6.495497226715088, "rewards/rejected": -5.8463850021362305, "step": 228 }, { "epoch": 0.14245723172628305, "grad_norm": 37.63568115234375, "learning_rate": 2.370600414078675e-06, "logits/chosen": -0.9808781147003174, "logits/rejected": 3.0308525562286377, "logps/chosen": -369.4408264160156, "logps/rejected": -660.7260131835938, "loss": 1.4419, "rewards/accuracies": 0.75, "rewards/chosen": 2.0158448219299316, "rewards/margins": 5.2338457107543945, "rewards/rejected": -3.218001365661621, "step": 229 }, { "epoch": 0.14307931570762053, "grad_norm": 10.795422554016113, "learning_rate": 2.380952380952381e-06, "logits/chosen": 1.0303987264633179, "logits/rejected": 3.6234517097473145, "logps/chosen": -420.68048095703125, "logps/rejected": -619.9874877929688, "loss": 0.16, "rewards/accuracies": 0.875, "rewards/chosen": 1.7086896896362305, "rewards/margins": 7.707023620605469, "rewards/rejected": -5.998333930969238, "step": 230 }, { "epoch": 0.143701399688958, "grad_norm": 37.661048889160156, "learning_rate": 2.391304347826087e-06, "logits/chosen": -1.1538281440734863, "logits/rejected": 2.5236012935638428, "logps/chosen": -396.6817932128906, "logps/rejected": -621.38330078125, "loss": 1.031, "rewards/accuracies": 0.75, "rewards/chosen": 1.994320034980774, "rewards/margins": 2.331658363342285, "rewards/rejected": -0.3373383581638336, "step": 231 }, { "epoch": 0.1443234836702955, "grad_norm": 35.17131423950195, "learning_rate": 2.401656314699793e-06, "logits/chosen": -1.1166588068008423, "logits/rejected": 2.102498769760132, "logps/chosen": -419.2410583496094, "logps/rejected": -635.1260986328125, "loss": 0.8231, "rewards/accuracies": 0.625, "rewards/chosen": 0.8397369384765625, "rewards/margins": 2.944187641143799, "rewards/rejected": -2.1044509410858154, "step": 232 }, { "epoch": 0.14494556765163297, "grad_norm": 25.272857666015625, "learning_rate": 2.412008281573499e-06, "logits/chosen": -0.6808191537857056, "logits/rejected": 3.588482141494751, "logps/chosen": -241.42205810546875, "logps/rejected": -557.1651611328125, "loss": 0.6014, "rewards/accuracies": 0.875, "rewards/chosen": -0.2021692991256714, "rewards/margins": 5.207537651062012, "rewards/rejected": -5.409707069396973, "step": 233 }, { "epoch": 0.14556765163297045, "grad_norm": 43.97793960571289, "learning_rate": 2.422360248447205e-06, "logits/chosen": 1.0645995140075684, "logits/rejected": 3.1753838062286377, "logps/chosen": -477.3868713378906, "logps/rejected": -616.861083984375, "loss": 0.8906, "rewards/accuracies": 0.625, "rewards/chosen": 1.234439730644226, "rewards/margins": 3.5022284984588623, "rewards/rejected": -2.267788887023926, "step": 234 }, { "epoch": 0.14618973561430793, "grad_norm": 14.174098014831543, "learning_rate": 2.432712215320911e-06, "logits/chosen": 2.2343997955322266, "logits/rejected": 2.843075752258301, "logps/chosen": -527.3372192382812, "logps/rejected": -604.364990234375, "loss": 0.1381, "rewards/accuracies": 1.0, "rewards/chosen": 2.552929639816284, "rewards/margins": 5.782426357269287, "rewards/rejected": -3.229496479034424, "step": 235 }, { "epoch": 0.1468118195956454, "grad_norm": 40.284629821777344, "learning_rate": 2.443064182194617e-06, "logits/chosen": -0.22108198702335358, "logits/rejected": 2.564833641052246, "logps/chosen": -368.523681640625, "logps/rejected": -532.35400390625, "loss": 0.6829, "rewards/accuracies": 0.875, "rewards/chosen": 2.9975924491882324, "rewards/margins": 1.6895599365234375, "rewards/rejected": 1.3080326318740845, "step": 236 }, { "epoch": 0.14743390357698288, "grad_norm": 38.307769775390625, "learning_rate": 2.453416149068323e-06, "logits/chosen": -1.2599825859069824, "logits/rejected": 2.4900829792022705, "logps/chosen": -441.6986083984375, "logps/rejected": -676.992431640625, "loss": 1.8282, "rewards/accuracies": 0.625, "rewards/chosen": -0.310348242521286, "rewards/margins": 0.09642618894577026, "rewards/rejected": -0.40677428245544434, "step": 237 }, { "epoch": 0.14805598755832036, "grad_norm": 41.170799255371094, "learning_rate": 2.4637681159420295e-06, "logits/chosen": -0.8981081247329712, "logits/rejected": 3.240385055541992, "logps/chosen": -429.32147216796875, "logps/rejected": -688.107666015625, "loss": 0.8547, "rewards/accuracies": 0.75, "rewards/chosen": 0.36102116107940674, "rewards/margins": 6.074836730957031, "rewards/rejected": -5.713814735412598, "step": 238 }, { "epoch": 0.14867807153965784, "grad_norm": 38.23157501220703, "learning_rate": 2.474120082815735e-06, "logits/chosen": 1.0275741815567017, "logits/rejected": 3.7468931674957275, "logps/chosen": -617.2286987304688, "logps/rejected": -790.6701049804688, "loss": 0.5688, "rewards/accuracies": 0.75, "rewards/chosen": 0.11678469181060791, "rewards/margins": 4.816810131072998, "rewards/rejected": -4.70002555847168, "step": 239 }, { "epoch": 0.14930015552099535, "grad_norm": 50.341251373291016, "learning_rate": 2.484472049689441e-06, "logits/chosen": -2.779496192932129, "logits/rejected": 0.41580402851104736, "logps/chosen": -301.6960144042969, "logps/rejected": -558.6378173828125, "loss": 1.0108, "rewards/accuracies": 0.5, "rewards/chosen": 1.6704089641571045, "rewards/margins": 4.456121921539307, "rewards/rejected": -2.7857131958007812, "step": 240 }, { "epoch": 0.14992223950233283, "grad_norm": 34.48246765136719, "learning_rate": 2.494824016563147e-06, "logits/chosen": -0.310516357421875, "logits/rejected": 1.8644804954528809, "logps/chosen": -419.97955322265625, "logps/rejected": -559.4046630859375, "loss": 0.9705, "rewards/accuracies": 0.625, "rewards/chosen": -1.8145637512207031, "rewards/margins": 1.7449901103973389, "rewards/rejected": -3.559554100036621, "step": 241 }, { "epoch": 0.1505443234836703, "grad_norm": 44.7969856262207, "learning_rate": 2.5051759834368534e-06, "logits/chosen": 2.063325881958008, "logits/rejected": 1.8975759744644165, "logps/chosen": -570.3998413085938, "logps/rejected": -557.6295166015625, "loss": 1.1162, "rewards/accuracies": 0.625, "rewards/chosen": -4.317770481109619, "rewards/margins": 1.6723326444625854, "rewards/rejected": -5.990103244781494, "step": 242 }, { "epoch": 0.15116640746500778, "grad_norm": 18.721473693847656, "learning_rate": 2.515527950310559e-06, "logits/chosen": -2.293026924133301, "logits/rejected": 2.767634868621826, "logps/chosen": -441.6069641113281, "logps/rejected": -787.40771484375, "loss": 0.174, "rewards/accuracies": 0.875, "rewards/chosen": 0.4177449345588684, "rewards/margins": 5.593556880950928, "rewards/rejected": -5.175811767578125, "step": 243 }, { "epoch": 0.15178849144634526, "grad_norm": 19.432035446166992, "learning_rate": 2.5258799171842654e-06, "logits/chosen": 2.422300338745117, "logits/rejected": 3.6136603355407715, "logps/chosen": -622.0272827148438, "logps/rejected": -721.8018798828125, "loss": 0.1876, "rewards/accuracies": 1.0, "rewards/chosen": -1.7000467777252197, "rewards/margins": 4.6907572746276855, "rewards/rejected": -6.390804290771484, "step": 244 }, { "epoch": 0.15241057542768274, "grad_norm": 38.153358459472656, "learning_rate": 2.5362318840579714e-06, "logits/chosen": 1.7388522624969482, "logits/rejected": 4.493720531463623, "logps/chosen": -551.026123046875, "logps/rejected": -751.4261474609375, "loss": 0.7534, "rewards/accuracies": 0.75, "rewards/chosen": 1.3347375392913818, "rewards/margins": 4.540463447570801, "rewards/rejected": -3.205725908279419, "step": 245 }, { "epoch": 0.15303265940902022, "grad_norm": 43.89891052246094, "learning_rate": 2.546583850931677e-06, "logits/chosen": -0.849528431892395, "logits/rejected": 1.535990834236145, "logps/chosen": -399.88470458984375, "logps/rejected": -579.0242919921875, "loss": 0.8152, "rewards/accuracies": 0.5, "rewards/chosen": -3.5337576866149902, "rewards/margins": 4.215351104736328, "rewards/rejected": -7.749109268188477, "step": 246 }, { "epoch": 0.1536547433903577, "grad_norm": 42.65324783325195, "learning_rate": 2.5569358178053833e-06, "logits/chosen": 1.0358763933181763, "logits/rejected": 3.2417514324188232, "logps/chosen": -489.2657470703125, "logps/rejected": -644.57080078125, "loss": 0.8755, "rewards/accuracies": 0.75, "rewards/chosen": -0.1160585880279541, "rewards/margins": 3.223320960998535, "rewards/rejected": -3.3393797874450684, "step": 247 }, { "epoch": 0.15427682737169518, "grad_norm": 51.01871871948242, "learning_rate": 2.5672877846790893e-06, "logits/chosen": 0.9451044797897339, "logits/rejected": 2.965320110321045, "logps/chosen": -430.13543701171875, "logps/rejected": -588.0543823242188, "loss": 0.9527, "rewards/accuracies": 0.625, "rewards/chosen": 0.4361051321029663, "rewards/margins": 5.314112663269043, "rewards/rejected": -4.878007888793945, "step": 248 }, { "epoch": 0.15489891135303266, "grad_norm": 46.12874221801758, "learning_rate": 2.5776397515527953e-06, "logits/chosen": -1.0910335779190063, "logits/rejected": 3.0898303985595703, "logps/chosen": -381.70159912109375, "logps/rejected": -687.109619140625, "loss": 1.0711, "rewards/accuracies": 0.75, "rewards/chosen": 2.315556764602661, "rewards/margins": 7.379352569580078, "rewards/rejected": -5.063795566558838, "step": 249 }, { "epoch": 0.15552099533437014, "grad_norm": 2.26055908203125, "learning_rate": 2.5879917184265013e-06, "logits/chosen": 0.7546824216842651, "logits/rejected": 3.479255199432373, "logps/chosen": -389.37835693359375, "logps/rejected": -597.3038330078125, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": 1.3359346389770508, "rewards/margins": 9.287576675415039, "rewards/rejected": -7.951642036437988, "step": 250 }, { "epoch": 0.15614307931570762, "grad_norm": 5.929549694061279, "learning_rate": 2.598343685300207e-06, "logits/chosen": 1.4317560195922852, "logits/rejected": 3.4852263927459717, "logps/chosen": -604.067138671875, "logps/rejected": -810.948974609375, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": -4.16350793838501, "rewards/margins": 11.667287826538086, "rewards/rejected": -15.830794334411621, "step": 251 }, { "epoch": 0.1567651632970451, "grad_norm": 5.079573631286621, "learning_rate": 2.6086956521739132e-06, "logits/chosen": -0.28528928756713867, "logits/rejected": 2.207214593887329, "logps/chosen": -442.1741027832031, "logps/rejected": -624.3016357421875, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": -0.1780473291873932, "rewards/margins": 9.567262649536133, "rewards/rejected": -9.745308876037598, "step": 252 }, { "epoch": 0.15738724727838257, "grad_norm": 1.077399730682373, "learning_rate": 2.6190476190476192e-06, "logits/chosen": 1.8529900312423706, "logits/rejected": 3.7783148288726807, "logps/chosen": -544.4069213867188, "logps/rejected": -773.4401245117188, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 0.3226267099380493, "rewards/margins": 9.717878341674805, "rewards/rejected": -9.395252227783203, "step": 253 }, { "epoch": 0.15800933125972005, "grad_norm": 26.679351806640625, "learning_rate": 2.629399585921325e-06, "logits/chosen": 0.5402032136917114, "logits/rejected": 3.1739237308502197, "logps/chosen": -467.8306579589844, "logps/rejected": -729.2216796875, "loss": 0.2176, "rewards/accuracies": 0.75, "rewards/chosen": -3.075988531112671, "rewards/margins": 7.494318962097168, "rewards/rejected": -10.570308685302734, "step": 254 }, { "epoch": 0.15863141524105753, "grad_norm": 6.276568412780762, "learning_rate": 2.639751552795031e-06, "logits/chosen": -3.0313994884490967, "logits/rejected": 1.6596317291259766, "logps/chosen": -239.6669464111328, "logps/rejected": -597.2471313476562, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": 0.20734518766403198, "rewards/margins": 10.459396362304688, "rewards/rejected": -10.252050399780273, "step": 255 }, { "epoch": 0.159253499222395, "grad_norm": 36.85567855834961, "learning_rate": 2.6501035196687376e-06, "logits/chosen": 0.3948754072189331, "logits/rejected": 2.9974865913391113, "logps/chosen": -479.9315490722656, "logps/rejected": -652.0233154296875, "loss": 0.5583, "rewards/accuracies": 0.875, "rewards/chosen": 1.309106469154358, "rewards/margins": 8.474274635314941, "rewards/rejected": -7.165168285369873, "step": 256 }, { "epoch": 0.1598755832037325, "grad_norm": 59.51966094970703, "learning_rate": 2.660455486542443e-06, "logits/chosen": 2.1631646156311035, "logits/rejected": 3.3487162590026855, "logps/chosen": -596.6227416992188, "logps/rejected": -722.599365234375, "loss": 2.4024, "rewards/accuracies": 0.5, "rewards/chosen": -5.933492660522461, "rewards/margins": 0.4551329016685486, "rewards/rejected": -6.388625144958496, "step": 257 }, { "epoch": 0.16049766718507, "grad_norm": 39.466552734375, "learning_rate": 2.670807453416149e-06, "logits/chosen": -1.7184714078903198, "logits/rejected": 2.2928924560546875, "logps/chosen": -400.03253173828125, "logps/rejected": -628.02587890625, "loss": 0.9934, "rewards/accuracies": 0.625, "rewards/chosen": -0.7665202617645264, "rewards/margins": 2.3955516815185547, "rewards/rejected": -3.162071704864502, "step": 258 }, { "epoch": 0.16111975116640748, "grad_norm": 5.647301197052002, "learning_rate": 2.6811594202898555e-06, "logits/chosen": -1.1122512817382812, "logits/rejected": 2.666635274887085, "logps/chosen": -352.390380859375, "logps/rejected": -645.776611328125, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": -1.1382899284362793, "rewards/margins": 8.056615829467773, "rewards/rejected": -9.194905281066895, "step": 259 }, { "epoch": 0.16174183514774496, "grad_norm": 22.67473602294922, "learning_rate": 2.691511387163561e-06, "logits/chosen": -0.1789799928665161, "logits/rejected": 2.045513153076172, "logps/chosen": -465.56683349609375, "logps/rejected": -668.13330078125, "loss": 0.3468, "rewards/accuracies": 0.875, "rewards/chosen": -3.058413505554199, "rewards/margins": 9.895721435546875, "rewards/rejected": -12.95413589477539, "step": 260 }, { "epoch": 0.16236391912908243, "grad_norm": 0.010524489916861057, "learning_rate": 2.7018633540372675e-06, "logits/chosen": 0.737421452999115, "logits/rejected": 3.0371365547180176, "logps/chosen": -520.680908203125, "logps/rejected": -735.24560546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.17118918895721436, "rewards/margins": 14.252211570739746, "rewards/rejected": -14.081022262573242, "step": 261 }, { "epoch": 0.1629860031104199, "grad_norm": 51.21318054199219, "learning_rate": 2.712215320910973e-06, "logits/chosen": -0.729854166507721, "logits/rejected": 3.941812038421631, "logps/chosen": -443.6608581542969, "logps/rejected": -738.9736938476562, "loss": 1.436, "rewards/accuracies": 0.75, "rewards/chosen": -3.1441855430603027, "rewards/margins": 5.385406494140625, "rewards/rejected": -8.52959156036377, "step": 262 }, { "epoch": 0.1636080870917574, "grad_norm": 41.15818786621094, "learning_rate": 2.7225672877846795e-06, "logits/chosen": 1.1326603889465332, "logits/rejected": 3.168422222137451, "logps/chosen": -544.2522583007812, "logps/rejected": -729.4052734375, "loss": 0.7476, "rewards/accuracies": 0.875, "rewards/chosen": -5.015408515930176, "rewards/margins": 8.209056854248047, "rewards/rejected": -13.224465370178223, "step": 263 }, { "epoch": 0.16423017107309487, "grad_norm": 40.650020599365234, "learning_rate": 2.7329192546583855e-06, "logits/chosen": -0.14138327538967133, "logits/rejected": -1.1530615091323853, "logps/chosen": -486.3907775878906, "logps/rejected": -504.61163330078125, "loss": 1.7413, "rewards/accuracies": 0.625, "rewards/chosen": -3.723165512084961, "rewards/margins": 3.4709882736206055, "rewards/rejected": -7.194153785705566, "step": 264 }, { "epoch": 0.16485225505443235, "grad_norm": 8.631354331970215, "learning_rate": 2.743271221532091e-06, "logits/chosen": -1.6873260736465454, "logits/rejected": 3.191648006439209, "logps/chosen": -309.7070617675781, "logps/rejected": -606.5875244140625, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": -1.8784593343734741, "rewards/margins": 10.098764419555664, "rewards/rejected": -11.977224349975586, "step": 265 }, { "epoch": 0.16547433903576983, "grad_norm": 35.53109359741211, "learning_rate": 2.7536231884057974e-06, "logits/chosen": -2.07094669342041, "logits/rejected": 2.508565902709961, "logps/chosen": -418.36077880859375, "logps/rejected": -844.75146484375, "loss": 0.5531, "rewards/accuracies": 0.875, "rewards/chosen": -4.0940752029418945, "rewards/margins": 11.74495792388916, "rewards/rejected": -15.839033126831055, "step": 266 }, { "epoch": 0.1660964230171073, "grad_norm": 17.228307723999023, "learning_rate": 2.7639751552795034e-06, "logits/chosen": 0.33517616987228394, "logits/rejected": 3.5891106128692627, "logps/chosen": -434.30438232421875, "logps/rejected": -723.2798461914062, "loss": 0.163, "rewards/accuracies": 0.875, "rewards/chosen": 2.3649284839630127, "rewards/margins": 8.922518730163574, "rewards/rejected": -6.557590484619141, "step": 267 }, { "epoch": 0.1667185069984448, "grad_norm": 43.660179138183594, "learning_rate": 2.7743271221532094e-06, "logits/chosen": 0.725469708442688, "logits/rejected": 3.846705198287964, "logps/chosen": -609.2630615234375, "logps/rejected": -882.2078857421875, "loss": 0.8438, "rewards/accuracies": 0.625, "rewards/chosen": -3.523108959197998, "rewards/margins": 6.391105651855469, "rewards/rejected": -9.914214134216309, "step": 268 }, { "epoch": 0.16734059097978227, "grad_norm": 11.92179012298584, "learning_rate": 2.7846790890269154e-06, "logits/chosen": 0.1969280242919922, "logits/rejected": 4.658049583435059, "logps/chosen": -520.454345703125, "logps/rejected": -818.27294921875, "loss": 0.1198, "rewards/accuracies": 1.0, "rewards/chosen": -0.863990306854248, "rewards/margins": 4.931917667388916, "rewards/rejected": -5.795907974243164, "step": 269 }, { "epoch": 0.16796267496111975, "grad_norm": 44.41823196411133, "learning_rate": 2.795031055900621e-06, "logits/chosen": 0.10651260614395142, "logits/rejected": 3.5432283878326416, "logps/chosen": -487.7990417480469, "logps/rejected": -717.2167358398438, "loss": 1.3901, "rewards/accuracies": 0.875, "rewards/chosen": -2.1005728244781494, "rewards/margins": 10.617042541503906, "rewards/rejected": -12.717616081237793, "step": 270 }, { "epoch": 0.16858475894245722, "grad_norm": 33.92555618286133, "learning_rate": 2.8053830227743273e-06, "logits/chosen": 0.07751777768135071, "logits/rejected": 4.2295098304748535, "logps/chosen": -369.48114013671875, "logps/rejected": -573.50732421875, "loss": 0.823, "rewards/accuracies": 0.75, "rewards/chosen": -4.756532192230225, "rewards/margins": 1.9096379280090332, "rewards/rejected": -6.666170120239258, "step": 271 }, { "epoch": 0.1692068429237947, "grad_norm": 27.594270706176758, "learning_rate": 2.8157349896480333e-06, "logits/chosen": 0.10852780938148499, "logits/rejected": 4.689783096313477, "logps/chosen": -429.4493103027344, "logps/rejected": -787.86865234375, "loss": 0.2497, "rewards/accuracies": 0.875, "rewards/chosen": -1.5277105569839478, "rewards/margins": 7.960251808166504, "rewards/rejected": -9.48796272277832, "step": 272 }, { "epoch": 0.16982892690513218, "grad_norm": 1.3322811126708984, "learning_rate": 2.8260869565217393e-06, "logits/chosen": -1.8472188711166382, "logits/rejected": 2.6371984481811523, "logps/chosen": -345.05682373046875, "logps/rejected": -702.9637451171875, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": 0.7659844160079956, "rewards/margins": 12.975811004638672, "rewards/rejected": -12.209827423095703, "step": 273 }, { "epoch": 0.17045101088646966, "grad_norm": 34.784027099609375, "learning_rate": 2.8364389233954453e-06, "logits/chosen": 0.351703405380249, "logits/rejected": 2.1608669757843018, "logps/chosen": -389.4510803222656, "logps/rejected": -599.7803344726562, "loss": 1.4058, "rewards/accuracies": 0.875, "rewards/chosen": -4.442836284637451, "rewards/margins": 4.768312454223633, "rewards/rejected": -9.21114730834961, "step": 274 }, { "epoch": 0.17107309486780714, "grad_norm": 44.48931884765625, "learning_rate": 2.8467908902691517e-06, "logits/chosen": 1.1296827793121338, "logits/rejected": 4.558327674865723, "logps/chosen": -466.94305419921875, "logps/rejected": -747.0626220703125, "loss": 0.4661, "rewards/accuracies": 0.75, "rewards/chosen": -4.572750091552734, "rewards/margins": 4.540435314178467, "rewards/rejected": -9.113184928894043, "step": 275 }, { "epoch": 0.17169517884914465, "grad_norm": 43.360530853271484, "learning_rate": 2.8571428571428573e-06, "logits/chosen": -1.1608879566192627, "logits/rejected": 1.6421074867248535, "logps/chosen": -595.3029174804688, "logps/rejected": -797.7877807617188, "loss": 0.9953, "rewards/accuracies": 0.625, "rewards/chosen": -7.334722518920898, "rewards/margins": 6.524963855743408, "rewards/rejected": -13.859687805175781, "step": 276 }, { "epoch": 0.17231726283048213, "grad_norm": 34.76185989379883, "learning_rate": 2.8674948240165632e-06, "logits/chosen": 3.2887511253356934, "logits/rejected": 4.776767730712891, "logps/chosen": -702.6865844726562, "logps/rejected": -816.4313354492188, "loss": 0.4011, "rewards/accuracies": 0.75, "rewards/chosen": 0.9340373277664185, "rewards/margins": 9.299430847167969, "rewards/rejected": -8.36539363861084, "step": 277 }, { "epoch": 0.1729393468118196, "grad_norm": 0.728277862071991, "learning_rate": 2.8778467908902696e-06, "logits/chosen": 0.3774760961532593, "logits/rejected": 3.0168018341064453, "logps/chosen": -395.4736633300781, "logps/rejected": -690.8614501953125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.5477930903434753, "rewards/margins": 12.72874641418457, "rewards/rejected": -13.276540756225586, "step": 278 }, { "epoch": 0.17356143079315708, "grad_norm": 30.32001495361328, "learning_rate": 2.888198757763975e-06, "logits/chosen": -1.6389963626861572, "logits/rejected": 1.60201096534729, "logps/chosen": -320.2080078125, "logps/rejected": -618.7227783203125, "loss": 0.1274, "rewards/accuracies": 0.875, "rewards/chosen": -3.867861032485962, "rewards/margins": 10.369712829589844, "rewards/rejected": -14.23757553100586, "step": 279 }, { "epoch": 0.17418351477449456, "grad_norm": 50.79122543334961, "learning_rate": 2.8985507246376816e-06, "logits/chosen": 1.6694326400756836, "logits/rejected": 2.5693209171295166, "logps/chosen": -643.999755859375, "logps/rejected": -713.3614501953125, "loss": 3.1784, "rewards/accuracies": 0.75, "rewards/chosen": -8.281852722167969, "rewards/margins": 4.56790828704834, "rewards/rejected": -12.849761962890625, "step": 280 }, { "epoch": 0.17480559875583204, "grad_norm": 36.4666862487793, "learning_rate": 2.908902691511387e-06, "logits/chosen": -0.6406252384185791, "logits/rejected": 1.632117748260498, "logps/chosen": -515.8046264648438, "logps/rejected": -732.3682861328125, "loss": 1.2141, "rewards/accuracies": 0.875, "rewards/chosen": -3.834707021713257, "rewards/margins": 10.90757942199707, "rewards/rejected": -14.742287635803223, "step": 281 }, { "epoch": 0.17542768273716952, "grad_norm": 44.52206802368164, "learning_rate": 2.919254658385093e-06, "logits/chosen": 1.7754943370819092, "logits/rejected": 2.074369192123413, "logps/chosen": -688.3287353515625, "logps/rejected": -695.4146728515625, "loss": 1.7472, "rewards/accuracies": 0.875, "rewards/chosen": -6.223416328430176, "rewards/margins": 4.887999057769775, "rewards/rejected": -11.111414909362793, "step": 282 }, { "epoch": 0.176049766718507, "grad_norm": 46.707618713378906, "learning_rate": 2.9296066252587996e-06, "logits/chosen": -2.913400888442993, "logits/rejected": 3.6963987350463867, "logps/chosen": -259.6846923828125, "logps/rejected": -758.641845703125, "loss": 0.7202, "rewards/accuracies": 0.625, "rewards/chosen": -3.6863110065460205, "rewards/margins": 7.087304592132568, "rewards/rejected": -10.773615837097168, "step": 283 }, { "epoch": 0.17667185069984448, "grad_norm": 59.8579216003418, "learning_rate": 2.939958592132505e-06, "logits/chosen": 0.8096885085105896, "logits/rejected": 1.4599106311798096, "logps/chosen": -640.6162719726562, "logps/rejected": -699.5067138671875, "loss": 1.0139, "rewards/accuracies": 0.75, "rewards/chosen": -5.759545803070068, "rewards/margins": 6.045644760131836, "rewards/rejected": -11.80518913269043, "step": 284 }, { "epoch": 0.17729393468118196, "grad_norm": 2.9321112632751465, "learning_rate": 2.9503105590062115e-06, "logits/chosen": -2.3982667922973633, "logits/rejected": 3.0039591789245605, "logps/chosen": -393.3422546386719, "logps/rejected": -827.1685791015625, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 0.9348781108856201, "rewards/margins": 11.114221572875977, "rewards/rejected": -10.179344177246094, "step": 285 }, { "epoch": 0.17791601866251944, "grad_norm": 38.78324890136719, "learning_rate": 2.9606625258799175e-06, "logits/chosen": 0.25066280364990234, "logits/rejected": 1.5802853107452393, "logps/chosen": -459.66357421875, "logps/rejected": -581.1425170898438, "loss": 0.3467, "rewards/accuracies": 0.875, "rewards/chosen": -7.499963760375977, "rewards/margins": 8.779077529907227, "rewards/rejected": -16.279041290283203, "step": 286 }, { "epoch": 0.17853810264385692, "grad_norm": 3.962609052658081, "learning_rate": 2.9710144927536235e-06, "logits/chosen": -2.1418416500091553, "logits/rejected": 1.1488546133041382, "logps/chosen": -345.88714599609375, "logps/rejected": -620.9361572265625, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -1.7706562280654907, "rewards/margins": 8.32846450805664, "rewards/rejected": -10.099120140075684, "step": 287 }, { "epoch": 0.1791601866251944, "grad_norm": 26.762493133544922, "learning_rate": 2.9813664596273295e-06, "logits/chosen": -1.101001501083374, "logits/rejected": 2.8201372623443604, "logps/chosen": -486.9984130859375, "logps/rejected": -790.114013671875, "loss": 0.6456, "rewards/accuracies": 0.75, "rewards/chosen": -4.161718368530273, "rewards/margins": 6.815770626068115, "rewards/rejected": -10.977489471435547, "step": 288 }, { "epoch": 0.17978227060653187, "grad_norm": 24.064451217651367, "learning_rate": 2.991718426501035e-06, "logits/chosen": 1.4957832098007202, "logits/rejected": 3.3200576305389404, "logps/chosen": -516.3970947265625, "logps/rejected": -710.7282104492188, "loss": 0.8452, "rewards/accuracies": 0.75, "rewards/chosen": -9.919002532958984, "rewards/margins": 6.532848358154297, "rewards/rejected": -16.45184898376465, "step": 289 }, { "epoch": 0.18040435458786935, "grad_norm": 19.973859786987305, "learning_rate": 3.0020703933747414e-06, "logits/chosen": -4.241866588592529, "logits/rejected": 2.728012800216675, "logps/chosen": -220.65185546875, "logps/rejected": -713.72119140625, "loss": 0.4069, "rewards/accuracies": 0.875, "rewards/chosen": -3.7319488525390625, "rewards/margins": 12.879581451416016, "rewards/rejected": -16.611530303955078, "step": 290 }, { "epoch": 0.18102643856920683, "grad_norm": 9.816418647766113, "learning_rate": 3.0124223602484474e-06, "logits/chosen": -2.1271183490753174, "logits/rejected": 1.7093493938446045, "logps/chosen": -368.6051025390625, "logps/rejected": -721.2058715820312, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": -5.852741241455078, "rewards/margins": 10.840240478515625, "rewards/rejected": -16.692981719970703, "step": 291 }, { "epoch": 0.1816485225505443, "grad_norm": 58.5022087097168, "learning_rate": 3.0227743271221534e-06, "logits/chosen": -1.7429341077804565, "logits/rejected": 3.815767288208008, "logps/chosen": -329.79193115234375, "logps/rejected": -742.4627685546875, "loss": 0.8852, "rewards/accuracies": 0.625, "rewards/chosen": -3.896777629852295, "rewards/margins": 8.65748119354248, "rewards/rejected": -12.554258346557617, "step": 292 }, { "epoch": 0.1822706065318818, "grad_norm": 35.150367736816406, "learning_rate": 3.0331262939958594e-06, "logits/chosen": -0.9945878386497498, "logits/rejected": 3.7409143447875977, "logps/chosen": -498.15850830078125, "logps/rejected": -846.7988891601562, "loss": 0.4365, "rewards/accuracies": 0.75, "rewards/chosen": -7.441921710968018, "rewards/margins": 8.19404411315918, "rewards/rejected": -15.635965347290039, "step": 293 }, { "epoch": 0.1828926905132193, "grad_norm": 53.36930465698242, "learning_rate": 3.043478260869566e-06, "logits/chosen": -0.6886473894119263, "logits/rejected": 4.724715232849121, "logps/chosen": -432.71478271484375, "logps/rejected": -881.2340698242188, "loss": 2.3085, "rewards/accuracies": 0.875, "rewards/chosen": -5.074771881103516, "rewards/margins": 13.83968734741211, "rewards/rejected": -18.914457321166992, "step": 294 }, { "epoch": 0.18351477449455678, "grad_norm": 22.593576431274414, "learning_rate": 3.0538302277432714e-06, "logits/chosen": 0.6494942903518677, "logits/rejected": 2.726839542388916, "logps/chosen": -589.6900634765625, "logps/rejected": -807.8902587890625, "loss": 0.1957, "rewards/accuracies": 0.875, "rewards/chosen": -5.541738033294678, "rewards/margins": 10.383159637451172, "rewards/rejected": -15.924898147583008, "step": 295 }, { "epoch": 0.18413685847589426, "grad_norm": 0.09417515993118286, "learning_rate": 3.0641821946169773e-06, "logits/chosen": -1.711951494216919, "logits/rejected": 2.7394614219665527, "logps/chosen": -368.3467102050781, "logps/rejected": -731.626708984375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.5702080726623535, "rewards/margins": 13.196901321411133, "rewards/rejected": -15.767109870910645, "step": 296 }, { "epoch": 0.18475894245723173, "grad_norm": 53.6297607421875, "learning_rate": 3.0745341614906837e-06, "logits/chosen": 2.0798213481903076, "logits/rejected": 1.6787725687026978, "logps/chosen": -639.6285400390625, "logps/rejected": -665.6937255859375, "loss": 1.9573, "rewards/accuracies": 0.625, "rewards/chosen": -8.730389595031738, "rewards/margins": 5.162319660186768, "rewards/rejected": -13.892708778381348, "step": 297 }, { "epoch": 0.1853810264385692, "grad_norm": 48.8674430847168, "learning_rate": 3.0848861283643893e-06, "logits/chosen": 2.25374436378479, "logits/rejected": 3.050382137298584, "logps/chosen": -669.9483642578125, "logps/rejected": -737.494873046875, "loss": 1.4464, "rewards/accuracies": 0.625, "rewards/chosen": -5.033759117126465, "rewards/margins": 8.142561912536621, "rewards/rejected": -13.176321029663086, "step": 298 }, { "epoch": 0.1860031104199067, "grad_norm": 13.007925033569336, "learning_rate": 3.0952380952380957e-06, "logits/chosen": -0.3085886836051941, "logits/rejected": 1.4879200458526611, "logps/chosen": -568.7110595703125, "logps/rejected": -805.2724609375, "loss": 0.0654, "rewards/accuracies": 1.0, "rewards/chosen": -7.329258918762207, "rewards/margins": 10.055083274841309, "rewards/rejected": -17.384342193603516, "step": 299 }, { "epoch": 0.18662519440124417, "grad_norm": 1.1980512142181396, "learning_rate": 3.1055900621118013e-06, "logits/chosen": -1.660165548324585, "logits/rejected": 2.2211251258850098, "logps/chosen": -307.91375732421875, "logps/rejected": -707.158447265625, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -2.748342275619507, "rewards/margins": 14.093570709228516, "rewards/rejected": -16.8419132232666, "step": 300 }, { "epoch": 0.18724727838258165, "grad_norm": 2.2520453929901123, "learning_rate": 3.1159420289855073e-06, "logits/chosen": -1.8573826551437378, "logits/rejected": 2.397719621658325, "logps/chosen": -297.97613525390625, "logps/rejected": -675.8267211914062, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": -1.4887399673461914, "rewards/margins": 15.379812240600586, "rewards/rejected": -16.868553161621094, "step": 301 }, { "epoch": 0.18786936236391913, "grad_norm": 1.385947346687317, "learning_rate": 3.1262939958592137e-06, "logits/chosen": 2.1132750511169434, "logits/rejected": 2.269409656524658, "logps/chosen": -581.947509765625, "logps/rejected": -664.1751708984375, "loss": 0.1036, "rewards/accuracies": 0.875, "rewards/chosen": -0.37770363688468933, "rewards/margins": 7.922628402709961, "rewards/rejected": -8.300332069396973, "step": 302 }, { "epoch": 0.1884914463452566, "grad_norm": 3.8977954387664795, "learning_rate": 3.1366459627329192e-06, "logits/chosen": 0.9858641624450684, "logits/rejected": 1.9707233905792236, "logps/chosen": -619.97998046875, "logps/rejected": -831.2584228515625, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": -5.3522443771362305, "rewards/margins": 12.102344512939453, "rewards/rejected": -17.454587936401367, "step": 303 }, { "epoch": 0.1891135303265941, "grad_norm": 33.76449203491211, "learning_rate": 3.1469979296066256e-06, "logits/chosen": -1.6749011278152466, "logits/rejected": 2.192087173461914, "logps/chosen": -387.16436767578125, "logps/rejected": -690.3063354492188, "loss": 0.1106, "rewards/accuracies": 0.875, "rewards/chosen": -7.873738765716553, "rewards/margins": 8.083623886108398, "rewards/rejected": -15.957362174987793, "step": 304 }, { "epoch": 0.18973561430793157, "grad_norm": 0.1594896912574768, "learning_rate": 3.1573498964803316e-06, "logits/chosen": -3.2994441986083984, "logits/rejected": 1.944976806640625, "logps/chosen": -242.07859802246094, "logps/rejected": -739.9111938476562, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.6777923107147217, "rewards/margins": 15.069310188293457, "rewards/rejected": -16.747102737426758, "step": 305 }, { "epoch": 0.19035769828926905, "grad_norm": 63.30927658081055, "learning_rate": 3.1677018633540376e-06, "logits/chosen": 1.2816505432128906, "logits/rejected": 2.624063491821289, "logps/chosen": -605.6007080078125, "logps/rejected": -725.8055419921875, "loss": 2.7351, "rewards/accuracies": 0.625, "rewards/chosen": -8.404818534851074, "rewards/margins": 3.914830207824707, "rewards/rejected": -12.319649696350098, "step": 306 }, { "epoch": 0.19097978227060652, "grad_norm": 26.36448097229004, "learning_rate": 3.1780538302277436e-06, "logits/chosen": 0.1868879795074463, "logits/rejected": 2.8717458248138428, "logps/chosen": -550.631591796875, "logps/rejected": -762.9170532226562, "loss": 0.5359, "rewards/accuracies": 0.875, "rewards/chosen": -4.5749921798706055, "rewards/margins": 10.07697582244873, "rewards/rejected": -14.651968955993652, "step": 307 }, { "epoch": 0.191601866251944, "grad_norm": 2.184065580368042, "learning_rate": 3.188405797101449e-06, "logits/chosen": -0.3023766279220581, "logits/rejected": 2.1106514930725098, "logps/chosen": -538.0408935546875, "logps/rejected": -788.0562744140625, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -8.051552772521973, "rewards/margins": 13.31997299194336, "rewards/rejected": -21.37152671813965, "step": 308 }, { "epoch": 0.19222395023328148, "grad_norm": 31.663427352905273, "learning_rate": 3.1987577639751555e-06, "logits/chosen": 0.46885818243026733, "logits/rejected": 3.4298930168151855, "logps/chosen": -457.65106201171875, "logps/rejected": -654.154052734375, "loss": 0.254, "rewards/accuracies": 0.875, "rewards/chosen": -9.863368034362793, "rewards/margins": 4.747684955596924, "rewards/rejected": -14.611052513122559, "step": 309 }, { "epoch": 0.19284603421461896, "grad_norm": 9.104625701904297, "learning_rate": 3.2091097308488615e-06, "logits/chosen": 1.0741961002349854, "logits/rejected": 4.69411563873291, "logps/chosen": -489.1529541015625, "logps/rejected": -798.117919921875, "loss": 0.075, "rewards/accuracies": 1.0, "rewards/chosen": -4.415085315704346, "rewards/margins": 13.385915756225586, "rewards/rejected": -17.801002502441406, "step": 310 }, { "epoch": 0.19346811819595647, "grad_norm": 2.730404853820801, "learning_rate": 3.2194616977225675e-06, "logits/chosen": -3.9466922283172607, "logits/rejected": 1.7940433025360107, "logps/chosen": -369.9723205566406, "logps/rejected": -805.43701171875, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -1.8560994863510132, "rewards/margins": 16.080703735351562, "rewards/rejected": -17.936803817749023, "step": 311 }, { "epoch": 0.19409020217729395, "grad_norm": 31.281147003173828, "learning_rate": 3.2298136645962735e-06, "logits/chosen": 0.35380566120147705, "logits/rejected": 1.810831904411316, "logps/chosen": -577.0803833007812, "logps/rejected": -711.97119140625, "loss": 0.3663, "rewards/accuracies": 0.875, "rewards/chosen": -8.113171577453613, "rewards/margins": 9.861759185791016, "rewards/rejected": -17.974929809570312, "step": 312 }, { "epoch": 0.19471228615863143, "grad_norm": 40.597530364990234, "learning_rate": 3.24016563146998e-06, "logits/chosen": 1.288434386253357, "logits/rejected": 3.427957057952881, "logps/chosen": -649.0224609375, "logps/rejected": -792.2032470703125, "loss": 0.3979, "rewards/accuracies": 0.75, "rewards/chosen": -8.13973617553711, "rewards/margins": 9.166152954101562, "rewards/rejected": -17.305889129638672, "step": 313 }, { "epoch": 0.1953343701399689, "grad_norm": 38.50737380981445, "learning_rate": 3.2505175983436855e-06, "logits/chosen": 0.9875385761260986, "logits/rejected": 1.3537368774414062, "logps/chosen": -590.6889038085938, "logps/rejected": -713.373046875, "loss": 1.1722, "rewards/accuracies": 0.875, "rewards/chosen": -7.0353498458862305, "rewards/margins": 7.720550537109375, "rewards/rejected": -14.755900382995605, "step": 314 }, { "epoch": 0.19595645412130638, "grad_norm": 58.41802215576172, "learning_rate": 3.2608695652173914e-06, "logits/chosen": 3.5066542625427246, "logits/rejected": 3.6684353351593018, "logps/chosen": -731.1271362304688, "logps/rejected": -816.0885009765625, "loss": 0.6979, "rewards/accuracies": 0.75, "rewards/chosen": -11.413936614990234, "rewards/margins": 4.682851791381836, "rewards/rejected": -16.096786499023438, "step": 315 }, { "epoch": 0.19657853810264386, "grad_norm": 13.743263244628906, "learning_rate": 3.271221532091098e-06, "logits/chosen": -1.6713857650756836, "logits/rejected": 0.9872905015945435, "logps/chosen": -455.806640625, "logps/rejected": -674.9764404296875, "loss": 0.0902, "rewards/accuracies": 1.0, "rewards/chosen": -4.974114418029785, "rewards/margins": 8.05095100402832, "rewards/rejected": -13.025066375732422, "step": 316 }, { "epoch": 0.19720062208398134, "grad_norm": 10.271381378173828, "learning_rate": 3.2815734989648034e-06, "logits/chosen": -1.2763960361480713, "logits/rejected": 1.2967215776443481, "logps/chosen": -479.1017761230469, "logps/rejected": -778.9246215820312, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": -4.021428108215332, "rewards/margins": 10.688220977783203, "rewards/rejected": -14.709648132324219, "step": 317 }, { "epoch": 0.19782270606531882, "grad_norm": 10.258651733398438, "learning_rate": 3.29192546583851e-06, "logits/chosen": -0.9771242737770081, "logits/rejected": 3.320232629776001, "logps/chosen": -419.28009033203125, "logps/rejected": -837.815185546875, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": -2.8659563064575195, "rewards/margins": 10.27839469909668, "rewards/rejected": -13.144350051879883, "step": 318 }, { "epoch": 0.1984447900466563, "grad_norm": 51.61482620239258, "learning_rate": 3.3022774327122154e-06, "logits/chosen": 0.948689341545105, "logits/rejected": 1.2561674118041992, "logps/chosen": -505.46624755859375, "logps/rejected": -577.086669921875, "loss": 2.3032, "rewards/accuracies": 0.75, "rewards/chosen": -7.470050811767578, "rewards/margins": 4.860204696655273, "rewards/rejected": -12.330255508422852, "step": 319 }, { "epoch": 0.19906687402799378, "grad_norm": 22.424713134765625, "learning_rate": 3.3126293995859214e-06, "logits/chosen": -0.02822953462600708, "logits/rejected": 4.412856578826904, "logps/chosen": -482.056396484375, "logps/rejected": -848.180419921875, "loss": 0.079, "rewards/accuracies": 1.0, "rewards/chosen": -7.365329742431641, "rewards/margins": 8.485767364501953, "rewards/rejected": -15.851097106933594, "step": 320 }, { "epoch": 0.19968895800933126, "grad_norm": 30.33492660522461, "learning_rate": 3.3229813664596278e-06, "logits/chosen": 1.4249606132507324, "logits/rejected": 3.7021050453186035, "logps/chosen": -519.3263549804688, "logps/rejected": -800.2149658203125, "loss": 0.7746, "rewards/accuracies": 0.875, "rewards/chosen": -7.840842247009277, "rewards/margins": 8.635676383972168, "rewards/rejected": -16.476520538330078, "step": 321 }, { "epoch": 0.20031104199066874, "grad_norm": 14.67454719543457, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -1.4113171100616455, "logits/rejected": 3.002666473388672, "logps/chosen": -444.43084716796875, "logps/rejected": -809.7966918945312, "loss": 0.084, "rewards/accuracies": 1.0, "rewards/chosen": -7.718070030212402, "rewards/margins": 11.31346607208252, "rewards/rejected": -19.031536102294922, "step": 322 }, { "epoch": 0.20093312597200622, "grad_norm": 11.428664207458496, "learning_rate": 3.3436853002070397e-06, "logits/chosen": 0.5857963562011719, "logits/rejected": 2.122978925704956, "logps/chosen": -484.9825134277344, "logps/rejected": -699.5858154296875, "loss": 0.0829, "rewards/accuracies": 1.0, "rewards/chosen": -2.3593549728393555, "rewards/margins": 8.682098388671875, "rewards/rejected": -11.041452407836914, "step": 323 }, { "epoch": 0.2015552099533437, "grad_norm": 46.258018493652344, "learning_rate": 3.3540372670807457e-06, "logits/chosen": -0.023642655462026596, "logits/rejected": 2.68320369720459, "logps/chosen": -434.5428466796875, "logps/rejected": -652.7188720703125, "loss": 1.1915, "rewards/accuracies": 0.75, "rewards/chosen": -5.017848491668701, "rewards/margins": 7.239933490753174, "rewards/rejected": -12.257781982421875, "step": 324 }, { "epoch": 0.20217729393468117, "grad_norm": 27.407543182373047, "learning_rate": 3.3643892339544517e-06, "logits/chosen": -0.966314971446991, "logits/rejected": 1.4754480123519897, "logps/chosen": -432.0926208496094, "logps/rejected": -727.2296142578125, "loss": 0.6391, "rewards/accuracies": 0.75, "rewards/chosen": -5.21859073638916, "rewards/margins": 8.61583423614502, "rewards/rejected": -13.83442497253418, "step": 325 }, { "epoch": 0.20279937791601865, "grad_norm": 1.9784207344055176, "learning_rate": 3.3747412008281577e-06, "logits/chosen": 0.732000470161438, "logits/rejected": 3.4103875160217285, "logps/chosen": -518.6851806640625, "logps/rejected": -762.8380126953125, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -3.6236228942871094, "rewards/margins": 12.843097686767578, "rewards/rejected": -16.466720581054688, "step": 326 }, { "epoch": 0.20342146189735613, "grad_norm": 17.703134536743164, "learning_rate": 3.3850931677018632e-06, "logits/chosen": -0.15337622165679932, "logits/rejected": 3.2251529693603516, "logps/chosen": -447.5892028808594, "logps/rejected": -686.328125, "loss": 0.1563, "rewards/accuracies": 0.875, "rewards/chosen": -0.26287829875946045, "rewards/margins": 8.936200141906738, "rewards/rejected": -9.199078559875488, "step": 327 }, { "epoch": 0.2040435458786936, "grad_norm": 3.6893739700317383, "learning_rate": 3.3954451345755696e-06, "logits/chosen": -0.10928022861480713, "logits/rejected": 3.0014255046844482, "logps/chosen": -275.645751953125, "logps/rejected": -618.063232421875, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": -1.6410908699035645, "rewards/margins": 13.339859008789062, "rewards/rejected": -14.980949401855469, "step": 328 }, { "epoch": 0.20466562986003112, "grad_norm": 32.0711784362793, "learning_rate": 3.4057971014492756e-06, "logits/chosen": 2.949876308441162, "logits/rejected": 3.1409921646118164, "logps/chosen": -831.2333984375, "logps/rejected": -917.00146484375, "loss": 0.7636, "rewards/accuracies": 0.875, "rewards/chosen": -12.08757209777832, "rewards/margins": 10.290388107299805, "rewards/rejected": -22.377960205078125, "step": 329 }, { "epoch": 0.2052877138413686, "grad_norm": 17.057174682617188, "learning_rate": 3.4161490683229816e-06, "logits/chosen": -1.017820954322815, "logits/rejected": 4.270072937011719, "logps/chosen": -252.80633544921875, "logps/rejected": -551.4617309570312, "loss": 0.0937, "rewards/accuracies": 1.0, "rewards/chosen": 1.9883391857147217, "rewards/margins": 7.211501121520996, "rewards/rejected": -5.223161697387695, "step": 330 }, { "epoch": 0.20590979782270608, "grad_norm": 42.068294525146484, "learning_rate": 3.4265010351966876e-06, "logits/chosen": 0.805486798286438, "logits/rejected": 2.702273368835449, "logps/chosen": -640.76953125, "logps/rejected": -748.0137939453125, "loss": 0.9465, "rewards/accuracies": 0.875, "rewards/chosen": -4.5970258712768555, "rewards/margins": 9.613271713256836, "rewards/rejected": -14.210297584533691, "step": 331 }, { "epoch": 0.20653188180404355, "grad_norm": 28.886274337768555, "learning_rate": 3.436853002070394e-06, "logits/chosen": -1.282555341720581, "logits/rejected": 1.9352182149887085, "logps/chosen": -425.39117431640625, "logps/rejected": -682.3564453125, "loss": 0.6583, "rewards/accuracies": 0.75, "rewards/chosen": -4.6346869468688965, "rewards/margins": 7.455491065979004, "rewards/rejected": -12.090177536010742, "step": 332 }, { "epoch": 0.20715396578538103, "grad_norm": 37.93339920043945, "learning_rate": 3.4472049689440996e-06, "logits/chosen": 2.5240001678466797, "logits/rejected": 4.233339309692383, "logps/chosen": -619.6500854492188, "logps/rejected": -794.2083740234375, "loss": 0.5793, "rewards/accuracies": 0.75, "rewards/chosen": -3.68540620803833, "rewards/margins": 5.579806327819824, "rewards/rejected": -9.265213012695312, "step": 333 }, { "epoch": 0.2077760497667185, "grad_norm": 18.976966857910156, "learning_rate": 3.4575569358178055e-06, "logits/chosen": 3.476201057434082, "logits/rejected": 4.37414026260376, "logps/chosen": -620.353271484375, "logps/rejected": -740.6907958984375, "loss": 0.2442, "rewards/accuracies": 0.875, "rewards/chosen": -4.355771064758301, "rewards/margins": 8.380756378173828, "rewards/rejected": -12.736527442932129, "step": 334 }, { "epoch": 0.208398133748056, "grad_norm": 34.64626693725586, "learning_rate": 3.467908902691512e-06, "logits/chosen": 0.05781608819961548, "logits/rejected": 2.8691015243530273, "logps/chosen": -449.12384033203125, "logps/rejected": -717.6018676757812, "loss": 0.5111, "rewards/accuracies": 0.875, "rewards/chosen": -2.46394681930542, "rewards/margins": 11.127350807189941, "rewards/rejected": -13.591299057006836, "step": 335 }, { "epoch": 0.20902021772939347, "grad_norm": 53.95747375488281, "learning_rate": 3.4782608695652175e-06, "logits/chosen": -0.6395105123519897, "logits/rejected": 3.263615608215332, "logps/chosen": -465.0777587890625, "logps/rejected": -737.4808349609375, "loss": 2.0079, "rewards/accuracies": 0.75, "rewards/chosen": -4.715614318847656, "rewards/margins": 6.04669189453125, "rewards/rejected": -10.762306213378906, "step": 336 }, { "epoch": 0.20964230171073095, "grad_norm": 55.4720458984375, "learning_rate": 3.488612836438924e-06, "logits/chosen": -1.6190953254699707, "logits/rejected": 2.6271119117736816, "logps/chosen": -385.53424072265625, "logps/rejected": -694.2275390625, "loss": 0.7186, "rewards/accuracies": 0.875, "rewards/chosen": -6.31008768081665, "rewards/margins": 9.709564208984375, "rewards/rejected": -16.019651412963867, "step": 337 }, { "epoch": 0.21026438569206843, "grad_norm": 28.41864776611328, "learning_rate": 3.4989648033126295e-06, "logits/chosen": 0.5265299081802368, "logits/rejected": 1.7877110242843628, "logps/chosen": -576.763671875, "logps/rejected": -724.7598876953125, "loss": 0.3338, "rewards/accuracies": 0.75, "rewards/chosen": -4.072409152984619, "rewards/margins": 8.15505313873291, "rewards/rejected": -12.227461814880371, "step": 338 }, { "epoch": 0.2108864696734059, "grad_norm": 1.3567543029785156, "learning_rate": 3.5093167701863355e-06, "logits/chosen": 1.584625005722046, "logits/rejected": 3.8020477294921875, "logps/chosen": -429.4696960449219, "logps/rejected": -659.51708984375, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -3.6522419452667236, "rewards/margins": 10.80935287475586, "rewards/rejected": -14.461594581604004, "step": 339 }, { "epoch": 0.2115085536547434, "grad_norm": 22.75896453857422, "learning_rate": 3.519668737060042e-06, "logits/chosen": -0.11166572570800781, "logits/rejected": 3.290238857269287, "logps/chosen": -433.4940185546875, "logps/rejected": -669.661376953125, "loss": 0.2866, "rewards/accuracies": 0.875, "rewards/chosen": -4.664963245391846, "rewards/margins": 8.477402687072754, "rewards/rejected": -13.142365455627441, "step": 340 }, { "epoch": 0.21213063763608087, "grad_norm": 59.272911071777344, "learning_rate": 3.5300207039337474e-06, "logits/chosen": 2.0690321922302246, "logits/rejected": 3.6204605102539062, "logps/chosen": -710.8519287109375, "logps/rejected": -870.4520263671875, "loss": 3.7634, "rewards/accuracies": 0.875, "rewards/chosen": -7.790801048278809, "rewards/margins": 8.290600776672363, "rewards/rejected": -16.081401824951172, "step": 341 }, { "epoch": 0.21275272161741834, "grad_norm": 18.303058624267578, "learning_rate": 3.540372670807454e-06, "logits/chosen": -2.2392303943634033, "logits/rejected": 3.1548311710357666, "logps/chosen": -287.3793029785156, "logps/rejected": -696.4027709960938, "loss": 0.0772, "rewards/accuracies": 1.0, "rewards/chosen": -1.4984145164489746, "rewards/margins": 13.312691688537598, "rewards/rejected": -14.81110668182373, "step": 342 }, { "epoch": 0.21337480559875582, "grad_norm": 32.04864501953125, "learning_rate": 3.55072463768116e-06, "logits/chosen": -0.019650399684906006, "logits/rejected": 5.416806221008301, "logps/chosen": -432.47296142578125, "logps/rejected": -780.5631103515625, "loss": 0.7646, "rewards/accuracies": 0.875, "rewards/chosen": -4.224555015563965, "rewards/margins": 5.373963356018066, "rewards/rejected": -9.598518371582031, "step": 343 }, { "epoch": 0.2139968895800933, "grad_norm": 0.13826704025268555, "learning_rate": 3.561076604554866e-06, "logits/chosen": 0.030586957931518555, "logits/rejected": 4.062129974365234, "logps/chosen": -449.2490539550781, "logps/rejected": -781.0255126953125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.162491798400879, "rewards/margins": 10.261180877685547, "rewards/rejected": -17.42367172241211, "step": 344 }, { "epoch": 0.21461897356143078, "grad_norm": 33.03892135620117, "learning_rate": 3.5714285714285718e-06, "logits/chosen": 0.037287890911102295, "logits/rejected": 4.113863945007324, "logps/chosen": -381.3580017089844, "logps/rejected": -648.242919921875, "loss": 0.3812, "rewards/accuracies": 0.875, "rewards/chosen": -0.9465177059173584, "rewards/margins": 5.442397117614746, "rewards/rejected": -6.388915061950684, "step": 345 }, { "epoch": 0.21524105754276826, "grad_norm": 37.0477180480957, "learning_rate": 3.5817805383022773e-06, "logits/chosen": 0.227480947971344, "logits/rejected": 2.9201414585113525, "logps/chosen": -538.601806640625, "logps/rejected": -760.7200927734375, "loss": 0.3476, "rewards/accuracies": 0.875, "rewards/chosen": -1.8433430194854736, "rewards/margins": 10.570931434631348, "rewards/rejected": -12.414274215698242, "step": 346 }, { "epoch": 0.21586314152410577, "grad_norm": 1.1853687763214111, "learning_rate": 3.5921325051759837e-06, "logits/chosen": 0.9866267442703247, "logits/rejected": 3.310483932495117, "logps/chosen": -620.07666015625, "logps/rejected": -880.2100219726562, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -6.350822925567627, "rewards/margins": 12.167201042175293, "rewards/rejected": -18.518024444580078, "step": 347 }, { "epoch": 0.21648522550544325, "grad_norm": 10.583118438720703, "learning_rate": 3.6024844720496897e-06, "logits/chosen": -2.126750946044922, "logits/rejected": 0.0941476821899414, "logps/chosen": -432.0557861328125, "logps/rejected": -669.322509765625, "loss": 0.0705, "rewards/accuracies": 1.0, "rewards/chosen": -5.652919769287109, "rewards/margins": 11.304596900939941, "rewards/rejected": -16.957515716552734, "step": 348 }, { "epoch": 0.21710730948678073, "grad_norm": 27.52307891845703, "learning_rate": 3.6128364389233957e-06, "logits/chosen": -0.28781723976135254, "logits/rejected": 4.803361892700195, "logps/chosen": -432.9546203613281, "logps/rejected": -847.7452392578125, "loss": 1.3175, "rewards/accuracies": 0.875, "rewards/chosen": -5.87117338180542, "rewards/margins": 12.279956817626953, "rewards/rejected": -18.15113067626953, "step": 349 }, { "epoch": 0.2177293934681182, "grad_norm": 0.44653281569480896, "learning_rate": 3.6231884057971017e-06, "logits/chosen": -2.8117458820343018, "logits/rejected": 1.8438504934310913, "logps/chosen": -285.66485595703125, "logps/rejected": -688.595947265625, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -3.911851644515991, "rewards/margins": 11.518555641174316, "rewards/rejected": -15.430407524108887, "step": 350 }, { "epoch": 0.21835147744945568, "grad_norm": 49.33088302612305, "learning_rate": 3.633540372670808e-06, "logits/chosen": 0.18182387948036194, "logits/rejected": 1.4403018951416016, "logps/chosen": -533.3489990234375, "logps/rejected": -716.069091796875, "loss": 1.6784, "rewards/accuracies": 0.625, "rewards/chosen": -6.557300090789795, "rewards/margins": 5.727059364318848, "rewards/rejected": -12.284358978271484, "step": 351 }, { "epoch": 0.21897356143079316, "grad_norm": 11.414206504821777, "learning_rate": 3.6438923395445137e-06, "logits/chosen": -2.3577916622161865, "logits/rejected": 2.345189332962036, "logps/chosen": -273.41754150390625, "logps/rejected": -637.039794921875, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": -4.158540725708008, "rewards/margins": 8.63189697265625, "rewards/rejected": -12.790438652038574, "step": 352 }, { "epoch": 0.21959564541213064, "grad_norm": 15.936989784240723, "learning_rate": 3.6542443064182196e-06, "logits/chosen": -0.885023832321167, "logits/rejected": 3.278656244277954, "logps/chosen": -498.98699951171875, "logps/rejected": -804.25634765625, "loss": 0.1398, "rewards/accuracies": 0.875, "rewards/chosen": -1.9475908279418945, "rewards/margins": 11.372220993041992, "rewards/rejected": -13.319811820983887, "step": 353 }, { "epoch": 0.22021772939346812, "grad_norm": 5.129945278167725, "learning_rate": 3.664596273291926e-06, "logits/chosen": -0.23889708518981934, "logits/rejected": 2.783459424972534, "logps/chosen": -460.1069641113281, "logps/rejected": -735.85693359375, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": -7.499173641204834, "rewards/margins": 10.676677703857422, "rewards/rejected": -18.175851821899414, "step": 354 }, { "epoch": 0.2208398133748056, "grad_norm": 12.920183181762695, "learning_rate": 3.6749482401656316e-06, "logits/chosen": -0.26782411336898804, "logits/rejected": 2.080547332763672, "logps/chosen": -357.44122314453125, "logps/rejected": -601.374267578125, "loss": 0.1093, "rewards/accuracies": 0.875, "rewards/chosen": -3.0575757026672363, "rewards/margins": 11.874951362609863, "rewards/rejected": -14.932526588439941, "step": 355 }, { "epoch": 0.22146189735614308, "grad_norm": 3.1438405513763428, "learning_rate": 3.685300207039338e-06, "logits/chosen": 0.6319728493690491, "logits/rejected": 3.0250282287597656, "logps/chosen": -542.49169921875, "logps/rejected": -804.1852416992188, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -1.6361920833587646, "rewards/margins": 12.459537506103516, "rewards/rejected": -14.09572982788086, "step": 356 }, { "epoch": 0.22208398133748056, "grad_norm": 36.63279342651367, "learning_rate": 3.6956521739130436e-06, "logits/chosen": 3.3123087882995605, "logits/rejected": 3.9450058937072754, "logps/chosen": -695.519287109375, "logps/rejected": -728.1754150390625, "loss": 0.3193, "rewards/accuracies": 0.75, "rewards/chosen": -6.348609447479248, "rewards/margins": 5.168727397918701, "rewards/rejected": -11.51733684539795, "step": 357 }, { "epoch": 0.22270606531881804, "grad_norm": 29.851964950561523, "learning_rate": 3.7060041407867496e-06, "logits/chosen": 0.2993144989013672, "logits/rejected": 3.471100091934204, "logps/chosen": -428.8981628417969, "logps/rejected": -622.437744140625, "loss": 0.398, "rewards/accuracies": 0.875, "rewards/chosen": -5.051437854766846, "rewards/margins": 8.075346946716309, "rewards/rejected": -13.126784324645996, "step": 358 }, { "epoch": 0.22332814930015552, "grad_norm": 1.975639820098877, "learning_rate": 3.716356107660456e-06, "logits/chosen": 1.1808077096939087, "logits/rejected": 3.8902134895324707, "logps/chosen": -600.333251953125, "logps/rejected": -811.3698120117188, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -2.7640974521636963, "rewards/margins": 9.818041801452637, "rewards/rejected": -12.58213996887207, "step": 359 }, { "epoch": 0.223950233281493, "grad_norm": 26.269790649414062, "learning_rate": 3.7267080745341615e-06, "logits/chosen": 0.20025908946990967, "logits/rejected": 3.0269370079040527, "logps/chosen": -549.3627319335938, "logps/rejected": -816.055908203125, "loss": 0.4161, "rewards/accuracies": 0.875, "rewards/chosen": -3.0061068534851074, "rewards/margins": 13.149438858032227, "rewards/rejected": -16.155548095703125, "step": 360 }, { "epoch": 0.22457231726283047, "grad_norm": 6.104818820953369, "learning_rate": 3.737060041407868e-06, "logits/chosen": 0.3199158310890198, "logits/rejected": 1.2874040603637695, "logps/chosen": -559.9285888671875, "logps/rejected": -674.5938720703125, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": -6.582068920135498, "rewards/margins": 7.962156295776367, "rewards/rejected": -14.544224739074707, "step": 361 }, { "epoch": 0.22519440124416795, "grad_norm": 0.13571402430534363, "learning_rate": 3.747412008281574e-06, "logits/chosen": 0.565754771232605, "logits/rejected": 4.5421552658081055, "logps/chosen": -477.07855224609375, "logps/rejected": -831.6639404296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.70570182800293, "rewards/margins": 14.201147079467773, "rewards/rejected": -21.906848907470703, "step": 362 }, { "epoch": 0.22581648522550543, "grad_norm": 29.88707160949707, "learning_rate": 3.7577639751552795e-06, "logits/chosen": -0.7453135251998901, "logits/rejected": 1.5671675205230713, "logps/chosen": -481.1373291015625, "logps/rejected": -689.8429565429688, "loss": 0.5111, "rewards/accuracies": 0.875, "rewards/chosen": -8.882606506347656, "rewards/margins": 6.476565361022949, "rewards/rejected": -15.359171867370605, "step": 363 }, { "epoch": 0.2264385692068429, "grad_norm": 28.465078353881836, "learning_rate": 3.768115942028986e-06, "logits/chosen": 1.0373425483703613, "logits/rejected": 3.4107184410095215, "logps/chosen": -617.771728515625, "logps/rejected": -864.3381958007812, "loss": 0.3374, "rewards/accuracies": 0.875, "rewards/chosen": -8.847381591796875, "rewards/margins": 14.079671859741211, "rewards/rejected": -22.927051544189453, "step": 364 }, { "epoch": 0.22706065318818042, "grad_norm": 0.7878022789955139, "learning_rate": 3.7784679089026914e-06, "logits/chosen": -1.1057369709014893, "logits/rejected": 4.344394207000732, "logps/chosen": -436.24176025390625, "logps/rejected": -857.5606689453125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -4.23887825012207, "rewards/margins": 11.883063316345215, "rewards/rejected": -16.12194061279297, "step": 365 }, { "epoch": 0.2276827371695179, "grad_norm": 26.853925704956055, "learning_rate": 3.788819875776398e-06, "logits/chosen": -0.7507523894309998, "logits/rejected": 3.8645191192626953, "logps/chosen": -458.9233093261719, "logps/rejected": -851.0308837890625, "loss": 0.1905, "rewards/accuracies": 0.875, "rewards/chosen": -7.54487419128418, "rewards/margins": 13.677597045898438, "rewards/rejected": -21.222469329833984, "step": 366 }, { "epoch": 0.22830482115085537, "grad_norm": 0.009762109257280827, "learning_rate": 3.799171842650104e-06, "logits/chosen": 0.7624329924583435, "logits/rejected": 4.902022361755371, "logps/chosen": -479.6599426269531, "logps/rejected": -928.024169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.773331642150879, "rewards/margins": 18.00758171081543, "rewards/rejected": -23.780914306640625, "step": 367 }, { "epoch": 0.22892690513219285, "grad_norm": 21.023170471191406, "learning_rate": 3.80952380952381e-06, "logits/chosen": -0.7568533420562744, "logits/rejected": 2.8591063022613525, "logps/chosen": -502.4686279296875, "logps/rejected": -829.576171875, "loss": 0.2384, "rewards/accuracies": 0.875, "rewards/chosen": -5.506834030151367, "rewards/margins": 11.209808349609375, "rewards/rejected": -16.716642379760742, "step": 368 }, { "epoch": 0.22954898911353033, "grad_norm": 3.5660157203674316, "learning_rate": 3.819875776397516e-06, "logits/chosen": -0.4312325716018677, "logits/rejected": 2.562499523162842, "logps/chosen": -480.4518127441406, "logps/rejected": -758.7933959960938, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -6.095095634460449, "rewards/margins": 9.17817211151123, "rewards/rejected": -15.273269653320312, "step": 369 }, { "epoch": 0.2301710730948678, "grad_norm": 39.562400817871094, "learning_rate": 3.830227743271222e-06, "logits/chosen": -2.5081186294555664, "logits/rejected": 0.7461903095245361, "logps/chosen": -378.31488037109375, "logps/rejected": -717.0318603515625, "loss": 0.7567, "rewards/accuracies": 0.75, "rewards/chosen": -6.153487205505371, "rewards/margins": 7.799801826477051, "rewards/rejected": -13.953289031982422, "step": 370 }, { "epoch": 0.2307931570762053, "grad_norm": 3.9070324897766113, "learning_rate": 3.840579710144928e-06, "logits/chosen": -1.1572139263153076, "logits/rejected": 4.676782608032227, "logps/chosen": -444.24261474609375, "logps/rejected": -850.5587768554688, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -5.589866638183594, "rewards/margins": 13.885885238647461, "rewards/rejected": -19.475751876831055, "step": 371 }, { "epoch": 0.23141524105754277, "grad_norm": 1.867079496383667, "learning_rate": 3.850931677018634e-06, "logits/chosen": -1.5081374645233154, "logits/rejected": 3.0880353450775146, "logps/chosen": -357.534423828125, "logps/rejected": -728.313232421875, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -4.523712158203125, "rewards/margins": 10.212594985961914, "rewards/rejected": -14.736308097839355, "step": 372 }, { "epoch": 0.23203732503888025, "grad_norm": 27.546833038330078, "learning_rate": 3.86128364389234e-06, "logits/chosen": 1.3264672756195068, "logits/rejected": 3.7906978130340576, "logps/chosen": -420.4259033203125, "logps/rejected": -580.6475830078125, "loss": 0.2752, "rewards/accuracies": 0.875, "rewards/chosen": -6.540620803833008, "rewards/margins": 7.766623497009277, "rewards/rejected": -14.307243347167969, "step": 373 }, { "epoch": 0.23265940902021773, "grad_norm": 31.622692108154297, "learning_rate": 3.871635610766046e-06, "logits/chosen": -1.0142524242401123, "logits/rejected": 1.874340295791626, "logps/chosen": -449.20782470703125, "logps/rejected": -648.7708740234375, "loss": 0.8318, "rewards/accuracies": 0.875, "rewards/chosen": -6.764393329620361, "rewards/margins": 8.824634552001953, "rewards/rejected": -15.589027404785156, "step": 374 }, { "epoch": 0.2332814930015552, "grad_norm": 4.413476467132568, "learning_rate": 3.881987577639752e-06, "logits/chosen": -2.1498818397521973, "logits/rejected": 3.9574713706970215, "logps/chosen": -471.00579833984375, "logps/rejected": -936.8933715820312, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": -6.591935634613037, "rewards/margins": 15.384891510009766, "rewards/rejected": -21.976825714111328, "step": 375 }, { "epoch": 0.23390357698289269, "grad_norm": 44.951560974121094, "learning_rate": 3.892339544513457e-06, "logits/chosen": 0.2634568214416504, "logits/rejected": 4.468165874481201, "logps/chosen": -532.2499389648438, "logps/rejected": -898.7428588867188, "loss": 1.4199, "rewards/accuracies": 0.875, "rewards/chosen": -7.81072473526001, "rewards/margins": 9.554339408874512, "rewards/rejected": -17.365062713623047, "step": 376 }, { "epoch": 0.23452566096423016, "grad_norm": 0.7279941439628601, "learning_rate": 3.902691511387164e-06, "logits/chosen": -2.8469130992889404, "logits/rejected": 2.8125102519989014, "logps/chosen": -294.74200439453125, "logps/rejected": -710.29052734375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -2.099022150039673, "rewards/margins": 10.450815200805664, "rewards/rejected": -12.549837112426758, "step": 377 }, { "epoch": 0.23514774494556764, "grad_norm": 0.7543030977249146, "learning_rate": 3.91304347826087e-06, "logits/chosen": 0.9640889763832092, "logits/rejected": 3.2024917602539062, "logps/chosen": -529.7698364257812, "logps/rejected": -801.398193359375, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -8.66275405883789, "rewards/margins": 12.464832305908203, "rewards/rejected": -21.127586364746094, "step": 378 }, { "epoch": 0.23576982892690512, "grad_norm": 3.1311933994293213, "learning_rate": 3.923395445134576e-06, "logits/chosen": 0.4315589368343353, "logits/rejected": 3.47430682182312, "logps/chosen": -391.7742919921875, "logps/rejected": -686.934814453125, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": -9.57724380493164, "rewards/margins": 9.273082733154297, "rewards/rejected": -18.85032844543457, "step": 379 }, { "epoch": 0.2363919129082426, "grad_norm": 51.43965148925781, "learning_rate": 3.933747412008282e-06, "logits/chosen": -0.633091151714325, "logits/rejected": 2.4071877002716064, "logps/chosen": -599.2264404296875, "logps/rejected": -844.1055908203125, "loss": 0.3701, "rewards/accuracies": 0.875, "rewards/chosen": -11.826896667480469, "rewards/margins": 10.886575698852539, "rewards/rejected": -22.713472366333008, "step": 380 }, { "epoch": 0.23701399688958008, "grad_norm": 13.136116027832031, "learning_rate": 3.9440993788819884e-06, "logits/chosen": 0.8338421583175659, "logits/rejected": 3.6499695777893066, "logps/chosen": -475.3841552734375, "logps/rejected": -757.0548095703125, "loss": 0.246, "rewards/accuracies": 0.875, "rewards/chosen": -8.005119323730469, "rewards/margins": 10.59940242767334, "rewards/rejected": -18.604522705078125, "step": 381 }, { "epoch": 0.2376360808709176, "grad_norm": 49.22789001464844, "learning_rate": 3.954451345755694e-06, "logits/chosen": -2.2940762042999268, "logits/rejected": 2.278968334197998, "logps/chosen": -396.1622619628906, "logps/rejected": -734.0848388671875, "loss": 1.6745, "rewards/accuracies": 0.875, "rewards/chosen": -6.4951090812683105, "rewards/margins": 12.099257469177246, "rewards/rejected": -18.5943660736084, "step": 382 }, { "epoch": 0.23825816485225507, "grad_norm": 38.47724533081055, "learning_rate": 3.9648033126294e-06, "logits/chosen": 1.8390161991119385, "logits/rejected": 2.8606789112091064, "logps/chosen": -547.0473022460938, "logps/rejected": -666.1442260742188, "loss": 0.8119, "rewards/accuracies": 0.75, "rewards/chosen": -8.53658676147461, "rewards/margins": 7.52390193939209, "rewards/rejected": -16.060489654541016, "step": 383 }, { "epoch": 0.23888024883359255, "grad_norm": 0.290153443813324, "learning_rate": 3.975155279503106e-06, "logits/chosen": -2.8529579639434814, "logits/rejected": 3.3242416381835938, "logps/chosen": -212.5696563720703, "logps/rejected": -679.15869140625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.8149008750915527, "rewards/margins": 10.911154747009277, "rewards/rejected": -14.726055145263672, "step": 384 }, { "epoch": 0.23950233281493002, "grad_norm": 13.57795238494873, "learning_rate": 3.9855072463768115e-06, "logits/chosen": 0.6920334696769714, "logits/rejected": 3.7827398777008057, "logps/chosen": -409.10968017578125, "logps/rejected": -730.4736938476562, "loss": 0.0954, "rewards/accuracies": 0.875, "rewards/chosen": -5.51535701751709, "rewards/margins": 10.565073013305664, "rewards/rejected": -16.080429077148438, "step": 385 }, { "epoch": 0.2401244167962675, "grad_norm": 20.393552780151367, "learning_rate": 3.995859213250518e-06, "logits/chosen": -2.67911434173584, "logits/rejected": 3.7035298347473145, "logps/chosen": -335.30224609375, "logps/rejected": -857.2125854492188, "loss": 0.0903, "rewards/accuracies": 1.0, "rewards/chosen": -5.92775821685791, "rewards/margins": 16.138141632080078, "rewards/rejected": -22.065900802612305, "step": 386 }, { "epoch": 0.24074650077760498, "grad_norm": 44.758060455322266, "learning_rate": 4.0062111801242235e-06, "logits/chosen": 1.2232826948165894, "logits/rejected": 3.325286388397217, "logps/chosen": -695.9320068359375, "logps/rejected": -895.4841918945312, "loss": 1.7857, "rewards/accuracies": 0.75, "rewards/chosen": -12.098251342773438, "rewards/margins": 7.316153049468994, "rewards/rejected": -19.414403915405273, "step": 387 }, { "epoch": 0.24136858475894246, "grad_norm": 4.955570220947266, "learning_rate": 4.01656314699793e-06, "logits/chosen": 0.6446991562843323, "logits/rejected": 3.0992045402526855, "logps/chosen": -446.56939697265625, "logps/rejected": -768.3590087890625, "loss": 0.054, "rewards/accuracies": 1.0, "rewards/chosen": -8.716497421264648, "rewards/margins": 15.004471778869629, "rewards/rejected": -23.720970153808594, "step": 388 }, { "epoch": 0.24199066874027994, "grad_norm": 0.3073206841945648, "learning_rate": 4.026915113871636e-06, "logits/chosen": -1.1649131774902344, "logits/rejected": 2.8903706073760986, "logps/chosen": -549.2460327148438, "logps/rejected": -947.3636474609375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -8.724716186523438, "rewards/margins": 17.002840042114258, "rewards/rejected": -25.727556228637695, "step": 389 }, { "epoch": 0.24261275272161742, "grad_norm": 46.902503967285156, "learning_rate": 4.037267080745342e-06, "logits/chosen": 4.440867900848389, "logits/rejected": 5.046853065490723, "logps/chosen": -755.260498046875, "logps/rejected": -915.3885498046875, "loss": 0.9632, "rewards/accuracies": 0.75, "rewards/chosen": -9.23651123046875, "rewards/margins": 8.646513938903809, "rewards/rejected": -17.883026123046875, "step": 390 }, { "epoch": 0.2432348367029549, "grad_norm": 62.403770446777344, "learning_rate": 4.047619047619048e-06, "logits/chosen": 1.4265656471252441, "logits/rejected": 1.5992298126220703, "logps/chosen": -569.1845703125, "logps/rejected": -579.0994262695312, "loss": 1.5848, "rewards/accuracies": 0.375, "rewards/chosen": -7.588815689086914, "rewards/margins": 3.867744207382202, "rewards/rejected": -11.456559181213379, "step": 391 }, { "epoch": 0.24385692068429238, "grad_norm": 0.5350190997123718, "learning_rate": 4.057971014492754e-06, "logits/chosen": -0.43517547845840454, "logits/rejected": 4.035444736480713, "logps/chosen": -518.9000854492188, "logps/rejected": -957.9375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -8.960100173950195, "rewards/margins": 20.601770401000977, "rewards/rejected": -29.561870574951172, "step": 392 }, { "epoch": 0.24447900466562986, "grad_norm": 17.18977928161621, "learning_rate": 4.06832298136646e-06, "logits/chosen": 2.52517032623291, "logits/rejected": 3.6979427337646484, "logps/chosen": -600.1431274414062, "logps/rejected": -744.0851440429688, "loss": 0.1593, "rewards/accuracies": 1.0, "rewards/chosen": -11.176643371582031, "rewards/margins": 10.406632423400879, "rewards/rejected": -21.583276748657227, "step": 393 }, { "epoch": 0.24510108864696734, "grad_norm": 26.286714553833008, "learning_rate": 4.078674948240166e-06, "logits/chosen": 0.6746093034744263, "logits/rejected": 3.103036880493164, "logps/chosen": -551.1771850585938, "logps/rejected": -789.2107543945312, "loss": 0.1156, "rewards/accuracies": 0.875, "rewards/chosen": -9.185583114624023, "rewards/margins": 12.239805221557617, "rewards/rejected": -21.42538833618164, "step": 394 }, { "epoch": 0.24572317262830481, "grad_norm": 48.16623306274414, "learning_rate": 4.089026915113871e-06, "logits/chosen": 0.9857127666473389, "logits/rejected": 2.9497580528259277, "logps/chosen": -589.0301513671875, "logps/rejected": -858.3607177734375, "loss": 0.8546, "rewards/accuracies": 0.75, "rewards/chosen": -8.686701774597168, "rewards/margins": 8.606294631958008, "rewards/rejected": -17.292997360229492, "step": 395 }, { "epoch": 0.2463452566096423, "grad_norm": 16.663297653198242, "learning_rate": 4.099378881987578e-06, "logits/chosen": -0.5236176252365112, "logits/rejected": 4.395070552825928, "logps/chosen": -501.79010009765625, "logps/rejected": -919.9902954101562, "loss": 0.1062, "rewards/accuracies": 1.0, "rewards/chosen": -7.657619953155518, "rewards/margins": 13.845070838928223, "rewards/rejected": -21.502689361572266, "step": 396 }, { "epoch": 0.24696734059097977, "grad_norm": 10.62903881072998, "learning_rate": 4.109730848861284e-06, "logits/chosen": 1.6638069152832031, "logits/rejected": 4.234158992767334, "logps/chosen": -656.40966796875, "logps/rejected": -937.57373046875, "loss": 0.0413, "rewards/accuracies": 1.0, "rewards/chosen": -12.261098861694336, "rewards/margins": 14.2492094039917, "rewards/rejected": -26.51030921936035, "step": 397 }, { "epoch": 0.24758942457231725, "grad_norm": 52.60649490356445, "learning_rate": 4.12008281573499e-06, "logits/chosen": -2.485949993133545, "logits/rejected": 1.5430916547775269, "logps/chosen": -493.4781494140625, "logps/rejected": -862.8096923828125, "loss": 1.3036, "rewards/accuracies": 0.75, "rewards/chosen": -9.662334442138672, "rewards/margins": 12.271377563476562, "rewards/rejected": -21.933712005615234, "step": 398 }, { "epoch": 0.24821150855365473, "grad_norm": 45.10889434814453, "learning_rate": 4.130434782608696e-06, "logits/chosen": -0.12205278873443604, "logits/rejected": 3.722781181335449, "logps/chosen": -527.0693359375, "logps/rejected": -907.486083984375, "loss": 0.9624, "rewards/accuracies": 0.875, "rewards/chosen": -10.117431640625, "rewards/margins": 13.028144836425781, "rewards/rejected": -23.14557647705078, "step": 399 }, { "epoch": 0.24883359253499224, "grad_norm": 30.479806900024414, "learning_rate": 4.1407867494824025e-06, "logits/chosen": -1.7838436365127563, "logits/rejected": 3.219532012939453, "logps/chosen": -398.19696044921875, "logps/rejected": -772.0665283203125, "loss": 0.2774, "rewards/accuracies": 0.875, "rewards/chosen": -4.097878456115723, "rewards/margins": 10.300621032714844, "rewards/rejected": -14.39849853515625, "step": 400 }, { "epoch": 0.24945567651632972, "grad_norm": 8.507942199707031, "learning_rate": 4.151138716356108e-06, "logits/chosen": -2.097954034805298, "logits/rejected": 0.6987195611000061, "logps/chosen": -465.9945373535156, "logps/rejected": -792.6306762695312, "loss": 0.0872, "rewards/accuracies": 1.0, "rewards/chosen": -7.525957107543945, "rewards/margins": 9.363893508911133, "rewards/rejected": -16.889850616455078, "step": 401 }, { "epoch": 0.25007776049766717, "grad_norm": 0.2137855887413025, "learning_rate": 4.1614906832298145e-06, "logits/chosen": -0.9140585660934448, "logits/rejected": 2.3151116371154785, "logps/chosen": -567.2479858398438, "logps/rejected": -872.9825439453125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.398047924041748, "rewards/margins": 16.891010284423828, "rewards/rejected": -24.289058685302734, "step": 402 }, { "epoch": 0.2506998444790047, "grad_norm": 8.883277893066406, "learning_rate": 4.17184265010352e-06, "logits/chosen": 1.1128238439559937, "logits/rejected": 3.5758769512176514, "logps/chosen": -319.7587585449219, "logps/rejected": -550.59130859375, "loss": 0.1725, "rewards/accuracies": 0.875, "rewards/chosen": -5.242685317993164, "rewards/margins": 8.442047119140625, "rewards/rejected": -13.684733390808105, "step": 403 }, { "epoch": 0.2513219284603421, "grad_norm": 2.909620761871338, "learning_rate": 4.182194616977226e-06, "logits/chosen": -2.1192445755004883, "logits/rejected": 1.4824601411819458, "logps/chosen": -362.9059143066406, "logps/rejected": -629.9395751953125, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": -6.2401885986328125, "rewards/margins": 11.025968551635742, "rewards/rejected": -17.266155242919922, "step": 404 }, { "epoch": 0.25194401244167963, "grad_norm": 40.915035247802734, "learning_rate": 4.192546583850932e-06, "logits/chosen": 0.9748575687408447, "logits/rejected": 3.5667386054992676, "logps/chosen": -563.0350341796875, "logps/rejected": -791.6390380859375, "loss": 1.3857, "rewards/accuracies": 0.875, "rewards/chosen": -8.867755889892578, "rewards/margins": 11.296546936035156, "rewards/rejected": -20.164302825927734, "step": 405 }, { "epoch": 0.2525660964230171, "grad_norm": 17.680635452270508, "learning_rate": 4.202898550724638e-06, "logits/chosen": -1.3480279445648193, "logits/rejected": 4.3109130859375, "logps/chosen": -554.6787109375, "logps/rejected": -936.0064086914062, "loss": 0.126, "rewards/accuracies": 1.0, "rewards/chosen": -9.705822944641113, "rewards/margins": 10.080413818359375, "rewards/rejected": -19.786235809326172, "step": 406 }, { "epoch": 0.2531881804043546, "grad_norm": 35.00507354736328, "learning_rate": 4.213250517598344e-06, "logits/chosen": 2.5178279876708984, "logits/rejected": 4.39531135559082, "logps/chosen": -507.35345458984375, "logps/rejected": -729.5151977539062, "loss": 0.4349, "rewards/accuracies": 0.75, "rewards/chosen": -3.729861259460449, "rewards/margins": 7.778502941131592, "rewards/rejected": -11.508363723754883, "step": 407 }, { "epoch": 0.25381026438569204, "grad_norm": 0.04022481292486191, "learning_rate": 4.22360248447205e-06, "logits/chosen": -0.8258675336837769, "logits/rejected": 3.800693988800049, "logps/chosen": -420.3175048828125, "logps/rejected": -882.1793212890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.828684329986572, "rewards/margins": 18.809568405151367, "rewards/rejected": -24.63825225830078, "step": 408 }, { "epoch": 0.25443234836702955, "grad_norm": 0.001061922637745738, "learning_rate": 4.233954451345756e-06, "logits/chosen": -3.2353157997131348, "logits/rejected": 1.6717270612716675, "logps/chosen": -393.54449462890625, "logps/rejected": -819.972412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.6159932017326355, "rewards/margins": 16.839954376220703, "rewards/rejected": -17.455947875976562, "step": 409 }, { "epoch": 0.25505443234836706, "grad_norm": 8.824790954589844, "learning_rate": 4.244306418219462e-06, "logits/chosen": 0.3574584126472473, "logits/rejected": 4.6828107833862305, "logps/chosen": -431.82720947265625, "logps/rejected": -833.9597778320312, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": -5.883938789367676, "rewards/margins": 15.454058647155762, "rewards/rejected": -21.337997436523438, "step": 410 }, { "epoch": 0.2556765163297045, "grad_norm": 28.11722183227539, "learning_rate": 4.254658385093168e-06, "logits/chosen": 1.0341689586639404, "logits/rejected": 1.9157094955444336, "logps/chosen": -704.972412109375, "logps/rejected": -916.5335083007812, "loss": 1.1396, "rewards/accuracies": 0.875, "rewards/chosen": -6.860650062561035, "rewards/margins": 14.662042617797852, "rewards/rejected": -21.522693634033203, "step": 411 }, { "epoch": 0.256298600311042, "grad_norm": 12.06362533569336, "learning_rate": 4.265010351966874e-06, "logits/chosen": 2.358436107635498, "logits/rejected": 1.828049659729004, "logps/chosen": -598.4984741210938, "logps/rejected": -622.1403198242188, "loss": 0.0603, "rewards/accuracies": 1.0, "rewards/chosen": -5.350015163421631, "rewards/margins": 6.945611953735352, "rewards/rejected": -12.29562759399414, "step": 412 }, { "epoch": 0.25692068429237946, "grad_norm": 32.59035873413086, "learning_rate": 4.27536231884058e-06, "logits/chosen": -2.3559393882751465, "logits/rejected": -0.047882288694381714, "logps/chosen": -506.2479553222656, "logps/rejected": -774.8721923828125, "loss": 1.0308, "rewards/accuracies": 0.875, "rewards/chosen": -6.341673374176025, "rewards/margins": 11.014400482177734, "rewards/rejected": -17.3560733795166, "step": 413 }, { "epoch": 0.25754276827371697, "grad_norm": 0.0008131487993523479, "learning_rate": 4.2857142857142855e-06, "logits/chosen": -0.24382908642292023, "logits/rejected": 3.1727211475372314, "logps/chosen": -495.8878173828125, "logps/rejected": -906.5289916992188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.006720066070557, "rewards/margins": 21.804332733154297, "rewards/rejected": -25.811054229736328, "step": 414 }, { "epoch": 0.2581648522550544, "grad_norm": 53.79934310913086, "learning_rate": 4.296066252587992e-06, "logits/chosen": 2.0413005352020264, "logits/rejected": 2.457620143890381, "logps/chosen": -714.4756469726562, "logps/rejected": -817.9464721679688, "loss": 1.9911, "rewards/accuracies": 0.625, "rewards/chosen": -7.469036102294922, "rewards/margins": 7.2690205574035645, "rewards/rejected": -14.738057136535645, "step": 415 }, { "epoch": 0.25878693623639193, "grad_norm": 46.28091812133789, "learning_rate": 4.306418219461698e-06, "logits/chosen": -1.0962793827056885, "logits/rejected": 2.4222702980041504, "logps/chosen": -452.4810791015625, "logps/rejected": -901.61328125, "loss": 0.4518, "rewards/accuracies": 0.875, "rewards/chosen": -4.062511444091797, "rewards/margins": 14.822870254516602, "rewards/rejected": -18.8853816986084, "step": 416 }, { "epoch": 0.2594090202177294, "grad_norm": 5.891534805297852, "learning_rate": 4.316770186335404e-06, "logits/chosen": -2.3341612815856934, "logits/rejected": 2.0078935623168945, "logps/chosen": -371.0771789550781, "logps/rejected": -782.2593383789062, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": -5.06859016418457, "rewards/margins": 15.229127883911133, "rewards/rejected": -20.297718048095703, "step": 417 }, { "epoch": 0.2600311041990669, "grad_norm": 61.13966369628906, "learning_rate": 4.32712215320911e-06, "logits/chosen": -0.5787434577941895, "logits/rejected": 2.6994335651397705, "logps/chosen": -533.9091796875, "logps/rejected": -798.9489135742188, "loss": 2.1237, "rewards/accuracies": 0.75, "rewards/chosen": -3.8390722274780273, "rewards/margins": 6.464807987213135, "rewards/rejected": -10.30388069152832, "step": 418 }, { "epoch": 0.26065318818040434, "grad_norm": 38.46092987060547, "learning_rate": 4.337474120082817e-06, "logits/chosen": -2.4339849948883057, "logits/rejected": 2.17366886138916, "logps/chosen": -351.37957763671875, "logps/rejected": -786.957275390625, "loss": 1.4374, "rewards/accuracies": 0.875, "rewards/chosen": -5.966383934020996, "rewards/margins": 12.617194175720215, "rewards/rejected": -18.583576202392578, "step": 419 }, { "epoch": 0.26127527216174184, "grad_norm": 3.1034646034240723, "learning_rate": 4.347826086956522e-06, "logits/chosen": -1.9733619689941406, "logits/rejected": 1.6848191022872925, "logps/chosen": -379.7969970703125, "logps/rejected": -691.1990966796875, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": -5.863301753997803, "rewards/margins": 12.697863578796387, "rewards/rejected": -18.56116485595703, "step": 420 }, { "epoch": 0.2618973561430793, "grad_norm": 32.67521286010742, "learning_rate": 4.358178053830228e-06, "logits/chosen": 1.1796698570251465, "logits/rejected": 4.6011834144592285, "logps/chosen": -574.990234375, "logps/rejected": -920.2255859375, "loss": 0.7592, "rewards/accuracies": 0.75, "rewards/chosen": -5.079226970672607, "rewards/margins": 8.750815391540527, "rewards/rejected": -13.830042839050293, "step": 421 }, { "epoch": 0.2625194401244168, "grad_norm": 0.34506696462631226, "learning_rate": 4.368530020703934e-06, "logits/chosen": 0.18304240703582764, "logits/rejected": 2.8602895736694336, "logps/chosen": -505.0839538574219, "logps/rejected": -769.8147583007812, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.4538447856903076, "rewards/margins": 13.170971870422363, "rewards/rejected": -16.62481689453125, "step": 422 }, { "epoch": 0.26314152410575425, "grad_norm": 21.526031494140625, "learning_rate": 4.37888198757764e-06, "logits/chosen": 0.37090492248535156, "logits/rejected": 3.465035915374756, "logps/chosen": -380.53021240234375, "logps/rejected": -645.5463256835938, "loss": 0.3492, "rewards/accuracies": 0.875, "rewards/chosen": -5.203587532043457, "rewards/margins": 10.76041030883789, "rewards/rejected": -15.963998794555664, "step": 423 }, { "epoch": 0.26376360808709176, "grad_norm": 0.6480556130409241, "learning_rate": 4.389233954451346e-06, "logits/chosen": 0.4218369722366333, "logits/rejected": 3.8173675537109375, "logps/chosen": -460.1600036621094, "logps/rejected": -816.0669555664062, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.8497533798217773, "rewards/margins": 16.56940460205078, "rewards/rejected": -20.419158935546875, "step": 424 }, { "epoch": 0.2643856920684292, "grad_norm": 0.35805878043174744, "learning_rate": 4.399585921325052e-06, "logits/chosen": -0.5259240865707397, "logits/rejected": 2.797184467315674, "logps/chosen": -321.9033203125, "logps/rejected": -709.5282592773438, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -4.251986503601074, "rewards/margins": 13.363567352294922, "rewards/rejected": -17.615554809570312, "step": 425 }, { "epoch": 0.2650077760497667, "grad_norm": 30.364904403686523, "learning_rate": 4.409937888198758e-06, "logits/chosen": 0.010838508605957031, "logits/rejected": 3.3058197498321533, "logps/chosen": -322.04998779296875, "logps/rejected": -627.85107421875, "loss": 0.3733, "rewards/accuracies": 0.875, "rewards/chosen": -0.6551088690757751, "rewards/margins": 10.415206909179688, "rewards/rejected": -11.070316314697266, "step": 426 }, { "epoch": 0.2656298600311042, "grad_norm": 0.6932123303413391, "learning_rate": 4.4202898550724645e-06, "logits/chosen": -3.0709590911865234, "logits/rejected": 1.8338202238082886, "logps/chosen": -395.63201904296875, "logps/rejected": -706.2110595703125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -2.838165283203125, "rewards/margins": 10.090193748474121, "rewards/rejected": -12.928359985351562, "step": 427 }, { "epoch": 0.2662519440124417, "grad_norm": 20.828861236572266, "learning_rate": 4.43064182194617e-06, "logits/chosen": 1.9217793941497803, "logits/rejected": 3.788158893585205, "logps/chosen": -446.7420654296875, "logps/rejected": -647.2725219726562, "loss": 0.1785, "rewards/accuracies": 0.875, "rewards/chosen": -3.4110867977142334, "rewards/margins": 10.278014183044434, "rewards/rejected": -13.689101219177246, "step": 428 }, { "epoch": 0.2668740279937792, "grad_norm": 72.134033203125, "learning_rate": 4.4409937888198765e-06, "logits/chosen": -1.149119257926941, "logits/rejected": 1.603652000427246, "logps/chosen": -605.6443481445312, "logps/rejected": -874.0235595703125, "loss": 2.4631, "rewards/accuracies": 0.625, "rewards/chosen": -6.702610969543457, "rewards/margins": 8.99447250366211, "rewards/rejected": -15.69708251953125, "step": 429 }, { "epoch": 0.26749611197511663, "grad_norm": 14.545768737792969, "learning_rate": 4.451345755693582e-06, "logits/chosen": 0.7905763387680054, "logits/rejected": 2.0667171478271484, "logps/chosen": -578.0338745117188, "logps/rejected": -808.4791259765625, "loss": 0.0718, "rewards/accuracies": 1.0, "rewards/chosen": -5.076621055603027, "rewards/margins": 11.813894271850586, "rewards/rejected": -16.890514373779297, "step": 430 }, { "epoch": 0.26811819595645414, "grad_norm": 2.8750247955322266, "learning_rate": 4.4616977225672884e-06, "logits/chosen": 0.553899884223938, "logits/rejected": 1.9621305465698242, "logps/chosen": -432.06707763671875, "logps/rejected": -614.139404296875, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": -6.893393039703369, "rewards/margins": 7.513227462768555, "rewards/rejected": -14.406620025634766, "step": 431 }, { "epoch": 0.2687402799377916, "grad_norm": 0.06428180634975433, "learning_rate": 4.472049689440994e-06, "logits/chosen": -3.2774603366851807, "logits/rejected": 3.1442956924438477, "logps/chosen": -306.5682678222656, "logps/rejected": -784.934814453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.5532071590423584, "rewards/margins": 16.325817108154297, "rewards/rejected": -19.879024505615234, "step": 432 }, { "epoch": 0.2693623639191291, "grad_norm": 0.01828506775200367, "learning_rate": 4.4824016563146996e-06, "logits/chosen": -2.483086347579956, "logits/rejected": 2.2985618114471436, "logps/chosen": -339.38409423828125, "logps/rejected": -746.3882446289062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.604024887084961, "rewards/margins": 13.318676948547363, "rewards/rejected": -16.92270278930664, "step": 433 }, { "epoch": 0.26998444790046655, "grad_norm": 31.069183349609375, "learning_rate": 4.492753623188406e-06, "logits/chosen": 0.32477349042892456, "logits/rejected": 3.5227673053741455, "logps/chosen": -445.853515625, "logps/rejected": -733.571044921875, "loss": 0.383, "rewards/accuracies": 0.875, "rewards/chosen": -6.151865482330322, "rewards/margins": 11.84760856628418, "rewards/rejected": -17.999475479125977, "step": 434 }, { "epoch": 0.27060653188180406, "grad_norm": 0.9645976424217224, "learning_rate": 4.503105590062112e-06, "logits/chosen": -0.11669465899467468, "logits/rejected": 4.126821041107178, "logps/chosen": -368.92138671875, "logps/rejected": -812.9979248046875, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -2.660503625869751, "rewards/margins": 15.838494300842285, "rewards/rejected": -18.498998641967773, "step": 435 }, { "epoch": 0.2712286158631415, "grad_norm": 39.218509674072266, "learning_rate": 4.513457556935818e-06, "logits/chosen": 0.9734901785850525, "logits/rejected": 2.7167110443115234, "logps/chosen": -637.0340576171875, "logps/rejected": -781.2739868164062, "loss": 0.4874, "rewards/accuracies": 0.75, "rewards/chosen": -7.061949729919434, "rewards/margins": 6.840909004211426, "rewards/rejected": -13.90285873413086, "step": 436 }, { "epoch": 0.271850699844479, "grad_norm": 0.06801696121692657, "learning_rate": 4.523809523809524e-06, "logits/chosen": -0.26763665676116943, "logits/rejected": 2.6673028469085693, "logps/chosen": -445.45440673828125, "logps/rejected": -784.701904296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -6.659411907196045, "rewards/margins": 14.931681632995605, "rewards/rejected": -21.591094970703125, "step": 437 }, { "epoch": 0.27247278382581647, "grad_norm": 36.035675048828125, "learning_rate": 4.534161490683231e-06, "logits/chosen": -0.16339880228042603, "logits/rejected": 2.9149556159973145, "logps/chosen": -583.873779296875, "logps/rejected": -795.2122192382812, "loss": 0.3557, "rewards/accuracies": 0.875, "rewards/chosen": -4.639849662780762, "rewards/margins": 12.784384727478027, "rewards/rejected": -17.42423439025879, "step": 438 }, { "epoch": 0.273094867807154, "grad_norm": 0.8856536746025085, "learning_rate": 4.544513457556936e-06, "logits/chosen": -1.4041645526885986, "logits/rejected": 2.2672030925750732, "logps/chosen": -365.6314697265625, "logps/rejected": -686.5092163085938, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -6.813375949859619, "rewards/margins": 12.718483924865723, "rewards/rejected": -19.5318603515625, "step": 439 }, { "epoch": 0.2737169517884914, "grad_norm": 3.0047109127044678, "learning_rate": 4.554865424430642e-06, "logits/chosen": 0.23409417271614075, "logits/rejected": 3.16221284866333, "logps/chosen": -432.616455078125, "logps/rejected": -746.5999145507812, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": -4.304468154907227, "rewards/margins": 14.120923042297363, "rewards/rejected": -18.425392150878906, "step": 440 }, { "epoch": 0.27433903576982893, "grad_norm": 17.851999282836914, "learning_rate": 4.565217391304348e-06, "logits/chosen": 0.34363794326782227, "logits/rejected": 1.5882227420806885, "logps/chosen": -498.787353515625, "logps/rejected": -747.3587646484375, "loss": 0.1145, "rewards/accuracies": 0.875, "rewards/chosen": -4.135153770446777, "rewards/margins": 14.027706146240234, "rewards/rejected": -18.162860870361328, "step": 441 }, { "epoch": 0.2749611197511664, "grad_norm": 0.005955725442618132, "learning_rate": 4.575569358178054e-06, "logits/chosen": -1.9308936595916748, "logits/rejected": 4.959326267242432, "logps/chosen": -230.3758544921875, "logps/rejected": -687.7498779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.3154499530792236, "rewards/margins": 15.841753959655762, "rewards/rejected": -18.157203674316406, "step": 442 }, { "epoch": 0.2755832037325039, "grad_norm": 6.812429904937744, "learning_rate": 4.58592132505176e-06, "logits/chosen": -1.1785022020339966, "logits/rejected": 4.522409915924072, "logps/chosen": -390.4237976074219, "logps/rejected": -832.566650390625, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": -1.7003462314605713, "rewards/margins": 13.203170776367188, "rewards/rejected": -14.90351676940918, "step": 443 }, { "epoch": 0.27620528771384134, "grad_norm": 0.40864428877830505, "learning_rate": 4.596273291925466e-06, "logits/chosen": 0.9464837312698364, "logits/rejected": 3.3178625106811523, "logps/chosen": -486.07080078125, "logps/rejected": -789.6358642578125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -5.384681224822998, "rewards/margins": 13.80879020690918, "rewards/rejected": -19.19347381591797, "step": 444 }, { "epoch": 0.27682737169517885, "grad_norm": 9.587814331054688, "learning_rate": 4.606625258799172e-06, "logits/chosen": -0.3521985411643982, "logits/rejected": 1.9633857011795044, "logps/chosen": -317.2825622558594, "logps/rejected": -631.1461791992188, "loss": 0.1236, "rewards/accuracies": 0.875, "rewards/chosen": -5.156929016113281, "rewards/margins": 11.896200180053711, "rewards/rejected": -17.053129196166992, "step": 445 }, { "epoch": 0.27744945567651635, "grad_norm": 18.159738540649414, "learning_rate": 4.616977225672879e-06, "logits/chosen": -2.9285995960235596, "logits/rejected": 2.2748818397521973, "logps/chosen": -380.7985534667969, "logps/rejected": -957.358154296875, "loss": 0.1393, "rewards/accuracies": 0.875, "rewards/chosen": -3.3776912689208984, "rewards/margins": 15.898237228393555, "rewards/rejected": -19.275928497314453, "step": 446 }, { "epoch": 0.2780715396578538, "grad_norm": 14.753242492675781, "learning_rate": 4.627329192546584e-06, "logits/chosen": 0.5322920680046082, "logits/rejected": 2.511157274246216, "logps/chosen": -571.1580200195312, "logps/rejected": -745.7861328125, "loss": 0.1431, "rewards/accuracies": 0.875, "rewards/chosen": -6.477001667022705, "rewards/margins": 10.424528121948242, "rewards/rejected": -16.901531219482422, "step": 447 }, { "epoch": 0.2786936236391913, "grad_norm": 41.926910400390625, "learning_rate": 4.637681159420291e-06, "logits/chosen": -0.680780291557312, "logits/rejected": 2.3562331199645996, "logps/chosen": -612.2960205078125, "logps/rejected": -949.9226684570312, "loss": 0.6981, "rewards/accuracies": 0.875, "rewards/chosen": -12.722881317138672, "rewards/margins": 13.012408256530762, "rewards/rejected": -25.73529052734375, "step": 448 }, { "epoch": 0.27931570762052876, "grad_norm": 64.53588104248047, "learning_rate": 4.648033126293996e-06, "logits/chosen": 0.5436501502990723, "logits/rejected": 1.3081331253051758, "logps/chosen": -639.3142700195312, "logps/rejected": -768.12109375, "loss": 2.0345, "rewards/accuracies": 0.75, "rewards/chosen": -10.196063995361328, "rewards/margins": 7.856705665588379, "rewards/rejected": -18.052770614624023, "step": 449 }, { "epoch": 0.27993779160186627, "grad_norm": 0.13056936860084534, "learning_rate": 4.6583850931677025e-06, "logits/chosen": -1.6725151538848877, "logits/rejected": 1.1285243034362793, "logps/chosen": -322.6612548828125, "logps/rejected": -694.7567749023438, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.016662836074829, "rewards/margins": 12.84094524383545, "rewards/rejected": -15.857608795166016, "step": 450 }, { "epoch": 0.2805598755832037, "grad_norm": 19.141525268554688, "learning_rate": 4.668737060041408e-06, "logits/chosen": -1.0159063339233398, "logits/rejected": 2.010833501815796, "logps/chosen": -506.7704772949219, "logps/rejected": -794.066162109375, "loss": 0.1208, "rewards/accuracies": 0.875, "rewards/chosen": -2.9857184886932373, "rewards/margins": 14.262092590332031, "rewards/rejected": -17.24781036376953, "step": 451 }, { "epoch": 0.28118195956454123, "grad_norm": 0.5353635549545288, "learning_rate": 4.679089026915114e-06, "logits/chosen": 1.4490152597427368, "logits/rejected": 3.0750107765197754, "logps/chosen": -593.1810913085938, "logps/rejected": -867.7335815429688, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -5.067837238311768, "rewards/margins": 19.971832275390625, "rewards/rejected": -25.039670944213867, "step": 452 }, { "epoch": 0.2818040435458787, "grad_norm": 33.27368927001953, "learning_rate": 4.68944099378882e-06, "logits/chosen": -0.8176746964454651, "logits/rejected": 4.199805736541748, "logps/chosen": -414.0248107910156, "logps/rejected": -875.6469116210938, "loss": 0.4048, "rewards/accuracies": 0.875, "rewards/chosen": -7.070443630218506, "rewards/margins": 17.442882537841797, "rewards/rejected": -24.513324737548828, "step": 453 }, { "epoch": 0.2824261275272162, "grad_norm": 0.5116642117500305, "learning_rate": 4.6997929606625265e-06, "logits/chosen": -0.9462630152702332, "logits/rejected": 4.389346122741699, "logps/chosen": -450.0084533691406, "logps/rejected": -879.887451171875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -5.727695941925049, "rewards/margins": 14.461294174194336, "rewards/rejected": -20.188989639282227, "step": 454 }, { "epoch": 0.28304821150855364, "grad_norm": 0.004335819277912378, "learning_rate": 4.710144927536232e-06, "logits/chosen": -0.34563398361206055, "logits/rejected": 2.4481041431427, "logps/chosen": -492.546630859375, "logps/rejected": -868.8828735351562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.161014556884766, "rewards/margins": 17.945571899414062, "rewards/rejected": -25.106586456298828, "step": 455 }, { "epoch": 0.28367029548989114, "grad_norm": 4.237789630889893, "learning_rate": 4.7204968944099384e-06, "logits/chosen": 1.1531126499176025, "logits/rejected": 3.893054485321045, "logps/chosen": -517.807373046875, "logps/rejected": -700.8167114257812, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -8.180765151977539, "rewards/margins": 10.975384712219238, "rewards/rejected": -19.156150817871094, "step": 456 }, { "epoch": 0.2842923794712286, "grad_norm": 29.369626998901367, "learning_rate": 4.730848861283645e-06, "logits/chosen": -0.09173685312271118, "logits/rejected": 4.074847221374512, "logps/chosen": -400.05731201171875, "logps/rejected": -793.289794921875, "loss": 0.3132, "rewards/accuracies": 0.875, "rewards/chosen": -2.655954599380493, "rewards/margins": 14.949386596679688, "rewards/rejected": -17.6053409576416, "step": 457 }, { "epoch": 0.2849144634525661, "grad_norm": 0.08755633234977722, "learning_rate": 4.74120082815735e-06, "logits/chosen": 2.690290927886963, "logits/rejected": 4.742177486419678, "logps/chosen": -631.9013671875, "logps/rejected": -792.9278564453125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.875566005706787, "rewards/margins": 11.673030853271484, "rewards/rejected": -17.548595428466797, "step": 458 }, { "epoch": 0.28553654743390355, "grad_norm": 19.94220733642578, "learning_rate": 4.751552795031056e-06, "logits/chosen": -1.3704369068145752, "logits/rejected": 3.496886968612671, "logps/chosen": -335.9270324707031, "logps/rejected": -774.93896484375, "loss": 0.2088, "rewards/accuracies": 0.875, "rewards/chosen": -3.90877628326416, "rewards/margins": 16.912059783935547, "rewards/rejected": -20.82083511352539, "step": 459 }, { "epoch": 0.28615863141524106, "grad_norm": 19.56954574584961, "learning_rate": 4.761904761904762e-06, "logits/chosen": -1.0929023027420044, "logits/rejected": 2.552992582321167, "logps/chosen": -454.52703857421875, "logps/rejected": -862.881103515625, "loss": 0.0966, "rewards/accuracies": 1.0, "rewards/chosen": -7.9837646484375, "rewards/margins": 12.657257080078125, "rewards/rejected": -20.641021728515625, "step": 460 }, { "epoch": 0.2867807153965785, "grad_norm": 49.00531005859375, "learning_rate": 4.772256728778468e-06, "logits/chosen": 1.0357595682144165, "logits/rejected": 3.4586753845214844, "logps/chosen": -576.972900390625, "logps/rejected": -813.3095703125, "loss": 2.7897, "rewards/accuracies": 0.875, "rewards/chosen": -10.867692947387695, "rewards/margins": 8.180194854736328, "rewards/rejected": -19.04788589477539, "step": 461 }, { "epoch": 0.287402799377916, "grad_norm": 35.561561584472656, "learning_rate": 4.782608695652174e-06, "logits/chosen": -0.6484382748603821, "logits/rejected": 1.8681774139404297, "logps/chosen": -480.96697998046875, "logps/rejected": -675.93505859375, "loss": 0.7702, "rewards/accuracies": 0.75, "rewards/chosen": -7.842622756958008, "rewards/margins": 7.973590850830078, "rewards/rejected": -15.816213607788086, "step": 462 }, { "epoch": 0.2880248833592535, "grad_norm": 13.397370338439941, "learning_rate": 4.79296066252588e-06, "logits/chosen": -0.4268653988838196, "logits/rejected": 1.851413607597351, "logps/chosen": -558.5866088867188, "logps/rejected": -824.4746704101562, "loss": 0.1014, "rewards/accuracies": 1.0, "rewards/chosen": -11.017292976379395, "rewards/margins": 15.577205657958984, "rewards/rejected": -26.594499588012695, "step": 463 }, { "epoch": 0.288646967340591, "grad_norm": 9.706470489501953, "learning_rate": 4.803312629399586e-06, "logits/chosen": 0.26902830600738525, "logits/rejected": 2.9063727855682373, "logps/chosen": -592.06103515625, "logps/rejected": -944.8291015625, "loss": 0.0521, "rewards/accuracies": 1.0, "rewards/chosen": -9.126492500305176, "rewards/margins": 15.503677368164062, "rewards/rejected": -24.630168914794922, "step": 464 }, { "epoch": 0.2892690513219285, "grad_norm": 10.245887756347656, "learning_rate": 4.813664596273293e-06, "logits/chosen": 1.1928220987319946, "logits/rejected": 2.9932894706726074, "logps/chosen": -420.79364013671875, "logps/rejected": -637.9657592773438, "loss": 0.0487, "rewards/accuracies": 1.0, "rewards/chosen": -5.203188896179199, "rewards/margins": 10.591708183288574, "rewards/rejected": -15.794897079467773, "step": 465 }, { "epoch": 0.28989113530326593, "grad_norm": 2.713153600692749, "learning_rate": 4.824016563146998e-06, "logits/chosen": -2.3040497303009033, "logits/rejected": 2.769646406173706, "logps/chosen": -466.1157531738281, "logps/rejected": -865.4462890625, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -6.633267879486084, "rewards/margins": 12.267324447631836, "rewards/rejected": -18.900592803955078, "step": 466 }, { "epoch": 0.29051321928460344, "grad_norm": 10.755620002746582, "learning_rate": 4.834368530020705e-06, "logits/chosen": 0.9057645797729492, "logits/rejected": 2.70548415184021, "logps/chosen": -634.2490234375, "logps/rejected": -898.1949462890625, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": -6.538129806518555, "rewards/margins": 10.6325101852417, "rewards/rejected": -17.17064094543457, "step": 467 }, { "epoch": 0.2911353032659409, "grad_norm": 0.32542532682418823, "learning_rate": 4.84472049689441e-06, "logits/chosen": 0.36011022329330444, "logits/rejected": 3.592407703399658, "logps/chosen": -515.8699340820312, "logps/rejected": -852.7752075195312, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -8.930538177490234, "rewards/margins": 12.950520515441895, "rewards/rejected": -21.881057739257812, "step": 468 }, { "epoch": 0.2917573872472784, "grad_norm": 18.29994010925293, "learning_rate": 4.855072463768117e-06, "logits/chosen": -2.088484764099121, "logits/rejected": 1.2744338512420654, "logps/chosen": -515.333984375, "logps/rejected": -878.5330810546875, "loss": 0.0913, "rewards/accuracies": 1.0, "rewards/chosen": -8.84554386138916, "rewards/margins": 16.93134880065918, "rewards/rejected": -25.77689552307129, "step": 469 }, { "epoch": 0.29237947122861585, "grad_norm": 0.0330018624663353, "learning_rate": 4.865424430641822e-06, "logits/chosen": -1.7830653190612793, "logits/rejected": 4.1540117263793945, "logps/chosen": -318.3174133300781, "logps/rejected": -939.5980224609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.052013397216797, "rewards/margins": 23.681976318359375, "rewards/rejected": -29.733993530273438, "step": 470 }, { "epoch": 0.29300155520995336, "grad_norm": 1.8088337182998657, "learning_rate": 4.875776397515528e-06, "logits/chosen": -1.733601450920105, "logits/rejected": 3.224742889404297, "logps/chosen": -384.46563720703125, "logps/rejected": -849.8120727539062, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -3.0474658012390137, "rewards/margins": 15.848743438720703, "rewards/rejected": -18.896209716796875, "step": 471 }, { "epoch": 0.2936236391912908, "grad_norm": 36.6681022644043, "learning_rate": 4.886128364389234e-06, "logits/chosen": 4.0972490310668945, "logits/rejected": 3.991466760635376, "logps/chosen": -659.598876953125, "logps/rejected": -745.9718017578125, "loss": 0.9151, "rewards/accuracies": 0.875, "rewards/chosen": -10.811805725097656, "rewards/margins": 10.334466934204102, "rewards/rejected": -21.146270751953125, "step": 472 }, { "epoch": 0.2942457231726283, "grad_norm": 25.90192222595215, "learning_rate": 4.896480331262941e-06, "logits/chosen": 0.19086448848247528, "logits/rejected": 4.012401103973389, "logps/chosen": -527.1025390625, "logps/rejected": -888.753173828125, "loss": 0.2166, "rewards/accuracies": 0.875, "rewards/chosen": -10.20269775390625, "rewards/margins": 16.669485092163086, "rewards/rejected": -26.872182846069336, "step": 473 }, { "epoch": 0.29486780715396577, "grad_norm": 33.641075134277344, "learning_rate": 4.906832298136646e-06, "logits/chosen": -0.06766408681869507, "logits/rejected": 1.6043598651885986, "logps/chosen": -641.4823608398438, "logps/rejected": -823.9783935546875, "loss": 0.8575, "rewards/accuracies": 0.875, "rewards/chosen": -6.947844505310059, "rewards/margins": 13.2529878616333, "rewards/rejected": -20.20083236694336, "step": 474 }, { "epoch": 0.2954898911353033, "grad_norm": 28.769502639770508, "learning_rate": 4.9171842650103525e-06, "logits/chosen": -2.258347749710083, "logits/rejected": 2.3731532096862793, "logps/chosen": -292.4906311035156, "logps/rejected": -720.4879760742188, "loss": 0.7488, "rewards/accuracies": 0.875, "rewards/chosen": -5.584536075592041, "rewards/margins": 18.020307540893555, "rewards/rejected": -23.60484504699707, "step": 475 }, { "epoch": 0.2961119751166407, "grad_norm": 11.994034767150879, "learning_rate": 4.927536231884059e-06, "logits/chosen": -0.4436854124069214, "logits/rejected": 3.012737989425659, "logps/chosen": -534.578369140625, "logps/rejected": -852.6429443359375, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": -10.41529655456543, "rewards/margins": 11.558574676513672, "rewards/rejected": -21.9738712310791, "step": 476 }, { "epoch": 0.29673405909797823, "grad_norm": 43.14473342895508, "learning_rate": 4.9378881987577645e-06, "logits/chosen": -1.4974907636642456, "logits/rejected": 1.5613664388656616, "logps/chosen": -588.545654296875, "logps/rejected": -959.09521484375, "loss": 0.628, "rewards/accuracies": 0.875, "rewards/chosen": -9.135448455810547, "rewards/margins": 14.607860565185547, "rewards/rejected": -23.743309020996094, "step": 477 }, { "epoch": 0.2973561430793157, "grad_norm": 0.07286535203456879, "learning_rate": 4.94824016563147e-06, "logits/chosen": 2.7268476486206055, "logits/rejected": 5.009843349456787, "logps/chosen": -562.8731079101562, "logps/rejected": -846.2417602539062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.0041303634643555, "rewards/margins": 14.70496940612793, "rewards/rejected": -18.7091007232666, "step": 478 }, { "epoch": 0.2979782270606532, "grad_norm": 21.970552444458008, "learning_rate": 4.9585921325051765e-06, "logits/chosen": -0.09169107675552368, "logits/rejected": 3.7837038040161133, "logps/chosen": -594.7907104492188, "logps/rejected": -975.8360595703125, "loss": 0.1053, "rewards/accuracies": 0.875, "rewards/chosen": -8.725290298461914, "rewards/margins": 16.457599639892578, "rewards/rejected": -25.182891845703125, "step": 479 }, { "epoch": 0.2986003110419907, "grad_norm": 0.8716603517532349, "learning_rate": 4.968944099378882e-06, "logits/chosen": -1.1426682472229004, "logits/rejected": 3.6567468643188477, "logps/chosen": -467.8244323730469, "logps/rejected": -828.1360473632812, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -9.560232162475586, "rewards/margins": 15.44735050201416, "rewards/rejected": -25.007583618164062, "step": 480 }, { "epoch": 0.29922239502332815, "grad_norm": 33.56404113769531, "learning_rate": 4.9792960662525884e-06, "logits/chosen": 0.7220965623855591, "logits/rejected": 3.8607287406921387, "logps/chosen": -571.3853149414062, "logps/rejected": -981.12353515625, "loss": 0.7797, "rewards/accuracies": 0.875, "rewards/chosen": -9.732620239257812, "rewards/margins": 18.23125648498535, "rewards/rejected": -27.963876724243164, "step": 481 }, { "epoch": 0.29984447900466565, "grad_norm": 0.08978770673274994, "learning_rate": 4.989648033126294e-06, "logits/chosen": -1.091248869895935, "logits/rejected": 3.5144314765930176, "logps/chosen": -335.8171691894531, "logps/rejected": -747.2073364257812, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.7346973419189453, "rewards/margins": 15.793294906616211, "rewards/rejected": -19.527992248535156, "step": 482 }, { "epoch": 0.3004665629860031, "grad_norm": 3.013458490371704, "learning_rate": 5e-06, "logits/chosen": 0.6055781245231628, "logits/rejected": 2.4230387210845947, "logps/chosen": -551.6712036132812, "logps/rejected": -721.6043701171875, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -3.386186361312866, "rewards/margins": 14.0813627243042, "rewards/rejected": -17.46755027770996, "step": 483 }, { "epoch": 0.3010886469673406, "grad_norm": 30.440462112426758, "learning_rate": 4.998847395112956e-06, "logits/chosen": -1.0507476329803467, "logits/rejected": 2.1822104454040527, "logps/chosen": -519.81787109375, "logps/rejected": -819.217041015625, "loss": 0.6089, "rewards/accuracies": 0.875, "rewards/chosen": -5.95994234085083, "rewards/margins": 10.954721450805664, "rewards/rejected": -16.91466522216797, "step": 484 }, { "epoch": 0.30171073094867806, "grad_norm": 0.5171369910240173, "learning_rate": 4.997694790225911e-06, "logits/chosen": 1.39463210105896, "logits/rejected": 3.4383838176727295, "logps/chosen": -607.8924560546875, "logps/rejected": -851.253173828125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -9.285219192504883, "rewards/margins": 17.140562057495117, "rewards/rejected": -26.42578125, "step": 485 }, { "epoch": 0.30233281493001557, "grad_norm": 0.0036457055248320103, "learning_rate": 4.996542185338866e-06, "logits/chosen": 0.37686654925346375, "logits/rejected": 3.2330291271209717, "logps/chosen": -513.7345581054688, "logps/rejected": -845.1243896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.9581902027130127, "rewards/margins": 16.60371208190918, "rewards/rejected": -19.561901092529297, "step": 486 }, { "epoch": 0.302954898911353, "grad_norm": 19.37331199645996, "learning_rate": 4.995389580451821e-06, "logits/chosen": 0.7491446733474731, "logits/rejected": 4.478757858276367, "logps/chosen": -346.427734375, "logps/rejected": -620.2800903320312, "loss": 0.1591, "rewards/accuracies": 0.875, "rewards/chosen": -3.718315601348877, "rewards/margins": 9.981128692626953, "rewards/rejected": -13.699443817138672, "step": 487 }, { "epoch": 0.30357698289269053, "grad_norm": 30.425840377807617, "learning_rate": 4.9942369755647765e-06, "logits/chosen": 0.05541801452636719, "logits/rejected": 2.531036376953125, "logps/chosen": -521.7699584960938, "logps/rejected": -792.4808349609375, "loss": 0.3181, "rewards/accuracies": 0.875, "rewards/chosen": -4.5134687423706055, "rewards/margins": 13.561178207397461, "rewards/rejected": -18.07464599609375, "step": 488 }, { "epoch": 0.304199066874028, "grad_norm": 0.3623151481151581, "learning_rate": 4.993084370677732e-06, "logits/chosen": -0.7831491231918335, "logits/rejected": 2.9344162940979004, "logps/chosen": -465.19976806640625, "logps/rejected": -898.609130859375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -3.581158399581909, "rewards/margins": 16.225189208984375, "rewards/rejected": -19.806346893310547, "step": 489 }, { "epoch": 0.3048211508553655, "grad_norm": 0.0868559256196022, "learning_rate": 4.991931765790687e-06, "logits/chosen": -0.7188588380813599, "logits/rejected": 3.194019317626953, "logps/chosen": -353.9021911621094, "logps/rejected": -728.07373046875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.075817823410034, "rewards/margins": 15.577851295471191, "rewards/rejected": -18.653669357299805, "step": 490 }, { "epoch": 0.30544323483670294, "grad_norm": 3.0214992875698954e-05, "learning_rate": 4.990779160903643e-06, "logits/chosen": -4.055235862731934, "logits/rejected": 2.879169464111328, "logps/chosen": -296.3211975097656, "logps/rejected": -1010.753173828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.0184645652770996, "rewards/margins": 24.33975601196289, "rewards/rejected": -27.35822105407715, "step": 491 }, { "epoch": 0.30606531881804044, "grad_norm": 0.01288218330591917, "learning_rate": 4.989626556016598e-06, "logits/chosen": -1.2633652687072754, "logits/rejected": 3.8267054557800293, "logps/chosen": -355.5899658203125, "logps/rejected": -829.5449829101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.8909149169921875, "rewards/margins": 20.595117568969727, "rewards/rejected": -23.48603057861328, "step": 492 }, { "epoch": 0.3066874027993779, "grad_norm": 7.490941524505615, "learning_rate": 4.9884739511295535e-06, "logits/chosen": 0.9076950550079346, "logits/rejected": 3.3427844047546387, "logps/chosen": -536.124755859375, "logps/rejected": -888.4501953125, "loss": 0.2884, "rewards/accuracies": 0.875, "rewards/chosen": -6.489822864532471, "rewards/margins": 21.485950469970703, "rewards/rejected": -27.97577476501465, "step": 493 }, { "epoch": 0.3073094867807154, "grad_norm": 11.447746276855469, "learning_rate": 4.987321346242509e-06, "logits/chosen": -1.4060472249984741, "logits/rejected": 3.44411563873291, "logps/chosen": -309.3318786621094, "logps/rejected": -801.0556640625, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": -5.114956378936768, "rewards/margins": 15.835704803466797, "rewards/rejected": -20.950660705566406, "step": 494 }, { "epoch": 0.30793157076205285, "grad_norm": 6.083962917327881, "learning_rate": 4.986168741355464e-06, "logits/chosen": -0.1779065728187561, "logits/rejected": 3.5842981338500977, "logps/chosen": -526.7737426757812, "logps/rejected": -901.7969970703125, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": -6.71384859085083, "rewards/margins": 17.37543487548828, "rewards/rejected": -24.089284896850586, "step": 495 }, { "epoch": 0.30855365474339036, "grad_norm": 0.15468864142894745, "learning_rate": 4.985016136468419e-06, "logits/chosen": 1.059091329574585, "logits/rejected": 3.2236602306365967, "logps/chosen": -624.5120849609375, "logps/rejected": -944.41748046875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -9.029439926147461, "rewards/margins": 17.166460037231445, "rewards/rejected": -26.19590187072754, "step": 496 }, { "epoch": 0.3091757387247278, "grad_norm": 3.3190361136803403e-05, "learning_rate": 4.983863531581374e-06, "logits/chosen": -0.9245209097862244, "logits/rejected": 3.136207103729248, "logps/chosen": -439.8807373046875, "logps/rejected": -856.2329711914062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.595765590667725, "rewards/margins": 19.405052185058594, "rewards/rejected": -26.000818252563477, "step": 497 }, { "epoch": 0.3097978227060653, "grad_norm": 19.35166358947754, "learning_rate": 4.98271092669433e-06, "logits/chosen": -0.9028311967849731, "logits/rejected": 0.7717417478561401, "logps/chosen": -487.8577880859375, "logps/rejected": -661.5962524414062, "loss": 0.3424, "rewards/accuracies": 0.875, "rewards/chosen": -7.07199239730835, "rewards/margins": 11.381423950195312, "rewards/rejected": -18.453414916992188, "step": 498 }, { "epoch": 0.3104199066874028, "grad_norm": 5.1509809494018555, "learning_rate": 4.981558321807285e-06, "logits/chosen": 2.9335570335388184, "logits/rejected": 4.180946350097656, "logps/chosen": -523.1350708007812, "logps/rejected": -706.580078125, "loss": 0.0564, "rewards/accuracies": 1.0, "rewards/chosen": -10.346258163452148, "rewards/margins": 10.622045516967773, "rewards/rejected": -20.968303680419922, "step": 499 }, { "epoch": 0.3110419906687403, "grad_norm": 46.103485107421875, "learning_rate": 4.98040571692024e-06, "logits/chosen": 1.3831346035003662, "logits/rejected": 2.8288354873657227, "logps/chosen": -670.0927734375, "logps/rejected": -866.8636474609375, "loss": 2.0674, "rewards/accuracies": 0.75, "rewards/chosen": -5.911185264587402, "rewards/margins": 7.389866828918457, "rewards/rejected": -13.30105209350586, "step": 500 }, { "epoch": 0.3116640746500778, "grad_norm": 2.393094539642334, "learning_rate": 4.979253112033195e-06, "logits/chosen": -1.0027861595153809, "logits/rejected": 4.648131847381592, "logps/chosen": -314.61761474609375, "logps/rejected": -713.0402221679688, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -4.013879776000977, "rewards/margins": 11.831021308898926, "rewards/rejected": -15.844901084899902, "step": 501 }, { "epoch": 0.31228615863141523, "grad_norm": 0.044700928032398224, "learning_rate": 4.9781005071461505e-06, "logits/chosen": -0.232437402009964, "logits/rejected": 1.7130168676376343, "logps/chosen": -703.8182373046875, "logps/rejected": -967.1251831054688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.608842372894287, "rewards/margins": 13.548971176147461, "rewards/rejected": -21.157814025878906, "step": 502 }, { "epoch": 0.31290824261275274, "grad_norm": 28.902780532836914, "learning_rate": 4.976947902259106e-06, "logits/chosen": 0.051264986395835876, "logits/rejected": 3.9489521980285645, "logps/chosen": -476.07147216796875, "logps/rejected": -838.5808715820312, "loss": 0.1531, "rewards/accuracies": 0.875, "rewards/chosen": -4.013816833496094, "rewards/margins": 14.595891952514648, "rewards/rejected": -18.609710693359375, "step": 503 }, { "epoch": 0.3135303265940902, "grad_norm": 2.9529836177825928, "learning_rate": 4.975795297372061e-06, "logits/chosen": 0.06787616014480591, "logits/rejected": 3.1880979537963867, "logps/chosen": -502.477783203125, "logps/rejected": -835.5942993164062, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": -3.26987361907959, "rewards/margins": 13.25092887878418, "rewards/rejected": -16.520803451538086, "step": 504 }, { "epoch": 0.3141524105754277, "grad_norm": 47.98479461669922, "learning_rate": 4.974642692485017e-06, "logits/chosen": -2.50323486328125, "logits/rejected": 2.1609268188476562, "logps/chosen": -439.9488830566406, "logps/rejected": -871.5830078125, "loss": 1.1082, "rewards/accuracies": 0.75, "rewards/chosen": -7.005231857299805, "rewards/margins": 15.006608963012695, "rewards/rejected": -22.0118408203125, "step": 505 }, { "epoch": 0.31477449455676515, "grad_norm": 30.343387603759766, "learning_rate": 4.973490087597972e-06, "logits/chosen": -0.5188862085342407, "logits/rejected": 3.0589683055877686, "logps/chosen": -588.429443359375, "logps/rejected": -955.3839111328125, "loss": 0.3109, "rewards/accuracies": 0.875, "rewards/chosen": -9.351261138916016, "rewards/margins": 16.22182846069336, "rewards/rejected": -25.573089599609375, "step": 506 }, { "epoch": 0.31539657853810266, "grad_norm": 0.008166669867932796, "learning_rate": 4.9723374827109275e-06, "logits/chosen": -2.4529879093170166, "logits/rejected": 1.081737995147705, "logps/chosen": -450.3961181640625, "logps/rejected": -771.6092529296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.570064067840576, "rewards/margins": 15.121671676635742, "rewards/rejected": -20.69173812866211, "step": 507 }, { "epoch": 0.3160186625194401, "grad_norm": 20.37687873840332, "learning_rate": 4.971184877823883e-06, "logits/chosen": -1.355845332145691, "logits/rejected": 3.8365607261657715, "logps/chosen": -369.71142578125, "logps/rejected": -886.731201171875, "loss": 0.0948, "rewards/accuracies": 0.875, "rewards/chosen": -6.974061965942383, "rewards/margins": 13.553607940673828, "rewards/rejected": -20.527667999267578, "step": 508 }, { "epoch": 0.3166407465007776, "grad_norm": 11.929078102111816, "learning_rate": 4.970032272936838e-06, "logits/chosen": -0.82236647605896, "logits/rejected": 2.8248941898345947, "logps/chosen": -480.3670654296875, "logps/rejected": -850.4513549804688, "loss": 0.0676, "rewards/accuracies": 1.0, "rewards/chosen": -8.097372055053711, "rewards/margins": 17.191640853881836, "rewards/rejected": -25.289012908935547, "step": 509 }, { "epoch": 0.31726283048211507, "grad_norm": 0.16133515536785126, "learning_rate": 4.968879668049793e-06, "logits/chosen": -0.7161822319030762, "logits/rejected": 2.622292995452881, "logps/chosen": -376.44049072265625, "logps/rejected": -857.2184448242188, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.603147983551025, "rewards/margins": 22.76580810546875, "rewards/rejected": -28.368953704833984, "step": 510 }, { "epoch": 0.3178849144634526, "grad_norm": 26.133943557739258, "learning_rate": 4.967727063162748e-06, "logits/chosen": -0.21386033296585083, "logits/rejected": 3.1011533737182617, "logps/chosen": -486.1957092285156, "logps/rejected": -911.2633666992188, "loss": 0.3089, "rewards/accuracies": 0.875, "rewards/chosen": -9.180964469909668, "rewards/margins": 12.32599925994873, "rewards/rejected": -21.50696563720703, "step": 511 }, { "epoch": 0.31850699844479, "grad_norm": 0.002745468867942691, "learning_rate": 4.966574458275704e-06, "logits/chosen": -1.3007858991622925, "logits/rejected": 2.079380989074707, "logps/chosen": -366.24652099609375, "logps/rejected": -766.4879150390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.880084037780762, "rewards/margins": 16.512948989868164, "rewards/rejected": -22.393033981323242, "step": 512 }, { "epoch": 0.31912908242612753, "grad_norm": 0.6302774548530579, "learning_rate": 4.965421853388659e-06, "logits/chosen": -2.278191089630127, "logits/rejected": 2.4888484477996826, "logps/chosen": -343.87042236328125, "logps/rejected": -755.074462890625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -6.464818477630615, "rewards/margins": 15.757192611694336, "rewards/rejected": -22.222009658813477, "step": 513 }, { "epoch": 0.319751166407465, "grad_norm": 11.727120399475098, "learning_rate": 4.964269248501614e-06, "logits/chosen": 1.27705717086792, "logits/rejected": 3.1875715255737305, "logps/chosen": -657.3870849609375, "logps/rejected": -968.127685546875, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": -7.210622787475586, "rewards/margins": 17.608915328979492, "rewards/rejected": -24.819538116455078, "step": 514 }, { "epoch": 0.3203732503888025, "grad_norm": 0.0041059167124331, "learning_rate": 4.963116643614569e-06, "logits/chosen": -0.5016254782676697, "logits/rejected": 2.372769832611084, "logps/chosen": -536.0447387695312, "logps/rejected": -931.0604858398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.73209285736084, "rewards/margins": 19.664264678955078, "rewards/rejected": -27.396358489990234, "step": 515 }, { "epoch": 0.32099533437014, "grad_norm": 0.01308477483689785, "learning_rate": 4.9619640387275245e-06, "logits/chosen": 0.8085079789161682, "logits/rejected": 1.2257745265960693, "logps/chosen": -672.284912109375, "logps/rejected": -892.2509765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.008455276489258, "rewards/margins": 18.847637176513672, "rewards/rejected": -30.85609245300293, "step": 516 }, { "epoch": 0.32161741835147745, "grad_norm": 0.041552409529685974, "learning_rate": 4.96081143384048e-06, "logits/chosen": 2.4713146686553955, "logits/rejected": 4.937995910644531, "logps/chosen": -600.8577880859375, "logps/rejected": -997.2860107421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.211318969726562, "rewards/margins": 22.59845542907715, "rewards/rejected": -32.809776306152344, "step": 517 }, { "epoch": 0.32223950233281495, "grad_norm": 0.008153838105499744, "learning_rate": 4.959658828953435e-06, "logits/chosen": 0.0751684308052063, "logits/rejected": 3.57389760017395, "logps/chosen": -547.2952880859375, "logps/rejected": -994.4802856445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.388373851776123, "rewards/margins": 21.344240188598633, "rewards/rejected": -28.73261260986328, "step": 518 }, { "epoch": 0.3228615863141524, "grad_norm": 30.417959213256836, "learning_rate": 4.95850622406639e-06, "logits/chosen": 0.2802954316139221, "logits/rejected": 2.9028139114379883, "logps/chosen": -483.03717041015625, "logps/rejected": -740.7561645507812, "loss": 0.8714, "rewards/accuracies": 0.875, "rewards/chosen": -9.424942016601562, "rewards/margins": 13.649709701538086, "rewards/rejected": -23.07465171813965, "step": 519 }, { "epoch": 0.3234836702954899, "grad_norm": 13.999482154846191, "learning_rate": 4.957353619179346e-06, "logits/chosen": 1.0957525968551636, "logits/rejected": 4.062682151794434, "logps/chosen": -607.4361572265625, "logps/rejected": -969.5930786132812, "loss": 0.0773, "rewards/accuracies": 1.0, "rewards/chosen": -11.513922691345215, "rewards/margins": 19.56466293334961, "rewards/rejected": -31.07858657836914, "step": 520 }, { "epoch": 0.32410575427682736, "grad_norm": 0.07647275179624557, "learning_rate": 4.9562010142923015e-06, "logits/chosen": -0.7335138320922852, "logits/rejected": 3.0744705200195312, "logps/chosen": -568.8802490234375, "logps/rejected": -1019.5764770507812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.956428527832031, "rewards/margins": 16.48337173461914, "rewards/rejected": -26.43979835510254, "step": 521 }, { "epoch": 0.32472783825816487, "grad_norm": 1.4716237783432007, "learning_rate": 4.955048409405257e-06, "logits/chosen": -1.4096927642822266, "logits/rejected": 1.4332243204116821, "logps/chosen": -559.1268920898438, "logps/rejected": -950.1163330078125, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -8.92173957824707, "rewards/margins": 16.359594345092773, "rewards/rejected": -25.281333923339844, "step": 522 }, { "epoch": 0.3253499222395023, "grad_norm": 0.0002848071453627199, "learning_rate": 4.953895804518212e-06, "logits/chosen": 1.6974499225616455, "logits/rejected": 4.660102367401123, "logps/chosen": -540.9886474609375, "logps/rejected": -895.8781127929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.400766372680664, "rewards/margins": 20.902734756469727, "rewards/rejected": -31.30350112915039, "step": 523 }, { "epoch": 0.3259720062208398, "grad_norm": 1.21076500415802, "learning_rate": 4.952743199631167e-06, "logits/chosen": -1.0652494430541992, "logits/rejected": 2.6607961654663086, "logps/chosen": -571.4052124023438, "logps/rejected": -920.467041015625, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -10.149827003479004, "rewards/margins": 22.16531753540039, "rewards/rejected": -32.315147399902344, "step": 524 }, { "epoch": 0.3265940902021773, "grad_norm": 5.951406002044678, "learning_rate": 4.951590594744122e-06, "logits/chosen": -0.5077402591705322, "logits/rejected": 0.5942150950431824, "logps/chosen": -651.2573852539062, "logps/rejected": -814.20556640625, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": -13.089807510375977, "rewards/margins": 14.319589614868164, "rewards/rejected": -27.40939712524414, "step": 525 }, { "epoch": 0.3272161741835148, "grad_norm": 26.456708908081055, "learning_rate": 4.950437989857078e-06, "logits/chosen": 2.108860969543457, "logits/rejected": 3.822720527648926, "logps/chosen": -722.9005126953125, "logps/rejected": -971.195556640625, "loss": 0.2687, "rewards/accuracies": 0.75, "rewards/chosen": -11.09481430053711, "rewards/margins": 11.886865615844727, "rewards/rejected": -22.981679916381836, "step": 526 }, { "epoch": 0.32783825816485224, "grad_norm": 34.449256896972656, "learning_rate": 4.949285384970033e-06, "logits/chosen": 0.3040791153907776, "logits/rejected": 3.248950481414795, "logps/chosen": -527.2380981445312, "logps/rejected": -877.9301147460938, "loss": 0.2376, "rewards/accuracies": 0.875, "rewards/chosen": -11.19322681427002, "rewards/margins": 14.885248184204102, "rewards/rejected": -26.078475952148438, "step": 527 }, { "epoch": 0.32846034214618974, "grad_norm": 0.7171943783760071, "learning_rate": 4.948132780082988e-06, "logits/chosen": 0.1580321490764618, "logits/rejected": 1.8069722652435303, "logps/chosen": -633.6151123046875, "logps/rejected": -865.1552734375, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -12.61785888671875, "rewards/margins": 16.59424591064453, "rewards/rejected": -29.21210479736328, "step": 528 }, { "epoch": 0.3290824261275272, "grad_norm": 32.90729904174805, "learning_rate": 4.946980175195943e-06, "logits/chosen": -0.4327443242073059, "logits/rejected": 3.3330399990081787, "logps/chosen": -536.70166015625, "logps/rejected": -927.87255859375, "loss": 0.2303, "rewards/accuracies": 0.875, "rewards/chosen": -10.0555419921875, "rewards/margins": 17.280601501464844, "rewards/rejected": -27.33614158630371, "step": 529 }, { "epoch": 0.3297045101088647, "grad_norm": 0.0027257169131189585, "learning_rate": 4.9458275703088985e-06, "logits/chosen": 2.5637218952178955, "logits/rejected": 3.3361709117889404, "logps/chosen": -708.5892333984375, "logps/rejected": -922.7438354492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.294085502624512, "rewards/margins": 21.062084197998047, "rewards/rejected": -32.356170654296875, "step": 530 }, { "epoch": 0.33032659409020215, "grad_norm": 0.9086652994155884, "learning_rate": 4.944674965421854e-06, "logits/chosen": -0.039085566997528076, "logits/rejected": 3.6807267665863037, "logps/chosen": -591.6583862304688, "logps/rejected": -1018.6286010742188, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -11.807241439819336, "rewards/margins": 18.556652069091797, "rewards/rejected": -30.3638916015625, "step": 531 }, { "epoch": 0.33094867807153966, "grad_norm": 0.13630260527133942, "learning_rate": 4.943522360534809e-06, "logits/chosen": 0.1960965096950531, "logits/rejected": 2.4245996475219727, "logps/chosen": -623.849609375, "logps/rejected": -828.4188842773438, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -14.50074577331543, "rewards/margins": 13.00462818145752, "rewards/rejected": -27.505373001098633, "step": 532 }, { "epoch": 0.33157076205287717, "grad_norm": 0.0008395586046390235, "learning_rate": 4.942369755647764e-06, "logits/chosen": 0.5557337403297424, "logits/rejected": 2.7182116508483887, "logps/chosen": -707.032958984375, "logps/rejected": -1065.6031494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.236682891845703, "rewards/margins": 24.51875114440918, "rewards/rejected": -34.755435943603516, "step": 533 }, { "epoch": 0.3321928460342146, "grad_norm": 47.25418472290039, "learning_rate": 4.94121715076072e-06, "logits/chosen": -0.8992422819137573, "logits/rejected": 2.3601064682006836, "logps/chosen": -526.9442138671875, "logps/rejected": -885.7796020507812, "loss": 0.2361, "rewards/accuracies": 0.875, "rewards/chosen": -12.016450881958008, "rewards/margins": 22.274335861206055, "rewards/rejected": -34.29078674316406, "step": 534 }, { "epoch": 0.3328149300155521, "grad_norm": 28.037433624267578, "learning_rate": 4.9400645458736755e-06, "logits/chosen": -0.71573805809021, "logits/rejected": 3.0357680320739746, "logps/chosen": -526.3771362304688, "logps/rejected": -831.0687255859375, "loss": 0.8585, "rewards/accuracies": 0.875, "rewards/chosen": -8.930638313293457, "rewards/margins": 12.199329376220703, "rewards/rejected": -21.12996482849121, "step": 535 }, { "epoch": 0.3334370139968896, "grad_norm": 15.79422664642334, "learning_rate": 4.938911940986631e-06, "logits/chosen": -2.4950482845306396, "logits/rejected": 0.818975567817688, "logps/chosen": -389.66387939453125, "logps/rejected": -746.3770751953125, "loss": 0.3818, "rewards/accuracies": 0.875, "rewards/chosen": -8.482288360595703, "rewards/margins": 17.772830963134766, "rewards/rejected": -26.25511932373047, "step": 536 }, { "epoch": 0.3340590979782271, "grad_norm": 21.64934539794922, "learning_rate": 4.937759336099586e-06, "logits/chosen": -1.2507398128509521, "logits/rejected": 0.7657705545425415, "logps/chosen": -407.2629699707031, "logps/rejected": -652.3349609375, "loss": 0.118, "rewards/accuracies": 0.875, "rewards/chosen": -7.758617877960205, "rewards/margins": 11.0945405960083, "rewards/rejected": -18.853158950805664, "step": 537 }, { "epoch": 0.33468118195956453, "grad_norm": 11.210826873779297, "learning_rate": 4.936606731212541e-06, "logits/chosen": -1.8799057006835938, "logits/rejected": -0.9870352149009705, "logps/chosen": -464.33892822265625, "logps/rejected": -667.3587646484375, "loss": 0.0587, "rewards/accuracies": 1.0, "rewards/chosen": -10.691595077514648, "rewards/margins": 13.272488594055176, "rewards/rejected": -23.96408462524414, "step": 538 }, { "epoch": 0.33530326594090204, "grad_norm": 0.03116637095808983, "learning_rate": 4.935454126325496e-06, "logits/chosen": -0.41563552618026733, "logits/rejected": 1.657536506652832, "logps/chosen": -451.22296142578125, "logps/rejected": -700.4354248046875, "loss": 0.0867, "rewards/accuracies": 0.875, "rewards/chosen": -12.579830169677734, "rewards/margins": 17.299863815307617, "rewards/rejected": -29.87969398498535, "step": 539 }, { "epoch": 0.3359253499222395, "grad_norm": 48.413124084472656, "learning_rate": 4.934301521438452e-06, "logits/chosen": -1.1306934356689453, "logits/rejected": 1.482694387435913, "logps/chosen": -490.3797912597656, "logps/rejected": -851.402099609375, "loss": 0.7104, "rewards/accuracies": 0.75, "rewards/chosen": -10.867384910583496, "rewards/margins": 19.34979248046875, "rewards/rejected": -30.21717643737793, "step": 540 }, { "epoch": 0.336547433903577, "grad_norm": 0.10275397449731827, "learning_rate": 4.933148916551407e-06, "logits/chosen": -2.942965269088745, "logits/rejected": 1.474172830581665, "logps/chosen": -426.2244567871094, "logps/rejected": -927.9111328125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -10.147480964660645, "rewards/margins": 25.321353912353516, "rewards/rejected": -35.468833923339844, "step": 541 }, { "epoch": 0.33716951788491445, "grad_norm": 41.10462951660156, "learning_rate": 4.931996311664362e-06, "logits/chosen": -1.3480467796325684, "logits/rejected": 2.5357439517974854, "logps/chosen": -555.9721069335938, "logps/rejected": -968.4746704101562, "loss": 0.4043, "rewards/accuracies": 0.875, "rewards/chosen": -14.855534553527832, "rewards/margins": 16.756187438964844, "rewards/rejected": -31.61172103881836, "step": 542 }, { "epoch": 0.33779160186625196, "grad_norm": 0.6125503182411194, "learning_rate": 4.930843706777317e-06, "logits/chosen": -0.027516961097717285, "logits/rejected": 3.6979832649230957, "logps/chosen": -636.796875, "logps/rejected": -1036.647705078125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -15.932252883911133, "rewards/margins": 19.813295364379883, "rewards/rejected": -35.745548248291016, "step": 543 }, { "epoch": 0.3384136858475894, "grad_norm": 2.54610538482666, "learning_rate": 4.9296911018902725e-06, "logits/chosen": -3.5853943824768066, "logits/rejected": 1.622105360031128, "logps/chosen": -367.0932312011719, "logps/rejected": -856.6544189453125, "loss": 0.0742, "rewards/accuracies": 1.0, "rewards/chosen": -9.336380004882812, "rewards/margins": 14.396159172058105, "rewards/rejected": -23.732540130615234, "step": 544 }, { "epoch": 0.3390357698289269, "grad_norm": 22.747459411621094, "learning_rate": 4.928538497003228e-06, "logits/chosen": -1.3813132047653198, "logits/rejected": 3.102327585220337, "logps/chosen": -441.0496826171875, "logps/rejected": -810.7593994140625, "loss": 0.2061, "rewards/accuracies": 0.875, "rewards/chosen": -7.9570817947387695, "rewards/margins": 14.54705810546875, "rewards/rejected": -22.504138946533203, "step": 545 }, { "epoch": 0.33965785381026437, "grad_norm": 0.5766253471374512, "learning_rate": 4.927385892116183e-06, "logits/chosen": -1.2130874395370483, "logits/rejected": 3.091094970703125, "logps/chosen": -418.65997314453125, "logps/rejected": -918.8798828125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -7.981672286987305, "rewards/margins": 25.764217376708984, "rewards/rejected": -33.745887756347656, "step": 546 }, { "epoch": 0.34027993779160187, "grad_norm": 0.008549829944968224, "learning_rate": 4.926233287229138e-06, "logits/chosen": -1.421816110610962, "logits/rejected": 2.4775519371032715, "logps/chosen": -563.928466796875, "logps/rejected": -1086.707275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.792487144470215, "rewards/margins": 27.195476531982422, "rewards/rejected": -37.98796463012695, "step": 547 }, { "epoch": 0.3409020217729393, "grad_norm": 0.0007273833034560084, "learning_rate": 4.925080682342093e-06, "logits/chosen": -1.8788166046142578, "logits/rejected": 3.0953235626220703, "logps/chosen": -372.55548095703125, "logps/rejected": -888.577392578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.427706718444824, "rewards/margins": 20.497804641723633, "rewards/rejected": -27.92551040649414, "step": 548 }, { "epoch": 0.34152410575427683, "grad_norm": 0.002993043977767229, "learning_rate": 4.9239280774550495e-06, "logits/chosen": 1.5593526363372803, "logits/rejected": 4.229238510131836, "logps/chosen": -595.6698608398438, "logps/rejected": -1023.314697265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.270602226257324, "rewards/margins": 23.1187801361084, "rewards/rejected": -35.38938522338867, "step": 549 }, { "epoch": 0.3421461897356143, "grad_norm": 0.0029460815712809563, "learning_rate": 4.922775472568005e-06, "logits/chosen": -2.3387439250946045, "logits/rejected": 1.7299787998199463, "logps/chosen": -413.709228515625, "logps/rejected": -912.1287841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.157748222351074, "rewards/margins": 24.370506286621094, "rewards/rejected": -32.528255462646484, "step": 550 }, { "epoch": 0.3427682737169518, "grad_norm": 0.0008374156313948333, "learning_rate": 4.921622867680959e-06, "logits/chosen": -0.32705816626548767, "logits/rejected": 2.5051586627960205, "logps/chosen": -589.31884765625, "logps/rejected": -961.8628540039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.971508026123047, "rewards/margins": 22.370750427246094, "rewards/rejected": -37.34225845336914, "step": 551 }, { "epoch": 0.3433903576982893, "grad_norm": 33.720123291015625, "learning_rate": 4.920470262793914e-06, "logits/chosen": -2.241405487060547, "logits/rejected": 3.023292064666748, "logps/chosen": -447.4491882324219, "logps/rejected": -1021.66015625, "loss": 0.4903, "rewards/accuracies": 0.875, "rewards/chosen": -8.648796081542969, "rewards/margins": 23.02674674987793, "rewards/rejected": -31.675540924072266, "step": 552 }, { "epoch": 0.34401244167962675, "grad_norm": 39.974979400634766, "learning_rate": 4.9193176579068695e-06, "logits/chosen": 1.1962766647338867, "logits/rejected": 3.599783182144165, "logps/chosen": -549.6170654296875, "logps/rejected": -786.75341796875, "loss": 1.0005, "rewards/accuracies": 0.75, "rewards/chosen": -12.80546760559082, "rewards/margins": 13.706472396850586, "rewards/rejected": -26.511940002441406, "step": 553 }, { "epoch": 0.34463452566096425, "grad_norm": 17.803882598876953, "learning_rate": 4.918165053019825e-06, "logits/chosen": 1.2538249492645264, "logits/rejected": 1.7552330493927002, "logps/chosen": -711.7666625976562, "logps/rejected": -828.937255859375, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": -14.915160179138184, "rewards/margins": 10.584980964660645, "rewards/rejected": -25.500139236450195, "step": 554 }, { "epoch": 0.3452566096423017, "grad_norm": 3.103040933609009, "learning_rate": 4.91701244813278e-06, "logits/chosen": 1.5487773418426514, "logits/rejected": 3.0372424125671387, "logps/chosen": -642.92529296875, "logps/rejected": -902.142333984375, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -13.582810401916504, "rewards/margins": 19.856678009033203, "rewards/rejected": -33.43948745727539, "step": 555 }, { "epoch": 0.3458786936236392, "grad_norm": 0.014518902637064457, "learning_rate": 4.915859843245735e-06, "logits/chosen": -0.48636725544929504, "logits/rejected": 3.007913589477539, "logps/chosen": -254.45782470703125, "logps/rejected": -726.4654541015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.3389573097229, "rewards/margins": 20.198001861572266, "rewards/rejected": -25.53696060180664, "step": 556 }, { "epoch": 0.34650077760497666, "grad_norm": 0.005257305223494768, "learning_rate": 4.9147072383586904e-06, "logits/chosen": 0.3584858775138855, "logits/rejected": 2.2523038387298584, "logps/chosen": -760.4193115234375, "logps/rejected": -1081.203369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.717350006103516, "rewards/margins": 21.49951934814453, "rewards/rejected": -39.21686935424805, "step": 557 }, { "epoch": 0.34712286158631417, "grad_norm": 1.6701537370681763, "learning_rate": 4.913554633471646e-06, "logits/chosen": -2.2826242446899414, "logits/rejected": 0.3551350235939026, "logps/chosen": -442.69097900390625, "logps/rejected": -697.0272216796875, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": -9.382549285888672, "rewards/margins": 13.87314510345459, "rewards/rejected": -23.255693435668945, "step": 558 }, { "epoch": 0.3477449455676516, "grad_norm": 21.336896896362305, "learning_rate": 4.912402028584602e-06, "logits/chosen": -0.32555997371673584, "logits/rejected": 0.8113161325454712, "logps/chosen": -569.210693359375, "logps/rejected": -764.329345703125, "loss": 0.1424, "rewards/accuracies": 0.875, "rewards/chosen": -10.628195762634277, "rewards/margins": 14.781917572021484, "rewards/rejected": -25.410112380981445, "step": 559 }, { "epoch": 0.3483670295489891, "grad_norm": 0.01587485708296299, "learning_rate": 4.911249423697557e-06, "logits/chosen": -0.7397094964981079, "logits/rejected": 3.212367057800293, "logps/chosen": -520.3375244140625, "logps/rejected": -964.132080078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.017093658447266, "rewards/margins": 24.597700119018555, "rewards/rejected": -34.61479187011719, "step": 560 }, { "epoch": 0.3489891135303266, "grad_norm": 0.06691177934408188, "learning_rate": 4.910096818810512e-06, "logits/chosen": 0.5737409591674805, "logits/rejected": 3.6291799545288086, "logps/chosen": -487.9759521484375, "logps/rejected": -921.3861083984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.400189399719238, "rewards/margins": 21.333316802978516, "rewards/rejected": -30.733505249023438, "step": 561 }, { "epoch": 0.3496111975116641, "grad_norm": 37.446746826171875, "learning_rate": 4.908944213923467e-06, "logits/chosen": 1.1964095830917358, "logits/rejected": 1.6312025785446167, "logps/chosen": -680.3416137695312, "logps/rejected": -908.3619384765625, "loss": 0.4519, "rewards/accuracies": 0.875, "rewards/chosen": -13.171266555786133, "rewards/margins": 16.99323272705078, "rewards/rejected": -30.164499282836914, "step": 562 }, { "epoch": 0.35023328149300154, "grad_norm": 0.03469838947057724, "learning_rate": 4.907791609036423e-06, "logits/chosen": -1.0637686252593994, "logits/rejected": 2.611809492111206, "logps/chosen": -328.0810546875, "logps/rejected": -687.5595092773438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.062766075134277, "rewards/margins": 14.250360488891602, "rewards/rejected": -21.313125610351562, "step": 563 }, { "epoch": 0.35085536547433904, "grad_norm": 3.72678804397583, "learning_rate": 4.906639004149378e-06, "logits/chosen": 2.042522430419922, "logits/rejected": 3.486320734024048, "logps/chosen": -773.8923950195312, "logps/rejected": -1025.49560546875, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -18.240882873535156, "rewards/margins": 17.987770080566406, "rewards/rejected": -36.22865295410156, "step": 564 }, { "epoch": 0.3514774494556765, "grad_norm": 1.5056075426400639e-05, "learning_rate": 4.905486399262333e-06, "logits/chosen": -2.78934383392334, "logits/rejected": 3.307797431945801, "logps/chosen": -389.1837158203125, "logps/rejected": -981.9088745117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.682865142822266, "rewards/margins": 26.758037567138672, "rewards/rejected": -33.44090270996094, "step": 565 }, { "epoch": 0.352099533437014, "grad_norm": 0.0001173518830910325, "learning_rate": 4.904333794375288e-06, "logits/chosen": -3.6703920364379883, "logits/rejected": 2.331336498260498, "logps/chosen": -475.85528564453125, "logps/rejected": -1081.368408203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.64018726348877, "rewards/margins": 30.549610137939453, "rewards/rejected": -39.18980026245117, "step": 566 }, { "epoch": 0.35272161741835145, "grad_norm": 2.92313551902771, "learning_rate": 4.9031811894882435e-06, "logits/chosen": 1.1690174341201782, "logits/rejected": 3.2298545837402344, "logps/chosen": -529.1205444335938, "logps/rejected": -833.876708984375, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -14.099763870239258, "rewards/margins": 16.231340408325195, "rewards/rejected": -30.331104278564453, "step": 567 }, { "epoch": 0.35334370139968896, "grad_norm": 0.03307843208312988, "learning_rate": 4.902028584601199e-06, "logits/chosen": -0.4311780333518982, "logits/rejected": 2.940704345703125, "logps/chosen": -413.35516357421875, "logps/rejected": -888.8939208984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.529521942138672, "rewards/margins": 24.85467529296875, "rewards/rejected": -33.38420104980469, "step": 568 }, { "epoch": 0.35396578538102647, "grad_norm": 26.350021362304688, "learning_rate": 4.900875979714154e-06, "logits/chosen": 0.2001965045928955, "logits/rejected": 2.413053512573242, "logps/chosen": -546.7880859375, "logps/rejected": -866.169677734375, "loss": 0.2851, "rewards/accuracies": 0.875, "rewards/chosen": -14.412593841552734, "rewards/margins": 13.435179710388184, "rewards/rejected": -27.847774505615234, "step": 569 }, { "epoch": 0.3545878693623639, "grad_norm": 37.97916793823242, "learning_rate": 4.899723374827109e-06, "logits/chosen": 1.0915520191192627, "logits/rejected": 3.6510167121887207, "logps/chosen": -701.7742919921875, "logps/rejected": -1025.33984375, "loss": 0.8505, "rewards/accuracies": 0.875, "rewards/chosen": -15.174086570739746, "rewards/margins": 18.551523208618164, "rewards/rejected": -33.725608825683594, "step": 570 }, { "epoch": 0.3552099533437014, "grad_norm": 12.117103576660156, "learning_rate": 4.898570769940064e-06, "logits/chosen": -0.11754333972930908, "logits/rejected": 1.6310240030288696, "logps/chosen": -532.1516723632812, "logps/rejected": -736.618896484375, "loss": 0.1422, "rewards/accuracies": 0.875, "rewards/chosen": -11.331872940063477, "rewards/margins": 12.47451114654541, "rewards/rejected": -23.806385040283203, "step": 571 }, { "epoch": 0.3558320373250389, "grad_norm": 0.04478954151272774, "learning_rate": 4.89741816505302e-06, "logits/chosen": -2.8400495052337646, "logits/rejected": 1.9938653707504272, "logps/chosen": -462.40533447265625, "logps/rejected": -920.4817504882812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.620899200439453, "rewards/margins": 25.308086395263672, "rewards/rejected": -32.928985595703125, "step": 572 }, { "epoch": 0.3564541213063764, "grad_norm": 4.206796169281006, "learning_rate": 4.896265560165976e-06, "logits/chosen": 1.6942188739776611, "logits/rejected": 3.390136241912842, "logps/chosen": -653.3523559570312, "logps/rejected": -950.9197387695312, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": -10.680431365966797, "rewards/margins": 14.628912925720215, "rewards/rejected": -25.309345245361328, "step": 573 }, { "epoch": 0.35707620528771383, "grad_norm": 37.8600959777832, "learning_rate": 4.895112955278931e-06, "logits/chosen": 0.7583746910095215, "logits/rejected": 1.5877070426940918, "logps/chosen": -595.6297607421875, "logps/rejected": -756.054931640625, "loss": 1.2698, "rewards/accuracies": 0.875, "rewards/chosen": -11.754125595092773, "rewards/margins": 11.235215187072754, "rewards/rejected": -22.989341735839844, "step": 574 }, { "epoch": 0.35769828926905134, "grad_norm": 0.011185353621840477, "learning_rate": 4.893960350391886e-06, "logits/chosen": -2.070542097091675, "logits/rejected": 3.7503252029418945, "logps/chosen": -415.4565124511719, "logps/rejected": -957.781494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.346330165863037, "rewards/margins": 24.029987335205078, "rewards/rejected": -31.376317977905273, "step": 575 }, { "epoch": 0.3583203732503888, "grad_norm": 0.3578238785266876, "learning_rate": 4.892807745504841e-06, "logits/chosen": -1.425378680229187, "logits/rejected": 2.926992177963257, "logps/chosen": -356.78533935546875, "logps/rejected": -807.888671875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -5.3499860763549805, "rewards/margins": 21.162736892700195, "rewards/rejected": -26.51272201538086, "step": 576 }, { "epoch": 0.3589424572317263, "grad_norm": 0.49534156918525696, "learning_rate": 4.891655140617797e-06, "logits/chosen": -1.4474272727966309, "logits/rejected": 2.1497583389282227, "logps/chosen": -323.8979187011719, "logps/rejected": -678.140625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -5.529425144195557, "rewards/margins": 16.806941986083984, "rewards/rejected": -22.336368560791016, "step": 577 }, { "epoch": 0.35956454121306375, "grad_norm": 0.014279196970164776, "learning_rate": 4.890502535730752e-06, "logits/chosen": 1.0501610040664673, "logits/rejected": 3.710789203643799, "logps/chosen": -628.9027099609375, "logps/rejected": -967.5325927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.289298057556152, "rewards/margins": 17.977649688720703, "rewards/rejected": -26.266950607299805, "step": 578 }, { "epoch": 0.36018662519440126, "grad_norm": 67.1640625, "learning_rate": 4.889349930843707e-06, "logits/chosen": -0.08438020944595337, "logits/rejected": 0.6998257637023926, "logps/chosen": -620.3289794921875, "logps/rejected": -855.9243774414062, "loss": 0.8311, "rewards/accuracies": 0.75, "rewards/chosen": -12.778589248657227, "rewards/margins": 15.465974807739258, "rewards/rejected": -28.244564056396484, "step": 579 }, { "epoch": 0.3608087091757387, "grad_norm": 35.90826416015625, "learning_rate": 4.888197325956662e-06, "logits/chosen": -3.391622543334961, "logits/rejected": -0.005591452121734619, "logps/chosen": -414.13525390625, "logps/rejected": -752.8704223632812, "loss": 0.2604, "rewards/accuracies": 0.875, "rewards/chosen": -6.338963031768799, "rewards/margins": 19.059038162231445, "rewards/rejected": -25.39800262451172, "step": 580 }, { "epoch": 0.3614307931570762, "grad_norm": 0.0016980888321995735, "learning_rate": 4.8870447210696175e-06, "logits/chosen": 1.6687289476394653, "logits/rejected": 2.0782861709594727, "logps/chosen": -697.9501342773438, "logps/rejected": -933.7528686523438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.147013664245605, "rewards/margins": 22.531450271606445, "rewards/rejected": -30.678462982177734, "step": 581 }, { "epoch": 0.36205287713841366, "grad_norm": 30.825054168701172, "learning_rate": 4.885892116182573e-06, "logits/chosen": -0.3926219344139099, "logits/rejected": 1.7349659204483032, "logps/chosen": -594.2177734375, "logps/rejected": -873.021240234375, "loss": 0.2269, "rewards/accuracies": 0.875, "rewards/chosen": -10.434806823730469, "rewards/margins": 14.307239532470703, "rewards/rejected": -24.742048263549805, "step": 582 }, { "epoch": 0.36267496111975117, "grad_norm": 13.518712043762207, "learning_rate": 4.884739511295528e-06, "logits/chosen": -0.2923990786075592, "logits/rejected": 0.8742029070854187, "logps/chosen": -729.5030517578125, "logps/rejected": -917.1492309570312, "loss": 0.0831, "rewards/accuracies": 1.0, "rewards/chosen": -13.968738555908203, "rewards/margins": 17.587398529052734, "rewards/rejected": -31.556137084960938, "step": 583 }, { "epoch": 0.3632970451010886, "grad_norm": 3.0330076217651367, "learning_rate": 4.883586906408483e-06, "logits/chosen": 0.3311905860900879, "logits/rejected": 3.680189847946167, "logps/chosen": -561.0177001953125, "logps/rejected": -898.9573364257812, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": -10.7169189453125, "rewards/margins": 16.990234375, "rewards/rejected": -27.707151412963867, "step": 584 }, { "epoch": 0.36391912908242613, "grad_norm": 38.674537658691406, "learning_rate": 4.882434301521438e-06, "logits/chosen": 1.2607179880142212, "logits/rejected": 4.454874038696289, "logps/chosen": -765.2697143554688, "logps/rejected": -1076.0303955078125, "loss": 0.5958, "rewards/accuracies": 0.75, "rewards/chosen": -15.136398315429688, "rewards/margins": 15.660676956176758, "rewards/rejected": -30.797075271606445, "step": 585 }, { "epoch": 0.3645412130637636, "grad_norm": 0.13426032662391663, "learning_rate": 4.881281696634394e-06, "logits/chosen": 1.5848565101623535, "logits/rejected": 3.3656342029571533, "logps/chosen": -545.01220703125, "logps/rejected": -805.3402099609375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.408105850219727, "rewards/margins": 17.92910385131836, "rewards/rejected": -25.337207794189453, "step": 586 }, { "epoch": 0.3651632970451011, "grad_norm": 0.10479693114757538, "learning_rate": 4.880129091747349e-06, "logits/chosen": -1.8742375373840332, "logits/rejected": 1.2531564235687256, "logps/chosen": -501.27825927734375, "logps/rejected": -815.1589965820312, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -10.048238754272461, "rewards/margins": 16.175546646118164, "rewards/rejected": -26.223785400390625, "step": 587 }, { "epoch": 0.3657853810264386, "grad_norm": 33.76664352416992, "learning_rate": 4.878976486860305e-06, "logits/chosen": 0.6626871228218079, "logits/rejected": 3.236910820007324, "logps/chosen": -571.5628662109375, "logps/rejected": -790.7460327148438, "loss": 0.4093, "rewards/accuracies": 0.875, "rewards/chosen": -8.41337776184082, "rewards/margins": 8.020959854125977, "rewards/rejected": -16.434335708618164, "step": 588 }, { "epoch": 0.36640746500777605, "grad_norm": 4.419313430786133, "learning_rate": 4.87782388197326e-06, "logits/chosen": -1.5819462537765503, "logits/rejected": 0.5037208199501038, "logps/chosen": -438.037353515625, "logps/rejected": -680.818603515625, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": -5.652944087982178, "rewards/margins": 13.655805587768555, "rewards/rejected": -19.30875015258789, "step": 589 }, { "epoch": 0.36702954898911355, "grad_norm": 0.024569852277636528, "learning_rate": 4.876671277086215e-06, "logits/chosen": 0.6127163767814636, "logits/rejected": 3.8395800590515137, "logps/chosen": -658.8626098632812, "logps/rejected": -1016.54541015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.372544765472412, "rewards/margins": 16.042373657226562, "rewards/rejected": -21.414918899536133, "step": 590 }, { "epoch": 0.367651632970451, "grad_norm": 15.055514335632324, "learning_rate": 4.875518672199171e-06, "logits/chosen": 0.09293143451213837, "logits/rejected": 3.7359554767608643, "logps/chosen": -455.48675537109375, "logps/rejected": -904.8528442382812, "loss": 0.0721, "rewards/accuracies": 1.0, "rewards/chosen": -9.17232608795166, "rewards/margins": 21.1533203125, "rewards/rejected": -30.325645446777344, "step": 591 }, { "epoch": 0.3682737169517885, "grad_norm": 29.496828079223633, "learning_rate": 4.874366067312126e-06, "logits/chosen": -2.956026554107666, "logits/rejected": 2.513519048690796, "logps/chosen": -357.78668212890625, "logps/rejected": -780.1038818359375, "loss": 0.8831, "rewards/accuracies": 0.875, "rewards/chosen": -7.444273471832275, "rewards/margins": 13.725360870361328, "rewards/rejected": -21.169635772705078, "step": 592 }, { "epoch": 0.36889580093312596, "grad_norm": 11.6830415725708, "learning_rate": 4.873213462425081e-06, "logits/chosen": 1.3610471487045288, "logits/rejected": 2.2409071922302246, "logps/chosen": -676.43603515625, "logps/rejected": -838.9456176757812, "loss": 0.0514, "rewards/accuracies": 1.0, "rewards/chosen": -13.695094108581543, "rewards/margins": 13.126800537109375, "rewards/rejected": -26.821895599365234, "step": 593 }, { "epoch": 0.36951788491446347, "grad_norm": 2.102048397064209, "learning_rate": 4.872060857538036e-06, "logits/chosen": 1.9273995161056519, "logits/rejected": 4.002025604248047, "logps/chosen": -528.9932861328125, "logps/rejected": -871.9857788085938, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -6.188224792480469, "rewards/margins": 17.76958656311035, "rewards/rejected": -23.957813262939453, "step": 594 }, { "epoch": 0.3701399688958009, "grad_norm": 8.947134017944336, "learning_rate": 4.8709082526509915e-06, "logits/chosen": 1.2496016025543213, "logits/rejected": 1.224539041519165, "logps/chosen": -612.486083984375, "logps/rejected": -726.193359375, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": -7.15511417388916, "rewards/margins": 12.905986785888672, "rewards/rejected": -20.061100006103516, "step": 595 }, { "epoch": 0.3707620528771384, "grad_norm": 0.10295522212982178, "learning_rate": 4.869755647763947e-06, "logits/chosen": -0.8488799333572388, "logits/rejected": 4.074465751647949, "logps/chosen": -527.9417724609375, "logps/rejected": -927.850830078125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.303407192230225, "rewards/margins": 18.423900604248047, "rewards/rejected": -24.727310180664062, "step": 596 }, { "epoch": 0.3713841368584759, "grad_norm": 40.78211212158203, "learning_rate": 4.868603042876902e-06, "logits/chosen": 1.4812382459640503, "logits/rejected": 3.599062204360962, "logps/chosen": -663.3964233398438, "logps/rejected": -983.2401123046875, "loss": 0.8761, "rewards/accuracies": 0.875, "rewards/chosen": -11.089118957519531, "rewards/margins": 19.937496185302734, "rewards/rejected": -31.0266170501709, "step": 597 }, { "epoch": 0.3720062208398134, "grad_norm": 32.93647384643555, "learning_rate": 4.867450437989857e-06, "logits/chosen": -0.36488810181617737, "logits/rejected": 1.5354863405227661, "logps/chosen": -460.73455810546875, "logps/rejected": -689.384521484375, "loss": 0.4965, "rewards/accuracies": 0.875, "rewards/chosen": -6.197094917297363, "rewards/margins": 14.525459289550781, "rewards/rejected": -20.72255516052246, "step": 598 }, { "epoch": 0.37262830482115084, "grad_norm": 0.004770494066178799, "learning_rate": 4.866297833102812e-06, "logits/chosen": -1.5025954246520996, "logits/rejected": 3.4834418296813965, "logps/chosen": -318.74017333984375, "logps/rejected": -800.4383544921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.3950350284576416, "rewards/margins": 19.421722412109375, "rewards/rejected": -22.816753387451172, "step": 599 }, { "epoch": 0.37325038880248834, "grad_norm": 19.019315719604492, "learning_rate": 4.865145228215768e-06, "logits/chosen": 2.659332275390625, "logits/rejected": 3.8738040924072266, "logps/chosen": -588.7614135742188, "logps/rejected": -936.4422607421875, "loss": 0.1276, "rewards/accuracies": 0.875, "rewards/chosen": -7.607353210449219, "rewards/margins": 20.830799102783203, "rewards/rejected": -28.438154220581055, "step": 600 }, { "epoch": 0.3738724727838258, "grad_norm": 9.484130859375, "learning_rate": 4.863992623328723e-06, "logits/chosen": 2.2311387062072754, "logits/rejected": 4.177638530731201, "logps/chosen": -634.6103515625, "logps/rejected": -801.521484375, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": -6.441396236419678, "rewards/margins": 12.886884689331055, "rewards/rejected": -19.328279495239258, "step": 601 }, { "epoch": 0.3744945567651633, "grad_norm": 0.7348048090934753, "learning_rate": 4.862840018441679e-06, "logits/chosen": 0.9550198316574097, "logits/rejected": 3.199352741241455, "logps/chosen": -641.8240356445312, "logps/rejected": -881.9445190429688, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -7.406742095947266, "rewards/margins": 14.080041885375977, "rewards/rejected": -21.48678207397461, "step": 602 }, { "epoch": 0.37511664074650075, "grad_norm": 0.01319398358464241, "learning_rate": 4.861687413554634e-06, "logits/chosen": -2.5636777877807617, "logits/rejected": 1.7623281478881836, "logps/chosen": -420.635986328125, "logps/rejected": -824.3484497070312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.411018371582031, "rewards/margins": 15.403884887695312, "rewards/rejected": -19.814903259277344, "step": 603 }, { "epoch": 0.37573872472783826, "grad_norm": 30.160045623779297, "learning_rate": 4.860534808667589e-06, "logits/chosen": 0.446747750043869, "logits/rejected": 3.6913681030273438, "logps/chosen": -425.6489562988281, "logps/rejected": -764.9004516601562, "loss": 0.5233, "rewards/accuracies": 0.75, "rewards/chosen": -4.018855571746826, "rewards/margins": 14.458169937133789, "rewards/rejected": -18.47702407836914, "step": 604 }, { "epoch": 0.37636080870917576, "grad_norm": 37.270713806152344, "learning_rate": 4.859382203780545e-06, "logits/chosen": 2.456481695175171, "logits/rejected": 1.7729933261871338, "logps/chosen": -640.4718017578125, "logps/rejected": -661.660400390625, "loss": 0.6326, "rewards/accuracies": 0.75, "rewards/chosen": -6.538122177124023, "rewards/margins": 7.628580093383789, "rewards/rejected": -14.166702270507812, "step": 605 }, { "epoch": 0.3769828926905132, "grad_norm": 0.01940000243484974, "learning_rate": 4.8582295988935e-06, "logits/chosen": 0.7876328229904175, "logits/rejected": 1.8315197229385376, "logps/chosen": -532.6771240234375, "logps/rejected": -785.2579956054688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.908534049987793, "rewards/margins": 16.05277442932129, "rewards/rejected": -20.9613094329834, "step": 606 }, { "epoch": 0.3776049766718507, "grad_norm": 7.5594706535339355, "learning_rate": 4.857076994006455e-06, "logits/chosen": -0.28083693981170654, "logits/rejected": 2.34736967086792, "logps/chosen": -486.8939208984375, "logps/rejected": -747.44287109375, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": -6.2080583572387695, "rewards/margins": 13.350509643554688, "rewards/rejected": -19.558568954467773, "step": 607 }, { "epoch": 0.3782270606531882, "grad_norm": 18.768293380737305, "learning_rate": 4.85592438911941e-06, "logits/chosen": 0.1136578619480133, "logits/rejected": 2.9444642066955566, "logps/chosen": -499.3504333496094, "logps/rejected": -764.1222534179688, "loss": 0.1255, "rewards/accuracies": 0.875, "rewards/chosen": -3.96809720993042, "rewards/margins": 12.719585418701172, "rewards/rejected": -16.68768310546875, "step": 608 }, { "epoch": 0.3788491446345257, "grad_norm": 0.0016397468280047178, "learning_rate": 4.8547717842323655e-06, "logits/chosen": -2.1330926418304443, "logits/rejected": 2.6644673347473145, "logps/chosen": -389.0440368652344, "logps/rejected": -849.74365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.208529949188232, "rewards/margins": 18.280054092407227, "rewards/rejected": -22.488584518432617, "step": 609 }, { "epoch": 0.37947122861586313, "grad_norm": 0.18090400099754333, "learning_rate": 4.853619179345321e-06, "logits/chosen": 0.31500858068466187, "logits/rejected": 2.8184757232666016, "logps/chosen": -556.4556884765625, "logps/rejected": -885.5828857421875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.6242709159851074, "rewards/margins": 16.77484703063965, "rewards/rejected": -20.399118423461914, "step": 610 }, { "epoch": 0.38009331259720064, "grad_norm": 0.5810987949371338, "learning_rate": 4.852466574458276e-06, "logits/chosen": -1.924774169921875, "logits/rejected": 1.5502896308898926, "logps/chosen": -318.47222900390625, "logps/rejected": -653.2730102539062, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -3.3044466972351074, "rewards/margins": 14.565733909606934, "rewards/rejected": -17.870182037353516, "step": 611 }, { "epoch": 0.3807153965785381, "grad_norm": 1.5486432313919067, "learning_rate": 4.851313969571231e-06, "logits/chosen": -0.7176028490066528, "logits/rejected": 1.7789404392242432, "logps/chosen": -466.0256652832031, "logps/rejected": -780.8444213867188, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -6.195468902587891, "rewards/margins": 16.52389907836914, "rewards/rejected": -22.71936798095703, "step": 612 }, { "epoch": 0.3813374805598756, "grad_norm": 0.22635774314403534, "learning_rate": 4.850161364684186e-06, "logits/chosen": -0.2533850073814392, "logits/rejected": 1.6178226470947266, "logps/chosen": -524.1812744140625, "logps/rejected": -790.1473388671875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.199096918106079, "rewards/margins": 18.512271881103516, "rewards/rejected": -21.71137046813965, "step": 613 }, { "epoch": 0.38195956454121305, "grad_norm": 0.00020886211132165045, "learning_rate": 4.849008759797142e-06, "logits/chosen": -2.068157911300659, "logits/rejected": 2.3664326667785645, "logps/chosen": -345.16943359375, "logps/rejected": -826.10693359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.358859539031982, "rewards/margins": 19.83062171936035, "rewards/rejected": -24.189481735229492, "step": 614 }, { "epoch": 0.38258164852255055, "grad_norm": 14.616460800170898, "learning_rate": 4.847856154910097e-06, "logits/chosen": 2.5612854957580566, "logits/rejected": 3.3244102001190186, "logps/chosen": -575.50048828125, "logps/rejected": -779.091796875, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": -8.839805603027344, "rewards/margins": 13.69375991821289, "rewards/rejected": -22.533565521240234, "step": 615 }, { "epoch": 0.383203732503888, "grad_norm": 0.1210361123085022, "learning_rate": 4.846703550023052e-06, "logits/chosen": 1.096333384513855, "logits/rejected": 3.7907910346984863, "logps/chosen": -487.0243225097656, "logps/rejected": -856.36181640625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.8041367530822754, "rewards/margins": 18.125682830810547, "rewards/rejected": -21.929819107055664, "step": 616 }, { "epoch": 0.3838258164852255, "grad_norm": 0.007629493251442909, "learning_rate": 4.845550945136008e-06, "logits/chosen": 2.442495346069336, "logits/rejected": 4.257726669311523, "logps/chosen": -573.216796875, "logps/rejected": -859.510009765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.294878959655762, "rewards/margins": 20.90713119506836, "rewards/rejected": -26.202011108398438, "step": 617 }, { "epoch": 0.38444790046656296, "grad_norm": 8.720742225646973, "learning_rate": 4.844398340248963e-06, "logits/chosen": 2.267151355743408, "logits/rejected": 4.637188911437988, "logps/chosen": -508.08251953125, "logps/rejected": -838.2218017578125, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": -3.086606025695801, "rewards/margins": 18.29007339477539, "rewards/rejected": -21.376678466796875, "step": 618 }, { "epoch": 0.38506998444790047, "grad_norm": 0.0023380150087177753, "learning_rate": 4.843245735361919e-06, "logits/chosen": 0.5009723901748657, "logits/rejected": 3.4536290168762207, "logps/chosen": -502.7635498046875, "logps/rejected": -818.5247192382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.08828657865524292, "rewards/margins": 18.531295776367188, "rewards/rejected": -18.443008422851562, "step": 619 }, { "epoch": 0.3856920684292379, "grad_norm": 50.71761703491211, "learning_rate": 4.842093130474874e-06, "logits/chosen": -0.8099173307418823, "logits/rejected": 1.7252922058105469, "logps/chosen": -438.8321533203125, "logps/rejected": -750.5208740234375, "loss": 0.8195, "rewards/accuracies": 0.875, "rewards/chosen": -4.430469036102295, "rewards/margins": 15.224184036254883, "rewards/rejected": -19.654653549194336, "step": 620 }, { "epoch": 0.38631415241057543, "grad_norm": 0.14640004932880402, "learning_rate": 4.840940525587829e-06, "logits/chosen": -1.0835647583007812, "logits/rejected": 3.823948383331299, "logps/chosen": -309.2799377441406, "logps/rejected": -755.61669921875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.2851157188415527, "rewards/margins": 14.198723793029785, "rewards/rejected": -17.48383903503418, "step": 621 }, { "epoch": 0.38693623639191294, "grad_norm": 24.4279842376709, "learning_rate": 4.839787920700784e-06, "logits/chosen": 2.0698561668395996, "logits/rejected": 3.8490822315216064, "logps/chosen": -626.984375, "logps/rejected": -923.0991821289062, "loss": 0.2581, "rewards/accuracies": 0.875, "rewards/chosen": -6.559420585632324, "rewards/margins": 15.944664001464844, "rewards/rejected": -22.50408363342285, "step": 622 }, { "epoch": 0.3875583203732504, "grad_norm": 1.6033275127410889, "learning_rate": 4.8386353158137395e-06, "logits/chosen": 1.5655349493026733, "logits/rejected": 4.216394424438477, "logps/chosen": -497.54638671875, "logps/rejected": -865.5770874023438, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -1.9526607990264893, "rewards/margins": 17.63835906982422, "rewards/rejected": -19.591020584106445, "step": 623 }, { "epoch": 0.3881804043545879, "grad_norm": 3.7565343379974365, "learning_rate": 4.837482710926695e-06, "logits/chosen": -3.539860963821411, "logits/rejected": 3.1450586318969727, "logps/chosen": -289.2846984863281, "logps/rejected": -818.124755859375, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -2.083850622177124, "rewards/margins": 15.55625057220459, "rewards/rejected": -17.64010238647461, "step": 624 }, { "epoch": 0.38880248833592534, "grad_norm": 9.447480201721191, "learning_rate": 4.83633010603965e-06, "logits/chosen": -1.6923489570617676, "logits/rejected": 2.869992256164551, "logps/chosen": -456.4915771484375, "logps/rejected": -925.1444702148438, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": -5.213108539581299, "rewards/margins": 16.34868812561035, "rewards/rejected": -21.561796188354492, "step": 625 }, { "epoch": 0.38942457231726285, "grad_norm": 1.3966708183288574, "learning_rate": 4.835177501152605e-06, "logits/chosen": 1.9913567304611206, "logits/rejected": 3.4345850944519043, "logps/chosen": -446.4867858886719, "logps/rejected": -746.5498657226562, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -5.273656845092773, "rewards/margins": 16.530517578125, "rewards/rejected": -21.804176330566406, "step": 626 }, { "epoch": 0.3900466562986003, "grad_norm": 0.013844764791429043, "learning_rate": 4.83402489626556e-06, "logits/chosen": 2.8672995567321777, "logits/rejected": 3.437636375427246, "logps/chosen": -738.3363647460938, "logps/rejected": -923.5630493164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.327809810638428, "rewards/margins": 15.714049339294434, "rewards/rejected": -21.041858673095703, "step": 627 }, { "epoch": 0.3906687402799378, "grad_norm": 0.23032204806804657, "learning_rate": 4.832872291378516e-06, "logits/chosen": 0.48103535175323486, "logits/rejected": 3.2791948318481445, "logps/chosen": -511.300048828125, "logps/rejected": -837.8681030273438, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -9.440017700195312, "rewards/margins": 12.671974182128906, "rewards/rejected": -22.111989974975586, "step": 628 }, { "epoch": 0.39129082426127526, "grad_norm": 1.4107637405395508, "learning_rate": 4.831719686491471e-06, "logits/chosen": -2.6341326236724854, "logits/rejected": 3.5024142265319824, "logps/chosen": -183.09799194335938, "logps/rejected": -754.7428588867188, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -0.7689191102981567, "rewards/margins": 23.79766273498535, "rewards/rejected": -24.56658172607422, "step": 629 }, { "epoch": 0.39191290824261277, "grad_norm": 0.15013962984085083, "learning_rate": 4.830567081604426e-06, "logits/chosen": -0.7445148229598999, "logits/rejected": 3.8409135341644287, "logps/chosen": -460.27984619140625, "logps/rejected": -913.905029296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.9821481704711914, "rewards/margins": 21.528270721435547, "rewards/rejected": -24.510419845581055, "step": 630 }, { "epoch": 0.3925349922239502, "grad_norm": 5.702386379241943, "learning_rate": 4.829414476717382e-06, "logits/chosen": 0.4213750958442688, "logits/rejected": 2.7475595474243164, "logps/chosen": -468.94549560546875, "logps/rejected": -773.5858154296875, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": -6.939387798309326, "rewards/margins": 16.688373565673828, "rewards/rejected": -23.627761840820312, "step": 631 }, { "epoch": 0.3931570762052877, "grad_norm": 28.688859939575195, "learning_rate": 4.828261871830337e-06, "logits/chosen": -0.6199629306793213, "logits/rejected": 3.405015468597412, "logps/chosen": -511.64501953125, "logps/rejected": -868.1781005859375, "loss": 0.325, "rewards/accuracies": 0.875, "rewards/chosen": -5.980612277984619, "rewards/margins": 13.439093589782715, "rewards/rejected": -19.419706344604492, "step": 632 }, { "epoch": 0.3937791601866252, "grad_norm": 0.5256664752960205, "learning_rate": 4.827109266943293e-06, "logits/chosen": 1.968687891960144, "logits/rejected": 1.5026566982269287, "logps/chosen": -554.6557006835938, "logps/rejected": -652.0545654296875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -4.485213279724121, "rewards/margins": 10.570073127746582, "rewards/rejected": -15.055286407470703, "step": 633 }, { "epoch": 0.3944012441679627, "grad_norm": 0.0065084053203463554, "learning_rate": 4.825956662056248e-06, "logits/chosen": 1.2704832553863525, "logits/rejected": 2.851560115814209, "logps/chosen": -437.1866149902344, "logps/rejected": -672.5458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.370952606201172, "rewards/margins": 16.11528778076172, "rewards/rejected": -21.48624038696289, "step": 634 }, { "epoch": 0.39502332814930013, "grad_norm": 2.8544149245135486e-05, "learning_rate": 4.824804057169203e-06, "logits/chosen": 0.17647302150726318, "logits/rejected": 3.5326387882232666, "logps/chosen": -426.699462890625, "logps/rejected": -840.47021484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.158924579620361, "rewards/margins": 19.477825164794922, "rewards/rejected": -23.636749267578125, "step": 635 }, { "epoch": 0.39564541213063764, "grad_norm": 0.12314128130674362, "learning_rate": 4.823651452282158e-06, "logits/chosen": 0.10702091455459595, "logits/rejected": 3.7575435638427734, "logps/chosen": -440.03643798828125, "logps/rejected": -920.6306762695312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.680037498474121, "rewards/margins": 22.33530616760254, "rewards/rejected": -29.015344619750977, "step": 636 }, { "epoch": 0.3962674961119751, "grad_norm": 26.167678833007812, "learning_rate": 4.8224988473951135e-06, "logits/chosen": 1.3711156845092773, "logits/rejected": 3.597280502319336, "logps/chosen": -536.123291015625, "logps/rejected": -809.23291015625, "loss": 0.4267, "rewards/accuracies": 0.75, "rewards/chosen": -8.699240684509277, "rewards/margins": 11.513164520263672, "rewards/rejected": -20.212406158447266, "step": 637 }, { "epoch": 0.3968895800933126, "grad_norm": 3.827866554260254, "learning_rate": 4.821346242508069e-06, "logits/chosen": 1.7881110906600952, "logits/rejected": 3.7016730308532715, "logps/chosen": -512.7743530273438, "logps/rejected": -737.0347900390625, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -7.946093559265137, "rewards/margins": 16.378984451293945, "rewards/rejected": -24.325077056884766, "step": 638 }, { "epoch": 0.39751166407465005, "grad_norm": 0.02579200640320778, "learning_rate": 4.820193637621024e-06, "logits/chosen": -1.0454192161560059, "logits/rejected": 2.754476547241211, "logps/chosen": -391.75030517578125, "logps/rejected": -796.76953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.3952555656433105, "rewards/margins": 19.92845916748047, "rewards/rejected": -25.323715209960938, "step": 639 }, { "epoch": 0.39813374805598756, "grad_norm": 13.338902473449707, "learning_rate": 4.819041032733979e-06, "logits/chosen": 0.01971861720085144, "logits/rejected": 4.193253993988037, "logps/chosen": -554.3214721679688, "logps/rejected": -962.2105712890625, "loss": 0.0639, "rewards/accuracies": 1.0, "rewards/chosen": -6.8727617263793945, "rewards/margins": 22.870092391967773, "rewards/rejected": -29.742855072021484, "step": 640 }, { "epoch": 0.39875583203732506, "grad_norm": 5.3078994824318215e-05, "learning_rate": 4.817888427846934e-06, "logits/chosen": 2.95037841796875, "logits/rejected": 4.468027114868164, "logps/chosen": -615.4202880859375, "logps/rejected": -942.3226318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.942698955535889, "rewards/margins": 22.296367645263672, "rewards/rejected": -30.239065170288086, "step": 641 }, { "epoch": 0.3993779160186625, "grad_norm": 1.463631510734558, "learning_rate": 4.81673582295989e-06, "logits/chosen": -1.417764663696289, "logits/rejected": 3.904311180114746, "logps/chosen": -437.336669921875, "logps/rejected": -847.3101196289062, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -7.907837390899658, "rewards/margins": 16.962812423706055, "rewards/rejected": -24.870649337768555, "step": 642 }, { "epoch": 0.4, "grad_norm": 4.746206283569336, "learning_rate": 4.815583218072845e-06, "logits/chosen": -0.7753801345825195, "logits/rejected": 3.87953519821167, "logps/chosen": -416.31231689453125, "logps/rejected": -864.30126953125, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": -4.581168174743652, "rewards/margins": 22.60930061340332, "rewards/rejected": -27.19046974182129, "step": 643 }, { "epoch": 0.4006220839813375, "grad_norm": 4.815265128854662e-05, "learning_rate": 4.8144306131858e-06, "logits/chosen": -0.5812610387802124, "logits/rejected": 3.251255989074707, "logps/chosen": -372.25054931640625, "logps/rejected": -811.37939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.438142776489258, "rewards/margins": 21.47218132019043, "rewards/rejected": -24.910324096679688, "step": 644 }, { "epoch": 0.401244167962675, "grad_norm": 0.024434104561805725, "learning_rate": 4.813278008298755e-06, "logits/chosen": 0.36400488018989563, "logits/rejected": 3.0627307891845703, "logps/chosen": -459.53582763671875, "logps/rejected": -897.6513671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.094304084777832, "rewards/margins": 28.00246810913086, "rewards/rejected": -31.096773147583008, "step": 645 }, { "epoch": 0.40186625194401243, "grad_norm": 0.13461847603321075, "learning_rate": 4.812125403411711e-06, "logits/chosen": -2.2824478149414062, "logits/rejected": 3.998173713684082, "logps/chosen": -335.736083984375, "logps/rejected": -883.3201293945312, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.016701698303223, "rewards/margins": 17.895519256591797, "rewards/rejected": -23.912221908569336, "step": 646 }, { "epoch": 0.40248833592534994, "grad_norm": 23.656198501586914, "learning_rate": 4.810972798524667e-06, "logits/chosen": 0.18910843133926392, "logits/rejected": 2.8961360454559326, "logps/chosen": -523.1488647460938, "logps/rejected": -843.4249267578125, "loss": 0.2861, "rewards/accuracies": 0.875, "rewards/chosen": -6.169979095458984, "rewards/margins": 11.714703559875488, "rewards/rejected": -17.884681701660156, "step": 647 }, { "epoch": 0.4031104199066874, "grad_norm": 1.5852130651474, "learning_rate": 4.809820193637622e-06, "logits/chosen": 0.43711161613464355, "logits/rejected": 2.8995206356048584, "logps/chosen": -539.7998046875, "logps/rejected": -927.2687377929688, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -9.630446434020996, "rewards/margins": 20.007854461669922, "rewards/rejected": -29.6382999420166, "step": 648 }, { "epoch": 0.4037325038880249, "grad_norm": 0.008699237369000912, "learning_rate": 4.808667588750577e-06, "logits/chosen": 0.30366188287734985, "logits/rejected": 3.941417694091797, "logps/chosen": -508.7660217285156, "logps/rejected": -924.7904663085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.7266173362731934, "rewards/margins": 23.782203674316406, "rewards/rejected": -27.508821487426758, "step": 649 }, { "epoch": 0.40435458786936235, "grad_norm": 19.735321044921875, "learning_rate": 4.807514983863532e-06, "logits/chosen": 1.1154712438583374, "logits/rejected": 2.21747088432312, "logps/chosen": -499.0096740722656, "logps/rejected": -780.8884887695312, "loss": 0.7222, "rewards/accuracies": 0.875, "rewards/chosen": -8.748971939086914, "rewards/margins": 19.208717346191406, "rewards/rejected": -27.95768928527832, "step": 650 }, { "epoch": 0.40497667185069985, "grad_norm": 4.993641376495361, "learning_rate": 4.8063623789764875e-06, "logits/chosen": -1.0320407152175903, "logits/rejected": 2.897843837738037, "logps/chosen": -491.4745178222656, "logps/rejected": -866.7677001953125, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -9.669189453125, "rewards/margins": 17.265472412109375, "rewards/rejected": -26.934659957885742, "step": 651 }, { "epoch": 0.4055987558320373, "grad_norm": 33.00937271118164, "learning_rate": 4.805209774089443e-06, "logits/chosen": 1.5638294219970703, "logits/rejected": 4.383650779724121, "logps/chosen": -449.9453125, "logps/rejected": -779.3433837890625, "loss": 0.4431, "rewards/accuracies": 0.875, "rewards/chosen": -5.803736686706543, "rewards/margins": 20.59209632873535, "rewards/rejected": -26.395832061767578, "step": 652 }, { "epoch": 0.4062208398133748, "grad_norm": 19.781658172607422, "learning_rate": 4.804057169202398e-06, "logits/chosen": -0.6717186570167542, "logits/rejected": 3.4153504371643066, "logps/chosen": -452.99822998046875, "logps/rejected": -792.9339599609375, "loss": 0.1421, "rewards/accuracies": 0.875, "rewards/chosen": -8.400663375854492, "rewards/margins": 15.584333419799805, "rewards/rejected": -23.984996795654297, "step": 653 }, { "epoch": 0.40684292379471226, "grad_norm": 28.147157669067383, "learning_rate": 4.802904564315353e-06, "logits/chosen": -3.40217661857605, "logits/rejected": 3.5524790287017822, "logps/chosen": -438.203857421875, "logps/rejected": -1074.790283203125, "loss": 0.9619, "rewards/accuracies": 0.875, "rewards/chosen": -4.154910087585449, "rewards/margins": 29.582317352294922, "rewards/rejected": -33.73722839355469, "step": 654 }, { "epoch": 0.40746500777604977, "grad_norm": 0.09156882762908936, "learning_rate": 4.801751959428308e-06, "logits/chosen": 3.1281723976135254, "logits/rejected": 3.121691942214966, "logps/chosen": -741.47216796875, "logps/rejected": -1028.3216552734375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.673593521118164, "rewards/margins": 22.124595642089844, "rewards/rejected": -30.798187255859375, "step": 655 }, { "epoch": 0.4080870917573872, "grad_norm": 0.6502810716629028, "learning_rate": 4.800599354541264e-06, "logits/chosen": -1.4345520734786987, "logits/rejected": 1.081398367881775, "logps/chosen": -333.1896667480469, "logps/rejected": -653.4251708984375, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -5.2726359367370605, "rewards/margins": 16.819339752197266, "rewards/rejected": -22.091976165771484, "step": 656 }, { "epoch": 0.40870917573872473, "grad_norm": 0.025361565873026848, "learning_rate": 4.799446749654219e-06, "logits/chosen": 1.792804479598999, "logits/rejected": 3.220573663711548, "logps/chosen": -632.1589965820312, "logps/rejected": -931.2838134765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.018635749816895, "rewards/margins": 15.393332481384277, "rewards/rejected": -27.411968231201172, "step": 657 }, { "epoch": 0.40933125972006223, "grad_norm": 0.01778257079422474, "learning_rate": 4.798294144767174e-06, "logits/chosen": -2.2758796215057373, "logits/rejected": 2.9297425746917725, "logps/chosen": -398.4133605957031, "logps/rejected": -808.4490356445312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.194453716278076, "rewards/margins": 16.604900360107422, "rewards/rejected": -22.799354553222656, "step": 658 }, { "epoch": 0.4099533437013997, "grad_norm": 0.39328357577323914, "learning_rate": 4.797141539880129e-06, "logits/chosen": 0.019907251000404358, "logits/rejected": 4.10305118560791, "logps/chosen": -539.0341796875, "logps/rejected": -1011.156005859375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -10.144923210144043, "rewards/margins": 19.252063751220703, "rewards/rejected": -29.39698600769043, "step": 659 }, { "epoch": 0.4105754276827372, "grad_norm": 40.315162658691406, "learning_rate": 4.795988934993085e-06, "logits/chosen": -0.9315068125724792, "logits/rejected": 2.8982458114624023, "logps/chosen": -423.90631103515625, "logps/rejected": -840.4696044921875, "loss": 0.5096, "rewards/accuracies": 0.875, "rewards/chosen": -9.175039291381836, "rewards/margins": 19.72167205810547, "rewards/rejected": -28.896713256835938, "step": 660 }, { "epoch": 0.41119751166407464, "grad_norm": 0.00019077463366556913, "learning_rate": 4.794836330106041e-06, "logits/chosen": -0.42708921432495117, "logits/rejected": 4.313132286071777, "logps/chosen": -349.7354736328125, "logps/rejected": -814.03759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.468191146850586, "rewards/margins": 19.493267059326172, "rewards/rejected": -24.961458206176758, "step": 661 }, { "epoch": 0.41181959564541215, "grad_norm": 9.062701225280762, "learning_rate": 4.793683725218996e-06, "logits/chosen": 1.5624027252197266, "logits/rejected": 3.176633834838867, "logps/chosen": -594.6624755859375, "logps/rejected": -916.4132080078125, "loss": 0.0664, "rewards/accuracies": 1.0, "rewards/chosen": -8.484450340270996, "rewards/margins": 17.643253326416016, "rewards/rejected": -26.127704620361328, "step": 662 }, { "epoch": 0.4124416796267496, "grad_norm": 0.16003720462322235, "learning_rate": 4.792531120331951e-06, "logits/chosen": 2.4539906978607178, "logits/rejected": 3.480743408203125, "logps/chosen": -633.8411865234375, "logps/rejected": -825.93310546875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -10.565387725830078, "rewards/margins": 13.266152381896973, "rewards/rejected": -23.831541061401367, "step": 663 }, { "epoch": 0.4130637636080871, "grad_norm": 32.037132263183594, "learning_rate": 4.791378515444906e-06, "logits/chosen": 1.967376708984375, "logits/rejected": 3.912313222885132, "logps/chosen": -587.0730590820312, "logps/rejected": -835.215576171875, "loss": 1.5524, "rewards/accuracies": 0.875, "rewards/chosen": -11.711736679077148, "rewards/margins": 14.280363082885742, "rewards/rejected": -25.992103576660156, "step": 664 }, { "epoch": 0.41368584758942456, "grad_norm": 22.63080406188965, "learning_rate": 4.7902259105578615e-06, "logits/chosen": -1.1042022705078125, "logits/rejected": 1.5109401941299438, "logps/chosen": -308.7568359375, "logps/rejected": -628.593994140625, "loss": 0.4471, "rewards/accuracies": 0.875, "rewards/chosen": -3.961351156234741, "rewards/margins": 20.284442901611328, "rewards/rejected": -24.245792388916016, "step": 665 }, { "epoch": 0.41430793157076207, "grad_norm": 4.558940887451172, "learning_rate": 4.789073305670817e-06, "logits/chosen": 1.9392263889312744, "logits/rejected": 3.432485580444336, "logps/chosen": -719.56640625, "logps/rejected": -961.9409790039062, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": -7.849826812744141, "rewards/margins": 17.749860763549805, "rewards/rejected": -25.599689483642578, "step": 666 }, { "epoch": 0.4149300155520995, "grad_norm": 26.374540328979492, "learning_rate": 4.787920700783772e-06, "logits/chosen": 1.5769593715667725, "logits/rejected": 2.828300714492798, "logps/chosen": -494.17388916015625, "logps/rejected": -807.5885009765625, "loss": 0.932, "rewards/accuracies": 0.875, "rewards/chosen": -8.869695663452148, "rewards/margins": 17.6748046875, "rewards/rejected": -26.54450225830078, "step": 667 }, { "epoch": 0.415552099533437, "grad_norm": 10.404101371765137, "learning_rate": 4.786768095896727e-06, "logits/chosen": 2.2112016677856445, "logits/rejected": 2.179577112197876, "logps/chosen": -776.2391357421875, "logps/rejected": -941.023193359375, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": -13.392984390258789, "rewards/margins": 13.242691993713379, "rewards/rejected": -26.635677337646484, "step": 668 }, { "epoch": 0.4161741835147745, "grad_norm": 0.00012222891382407397, "learning_rate": 4.785615491009682e-06, "logits/chosen": -0.8005756139755249, "logits/rejected": 2.6633152961730957, "logps/chosen": -580.3494262695312, "logps/rejected": -1048.3486328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.532279968261719, "rewards/margins": 26.374229431152344, "rewards/rejected": -36.90650939941406, "step": 669 }, { "epoch": 0.416796267496112, "grad_norm": 0.016480503603816032, "learning_rate": 4.784462886122638e-06, "logits/chosen": -1.1997987031936646, "logits/rejected": 3.5286808013916016, "logps/chosen": -381.94769287109375, "logps/rejected": -869.9482421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.5639848709106445, "rewards/margins": 21.98867416381836, "rewards/rejected": -27.552656173706055, "step": 670 }, { "epoch": 0.41741835147744943, "grad_norm": 5.529460430145264, "learning_rate": 4.783310281235593e-06, "logits/chosen": -0.6281594634056091, "logits/rejected": 2.345285654067993, "logps/chosen": -411.87286376953125, "logps/rejected": -790.1439819335938, "loss": 0.0778, "rewards/accuracies": 1.0, "rewards/chosen": -6.664583206176758, "rewards/margins": 19.735605239868164, "rewards/rejected": -26.400184631347656, "step": 671 }, { "epoch": 0.41804043545878694, "grad_norm": 29.453466415405273, "learning_rate": 4.782157676348548e-06, "logits/chosen": -1.222573161125183, "logits/rejected": 3.766413688659668, "logps/chosen": -480.82989501953125, "logps/rejected": -1006.6981201171875, "loss": 0.2167, "rewards/accuracies": 0.875, "rewards/chosen": -7.6935529708862305, "rewards/margins": 23.371456146240234, "rewards/rejected": -31.065006256103516, "step": 672 }, { "epoch": 0.4186625194401244, "grad_norm": 0.17522646486759186, "learning_rate": 4.781005071461503e-06, "logits/chosen": -5.1208977699279785, "logits/rejected": 3.1196346282958984, "logps/chosen": -232.45718383789062, "logps/rejected": -869.94091796875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.2358012199401855, "rewards/margins": 20.760984420776367, "rewards/rejected": -24.996788024902344, "step": 673 }, { "epoch": 0.4192846034214619, "grad_norm": 27.000770568847656, "learning_rate": 4.7798524665744585e-06, "logits/chosen": 1.8730990886688232, "logits/rejected": 3.122191905975342, "logps/chosen": -651.515625, "logps/rejected": -853.1071166992188, "loss": 0.9936, "rewards/accuracies": 0.875, "rewards/chosen": -11.197526931762695, "rewards/margins": 10.82865047454834, "rewards/rejected": -22.02617645263672, "step": 674 }, { "epoch": 0.4199066874027994, "grad_norm": 30.578311920166016, "learning_rate": 4.7786998616874146e-06, "logits/chosen": -0.01745295524597168, "logits/rejected": 2.4413537979125977, "logps/chosen": -419.0821228027344, "logps/rejected": -830.2938232421875, "loss": 1.3451, "rewards/accuracies": 0.875, "rewards/chosen": -4.137384414672852, "rewards/margins": 18.40494155883789, "rewards/rejected": -22.542327880859375, "step": 675 }, { "epoch": 0.42052877138413686, "grad_norm": 53.24766159057617, "learning_rate": 4.77754725680037e-06, "logits/chosen": -1.0631290674209595, "logits/rejected": 2.835108518600464, "logps/chosen": -550.2883911132812, "logps/rejected": -937.1199340820312, "loss": 1.3762, "rewards/accuracies": 0.75, "rewards/chosen": -8.001826286315918, "rewards/margins": 19.6325740814209, "rewards/rejected": -27.634401321411133, "step": 676 }, { "epoch": 0.42115085536547436, "grad_norm": 24.685049057006836, "learning_rate": 4.776394651913325e-06, "logits/chosen": -0.9792740345001221, "logits/rejected": 2.392324686050415, "logps/chosen": -512.7687377929688, "logps/rejected": -835.5687255859375, "loss": 0.2189, "rewards/accuracies": 0.875, "rewards/chosen": -9.09033203125, "rewards/margins": 14.524964332580566, "rewards/rejected": -23.61529541015625, "step": 677 }, { "epoch": 0.4217729393468118, "grad_norm": 0.8354139924049377, "learning_rate": 4.77524204702628e-06, "logits/chosen": -0.6444275975227356, "logits/rejected": 3.580641508102417, "logps/chosen": -416.6503601074219, "logps/rejected": -801.9967651367188, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -5.518429279327393, "rewards/margins": 17.77960205078125, "rewards/rejected": -23.298032760620117, "step": 678 }, { "epoch": 0.4223950233281493, "grad_norm": 29.565584182739258, "learning_rate": 4.7740894421392355e-06, "logits/chosen": 0.06383585929870605, "logits/rejected": 3.639883518218994, "logps/chosen": -476.05352783203125, "logps/rejected": -927.9241943359375, "loss": 0.5134, "rewards/accuracies": 0.875, "rewards/chosen": -7.642790794372559, "rewards/margins": 17.618484497070312, "rewards/rejected": -25.261276245117188, "step": 679 }, { "epoch": 0.4230171073094868, "grad_norm": 0.0912347361445427, "learning_rate": 4.772936837252191e-06, "logits/chosen": -0.12195968627929688, "logits/rejected": 3.932724952697754, "logps/chosen": -402.1063232421875, "logps/rejected": -811.5177001953125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.853856086730957, "rewards/margins": 16.344602584838867, "rewards/rejected": -21.19845962524414, "step": 680 }, { "epoch": 0.4236391912908243, "grad_norm": 0.3873937129974365, "learning_rate": 4.771784232365146e-06, "logits/chosen": 1.799547553062439, "logits/rejected": 4.590263366699219, "logps/chosen": -559.97216796875, "logps/rejected": -1006.1261596679688, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -9.338591575622559, "rewards/margins": 23.82638168334961, "rewards/rejected": -33.16497039794922, "step": 681 }, { "epoch": 0.42426127527216173, "grad_norm": 0.7272276282310486, "learning_rate": 4.770631627478101e-06, "logits/chosen": 0.4105660915374756, "logits/rejected": 2.076657772064209, "logps/chosen": -388.392333984375, "logps/rejected": -615.2227783203125, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -5.603516578674316, "rewards/margins": 15.719802856445312, "rewards/rejected": -21.323318481445312, "step": 682 }, { "epoch": 0.42488335925349924, "grad_norm": 1.1333785323586199e-06, "learning_rate": 4.769479022591056e-06, "logits/chosen": -0.6940436959266663, "logits/rejected": 4.295339107513428, "logps/chosen": -507.3326110839844, "logps/rejected": -970.3253173828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.027980327606201, "rewards/margins": 27.017305374145508, "rewards/rejected": -31.045286178588867, "step": 683 }, { "epoch": 0.4255054432348367, "grad_norm": 4.457662726053968e-05, "learning_rate": 4.768326417704012e-06, "logits/chosen": -1.7501380443572998, "logits/rejected": 4.424574851989746, "logps/chosen": -388.6649169921875, "logps/rejected": -957.874267578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.324387550354004, "rewards/margins": 27.12349510192871, "rewards/rejected": -33.44788360595703, "step": 684 }, { "epoch": 0.4261275272161742, "grad_norm": 0.12122859060764313, "learning_rate": 4.767173812816967e-06, "logits/chosen": 1.5742546319961548, "logits/rejected": 4.587541580200195, "logps/chosen": -466.7994079589844, "logps/rejected": -852.6675415039062, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.404057025909424, "rewards/margins": 24.145267486572266, "rewards/rejected": -30.54932403564453, "step": 685 }, { "epoch": 0.42674961119751165, "grad_norm": 6.259092807769775, "learning_rate": 4.766021207929922e-06, "logits/chosen": 0.6969764828681946, "logits/rejected": 4.548662185668945, "logps/chosen": -486.2441711425781, "logps/rejected": -815.9784545898438, "loss": 0.032, "rewards/accuracies": 1.0, "rewards/chosen": -7.9217023849487305, "rewards/margins": 12.386877059936523, "rewards/rejected": -20.30858039855957, "step": 686 }, { "epoch": 0.42737169517884915, "grad_norm": 38.55778884887695, "learning_rate": 4.764868603042877e-06, "logits/chosen": 1.018954873085022, "logits/rejected": 3.2408273220062256, "logps/chosen": -672.9383544921875, "logps/rejected": -942.35791015625, "loss": 1.645, "rewards/accuracies": 0.875, "rewards/chosen": -10.722557067871094, "rewards/margins": 15.812875747680664, "rewards/rejected": -26.53543472290039, "step": 687 }, { "epoch": 0.4279937791601866, "grad_norm": 24.259414672851562, "learning_rate": 4.7637159981558325e-06, "logits/chosen": -2.0910446643829346, "logits/rejected": 1.7110493183135986, "logps/chosen": -436.27105712890625, "logps/rejected": -857.93603515625, "loss": 0.1087, "rewards/accuracies": 0.875, "rewards/chosen": -6.506262302398682, "rewards/margins": 17.343793869018555, "rewards/rejected": -23.85005760192871, "step": 688 }, { "epoch": 0.4286158631415241, "grad_norm": 0.00418486725538969, "learning_rate": 4.762563393268788e-06, "logits/chosen": 0.039060741662979126, "logits/rejected": 3.397256374359131, "logps/chosen": -334.5584411621094, "logps/rejected": -723.466064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.947673797607422, "rewards/margins": 17.47788429260254, "rewards/rejected": -22.425556182861328, "step": 689 }, { "epoch": 0.42923794712286156, "grad_norm": 34.04841232299805, "learning_rate": 4.761410788381743e-06, "logits/chosen": -1.352245807647705, "logits/rejected": 1.7429472208023071, "logps/chosen": -442.674560546875, "logps/rejected": -684.5184326171875, "loss": 1.6189, "rewards/accuracies": 0.875, "rewards/chosen": -8.620567321777344, "rewards/margins": 9.817951202392578, "rewards/rejected": -18.438518524169922, "step": 690 }, { "epoch": 0.42986003110419907, "grad_norm": 31.715965270996094, "learning_rate": 4.760258183494698e-06, "logits/chosen": -1.247018814086914, "logits/rejected": 3.7910540103912354, "logps/chosen": -424.660400390625, "logps/rejected": -836.0987548828125, "loss": 0.4279, "rewards/accuracies": 0.875, "rewards/chosen": -8.494476318359375, "rewards/margins": 15.016087532043457, "rewards/rejected": -23.510562896728516, "step": 691 }, { "epoch": 0.4304821150855365, "grad_norm": 24.739803314208984, "learning_rate": 4.759105578607653e-06, "logits/chosen": -0.5017096400260925, "logits/rejected": 2.8500189781188965, "logps/chosen": -446.18585205078125, "logps/rejected": -784.303466796875, "loss": 0.2784, "rewards/accuracies": 0.875, "rewards/chosen": -8.562870025634766, "rewards/margins": 13.364456176757812, "rewards/rejected": -21.927326202392578, "step": 692 }, { "epoch": 0.431104199066874, "grad_norm": 44.9724235534668, "learning_rate": 4.757952973720609e-06, "logits/chosen": 3.754262924194336, "logits/rejected": 5.475862979888916, "logps/chosen": -708.015869140625, "logps/rejected": -868.138916015625, "loss": 1.1519, "rewards/accuracies": 0.875, "rewards/chosen": -9.583785057067871, "rewards/margins": 9.499543190002441, "rewards/rejected": -19.083328247070312, "step": 693 }, { "epoch": 0.43172628304821153, "grad_norm": 0.12701572477817535, "learning_rate": 4.756800368833564e-06, "logits/chosen": 1.0257909297943115, "logits/rejected": 1.0529817342758179, "logps/chosen": -592.5926513671875, "logps/rejected": -720.8876953125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.634477615356445, "rewards/margins": 19.143278121948242, "rewards/rejected": -26.777755737304688, "step": 694 }, { "epoch": 0.432348367029549, "grad_norm": 0.3144236207008362, "learning_rate": 4.755647763946519e-06, "logits/chosen": -0.3531043231487274, "logits/rejected": 3.7984418869018555, "logps/chosen": -361.6063232421875, "logps/rejected": -824.85009765625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -6.502399921417236, "rewards/margins": 23.011634826660156, "rewards/rejected": -29.514034271240234, "step": 695 }, { "epoch": 0.4329704510108865, "grad_norm": 0.00020732081611640751, "learning_rate": 4.754495159059474e-06, "logits/chosen": -1.5537899732589722, "logits/rejected": 3.863278388977051, "logps/chosen": -386.71478271484375, "logps/rejected": -1015.0911254882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.0640451908111572, "rewards/margins": 25.591724395751953, "rewards/rejected": -28.65576934814453, "step": 696 }, { "epoch": 0.43359253499222394, "grad_norm": 26.792686462402344, "learning_rate": 4.7533425541724295e-06, "logits/chosen": 1.8323582410812378, "logits/rejected": 3.697693109512329, "logps/chosen": -591.3922729492188, "logps/rejected": -806.5018310546875, "loss": 0.374, "rewards/accuracies": 0.875, "rewards/chosen": -3.721050500869751, "rewards/margins": 13.876240730285645, "rewards/rejected": -17.5972900390625, "step": 697 }, { "epoch": 0.43421461897356145, "grad_norm": 4.399302005767822, "learning_rate": 4.752189949285385e-06, "logits/chosen": 0.6278190612792969, "logits/rejected": 1.133225440979004, "logps/chosen": -593.8261108398438, "logps/rejected": -769.5350952148438, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": -7.2219953536987305, "rewards/margins": 14.507144927978516, "rewards/rejected": -21.729141235351562, "step": 698 }, { "epoch": 0.4348367029548989, "grad_norm": 4.496114730834961, "learning_rate": 4.751037344398341e-06, "logits/chosen": 0.0551641583442688, "logits/rejected": 4.235555171966553, "logps/chosen": -363.8678894042969, "logps/rejected": -777.250732421875, "loss": 0.0807, "rewards/accuracies": 1.0, "rewards/chosen": -7.3506622314453125, "rewards/margins": 14.979228973388672, "rewards/rejected": -22.329891204833984, "step": 699 }, { "epoch": 0.4354587869362364, "grad_norm": 0.8819683194160461, "learning_rate": 4.749884739511296e-06, "logits/chosen": 0.10031324625015259, "logits/rejected": 4.287592887878418, "logps/chosen": -440.3474426269531, "logps/rejected": -884.9929809570312, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -5.949248313903809, "rewards/margins": 17.867116928100586, "rewards/rejected": -23.816364288330078, "step": 700 }, { "epoch": 0.43608087091757386, "grad_norm": 23.288360595703125, "learning_rate": 4.748732134624251e-06, "logits/chosen": -1.186615228652954, "logits/rejected": 3.4784440994262695, "logps/chosen": -423.2039794921875, "logps/rejected": -845.6221313476562, "loss": 0.7393, "rewards/accuracies": 0.875, "rewards/chosen": -7.458308219909668, "rewards/margins": 15.12204360961914, "rewards/rejected": -22.580352783203125, "step": 701 }, { "epoch": 0.43670295489891137, "grad_norm": 0.004663229454308748, "learning_rate": 4.7475795297372065e-06, "logits/chosen": 0.0869104266166687, "logits/rejected": 4.081899166107178, "logps/chosen": -479.1942443847656, "logps/rejected": -896.97119140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.93591833114624, "rewards/margins": 18.201383590698242, "rewards/rejected": -26.137300491333008, "step": 702 }, { "epoch": 0.4373250388802488, "grad_norm": 0.0026992084458470345, "learning_rate": 4.746426924850162e-06, "logits/chosen": -2.009183883666992, "logits/rejected": 3.8174266815185547, "logps/chosen": -265.8365478515625, "logps/rejected": -853.8241577148438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.638486862182617, "rewards/margins": 23.152103424072266, "rewards/rejected": -25.790592193603516, "step": 703 }, { "epoch": 0.4379471228615863, "grad_norm": 0.031216738745570183, "learning_rate": 4.745274319963117e-06, "logits/chosen": 2.1903696060180664, "logits/rejected": 4.714466571807861, "logps/chosen": -630.603515625, "logps/rejected": -957.2979736328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.502143859863281, "rewards/margins": 16.405481338500977, "rewards/rejected": -25.907625198364258, "step": 704 }, { "epoch": 0.4385692068429238, "grad_norm": 4.766060829162598, "learning_rate": 4.744121715076072e-06, "logits/chosen": 0.11997011303901672, "logits/rejected": 2.390803098678589, "logps/chosen": -474.9181823730469, "logps/rejected": -758.4802856445312, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": -8.49026107788086, "rewards/margins": 13.536415100097656, "rewards/rejected": -22.026674270629883, "step": 705 }, { "epoch": 0.4391912908242613, "grad_norm": 30.133831024169922, "learning_rate": 4.742969110189027e-06, "logits/chosen": 0.5862630009651184, "logits/rejected": 1.7295148372650146, "logps/chosen": -444.6620178222656, "logps/rejected": -619.6727294921875, "loss": 0.4622, "rewards/accuracies": 0.875, "rewards/chosen": -6.195107460021973, "rewards/margins": 14.017037391662598, "rewards/rejected": -20.212146759033203, "step": 706 }, { "epoch": 0.43981337480559873, "grad_norm": 0.0015233299927785993, "learning_rate": 4.741816505301983e-06, "logits/chosen": 1.5498592853546143, "logits/rejected": 4.029008865356445, "logps/chosen": -538.0079345703125, "logps/rejected": -924.365966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.708686828613281, "rewards/margins": 23.275190353393555, "rewards/rejected": -30.98387908935547, "step": 707 }, { "epoch": 0.44043545878693624, "grad_norm": 0.0009342418634332716, "learning_rate": 4.740663900414938e-06, "logits/chosen": -0.7223360538482666, "logits/rejected": 3.4309163093566895, "logps/chosen": -340.63818359375, "logps/rejected": -774.0181274414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.805405616760254, "rewards/margins": 21.276443481445312, "rewards/rejected": -28.081846237182617, "step": 708 }, { "epoch": 0.4410575427682737, "grad_norm": 46.141357421875, "learning_rate": 4.739511295527893e-06, "logits/chosen": -3.716412305831909, "logits/rejected": 1.5815439224243164, "logps/chosen": -422.9011535644531, "logps/rejected": -904.5416870117188, "loss": 1.7318, "rewards/accuracies": 0.875, "rewards/chosen": -10.199352264404297, "rewards/margins": 17.21719741821289, "rewards/rejected": -27.416549682617188, "step": 709 }, { "epoch": 0.4416796267496112, "grad_norm": 2.966789484024048, "learning_rate": 4.738358690640848e-06, "logits/chosen": 2.19631028175354, "logits/rejected": 4.154239177703857, "logps/chosen": -558.2725830078125, "logps/rejected": -827.265625, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -7.756782531738281, "rewards/margins": 15.467741966247559, "rewards/rejected": -23.224525451660156, "step": 710 }, { "epoch": 0.4423017107309487, "grad_norm": 1.096614956855774, "learning_rate": 4.7372060857538035e-06, "logits/chosen": 1.0244200229644775, "logits/rejected": 4.128070831298828, "logps/chosen": -523.0098266601562, "logps/rejected": -862.1829833984375, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -6.989865303039551, "rewards/margins": 17.196226119995117, "rewards/rejected": -24.18609046936035, "step": 711 }, { "epoch": 0.44292379471228616, "grad_norm": 21.39118194580078, "learning_rate": 4.736053480866759e-06, "logits/chosen": 1.2569371461868286, "logits/rejected": 4.960230350494385, "logps/chosen": -481.29217529296875, "logps/rejected": -845.056396484375, "loss": 0.1824, "rewards/accuracies": 0.875, "rewards/chosen": -5.386847019195557, "rewards/margins": 18.074981689453125, "rewards/rejected": -23.461828231811523, "step": 712 }, { "epoch": 0.44354587869362366, "grad_norm": 0.035147711634635925, "learning_rate": 4.734900875979714e-06, "logits/chosen": -2.8533473014831543, "logits/rejected": 1.125679850578308, "logps/chosen": -378.7342529296875, "logps/rejected": -823.3388671875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.9344024658203125, "rewards/margins": 23.337167739868164, "rewards/rejected": -28.271568298339844, "step": 713 }, { "epoch": 0.4441679626749611, "grad_norm": 0.0005704350187443197, "learning_rate": 4.73374827109267e-06, "logits/chosen": -0.12316238880157471, "logits/rejected": 3.290395736694336, "logps/chosen": -390.05584716796875, "logps/rejected": -771.4310302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.037465572357178, "rewards/margins": 19.214134216308594, "rewards/rejected": -24.251598358154297, "step": 714 }, { "epoch": 0.4447900466562986, "grad_norm": 6.9552507400512695, "learning_rate": 4.732595666205625e-06, "logits/chosen": -2.466341018676758, "logits/rejected": 1.6108577251434326, "logps/chosen": -375.34283447265625, "logps/rejected": -802.8533325195312, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": -5.593360900878906, "rewards/margins": 15.921603202819824, "rewards/rejected": -21.514965057373047, "step": 715 }, { "epoch": 0.4454121306376361, "grad_norm": 1.741106629371643, "learning_rate": 4.7314430613185805e-06, "logits/chosen": 2.07574462890625, "logits/rejected": 3.481757164001465, "logps/chosen": -640.0421142578125, "logps/rejected": -870.4541015625, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -13.261276245117188, "rewards/margins": 15.212532043457031, "rewards/rejected": -28.473806381225586, "step": 716 }, { "epoch": 0.4460342146189736, "grad_norm": 4.817162789549911e-06, "learning_rate": 4.730290456431536e-06, "logits/chosen": -2.971275568008423, "logits/rejected": 4.351428508758545, "logps/chosen": -219.6768035888672, "logps/rejected": -880.5977172851562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.8710782527923584, "rewards/margins": 29.834440231323242, "rewards/rejected": -32.70552062988281, "step": 717 }, { "epoch": 0.44665629860031103, "grad_norm": 34.71223068237305, "learning_rate": 4.729137851544491e-06, "logits/chosen": -0.3415360450744629, "logits/rejected": 3.649252414703369, "logps/chosen": -492.4614562988281, "logps/rejected": -833.1641845703125, "loss": 0.6228, "rewards/accuracies": 0.875, "rewards/chosen": -4.140350341796875, "rewards/margins": 10.404376983642578, "rewards/rejected": -14.544727325439453, "step": 718 }, { "epoch": 0.44727838258164854, "grad_norm": 0.005894318222999573, "learning_rate": 4.727985246657446e-06, "logits/chosen": -1.3749679327011108, "logits/rejected": 3.040403127670288, "logps/chosen": -446.4466552734375, "logps/rejected": -924.62939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.499325275421143, "rewards/margins": 21.93583106994629, "rewards/rejected": -27.435157775878906, "step": 719 }, { "epoch": 0.447900466562986, "grad_norm": 2.2069828510284424, "learning_rate": 4.726832641770401e-06, "logits/chosen": 0.39223602414131165, "logits/rejected": 2.627257823944092, "logps/chosen": -487.3267517089844, "logps/rejected": -755.5730590820312, "loss": 0.0236, "rewards/accuracies": 1.0, "rewards/chosen": -6.579398155212402, "rewards/margins": 12.08456039428711, "rewards/rejected": -18.663959503173828, "step": 720 }, { "epoch": 0.4485225505443235, "grad_norm": 33.56819152832031, "learning_rate": 4.725680036883357e-06, "logits/chosen": 1.559786319732666, "logits/rejected": 3.7089614868164062, "logps/chosen": -713.7747192382812, "logps/rejected": -980.051513671875, "loss": 0.471, "rewards/accuracies": 0.875, "rewards/chosen": -4.505293846130371, "rewards/margins": 19.567005157470703, "rewards/rejected": -24.07229995727539, "step": 721 }, { "epoch": 0.44914463452566095, "grad_norm": 27.670228958129883, "learning_rate": 4.724527431996312e-06, "logits/chosen": 0.33212411403656006, "logits/rejected": 2.9645965099334717, "logps/chosen": -450.44378662109375, "logps/rejected": -721.9329833984375, "loss": 0.3559, "rewards/accuracies": 0.875, "rewards/chosen": -4.267915725708008, "rewards/margins": 13.11201286315918, "rewards/rejected": -17.379928588867188, "step": 722 }, { "epoch": 0.44976671850699845, "grad_norm": 15.541918754577637, "learning_rate": 4.723374827109267e-06, "logits/chosen": 2.854301929473877, "logits/rejected": 5.9449462890625, "logps/chosen": -631.2748413085938, "logps/rejected": -989.6734619140625, "loss": 0.1159, "rewards/accuracies": 0.875, "rewards/chosen": -8.880619049072266, "rewards/margins": 17.788612365722656, "rewards/rejected": -26.669231414794922, "step": 723 }, { "epoch": 0.4503888024883359, "grad_norm": 0.08411452174186707, "learning_rate": 4.722222222222222e-06, "logits/chosen": -0.4340131878852844, "logits/rejected": 2.539937973022461, "logps/chosen": -517.1436767578125, "logps/rejected": -874.4027099609375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.182528018951416, "rewards/margins": 19.89919090270996, "rewards/rejected": -26.08172035217285, "step": 724 }, { "epoch": 0.4510108864696734, "grad_norm": 20.3781681060791, "learning_rate": 4.7210696173351775e-06, "logits/chosen": 0.23897361755371094, "logits/rejected": 2.9999685287475586, "logps/chosen": -700.116943359375, "logps/rejected": -934.967529296875, "loss": 0.1813, "rewards/accuracies": 0.875, "rewards/chosen": -7.934647560119629, "rewards/margins": 15.803482055664062, "rewards/rejected": -23.738128662109375, "step": 725 }, { "epoch": 0.45163297045101086, "grad_norm": 29.43364715576172, "learning_rate": 4.719917012448133e-06, "logits/chosen": -1.8553903102874756, "logits/rejected": 1.862307071685791, "logps/chosen": -480.6034240722656, "logps/rejected": -942.1726684570312, "loss": 0.2991, "rewards/accuracies": 0.875, "rewards/chosen": -6.97006368637085, "rewards/margins": 21.751319885253906, "rewards/rejected": -28.721384048461914, "step": 726 }, { "epoch": 0.45225505443234837, "grad_norm": 23.18766212463379, "learning_rate": 4.718764407561088e-06, "logits/chosen": 1.378616213798523, "logits/rejected": 3.979541063308716, "logps/chosen": -559.097412109375, "logps/rejected": -870.5625610351562, "loss": 0.3831, "rewards/accuracies": 0.875, "rewards/chosen": -7.625370979309082, "rewards/margins": 14.113837242126465, "rewards/rejected": -21.739208221435547, "step": 727 }, { "epoch": 0.4528771384136858, "grad_norm": 24.29891014099121, "learning_rate": 4.717611802674044e-06, "logits/chosen": 2.059588670730591, "logits/rejected": 4.025932312011719, "logps/chosen": -778.895263671875, "logps/rejected": -1051.254150390625, "loss": 0.2721, "rewards/accuracies": 0.875, "rewards/chosen": -11.523857116699219, "rewards/margins": 17.883371353149414, "rewards/rejected": -29.407230377197266, "step": 728 }, { "epoch": 0.4534992223950233, "grad_norm": 10.843426704406738, "learning_rate": 4.716459197786999e-06, "logits/chosen": -1.1179172992706299, "logits/rejected": 2.602318525314331, "logps/chosen": -471.91949462890625, "logps/rejected": -863.3618774414062, "loss": 0.1805, "rewards/accuracies": 0.875, "rewards/chosen": -6.14882755279541, "rewards/margins": 18.53677749633789, "rewards/rejected": -24.685606002807617, "step": 729 }, { "epoch": 0.45412130637636083, "grad_norm": 1.3418076038360596, "learning_rate": 4.7153065928999545e-06, "logits/chosen": -1.153537631034851, "logits/rejected": 2.798265218734741, "logps/chosen": -587.9439697265625, "logps/rejected": -993.32861328125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -10.406091690063477, "rewards/margins": 23.02769660949707, "rewards/rejected": -33.43379211425781, "step": 730 }, { "epoch": 0.4547433903576983, "grad_norm": 0.011465908959507942, "learning_rate": 4.71415398801291e-06, "logits/chosen": -1.5201659202575684, "logits/rejected": 3.761108160018921, "logps/chosen": -495.5352783203125, "logps/rejected": -978.4993286132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.031967163085938, "rewards/margins": 23.243043899536133, "rewards/rejected": -32.2750129699707, "step": 731 }, { "epoch": 0.4553654743390358, "grad_norm": 0.019519884139299393, "learning_rate": 4.713001383125865e-06, "logits/chosen": -1.4993672370910645, "logits/rejected": 3.8746728897094727, "logps/chosen": -348.3289794921875, "logps/rejected": -954.3119506835938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.078332901000977, "rewards/margins": 22.78673553466797, "rewards/rejected": -30.865070343017578, "step": 732 }, { "epoch": 0.45598755832037324, "grad_norm": 0.0006432720110751688, "learning_rate": 4.71184877823882e-06, "logits/chosen": 1.0196747779846191, "logits/rejected": 4.499789714813232, "logps/chosen": -518.6286010742188, "logps/rejected": -980.1221923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.276054382324219, "rewards/margins": 25.788124084472656, "rewards/rejected": -33.064178466796875, "step": 733 }, { "epoch": 0.45660964230171075, "grad_norm": 0.017226440832018852, "learning_rate": 4.710696173351775e-06, "logits/chosen": -1.222594976425171, "logits/rejected": 4.341372489929199, "logps/chosen": -471.2843322753906, "logps/rejected": -997.5906982421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.164975166320801, "rewards/margins": 24.127727508544922, "rewards/rejected": -31.29270362854004, "step": 734 }, { "epoch": 0.4572317262830482, "grad_norm": 0.007779096253216267, "learning_rate": 4.709543568464731e-06, "logits/chosen": -0.9736397862434387, "logits/rejected": 2.2716212272644043, "logps/chosen": -304.4974365234375, "logps/rejected": -723.11181640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.925656795501709, "rewards/margins": 21.8261775970459, "rewards/rejected": -26.751834869384766, "step": 735 }, { "epoch": 0.4578538102643857, "grad_norm": 3.236133337020874, "learning_rate": 4.708390963577686e-06, "logits/chosen": 0.17622852325439453, "logits/rejected": 2.8149657249450684, "logps/chosen": -579.513427734375, "logps/rejected": -885.9993896484375, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": -9.07348346710205, "rewards/margins": 19.248193740844727, "rewards/rejected": -28.32167625427246, "step": 736 }, { "epoch": 0.45847589424572316, "grad_norm": 0.00014734258002135903, "learning_rate": 4.707238358690641e-06, "logits/chosen": 1.3257426023483276, "logits/rejected": 3.4590463638305664, "logps/chosen": -559.5299072265625, "logps/rejected": -946.916748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.955436706542969, "rewards/margins": 21.80295181274414, "rewards/rejected": -35.75838851928711, "step": 737 }, { "epoch": 0.45909797822706067, "grad_norm": 21.308420181274414, "learning_rate": 4.706085753803596e-06, "logits/chosen": -1.9544880390167236, "logits/rejected": 4.07980489730835, "logps/chosen": -432.8442077636719, "logps/rejected": -980.4149169921875, "loss": 0.0999, "rewards/accuracies": 0.875, "rewards/chosen": -9.972265243530273, "rewards/margins": 22.069438934326172, "rewards/rejected": -32.04170608520508, "step": 738 }, { "epoch": 0.4597200622083981, "grad_norm": 6.995245456695557, "learning_rate": 4.7049331489165515e-06, "logits/chosen": -0.3527810573577881, "logits/rejected": 3.945213794708252, "logps/chosen": -425.16424560546875, "logps/rejected": -800.3833618164062, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": -9.171060562133789, "rewards/margins": 21.52447509765625, "rewards/rejected": -30.69553565979004, "step": 739 }, { "epoch": 0.4603421461897356, "grad_norm": 9.528139114379883, "learning_rate": 4.703780544029507e-06, "logits/chosen": 0.8781489729881287, "logits/rejected": 2.834296464920044, "logps/chosen": -608.282470703125, "logps/rejected": -802.6109619140625, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": -11.577667236328125, "rewards/margins": 16.0227108001709, "rewards/rejected": -27.600378036499023, "step": 740 }, { "epoch": 0.4609642301710731, "grad_norm": 0.005674920044839382, "learning_rate": 4.702627939142462e-06, "logits/chosen": -2.576509475708008, "logits/rejected": 3.848076820373535, "logps/chosen": -365.6551208496094, "logps/rejected": -1039.408447265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.874650001525879, "rewards/margins": 30.167984008789062, "rewards/rejected": -37.04263687133789, "step": 741 }, { "epoch": 0.4615863141524106, "grad_norm": 15.865659713745117, "learning_rate": 4.701475334255417e-06, "logits/chosen": -0.6655741930007935, "logits/rejected": 3.148348569869995, "logps/chosen": -500.2545166015625, "logps/rejected": -918.5359497070312, "loss": 0.1022, "rewards/accuracies": 0.875, "rewards/chosen": -7.519559860229492, "rewards/margins": 20.05760955810547, "rewards/rejected": -27.577167510986328, "step": 742 }, { "epoch": 0.46220839813374803, "grad_norm": 0.08761231601238251, "learning_rate": 4.700322729368373e-06, "logits/chosen": -0.6104300022125244, "logits/rejected": 2.0581812858581543, "logps/chosen": -421.5753173828125, "logps/rejected": -801.907958984375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.096579074859619, "rewards/margins": 20.425188064575195, "rewards/rejected": -26.521766662597656, "step": 743 }, { "epoch": 0.46283048211508554, "grad_norm": 0.027988320216536522, "learning_rate": 4.6991701244813285e-06, "logits/chosen": 0.3800036907196045, "logits/rejected": 3.919179916381836, "logps/chosen": -517.352294921875, "logps/rejected": -964.316162109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.163141250610352, "rewards/margins": 28.622203826904297, "rewards/rejected": -38.785343170166016, "step": 744 }, { "epoch": 0.463452566096423, "grad_norm": 25.30392837524414, "learning_rate": 4.698017519594284e-06, "logits/chosen": -1.4486440420150757, "logits/rejected": 3.305373191833496, "logps/chosen": -373.08807373046875, "logps/rejected": -865.032958984375, "loss": 0.6425, "rewards/accuracies": 0.875, "rewards/chosen": -6.044712543487549, "rewards/margins": 28.989627838134766, "rewards/rejected": -35.034339904785156, "step": 745 }, { "epoch": 0.4640746500777605, "grad_norm": 5.703428268432617, "learning_rate": 4.696864914707239e-06, "logits/chosen": -2.756654739379883, "logits/rejected": 0.5146583318710327, "logps/chosen": -502.0623474121094, "logps/rejected": -884.9734497070312, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -9.868464469909668, "rewards/margins": 18.628908157348633, "rewards/rejected": -28.497373580932617, "step": 746 }, { "epoch": 0.464696734059098, "grad_norm": 0.549371063709259, "learning_rate": 4.695712309820194e-06, "logits/chosen": 0.6102413535118103, "logits/rejected": 3.507424831390381, "logps/chosen": -387.8964538574219, "logps/rejected": -794.0484008789062, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -9.76818561553955, "rewards/margins": 23.353492736816406, "rewards/rejected": -33.12167739868164, "step": 747 }, { "epoch": 0.46531881804043546, "grad_norm": 0.001107222051359713, "learning_rate": 4.694559704933149e-06, "logits/chosen": -1.4949012994766235, "logits/rejected": 2.643195390701294, "logps/chosen": -450.792724609375, "logps/rejected": -943.675537109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.531936645507812, "rewards/margins": 24.204355239868164, "rewards/rejected": -34.736289978027344, "step": 748 }, { "epoch": 0.46594090202177296, "grad_norm": 17.563385009765625, "learning_rate": 4.693407100046105e-06, "logits/chosen": 1.4797841310501099, "logits/rejected": 4.371963024139404, "logps/chosen": -494.5657958984375, "logps/rejected": -917.1324462890625, "loss": 0.264, "rewards/accuracies": 0.875, "rewards/chosen": -9.093585968017578, "rewards/margins": 18.366085052490234, "rewards/rejected": -27.459671020507812, "step": 749 }, { "epoch": 0.4665629860031104, "grad_norm": 43.27721405029297, "learning_rate": 4.69225449515906e-06, "logits/chosen": 2.0956027507781982, "logits/rejected": 3.2051877975463867, "logps/chosen": -626.8250732421875, "logps/rejected": -921.1513671875, "loss": 1.0323, "rewards/accuracies": 0.875, "rewards/chosen": -13.858763694763184, "rewards/margins": 16.39712142944336, "rewards/rejected": -30.25588607788086, "step": 750 }, { "epoch": 0.4671850699844479, "grad_norm": 18.37482261657715, "learning_rate": 4.691101890272015e-06, "logits/chosen": -0.03599190711975098, "logits/rejected": 2.4114785194396973, "logps/chosen": -523.785888671875, "logps/rejected": -904.9202270507812, "loss": 0.1681, "rewards/accuracies": 0.875, "rewards/chosen": -9.543304443359375, "rewards/margins": 26.735179901123047, "rewards/rejected": -36.27848434448242, "step": 751 }, { "epoch": 0.46780715396578537, "grad_norm": 2.7367844581604004, "learning_rate": 4.68994928538497e-06, "logits/chosen": 2.2475149631500244, "logits/rejected": 2.533215284347534, "logps/chosen": -502.916015625, "logps/rejected": -703.9136352539062, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -5.636620998382568, "rewards/margins": 16.46638298034668, "rewards/rejected": -22.103004455566406, "step": 752 }, { "epoch": 0.4684292379471229, "grad_norm": 21.466821670532227, "learning_rate": 4.6887966804979255e-06, "logits/chosen": 0.2573103904724121, "logits/rejected": 3.109172821044922, "logps/chosen": -586.2950439453125, "logps/rejected": -882.7223510742188, "loss": 0.1991, "rewards/accuracies": 0.875, "rewards/chosen": -5.989780902862549, "rewards/margins": 17.86110496520996, "rewards/rejected": -23.85088539123535, "step": 753 }, { "epoch": 0.46905132192846033, "grad_norm": 54.392433166503906, "learning_rate": 4.687644075610881e-06, "logits/chosen": -2.5727341175079346, "logits/rejected": 1.3274749517440796, "logps/chosen": -367.27911376953125, "logps/rejected": -709.7308349609375, "loss": 2.162, "rewards/accuracies": 0.875, "rewards/chosen": -6.0185112953186035, "rewards/margins": 17.04226303100586, "rewards/rejected": -23.060775756835938, "step": 754 }, { "epoch": 0.46967340590979784, "grad_norm": 56.05888366699219, "learning_rate": 4.686491470723836e-06, "logits/chosen": -0.37654730677604675, "logits/rejected": 1.5416078567504883, "logps/chosen": -543.9898681640625, "logps/rejected": -910.7147216796875, "loss": 1.9481, "rewards/accuracies": 0.875, "rewards/chosen": -8.147024154663086, "rewards/margins": 21.383563995361328, "rewards/rejected": -29.530590057373047, "step": 755 }, { "epoch": 0.4702954898911353, "grad_norm": 46.91044998168945, "learning_rate": 4.685338865836791e-06, "logits/chosen": 1.3805171251296997, "logits/rejected": 4.201688289642334, "logps/chosen": -599.8497314453125, "logps/rejected": -894.1846313476562, "loss": 1.7776, "rewards/accuracies": 0.75, "rewards/chosen": -10.594759941101074, "rewards/margins": 20.424440383911133, "rewards/rejected": -31.019201278686523, "step": 756 }, { "epoch": 0.4709175738724728, "grad_norm": 8.315458297729492, "learning_rate": 4.684186260949747e-06, "logits/chosen": 4.614778518676758, "logits/rejected": 4.109716415405273, "logps/chosen": -738.0523681640625, "logps/rejected": -843.14599609375, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": -11.66948413848877, "rewards/margins": 13.888378143310547, "rewards/rejected": -25.557861328125, "step": 757 }, { "epoch": 0.47153965785381025, "grad_norm": 9.746458053588867, "learning_rate": 4.6830336560627025e-06, "logits/chosen": 1.578921914100647, "logits/rejected": 1.6241142749786377, "logps/chosen": -752.6221923828125, "logps/rejected": -848.0787963867188, "loss": 0.053, "rewards/accuracies": 1.0, "rewards/chosen": -11.152379989624023, "rewards/margins": 15.030645370483398, "rewards/rejected": -26.183025360107422, "step": 758 }, { "epoch": 0.47216174183514775, "grad_norm": 54.19750213623047, "learning_rate": 4.681881051175658e-06, "logits/chosen": -2.6089086532592773, "logits/rejected": 0.563267707824707, "logps/chosen": -492.4306945800781, "logps/rejected": -795.8643798828125, "loss": 1.1181, "rewards/accuracies": 0.875, "rewards/chosen": -7.355657577514648, "rewards/margins": 14.989089965820312, "rewards/rejected": -22.344745635986328, "step": 759 }, { "epoch": 0.4727838258164852, "grad_norm": 40.35515594482422, "learning_rate": 4.680728446288613e-06, "logits/chosen": 1.459610939025879, "logits/rejected": 2.081594228744507, "logps/chosen": -646.0441284179688, "logps/rejected": -817.3916015625, "loss": 0.6381, "rewards/accuracies": 0.875, "rewards/chosen": -10.502636909484863, "rewards/margins": 13.858505249023438, "rewards/rejected": -24.36114501953125, "step": 760 }, { "epoch": 0.4734059097978227, "grad_norm": 37.597110748291016, "learning_rate": 4.679575841401568e-06, "logits/chosen": -0.21912920475006104, "logits/rejected": 4.26984167098999, "logps/chosen": -525.0760498046875, "logps/rejected": -860.02294921875, "loss": 0.5311, "rewards/accuracies": 0.75, "rewards/chosen": -8.533904075622559, "rewards/margins": 13.084606170654297, "rewards/rejected": -21.61850929260254, "step": 761 }, { "epoch": 0.47402799377916016, "grad_norm": 3.999826669692993, "learning_rate": 4.678423236514523e-06, "logits/chosen": 1.7163630723953247, "logits/rejected": 5.152613639831543, "logps/chosen": -440.76507568359375, "logps/rejected": -786.729248046875, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": -3.547441005706787, "rewards/margins": 15.236788749694824, "rewards/rejected": -18.784229278564453, "step": 762 }, { "epoch": 0.47465007776049767, "grad_norm": 1.3566317420554697e-06, "learning_rate": 4.677270631627479e-06, "logits/chosen": 0.08991807699203491, "logits/rejected": 4.114513397216797, "logps/chosen": -538.1998901367188, "logps/rejected": -1037.4219970703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.77888822555542, "rewards/margins": 29.567556381225586, "rewards/rejected": -33.3464469909668, "step": 763 }, { "epoch": 0.4752721617418352, "grad_norm": 1.6059880256652832, "learning_rate": 4.676118026740434e-06, "logits/chosen": 1.8196675777435303, "logits/rejected": 4.471251487731934, "logps/chosen": -545.8761596679688, "logps/rejected": -786.5194091796875, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -4.208722114562988, "rewards/margins": 13.019830703735352, "rewards/rejected": -17.228551864624023, "step": 764 }, { "epoch": 0.4758942457231726, "grad_norm": 8.153962135314941, "learning_rate": 4.674965421853389e-06, "logits/chosen": 3.151045322418213, "logits/rejected": 3.54077410697937, "logps/chosen": -668.2283935546875, "logps/rejected": -743.080078125, "loss": 0.06, "rewards/accuracies": 1.0, "rewards/chosen": -5.266658782958984, "rewards/margins": 11.068963050842285, "rewards/rejected": -16.335622787475586, "step": 765 }, { "epoch": 0.47651632970451013, "grad_norm": 41.73101806640625, "learning_rate": 4.673812816966344e-06, "logits/chosen": 0.876078188419342, "logits/rejected": 3.8836395740509033, "logps/chosen": -591.0263671875, "logps/rejected": -935.2633666992188, "loss": 1.4338, "rewards/accuracies": 0.875, "rewards/chosen": -6.700626373291016, "rewards/margins": 18.544769287109375, "rewards/rejected": -25.24539566040039, "step": 766 }, { "epoch": 0.4771384136858476, "grad_norm": 43.17827224731445, "learning_rate": 4.6726602120792995e-06, "logits/chosen": -1.3370412588119507, "logits/rejected": 1.3205068111419678, "logps/chosen": -465.7358703613281, "logps/rejected": -738.778076171875, "loss": 0.7697, "rewards/accuracies": 0.75, "rewards/chosen": -4.05903959274292, "rewards/margins": 13.253592491149902, "rewards/rejected": -17.312631607055664, "step": 767 }, { "epoch": 0.4777604976671851, "grad_norm": 0.40470781922340393, "learning_rate": 4.671507607192255e-06, "logits/chosen": 0.4932441711425781, "logits/rejected": 4.747807025909424, "logps/chosen": -420.48004150390625, "logps/rejected": -759.2723999023438, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -3.8618760108947754, "rewards/margins": 15.449785232543945, "rewards/rejected": -19.311660766601562, "step": 768 }, { "epoch": 0.47838258164852254, "grad_norm": 1.6781436204910278, "learning_rate": 4.67035500230521e-06, "logits/chosen": -0.32106083631515503, "logits/rejected": 2.2426936626434326, "logps/chosen": -293.17706298828125, "logps/rejected": -586.2684326171875, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -2.407588243484497, "rewards/margins": 13.709722518920898, "rewards/rejected": -16.1173095703125, "step": 769 }, { "epoch": 0.47900466562986005, "grad_norm": 0.521558403968811, "learning_rate": 4.669202397418165e-06, "logits/chosen": 0.8737245798110962, "logits/rejected": 2.6280784606933594, "logps/chosen": -627.6446533203125, "logps/rejected": -758.8953857421875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -5.146016597747803, "rewards/margins": 14.013367652893066, "rewards/rejected": -19.15938377380371, "step": 770 }, { "epoch": 0.4796267496111975, "grad_norm": 0.2913641333580017, "learning_rate": 4.66804979253112e-06, "logits/chosen": -1.9072849750518799, "logits/rejected": 2.029294013977051, "logps/chosen": -430.62786865234375, "logps/rejected": -845.602783203125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -3.7142653465270996, "rewards/margins": 19.053903579711914, "rewards/rejected": -22.76816749572754, "step": 771 }, { "epoch": 0.480248833592535, "grad_norm": 8.871782302856445, "learning_rate": 4.6668971876440765e-06, "logits/chosen": 0.41620588302612305, "logits/rejected": 3.4311208724975586, "logps/chosen": -433.32684326171875, "logps/rejected": -803.77783203125, "loss": 0.2005, "rewards/accuracies": 0.875, "rewards/chosen": -4.387638092041016, "rewards/margins": 16.02027702331543, "rewards/rejected": -20.407915115356445, "step": 772 }, { "epoch": 0.48087091757387246, "grad_norm": 1.6098644733428955, "learning_rate": 4.665744582757032e-06, "logits/chosen": 3.09476900100708, "logits/rejected": 2.489342212677002, "logps/chosen": -642.2359008789062, "logps/rejected": -781.3652954101562, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -5.92470645904541, "rewards/margins": 16.429569244384766, "rewards/rejected": -22.35427474975586, "step": 773 }, { "epoch": 0.48149300155520997, "grad_norm": 4.467459678649902, "learning_rate": 4.664591977869987e-06, "logits/chosen": 0.6544711589813232, "logits/rejected": 4.949759483337402, "logps/chosen": -458.5473937988281, "logps/rejected": -877.2110595703125, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": -4.6345696449279785, "rewards/margins": 20.904634475708008, "rewards/rejected": -25.539203643798828, "step": 774 }, { "epoch": 0.4821150855365474, "grad_norm": 2.7226791381835938, "learning_rate": 4.663439372982942e-06, "logits/chosen": -0.25376057624816895, "logits/rejected": 3.4412503242492676, "logps/chosen": -515.607177734375, "logps/rejected": -887.7158813476562, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -1.2038459777832031, "rewards/margins": 14.594051361083984, "rewards/rejected": -15.797898292541504, "step": 775 }, { "epoch": 0.4827371695178849, "grad_norm": 19.828170776367188, "learning_rate": 4.662286768095897e-06, "logits/chosen": 2.055224895477295, "logits/rejected": 3.848604440689087, "logps/chosen": -650.3723754882812, "logps/rejected": -948.07470703125, "loss": 0.1521, "rewards/accuracies": 0.875, "rewards/chosen": -8.56473445892334, "rewards/margins": 17.763263702392578, "rewards/rejected": -26.3279972076416, "step": 776 }, { "epoch": 0.4833592534992224, "grad_norm": 30.034021377563477, "learning_rate": 4.661134163208853e-06, "logits/chosen": 0.7829601764678955, "logits/rejected": 3.1941311359405518, "logps/chosen": -531.4030151367188, "logps/rejected": -833.4378051757812, "loss": 1.231, "rewards/accuracies": 0.875, "rewards/chosen": -5.631799221038818, "rewards/margins": 16.359912872314453, "rewards/rejected": -21.991710662841797, "step": 777 }, { "epoch": 0.4839813374805599, "grad_norm": 26.594362258911133, "learning_rate": 4.659981558321808e-06, "logits/chosen": -1.7318233251571655, "logits/rejected": 0.6387461423873901, "logps/chosen": -417.7113952636719, "logps/rejected": -671.1884765625, "loss": 0.4294, "rewards/accuracies": 0.875, "rewards/chosen": -2.4069485664367676, "rewards/margins": 14.395118713378906, "rewards/rejected": -16.80206871032715, "step": 778 }, { "epoch": 0.48460342146189733, "grad_norm": 0.039601147174835205, "learning_rate": 4.658828953434763e-06, "logits/chosen": 1.3546221256256104, "logits/rejected": 3.462092399597168, "logps/chosen": -591.6671142578125, "logps/rejected": -932.0730590820312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.906269550323486, "rewards/margins": 20.576311111450195, "rewards/rejected": -28.482580184936523, "step": 779 }, { "epoch": 0.48522550544323484, "grad_norm": 0.25924599170684814, "learning_rate": 4.657676348547718e-06, "logits/chosen": -2.0941567420959473, "logits/rejected": 2.815412759780884, "logps/chosen": -234.81979370117188, "logps/rejected": -680.761474609375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -3.9445247650146484, "rewards/margins": 19.434349060058594, "rewards/rejected": -23.37887191772461, "step": 780 }, { "epoch": 0.4858475894245723, "grad_norm": 40.54834747314453, "learning_rate": 4.6565237436606735e-06, "logits/chosen": -2.5595715045928955, "logits/rejected": 1.2172040939331055, "logps/chosen": -449.4385986328125, "logps/rejected": -810.2548828125, "loss": 1.0862, "rewards/accuracies": 0.75, "rewards/chosen": -4.879364490509033, "rewards/margins": 14.449647903442383, "rewards/rejected": -19.329011917114258, "step": 781 }, { "epoch": 0.4864696734059098, "grad_norm": 0.00287282164208591, "learning_rate": 4.655371138773629e-06, "logits/chosen": -0.0861775279045105, "logits/rejected": 1.4691548347473145, "logps/chosen": -417.1506652832031, "logps/rejected": -736.8383178710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.55625057220459, "rewards/margins": 19.743030548095703, "rewards/rejected": -26.29928207397461, "step": 782 }, { "epoch": 0.4870917573872473, "grad_norm": 0.6697667837142944, "learning_rate": 4.654218533886584e-06, "logits/chosen": 0.8768869638442993, "logits/rejected": 3.338918685913086, "logps/chosen": -578.1866455078125, "logps/rejected": -951.1362915039062, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -6.037720680236816, "rewards/margins": 21.80088996887207, "rewards/rejected": -27.83860969543457, "step": 783 }, { "epoch": 0.48771384136858476, "grad_norm": 13.243257522583008, "learning_rate": 4.653065928999539e-06, "logits/chosen": 0.5735681653022766, "logits/rejected": 1.3145192861557007, "logps/chosen": -499.7567138671875, "logps/rejected": -654.567626953125, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": -2.3559441566467285, "rewards/margins": 11.995672225952148, "rewards/rejected": -14.351615905761719, "step": 784 }, { "epoch": 0.48833592534992226, "grad_norm": 2.240255832672119, "learning_rate": 4.651913324112494e-06, "logits/chosen": 0.1266368329524994, "logits/rejected": 3.817394733428955, "logps/chosen": -391.1224365234375, "logps/rejected": -742.4867553710938, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -2.3036351203918457, "rewards/margins": 17.926546096801758, "rewards/rejected": -20.230178833007812, "step": 785 }, { "epoch": 0.4889580093312597, "grad_norm": 0.12976962327957153, "learning_rate": 4.6507607192254504e-06, "logits/chosen": 0.6222423315048218, "logits/rejected": 4.154601097106934, "logps/chosen": -553.1651000976562, "logps/rejected": -969.9971923828125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -6.439103126525879, "rewards/margins": 21.052276611328125, "rewards/rejected": -27.49138069152832, "step": 786 }, { "epoch": 0.4895800933125972, "grad_norm": 12.953614234924316, "learning_rate": 4.649608114338406e-06, "logits/chosen": -1.1646008491516113, "logits/rejected": 2.2630698680877686, "logps/chosen": -446.870361328125, "logps/rejected": -865.9774169921875, "loss": 0.0981, "rewards/accuracies": 0.875, "rewards/chosen": -7.6477274894714355, "rewards/margins": 17.280902862548828, "rewards/rejected": -24.928630828857422, "step": 787 }, { "epoch": 0.49020217729393467, "grad_norm": 24.662870407104492, "learning_rate": 4.648455509451361e-06, "logits/chosen": 2.1699166297912598, "logits/rejected": 3.7511253356933594, "logps/chosen": -555.0864868164062, "logps/rejected": -872.4273681640625, "loss": 0.5918, "rewards/accuracies": 0.875, "rewards/chosen": -5.620430946350098, "rewards/margins": 18.23769187927246, "rewards/rejected": -23.858123779296875, "step": 788 }, { "epoch": 0.4908242612752722, "grad_norm": 0.28288745880126953, "learning_rate": 4.647302904564316e-06, "logits/chosen": -0.5442591905593872, "logits/rejected": 3.1663870811462402, "logps/chosen": -431.5106506347656, "logps/rejected": -803.7391357421875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -5.360430717468262, "rewards/margins": 13.79015827178955, "rewards/rejected": -19.150588989257812, "step": 789 }, { "epoch": 0.49144634525660963, "grad_norm": 26.9351863861084, "learning_rate": 4.646150299677271e-06, "logits/chosen": -0.5746155381202698, "logits/rejected": 2.7818684577941895, "logps/chosen": -447.7415466308594, "logps/rejected": -706.74072265625, "loss": 0.3673, "rewards/accuracies": 0.875, "rewards/chosen": -3.7811388969421387, "rewards/margins": 11.556182861328125, "rewards/rejected": -15.337322235107422, "step": 790 }, { "epoch": 0.49206842923794714, "grad_norm": 18.7960262298584, "learning_rate": 4.6449976947902266e-06, "logits/chosen": -0.4236619472503662, "logits/rejected": 3.328354835510254, "logps/chosen": -433.424560546875, "logps/rejected": -815.5387573242188, "loss": 0.3204, "rewards/accuracies": 0.875, "rewards/chosen": -6.099715232849121, "rewards/margins": 17.299833297729492, "rewards/rejected": -23.399547576904297, "step": 791 }, { "epoch": 0.4926905132192846, "grad_norm": 0.0008821140509098768, "learning_rate": 4.643845089903182e-06, "logits/chosen": -0.3153393268585205, "logits/rejected": 3.704957962036133, "logps/chosen": -450.7311096191406, "logps/rejected": -864.4533081054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.469179391860962, "rewards/margins": 22.34432029724121, "rewards/rejected": -25.813501358032227, "step": 792 }, { "epoch": 0.4933125972006221, "grad_norm": 0.9906829595565796, "learning_rate": 4.642692485016137e-06, "logits/chosen": -0.16309303045272827, "logits/rejected": 4.056373596191406, "logps/chosen": -536.8864135742188, "logps/rejected": -936.7313232421875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -6.904061317443848, "rewards/margins": 19.377239227294922, "rewards/rejected": -26.281301498413086, "step": 793 }, { "epoch": 0.49393468118195955, "grad_norm": 4.4242730140686035, "learning_rate": 4.641539880129092e-06, "logits/chosen": 1.683719515800476, "logits/rejected": 3.134469747543335, "logps/chosen": -638.5487060546875, "logps/rejected": -888.1138916015625, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -5.161971569061279, "rewards/margins": 17.2728271484375, "rewards/rejected": -22.434797286987305, "step": 794 }, { "epoch": 0.49455676516329705, "grad_norm": 0.9904939532279968, "learning_rate": 4.6403872752420475e-06, "logits/chosen": 0.6282204985618591, "logits/rejected": 2.4573426246643066, "logps/chosen": -596.8509521484375, "logps/rejected": -819.6409301757812, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -8.868796348571777, "rewards/margins": 16.89417266845703, "rewards/rejected": -25.762969970703125, "step": 795 }, { "epoch": 0.4951788491446345, "grad_norm": 0.050002530217170715, "learning_rate": 4.639234670355003e-06, "logits/chosen": 0.26788705587387085, "logits/rejected": 4.157488822937012, "logps/chosen": -586.4090576171875, "logps/rejected": -960.6173095703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.250540733337402, "rewards/margins": 18.535598754882812, "rewards/rejected": -23.78614044189453, "step": 796 }, { "epoch": 0.495800933125972, "grad_norm": 0.3074765205383301, "learning_rate": 4.638082065467958e-06, "logits/chosen": 2.1239802837371826, "logits/rejected": 4.563276767730713, "logps/chosen": -500.58929443359375, "logps/rejected": -899.6068115234375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -7.407536029815674, "rewards/margins": 16.969867706298828, "rewards/rejected": -24.377403259277344, "step": 797 }, { "epoch": 0.49642301710730946, "grad_norm": 0.8631083965301514, "learning_rate": 4.636929460580913e-06, "logits/chosen": 0.07526445388793945, "logits/rejected": 1.3133467435836792, "logps/chosen": -547.68994140625, "logps/rejected": -817.7927856445312, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -6.602295398712158, "rewards/margins": 11.569221496582031, "rewards/rejected": -18.17151641845703, "step": 798 }, { "epoch": 0.49704510108864697, "grad_norm": 0.0004819149326067418, "learning_rate": 4.635776855693868e-06, "logits/chosen": -1.4631166458129883, "logits/rejected": 4.0301384925842285, "logps/chosen": -348.5102233886719, "logps/rejected": -859.6078491210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.0662336349487305, "rewards/margins": 19.39805030822754, "rewards/rejected": -24.464282989501953, "step": 799 }, { "epoch": 0.4976671850699845, "grad_norm": 1.7157622575759888, "learning_rate": 4.634624250806824e-06, "logits/chosen": 1.9659500122070312, "logits/rejected": 3.541919708251953, "logps/chosen": -578.5076904296875, "logps/rejected": -742.12841796875, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -3.218245506286621, "rewards/margins": 15.700920104980469, "rewards/rejected": -18.919166564941406, "step": 800 }, { "epoch": 0.4982892690513219, "grad_norm": 13.794669151306152, "learning_rate": 4.63347164591978e-06, "logits/chosen": 3.571282386779785, "logits/rejected": 4.697101593017578, "logps/chosen": -663.9895629882812, "logps/rejected": -960.5087890625, "loss": 0.0757, "rewards/accuracies": 1.0, "rewards/chosen": -7.03059196472168, "rewards/margins": 18.64733123779297, "rewards/rejected": -25.677921295166016, "step": 801 }, { "epoch": 0.49891135303265943, "grad_norm": 4.653563022613525, "learning_rate": 4.632319041032735e-06, "logits/chosen": -0.7853131890296936, "logits/rejected": 2.139136552810669, "logps/chosen": -603.5784912109375, "logps/rejected": -852.1097412109375, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": -5.409650802612305, "rewards/margins": 11.422354698181152, "rewards/rejected": -16.83200454711914, "step": 802 }, { "epoch": 0.4995334370139969, "grad_norm": 27.104677200317383, "learning_rate": 4.63116643614569e-06, "logits/chosen": -1.9991862773895264, "logits/rejected": 3.2186059951782227, "logps/chosen": -384.8690185546875, "logps/rejected": -877.2080078125, "loss": 0.1709, "rewards/accuracies": 0.875, "rewards/chosen": -5.625671863555908, "rewards/margins": 17.056589126586914, "rewards/rejected": -22.682260513305664, "step": 803 }, { "epoch": 0.5001555209953343, "grad_norm": 0.00779650267213583, "learning_rate": 4.630013831258645e-06, "logits/chosen": -1.2525960206985474, "logits/rejected": 3.466864585876465, "logps/chosen": -509.2117919921875, "logps/rejected": -991.336181640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.337059020996094, "rewards/margins": 22.115440368652344, "rewards/rejected": -26.45250129699707, "step": 804 }, { "epoch": 0.5007776049766719, "grad_norm": 35.081085205078125, "learning_rate": 4.6288612263716006e-06, "logits/chosen": 1.4515666961669922, "logits/rejected": 2.7874021530151367, "logps/chosen": -558.619140625, "logps/rejected": -708.9620361328125, "loss": 0.3746, "rewards/accuracies": 0.875, "rewards/chosen": -3.3137013912200928, "rewards/margins": 13.120676040649414, "rewards/rejected": -16.434377670288086, "step": 805 }, { "epoch": 0.5013996889580093, "grad_norm": 0.13099785149097443, "learning_rate": 4.627708621484556e-06, "logits/chosen": -0.2925226092338562, "logits/rejected": 2.9255149364471436, "logps/chosen": -508.7183837890625, "logps/rejected": -914.4775390625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -6.891010284423828, "rewards/margins": 18.918092727661133, "rewards/rejected": -25.809104919433594, "step": 806 }, { "epoch": 0.5020217729393468, "grad_norm": 16.45452880859375, "learning_rate": 4.626556016597511e-06, "logits/chosen": 0.7290836572647095, "logits/rejected": 3.254908323287964, "logps/chosen": -575.7127685546875, "logps/rejected": -794.7529296875, "loss": 0.1445, "rewards/accuracies": 1.0, "rewards/chosen": -7.9818434715271, "rewards/margins": 11.375015258789062, "rewards/rejected": -19.35685920715332, "step": 807 }, { "epoch": 0.5026438569206843, "grad_norm": 1.7199348211288452, "learning_rate": 4.625403411710466e-06, "logits/chosen": 4.098562717437744, "logits/rejected": 5.463818073272705, "logps/chosen": -594.138671875, "logps/rejected": -830.6209716796875, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -4.791903495788574, "rewards/margins": 15.993047714233398, "rewards/rejected": -20.784954071044922, "step": 808 }, { "epoch": 0.5032659409020218, "grad_norm": 0.016061756759881973, "learning_rate": 4.6242508068234215e-06, "logits/chosen": -2.1252896785736084, "logits/rejected": 2.6811745166778564, "logps/chosen": -316.495361328125, "logps/rejected": -750.6249389648438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.024198055267334, "rewards/margins": 18.38684844970703, "rewards/rejected": -23.41104507446289, "step": 809 }, { "epoch": 0.5038880248833593, "grad_norm": 1.1880834102630615, "learning_rate": 4.623098201936377e-06, "logits/chosen": 2.4408676624298096, "logits/rejected": 3.9343366622924805, "logps/chosen": -487.645263671875, "logps/rejected": -694.6914672851562, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -6.5457634925842285, "rewards/margins": 16.69846534729004, "rewards/rejected": -23.244230270385742, "step": 810 }, { "epoch": 0.5045101088646967, "grad_norm": 27.158384323120117, "learning_rate": 4.621945597049332e-06, "logits/chosen": 2.4023821353912354, "logits/rejected": 5.048232078552246, "logps/chosen": -527.9765625, "logps/rejected": -850.1030883789062, "loss": 0.2794, "rewards/accuracies": 0.875, "rewards/chosen": -4.261188507080078, "rewards/margins": 13.181292533874512, "rewards/rejected": -17.442481994628906, "step": 811 }, { "epoch": 0.5051321928460342, "grad_norm": 0.00033380460808984935, "learning_rate": 4.620792992162287e-06, "logits/chosen": -1.0781645774841309, "logits/rejected": 3.9537434577941895, "logps/chosen": -447.1126708984375, "logps/rejected": -944.2455444335938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.95174503326416, "rewards/margins": 22.1113338470459, "rewards/rejected": -27.063077926635742, "step": 812 }, { "epoch": 0.5057542768273717, "grad_norm": 3.74644914700184e-05, "learning_rate": 4.619640387275242e-06, "logits/chosen": 1.647376298904419, "logits/rejected": 3.9168145656585693, "logps/chosen": -501.5458984375, "logps/rejected": -923.9151611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.664668083190918, "rewards/margins": 23.443607330322266, "rewards/rejected": -30.108272552490234, "step": 813 }, { "epoch": 0.5063763608087092, "grad_norm": 3.1582701467414154e-06, "learning_rate": 4.618487782388198e-06, "logits/chosen": -0.49896568059921265, "logits/rejected": 2.6935436725616455, "logps/chosen": -384.57977294921875, "logps/rejected": -788.4315185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.673390865325928, "rewards/margins": 22.454517364501953, "rewards/rejected": -27.127906799316406, "step": 814 }, { "epoch": 0.5069984447900466, "grad_norm": 0.014756478369235992, "learning_rate": 4.617335177501153e-06, "logits/chosen": 0.6831048727035522, "logits/rejected": 4.275322437286377, "logps/chosen": -511.0201110839844, "logps/rejected": -856.8211669921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.114786624908447, "rewards/margins": 18.40111541748047, "rewards/rejected": -22.51590347290039, "step": 815 }, { "epoch": 0.5076205287713841, "grad_norm": 17.718061447143555, "learning_rate": 4.616182572614109e-06, "logits/chosen": 1.340681552886963, "logits/rejected": 3.1322154998779297, "logps/chosen": -622.8712158203125, "logps/rejected": -1000.7344360351562, "loss": 0.108, "rewards/accuracies": 0.875, "rewards/chosen": -6.9386420249938965, "rewards/margins": 22.70752716064453, "rewards/rejected": -29.646167755126953, "step": 816 }, { "epoch": 0.5082426127527216, "grad_norm": 22.104307174682617, "learning_rate": 4.615029967727064e-06, "logits/chosen": -2.7415151596069336, "logits/rejected": 2.7648801803588867, "logps/chosen": -374.72784423828125, "logps/rejected": -899.2406616210938, "loss": 0.2482, "rewards/accuracies": 0.875, "rewards/chosen": -3.962153673171997, "rewards/margins": 18.367874145507812, "rewards/rejected": -22.330028533935547, "step": 817 }, { "epoch": 0.5088646967340591, "grad_norm": 32.42689514160156, "learning_rate": 4.613877362840019e-06, "logits/chosen": 1.2072477340698242, "logits/rejected": 3.5181543827056885, "logps/chosen": -589.1121215820312, "logps/rejected": -896.8251953125, "loss": 0.8031, "rewards/accuracies": 0.875, "rewards/chosen": -6.592341899871826, "rewards/margins": 17.727842330932617, "rewards/rejected": -24.32018280029297, "step": 818 }, { "epoch": 0.5094867807153965, "grad_norm": 11.6628999710083, "learning_rate": 4.6127247579529746e-06, "logits/chosen": 1.7232805490493774, "logits/rejected": 3.776496171951294, "logps/chosen": -568.9671630859375, "logps/rejected": -874.9674072265625, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": -8.510915756225586, "rewards/margins": 19.880298614501953, "rewards/rejected": -28.391212463378906, "step": 819 }, { "epoch": 0.5101088646967341, "grad_norm": 0.7271249890327454, "learning_rate": 4.61157215306593e-06, "logits/chosen": 3.2940382957458496, "logits/rejected": 4.654926300048828, "logps/chosen": -770.0858154296875, "logps/rejected": -996.1089477539062, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -7.865810394287109, "rewards/margins": 19.61363983154297, "rewards/rejected": -27.479454040527344, "step": 820 }, { "epoch": 0.5107309486780716, "grad_norm": 1.074064016342163, "learning_rate": 4.610419548178885e-06, "logits/chosen": 1.2415605783462524, "logits/rejected": 2.915865182876587, "logps/chosen": -571.21728515625, "logps/rejected": -850.7125854492188, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -7.833264350891113, "rewards/margins": 13.54500961303711, "rewards/rejected": -21.378273010253906, "step": 821 }, { "epoch": 0.511353032659409, "grad_norm": 0.013838615268468857, "learning_rate": 4.609266943291839e-06, "logits/chosen": -2.0438036918640137, "logits/rejected": 1.6256415843963623, "logps/chosen": -343.5359802246094, "logps/rejected": -786.7481689453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.463273763656616, "rewards/margins": 24.050792694091797, "rewards/rejected": -27.514068603515625, "step": 822 }, { "epoch": 0.5119751166407465, "grad_norm": 0.001235920935869217, "learning_rate": 4.608114338404795e-06, "logits/chosen": -0.3681415319442749, "logits/rejected": 2.907719135284424, "logps/chosen": -496.8587646484375, "logps/rejected": -865.68701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.546807765960693, "rewards/margins": 21.944412231445312, "rewards/rejected": -26.49121856689453, "step": 823 }, { "epoch": 0.512597200622084, "grad_norm": 12.031455039978027, "learning_rate": 4.60696173351775e-06, "logits/chosen": 0.4697137773036957, "logits/rejected": 4.086321830749512, "logps/chosen": -567.7445678710938, "logps/rejected": -843.5440673828125, "loss": 0.0623, "rewards/accuracies": 1.0, "rewards/chosen": -5.043512344360352, "rewards/margins": 12.916199684143066, "rewards/rejected": -17.9597110748291, "step": 824 }, { "epoch": 0.5132192846034215, "grad_norm": 23.215635299682617, "learning_rate": 4.605809128630706e-06, "logits/chosen": 0.749940037727356, "logits/rejected": 3.3669252395629883, "logps/chosen": -502.6315612792969, "logps/rejected": -741.6364135742188, "loss": 0.6005, "rewards/accuracies": 0.875, "rewards/chosen": -4.730608940124512, "rewards/margins": 13.164880752563477, "rewards/rejected": -17.895488739013672, "step": 825 }, { "epoch": 0.5138413685847589, "grad_norm": 0.6654096841812134, "learning_rate": 4.604656523743661e-06, "logits/chosen": 0.640193521976471, "logits/rejected": 3.969430685043335, "logps/chosen": -396.957763671875, "logps/rejected": -793.5340576171875, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -4.256521701812744, "rewards/margins": 13.162273406982422, "rewards/rejected": -17.418794631958008, "step": 826 }, { "epoch": 0.5144634525660964, "grad_norm": 22.310516357421875, "learning_rate": 4.603503918856616e-06, "logits/chosen": -0.8501242399215698, "logits/rejected": 3.2708749771118164, "logps/chosen": -453.19781494140625, "logps/rejected": -840.7752685546875, "loss": 0.4233, "rewards/accuracies": 0.75, "rewards/chosen": -3.1451218128204346, "rewards/margins": 17.685771942138672, "rewards/rejected": -20.830896377563477, "step": 827 }, { "epoch": 0.5150855365474339, "grad_norm": 16.61135482788086, "learning_rate": 4.602351313969572e-06, "logits/chosen": 1.2462623119354248, "logits/rejected": 5.105804920196533, "logps/chosen": -500.0732421875, "logps/rejected": -918.1619873046875, "loss": 0.1104, "rewards/accuracies": 0.875, "rewards/chosen": -4.89671516418457, "rewards/margins": 21.1292724609375, "rewards/rejected": -26.025989532470703, "step": 828 }, { "epoch": 0.5157076205287714, "grad_norm": 0.044712089002132416, "learning_rate": 4.601198709082527e-06, "logits/chosen": -1.1490451097488403, "logits/rejected": 2.0136399269104004, "logps/chosen": -406.8626708984375, "logps/rejected": -761.128662109375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.640765190124512, "rewards/margins": 17.16360855102539, "rewards/rejected": -24.804372787475586, "step": 829 }, { "epoch": 0.5163297045101088, "grad_norm": 15.269174575805664, "learning_rate": 4.600046104195482e-06, "logits/chosen": 0.6019724607467651, "logits/rejected": 3.838488817214966, "logps/chosen": -618.8802490234375, "logps/rejected": -999.3819580078125, "loss": 0.1088, "rewards/accuracies": 0.875, "rewards/chosen": -2.692051410675049, "rewards/margins": 17.41494369506836, "rewards/rejected": -20.10699462890625, "step": 830 }, { "epoch": 0.5169517884914463, "grad_norm": 0.014774742536246777, "learning_rate": 4.598893499308437e-06, "logits/chosen": -2.8291544914245605, "logits/rejected": 3.349138021469116, "logps/chosen": -300.6230163574219, "logps/rejected": -764.5264892578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.240640163421631, "rewards/margins": 20.8753719329834, "rewards/rejected": -23.116012573242188, "step": 831 }, { "epoch": 0.5175738724727839, "grad_norm": 1.0063591003417969, "learning_rate": 4.5977408944213925e-06, "logits/chosen": -0.15792837738990784, "logits/rejected": 4.523929595947266, "logps/chosen": -366.80047607421875, "logps/rejected": -839.95703125, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -4.910074234008789, "rewards/margins": 15.645296096801758, "rewards/rejected": -20.555370330810547, "step": 832 }, { "epoch": 0.5181959564541213, "grad_norm": 26.251590728759766, "learning_rate": 4.596588289534348e-06, "logits/chosen": 1.309356451034546, "logits/rejected": 3.279106616973877, "logps/chosen": -526.9833984375, "logps/rejected": -892.2254028320312, "loss": 0.2489, "rewards/accuracies": 0.875, "rewards/chosen": -4.946220397949219, "rewards/margins": 24.749940872192383, "rewards/rejected": -29.69615936279297, "step": 833 }, { "epoch": 0.5188180404354588, "grad_norm": 0.0008679351885803044, "learning_rate": 4.595435684647303e-06, "logits/chosen": -0.7757160663604736, "logits/rejected": 3.746737003326416, "logps/chosen": -412.482666015625, "logps/rejected": -899.3509521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5694308280944824, "rewards/margins": 18.884733200073242, "rewards/rejected": -22.454164505004883, "step": 834 }, { "epoch": 0.5194401244167963, "grad_norm": 0.44425633549690247, "learning_rate": 4.594283079760258e-06, "logits/chosen": -1.775087594985962, "logits/rejected": 2.158067464828491, "logps/chosen": -380.7835693359375, "logps/rejected": -724.4786376953125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -6.430812835693359, "rewards/margins": 15.602945327758789, "rewards/rejected": -22.03375816345215, "step": 835 }, { "epoch": 0.5200622083981338, "grad_norm": 1.231345295906067, "learning_rate": 4.593130474873213e-06, "logits/chosen": 1.7987587451934814, "logits/rejected": 3.159287691116333, "logps/chosen": -584.3314208984375, "logps/rejected": -882.6793212890625, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -8.365367889404297, "rewards/margins": 22.243616104125977, "rewards/rejected": -30.608985900878906, "step": 836 }, { "epoch": 0.5206842923794712, "grad_norm": 0.0006359569961205125, "learning_rate": 4.591977869986169e-06, "logits/chosen": -0.4184119701385498, "logits/rejected": 1.7271060943603516, "logps/chosen": -530.0023193359375, "logps/rejected": -953.5392456054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.454473495483398, "rewards/margins": 27.001876831054688, "rewards/rejected": -33.45635223388672, "step": 837 }, { "epoch": 0.5213063763608087, "grad_norm": 35.898162841796875, "learning_rate": 4.590825265099124e-06, "logits/chosen": 1.0566655397415161, "logits/rejected": 3.6799302101135254, "logps/chosen": -493.8349609375, "logps/rejected": -713.1172485351562, "loss": 0.251, "rewards/accuracies": 0.75, "rewards/chosen": -7.644853115081787, "rewards/margins": 11.174239158630371, "rewards/rejected": -18.819091796875, "step": 838 }, { "epoch": 0.5219284603421462, "grad_norm": 4.95714573389705e-07, "learning_rate": 4.589672660212079e-06, "logits/chosen": -0.5007285475730896, "logits/rejected": 4.842340469360352, "logps/chosen": -400.39447021484375, "logps/rejected": -1001.4553833007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.2126622200012207, "rewards/margins": 32.23038864135742, "rewards/rejected": -35.443050384521484, "step": 839 }, { "epoch": 0.5225505443234837, "grad_norm": 33.6435546875, "learning_rate": 4.588520055325035e-06, "logits/chosen": 2.3923635482788086, "logits/rejected": 3.9905447959899902, "logps/chosen": -576.5003051757812, "logps/rejected": -802.433837890625, "loss": 1.0211, "rewards/accuracies": 0.875, "rewards/chosen": -7.508181095123291, "rewards/margins": 14.160221099853516, "rewards/rejected": -21.66840171813965, "step": 840 }, { "epoch": 0.5231726283048211, "grad_norm": 3.987846612930298, "learning_rate": 4.58736745043799e-06, "logits/chosen": 1.476585030555725, "logits/rejected": 3.4213404655456543, "logps/chosen": -489.3512268066406, "logps/rejected": -753.6704711914062, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": -6.481175899505615, "rewards/margins": 15.506742477416992, "rewards/rejected": -21.987918853759766, "step": 841 }, { "epoch": 0.5237947122861586, "grad_norm": 0.2226215898990631, "learning_rate": 4.5862148455509456e-06, "logits/chosen": -0.1877286732196808, "logits/rejected": 3.0288901329040527, "logps/chosen": -532.8231811523438, "logps/rejected": -876.0736694335938, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -6.797203540802002, "rewards/margins": 18.92719078063965, "rewards/rejected": -25.724393844604492, "step": 842 }, { "epoch": 0.5244167962674962, "grad_norm": 37.902252197265625, "learning_rate": 4.585062240663901e-06, "logits/chosen": 2.5677266120910645, "logits/rejected": 3.460753917694092, "logps/chosen": -683.076904296875, "logps/rejected": -910.3892211914062, "loss": 0.7027, "rewards/accuracies": 0.875, "rewards/chosen": -5.876817226409912, "rewards/margins": 19.93701171875, "rewards/rejected": -25.81382942199707, "step": 843 }, { "epoch": 0.5250388802488336, "grad_norm": 30.997549057006836, "learning_rate": 4.583909635776856e-06, "logits/chosen": 1.2948592901229858, "logits/rejected": 3.858691453933716, "logps/chosen": -622.263671875, "logps/rejected": -937.9315185546875, "loss": 0.3479, "rewards/accuracies": 0.875, "rewards/chosen": -9.006096839904785, "rewards/margins": 20.229141235351562, "rewards/rejected": -29.23523712158203, "step": 844 }, { "epoch": 0.5256609642301711, "grad_norm": 29.12040138244629, "learning_rate": 4.582757030889811e-06, "logits/chosen": 1.0461492538452148, "logits/rejected": 4.793246269226074, "logps/chosen": -491.78057861328125, "logps/rejected": -799.749755859375, "loss": 0.769, "rewards/accuracies": 0.875, "rewards/chosen": -6.587861061096191, "rewards/margins": 10.82066822052002, "rewards/rejected": -17.40852928161621, "step": 845 }, { "epoch": 0.5262830482115085, "grad_norm": 0.0004020752676296979, "learning_rate": 4.5816044260027665e-06, "logits/chosen": -1.9583938121795654, "logits/rejected": 2.0443408489227295, "logps/chosen": -410.334228515625, "logps/rejected": -881.2435913085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.354313850402832, "rewards/margins": 25.540056228637695, "rewards/rejected": -31.89436912536621, "step": 846 }, { "epoch": 0.5269051321928461, "grad_norm": 0.008207093924283981, "learning_rate": 4.580451821115722e-06, "logits/chosen": 0.43992137908935547, "logits/rejected": 0.8132442235946655, "logps/chosen": -625.908203125, "logps/rejected": -858.2850341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.035774230957031, "rewards/margins": 20.796154022216797, "rewards/rejected": -30.831926345825195, "step": 847 }, { "epoch": 0.5275272161741835, "grad_norm": 1.043589691107627e-05, "learning_rate": 4.579299216228677e-06, "logits/chosen": -1.982966423034668, "logits/rejected": 2.495979070663452, "logps/chosen": -449.64886474609375, "logps/rejected": -1031.390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.190682888031006, "rewards/margins": 23.95623016357422, "rewards/rejected": -30.14691162109375, "step": 848 }, { "epoch": 0.528149300155521, "grad_norm": 10.80439281463623, "learning_rate": 4.578146611341632e-06, "logits/chosen": 2.5422017574310303, "logits/rejected": 4.219742774963379, "logps/chosen": -570.5355224609375, "logps/rejected": -833.507568359375, "loss": 0.062, "rewards/accuracies": 1.0, "rewards/chosen": -6.7938432693481445, "rewards/margins": 19.633541107177734, "rewards/rejected": -26.427385330200195, "step": 849 }, { "epoch": 0.5287713841368584, "grad_norm": 0.1870967447757721, "learning_rate": 4.576994006454587e-06, "logits/chosen": -2.3501226902008057, "logits/rejected": 2.8784193992614746, "logps/chosen": -357.7137451171875, "logps/rejected": -834.7598876953125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.8307647705078125, "rewards/margins": 20.596311569213867, "rewards/rejected": -28.42707633972168, "step": 850 }, { "epoch": 0.529393468118196, "grad_norm": 8.725545883178711, "learning_rate": 4.575841401567543e-06, "logits/chosen": 1.067751407623291, "logits/rejected": 2.8930211067199707, "logps/chosen": -585.4593505859375, "logps/rejected": -889.775390625, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": -6.797390460968018, "rewards/margins": 16.867427825927734, "rewards/rejected": -23.664817810058594, "step": 851 }, { "epoch": 0.5300155520995334, "grad_norm": 1.1312555074691772, "learning_rate": 4.574688796680498e-06, "logits/chosen": -0.3240584135055542, "logits/rejected": 1.6172173023223877, "logps/chosen": -376.5591735839844, "logps/rejected": -594.09375, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -5.678043842315674, "rewards/margins": 12.331840515136719, "rewards/rejected": -18.009883880615234, "step": 852 }, { "epoch": 0.5306376360808709, "grad_norm": 2.2278480529785156, "learning_rate": 4.573536191793453e-06, "logits/chosen": 0.935234546661377, "logits/rejected": 3.973618507385254, "logps/chosen": -482.78704833984375, "logps/rejected": -930.9962158203125, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -8.005084991455078, "rewards/margins": 25.713951110839844, "rewards/rejected": -33.71903610229492, "step": 853 }, { "epoch": 0.5312597200622085, "grad_norm": 25.2884464263916, "learning_rate": 4.572383586906409e-06, "logits/chosen": -2.831954002380371, "logits/rejected": 3.0417380332946777, "logps/chosen": -374.65911865234375, "logps/rejected": -950.072509765625, "loss": 1.4801, "rewards/accuracies": 0.875, "rewards/chosen": -8.000370025634766, "rewards/margins": 23.205615997314453, "rewards/rejected": -31.20598602294922, "step": 854 }, { "epoch": 0.5318818040435459, "grad_norm": 0.008000146597623825, "learning_rate": 4.571230982019364e-06, "logits/chosen": 0.9278247356414795, "logits/rejected": 4.047576427459717, "logps/chosen": -548.40087890625, "logps/rejected": -874.353271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.476536750793457, "rewards/margins": 16.574291229248047, "rewards/rejected": -27.05082893371582, "step": 855 }, { "epoch": 0.5325038880248834, "grad_norm": 10.3145751953125, "learning_rate": 4.5700783771323196e-06, "logits/chosen": -2.642561674118042, "logits/rejected": 1.6923301219940186, "logps/chosen": -393.0373229980469, "logps/rejected": -858.5068359375, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": -5.599725723266602, "rewards/margins": 19.435718536376953, "rewards/rejected": -25.035446166992188, "step": 856 }, { "epoch": 0.5331259720062208, "grad_norm": 0.030150998383760452, "learning_rate": 4.568925772245275e-06, "logits/chosen": 1.1801414489746094, "logits/rejected": 3.758105993270874, "logps/chosen": -503.06121826171875, "logps/rejected": -868.5562744140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.317437171936035, "rewards/margins": 21.45519256591797, "rewards/rejected": -29.77263069152832, "step": 857 }, { "epoch": 0.5337480559875584, "grad_norm": 29.704832077026367, "learning_rate": 4.56777316735823e-06, "logits/chosen": -2.479341506958008, "logits/rejected": 2.4226291179656982, "logps/chosen": -396.0041809082031, "logps/rejected": -965.6904296875, "loss": 0.4235, "rewards/accuracies": 0.875, "rewards/chosen": -6.2071075439453125, "rewards/margins": 19.336469650268555, "rewards/rejected": -25.5435791015625, "step": 858 }, { "epoch": 0.5343701399688958, "grad_norm": 0.005256436299532652, "learning_rate": 4.566620562471185e-06, "logits/chosen": -1.4652228355407715, "logits/rejected": 2.630767345428467, "logps/chosen": -484.7992858886719, "logps/rejected": -940.8848266601562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.632268905639648, "rewards/margins": 24.89701271057129, "rewards/rejected": -31.529281616210938, "step": 859 }, { "epoch": 0.5349922239502333, "grad_norm": 0.4753815829753876, "learning_rate": 4.5654679575841405e-06, "logits/chosen": 0.8926655054092407, "logits/rejected": 3.2669153213500977, "logps/chosen": -473.52288818359375, "logps/rejected": -825.9222412109375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -6.783342361450195, "rewards/margins": 21.152517318725586, "rewards/rejected": -27.93585968017578, "step": 860 }, { "epoch": 0.5356143079315707, "grad_norm": 33.34446716308594, "learning_rate": 4.564315352697096e-06, "logits/chosen": 0.9703787565231323, "logits/rejected": 2.1039042472839355, "logps/chosen": -631.0071411132812, "logps/rejected": -800.5887451171875, "loss": 0.3649, "rewards/accuracies": 0.875, "rewards/chosen": -9.831226348876953, "rewards/margins": 12.349831581115723, "rewards/rejected": -22.181058883666992, "step": 861 }, { "epoch": 0.5362363919129083, "grad_norm": 7.412981358356774e-05, "learning_rate": 4.563162747810051e-06, "logits/chosen": -0.35365569591522217, "logits/rejected": 3.4597878456115723, "logps/chosen": -567.4420166015625, "logps/rejected": -991.6310424804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.804872989654541, "rewards/margins": 25.330028533935547, "rewards/rejected": -32.13490295410156, "step": 862 }, { "epoch": 0.5368584758942457, "grad_norm": 0.021897537633776665, "learning_rate": 4.562010142923006e-06, "logits/chosen": 0.643225908279419, "logits/rejected": 4.038776397705078, "logps/chosen": -504.0196533203125, "logps/rejected": -991.962158203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.733529090881348, "rewards/margins": 24.76568603515625, "rewards/rejected": -35.49921798706055, "step": 863 }, { "epoch": 0.5374805598755832, "grad_norm": 0.4537505805492401, "learning_rate": 4.560857538035961e-06, "logits/chosen": 0.07119336724281311, "logits/rejected": 2.72735333442688, "logps/chosen": -516.2216796875, "logps/rejected": -841.7863159179688, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -5.701822757720947, "rewards/margins": 20.65270233154297, "rewards/rejected": -26.35452651977539, "step": 864 }, { "epoch": 0.5381026438569206, "grad_norm": 0.033131957054138184, "learning_rate": 4.559704933148917e-06, "logits/chosen": -1.2717825174331665, "logits/rejected": 3.118110418319702, "logps/chosen": -524.9382934570312, "logps/rejected": -973.964111328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.52232551574707, "rewards/margins": 20.606990814208984, "rewards/rejected": -28.129316329956055, "step": 865 }, { "epoch": 0.5387247278382582, "grad_norm": 0.0002912423515226692, "learning_rate": 4.558552328261872e-06, "logits/chosen": -0.5772674083709717, "logits/rejected": 2.762540102005005, "logps/chosen": -496.74420166015625, "logps/rejected": -960.775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.3707194328308105, "rewards/margins": 22.571828842163086, "rewards/rejected": -29.942546844482422, "step": 866 }, { "epoch": 0.5393468118195957, "grad_norm": 0.011110931634902954, "learning_rate": 4.557399723374827e-06, "logits/chosen": -0.25224435329437256, "logits/rejected": 3.20752215385437, "logps/chosen": -567.7034912109375, "logps/rejected": -968.6920166015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.4491548538208, "rewards/margins": 24.463436126708984, "rewards/rejected": -32.91259002685547, "step": 867 }, { "epoch": 0.5399688958009331, "grad_norm": 18.80489730834961, "learning_rate": 4.556247118487782e-06, "logits/chosen": 0.04405069351196289, "logits/rejected": 2.4502503871917725, "logps/chosen": -551.693359375, "logps/rejected": -857.9959106445312, "loss": 0.1383, "rewards/accuracies": 0.875, "rewards/chosen": -9.809330940246582, "rewards/margins": 18.363243103027344, "rewards/rejected": -28.172576904296875, "step": 868 }, { "epoch": 0.5405909797822706, "grad_norm": 10.304757118225098, "learning_rate": 4.555094513600738e-06, "logits/chosen": 0.8795121908187866, "logits/rejected": 3.748501777648926, "logps/chosen": -553.6944580078125, "logps/rejected": -883.4180297851562, "loss": 0.0432, "rewards/accuracies": 1.0, "rewards/chosen": -5.258725166320801, "rewards/margins": 20.32225799560547, "rewards/rejected": -25.580982208251953, "step": 869 }, { "epoch": 0.5412130637636081, "grad_norm": 17.23099708557129, "learning_rate": 4.5539419087136936e-06, "logits/chosen": 0.2523691654205322, "logits/rejected": 1.9475032091140747, "logps/chosen": -538.177978515625, "logps/rejected": -872.0485229492188, "loss": 0.1189, "rewards/accuracies": 0.875, "rewards/chosen": -8.795501708984375, "rewards/margins": 19.7119140625, "rewards/rejected": -28.507417678833008, "step": 870 }, { "epoch": 0.5418351477449456, "grad_norm": 35.22981643676758, "learning_rate": 4.552789303826649e-06, "logits/chosen": 0.7181805372238159, "logits/rejected": 3.5797152519226074, "logps/chosen": -681.2504272460938, "logps/rejected": -976.5474243164062, "loss": 0.4289, "rewards/accuracies": 0.875, "rewards/chosen": -15.748141288757324, "rewards/margins": 15.885237693786621, "rewards/rejected": -31.633377075195312, "step": 871 }, { "epoch": 0.542457231726283, "grad_norm": 19.412866592407227, "learning_rate": 4.551636698939604e-06, "logits/chosen": 0.8360533714294434, "logits/rejected": 2.686750888824463, "logps/chosen": -524.9096069335938, "logps/rejected": -870.4130249023438, "loss": 0.4212, "rewards/accuracies": 0.875, "rewards/chosen": -9.245410919189453, "rewards/margins": 18.76921272277832, "rewards/rejected": -28.014623641967773, "step": 872 }, { "epoch": 0.5430793157076206, "grad_norm": 31.095205307006836, "learning_rate": 4.550484094052559e-06, "logits/chosen": -3.711292266845703, "logits/rejected": 1.670317530632019, "logps/chosen": -318.65130615234375, "logps/rejected": -842.1668090820312, "loss": 0.5101, "rewards/accuracies": 0.875, "rewards/chosen": -4.937169075012207, "rewards/margins": 26.391700744628906, "rewards/rejected": -31.32887077331543, "step": 873 }, { "epoch": 0.543701399688958, "grad_norm": 0.007474643178284168, "learning_rate": 4.5493314891655145e-06, "logits/chosen": -1.8571935892105103, "logits/rejected": 1.5715970993041992, "logps/chosen": -418.4180908203125, "logps/rejected": -763.9039306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.32398796081543, "rewards/margins": 18.20618438720703, "rewards/rejected": -24.53017234802246, "step": 874 }, { "epoch": 0.5443234836702955, "grad_norm": 8.715862274169922, "learning_rate": 4.54817888427847e-06, "logits/chosen": 1.0062412023544312, "logits/rejected": 3.670095920562744, "logps/chosen": -386.5008544921875, "logps/rejected": -703.3475341796875, "loss": 0.0449, "rewards/accuracies": 1.0, "rewards/chosen": -5.710971832275391, "rewards/margins": 16.32583236694336, "rewards/rejected": -22.036806106567383, "step": 875 }, { "epoch": 0.5449455676516329, "grad_norm": 36.20487976074219, "learning_rate": 4.547026279391425e-06, "logits/chosen": 0.13863390684127808, "logits/rejected": 3.659630537033081, "logps/chosen": -503.400390625, "logps/rejected": -918.284423828125, "loss": 0.2315, "rewards/accuracies": 0.875, "rewards/chosen": -7.179703235626221, "rewards/margins": 20.5958251953125, "rewards/rejected": -27.77552604675293, "step": 876 }, { "epoch": 0.5455676516329705, "grad_norm": 0.4360642731189728, "learning_rate": 4.54587367450438e-06, "logits/chosen": 2.4419362545013428, "logits/rejected": 3.9887049198150635, "logps/chosen": -642.28076171875, "logps/rejected": -993.3703002929688, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -8.067706108093262, "rewards/margins": 21.047924041748047, "rewards/rejected": -29.11562728881836, "step": 877 }, { "epoch": 0.546189735614308, "grad_norm": 2.6440608501434326, "learning_rate": 4.544721069617335e-06, "logits/chosen": -1.8029651641845703, "logits/rejected": 2.106647491455078, "logps/chosen": -499.1551208496094, "logps/rejected": -1073.3516845703125, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -10.704458236694336, "rewards/margins": 27.312103271484375, "rewards/rejected": -38.016563415527344, "step": 878 }, { "epoch": 0.5468118195956454, "grad_norm": 0.3573562800884247, "learning_rate": 4.543568464730291e-06, "logits/chosen": 1.9899303913116455, "logits/rejected": 4.196540355682373, "logps/chosen": -654.3473510742188, "logps/rejected": -1023.14501953125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -9.193310737609863, "rewards/margins": 26.175338745117188, "rewards/rejected": -35.36864471435547, "step": 879 }, { "epoch": 0.5474339035769828, "grad_norm": 0.24273815751075745, "learning_rate": 4.542415859843246e-06, "logits/chosen": 0.9308842420578003, "logits/rejected": 2.3023080825805664, "logps/chosen": -582.0292358398438, "logps/rejected": -899.563720703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -8.894572257995605, "rewards/margins": 24.512340545654297, "rewards/rejected": -33.40690994262695, "step": 880 }, { "epoch": 0.5480559875583204, "grad_norm": 2.9039588298473973e-06, "learning_rate": 4.541263254956201e-06, "logits/chosen": -0.09958934783935547, "logits/rejected": 2.7995901107788086, "logps/chosen": -520.8726806640625, "logps/rejected": -907.4647216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.7143473625183105, "rewards/margins": 24.487667083740234, "rewards/rejected": -32.2020149230957, "step": 881 }, { "epoch": 0.5486780715396579, "grad_norm": 25.56290626525879, "learning_rate": 4.540110650069156e-06, "logits/chosen": -0.7153788208961487, "logits/rejected": 1.400726079940796, "logps/chosen": -434.67169189453125, "logps/rejected": -671.6514892578125, "loss": 0.3784, "rewards/accuracies": 0.875, "rewards/chosen": -6.010853290557861, "rewards/margins": 11.52184009552002, "rewards/rejected": -17.53269386291504, "step": 882 }, { "epoch": 0.5493001555209953, "grad_norm": 0.8233245015144348, "learning_rate": 4.538958045182112e-06, "logits/chosen": 1.417142629623413, "logits/rejected": 3.3376898765563965, "logps/chosen": -614.8678588867188, "logps/rejected": -820.2135009765625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -7.703726291656494, "rewards/margins": 20.157958984375, "rewards/rejected": -27.861684799194336, "step": 883 }, { "epoch": 0.5499222395023328, "grad_norm": 0.21891425549983978, "learning_rate": 4.5378054402950676e-06, "logits/chosen": 2.8602006435394287, "logits/rejected": 4.124835968017578, "logps/chosen": -712.83056640625, "logps/rejected": -938.74169921875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -11.705924987792969, "rewards/margins": 22.996124267578125, "rewards/rejected": -34.702049255371094, "step": 884 }, { "epoch": 0.5505443234836703, "grad_norm": 0.023627132177352905, "learning_rate": 4.536652835408023e-06, "logits/chosen": -2.5472285747528076, "logits/rejected": 1.7891316413879395, "logps/chosen": -455.49041748046875, "logps/rejected": -1010.7542114257812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.745904922485352, "rewards/margins": 27.36368179321289, "rewards/rejected": -34.10958480834961, "step": 885 }, { "epoch": 0.5511664074650078, "grad_norm": 0.01632552035152912, "learning_rate": 4.535500230520978e-06, "logits/chosen": -0.05308155715465546, "logits/rejected": 3.6396572589874268, "logps/chosen": -558.2744750976562, "logps/rejected": -1079.80615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.748810768127441, "rewards/margins": 27.036746978759766, "rewards/rejected": -36.785560607910156, "step": 886 }, { "epoch": 0.5517884914463452, "grad_norm": 58.952552795410156, "learning_rate": 4.534347625633933e-06, "logits/chosen": 0.29872390627861023, "logits/rejected": 1.8410981893539429, "logps/chosen": -637.9015502929688, "logps/rejected": -892.9439086914062, "loss": 1.9597, "rewards/accuracies": 0.875, "rewards/chosen": -13.743337631225586, "rewards/margins": 15.554926872253418, "rewards/rejected": -29.298263549804688, "step": 887 }, { "epoch": 0.5524105754276827, "grad_norm": 0.8052808046340942, "learning_rate": 4.5331950207468885e-06, "logits/chosen": 1.1044509410858154, "logits/rejected": 2.839838981628418, "logps/chosen": -631.5152587890625, "logps/rejected": -952.8838500976562, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -13.291111946105957, "rewards/margins": 19.244768142700195, "rewards/rejected": -32.53588104248047, "step": 888 }, { "epoch": 0.5530326594090202, "grad_norm": 23.6992130279541, "learning_rate": 4.532042415859844e-06, "logits/chosen": -1.5598255395889282, "logits/rejected": 3.7028071880340576, "logps/chosen": -507.09930419921875, "logps/rejected": -1020.5116577148438, "loss": 0.2016, "rewards/accuracies": 0.875, "rewards/chosen": -10.400575637817383, "rewards/margins": 23.5273494720459, "rewards/rejected": -33.92792510986328, "step": 889 }, { "epoch": 0.5536547433903577, "grad_norm": 0.001044000033289194, "learning_rate": 4.530889810972799e-06, "logits/chosen": -0.5535763502120972, "logits/rejected": 3.151967763900757, "logps/chosen": -441.44842529296875, "logps/rejected": -916.7725830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.017629623413086, "rewards/margins": 23.890779495239258, "rewards/rejected": -27.90840721130371, "step": 890 }, { "epoch": 0.5542768273716951, "grad_norm": 5.863152980804443, "learning_rate": 4.529737206085754e-06, "logits/chosen": 0.11987632513046265, "logits/rejected": 0.8908869624137878, "logps/chosen": -618.422607421875, "logps/rejected": -864.134521484375, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": -9.305463790893555, "rewards/margins": 19.35150146484375, "rewards/rejected": -28.656967163085938, "step": 891 }, { "epoch": 0.5548989113530327, "grad_norm": 0.014139095321297646, "learning_rate": 4.528584601198709e-06, "logits/chosen": 2.1482062339782715, "logits/rejected": 4.1894755363464355, "logps/chosen": -619.8270263671875, "logps/rejected": -974.5040893554688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.460693359375, "rewards/margins": 23.501544952392578, "rewards/rejected": -31.962238311767578, "step": 892 }, { "epoch": 0.5555209953343702, "grad_norm": 46.96757888793945, "learning_rate": 4.527431996311665e-06, "logits/chosen": 0.5948436260223389, "logits/rejected": 2.237551212310791, "logps/chosen": -593.1738891601562, "logps/rejected": -931.5321044921875, "loss": 1.05, "rewards/accuracies": 0.875, "rewards/chosen": -7.85053014755249, "rewards/margins": 25.84325408935547, "rewards/rejected": -33.693782806396484, "step": 893 }, { "epoch": 0.5561430793157076, "grad_norm": 5.902840614318848, "learning_rate": 4.52627939142462e-06, "logits/chosen": 0.7165172100067139, "logits/rejected": 2.4892489910125732, "logps/chosen": -592.964111328125, "logps/rejected": -840.6056518554688, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": -9.395855903625488, "rewards/margins": 14.041229248046875, "rewards/rejected": -23.43708610534668, "step": 894 }, { "epoch": 0.5567651632970451, "grad_norm": 22.181793212890625, "learning_rate": 4.525126786537575e-06, "logits/chosen": 2.861790180206299, "logits/rejected": 3.9013900756835938, "logps/chosen": -650.225830078125, "logps/rejected": -826.1898193359375, "loss": 0.3234, "rewards/accuracies": 0.875, "rewards/chosen": -13.422914505004883, "rewards/margins": 14.123764038085938, "rewards/rejected": -27.54667854309082, "step": 895 }, { "epoch": 0.5573872472783826, "grad_norm": 2.8335178285487927e-05, "learning_rate": 4.52397418165053e-06, "logits/chosen": 1.190727949142456, "logits/rejected": 2.2278220653533936, "logps/chosen": -605.7880859375, "logps/rejected": -1063.646240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.507134437561035, "rewards/margins": 31.69879150390625, "rewards/rejected": -40.20592498779297, "step": 896 }, { "epoch": 0.5580093312597201, "grad_norm": 0.004062708467245102, "learning_rate": 4.5228215767634855e-06, "logits/chosen": -0.09006160497665405, "logits/rejected": 3.0125908851623535, "logps/chosen": -552.53759765625, "logps/rejected": -950.954833984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.288880348205566, "rewards/margins": 22.93324851989746, "rewards/rejected": -31.222129821777344, "step": 897 }, { "epoch": 0.5586314152410575, "grad_norm": 0.052480533719062805, "learning_rate": 4.5216689718764415e-06, "logits/chosen": -0.5029276609420776, "logits/rejected": 2.6806373596191406, "logps/chosen": -414.865966796875, "logps/rejected": -748.1179809570312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.556739807128906, "rewards/margins": 23.550384521484375, "rewards/rejected": -28.107120513916016, "step": 898 }, { "epoch": 0.559253499222395, "grad_norm": 0.0029522059485316277, "learning_rate": 4.520516366989397e-06, "logits/chosen": -1.5119998455047607, "logits/rejected": 3.129838466644287, "logps/chosen": -380.0979919433594, "logps/rejected": -1028.6463623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.766282081604004, "rewards/margins": 34.25803756713867, "rewards/rejected": -41.02431869506836, "step": 899 }, { "epoch": 0.5598755832037325, "grad_norm": 0.31720152497291565, "learning_rate": 4.519363762102352e-06, "logits/chosen": -0.17320013046264648, "logits/rejected": 3.0692298412323, "logps/chosen": -400.371337890625, "logps/rejected": -822.8155517578125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -7.835075378417969, "rewards/margins": 17.175716400146484, "rewards/rejected": -25.010791778564453, "step": 900 }, { "epoch": 0.56049766718507, "grad_norm": 2.5494184494018555, "learning_rate": 4.518211157215307e-06, "logits/chosen": -0.592503547668457, "logits/rejected": 3.36773681640625, "logps/chosen": -465.85687255859375, "logps/rejected": -930.584228515625, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -5.216737747192383, "rewards/margins": 25.044021606445312, "rewards/rejected": -30.260761260986328, "step": 901 }, { "epoch": 0.5611197511664074, "grad_norm": 26.77128791809082, "learning_rate": 4.5170585523282624e-06, "logits/chosen": 0.34828922152519226, "logits/rejected": 3.3961830139160156, "logps/chosen": -468.8632507324219, "logps/rejected": -877.02880859375, "loss": 0.2851, "rewards/accuracies": 0.875, "rewards/chosen": -6.608033180236816, "rewards/margins": 16.834678649902344, "rewards/rejected": -23.442712783813477, "step": 902 }, { "epoch": 0.5617418351477449, "grad_norm": 0.055523090064525604, "learning_rate": 4.515905947441218e-06, "logits/chosen": -0.8115134835243225, "logits/rejected": 4.621614933013916, "logps/chosen": -442.95782470703125, "logps/rejected": -994.3385620117188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.80880880355835, "rewards/margins": 25.47365951538086, "rewards/rejected": -31.282470703125, "step": 903 }, { "epoch": 0.5623639191290825, "grad_norm": 0.0003423531888984144, "learning_rate": 4.514753342554173e-06, "logits/chosen": -1.8804258108139038, "logits/rejected": 3.207822799682617, "logps/chosen": -318.6630859375, "logps/rejected": -776.7474365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.680161952972412, "rewards/margins": 20.768556594848633, "rewards/rejected": -26.448719024658203, "step": 904 }, { "epoch": 0.5629860031104199, "grad_norm": 1.876424789428711, "learning_rate": 4.513600737667128e-06, "logits/chosen": 1.282327651977539, "logits/rejected": 3.3914713859558105, "logps/chosen": -579.7039794921875, "logps/rejected": -843.2907104492188, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -6.453951358795166, "rewards/margins": 17.213043212890625, "rewards/rejected": -23.666996002197266, "step": 905 }, { "epoch": 0.5636080870917574, "grad_norm": 13.606136322021484, "learning_rate": 4.512448132780083e-06, "logits/chosen": 0.7030451893806458, "logits/rejected": 2.2091081142425537, "logps/chosen": -486.6806335449219, "logps/rejected": -760.94482421875, "loss": 0.0618, "rewards/accuracies": 1.0, "rewards/chosen": -7.211647033691406, "rewards/margins": 14.849355697631836, "rewards/rejected": -22.061004638671875, "step": 906 }, { "epoch": 0.5642301710730949, "grad_norm": 0.00018268085841555148, "learning_rate": 4.5112955278930386e-06, "logits/chosen": -1.1317505836486816, "logits/rejected": 2.438883066177368, "logps/chosen": -503.29754638671875, "logps/rejected": -965.1803588867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.7387869358062744, "rewards/margins": 24.79349708557129, "rewards/rejected": -27.532285690307617, "step": 907 }, { "epoch": 0.5648522550544324, "grad_norm": 3.6311252117156982, "learning_rate": 4.510142923005994e-06, "logits/chosen": 1.5224366188049316, "logits/rejected": 4.215592861175537, "logps/chosen": -709.0532836914062, "logps/rejected": -996.6823120117188, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -10.90211296081543, "rewards/margins": 18.26665496826172, "rewards/rejected": -29.16876792907715, "step": 908 }, { "epoch": 0.5654743390357698, "grad_norm": 1.1938445568084717, "learning_rate": 4.508990318118949e-06, "logits/chosen": -1.4121729135513306, "logits/rejected": 2.583425998687744, "logps/chosen": -376.30084228515625, "logps/rejected": -830.406982421875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -4.8484039306640625, "rewards/margins": 18.305500030517578, "rewards/rejected": -23.15390396118164, "step": 909 }, { "epoch": 0.5660964230171073, "grad_norm": 21.51590919494629, "learning_rate": 4.507837713231904e-06, "logits/chosen": 2.6975741386413574, "logits/rejected": 4.030991077423096, "logps/chosen": -666.0827026367188, "logps/rejected": -952.5798950195312, "loss": 0.3689, "rewards/accuracies": 0.875, "rewards/chosen": -6.313325881958008, "rewards/margins": 20.379703521728516, "rewards/rejected": -26.693031311035156, "step": 910 }, { "epoch": 0.5667185069984448, "grad_norm": 32.47568893432617, "learning_rate": 4.5066851083448595e-06, "logits/chosen": 0.2974998950958252, "logits/rejected": 3.011427402496338, "logps/chosen": -503.1514587402344, "logps/rejected": -745.7210693359375, "loss": 0.7872, "rewards/accuracies": 0.875, "rewards/chosen": -7.5651936531066895, "rewards/margins": 14.901344299316406, "rewards/rejected": -22.466537475585938, "step": 911 }, { "epoch": 0.5673405909797823, "grad_norm": 0.0001468830305384472, "learning_rate": 4.5055325034578155e-06, "logits/chosen": -0.25655102729797363, "logits/rejected": 2.659013271331787, "logps/chosen": -516.9340209960938, "logps/rejected": -862.8931884765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.951726913452148, "rewards/margins": 22.00909423828125, "rewards/rejected": -27.960819244384766, "step": 912 }, { "epoch": 0.5679626749611197, "grad_norm": 0.0001624006254132837, "learning_rate": 4.504379898570771e-06, "logits/chosen": 1.011976718902588, "logits/rejected": 4.230520248413086, "logps/chosen": -440.92352294921875, "logps/rejected": -846.1481323242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.438632965087891, "rewards/margins": 25.73596954345703, "rewards/rejected": -32.17460632324219, "step": 913 }, { "epoch": 0.5685847589424572, "grad_norm": 0.8141604065895081, "learning_rate": 4.503227293683726e-06, "logits/chosen": -1.5450491905212402, "logits/rejected": 4.107460975646973, "logps/chosen": -444.59075927734375, "logps/rejected": -966.14013671875, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -6.689481735229492, "rewards/margins": 18.89098358154297, "rewards/rejected": -25.580467224121094, "step": 914 }, { "epoch": 0.5692068429237948, "grad_norm": 31.039724349975586, "learning_rate": 4.502074688796681e-06, "logits/chosen": 2.0466227531433105, "logits/rejected": 2.263880491256714, "logps/chosen": -701.8427734375, "logps/rejected": -896.3662109375, "loss": 0.6259, "rewards/accuracies": 0.875, "rewards/chosen": -12.721805572509766, "rewards/margins": 17.82135772705078, "rewards/rejected": -30.54316520690918, "step": 915 }, { "epoch": 0.5698289269051322, "grad_norm": 0.0480804368853569, "learning_rate": 4.5009220839096364e-06, "logits/chosen": 0.30877208709716797, "logits/rejected": 3.4405734539031982, "logps/chosen": -545.4224243164062, "logps/rejected": -977.380615234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.624667167663574, "rewards/margins": 28.29396629333496, "rewards/rejected": -33.91863250732422, "step": 916 }, { "epoch": 0.5704510108864697, "grad_norm": 2.4432003498077393, "learning_rate": 4.499769479022592e-06, "logits/chosen": -2.2052576541900635, "logits/rejected": 2.1533780097961426, "logps/chosen": -324.31756591796875, "logps/rejected": -803.582275390625, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": -7.368386745452881, "rewards/margins": 25.227500915527344, "rewards/rejected": -32.595890045166016, "step": 917 }, { "epoch": 0.5710730948678071, "grad_norm": 0.05066022649407387, "learning_rate": 4.498616874135547e-06, "logits/chosen": -2.10320782661438, "logits/rejected": 0.7626610994338989, "logps/chosen": -444.4925842285156, "logps/rejected": -829.7848510742188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.30866813659668, "rewards/margins": 21.65294647216797, "rewards/rejected": -29.961612701416016, "step": 918 }, { "epoch": 0.5716951788491447, "grad_norm": 0.5566099882125854, "learning_rate": 4.497464269248502e-06, "logits/chosen": 1.7583600282669067, "logits/rejected": 2.833300828933716, "logps/chosen": -627.1986083984375, "logps/rejected": -866.6060791015625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -9.058523178100586, "rewards/margins": 18.403121948242188, "rewards/rejected": -27.461645126342773, "step": 919 }, { "epoch": 0.5723172628304821, "grad_norm": 8.222851753234863, "learning_rate": 4.496311664361457e-06, "logits/chosen": 0.21099892258644104, "logits/rejected": 2.976849317550659, "logps/chosen": -485.7565002441406, "logps/rejected": -840.64794921875, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": -8.408513069152832, "rewards/margins": 13.19969367980957, "rewards/rejected": -21.608205795288086, "step": 920 }, { "epoch": 0.5729393468118196, "grad_norm": 1.9750475530599942e-06, "learning_rate": 4.4951590594744126e-06, "logits/chosen": 0.6594505310058594, "logits/rejected": 4.408999443054199, "logps/chosen": -609.9363403320312, "logps/rejected": -1108.749755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.744295120239258, "rewards/margins": 28.605003356933594, "rewards/rejected": -37.34929656982422, "step": 921 }, { "epoch": 0.573561430793157, "grad_norm": 0.04551282152533531, "learning_rate": 4.494006454587368e-06, "logits/chosen": 1.736369252204895, "logits/rejected": 4.076783180236816, "logps/chosen": -623.5065307617188, "logps/rejected": -960.9025268554688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.853501319885254, "rewards/margins": 20.565467834472656, "rewards/rejected": -29.418968200683594, "step": 922 }, { "epoch": 0.5741835147744946, "grad_norm": 0.6607025265693665, "learning_rate": 4.492853849700323e-06, "logits/chosen": 1.1568937301635742, "logits/rejected": 4.127242088317871, "logps/chosen": -621.6375732421875, "logps/rejected": -945.704345703125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -10.65005874633789, "rewards/margins": 21.218669891357422, "rewards/rejected": -31.868732452392578, "step": 923 }, { "epoch": 0.574805598755832, "grad_norm": 0.00010709751950344071, "learning_rate": 4.491701244813278e-06, "logits/chosen": 1.5515047311782837, "logits/rejected": 3.1574530601501465, "logps/chosen": -653.9324340820312, "logps/rejected": -937.08984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.508325576782227, "rewards/margins": 21.496139526367188, "rewards/rejected": -33.00446701049805, "step": 924 }, { "epoch": 0.5754276827371695, "grad_norm": 7.271386623382568, "learning_rate": 4.4905486399262335e-06, "logits/chosen": 0.06512796878814697, "logits/rejected": 4.788290023803711, "logps/chosen": -309.8177185058594, "logps/rejected": -841.85546875, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": -5.533644199371338, "rewards/margins": 25.78881072998047, "rewards/rejected": -31.32245635986328, "step": 925 }, { "epoch": 0.576049766718507, "grad_norm": 1.5662002563476562, "learning_rate": 4.489396035039189e-06, "logits/chosen": -0.054076001048088074, "logits/rejected": 1.7035764455795288, "logps/chosen": -501.0464782714844, "logps/rejected": -795.3419189453125, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -6.691514015197754, "rewards/margins": 20.460548400878906, "rewards/rejected": -27.152061462402344, "step": 926 }, { "epoch": 0.5766718506998445, "grad_norm": 3.0245060770539567e-05, "learning_rate": 4.488243430152145e-06, "logits/chosen": 1.1634256839752197, "logits/rejected": 3.546384334564209, "logps/chosen": -582.4814453125, "logps/rejected": -1055.0667724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.61699104309082, "rewards/margins": 27.090225219726562, "rewards/rejected": -38.70721435546875, "step": 927 }, { "epoch": 0.577293934681182, "grad_norm": 0.00030546420020982623, "learning_rate": 4.4870908252651e-06, "logits/chosen": 0.3939926028251648, "logits/rejected": 1.8541518449783325, "logps/chosen": -529.6817016601562, "logps/rejected": -912.8052368164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.74662971496582, "rewards/margins": 26.91443634033203, "rewards/rejected": -35.661067962646484, "step": 928 }, { "epoch": 0.5779160186625194, "grad_norm": 5.225065251579508e-05, "learning_rate": 4.485938220378055e-06, "logits/chosen": 1.2161568403244019, "logits/rejected": 2.1696629524230957, "logps/chosen": -686.54931640625, "logps/rejected": -1048.06396484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.813570022583008, "rewards/margins": 29.099994659423828, "rewards/rejected": -39.91356658935547, "step": 929 }, { "epoch": 0.578538102643857, "grad_norm": 3.6309118270874023, "learning_rate": 4.4847856154910104e-06, "logits/chosen": 1.50335693359375, "logits/rejected": 3.812343120574951, "logps/chosen": -531.6221923828125, "logps/rejected": -818.347900390625, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": -10.524703979492188, "rewards/margins": 18.538118362426758, "rewards/rejected": -29.062822341918945, "step": 930 }, { "epoch": 0.5791601866251944, "grad_norm": 0.5539863705635071, "learning_rate": 4.483633010603966e-06, "logits/chosen": -2.3044533729553223, "logits/rejected": 1.220093011856079, "logps/chosen": -345.4551696777344, "logps/rejected": -790.6837158203125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -4.869375228881836, "rewards/margins": 20.257827758789062, "rewards/rejected": -25.12720489501953, "step": 931 }, { "epoch": 0.5797822706065319, "grad_norm": 0.11792272329330444, "learning_rate": 4.482480405716921e-06, "logits/chosen": -1.1023590564727783, "logits/rejected": 3.9352211952209473, "logps/chosen": -422.75665283203125, "logps/rejected": -827.6967163085938, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.071016788482666, "rewards/margins": 17.74365997314453, "rewards/rejected": -21.814678192138672, "step": 932 }, { "epoch": 0.5804043545878693, "grad_norm": 17.893983840942383, "learning_rate": 4.481327800829876e-06, "logits/chosen": 0.9140981435775757, "logits/rejected": 3.72688627243042, "logps/chosen": -569.3828125, "logps/rejected": -979.8197021484375, "loss": 0.1286, "rewards/accuracies": 1.0, "rewards/chosen": -10.508774757385254, "rewards/margins": 22.191757202148438, "rewards/rejected": -32.700531005859375, "step": 933 }, { "epoch": 0.5810264385692069, "grad_norm": 0.005547088570892811, "learning_rate": 4.480175195942831e-06, "logits/chosen": 1.4913636445999146, "logits/rejected": 4.0409159660339355, "logps/chosen": -654.9049682617188, "logps/rejected": -945.757568359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.859716415405273, "rewards/margins": 20.024673461914062, "rewards/rejected": -29.884387969970703, "step": 934 }, { "epoch": 0.5816485225505443, "grad_norm": 37.471954345703125, "learning_rate": 4.4790225910557866e-06, "logits/chosen": -1.901386022567749, "logits/rejected": 2.022355318069458, "logps/chosen": -464.411376953125, "logps/rejected": -863.151123046875, "loss": 0.4578, "rewards/accuracies": 0.875, "rewards/chosen": -10.3386812210083, "rewards/margins": 14.769096374511719, "rewards/rejected": -25.107778549194336, "step": 935 }, { "epoch": 0.5822706065318818, "grad_norm": 0.7605171203613281, "learning_rate": 4.477869986168742e-06, "logits/chosen": 0.5889070630073547, "logits/rejected": 2.96616792678833, "logps/chosen": -525.977783203125, "logps/rejected": -948.6707153320312, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -9.921030044555664, "rewards/margins": 24.97126579284668, "rewards/rejected": -34.89229965209961, "step": 936 }, { "epoch": 0.5828926905132192, "grad_norm": 0.043847814202308655, "learning_rate": 4.476717381281697e-06, "logits/chosen": 4.610742568969727, "logits/rejected": 5.402982234954834, "logps/chosen": -703.4801025390625, "logps/rejected": -1112.8795166015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.42116928100586, "rewards/margins": 26.29779815673828, "rewards/rejected": -37.718971252441406, "step": 937 }, { "epoch": 0.5835147744945568, "grad_norm": 0.0021270744036883116, "learning_rate": 4.475564776394652e-06, "logits/chosen": -2.914405345916748, "logits/rejected": 3.734607219696045, "logps/chosen": -344.076416015625, "logps/rejected": -967.015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.8051252365112305, "rewards/margins": 28.18408203125, "rewards/rejected": -34.98920440673828, "step": 938 }, { "epoch": 0.5841368584758942, "grad_norm": 14.73412036895752, "learning_rate": 4.4744121715076075e-06, "logits/chosen": -0.1586018055677414, "logits/rejected": 2.3905863761901855, "logps/chosen": -584.124755859375, "logps/rejected": -859.9805297851562, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": -13.387353897094727, "rewards/margins": 16.92449378967285, "rewards/rejected": -30.311847686767578, "step": 939 }, { "epoch": 0.5847589424572317, "grad_norm": 2.371389150619507, "learning_rate": 4.473259566620563e-06, "logits/chosen": -0.3750753402709961, "logits/rejected": 2.5787787437438965, "logps/chosen": -538.4427490234375, "logps/rejected": -924.25341796875, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -8.576361656188965, "rewards/margins": 19.905107498168945, "rewards/rejected": -28.481468200683594, "step": 940 }, { "epoch": 0.5853810264385692, "grad_norm": 91.64904022216797, "learning_rate": 4.472106961733518e-06, "logits/chosen": 0.6789902448654175, "logits/rejected": 2.3577969074249268, "logps/chosen": -772.6759033203125, "logps/rejected": -923.3992919921875, "loss": 2.4035, "rewards/accuracies": 0.875, "rewards/chosen": -23.004234313964844, "rewards/margins": 11.870817184448242, "rewards/rejected": -34.87505340576172, "step": 941 }, { "epoch": 0.5860031104199067, "grad_norm": 0.0015563821652904153, "learning_rate": 4.470954356846474e-06, "logits/chosen": -0.7656075954437256, "logits/rejected": 1.6892644166946411, "logps/chosen": -569.5089111328125, "logps/rejected": -983.2672119140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.503901481628418, "rewards/margins": 24.433698654174805, "rewards/rejected": -37.937599182128906, "step": 942 }, { "epoch": 0.5866251944012442, "grad_norm": 0.0011474979110062122, "learning_rate": 4.469801751959429e-06, "logits/chosen": 0.09146726131439209, "logits/rejected": 2.295836925506592, "logps/chosen": -590.6580200195312, "logps/rejected": -893.857666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.355199813842773, "rewards/margins": 19.084501266479492, "rewards/rejected": -33.439701080322266, "step": 943 }, { "epoch": 0.5872472783825816, "grad_norm": 30.391260147094727, "learning_rate": 4.4686491470723844e-06, "logits/chosen": -2.1463727951049805, "logits/rejected": 0.11997640132904053, "logps/chosen": -468.9334411621094, "logps/rejected": -767.435302734375, "loss": 0.3284, "rewards/accuracies": 0.875, "rewards/chosen": -5.860607147216797, "rewards/margins": 21.03769302368164, "rewards/rejected": -26.898300170898438, "step": 944 }, { "epoch": 0.5878693623639192, "grad_norm": 0.011637120507657528, "learning_rate": 4.46749654218534e-06, "logits/chosen": -2.7303953170776367, "logits/rejected": 2.3290488719940186, "logps/chosen": -414.9637451171875, "logps/rejected": -1065.450439453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.247337341308594, "rewards/margins": 32.224693298339844, "rewards/rejected": -41.47202682495117, "step": 945 }, { "epoch": 0.5884914463452566, "grad_norm": 1.5746371746063232, "learning_rate": 4.466343937298295e-06, "logits/chosen": -0.009310126304626465, "logits/rejected": 1.9677188396453857, "logps/chosen": -400.695068359375, "logps/rejected": -643.286865234375, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -5.75554084777832, "rewards/margins": 15.892524719238281, "rewards/rejected": -21.6480655670166, "step": 946 }, { "epoch": 0.5891135303265941, "grad_norm": 0.0009989278623834252, "learning_rate": 4.46519133241125e-06, "logits/chosen": 0.7627098560333252, "logits/rejected": 3.4856858253479004, "logps/chosen": -511.3065185546875, "logps/rejected": -910.0632934570312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.348669052124023, "rewards/margins": 23.043886184692383, "rewards/rejected": -31.392555236816406, "step": 947 }, { "epoch": 0.5897356143079315, "grad_norm": 0.08712099492549896, "learning_rate": 4.464038727524205e-06, "logits/chosen": -1.3138643503189087, "logits/rejected": 4.858949184417725, "logps/chosen": -408.7076110839844, "logps/rejected": -1118.2198486328125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.463565826416016, "rewards/margins": 32.61619567871094, "rewards/rejected": -39.07975769042969, "step": 948 }, { "epoch": 0.5903576982892691, "grad_norm": 21.967683792114258, "learning_rate": 4.4628861226371606e-06, "logits/chosen": -0.37556570768356323, "logits/rejected": 2.6165435314178467, "logps/chosen": -548.4478759765625, "logps/rejected": -884.171142578125, "loss": 0.1892, "rewards/accuracies": 0.875, "rewards/chosen": -8.981535911560059, "rewards/margins": 18.82710838317871, "rewards/rejected": -27.80864715576172, "step": 949 }, { "epoch": 0.5909797822706065, "grad_norm": 6.485556241386803e-06, "learning_rate": 4.461733517750116e-06, "logits/chosen": 0.7720064520835876, "logits/rejected": 4.895740509033203, "logps/chosen": -619.361328125, "logps/rejected": -1186.765869140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.193835258483887, "rewards/margins": 32.08253860473633, "rewards/rejected": -45.276371002197266, "step": 950 }, { "epoch": 0.591601866251944, "grad_norm": 0.9476038813591003, "learning_rate": 4.460580912863071e-06, "logits/chosen": -0.9694265127182007, "logits/rejected": 2.774254322052002, "logps/chosen": -434.80706787109375, "logps/rejected": -810.2960205078125, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -7.517581462860107, "rewards/margins": 19.960872650146484, "rewards/rejected": -27.478456497192383, "step": 951 }, { "epoch": 0.5922239502332814, "grad_norm": 0.024504758417606354, "learning_rate": 4.459428307976026e-06, "logits/chosen": 3.0849127769470215, "logits/rejected": 3.723421096801758, "logps/chosen": -814.4183959960938, "logps/rejected": -1017.4376220703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -14.637451171875, "rewards/margins": 18.967361450195312, "rewards/rejected": -33.60481262207031, "step": 952 }, { "epoch": 0.592846034214619, "grad_norm": 0.004705091007053852, "learning_rate": 4.4582757030889815e-06, "logits/chosen": 1.4308946132659912, "logits/rejected": 2.767594814300537, "logps/chosen": -657.2290649414062, "logps/rejected": -1088.5958251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.916187286376953, "rewards/margins": 31.49768829345703, "rewards/rejected": -41.413875579833984, "step": 953 }, { "epoch": 0.5934681181959565, "grad_norm": 28.829410552978516, "learning_rate": 4.457123098201937e-06, "logits/chosen": 1.4665814638137817, "logits/rejected": 3.462592363357544, "logps/chosen": -622.665771484375, "logps/rejected": -946.5082397460938, "loss": 1.3582, "rewards/accuracies": 0.875, "rewards/chosen": -9.926738739013672, "rewards/margins": 17.527084350585938, "rewards/rejected": -27.45382308959961, "step": 954 }, { "epoch": 0.5940902021772939, "grad_norm": 29.489173889160156, "learning_rate": 4.455970493314892e-06, "logits/chosen": -1.9026505947113037, "logits/rejected": 2.029543399810791, "logps/chosen": -472.3614501953125, "logps/rejected": -930.0457763671875, "loss": 1.0627, "rewards/accuracies": 0.875, "rewards/chosen": -7.602188587188721, "rewards/margins": 23.613767623901367, "rewards/rejected": -31.215957641601562, "step": 955 }, { "epoch": 0.5947122861586314, "grad_norm": 0.16023872792720795, "learning_rate": 4.454817888427848e-06, "logits/chosen": 1.162771463394165, "logits/rejected": 4.04050874710083, "logps/chosen": -575.5117797851562, "logps/rejected": -988.9061279296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -10.193204879760742, "rewards/margins": 24.5118408203125, "rewards/rejected": -34.705047607421875, "step": 956 }, { "epoch": 0.5953343701399689, "grad_norm": 0.026538310572504997, "learning_rate": 4.453665283540803e-06, "logits/chosen": -0.10087063908576965, "logits/rejected": 5.041918754577637, "logps/chosen": -416.1558532714844, "logps/rejected": -1082.86669921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.628523349761963, "rewards/margins": 30.532407760620117, "rewards/rejected": -38.16093063354492, "step": 957 }, { "epoch": 0.5959564541213064, "grad_norm": 37.81728744506836, "learning_rate": 4.4525126786537576e-06, "logits/chosen": 1.0219215154647827, "logits/rejected": 0.9904434680938721, "logps/chosen": -560.9853515625, "logps/rejected": -754.2684936523438, "loss": 0.3818, "rewards/accuracies": 0.875, "rewards/chosen": -12.566256523132324, "rewards/margins": 15.562918663024902, "rewards/rejected": -28.129173278808594, "step": 958 }, { "epoch": 0.5965785381026438, "grad_norm": 0.3736514151096344, "learning_rate": 4.451360073766713e-06, "logits/chosen": -0.7447368502616882, "logits/rejected": 2.2910654544830322, "logps/chosen": -463.67401123046875, "logps/rejected": -965.9398193359375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -10.4921293258667, "rewards/margins": 22.65591049194336, "rewards/rejected": -33.14804458618164, "step": 959 }, { "epoch": 0.5972006220839814, "grad_norm": 0.0048063406720757484, "learning_rate": 4.450207468879668e-06, "logits/chosen": -0.7846198081970215, "logits/rejected": 2.181448459625244, "logps/chosen": -573.2274169921875, "logps/rejected": -1039.460205078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.714447021484375, "rewards/margins": 28.0849666595459, "rewards/rejected": -38.799415588378906, "step": 960 }, { "epoch": 0.5978227060653188, "grad_norm": 3.337085008621216, "learning_rate": 4.449054863992623e-06, "logits/chosen": 0.2808011770248413, "logits/rejected": 4.691339492797852, "logps/chosen": -572.583740234375, "logps/rejected": -1046.2052001953125, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -10.704526901245117, "rewards/margins": 25.15294647216797, "rewards/rejected": -35.85747528076172, "step": 961 }, { "epoch": 0.5984447900466563, "grad_norm": 0.00023763404169585556, "learning_rate": 4.4479022591055785e-06, "logits/chosen": -0.1722264289855957, "logits/rejected": 3.0525989532470703, "logps/chosen": -494.31640625, "logps/rejected": -920.9412841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.198963165283203, "rewards/margins": 23.290542602539062, "rewards/rejected": -32.489505767822266, "step": 962 }, { "epoch": 0.5990668740279937, "grad_norm": 0.0672890692949295, "learning_rate": 4.446749654218534e-06, "logits/chosen": 0.6471335887908936, "logits/rejected": 3.0925450325012207, "logps/chosen": -591.6119384765625, "logps/rejected": -936.7149658203125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.652125358581543, "rewards/margins": 22.490100860595703, "rewards/rejected": -35.14222717285156, "step": 963 }, { "epoch": 0.5996889580093313, "grad_norm": 0.0020569032058119774, "learning_rate": 4.445597049331489e-06, "logits/chosen": -0.9469920992851257, "logits/rejected": 3.857593297958374, "logps/chosen": -392.8443298339844, "logps/rejected": -893.7676391601562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.6447930335998535, "rewards/margins": 26.24173355102539, "rewards/rejected": -33.88652420043945, "step": 964 }, { "epoch": 0.6003110419906688, "grad_norm": 0.0019404878839850426, "learning_rate": 4.444444444444444e-06, "logits/chosen": 3.261852264404297, "logits/rejected": 4.384195804595947, "logps/chosen": -611.6591186523438, "logps/rejected": -969.6968994140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.406609058380127, "rewards/margins": 24.724668502807617, "rewards/rejected": -32.13127899169922, "step": 965 }, { "epoch": 0.6009331259720062, "grad_norm": 0.06938864290714264, "learning_rate": 4.4432918395574e-06, "logits/chosen": 1.4138062000274658, "logits/rejected": 2.655439853668213, "logps/chosen": -639.2680053710938, "logps/rejected": -933.8479614257812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.837974548339844, "rewards/margins": 23.26096534729004, "rewards/rejected": -33.09893798828125, "step": 966 }, { "epoch": 0.6015552099533437, "grad_norm": 3.7771530151367188, "learning_rate": 4.4421392346703554e-06, "logits/chosen": -1.438300371170044, "logits/rejected": 2.7318453788757324, "logps/chosen": -392.73748779296875, "logps/rejected": -839.9110107421875, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": -6.509245872497559, "rewards/margins": 22.722030639648438, "rewards/rejected": -29.231277465820312, "step": 967 }, { "epoch": 0.6021772939346812, "grad_norm": 1.8222912549972534, "learning_rate": 4.440986629783311e-06, "logits/chosen": -2.3222408294677734, "logits/rejected": 2.861205816268921, "logps/chosen": -411.26904296875, "logps/rejected": -1023.5480346679688, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -4.547379493713379, "rewards/margins": 29.81708526611328, "rewards/rejected": -34.364463806152344, "step": 968 }, { "epoch": 0.6027993779160187, "grad_norm": 0.17347721755504608, "learning_rate": 4.439834024896266e-06, "logits/chosen": 1.14876127243042, "logits/rejected": 3.9284627437591553, "logps/chosen": -526.479248046875, "logps/rejected": -783.3134765625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -9.16906452178955, "rewards/margins": 16.533124923706055, "rewards/rejected": -25.702190399169922, "step": 969 }, { "epoch": 0.6034214618973561, "grad_norm": 0.12882846593856812, "learning_rate": 4.438681420009221e-06, "logits/chosen": 1.3886802196502686, "logits/rejected": 3.3670883178710938, "logps/chosen": -646.9923095703125, "logps/rejected": -995.0706176757812, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -12.358152389526367, "rewards/margins": 25.910850524902344, "rewards/rejected": -38.269004821777344, "step": 970 }, { "epoch": 0.6040435458786936, "grad_norm": 2.1180777549743652, "learning_rate": 4.437528815122176e-06, "logits/chosen": -1.9676039218902588, "logits/rejected": 1.6668283939361572, "logps/chosen": -396.4910583496094, "logps/rejected": -773.7052001953125, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -6.505646705627441, "rewards/margins": 22.966257095336914, "rewards/rejected": -29.471904754638672, "step": 971 }, { "epoch": 0.6046656298600311, "grad_norm": 0.35996007919311523, "learning_rate": 4.4363762102351316e-06, "logits/chosen": -1.3434209823608398, "logits/rejected": 3.100332260131836, "logps/chosen": -553.2901000976562, "logps/rejected": -1007.8410034179688, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -6.710760593414307, "rewards/margins": 26.766399383544922, "rewards/rejected": -33.47716522216797, "step": 972 }, { "epoch": 0.6052877138413686, "grad_norm": 0.1532444953918457, "learning_rate": 4.435223605348087e-06, "logits/chosen": -1.289564847946167, "logits/rejected": 2.1508238315582275, "logps/chosen": -451.6590881347656, "logps/rejected": -913.523193359375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.025468349456787, "rewards/margins": 29.21463394165039, "rewards/rejected": -36.2401008605957, "step": 973 }, { "epoch": 0.605909797822706, "grad_norm": 0.03995480760931969, "learning_rate": 4.434071000461042e-06, "logits/chosen": -0.7090196013450623, "logits/rejected": 3.135948419570923, "logps/chosen": -515.958740234375, "logps/rejected": -1091.345947265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.616611957550049, "rewards/margins": 27.977787017822266, "rewards/rejected": -35.594398498535156, "step": 974 }, { "epoch": 0.6065318818040435, "grad_norm": 0.0549919418990612, "learning_rate": 4.432918395573997e-06, "logits/chosen": -3.5261151790618896, "logits/rejected": 2.896216869354248, "logps/chosen": -440.1463623046875, "logps/rejected": -1085.7508544921875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.423600673675537, "rewards/margins": 23.392396926879883, "rewards/rejected": -30.815998077392578, "step": 975 }, { "epoch": 0.6071539657853811, "grad_norm": 2.218489044025773e-06, "learning_rate": 4.4317657906869525e-06, "logits/chosen": 1.2783446311950684, "logits/rejected": 4.227397918701172, "logps/chosen": -630.39892578125, "logps/rejected": -1058.150146484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.845111846923828, "rewards/margins": 26.44537353515625, "rewards/rejected": -37.29048538208008, "step": 976 }, { "epoch": 0.6077760497667185, "grad_norm": 8.354219608008862e-05, "learning_rate": 4.430613185799908e-06, "logits/chosen": 1.6482150554656982, "logits/rejected": 4.484062671661377, "logps/chosen": -564.74169921875, "logps/rejected": -963.2635498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.121161460876465, "rewards/margins": 20.304988861083984, "rewards/rejected": -29.426151275634766, "step": 977 }, { "epoch": 0.608398133748056, "grad_norm": 4.307034015655518, "learning_rate": 4.429460580912863e-06, "logits/chosen": 2.6032145023345947, "logits/rejected": 4.085454940795898, "logps/chosen": -634.4119873046875, "logps/rejected": -1028.705810546875, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": -13.905746459960938, "rewards/margins": 25.430469512939453, "rewards/rejected": -39.336219787597656, "step": 978 }, { "epoch": 0.6090202177293935, "grad_norm": 0.4238090217113495, "learning_rate": 4.428307976025818e-06, "logits/chosen": -3.106131076812744, "logits/rejected": 2.2020976543426514, "logps/chosen": -386.2730712890625, "logps/rejected": -872.7025146484375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -8.160600662231445, "rewards/margins": 20.310396194458008, "rewards/rejected": -28.47100067138672, "step": 979 }, { "epoch": 0.609642301710731, "grad_norm": 26.356096267700195, "learning_rate": 4.427155371138774e-06, "logits/chosen": -2.559213399887085, "logits/rejected": 1.2290418148040771, "logps/chosen": -407.3168029785156, "logps/rejected": -864.3316650390625, "loss": 0.1682, "rewards/accuracies": 0.875, "rewards/chosen": -7.493479251861572, "rewards/margins": 27.14444923400879, "rewards/rejected": -34.63793182373047, "step": 980 }, { "epoch": 0.6102643856920684, "grad_norm": 22.799148559570312, "learning_rate": 4.4260027662517294e-06, "logits/chosen": 2.6630711555480957, "logits/rejected": 5.5165886878967285, "logps/chosen": -714.7571411132812, "logps/rejected": -1119.52490234375, "loss": 0.1418, "rewards/accuracies": 0.875, "rewards/chosen": -13.795547485351562, "rewards/margins": 20.45812225341797, "rewards/rejected": -34.25366973876953, "step": 981 }, { "epoch": 0.6108864696734059, "grad_norm": 3.947579685359415e-08, "learning_rate": 4.424850161364685e-06, "logits/chosen": -2.429842710494995, "logits/rejected": 3.5816140174865723, "logps/chosen": -479.0736083984375, "logps/rejected": -1232.229248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.675029754638672, "rewards/margins": 39.637107849121094, "rewards/rejected": -47.31214141845703, "step": 982 }, { "epoch": 0.6115085536547434, "grad_norm": 0.040997885167598724, "learning_rate": 4.42369755647764e-06, "logits/chosen": -1.8641680479049683, "logits/rejected": 0.8546357154846191, "logps/chosen": -338.0428466796875, "logps/rejected": -715.9549560546875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.433816432952881, "rewards/margins": 18.383935928344727, "rewards/rejected": -25.8177547454834, "step": 983 }, { "epoch": 0.6121306376360809, "grad_norm": 0.002463550539687276, "learning_rate": 4.422544951590595e-06, "logits/chosen": -1.1334680318832397, "logits/rejected": 2.1662166118621826, "logps/chosen": -500.00653076171875, "logps/rejected": -1011.924560546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.910249710083008, "rewards/margins": 30.985679626464844, "rewards/rejected": -37.895931243896484, "step": 984 }, { "epoch": 0.6127527216174183, "grad_norm": 5.114550590515137, "learning_rate": 4.42139234670355e-06, "logits/chosen": 0.6392805576324463, "logits/rejected": 2.9587857723236084, "logps/chosen": -700.7958374023438, "logps/rejected": -1121.68115234375, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -15.589431762695312, "rewards/margins": 27.82578468322754, "rewards/rejected": -43.41521453857422, "step": 985 }, { "epoch": 0.6133748055987558, "grad_norm": 0.00019221102411393076, "learning_rate": 4.4202397418165056e-06, "logits/chosen": 0.7990036010742188, "logits/rejected": 4.720720291137695, "logps/chosen": -450.42877197265625, "logps/rejected": -1025.32861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.870571136474609, "rewards/margins": 35.376182556152344, "rewards/rejected": -42.24674987792969, "step": 986 }, { "epoch": 0.6139968895800934, "grad_norm": 0.0023970867041498423, "learning_rate": 4.419087136929461e-06, "logits/chosen": -0.657366931438446, "logits/rejected": 2.340158462524414, "logps/chosen": -488.14508056640625, "logps/rejected": -866.5415649414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.836606979370117, "rewards/margins": 24.216018676757812, "rewards/rejected": -35.0526237487793, "step": 987 }, { "epoch": 0.6146189735614308, "grad_norm": 6.9399729909491725e-06, "learning_rate": 4.417934532042416e-06, "logits/chosen": -0.09824991226196289, "logits/rejected": 2.612088203430176, "logps/chosen": -665.5135498046875, "logps/rejected": -1138.546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.184950828552246, "rewards/margins": 32.675193786621094, "rewards/rejected": -44.86014175415039, "step": 988 }, { "epoch": 0.6152410575427683, "grad_norm": 3.5185718536376953, "learning_rate": 4.416781927155371e-06, "logits/chosen": 0.624005913734436, "logits/rejected": 3.345196485519409, "logps/chosen": -449.0107116699219, "logps/rejected": -871.7337036132812, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -8.905702590942383, "rewards/margins": 23.94113540649414, "rewards/rejected": -32.84683609008789, "step": 989 }, { "epoch": 0.6158631415241057, "grad_norm": 21.973644256591797, "learning_rate": 4.4156293222683265e-06, "logits/chosen": -0.036577463150024414, "logits/rejected": 2.1554677486419678, "logps/chosen": -622.6143798828125, "logps/rejected": -868.1095581054688, "loss": 0.1398, "rewards/accuracies": 0.875, "rewards/chosen": -9.636955261230469, "rewards/margins": 20.860532760620117, "rewards/rejected": -30.497488021850586, "step": 990 }, { "epoch": 0.6164852255054433, "grad_norm": 24.79236602783203, "learning_rate": 4.414476717381282e-06, "logits/chosen": 2.160615921020508, "logits/rejected": 3.0709891319274902, "logps/chosen": -594.5836181640625, "logps/rejected": -798.706298828125, "loss": 0.3574, "rewards/accuracies": 0.875, "rewards/chosen": -9.179106712341309, "rewards/margins": 16.09888458251953, "rewards/rejected": -25.277992248535156, "step": 991 }, { "epoch": 0.6171073094867807, "grad_norm": 0.13140863180160522, "learning_rate": 4.413324112494237e-06, "logits/chosen": -2.102931499481201, "logits/rejected": 3.019148588180542, "logps/chosen": -356.99591064453125, "logps/rejected": -933.642333984375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.668631553649902, "rewards/margins": 25.06151580810547, "rewards/rejected": -33.73014831542969, "step": 992 }, { "epoch": 0.6177293934681182, "grad_norm": 15.942180633544922, "learning_rate": 4.412171507607192e-06, "logits/chosen": 2.256786346435547, "logits/rejected": 2.633397340774536, "logps/chosen": -646.35205078125, "logps/rejected": -853.915283203125, "loss": 0.0991, "rewards/accuracies": 1.0, "rewards/chosen": -8.75490951538086, "rewards/margins": 15.246297836303711, "rewards/rejected": -24.00120735168457, "step": 993 }, { "epoch": 0.6183514774494556, "grad_norm": 0.00010330742225050926, "learning_rate": 4.411018902720147e-06, "logits/chosen": -0.5863848924636841, "logits/rejected": 3.1927225589752197, "logps/chosen": -504.6681213378906, "logps/rejected": -1111.0050048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.162359237670898, "rewards/margins": 32.09947204589844, "rewards/rejected": -39.26183319091797, "step": 994 }, { "epoch": 0.6189735614307932, "grad_norm": 6.420986652374268, "learning_rate": 4.4098662978331034e-06, "logits/chosen": -1.7422621250152588, "logits/rejected": 1.580471396446228, "logps/chosen": -557.5867309570312, "logps/rejected": -980.4736938476562, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": -7.687779426574707, "rewards/margins": 21.697011947631836, "rewards/rejected": -29.384790420532227, "step": 995 }, { "epoch": 0.6195956454121306, "grad_norm": 0.06811324506998062, "learning_rate": 4.408713692946059e-06, "logits/chosen": -1.5099706649780273, "logits/rejected": 1.8679317235946655, "logps/chosen": -473.080078125, "logps/rejected": -850.0989990234375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.807185173034668, "rewards/margins": 24.385814666748047, "rewards/rejected": -30.19300079345703, "step": 996 }, { "epoch": 0.6202177293934681, "grad_norm": 0.033417295664548874, "learning_rate": 4.407561088059014e-06, "logits/chosen": -1.5323376655578613, "logits/rejected": 3.1901423931121826, "logps/chosen": -524.0758056640625, "logps/rejected": -959.96484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.175868034362793, "rewards/margins": 24.759984970092773, "rewards/rejected": -31.935853958129883, "step": 997 }, { "epoch": 0.6208398133748056, "grad_norm": 3.334057282700087e-06, "learning_rate": 4.406408483171969e-06, "logits/chosen": -0.14260584115982056, "logits/rejected": 4.323513984680176, "logps/chosen": -528.5975341796875, "logps/rejected": -1003.25390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.384839057922363, "rewards/margins": 27.611757278442383, "rewards/rejected": -34.99659729003906, "step": 998 }, { "epoch": 0.6214618973561431, "grad_norm": 30.795129776000977, "learning_rate": 4.405255878284924e-06, "logits/chosen": 0.9537069201469421, "logits/rejected": 3.2278642654418945, "logps/chosen": -481.7140197753906, "logps/rejected": -847.1483764648438, "loss": 1.1289, "rewards/accuracies": 0.875, "rewards/chosen": -10.633514404296875, "rewards/margins": 20.894466400146484, "rewards/rejected": -31.527982711791992, "step": 999 }, { "epoch": 0.6220839813374806, "grad_norm": 33.78679656982422, "learning_rate": 4.4041032733978796e-06, "logits/chosen": 1.469162106513977, "logits/rejected": 2.2073097229003906, "logps/chosen": -571.0089111328125, "logps/rejected": -838.247802734375, "loss": 1.7458, "rewards/accuracies": 0.875, "rewards/chosen": -10.620084762573242, "rewards/margins": 23.940921783447266, "rewards/rejected": -34.561004638671875, "step": 1000 }, { "epoch": 0.622706065318818, "grad_norm": 0.0004030153213534504, "learning_rate": 4.402950668510835e-06, "logits/chosen": 2.196051597595215, "logits/rejected": 2.730830430984497, "logps/chosen": -716.089111328125, "logps/rejected": -981.1859130859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.918209075927734, "rewards/margins": 24.088151931762695, "rewards/rejected": -40.00636291503906, "step": 1001 }, { "epoch": 0.6233281493001556, "grad_norm": 0.00020120911358390003, "learning_rate": 4.40179806362379e-06, "logits/chosen": -1.0908539295196533, "logits/rejected": 2.8006591796875, "logps/chosen": -419.78668212890625, "logps/rejected": -949.0003051757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.273379802703857, "rewards/margins": 28.251667022705078, "rewards/rejected": -34.525047302246094, "step": 1002 }, { "epoch": 0.623950233281493, "grad_norm": 1.5236693620681763, "learning_rate": 4.400645458736745e-06, "logits/chosen": 0.5440617203712463, "logits/rejected": 2.4226527214050293, "logps/chosen": -589.6771850585938, "logps/rejected": -848.9038696289062, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -7.010832786560059, "rewards/margins": 15.059330940246582, "rewards/rejected": -22.07016372680664, "step": 1003 }, { "epoch": 0.6245723172628305, "grad_norm": 30.517330169677734, "learning_rate": 4.3994928538497005e-06, "logits/chosen": 2.2613136768341064, "logits/rejected": 2.4845120906829834, "logps/chosen": -683.16845703125, "logps/rejected": -969.8983154296875, "loss": 0.3333, "rewards/accuracies": 0.875, "rewards/chosen": -11.087505340576172, "rewards/margins": 21.87521743774414, "rewards/rejected": -32.96272277832031, "step": 1004 }, { "epoch": 0.6251944012441679, "grad_norm": 0.007849554531276226, "learning_rate": 4.398340248962656e-06, "logits/chosen": 3.205324649810791, "logits/rejected": 4.223089218139648, "logps/chosen": -602.585205078125, "logps/rejected": -911.59521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.011014938354492, "rewards/margins": 24.49148178100586, "rewards/rejected": -32.50249481201172, "step": 1005 }, { "epoch": 0.6258164852255055, "grad_norm": 0.010902726091444492, "learning_rate": 4.397187644075611e-06, "logits/chosen": 0.14429256319999695, "logits/rejected": 2.621492624282837, "logps/chosen": -522.736083984375, "logps/rejected": -865.6357421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.426938533782959, "rewards/margins": 22.33446502685547, "rewards/rejected": -28.761404037475586, "step": 1006 }, { "epoch": 0.6264385692068429, "grad_norm": 13.197189331054688, "learning_rate": 4.396035039188566e-06, "logits/chosen": 1.6405010223388672, "logits/rejected": 1.7997384071350098, "logps/chosen": -608.0145874023438, "logps/rejected": -708.7522583007812, "loss": 0.0802, "rewards/accuracies": 1.0, "rewards/chosen": -12.889400482177734, "rewards/margins": 7.958559036254883, "rewards/rejected": -20.847959518432617, "step": 1007 }, { "epoch": 0.6270606531881804, "grad_norm": 0.01054982841014862, "learning_rate": 4.394882434301521e-06, "logits/chosen": -0.7522687315940857, "logits/rejected": 2.160587787628174, "logps/chosen": -489.1429748535156, "logps/rejected": -953.8292236328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.892003059387207, "rewards/margins": 25.63636016845703, "rewards/rejected": -34.52836227416992, "step": 1008 }, { "epoch": 0.6276827371695178, "grad_norm": 40.35072326660156, "learning_rate": 4.3937298294144774e-06, "logits/chosen": -0.01016843318939209, "logits/rejected": 5.0052595138549805, "logps/chosen": -512.0286865234375, "logps/rejected": -1018.2623901367188, "loss": 0.7274, "rewards/accuracies": 0.875, "rewards/chosen": -4.998366355895996, "rewards/margins": 23.161270141601562, "rewards/rejected": -28.159637451171875, "step": 1009 }, { "epoch": 0.6283048211508554, "grad_norm": 52.293617248535156, "learning_rate": 4.392577224527433e-06, "logits/chosen": 1.1491129398345947, "logits/rejected": 3.0440309047698975, "logps/chosen": -722.254150390625, "logps/rejected": -954.6030883789062, "loss": 1.4045, "rewards/accuracies": 0.875, "rewards/chosen": -10.873852729797363, "rewards/margins": 16.21906852722168, "rewards/rejected": -27.09292221069336, "step": 1010 }, { "epoch": 0.6289269051321928, "grad_norm": 0.0604068785905838, "learning_rate": 4.391424619640388e-06, "logits/chosen": 0.5932100415229797, "logits/rejected": 1.7829232215881348, "logps/chosen": -480.3331604003906, "logps/rejected": -731.1142578125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.753337860107422, "rewards/margins": 21.87540626525879, "rewards/rejected": -27.62874412536621, "step": 1011 }, { "epoch": 0.6295489891135303, "grad_norm": 40.562896728515625, "learning_rate": 4.390272014753343e-06, "logits/chosen": -1.131493330001831, "logits/rejected": 2.884080410003662, "logps/chosen": -587.9010009765625, "logps/rejected": -928.799560546875, "loss": 1.0836, "rewards/accuracies": 0.75, "rewards/chosen": -8.43549633026123, "rewards/margins": 21.197429656982422, "rewards/rejected": -29.63292694091797, "step": 1012 }, { "epoch": 0.6301710730948679, "grad_norm": 0.0011462063994258642, "learning_rate": 4.389119409866298e-06, "logits/chosen": 1.7970309257507324, "logits/rejected": 3.385439872741699, "logps/chosen": -638.41015625, "logps/rejected": -985.9293212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.416422843933105, "rewards/margins": 26.92089080810547, "rewards/rejected": -35.337310791015625, "step": 1013 }, { "epoch": 0.6307931570762053, "grad_norm": 0.04923012852668762, "learning_rate": 4.3879668049792536e-06, "logits/chosen": -3.66701340675354, "logits/rejected": 2.5858314037323, "logps/chosen": -333.7469787597656, "logps/rejected": -959.65673828125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.30173110961914, "rewards/margins": 26.938438415527344, "rewards/rejected": -35.240169525146484, "step": 1014 }, { "epoch": 0.6314152410575428, "grad_norm": 11.959614753723145, "learning_rate": 4.386814200092209e-06, "logits/chosen": 1.8299262523651123, "logits/rejected": 3.218519687652588, "logps/chosen": -626.8076171875, "logps/rejected": -905.640625, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": -12.366762161254883, "rewards/margins": 18.138498306274414, "rewards/rejected": -30.505260467529297, "step": 1015 }, { "epoch": 0.6320373250388802, "grad_norm": 0.0018947566859424114, "learning_rate": 4.385661595205164e-06, "logits/chosen": -0.7899747490882874, "logits/rejected": 4.134137153625488, "logps/chosen": -327.9424133300781, "logps/rejected": -877.3003540039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.7264366149902344, "rewards/margins": 26.387657165527344, "rewards/rejected": -30.114093780517578, "step": 1016 }, { "epoch": 0.6326594090202178, "grad_norm": 35.56440734863281, "learning_rate": 4.384508990318119e-06, "logits/chosen": 2.359657049179077, "logits/rejected": 2.69771146774292, "logps/chosen": -820.05224609375, "logps/rejected": -974.1748657226562, "loss": 0.6496, "rewards/accuracies": 0.875, "rewards/chosen": -13.05585765838623, "rewards/margins": 15.927709579467773, "rewards/rejected": -28.983566284179688, "step": 1017 }, { "epoch": 0.6332814930015552, "grad_norm": 1.1538484159245854e-06, "learning_rate": 4.3833563854310744e-06, "logits/chosen": 3.8544185161590576, "logits/rejected": 3.564330577850342, "logps/chosen": -839.0157470703125, "logps/rejected": -1088.1396484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.0699462890625, "rewards/margins": 28.169740676879883, "rewards/rejected": -41.23968505859375, "step": 1018 }, { "epoch": 0.6339035769828927, "grad_norm": 6.535756983794272e-05, "learning_rate": 4.38220378054403e-06, "logits/chosen": 3.4101524353027344, "logits/rejected": 3.27657413482666, "logps/chosen": -806.0890502929688, "logps/rejected": -1069.5372314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.686437606811523, "rewards/margins": 26.94811248779297, "rewards/rejected": -39.634552001953125, "step": 1019 }, { "epoch": 0.6345256609642301, "grad_norm": 8.647769927978516, "learning_rate": 4.381051175656985e-06, "logits/chosen": 0.3077080249786377, "logits/rejected": 2.6129024028778076, "logps/chosen": -582.3071899414062, "logps/rejected": -991.269287109375, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": -9.514552116394043, "rewards/margins": 20.795188903808594, "rewards/rejected": -30.30974006652832, "step": 1020 }, { "epoch": 0.6351477449455677, "grad_norm": 28.72833824157715, "learning_rate": 4.37989857076994e-06, "logits/chosen": -0.33616751432418823, "logits/rejected": 2.4276583194732666, "logps/chosen": -450.0052185058594, "logps/rejected": -819.1146240234375, "loss": 0.5203, "rewards/accuracies": 0.875, "rewards/chosen": -6.684841632843018, "rewards/margins": 26.024911880493164, "rewards/rejected": -32.709754943847656, "step": 1021 }, { "epoch": 0.6357698289269051, "grad_norm": 0.21740223467350006, "learning_rate": 4.378745965882895e-06, "logits/chosen": 0.4956507682800293, "logits/rejected": 1.2035505771636963, "logps/chosen": -541.427490234375, "logps/rejected": -844.2938232421875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -9.296308517456055, "rewards/margins": 21.546104431152344, "rewards/rejected": -30.8424129486084, "step": 1022 }, { "epoch": 0.6363919129082426, "grad_norm": 2.216559648513794, "learning_rate": 4.3775933609958506e-06, "logits/chosen": 2.147252321243286, "logits/rejected": 4.214204788208008, "logps/chosen": -626.7850341796875, "logps/rejected": -1001.1859130859375, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -8.586669921875, "rewards/margins": 21.359535217285156, "rewards/rejected": -29.946203231811523, "step": 1023 }, { "epoch": 0.63701399688958, "grad_norm": 23.62967872619629, "learning_rate": 4.376440756108807e-06, "logits/chosen": 0.6927123069763184, "logits/rejected": 3.6509900093078613, "logps/chosen": -519.7496337890625, "logps/rejected": -928.06396484375, "loss": 0.5355, "rewards/accuracies": 0.875, "rewards/chosen": -8.055206298828125, "rewards/margins": 18.006229400634766, "rewards/rejected": -26.061437606811523, "step": 1024 }, { "epoch": 0.6376360808709176, "grad_norm": 0.0004841023765038699, "learning_rate": 4.375288151221762e-06, "logits/chosen": 1.0736597776412964, "logits/rejected": 4.022658348083496, "logps/chosen": -565.89208984375, "logps/rejected": -976.2791137695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.828113555908203, "rewards/margins": 28.443811416625977, "rewards/rejected": -37.27192687988281, "step": 1025 }, { "epoch": 0.6382581648522551, "grad_norm": 49.700984954833984, "learning_rate": 4.374135546334717e-06, "logits/chosen": 3.4103102684020996, "logits/rejected": 2.3115811347961426, "logps/chosen": -681.27197265625, "logps/rejected": -830.0875244140625, "loss": 2.0367, "rewards/accuracies": 0.75, "rewards/chosen": -12.577679634094238, "rewards/margins": 16.04987144470215, "rewards/rejected": -28.627552032470703, "step": 1026 }, { "epoch": 0.6388802488335925, "grad_norm": 0.06918217986822128, "learning_rate": 4.372982941447672e-06, "logits/chosen": -1.2822195291519165, "logits/rejected": 2.634605646133423, "logps/chosen": -509.82550048828125, "logps/rejected": -1073.20751953125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -10.181923866271973, "rewards/margins": 29.78413200378418, "rewards/rejected": -39.96605682373047, "step": 1027 }, { "epoch": 0.63950233281493, "grad_norm": 2.3471317291259766, "learning_rate": 4.3718303365606275e-06, "logits/chosen": 0.784964919090271, "logits/rejected": 2.4620285034179688, "logps/chosen": -525.9091796875, "logps/rejected": -822.0399169921875, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -5.523943901062012, "rewards/margins": 22.30738067626953, "rewards/rejected": -27.83132553100586, "step": 1028 }, { "epoch": 0.6401244167962675, "grad_norm": 2.681765920442558e-07, "learning_rate": 4.370677731673583e-06, "logits/chosen": 0.8570016622543335, "logits/rejected": 3.016300678253174, "logps/chosen": -529.441650390625, "logps/rejected": -956.5719604492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.739961624145508, "rewards/margins": 32.02619934082031, "rewards/rejected": -40.76616287231445, "step": 1029 }, { "epoch": 0.640746500777605, "grad_norm": 3.9567577838897705, "learning_rate": 4.369525126786538e-06, "logits/chosen": -3.876920700073242, "logits/rejected": 1.7937712669372559, "logps/chosen": -355.7938232421875, "logps/rejected": -930.0498657226562, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": -3.368406295776367, "rewards/margins": 25.666030883789062, "rewards/rejected": -29.034439086914062, "step": 1030 }, { "epoch": 0.6413685847589424, "grad_norm": 0.2327953577041626, "learning_rate": 4.368372521899493e-06, "logits/chosen": 2.2502920627593994, "logits/rejected": 3.6756107807159424, "logps/chosen": -637.3325805664062, "logps/rejected": -1016.4441528320312, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -7.519428253173828, "rewards/margins": 23.643564224243164, "rewards/rejected": -31.162994384765625, "step": 1031 }, { "epoch": 0.64199066874028, "grad_norm": 1.140275478363037, "learning_rate": 4.3672199170124484e-06, "logits/chosen": 2.291574001312256, "logits/rejected": 3.5964736938476562, "logps/chosen": -700.5925903320312, "logps/rejected": -1020.0197143554688, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -10.006669998168945, "rewards/margins": 21.45091438293457, "rewards/rejected": -31.457584381103516, "step": 1032 }, { "epoch": 0.6426127527216174, "grad_norm": 2.2512295246124268, "learning_rate": 4.366067312125404e-06, "logits/chosen": 0.2611212432384491, "logits/rejected": 3.5303797721862793, "logps/chosen": -486.99786376953125, "logps/rejected": -896.658203125, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -10.969782829284668, "rewards/margins": 21.509904861450195, "rewards/rejected": -32.47968673706055, "step": 1033 }, { "epoch": 0.6432348367029549, "grad_norm": 0.03622567281126976, "learning_rate": 4.364914707238359e-06, "logits/chosen": -1.3039064407348633, "logits/rejected": 3.5805559158325195, "logps/chosen": -321.51263427734375, "logps/rejected": -974.6703491210938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.9876842498779297, "rewards/margins": 33.68848419189453, "rewards/rejected": -37.676170349121094, "step": 1034 }, { "epoch": 0.6438569206842923, "grad_norm": 0.006617639679461718, "learning_rate": 4.363762102351314e-06, "logits/chosen": 0.1614302396774292, "logits/rejected": 3.4580628871917725, "logps/chosen": -348.7968444824219, "logps/rejected": -815.583740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.265744686126709, "rewards/margins": 26.007923126220703, "rewards/rejected": -32.27366638183594, "step": 1035 }, { "epoch": 0.6444790046656299, "grad_norm": 7.089043140411377, "learning_rate": 4.362609497464269e-06, "logits/chosen": 0.5390743017196655, "logits/rejected": 2.8165862560272217, "logps/chosen": -610.386474609375, "logps/rejected": -993.70263671875, "loss": 0.0791, "rewards/accuracies": 1.0, "rewards/chosen": -11.227594375610352, "rewards/margins": 21.322710037231445, "rewards/rejected": -32.5503044128418, "step": 1036 }, { "epoch": 0.6451010886469674, "grad_norm": 10.708149909973145, "learning_rate": 4.3614568925772246e-06, "logits/chosen": 2.2750465869903564, "logits/rejected": 3.9014241695404053, "logps/chosen": -666.2877197265625, "logps/rejected": -926.8017578125, "loss": 0.0739, "rewards/accuracies": 1.0, "rewards/chosen": -8.22494125366211, "rewards/margins": 16.43631935119629, "rewards/rejected": -24.66126251220703, "step": 1037 }, { "epoch": 0.6457231726283048, "grad_norm": 0.009115724824368954, "learning_rate": 4.360304287690181e-06, "logits/chosen": -1.581646203994751, "logits/rejected": 0.9383874535560608, "logps/chosen": -342.2408447265625, "logps/rejected": -759.0081176757812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.06154727935791, "rewards/margins": 25.451290130615234, "rewards/rejected": -31.512836456298828, "step": 1038 }, { "epoch": 0.6463452566096423, "grad_norm": 4.3823953888022515e-07, "learning_rate": 4.359151682803136e-06, "logits/chosen": 0.9563695192337036, "logits/rejected": 4.24501895904541, "logps/chosen": -538.5618286132812, "logps/rejected": -1025.32177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.323747634887695, "rewards/margins": 28.09737205505371, "rewards/rejected": -38.421119689941406, "step": 1039 }, { "epoch": 0.6469673405909798, "grad_norm": 53.47418212890625, "learning_rate": 4.357999077916091e-06, "logits/chosen": 3.6600234508514404, "logits/rejected": 4.332643508911133, "logps/chosen": -763.8884887695312, "logps/rejected": -1002.970458984375, "loss": 3.4196, "rewards/accuracies": 0.75, "rewards/chosen": -12.286805152893066, "rewards/margins": 16.814016342163086, "rewards/rejected": -29.100818634033203, "step": 1040 }, { "epoch": 0.6475894245723173, "grad_norm": 36.0056266784668, "learning_rate": 4.356846473029046e-06, "logits/chosen": 0.25408101081848145, "logits/rejected": 2.7115085124969482, "logps/chosen": -581.7474365234375, "logps/rejected": -872.8613891601562, "loss": 0.3032, "rewards/accuracies": 0.875, "rewards/chosen": -13.85314655303955, "rewards/margins": 20.30516242980957, "rewards/rejected": -34.15830612182617, "step": 1041 }, { "epoch": 0.6482115085536547, "grad_norm": 0.3534427285194397, "learning_rate": 4.3556938681420015e-06, "logits/chosen": 0.19164976477622986, "logits/rejected": 2.4058098793029785, "logps/chosen": -709.7728271484375, "logps/rejected": -1034.244384765625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -10.23792552947998, "rewards/margins": 23.737945556640625, "rewards/rejected": -33.97587585449219, "step": 1042 }, { "epoch": 0.6488335925349922, "grad_norm": 35.1163444519043, "learning_rate": 4.354541263254957e-06, "logits/chosen": 0.810158371925354, "logits/rejected": 2.667872428894043, "logps/chosen": -640.8101196289062, "logps/rejected": -909.2906494140625, "loss": 1.0731, "rewards/accuracies": 0.875, "rewards/chosen": -12.474588394165039, "rewards/margins": 19.483205795288086, "rewards/rejected": -31.957794189453125, "step": 1043 }, { "epoch": 0.6494556765163297, "grad_norm": 25.680383682250977, "learning_rate": 4.353388658367912e-06, "logits/chosen": -3.0454163551330566, "logits/rejected": 3.9892959594726562, "logps/chosen": -402.1624755859375, "logps/rejected": -987.141357421875, "loss": 0.2195, "rewards/accuracies": 0.875, "rewards/chosen": -6.043094635009766, "rewards/margins": 29.327125549316406, "rewards/rejected": -35.37022018432617, "step": 1044 }, { "epoch": 0.6500777604976672, "grad_norm": 5.064794540405273, "learning_rate": 4.352236053480867e-06, "logits/chosen": 1.396227240562439, "logits/rejected": 4.078095436096191, "logps/chosen": -505.2065124511719, "logps/rejected": -839.9376220703125, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": -6.201690673828125, "rewards/margins": 19.825117111206055, "rewards/rejected": -26.02680778503418, "step": 1045 }, { "epoch": 0.6506998444790046, "grad_norm": 0.008247487246990204, "learning_rate": 4.3510834485938224e-06, "logits/chosen": -2.75317120552063, "logits/rejected": 2.8981680870056152, "logps/chosen": -273.85064697265625, "logps/rejected": -738.4959106445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.633110523223877, "rewards/margins": 20.329675674438477, "rewards/rejected": -24.962783813476562, "step": 1046 }, { "epoch": 0.6513219284603421, "grad_norm": 6.141415119171143, "learning_rate": 4.349930843706778e-06, "logits/chosen": 2.316647529602051, "logits/rejected": 3.3674442768096924, "logps/chosen": -659.9111938476562, "logps/rejected": -934.2586669921875, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": -9.761323928833008, "rewards/margins": 20.5927677154541, "rewards/rejected": -30.35409164428711, "step": 1047 }, { "epoch": 0.6519440124416797, "grad_norm": 0.015566140413284302, "learning_rate": 4.348778238819733e-06, "logits/chosen": 2.220116138458252, "logits/rejected": 4.262792110443115, "logps/chosen": -594.0078735351562, "logps/rejected": -833.1097412109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.603310585021973, "rewards/margins": 16.571250915527344, "rewards/rejected": -24.174562454223633, "step": 1048 }, { "epoch": 0.6525660964230171, "grad_norm": 0.9627737402915955, "learning_rate": 4.347625633932688e-06, "logits/chosen": 1.216709017753601, "logits/rejected": 4.047371864318848, "logps/chosen": -513.246826171875, "logps/rejected": -946.6227416992188, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -10.843459129333496, "rewards/margins": 19.405704498291016, "rewards/rejected": -30.249164581298828, "step": 1049 }, { "epoch": 0.6531881804043546, "grad_norm": 0.03551546484231949, "learning_rate": 4.346473029045643e-06, "logits/chosen": -0.7971693873405457, "logits/rejected": 2.8314757347106934, "logps/chosen": -499.04095458984375, "logps/rejected": -885.7818603515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.940540313720703, "rewards/margins": 17.48625946044922, "rewards/rejected": -26.426803588867188, "step": 1050 }, { "epoch": 0.6538102643856921, "grad_norm": 12.715831756591797, "learning_rate": 4.3453204241585986e-06, "logits/chosen": 1.9154138565063477, "logits/rejected": 4.475796699523926, "logps/chosen": -618.64111328125, "logps/rejected": -1036.7139892578125, "loss": 0.1346, "rewards/accuracies": 0.875, "rewards/chosen": -12.447593688964844, "rewards/margins": 20.037174224853516, "rewards/rejected": -32.48476791381836, "step": 1051 }, { "epoch": 0.6544323483670296, "grad_norm": 3.329910396132618e-05, "learning_rate": 4.344167819271554e-06, "logits/chosen": 1.1139119863510132, "logits/rejected": 4.082623481750488, "logps/chosen": -543.2000122070312, "logps/rejected": -942.43212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.591399192810059, "rewards/margins": 23.034143447875977, "rewards/rejected": -31.62554168701172, "step": 1052 }, { "epoch": 0.655054432348367, "grad_norm": 0.15551453828811646, "learning_rate": 4.34301521438451e-06, "logits/chosen": 1.3609882593154907, "logits/rejected": 4.041923522949219, "logps/chosen": -533.775146484375, "logps/rejected": -987.229248046875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.904748916625977, "rewards/margins": 26.243389129638672, "rewards/rejected": -35.14813995361328, "step": 1053 }, { "epoch": 0.6556765163297045, "grad_norm": 0.6531232595443726, "learning_rate": 4.341862609497465e-06, "logits/chosen": 0.10662335157394409, "logits/rejected": 2.761049509048462, "logps/chosen": -542.4237060546875, "logps/rejected": -890.3243408203125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -7.279252052307129, "rewards/margins": 21.53714942932129, "rewards/rejected": -28.81639862060547, "step": 1054 }, { "epoch": 0.656298600311042, "grad_norm": 0.011171232908964157, "learning_rate": 4.34071000461042e-06, "logits/chosen": 1.9401049613952637, "logits/rejected": 4.1546549797058105, "logps/chosen": -606.16552734375, "logps/rejected": -992.0186157226562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.93386173248291, "rewards/margins": 28.016857147216797, "rewards/rejected": -36.950721740722656, "step": 1055 }, { "epoch": 0.6569206842923795, "grad_norm": 5.907070636749268, "learning_rate": 4.3395573997233755e-06, "logits/chosen": -2.723787784576416, "logits/rejected": 1.073384165763855, "logps/chosen": -247.42018127441406, "logps/rejected": -671.1397094726562, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": -2.980680465698242, "rewards/margins": 20.786914825439453, "rewards/rejected": -23.767597198486328, "step": 1056 }, { "epoch": 0.6575427682737169, "grad_norm": 2.8011792892357334e-05, "learning_rate": 4.338404794836331e-06, "logits/chosen": -2.1832637786865234, "logits/rejected": 3.1213631629943848, "logps/chosen": -317.74371337890625, "logps/rejected": -867.841552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.91716194152832, "rewards/margins": 26.970073699951172, "rewards/rejected": -31.887237548828125, "step": 1057 }, { "epoch": 0.6581648522550544, "grad_norm": 0.0006950918468646705, "learning_rate": 4.337252189949286e-06, "logits/chosen": -0.36333489418029785, "logits/rejected": 2.7499642372131348, "logps/chosen": -508.69482421875, "logps/rejected": -994.1200561523438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.09777545928955, "rewards/margins": 26.999156951904297, "rewards/rejected": -36.09693145751953, "step": 1058 }, { "epoch": 0.658786936236392, "grad_norm": 2.4685513973236084, "learning_rate": 4.336099585062241e-06, "logits/chosen": -0.1563507318496704, "logits/rejected": 3.1660311222076416, "logps/chosen": -530.2171630859375, "logps/rejected": -1001.8350219726562, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -12.256999969482422, "rewards/margins": 25.335674285888672, "rewards/rejected": -37.592674255371094, "step": 1059 }, { "epoch": 0.6594090202177294, "grad_norm": 0.9450600743293762, "learning_rate": 4.3349469801751964e-06, "logits/chosen": 1.899235486984253, "logits/rejected": 2.9143006801605225, "logps/chosen": -607.775146484375, "logps/rejected": -880.9578247070312, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -9.9500732421875, "rewards/margins": 20.845109939575195, "rewards/rejected": -30.795185089111328, "step": 1060 }, { "epoch": 0.6600311041990669, "grad_norm": 6.812902450561523, "learning_rate": 4.333794375288152e-06, "logits/chosen": 0.020940184593200684, "logits/rejected": 3.1008946895599365, "logps/chosen": -641.708251953125, "logps/rejected": -1082.6326904296875, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": -11.005695343017578, "rewards/margins": 22.83586883544922, "rewards/rejected": -33.8415641784668, "step": 1061 }, { "epoch": 0.6606531881804043, "grad_norm": 39.42338180541992, "learning_rate": 4.332641770401107e-06, "logits/chosen": 2.0118627548217773, "logits/rejected": 4.52200984954834, "logps/chosen": -578.223388671875, "logps/rejected": -1029.5029296875, "loss": 0.7323, "rewards/accuracies": 0.875, "rewards/chosen": -10.329606056213379, "rewards/margins": 25.680768966674805, "rewards/rejected": -36.010372161865234, "step": 1062 }, { "epoch": 0.6612752721617419, "grad_norm": 25.15177345275879, "learning_rate": 4.331489165514062e-06, "logits/chosen": 2.8278603553771973, "logits/rejected": 4.047382831573486, "logps/chosen": -703.8547973632812, "logps/rejected": -873.251953125, "loss": 0.1875, "rewards/accuracies": 0.875, "rewards/chosen": -12.270652770996094, "rewards/margins": 9.811416625976562, "rewards/rejected": -22.082069396972656, "step": 1063 }, { "epoch": 0.6618973561430793, "grad_norm": 5.5414453527191654e-05, "learning_rate": 4.330336560627017e-06, "logits/chosen": 2.8537893295288086, "logits/rejected": 5.341554164886475, "logps/chosen": -699.42626953125, "logps/rejected": -1039.4193115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.638263702392578, "rewards/margins": 21.15549659729004, "rewards/rejected": -30.793758392333984, "step": 1064 }, { "epoch": 0.6625194401244168, "grad_norm": 0.4664863348007202, "learning_rate": 4.3291839557399726e-06, "logits/chosen": 1.168850302696228, "logits/rejected": 4.478352069854736, "logps/chosen": -556.7830200195312, "logps/rejected": -1005.1434326171875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -7.349123954772949, "rewards/margins": 23.540437698364258, "rewards/rejected": -30.889562606811523, "step": 1065 }, { "epoch": 0.6631415241057543, "grad_norm": 0.7163582444190979, "learning_rate": 4.328031350852928e-06, "logits/chosen": 1.0548595190048218, "logits/rejected": 3.2712948322296143, "logps/chosen": -639.1087646484375, "logps/rejected": -1086.1822509765625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -9.083908081054688, "rewards/margins": 25.394393920898438, "rewards/rejected": -34.478302001953125, "step": 1066 }, { "epoch": 0.6637636080870918, "grad_norm": 33.42515182495117, "learning_rate": 4.326878745965883e-06, "logits/chosen": -0.47864389419555664, "logits/rejected": 2.4255480766296387, "logps/chosen": -567.6307983398438, "logps/rejected": -969.3095703125, "loss": 0.6311, "rewards/accuracies": 0.875, "rewards/chosen": -9.092820167541504, "rewards/margins": 23.71457290649414, "rewards/rejected": -32.807395935058594, "step": 1067 }, { "epoch": 0.6643856920684292, "grad_norm": 12.694299697875977, "learning_rate": 4.325726141078839e-06, "logits/chosen": -1.4194493293762207, "logits/rejected": 3.0020575523376465, "logps/chosen": -451.8589172363281, "logps/rejected": -883.7705078125, "loss": 0.0606, "rewards/accuracies": 1.0, "rewards/chosen": -6.017159461975098, "rewards/margins": 20.179271697998047, "rewards/rejected": -26.196434020996094, "step": 1068 }, { "epoch": 0.6650077760497667, "grad_norm": 0.006870250217616558, "learning_rate": 4.324573536191794e-06, "logits/chosen": 1.6804357767105103, "logits/rejected": 5.030431747436523, "logps/chosen": -512.820556640625, "logps/rejected": -1007.7488403320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.801078796386719, "rewards/margins": 25.51949691772461, "rewards/rejected": -34.32057571411133, "step": 1069 }, { "epoch": 0.6656298600311042, "grad_norm": 32.28739547729492, "learning_rate": 4.3234209313047495e-06, "logits/chosen": 1.4529192447662354, "logits/rejected": 3.566375494003296, "logps/chosen": -579.253173828125, "logps/rejected": -901.6878051757812, "loss": 0.72, "rewards/accuracies": 0.875, "rewards/chosen": -9.97698974609375, "rewards/margins": 17.470598220825195, "rewards/rejected": -27.447586059570312, "step": 1070 }, { "epoch": 0.6662519440124417, "grad_norm": 0.3848552703857422, "learning_rate": 4.322268326417705e-06, "logits/chosen": -1.314362645149231, "logits/rejected": 3.793914794921875, "logps/chosen": -444.42431640625, "logps/rejected": -918.371826171875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -11.004027366638184, "rewards/margins": 23.923250198364258, "rewards/rejected": -34.927276611328125, "step": 1071 }, { "epoch": 0.6668740279937792, "grad_norm": 7.505640983581543, "learning_rate": 4.32111572153066e-06, "logits/chosen": 1.2513737678527832, "logits/rejected": 3.1623566150665283, "logps/chosen": -580.482177734375, "logps/rejected": -918.35205078125, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": -7.184943199157715, "rewards/margins": 20.685853958129883, "rewards/rejected": -27.87079620361328, "step": 1072 }, { "epoch": 0.6674961119751166, "grad_norm": 0.02148018218576908, "learning_rate": 4.319963116643615e-06, "logits/chosen": -1.3228559494018555, "logits/rejected": 2.4585723876953125, "logps/chosen": -362.78375244140625, "logps/rejected": -723.1258544921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.308596134185791, "rewards/margins": 14.433586120605469, "rewards/rejected": -19.742183685302734, "step": 1073 }, { "epoch": 0.6681181959564542, "grad_norm": 3.208101406926289e-05, "learning_rate": 4.31881051175657e-06, "logits/chosen": 0.2509106993675232, "logits/rejected": 3.2502963542938232, "logps/chosen": -535.8070068359375, "logps/rejected": -910.6559448242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.306463718414307, "rewards/margins": 25.636838912963867, "rewards/rejected": -31.943300247192383, "step": 1074 }, { "epoch": 0.6687402799377916, "grad_norm": 9.964673154172488e-06, "learning_rate": 4.317657906869526e-06, "logits/chosen": 1.126412034034729, "logits/rejected": 4.417489528656006, "logps/chosen": -470.00775146484375, "logps/rejected": -927.0919189453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.198090553283691, "rewards/margins": 26.117660522460938, "rewards/rejected": -36.31575012207031, "step": 1075 }, { "epoch": 0.6693623639191291, "grad_norm": 0.0008977479301393032, "learning_rate": 4.316505301982481e-06, "logits/chosen": 0.4949992299079895, "logits/rejected": 3.2071704864501953, "logps/chosen": -493.83941650390625, "logps/rejected": -869.6880493164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.259034156799316, "rewards/margins": 23.93576431274414, "rewards/rejected": -32.19479751586914, "step": 1076 }, { "epoch": 0.6699844479004665, "grad_norm": 3.079448938369751, "learning_rate": 4.315352697095436e-06, "logits/chosen": 0.8684056401252747, "logits/rejected": 2.2930994033813477, "logps/chosen": -553.3845825195312, "logps/rejected": -812.5850830078125, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -10.49267578125, "rewards/margins": 17.628101348876953, "rewards/rejected": -28.120777130126953, "step": 1077 }, { "epoch": 0.6706065318818041, "grad_norm": 41.547515869140625, "learning_rate": 4.314200092208391e-06, "logits/chosen": 0.5994904041290283, "logits/rejected": 2.7967746257781982, "logps/chosen": -611.67626953125, "logps/rejected": -939.912841796875, "loss": 0.5201, "rewards/accuracies": 0.875, "rewards/chosen": -14.551267623901367, "rewards/margins": 19.001649856567383, "rewards/rejected": -33.552913665771484, "step": 1078 }, { "epoch": 0.6712286158631415, "grad_norm": 22.828508377075195, "learning_rate": 4.3130474873213465e-06, "logits/chosen": -1.1581928730010986, "logits/rejected": 2.9677014350891113, "logps/chosen": -476.2674560546875, "logps/rejected": -992.6043701171875, "loss": 0.5091, "rewards/accuracies": 0.875, "rewards/chosen": -5.894004821777344, "rewards/margins": 30.105693817138672, "rewards/rejected": -35.999698638916016, "step": 1079 }, { "epoch": 0.671850699844479, "grad_norm": 8.702930450439453, "learning_rate": 4.311894882434302e-06, "logits/chosen": 1.4790234565734863, "logits/rejected": 2.22629451751709, "logps/chosen": -553.0067138671875, "logps/rejected": -781.3177490234375, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": -12.244466781616211, "rewards/margins": 18.253623962402344, "rewards/rejected": -30.498090744018555, "step": 1080 }, { "epoch": 0.6724727838258164, "grad_norm": 4.632612705230713, "learning_rate": 4.310742277547257e-06, "logits/chosen": -1.0388520956039429, "logits/rejected": 3.8379063606262207, "logps/chosen": -415.9241027832031, "logps/rejected": -862.2066650390625, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": -6.942617893218994, "rewards/margins": 18.617382049560547, "rewards/rejected": -25.559999465942383, "step": 1081 }, { "epoch": 0.673094867807154, "grad_norm": 1.0436056982143782e-05, "learning_rate": 4.309589672660213e-06, "logits/chosen": 0.40221107006073, "logits/rejected": 3.482527017593384, "logps/chosen": -478.3935546875, "logps/rejected": -928.32080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.656597137451172, "rewards/margins": 24.38768768310547, "rewards/rejected": -32.044288635253906, "step": 1082 }, { "epoch": 0.6737169517884914, "grad_norm": 0.0026403770316392183, "learning_rate": 4.308437067773168e-06, "logits/chosen": 1.7504465579986572, "logits/rejected": 3.3605589866638184, "logps/chosen": -625.5438232421875, "logps/rejected": -1023.2490844726562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.362934112548828, "rewards/margins": 24.983165740966797, "rewards/rejected": -37.346099853515625, "step": 1083 }, { "epoch": 0.6743390357698289, "grad_norm": 1.1153315305709839, "learning_rate": 4.3072844628861235e-06, "logits/chosen": -2.518407106399536, "logits/rejected": 2.908334970474243, "logps/chosen": -213.45147705078125, "logps/rejected": -750.7979736328125, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -2.212100028991699, "rewards/margins": 24.980051040649414, "rewards/rejected": -27.192150115966797, "step": 1084 }, { "epoch": 0.6749611197511665, "grad_norm": 1.64064513228368e-05, "learning_rate": 4.306131857999079e-06, "logits/chosen": -2.173828363418579, "logits/rejected": 3.981977939605713, "logps/chosen": -264.6275634765625, "logps/rejected": -981.3345947265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.846001625061035, "rewards/margins": 35.636131286621094, "rewards/rejected": -40.48213195800781, "step": 1085 }, { "epoch": 0.6755832037325039, "grad_norm": 0.5660700798034668, "learning_rate": 4.304979253112034e-06, "logits/chosen": -2.6855359077453613, "logits/rejected": 3.02661395072937, "logps/chosen": -393.4590759277344, "logps/rejected": -945.981689453125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.0285325050354, "rewards/margins": 25.920177459716797, "rewards/rejected": -30.948711395263672, "step": 1086 }, { "epoch": 0.6762052877138414, "grad_norm": 0.030613282695412636, "learning_rate": 4.303826648224989e-06, "logits/chosen": 1.5232200622558594, "logits/rejected": 4.597107887268066, "logps/chosen": -372.9260559082031, "logps/rejected": -692.3011474609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.295344352722168, "rewards/margins": 18.140079498291016, "rewards/rejected": -26.435422897338867, "step": 1087 }, { "epoch": 0.6768273716951788, "grad_norm": 37.187347412109375, "learning_rate": 4.302674043337944e-06, "logits/chosen": -2.1207356452941895, "logits/rejected": 3.0639090538024902, "logps/chosen": -440.3720703125, "logps/rejected": -1019.062255859375, "loss": 0.8417, "rewards/accuracies": 0.875, "rewards/chosen": -9.682382583618164, "rewards/margins": 26.8316650390625, "rewards/rejected": -36.51404571533203, "step": 1088 }, { "epoch": 0.6774494556765164, "grad_norm": 29.836376190185547, "learning_rate": 4.3015214384509e-06, "logits/chosen": 1.0716478824615479, "logits/rejected": 2.154702663421631, "logps/chosen": -618.8206176757812, "logps/rejected": -877.2896728515625, "loss": 0.2427, "rewards/accuracies": 0.875, "rewards/chosen": -8.232145309448242, "rewards/margins": 20.821189880371094, "rewards/rejected": -29.053335189819336, "step": 1089 }, { "epoch": 0.6780715396578538, "grad_norm": 42.209190368652344, "learning_rate": 4.300368833563855e-06, "logits/chosen": 1.3273580074310303, "logits/rejected": 1.303884506225586, "logps/chosen": -744.8178100585938, "logps/rejected": -790.72021484375, "loss": 1.0787, "rewards/accuracies": 0.75, "rewards/chosen": -8.935836791992188, "rewards/margins": 10.744930267333984, "rewards/rejected": -19.68076515197754, "step": 1090 }, { "epoch": 0.6786936236391913, "grad_norm": 0.10251186788082123, "learning_rate": 4.29921622867681e-06, "logits/chosen": 0.6547682285308838, "logits/rejected": 3.9677000045776367, "logps/chosen": -543.1813354492188, "logps/rejected": -987.18017578125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -10.872847557067871, "rewards/margins": 27.053184509277344, "rewards/rejected": -37.92603302001953, "step": 1091 }, { "epoch": 0.6793157076205287, "grad_norm": 2.1696592739317566e-05, "learning_rate": 4.298063623789765e-06, "logits/chosen": -1.0745490789413452, "logits/rejected": 3.7233543395996094, "logps/chosen": -304.859619140625, "logps/rejected": -808.3275146484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.9483261108398438, "rewards/margins": 24.147890090942383, "rewards/rejected": -27.096214294433594, "step": 1092 }, { "epoch": 0.6799377916018663, "grad_norm": 2.565423011779785, "learning_rate": 4.2969110189027205e-06, "logits/chosen": -3.332108497619629, "logits/rejected": 2.945809841156006, "logps/chosen": -467.0668640136719, "logps/rejected": -1202.020263671875, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -8.956332206726074, "rewards/margins": 32.6529655456543, "rewards/rejected": -41.60929870605469, "step": 1093 }, { "epoch": 0.6805598755832037, "grad_norm": 8.504562377929688, "learning_rate": 4.295758414015676e-06, "logits/chosen": -1.0805021524429321, "logits/rejected": 3.7121896743774414, "logps/chosen": -542.5369262695312, "logps/rejected": -1073.4661865234375, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": -12.501508712768555, "rewards/margins": 22.639549255371094, "rewards/rejected": -35.14105987548828, "step": 1094 }, { "epoch": 0.6811819595645412, "grad_norm": 0.003098748391494155, "learning_rate": 4.294605809128631e-06, "logits/chosen": -1.046175241470337, "logits/rejected": 4.001430511474609, "logps/chosen": -493.8402404785156, "logps/rejected": -1004.2360229492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.581122398376465, "rewards/margins": 22.447479248046875, "rewards/rejected": -33.028602600097656, "step": 1095 }, { "epoch": 0.6818040435458786, "grad_norm": 0.005420563742518425, "learning_rate": 4.293453204241586e-06, "logits/chosen": -2.7170073986053467, "logits/rejected": 4.104083061218262, "logps/chosen": -232.8094482421875, "logps/rejected": -865.2706298828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.027500867843628, "rewards/margins": 31.173065185546875, "rewards/rejected": -33.200565338134766, "step": 1096 }, { "epoch": 0.6824261275272162, "grad_norm": 19.43113899230957, "learning_rate": 4.2923005993545414e-06, "logits/chosen": 2.000507354736328, "logits/rejected": 3.084726572036743, "logps/chosen": -587.5668334960938, "logps/rejected": -902.2487182617188, "loss": 0.0842, "rewards/accuracies": 1.0, "rewards/chosen": -12.952615737915039, "rewards/margins": 22.29494285583496, "rewards/rejected": -35.24755859375, "step": 1097 }, { "epoch": 0.6830482115085537, "grad_norm": 39.97905349731445, "learning_rate": 4.291147994467497e-06, "logits/chosen": -0.7416508197784424, "logits/rejected": 1.96217679977417, "logps/chosen": -488.38623046875, "logps/rejected": -880.5936889648438, "loss": 0.5121, "rewards/accuracies": 0.875, "rewards/chosen": -7.624948024749756, "rewards/margins": 23.470813751220703, "rewards/rejected": -31.095762252807617, "step": 1098 }, { "epoch": 0.6836702954898911, "grad_norm": 34.53898620605469, "learning_rate": 4.289995389580452e-06, "logits/chosen": 0.8552457094192505, "logits/rejected": 2.1850810050964355, "logps/chosen": -686.6106567382812, "logps/rejected": -946.2464599609375, "loss": 0.4666, "rewards/accuracies": 0.875, "rewards/chosen": -6.199734687805176, "rewards/margins": 21.99437713623047, "rewards/rejected": -28.194110870361328, "step": 1099 }, { "epoch": 0.6842923794712286, "grad_norm": 0.003329685889184475, "learning_rate": 4.288842784693407e-06, "logits/chosen": 0.4598864018917084, "logits/rejected": 4.591629981994629, "logps/chosen": -522.9757080078125, "logps/rejected": -1050.53515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.485584259033203, "rewards/margins": 27.7689151763916, "rewards/rejected": -38.25450134277344, "step": 1100 }, { "epoch": 0.6849144634525661, "grad_norm": 12.893879890441895, "learning_rate": 4.287690179806362e-06, "logits/chosen": 0.198805034160614, "logits/rejected": 2.8921289443969727, "logps/chosen": -504.9701232910156, "logps/rejected": -907.6068115234375, "loss": 0.0672, "rewards/accuracies": 1.0, "rewards/chosen": -6.6027445793151855, "rewards/margins": 25.79681396484375, "rewards/rejected": -32.399559020996094, "step": 1101 }, { "epoch": 0.6855365474339036, "grad_norm": 1.0506843328475952, "learning_rate": 4.2865375749193176e-06, "logits/chosen": 0.5944235324859619, "logits/rejected": 2.503926992416382, "logps/chosen": -628.24951171875, "logps/rejected": -913.1500854492188, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -12.653724670410156, "rewards/margins": 21.677860260009766, "rewards/rejected": -34.33158493041992, "step": 1102 }, { "epoch": 0.686158631415241, "grad_norm": 0.0006199941853992641, "learning_rate": 4.285384970032273e-06, "logits/chosen": -0.5382259488105774, "logits/rejected": 1.0772992372512817, "logps/chosen": -622.629638671875, "logps/rejected": -902.0396118164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.727117538452148, "rewards/margins": 21.430234909057617, "rewards/rejected": -32.157352447509766, "step": 1103 }, { "epoch": 0.6867807153965786, "grad_norm": 7.48344612121582, "learning_rate": 4.284232365145228e-06, "logits/chosen": 1.367098093032837, "logits/rejected": 4.316980361938477, "logps/chosen": -645.3848266601562, "logps/rejected": -1083.33642578125, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": -13.908622741699219, "rewards/margins": 25.33052635192871, "rewards/rejected": -39.23915100097656, "step": 1104 }, { "epoch": 0.687402799377916, "grad_norm": 0.026603125035762787, "learning_rate": 4.283079760258183e-06, "logits/chosen": 0.3288910984992981, "logits/rejected": 2.3766226768493652, "logps/chosen": -659.520751953125, "logps/rejected": -1085.520263671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -14.07796859741211, "rewards/margins": 29.936979293823242, "rewards/rejected": -44.01494598388672, "step": 1105 }, { "epoch": 0.6880248833592535, "grad_norm": 0.002381574595347047, "learning_rate": 4.281927155371139e-06, "logits/chosen": -1.4142065048217773, "logits/rejected": 3.241100311279297, "logps/chosen": -346.6325378417969, "logps/rejected": -852.815185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.068493366241455, "rewards/margins": 26.232025146484375, "rewards/rejected": -33.30052185058594, "step": 1106 }, { "epoch": 0.6886469673405909, "grad_norm": 0.00010436380398459733, "learning_rate": 4.2807745504840945e-06, "logits/chosen": 2.5706067085266113, "logits/rejected": 4.239389419555664, "logps/chosen": -624.5850219726562, "logps/rejected": -1006.6663818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.321513175964355, "rewards/margins": 25.764984130859375, "rewards/rejected": -36.08649826049805, "step": 1107 }, { "epoch": 0.6892690513219285, "grad_norm": 31.843982696533203, "learning_rate": 4.27962194559705e-06, "logits/chosen": -0.6637098789215088, "logits/rejected": 5.162286281585693, "logps/chosen": -434.01593017578125, "logps/rejected": -1057.7811279296875, "loss": 0.5937, "rewards/accuracies": 0.875, "rewards/chosen": -7.872724533081055, "rewards/margins": 28.320775985717773, "rewards/rejected": -36.19350051879883, "step": 1108 }, { "epoch": 0.689891135303266, "grad_norm": 47.80104446411133, "learning_rate": 4.278469340710005e-06, "logits/chosen": 0.11493664979934692, "logits/rejected": 4.293163299560547, "logps/chosen": -686.111083984375, "logps/rejected": -1146.0205078125, "loss": 1.3721, "rewards/accuracies": 0.875, "rewards/chosen": -17.41529083251953, "rewards/margins": 23.800018310546875, "rewards/rejected": -41.215309143066406, "step": 1109 }, { "epoch": 0.6905132192846034, "grad_norm": 0.004325952846556902, "learning_rate": 4.27731673582296e-06, "logits/chosen": 2.2962706089019775, "logits/rejected": 4.085445404052734, "logps/chosen": -549.131591796875, "logps/rejected": -846.0113525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.297896385192871, "rewards/margins": 24.9063663482666, "rewards/rejected": -33.204261779785156, "step": 1110 }, { "epoch": 0.6911353032659409, "grad_norm": 0.001067809178493917, "learning_rate": 4.2761641309359154e-06, "logits/chosen": 2.287487506866455, "logits/rejected": 4.490987300872803, "logps/chosen": -520.4144897460938, "logps/rejected": -875.7572631835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.179506301879883, "rewards/margins": 23.131309509277344, "rewards/rejected": -31.310815811157227, "step": 1111 }, { "epoch": 0.6917573872472784, "grad_norm": 0.005620141979306936, "learning_rate": 4.275011526048871e-06, "logits/chosen": -0.24929457902908325, "logits/rejected": 3.7390763759613037, "logps/chosen": -464.5872497558594, "logps/rejected": -948.9923706054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.896591186523438, "rewards/margins": 24.613645553588867, "rewards/rejected": -34.51023864746094, "step": 1112 }, { "epoch": 0.6923794712286159, "grad_norm": 1.4482674598693848, "learning_rate": 4.273858921161826e-06, "logits/chosen": -0.8546969294548035, "logits/rejected": 2.3102991580963135, "logps/chosen": -460.37567138671875, "logps/rejected": -781.9107666015625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -9.514959335327148, "rewards/margins": 17.49068832397461, "rewards/rejected": -27.005647659301758, "step": 1113 }, { "epoch": 0.6930015552099533, "grad_norm": 0.015572289004921913, "learning_rate": 4.272706316274781e-06, "logits/chosen": -0.16300639510154724, "logits/rejected": 4.673491954803467, "logps/chosen": -304.6646728515625, "logps/rejected": -899.9229736328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.211049556732178, "rewards/margins": 29.155414581298828, "rewards/rejected": -34.36646270751953, "step": 1114 }, { "epoch": 0.6936236391912908, "grad_norm": 0.4332619309425354, "learning_rate": 4.271553711387736e-06, "logits/chosen": -0.7185725569725037, "logits/rejected": 2.5487172603607178, "logps/chosen": -564.232421875, "logps/rejected": -977.0563354492188, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -9.150856971740723, "rewards/margins": 24.530282974243164, "rewards/rejected": -33.6811408996582, "step": 1115 }, { "epoch": 0.6942457231726283, "grad_norm": 0.03866080194711685, "learning_rate": 4.2704011065006916e-06, "logits/chosen": -0.015806496143341064, "logits/rejected": 3.439502477645874, "logps/chosen": -407.9251403808594, "logps/rejected": -908.4151000976562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.0901570320129395, "rewards/margins": 28.038902282714844, "rewards/rejected": -33.129058837890625, "step": 1116 }, { "epoch": 0.6948678071539658, "grad_norm": 0.0025013545528054237, "learning_rate": 4.269248501613647e-06, "logits/chosen": -2.3568055629730225, "logits/rejected": 3.969569683074951, "logps/chosen": -347.612060546875, "logps/rejected": -945.9745483398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6943399906158447, "rewards/margins": 22.949398040771484, "rewards/rejected": -26.643739700317383, "step": 1117 }, { "epoch": 0.6954898911353032, "grad_norm": 15.973790168762207, "learning_rate": 4.268095896726602e-06, "logits/chosen": 1.2299774885177612, "logits/rejected": 2.655921697616577, "logps/chosen": -528.4793090820312, "logps/rejected": -724.660888671875, "loss": 0.2251, "rewards/accuracies": 0.75, "rewards/chosen": -7.363526344299316, "rewards/margins": 11.236658096313477, "rewards/rejected": -18.600183486938477, "step": 1118 }, { "epoch": 0.6961119751166407, "grad_norm": 1.4213616847991943, "learning_rate": 4.266943291839557e-06, "logits/chosen": -0.38948550820350647, "logits/rejected": 4.425605773925781, "logps/chosen": -447.60528564453125, "logps/rejected": -929.9447021484375, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -5.339234352111816, "rewards/margins": 24.30902862548828, "rewards/rejected": -29.648265838623047, "step": 1119 }, { "epoch": 0.6967340590979783, "grad_norm": 0.3841400742530823, "learning_rate": 4.2657906869525125e-06, "logits/chosen": -2.0254392623901367, "logits/rejected": 3.4356141090393066, "logps/chosen": -338.26055908203125, "logps/rejected": -872.2510986328125, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -6.423895835876465, "rewards/margins": 25.691959381103516, "rewards/rejected": -32.1158561706543, "step": 1120 }, { "epoch": 0.6973561430793157, "grad_norm": 4.166716394138348e-08, "learning_rate": 4.2646380820654685e-06, "logits/chosen": 2.1841678619384766, "logits/rejected": 4.150865077972412, "logps/chosen": -758.68896484375, "logps/rejected": -1092.0853271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.268762588500977, "rewards/margins": 25.824085235595703, "rewards/rejected": -36.09284973144531, "step": 1121 }, { "epoch": 0.6979782270606532, "grad_norm": 9.886246516543906e-06, "learning_rate": 4.263485477178424e-06, "logits/chosen": 1.7041479349136353, "logits/rejected": 3.58937406539917, "logps/chosen": -578.86767578125, "logps/rejected": -935.6997680664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.358209609985352, "rewards/margins": 23.363967895507812, "rewards/rejected": -31.722179412841797, "step": 1122 }, { "epoch": 0.6986003110419907, "grad_norm": 4.791447639465332, "learning_rate": 4.262332872291379e-06, "logits/chosen": 1.879792332649231, "logits/rejected": 2.9788360595703125, "logps/chosen": -565.7600708007812, "logps/rejected": -867.056396484375, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": -9.71964168548584, "rewards/margins": 20.734661102294922, "rewards/rejected": -30.454303741455078, "step": 1123 }, { "epoch": 0.6992223950233282, "grad_norm": 29.363506317138672, "learning_rate": 4.261180267404334e-06, "logits/chosen": 1.0604872703552246, "logits/rejected": 3.045793294906616, "logps/chosen": -479.184326171875, "logps/rejected": -773.6400146484375, "loss": 0.3521, "rewards/accuracies": 0.875, "rewards/chosen": -4.974681854248047, "rewards/margins": 16.952167510986328, "rewards/rejected": -21.926849365234375, "step": 1124 }, { "epoch": 0.6998444790046656, "grad_norm": 15.334929466247559, "learning_rate": 4.2600276625172894e-06, "logits/chosen": -2.5029263496398926, "logits/rejected": 1.321319341659546, "logps/chosen": -454.6629943847656, "logps/rejected": -864.0693359375, "loss": 0.063, "rewards/accuracies": 1.0, "rewards/chosen": -9.473987579345703, "rewards/margins": 19.154024124145508, "rewards/rejected": -28.628009796142578, "step": 1125 }, { "epoch": 0.7004665629860031, "grad_norm": 7.4878207669826224e-06, "learning_rate": 4.258875057630245e-06, "logits/chosen": 2.3144640922546387, "logits/rejected": 3.8180489540100098, "logps/chosen": -532.5983276367188, "logps/rejected": -865.9114990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.514817237854004, "rewards/margins": 27.909273147583008, "rewards/rejected": -37.42408752441406, "step": 1126 }, { "epoch": 0.7010886469673406, "grad_norm": 0.5590468645095825, "learning_rate": 4.2577224527432e-06, "logits/chosen": 1.154442310333252, "logits/rejected": 4.903882026672363, "logps/chosen": -610.6256713867188, "logps/rejected": -1103.4720458984375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -10.860889434814453, "rewards/margins": 27.493499755859375, "rewards/rejected": -38.35438919067383, "step": 1127 }, { "epoch": 0.7017107309486781, "grad_norm": 0.0033634125720709562, "learning_rate": 4.256569847856155e-06, "logits/chosen": -0.3866727948188782, "logits/rejected": 1.2385426759719849, "logps/chosen": -459.158935546875, "logps/rejected": -760.5558471679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.200139999389648, "rewards/margins": 20.704795837402344, "rewards/rejected": -28.904937744140625, "step": 1128 }, { "epoch": 0.7023328149300155, "grad_norm": 0.00017669117369223386, "learning_rate": 4.25541724296911e-06, "logits/chosen": -0.9125028252601624, "logits/rejected": 2.94661808013916, "logps/chosen": -439.4700927734375, "logps/rejected": -901.3716430664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.392777919769287, "rewards/margins": 26.68711280822754, "rewards/rejected": -34.07988739013672, "step": 1129 }, { "epoch": 0.702954898911353, "grad_norm": 20.32399559020996, "learning_rate": 4.2542646380820656e-06, "logits/chosen": 2.0383431911468506, "logits/rejected": 4.066139221191406, "logps/chosen": -680.9654541015625, "logps/rejected": -1033.8470458984375, "loss": 0.1797, "rewards/accuracies": 1.0, "rewards/chosen": -12.162495613098145, "rewards/margins": 18.726539611816406, "rewards/rejected": -30.889034271240234, "step": 1130 }, { "epoch": 0.7035769828926906, "grad_norm": 0.00040257195360027254, "learning_rate": 4.253112033195021e-06, "logits/chosen": 1.1132146120071411, "logits/rejected": 2.3047876358032227, "logps/chosen": -578.9852905273438, "logps/rejected": -819.7774658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.081069946289062, "rewards/margins": 21.187292098999023, "rewards/rejected": -30.268362045288086, "step": 1131 }, { "epoch": 0.704199066874028, "grad_norm": 0.03850219398736954, "learning_rate": 4.251959428307976e-06, "logits/chosen": -2.612558364868164, "logits/rejected": 2.0713813304901123, "logps/chosen": -407.8648681640625, "logps/rejected": -971.5011596679688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.519382476806641, "rewards/margins": 29.602420806884766, "rewards/rejected": -35.121803283691406, "step": 1132 }, { "epoch": 0.7048211508553655, "grad_norm": 11.934901237487793, "learning_rate": 4.250806823420931e-06, "logits/chosen": -0.5439414978027344, "logits/rejected": 3.837416172027588, "logps/chosen": -493.5166320800781, "logps/rejected": -1030.0224609375, "loss": 0.0808, "rewards/accuracies": 1.0, "rewards/chosen": -9.399980545043945, "rewards/margins": 22.14865493774414, "rewards/rejected": -31.54863166809082, "step": 1133 }, { "epoch": 0.7054432348367029, "grad_norm": 0.003834787290543318, "learning_rate": 4.2496542185338864e-06, "logits/chosen": 0.1230611801147461, "logits/rejected": 2.813925266265869, "logps/chosen": -550.717529296875, "logps/rejected": -868.7197875976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.865203857421875, "rewards/margins": 20.2117919921875, "rewards/rejected": -27.076993942260742, "step": 1134 }, { "epoch": 0.7060653188180405, "grad_norm": 0.04540088772773743, "learning_rate": 4.2485016136468425e-06, "logits/chosen": -1.5632213354110718, "logits/rejected": 2.7671170234680176, "logps/chosen": -450.50579833984375, "logps/rejected": -946.9651489257812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.025535583496094, "rewards/margins": 23.097501754760742, "rewards/rejected": -33.12303924560547, "step": 1135 }, { "epoch": 0.7066874027993779, "grad_norm": 5.21671724319458, "learning_rate": 4.247349008759798e-06, "logits/chosen": -2.5789096355438232, "logits/rejected": 2.2848589420318604, "logps/chosen": -390.320556640625, "logps/rejected": -940.9581298828125, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -8.420944213867188, "rewards/margins": 23.340347290039062, "rewards/rejected": -31.76129150390625, "step": 1136 }, { "epoch": 0.7073094867807154, "grad_norm": 2.545893430709839, "learning_rate": 4.246196403872753e-06, "logits/chosen": 1.3946446180343628, "logits/rejected": 3.885446071624756, "logps/chosen": -593.1643676757812, "logps/rejected": -894.2576904296875, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -9.472441673278809, "rewards/margins": 18.71339225769043, "rewards/rejected": -28.185832977294922, "step": 1137 }, { "epoch": 0.7079315707620529, "grad_norm": 0.9141021370887756, "learning_rate": 4.245043798985708e-06, "logits/chosen": -1.6937819719314575, "logits/rejected": 3.2571754455566406, "logps/chosen": -420.5745544433594, "logps/rejected": -904.1632690429688, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -7.709376335144043, "rewards/margins": 21.777610778808594, "rewards/rejected": -29.486984252929688, "step": 1138 }, { "epoch": 0.7085536547433904, "grad_norm": 17.139318466186523, "learning_rate": 4.243891194098663e-06, "logits/chosen": 0.8341930508613586, "logits/rejected": 2.6119236946105957, "logps/chosen": -627.296630859375, "logps/rejected": -942.0413818359375, "loss": 0.3848, "rewards/accuracies": 0.875, "rewards/chosen": -11.913735389709473, "rewards/margins": 20.716289520263672, "rewards/rejected": -32.630027770996094, "step": 1139 }, { "epoch": 0.7091757387247278, "grad_norm": 0.12485391646623611, "learning_rate": 4.242738589211619e-06, "logits/chosen": 1.4291424751281738, "logits/rejected": 4.054292678833008, "logps/chosen": -568.59326171875, "logps/rejected": -992.9465942382812, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -9.06442642211914, "rewards/margins": 25.010478973388672, "rewards/rejected": -34.07490539550781, "step": 1140 }, { "epoch": 0.7097978227060653, "grad_norm": 0.08407150208950043, "learning_rate": 4.241585984324574e-06, "logits/chosen": 0.009067535400390625, "logits/rejected": 3.6980178356170654, "logps/chosen": -557.7619018554688, "logps/rejected": -1060.493896484375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.739320755004883, "rewards/margins": 27.374553680419922, "rewards/rejected": -37.11387634277344, "step": 1141 }, { "epoch": 0.7104199066874028, "grad_norm": 0.9794238209724426, "learning_rate": 4.240433379437529e-06, "logits/chosen": 1.6314430236816406, "logits/rejected": 3.8010740280151367, "logps/chosen": -520.548583984375, "logps/rejected": -840.6610107421875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -8.92463493347168, "rewards/margins": 18.227628707885742, "rewards/rejected": -27.152263641357422, "step": 1142 }, { "epoch": 0.7110419906687403, "grad_norm": 0.0969887226819992, "learning_rate": 4.239280774550484e-06, "logits/chosen": 1.1301627159118652, "logits/rejected": 4.129144668579102, "logps/chosen": -588.8717651367188, "logps/rejected": -987.9451904296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -11.679798126220703, "rewards/margins": 20.80878448486328, "rewards/rejected": -32.488582611083984, "step": 1143 }, { "epoch": 0.7116640746500777, "grad_norm": 0.00018440843268763274, "learning_rate": 4.2381281696634395e-06, "logits/chosen": -4.2730536460876465, "logits/rejected": 1.6310522556304932, "logps/chosen": -379.5325622558594, "logps/rejected": -1090.905517578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.479127883911133, "rewards/margins": 27.126827239990234, "rewards/rejected": -36.60595703125, "step": 1144 }, { "epoch": 0.7122861586314152, "grad_norm": 5.640890321956249e-06, "learning_rate": 4.236975564776395e-06, "logits/chosen": -0.5040819048881531, "logits/rejected": 2.4404137134552, "logps/chosen": -602.3915405273438, "logps/rejected": -1003.7799072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.199384689331055, "rewards/margins": 25.80965232849121, "rewards/rejected": -38.009037017822266, "step": 1145 }, { "epoch": 0.7129082426127528, "grad_norm": 0.0018288391875103116, "learning_rate": 4.23582295988935e-06, "logits/chosen": -2.6323633193969727, "logits/rejected": 3.2968077659606934, "logps/chosen": -274.1453857421875, "logps/rejected": -953.5428466796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.15943717956543, "rewards/margins": 31.97180938720703, "rewards/rejected": -38.131248474121094, "step": 1146 }, { "epoch": 0.7135303265940902, "grad_norm": 0.5116393566131592, "learning_rate": 4.234670355002305e-06, "logits/chosen": 2.3888742923736572, "logits/rejected": 4.2413763999938965, "logps/chosen": -573.247314453125, "logps/rejected": -932.5107421875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -10.365524291992188, "rewards/margins": 23.069791793823242, "rewards/rejected": -33.43531799316406, "step": 1147 }, { "epoch": 0.7141524105754277, "grad_norm": 1.6715589481464121e-06, "learning_rate": 4.2335177501152604e-06, "logits/chosen": -1.7874610424041748, "logits/rejected": 1.3828914165496826, "logps/chosen": -511.89208984375, "logps/rejected": -1041.0760498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.092294692993164, "rewards/margins": 31.230894088745117, "rewards/rejected": -45.32318878173828, "step": 1148 }, { "epoch": 0.7147744945567651, "grad_norm": 0.0019122587982565165, "learning_rate": 4.232365145228216e-06, "logits/chosen": -0.33687490224838257, "logits/rejected": 2.45595121383667, "logps/chosen": -480.1834411621094, "logps/rejected": -849.4967041015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.1947479248046875, "rewards/margins": 21.8647518157959, "rewards/rejected": -29.059499740600586, "step": 1149 }, { "epoch": 0.7153965785381027, "grad_norm": 1.4362432956695557, "learning_rate": 4.231212540341172e-06, "logits/chosen": -0.1586349606513977, "logits/rejected": 3.885680675506592, "logps/chosen": -409.5668640136719, "logps/rejected": -885.145263671875, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -8.473544120788574, "rewards/margins": 24.60749626159668, "rewards/rejected": -33.08103942871094, "step": 1150 }, { "epoch": 0.7160186625194401, "grad_norm": 0.010208888910710812, "learning_rate": 4.230059935454127e-06, "logits/chosen": -0.2658725380897522, "logits/rejected": 3.370497703552246, "logps/chosen": -574.7543334960938, "logps/rejected": -974.1325073242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.140201568603516, "rewards/margins": 26.478336334228516, "rewards/rejected": -35.61853790283203, "step": 1151 }, { "epoch": 0.7166407465007776, "grad_norm": 36.79997634887695, "learning_rate": 4.228907330567082e-06, "logits/chosen": 1.2792608737945557, "logits/rejected": 3.2375693321228027, "logps/chosen": -534.0382690429688, "logps/rejected": -953.0762939453125, "loss": 0.1517, "rewards/accuracies": 0.875, "rewards/chosen": -12.882672309875488, "rewards/margins": 26.688556671142578, "rewards/rejected": -39.57122802734375, "step": 1152 }, { "epoch": 0.717262830482115, "grad_norm": 0.035786353051662445, "learning_rate": 4.227754725680037e-06, "logits/chosen": -1.173093318939209, "logits/rejected": 4.787845134735107, "logps/chosen": -524.6954956054688, "logps/rejected": -1209.607666015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.333581924438477, "rewards/margins": 33.21443176269531, "rewards/rejected": -48.548011779785156, "step": 1153 }, { "epoch": 0.7178849144634526, "grad_norm": 3.625681088692545e-08, "learning_rate": 4.226602120792993e-06, "logits/chosen": -1.6249737739562988, "logits/rejected": 4.376003265380859, "logps/chosen": -340.1498107910156, "logps/rejected": -1045.368896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.608695030212402, "rewards/margins": 34.630897521972656, "rewards/rejected": -43.23958969116211, "step": 1154 }, { "epoch": 0.71850699844479, "grad_norm": 2.848944689048949e-07, "learning_rate": 4.225449515905948e-06, "logits/chosen": 2.577023983001709, "logits/rejected": 4.594021320343018, "logps/chosen": -614.6937255859375, "logps/rejected": -1042.291259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.032281875610352, "rewards/margins": 31.21833038330078, "rewards/rejected": -41.250614166259766, "step": 1155 }, { "epoch": 0.7191290824261275, "grad_norm": 38.33723068237305, "learning_rate": 4.224296911018903e-06, "logits/chosen": 1.7918505668640137, "logits/rejected": 2.563223361968994, "logps/chosen": -577.7158203125, "logps/rejected": -873.4849243164062, "loss": 1.1238, "rewards/accuracies": 0.875, "rewards/chosen": -9.859672546386719, "rewards/margins": 15.18758773803711, "rewards/rejected": -25.047260284423828, "step": 1156 }, { "epoch": 0.7197511664074651, "grad_norm": 0.010270086117088795, "learning_rate": 4.223144306131858e-06, "logits/chosen": -1.845631718635559, "logits/rejected": 3.4590423107147217, "logps/chosen": -422.4835510253906, "logps/rejected": -1021.2655029296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.028568267822266, "rewards/margins": 28.59836196899414, "rewards/rejected": -38.626930236816406, "step": 1157 }, { "epoch": 0.7203732503888025, "grad_norm": 39.00583267211914, "learning_rate": 4.2219917012448135e-06, "logits/chosen": 2.3478219509124756, "logits/rejected": 3.523054599761963, "logps/chosen": -624.9152221679688, "logps/rejected": -920.3958129882812, "loss": 1.2437, "rewards/accuracies": 0.875, "rewards/chosen": -13.256948471069336, "rewards/margins": 21.686954498291016, "rewards/rejected": -34.94390106201172, "step": 1158 }, { "epoch": 0.72099533437014, "grad_norm": 26.327835083007812, "learning_rate": 4.220839096357769e-06, "logits/chosen": 2.0256595611572266, "logits/rejected": 4.168939590454102, "logps/chosen": -581.934814453125, "logps/rejected": -965.870361328125, "loss": 0.4675, "rewards/accuracies": 0.875, "rewards/chosen": -14.27999496459961, "rewards/margins": 23.65022850036621, "rewards/rejected": -37.93022155761719, "step": 1159 }, { "epoch": 0.7216174183514774, "grad_norm": 30.75332260131836, "learning_rate": 4.219686491470724e-06, "logits/chosen": -1.0962412357330322, "logits/rejected": 4.113137245178223, "logps/chosen": -478.94451904296875, "logps/rejected": -908.378173828125, "loss": 0.731, "rewards/accuracies": 0.875, "rewards/chosen": -11.116175651550293, "rewards/margins": 23.1622314453125, "rewards/rejected": -34.278404235839844, "step": 1160 }, { "epoch": 0.722239502332815, "grad_norm": 4.19508695602417, "learning_rate": 4.218533886583679e-06, "logits/chosen": 1.5392760038375854, "logits/rejected": 3.2550981044769287, "logps/chosen": -615.6009521484375, "logps/rejected": -1001.22021484375, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": -14.858214378356934, "rewards/margins": 21.70871925354004, "rewards/rejected": -36.566932678222656, "step": 1161 }, { "epoch": 0.7228615863141524, "grad_norm": 15.874382019042969, "learning_rate": 4.2173812816966344e-06, "logits/chosen": 0.26850560307502747, "logits/rejected": 1.8894236087799072, "logps/chosen": -580.573974609375, "logps/rejected": -969.2291870117188, "loss": 0.1518, "rewards/accuracies": 0.875, "rewards/chosen": -10.876068115234375, "rewards/margins": 26.33701515197754, "rewards/rejected": -37.21308135986328, "step": 1162 }, { "epoch": 0.7234836702954899, "grad_norm": 0.0005865055718459189, "learning_rate": 4.21622867680959e-06, "logits/chosen": -1.2452207803726196, "logits/rejected": 2.307518482208252, "logps/chosen": -477.93206787109375, "logps/rejected": -966.4757690429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.003711700439453, "rewards/margins": 25.798137664794922, "rewards/rejected": -36.801849365234375, "step": 1163 }, { "epoch": 0.7241057542768273, "grad_norm": 31.254302978515625, "learning_rate": 4.215076071922546e-06, "logits/chosen": -1.0185731649398804, "logits/rejected": 2.684237480163574, "logps/chosen": -494.90966796875, "logps/rejected": -883.1465454101562, "loss": 0.2146, "rewards/accuracies": 0.875, "rewards/chosen": -10.114580154418945, "rewards/margins": 20.16065788269043, "rewards/rejected": -30.275238037109375, "step": 1164 }, { "epoch": 0.7247278382581649, "grad_norm": 0.04852492734789848, "learning_rate": 4.213923467035501e-06, "logits/chosen": 0.2611873149871826, "logits/rejected": 3.668205738067627, "logps/chosen": -466.4119873046875, "logps/rejected": -863.919189453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.094996452331543, "rewards/margins": 22.080657958984375, "rewards/rejected": -31.1756534576416, "step": 1165 }, { "epoch": 0.7253499222395023, "grad_norm": 41.72018051147461, "learning_rate": 4.212770862148456e-06, "logits/chosen": 0.11253970861434937, "logits/rejected": 3.1600046157836914, "logps/chosen": -525.0753784179688, "logps/rejected": -816.2158813476562, "loss": 1.3193, "rewards/accuracies": 0.875, "rewards/chosen": -11.143072128295898, "rewards/margins": 22.540931701660156, "rewards/rejected": -33.68400192260742, "step": 1166 }, { "epoch": 0.7259720062208398, "grad_norm": 0.019932212308049202, "learning_rate": 4.211618257261411e-06, "logits/chosen": 2.4527406692504883, "logits/rejected": 3.221583843231201, "logps/chosen": -673.4598388671875, "logps/rejected": -895.5223388671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.391605377197266, "rewards/margins": 19.719661712646484, "rewards/rejected": -30.111265182495117, "step": 1167 }, { "epoch": 0.7265940902021772, "grad_norm": 0.0018634117441251874, "learning_rate": 4.210465652374367e-06, "logits/chosen": 2.2655115127563477, "logits/rejected": 3.0503270626068115, "logps/chosen": -653.0009155273438, "logps/rejected": -930.0877075195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.022181510925293, "rewards/margins": 25.00848388671875, "rewards/rejected": -38.03066635131836, "step": 1168 }, { "epoch": 0.7272161741835148, "grad_norm": 4.644537448883057, "learning_rate": 4.209313047487322e-06, "logits/chosen": 0.016414497047662735, "logits/rejected": 4.014350891113281, "logps/chosen": -466.252197265625, "logps/rejected": -1043.4276123046875, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": -9.585433959960938, "rewards/margins": 28.32579803466797, "rewards/rejected": -37.911231994628906, "step": 1169 }, { "epoch": 0.7278382581648523, "grad_norm": 47.371559143066406, "learning_rate": 4.208160442600277e-06, "logits/chosen": 0.4835778772830963, "logits/rejected": 2.874391555786133, "logps/chosen": -633.0033569335938, "logps/rejected": -1000.6109619140625, "loss": 0.6942, "rewards/accuracies": 0.875, "rewards/chosen": -16.999771118164062, "rewards/margins": 18.929607391357422, "rewards/rejected": -35.929378509521484, "step": 1170 }, { "epoch": 0.7284603421461897, "grad_norm": 5.057783603668213, "learning_rate": 4.207007837713232e-06, "logits/chosen": -0.22269773483276367, "logits/rejected": 3.5258021354675293, "logps/chosen": -513.5776977539062, "logps/rejected": -1006.0166015625, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": -11.099183082580566, "rewards/margins": 25.058616638183594, "rewards/rejected": -36.157798767089844, "step": 1171 }, { "epoch": 0.7290824261275272, "grad_norm": 30.32122802734375, "learning_rate": 4.2058552328261875e-06, "logits/chosen": -0.032325103878974915, "logits/rejected": 3.461878776550293, "logps/chosen": -534.3825073242188, "logps/rejected": -879.685302734375, "loss": 0.269, "rewards/accuracies": 0.875, "rewards/chosen": -9.895355224609375, "rewards/margins": 17.291181564331055, "rewards/rejected": -27.18653678894043, "step": 1172 }, { "epoch": 0.7297045101088647, "grad_norm": 5.359790802001953, "learning_rate": 4.204702627939143e-06, "logits/chosen": 0.5488654971122742, "logits/rejected": 3.726155996322632, "logps/chosen": -459.772216796875, "logps/rejected": -815.4249267578125, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": -9.793237686157227, "rewards/margins": 21.808074951171875, "rewards/rejected": -31.6013126373291, "step": 1173 }, { "epoch": 0.7303265940902022, "grad_norm": 7.4824442863464355, "learning_rate": 4.203550023052098e-06, "logits/chosen": 0.13619333505630493, "logits/rejected": 4.627652168273926, "logps/chosen": -367.88629150390625, "logps/rejected": -852.6173095703125, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": -8.94129467010498, "rewards/margins": 24.91082000732422, "rewards/rejected": -33.85211181640625, "step": 1174 }, { "epoch": 0.7309486780715396, "grad_norm": 0.19221581518650055, "learning_rate": 4.202397418165053e-06, "logits/chosen": 0.3853622376918793, "logits/rejected": 3.9664766788482666, "logps/chosen": -639.0650634765625, "logps/rejected": -1077.99169921875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -9.3831148147583, "rewards/margins": 20.84623908996582, "rewards/rejected": -30.229352951049805, "step": 1175 }, { "epoch": 0.7315707620528772, "grad_norm": 0.00024160981411114335, "learning_rate": 4.2012448132780084e-06, "logits/chosen": -3.6084365844726562, "logits/rejected": 2.1406548023223877, "logps/chosen": -383.0024108886719, "logps/rejected": -1027.7147216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.151406288146973, "rewards/margins": 29.40199851989746, "rewards/rejected": -35.55340576171875, "step": 1176 }, { "epoch": 0.7321928460342146, "grad_norm": 0.12362519651651382, "learning_rate": 4.200092208390964e-06, "logits/chosen": -0.8688986301422119, "logits/rejected": 3.6696877479553223, "logps/chosen": -380.8484802246094, "logps/rejected": -854.9744262695312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.829305171966553, "rewards/margins": 21.9825439453125, "rewards/rejected": -28.811851501464844, "step": 1177 }, { "epoch": 0.7328149300155521, "grad_norm": 0.08682712912559509, "learning_rate": 4.198939603503919e-06, "logits/chosen": -1.9073578119277954, "logits/rejected": 2.8443100452423096, "logps/chosen": -363.9940490722656, "logps/rejected": -863.8155517578125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -6.090331077575684, "rewards/margins": 23.799715042114258, "rewards/rejected": -29.890045166015625, "step": 1178 }, { "epoch": 0.7334370139968895, "grad_norm": 0.023318586871027946, "learning_rate": 4.197786998616875e-06, "logits/chosen": 0.8904905319213867, "logits/rejected": 2.4922831058502197, "logps/chosen": -671.4569091796875, "logps/rejected": -982.808837890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.915792465209961, "rewards/margins": 25.762042999267578, "rewards/rejected": -34.67783737182617, "step": 1179 }, { "epoch": 0.7340590979782271, "grad_norm": 16.87903594970703, "learning_rate": 4.19663439372983e-06, "logits/chosen": 3.0492799282073975, "logits/rejected": 4.473697662353516, "logps/chosen": -709.522216796875, "logps/rejected": -964.0701904296875, "loss": 0.2486, "rewards/accuracies": 0.875, "rewards/chosen": -7.466982364654541, "rewards/margins": 19.10428237915039, "rewards/rejected": -26.571266174316406, "step": 1180 }, { "epoch": 0.7346811819595646, "grad_norm": 0.0012558766175061464, "learning_rate": 4.195481788842785e-06, "logits/chosen": -0.866086483001709, "logits/rejected": 3.2224390506744385, "logps/chosen": -540.8012084960938, "logps/rejected": -1071.502685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.356019973754883, "rewards/margins": 32.99429702758789, "rewards/rejected": -45.35031509399414, "step": 1181 }, { "epoch": 0.735303265940902, "grad_norm": 0.0014930395409464836, "learning_rate": 4.194329183955741e-06, "logits/chosen": -0.5875803232192993, "logits/rejected": 2.544929265975952, "logps/chosen": -353.6885681152344, "logps/rejected": -829.4857177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.364542961120605, "rewards/margins": 27.51534652709961, "rewards/rejected": -35.87989044189453, "step": 1182 }, { "epoch": 0.7359253499222395, "grad_norm": 0.022201891988515854, "learning_rate": 4.193176579068696e-06, "logits/chosen": 0.19210612773895264, "logits/rejected": 3.1613025665283203, "logps/chosen": -617.490234375, "logps/rejected": -954.2713012695312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.373228073120117, "rewards/margins": 24.7774600982666, "rewards/rejected": -35.15068817138672, "step": 1183 }, { "epoch": 0.736547433903577, "grad_norm": 1.701123595237732, "learning_rate": 4.192023974181651e-06, "logits/chosen": -0.042814530432224274, "logits/rejected": 3.2058024406433105, "logps/chosen": -491.13330078125, "logps/rejected": -948.0253295898438, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -10.885446548461914, "rewards/margins": 28.518571853637695, "rewards/rejected": -39.40401840209961, "step": 1184 }, { "epoch": 0.7371695178849145, "grad_norm": 0.0002101602149195969, "learning_rate": 4.190871369294606e-06, "logits/chosen": 1.0134602785110474, "logits/rejected": 2.7794322967529297, "logps/chosen": -631.4107055664062, "logps/rejected": -1013.0341186523438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.573670387268066, "rewards/margins": 24.038246154785156, "rewards/rejected": -35.611915588378906, "step": 1185 }, { "epoch": 0.7377916018662519, "grad_norm": 37.85633087158203, "learning_rate": 4.1897187644075615e-06, "logits/chosen": 2.350338935852051, "logits/rejected": 2.9470388889312744, "logps/chosen": -744.1123046875, "logps/rejected": -894.8653564453125, "loss": 0.3682, "rewards/accuracies": 0.875, "rewards/chosen": -10.176665306091309, "rewards/margins": 11.495548248291016, "rewards/rejected": -21.67221450805664, "step": 1186 }, { "epoch": 0.7384136858475894, "grad_norm": 18.087539672851562, "learning_rate": 4.188566159520517e-06, "logits/chosen": 1.4269673824310303, "logits/rejected": 4.23713493347168, "logps/chosen": -613.511474609375, "logps/rejected": -1102.857421875, "loss": 0.1002, "rewards/accuracies": 0.875, "rewards/chosen": -9.977934837341309, "rewards/margins": 28.6257266998291, "rewards/rejected": -38.60366439819336, "step": 1187 }, { "epoch": 0.7390357698289269, "grad_norm": 4.416318461153423e-06, "learning_rate": 4.187413554633472e-06, "logits/chosen": 1.2357975244522095, "logits/rejected": 5.414222240447998, "logps/chosen": -494.890869140625, "logps/rejected": -973.6812133789062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.658195972442627, "rewards/margins": 33.96454620361328, "rewards/rejected": -39.62274169921875, "step": 1188 }, { "epoch": 0.7396578538102644, "grad_norm": 0.07317977398633957, "learning_rate": 4.186260949746427e-06, "logits/chosen": -2.1512906551361084, "logits/rejected": 3.888598918914795, "logps/chosen": -344.72454833984375, "logps/rejected": -1022.0072021484375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.827019691467285, "rewards/margins": 32.47252655029297, "rewards/rejected": -40.29954528808594, "step": 1189 }, { "epoch": 0.7402799377916018, "grad_norm": 33.14171600341797, "learning_rate": 4.185108344859382e-06, "logits/chosen": -2.4892563819885254, "logits/rejected": 3.2610530853271484, "logps/chosen": -430.8138122558594, "logps/rejected": -886.5509643554688, "loss": 0.5152, "rewards/accuracies": 0.875, "rewards/chosen": -7.59311056137085, "rewards/margins": 19.534095764160156, "rewards/rejected": -27.12720489501953, "step": 1190 }, { "epoch": 0.7409020217729394, "grad_norm": 38.92852020263672, "learning_rate": 4.183955739972338e-06, "logits/chosen": 1.1341769695281982, "logits/rejected": 3.1176342964172363, "logps/chosen": -613.213134765625, "logps/rejected": -884.856689453125, "loss": 0.9723, "rewards/accuracies": 0.875, "rewards/chosen": -11.557759284973145, "rewards/margins": 15.217055320739746, "rewards/rejected": -26.774816513061523, "step": 1191 }, { "epoch": 0.7415241057542769, "grad_norm": 0.0014572658110409975, "learning_rate": 4.182803135085293e-06, "logits/chosen": -3.2773330211639404, "logits/rejected": 2.6450986862182617, "logps/chosen": -469.614501953125, "logps/rejected": -1094.7154541015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.044410705566406, "rewards/margins": 28.91035270690918, "rewards/rejected": -37.95476531982422, "step": 1192 }, { "epoch": 0.7421461897356143, "grad_norm": 51.88095474243164, "learning_rate": 4.181650530198248e-06, "logits/chosen": -0.44079411029815674, "logits/rejected": 0.846016526222229, "logps/chosen": -553.1190185546875, "logps/rejected": -895.7581787109375, "loss": 0.6423, "rewards/accuracies": 0.75, "rewards/chosen": -7.631755828857422, "rewards/margins": 19.03414535522461, "rewards/rejected": -26.665903091430664, "step": 1193 }, { "epoch": 0.7427682737169518, "grad_norm": 1.7541396617889404, "learning_rate": 4.180497925311204e-06, "logits/chosen": 2.258620262145996, "logits/rejected": 2.388881206512451, "logps/chosen": -714.3488159179688, "logps/rejected": -937.6739501953125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -13.310730934143066, "rewards/margins": 22.443029403686523, "rewards/rejected": -35.753761291503906, "step": 1194 }, { "epoch": 0.7433903576982893, "grad_norm": 41.518917083740234, "learning_rate": 4.179345320424159e-06, "logits/chosen": -0.8975342512130737, "logits/rejected": 3.0638198852539062, "logps/chosen": -586.2952880859375, "logps/rejected": -1060.0189208984375, "loss": 0.6023, "rewards/accuracies": 0.875, "rewards/chosen": -13.759418487548828, "rewards/margins": 23.457223892211914, "rewards/rejected": -37.216644287109375, "step": 1195 }, { "epoch": 0.7440124416796268, "grad_norm": 0.0007180199609138072, "learning_rate": 4.178192715537115e-06, "logits/chosen": 2.2216758728027344, "logits/rejected": 0.9457840919494629, "logps/chosen": -644.4976806640625, "logps/rejected": -935.6669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.617611885070801, "rewards/margins": 24.9567928314209, "rewards/rejected": -32.57440185546875, "step": 1196 }, { "epoch": 0.7446345256609642, "grad_norm": 0.6821045875549316, "learning_rate": 4.17704011065007e-06, "logits/chosen": -0.23671680688858032, "logits/rejected": 4.001680850982666, "logps/chosen": -459.28765869140625, "logps/rejected": -920.262451171875, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -7.031026840209961, "rewards/margins": 26.548620223999023, "rewards/rejected": -33.57965087890625, "step": 1197 }, { "epoch": 0.7452566096423017, "grad_norm": 0.0007749017095193267, "learning_rate": 4.175887505763025e-06, "logits/chosen": 0.943214476108551, "logits/rejected": 3.5111474990844727, "logps/chosen": -541.9749145507812, "logps/rejected": -922.7454833984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.438486099243164, "rewards/margins": 21.518447875976562, "rewards/rejected": -32.95693588256836, "step": 1198 }, { "epoch": 0.7458786936236392, "grad_norm": 0.011360271833837032, "learning_rate": 4.17473490087598e-06, "logits/chosen": 0.1720401495695114, "logits/rejected": 2.3051769733428955, "logps/chosen": -502.39007568359375, "logps/rejected": -829.52880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.472356796264648, "rewards/margins": 20.189279556274414, "rewards/rejected": -28.661632537841797, "step": 1199 }, { "epoch": 0.7465007776049767, "grad_norm": 0.048153672367334366, "learning_rate": 4.1735822959889355e-06, "logits/chosen": 0.5549330711364746, "logits/rejected": 1.9446189403533936, "logps/chosen": -512.0660400390625, "logps/rejected": -945.5157470703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.108501434326172, "rewards/margins": 30.116321563720703, "rewards/rejected": -38.224822998046875, "step": 1200 }, { "epoch": 0.7471228615863141, "grad_norm": 0.3604937493801117, "learning_rate": 4.172429691101891e-06, "logits/chosen": -1.7650338411331177, "logits/rejected": 4.217351913452148, "logps/chosen": -459.73455810546875, "logps/rejected": -1017.9171752929688, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -7.848245620727539, "rewards/margins": 28.195613861083984, "rewards/rejected": -36.04385757446289, "step": 1201 }, { "epoch": 0.7477449455676516, "grad_norm": 8.572696685860137e-08, "learning_rate": 4.171277086214846e-06, "logits/chosen": -3.219414710998535, "logits/rejected": 3.129903793334961, "logps/chosen": -304.957763671875, "logps/rejected": -1000.2400512695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.250598907470703, "rewards/margins": 32.68410110473633, "rewards/rejected": -39.93470001220703, "step": 1202 }, { "epoch": 0.7483670295489891, "grad_norm": 0.0002947688626591116, "learning_rate": 4.170124481327801e-06, "logits/chosen": -4.783797264099121, "logits/rejected": 0.5106593370437622, "logps/chosen": -353.1451416015625, "logps/rejected": -933.575439453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.65771198272705, "rewards/margins": 30.264278411865234, "rewards/rejected": -38.92198944091797, "step": 1203 }, { "epoch": 0.7489891135303266, "grad_norm": 0.25060757994651794, "learning_rate": 4.168971876440756e-06, "logits/chosen": -3.233412981033325, "logits/rejected": 3.1030588150024414, "logps/chosen": -330.5057373046875, "logps/rejected": -871.0208740234375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.052924394607544, "rewards/margins": 21.390153884887695, "rewards/rejected": -23.443077087402344, "step": 1204 }, { "epoch": 0.749611197511664, "grad_norm": 26.881370544433594, "learning_rate": 4.167819271553712e-06, "logits/chosen": -0.15430384874343872, "logits/rejected": 0.6273139715194702, "logps/chosen": -479.899658203125, "logps/rejected": -717.349365234375, "loss": 0.3197, "rewards/accuracies": 0.875, "rewards/chosen": -6.2665629386901855, "rewards/margins": 14.102705955505371, "rewards/rejected": -20.36927032470703, "step": 1205 }, { "epoch": 0.7502332814930015, "grad_norm": 26.799030303955078, "learning_rate": 4.166666666666667e-06, "logits/chosen": 1.0275671482086182, "logits/rejected": 3.896846294403076, "logps/chosen": -617.9913330078125, "logps/rejected": -1051.7125244140625, "loss": 0.7833, "rewards/accuracies": 0.875, "rewards/chosen": -12.258463859558105, "rewards/margins": 28.130443572998047, "rewards/rejected": -40.3889045715332, "step": 1206 }, { "epoch": 0.7508553654743391, "grad_norm": 1.8569644453236833e-05, "learning_rate": 4.165514061779622e-06, "logits/chosen": -2.7208077907562256, "logits/rejected": 1.4001092910766602, "logps/chosen": -469.54425048828125, "logps/rejected": -927.6661987304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.268619537353516, "rewards/margins": 24.516754150390625, "rewards/rejected": -33.78537368774414, "step": 1207 }, { "epoch": 0.7514774494556765, "grad_norm": 0.35053345561027527, "learning_rate": 4.164361456892578e-06, "logits/chosen": 1.7314445972442627, "logits/rejected": 3.289358377456665, "logps/chosen": -510.5986633300781, "logps/rejected": -721.63720703125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -7.4644880294799805, "rewards/margins": 17.393329620361328, "rewards/rejected": -24.857816696166992, "step": 1208 }, { "epoch": 0.752099533437014, "grad_norm": 0.00014789852139074355, "learning_rate": 4.163208852005533e-06, "logits/chosen": -1.9375808238983154, "logits/rejected": 4.110387802124023, "logps/chosen": -506.71685791015625, "logps/rejected": -1072.865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.544163227081299, "rewards/margins": 24.764442443847656, "rewards/rejected": -32.3086051940918, "step": 1209 }, { "epoch": 0.7527216174183515, "grad_norm": 15.979223251342773, "learning_rate": 4.162056247118489e-06, "logits/chosen": -2.863640308380127, "logits/rejected": -0.7609111070632935, "logps/chosen": -369.8997802734375, "logps/rejected": -622.5078735351562, "loss": 0.7803, "rewards/accuracies": 0.875, "rewards/chosen": -7.1783270835876465, "rewards/margins": 17.676910400390625, "rewards/rejected": -24.85523796081543, "step": 1210 }, { "epoch": 0.753343701399689, "grad_norm": 13.629114151000977, "learning_rate": 4.160903642231444e-06, "logits/chosen": -1.767147421836853, "logits/rejected": 3.5947704315185547, "logps/chosen": -434.3961181640625, "logps/rejected": -929.360595703125, "loss": 0.0854, "rewards/accuracies": 1.0, "rewards/chosen": -3.125800132751465, "rewards/margins": 20.63152313232422, "rewards/rejected": -23.75732421875, "step": 1211 }, { "epoch": 0.7539657853810264, "grad_norm": 23.103200912475586, "learning_rate": 4.159751037344399e-06, "logits/chosen": 2.1288254261016846, "logits/rejected": 4.818546772003174, "logps/chosen": -674.2977294921875, "logps/rejected": -1065.44775390625, "loss": 0.1744, "rewards/accuracies": 0.875, "rewards/chosen": -9.28523063659668, "rewards/margins": 26.10309600830078, "rewards/rejected": -35.38832473754883, "step": 1212 }, { "epoch": 0.7545878693623639, "grad_norm": 0.00010452913556946442, "learning_rate": 4.158598432457354e-06, "logits/chosen": -0.18677985668182373, "logits/rejected": 3.669586658477783, "logps/chosen": -445.21282958984375, "logps/rejected": -975.4344482421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.2635016441345215, "rewards/margins": 30.339672088623047, "rewards/rejected": -35.60317611694336, "step": 1213 }, { "epoch": 0.7552099533437014, "grad_norm": 0.026064734905958176, "learning_rate": 4.1574458275703095e-06, "logits/chosen": -2.9125752449035645, "logits/rejected": 2.556290864944458, "logps/chosen": -416.84442138671875, "logps/rejected": -1013.16455078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.045517921447754, "rewards/margins": 28.890634536743164, "rewards/rejected": -37.936153411865234, "step": 1214 }, { "epoch": 0.7558320373250389, "grad_norm": 0.056555796414613724, "learning_rate": 4.156293222683265e-06, "logits/chosen": 0.5654237866401672, "logits/rejected": 3.7856836318969727, "logps/chosen": -555.9217529296875, "logps/rejected": -896.5208740234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.224716186523438, "rewards/margins": 21.14641571044922, "rewards/rejected": -31.371129989624023, "step": 1215 }, { "epoch": 0.7564541213063763, "grad_norm": 27.12925910949707, "learning_rate": 4.15514061779622e-06, "logits/chosen": 1.7580722570419312, "logits/rejected": 4.319855690002441, "logps/chosen": -604.033447265625, "logps/rejected": -890.464111328125, "loss": 0.2599, "rewards/accuracies": 0.875, "rewards/chosen": -7.791042804718018, "rewards/margins": 17.623245239257812, "rewards/rejected": -25.414289474487305, "step": 1216 }, { "epoch": 0.7570762052877138, "grad_norm": 48.94939422607422, "learning_rate": 4.153988012909175e-06, "logits/chosen": 0.016593068838119507, "logits/rejected": 3.1171576976776123, "logps/chosen": -501.7952880859375, "logps/rejected": -860.7073364257812, "loss": 1.7968, "rewards/accuracies": 0.75, "rewards/chosen": -12.870309829711914, "rewards/margins": 20.11650276184082, "rewards/rejected": -32.98681640625, "step": 1217 }, { "epoch": 0.7576982892690514, "grad_norm": 0.00010765832848846912, "learning_rate": 4.15283540802213e-06, "logits/chosen": -0.6010594964027405, "logits/rejected": 4.7762250900268555, "logps/chosen": -502.10894775390625, "logps/rejected": -1062.4873046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.751803398132324, "rewards/margins": 29.6951904296875, "rewards/rejected": -39.446990966796875, "step": 1218 }, { "epoch": 0.7583203732503888, "grad_norm": 29.66597557067871, "learning_rate": 4.151682803135086e-06, "logits/chosen": 0.9268134832382202, "logits/rejected": 2.4630675315856934, "logps/chosen": -636.5853271484375, "logps/rejected": -1010.4078369140625, "loss": 0.3565, "rewards/accuracies": 0.875, "rewards/chosen": -9.526751518249512, "rewards/margins": 27.499366760253906, "rewards/rejected": -37.02611541748047, "step": 1219 }, { "epoch": 0.7589424572317263, "grad_norm": 40.08103561401367, "learning_rate": 4.150530198248041e-06, "logits/chosen": -1.796665072441101, "logits/rejected": 2.3823959827423096, "logps/chosen": -502.9656982421875, "logps/rejected": -979.6764526367188, "loss": 0.5065, "rewards/accuracies": 0.875, "rewards/chosen": -10.961488723754883, "rewards/margins": 22.71531105041504, "rewards/rejected": -33.67679977416992, "step": 1220 }, { "epoch": 0.7595645412130637, "grad_norm": 0.29949039220809937, "learning_rate": 4.149377593360996e-06, "logits/chosen": 2.081892728805542, "logits/rejected": 3.9054107666015625, "logps/chosen": -588.187744140625, "logps/rejected": -923.739990234375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -8.143125534057617, "rewards/margins": 25.538053512573242, "rewards/rejected": -33.68117904663086, "step": 1221 }, { "epoch": 0.7601866251944013, "grad_norm": 5.415968189481646e-06, "learning_rate": 4.148224988473951e-06, "logits/chosen": 1.3517963886260986, "logits/rejected": 3.5172603130340576, "logps/chosen": -586.5282592773438, "logps/rejected": -1011.800537109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.000934600830078, "rewards/margins": 27.62999153137207, "rewards/rejected": -38.630924224853516, "step": 1222 }, { "epoch": 0.7608087091757387, "grad_norm": 0.0005389899015426636, "learning_rate": 4.147072383586907e-06, "logits/chosen": -0.550157368183136, "logits/rejected": 1.908506989479065, "logps/chosen": -415.641357421875, "logps/rejected": -694.29345703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.685222148895264, "rewards/margins": 19.236995697021484, "rewards/rejected": -24.922218322753906, "step": 1223 }, { "epoch": 0.7614307931570762, "grad_norm": 0.00261475401930511, "learning_rate": 4.145919778699863e-06, "logits/chosen": 0.1322479248046875, "logits/rejected": 2.452498435974121, "logps/chosen": -573.089111328125, "logps/rejected": -906.893310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.214548110961914, "rewards/margins": 22.150157928466797, "rewards/rejected": -32.364707946777344, "step": 1224 }, { "epoch": 0.7620528771384136, "grad_norm": 0.026828749105334282, "learning_rate": 4.144767173812818e-06, "logits/chosen": 1.557422161102295, "logits/rejected": 2.9317455291748047, "logps/chosen": -688.3087158203125, "logps/rejected": -935.5712890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.075133323669434, "rewards/margins": 18.204994201660156, "rewards/rejected": -30.280128479003906, "step": 1225 }, { "epoch": 0.7626749611197512, "grad_norm": 6.467380523681641, "learning_rate": 4.143614568925773e-06, "logits/chosen": 0.3639960289001465, "logits/rejected": 3.995713233947754, "logps/chosen": -623.2493286132812, "logps/rejected": -1020.880126953125, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": -11.351022720336914, "rewards/margins": 22.294944763183594, "rewards/rejected": -33.64596939086914, "step": 1226 }, { "epoch": 0.7632970451010886, "grad_norm": 0.0015312153846025467, "learning_rate": 4.142461964038728e-06, "logits/chosen": 2.2482800483703613, "logits/rejected": 3.425135374069214, "logps/chosen": -578.300048828125, "logps/rejected": -859.5419921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.633014678955078, "rewards/margins": 21.133586883544922, "rewards/rejected": -31.7666015625, "step": 1227 }, { "epoch": 0.7639191290824261, "grad_norm": 45.72669982910156, "learning_rate": 4.1413093591516835e-06, "logits/chosen": -0.774622917175293, "logits/rejected": 3.128000259399414, "logps/chosen": -628.8204345703125, "logps/rejected": -994.5520629882812, "loss": 0.7044, "rewards/accuracies": 0.75, "rewards/chosen": -10.55477523803711, "rewards/margins": 14.067876815795898, "rewards/rejected": -24.62265396118164, "step": 1228 }, { "epoch": 0.7645412130637637, "grad_norm": 0.012374775484204292, "learning_rate": 4.140156754264638e-06, "logits/chosen": -1.5677317380905151, "logits/rejected": 3.701002359390259, "logps/chosen": -402.24896240234375, "logps/rejected": -841.3854370117188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.498409271240234, "rewards/margins": 20.21123504638672, "rewards/rejected": -25.709644317626953, "step": 1229 }, { "epoch": 0.7651632970451011, "grad_norm": 17.667760848999023, "learning_rate": 4.139004149377593e-06, "logits/chosen": -0.14382916688919067, "logits/rejected": 3.2963027954101562, "logps/chosen": -537.2012939453125, "logps/rejected": -980.7515869140625, "loss": 0.0985, "rewards/accuracies": 0.875, "rewards/chosen": -8.536566734313965, "rewards/margins": 25.164365768432617, "rewards/rejected": -33.70093536376953, "step": 1230 }, { "epoch": 0.7657853810264386, "grad_norm": 0.952735185623169, "learning_rate": 4.137851544490548e-06, "logits/chosen": 2.5447874069213867, "logits/rejected": 3.823918104171753, "logps/chosen": -715.80419921875, "logps/rejected": -1034.6048583984375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -14.001297950744629, "rewards/margins": 20.011383056640625, "rewards/rejected": -34.01268005371094, "step": 1231 }, { "epoch": 0.766407465007776, "grad_norm": 0.005690231919288635, "learning_rate": 4.136698939603504e-06, "logits/chosen": -2.129636287689209, "logits/rejected": 3.6906516551971436, "logps/chosen": -313.97076416015625, "logps/rejected": -797.6610107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.206093788146973, "rewards/margins": 20.89019012451172, "rewards/rejected": -28.096282958984375, "step": 1232 }, { "epoch": 0.7670295489891136, "grad_norm": 35.85158920288086, "learning_rate": 4.13554633471646e-06, "logits/chosen": -0.5706221461296082, "logits/rejected": 2.8684823513031006, "logps/chosen": -525.5751953125, "logps/rejected": -977.8446044921875, "loss": 0.8167, "rewards/accuracies": 0.875, "rewards/chosen": -13.22457218170166, "rewards/margins": 23.050174713134766, "rewards/rejected": -36.274749755859375, "step": 1233 }, { "epoch": 0.767651632970451, "grad_norm": 57.710357666015625, "learning_rate": 4.134393729829415e-06, "logits/chosen": -0.09655407816171646, "logits/rejected": 1.2852544784545898, "logps/chosen": -605.3065185546875, "logps/rejected": -809.9786376953125, "loss": 1.7346, "rewards/accuracies": 0.75, "rewards/chosen": -13.01135540008545, "rewards/margins": 13.374024391174316, "rewards/rejected": -26.385379791259766, "step": 1234 }, { "epoch": 0.7682737169517885, "grad_norm": 0.0007348281214945018, "learning_rate": 4.13324112494237e-06, "logits/chosen": 2.3951401710510254, "logits/rejected": 4.68405294418335, "logps/chosen": -720.020751953125, "logps/rejected": -1034.022705078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.383231163024902, "rewards/margins": 21.747941970825195, "rewards/rejected": -33.13117218017578, "step": 1235 }, { "epoch": 0.7688958009331259, "grad_norm": 40.10581970214844, "learning_rate": 4.132088520055325e-06, "logits/chosen": -1.058050513267517, "logits/rejected": 1.4945002794265747, "logps/chosen": -526.7613525390625, "logps/rejected": -863.2998657226562, "loss": 0.6122, "rewards/accuracies": 0.875, "rewards/chosen": -8.167625427246094, "rewards/margins": 21.5457763671875, "rewards/rejected": -29.713401794433594, "step": 1236 }, { "epoch": 0.7695178849144635, "grad_norm": 0.1868850290775299, "learning_rate": 4.1309359151682805e-06, "logits/chosen": -0.6735565662384033, "logits/rejected": 3.166142463684082, "logps/chosen": -555.5580444335938, "logps/rejected": -926.4407958984375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -13.521209716796875, "rewards/margins": 24.875356674194336, "rewards/rejected": -38.396568298339844, "step": 1237 }, { "epoch": 0.7701399688958009, "grad_norm": 17.024826049804688, "learning_rate": 4.129783310281236e-06, "logits/chosen": -1.321545124053955, "logits/rejected": 2.6138415336608887, "logps/chosen": -372.07635498046875, "logps/rejected": -855.348388671875, "loss": 0.1021, "rewards/accuracies": 0.875, "rewards/chosen": -6.196434020996094, "rewards/margins": 24.538761138916016, "rewards/rejected": -30.735193252563477, "step": 1238 }, { "epoch": 0.7707620528771384, "grad_norm": 8.435630798339844, "learning_rate": 4.128630705394191e-06, "logits/chosen": 2.0260398387908936, "logits/rejected": 4.694815635681152, "logps/chosen": -593.011962890625, "logps/rejected": -928.0355224609375, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": -9.318330764770508, "rewards/margins": 20.424802780151367, "rewards/rejected": -29.743133544921875, "step": 1239 }, { "epoch": 0.7713841368584758, "grad_norm": 15.079669952392578, "learning_rate": 4.127478100507146e-06, "logits/chosen": -1.5058603286743164, "logits/rejected": 3.0221991539001465, "logps/chosen": -332.8693542480469, "logps/rejected": -755.5462036132812, "loss": 0.0755, "rewards/accuracies": 1.0, "rewards/chosen": -4.04178524017334, "rewards/margins": 22.907142639160156, "rewards/rejected": -26.94892692565918, "step": 1240 }, { "epoch": 0.7720062208398134, "grad_norm": 1.4181602001190186, "learning_rate": 4.1263254956201014e-06, "logits/chosen": 2.111570119857788, "logits/rejected": 5.410984039306641, "logps/chosen": -535.548095703125, "logps/rejected": -1025.924072265625, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -7.443130016326904, "rewards/margins": 25.616270065307617, "rewards/rejected": -33.05940246582031, "step": 1241 }, { "epoch": 0.7726283048211509, "grad_norm": 25.272632598876953, "learning_rate": 4.125172890733057e-06, "logits/chosen": 0.5199185013771057, "logits/rejected": 2.5829875469207764, "logps/chosen": -668.5764770507812, "logps/rejected": -979.669921875, "loss": 0.2667, "rewards/accuracies": 0.875, "rewards/chosen": -10.801076889038086, "rewards/margins": 19.066125869750977, "rewards/rejected": -29.867202758789062, "step": 1242 }, { "epoch": 0.7732503888024883, "grad_norm": 7.891100722190458e-06, "learning_rate": 4.124020285846012e-06, "logits/chosen": 1.2405672073364258, "logits/rejected": 4.021305084228516, "logps/chosen": -650.9105224609375, "logps/rejected": -1101.25341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.636184692382812, "rewards/margins": 27.267887115478516, "rewards/rejected": -36.904075622558594, "step": 1243 }, { "epoch": 0.7738724727838259, "grad_norm": 26.89502716064453, "learning_rate": 4.122867680958967e-06, "logits/chosen": 2.664867401123047, "logits/rejected": 4.436239242553711, "logps/chosen": -678.9229736328125, "logps/rejected": -927.5115356445312, "loss": 0.2157, "rewards/accuracies": 0.875, "rewards/chosen": -10.128175735473633, "rewards/margins": 16.038503646850586, "rewards/rejected": -26.16668128967285, "step": 1244 }, { "epoch": 0.7744945567651633, "grad_norm": 0.000490253500174731, "learning_rate": 4.121715076071922e-06, "logits/chosen": 3.488694667816162, "logits/rejected": 3.4402871131896973, "logps/chosen": -698.736328125, "logps/rejected": -955.9376220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.386505126953125, "rewards/margins": 21.48382568359375, "rewards/rejected": -30.870332717895508, "step": 1245 }, { "epoch": 0.7751166407465008, "grad_norm": 0.0054618967697024345, "learning_rate": 4.1205624711848776e-06, "logits/chosen": -0.03978198766708374, "logits/rejected": 4.639406204223633, "logps/chosen": -345.4989013671875, "logps/rejected": -861.0494384765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.173473358154297, "rewards/margins": 24.827526092529297, "rewards/rejected": -32.000999450683594, "step": 1246 }, { "epoch": 0.7757387247278382, "grad_norm": 27.640233993530273, "learning_rate": 4.119409866297834e-06, "logits/chosen": -1.448029637336731, "logits/rejected": 2.496913433074951, "logps/chosen": -540.625732421875, "logps/rejected": -894.2573852539062, "loss": 0.4569, "rewards/accuracies": 0.875, "rewards/chosen": -7.5917887687683105, "rewards/margins": 21.170873641967773, "rewards/rejected": -28.762664794921875, "step": 1247 }, { "epoch": 0.7763608087091758, "grad_norm": 0.006316736806184053, "learning_rate": 4.118257261410789e-06, "logits/chosen": -2.042257785797119, "logits/rejected": 3.5098836421966553, "logps/chosen": -353.7255859375, "logps/rejected": -886.4546508789062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.367911338806152, "rewards/margins": 21.32790756225586, "rewards/rejected": -27.695819854736328, "step": 1248 }, { "epoch": 0.7769828926905132, "grad_norm": 8.07496166229248, "learning_rate": 4.117104656523744e-06, "logits/chosen": -0.2673183083534241, "logits/rejected": 3.8783979415893555, "logps/chosen": -519.2168579101562, "logps/rejected": -1002.5673828125, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": -10.099881172180176, "rewards/margins": 28.038101196289062, "rewards/rejected": -38.13798522949219, "step": 1249 }, { "epoch": 0.7776049766718507, "grad_norm": 2.958667278289795, "learning_rate": 4.115952051636699e-06, "logits/chosen": 0.0354306697845459, "logits/rejected": 3.6872379779815674, "logps/chosen": -516.661376953125, "logps/rejected": -992.490478515625, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -8.733585357666016, "rewards/margins": 24.6900634765625, "rewards/rejected": -33.423648834228516, "step": 1250 }, { "epoch": 0.7782270606531881, "grad_norm": 6.522555828094482, "learning_rate": 4.1147994467496545e-06, "logits/chosen": -0.5028421878814697, "logits/rejected": 2.9892351627349854, "logps/chosen": -484.8457336425781, "logps/rejected": -888.728759765625, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": -6.360073089599609, "rewards/margins": 21.569787979125977, "rewards/rejected": -27.929861068725586, "step": 1251 }, { "epoch": 0.7788491446345257, "grad_norm": 0.0009231179719790816, "learning_rate": 4.11364684186261e-06, "logits/chosen": 1.3579542636871338, "logits/rejected": 3.720470905303955, "logps/chosen": -570.8295288085938, "logps/rejected": -911.2254638671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.875762939453125, "rewards/margins": 23.522167205810547, "rewards/rejected": -31.397930145263672, "step": 1252 }, { "epoch": 0.7794712286158632, "grad_norm": 33.93539810180664, "learning_rate": 4.112494236975565e-06, "logits/chosen": -0.39095282554626465, "logits/rejected": 3.6155381202697754, "logps/chosen": -434.69134521484375, "logps/rejected": -829.1648559570312, "loss": 0.88, "rewards/accuracies": 0.75, "rewards/chosen": -6.6295928955078125, "rewards/margins": 20.17112922668457, "rewards/rejected": -26.800724029541016, "step": 1253 }, { "epoch": 0.7800933125972006, "grad_norm": 0.00012112106196582317, "learning_rate": 4.11134163208852e-06, "logits/chosen": 0.8299846649169922, "logits/rejected": 3.10577654838562, "logps/chosen": -411.1665344238281, "logps/rejected": -749.471923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.046252250671387, "rewards/margins": 24.529747009277344, "rewards/rejected": -29.576000213623047, "step": 1254 }, { "epoch": 0.7807153965785381, "grad_norm": 0.04176100715994835, "learning_rate": 4.110189027201475e-06, "logits/chosen": 3.4682154655456543, "logits/rejected": 4.857761859893799, "logps/chosen": -729.2528686523438, "logps/rejected": -954.7874145507812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.462285995483398, "rewards/margins": 15.092519760131836, "rewards/rejected": -23.554805755615234, "step": 1255 }, { "epoch": 0.7813374805598756, "grad_norm": 0.012754272669553757, "learning_rate": 4.109036422314431e-06, "logits/chosen": -1.916728138923645, "logits/rejected": 3.354323625564575, "logps/chosen": -385.1650390625, "logps/rejected": -876.616455078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.575162649154663, "rewards/margins": 26.097787857055664, "rewards/rejected": -28.672950744628906, "step": 1256 }, { "epoch": 0.7819595645412131, "grad_norm": 3.408075281186029e-05, "learning_rate": 4.107883817427386e-06, "logits/chosen": -1.3067548274993896, "logits/rejected": 2.744790554046631, "logps/chosen": -485.6220397949219, "logps/rejected": -959.7124633789062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.141744613647461, "rewards/margins": 25.00613784790039, "rewards/rejected": -31.14788055419922, "step": 1257 }, { "epoch": 0.7825816485225505, "grad_norm": 30.633358001708984, "learning_rate": 4.106731212540341e-06, "logits/chosen": 0.2253757119178772, "logits/rejected": 3.640742778778076, "logps/chosen": -466.9993591308594, "logps/rejected": -941.8668212890625, "loss": 0.7124, "rewards/accuracies": 0.75, "rewards/chosen": -4.170479774475098, "rewards/margins": 25.03893280029297, "rewards/rejected": -29.209413528442383, "step": 1258 }, { "epoch": 0.783203732503888, "grad_norm": 0.0012299851514399052, "learning_rate": 4.105578607653296e-06, "logits/chosen": 0.25474441051483154, "logits/rejected": 4.737069606781006, "logps/chosen": -513.61181640625, "logps/rejected": -1030.615966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.679970741271973, "rewards/margins": 24.170072555541992, "rewards/rejected": -32.85004425048828, "step": 1259 }, { "epoch": 0.7838258164852255, "grad_norm": 8.764855010667816e-06, "learning_rate": 4.1044260027662515e-06, "logits/chosen": 1.943559169769287, "logits/rejected": 4.239380836486816, "logps/chosen": -667.2208251953125, "logps/rejected": -1133.9271240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.01846694946289, "rewards/margins": 32.54998779296875, "rewards/rejected": -41.56845474243164, "step": 1260 }, { "epoch": 0.784447900466563, "grad_norm": 0.08819999545812607, "learning_rate": 4.103273397879208e-06, "logits/chosen": -0.5123782157897949, "logits/rejected": 3.680102825164795, "logps/chosen": -395.6453857421875, "logps/rejected": -820.6551513671875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.562816858291626, "rewards/margins": 22.052396774291992, "rewards/rejected": -24.61521339416504, "step": 1261 }, { "epoch": 0.7850699844479004, "grad_norm": 1.087512493133545, "learning_rate": 4.102120792992163e-06, "logits/chosen": 0.575364351272583, "logits/rejected": 5.38516092300415, "logps/chosen": -390.524658203125, "logps/rejected": -909.357421875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -6.065713882446289, "rewards/margins": 23.575519561767578, "rewards/rejected": -29.641231536865234, "step": 1262 }, { "epoch": 0.785692068429238, "grad_norm": 0.4358244240283966, "learning_rate": 4.100968188105118e-06, "logits/chosen": -0.8720681667327881, "logits/rejected": 3.837329387664795, "logps/chosen": -467.9682312011719, "logps/rejected": -962.12744140625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -8.806166648864746, "rewards/margins": 28.562875747680664, "rewards/rejected": -37.369041442871094, "step": 1263 }, { "epoch": 0.7863141524105755, "grad_norm": 0.13888658583164215, "learning_rate": 4.099815583218073e-06, "logits/chosen": 1.3139761686325073, "logits/rejected": 3.096834421157837, "logps/chosen": -642.6888427734375, "logps/rejected": -916.2388916015625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -11.698795318603516, "rewards/margins": 20.658552169799805, "rewards/rejected": -32.35734939575195, "step": 1264 }, { "epoch": 0.7869362363919129, "grad_norm": 0.20697729289531708, "learning_rate": 4.0986629783310285e-06, "logits/chosen": -1.3422939777374268, "logits/rejected": 3.9096124172210693, "logps/chosen": -477.9691162109375, "logps/rejected": -1044.06787109375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -6.874945640563965, "rewards/margins": 23.77069854736328, "rewards/rejected": -30.645645141601562, "step": 1265 }, { "epoch": 0.7875583203732504, "grad_norm": 25.782604217529297, "learning_rate": 4.097510373443984e-06, "logits/chosen": 1.302040696144104, "logits/rejected": 4.496374130249023, "logps/chosen": -546.2277221679688, "logps/rejected": -870.092041015625, "loss": 0.4589, "rewards/accuracies": 0.875, "rewards/chosen": -7.486966609954834, "rewards/margins": 18.784543991088867, "rewards/rejected": -26.27151107788086, "step": 1266 }, { "epoch": 0.7881804043545879, "grad_norm": 5.894190311431885, "learning_rate": 4.096357768556939e-06, "logits/chosen": 1.1210216283798218, "logits/rejected": 3.163902759552002, "logps/chosen": -623.9061889648438, "logps/rejected": -906.975830078125, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": -9.462145805358887, "rewards/margins": 18.545324325561523, "rewards/rejected": -28.007469177246094, "step": 1267 }, { "epoch": 0.7888024883359254, "grad_norm": 5.6784025218803436e-05, "learning_rate": 4.095205163669894e-06, "logits/chosen": -1.7827244997024536, "logits/rejected": 2.870236396789551, "logps/chosen": -451.1767578125, "logps/rejected": -966.4337158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.05531120300293, "rewards/margins": 30.899620056152344, "rewards/rejected": -38.954933166503906, "step": 1268 }, { "epoch": 0.7894245723172628, "grad_norm": 26.408723831176758, "learning_rate": 4.094052558782849e-06, "logits/chosen": 0.2130802422761917, "logits/rejected": 3.3761491775512695, "logps/chosen": -543.6114501953125, "logps/rejected": -946.999267578125, "loss": 0.3535, "rewards/accuracies": 0.875, "rewards/chosen": -8.727877616882324, "rewards/margins": 23.667865753173828, "rewards/rejected": -32.39574432373047, "step": 1269 }, { "epoch": 0.7900466562986003, "grad_norm": 0.0007318244897760451, "learning_rate": 4.092899953895805e-06, "logits/chosen": 1.295201301574707, "logits/rejected": 3.541900157928467, "logps/chosen": -644.5004272460938, "logps/rejected": -1030.76123046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.541393756866455, "rewards/margins": 28.21343421936035, "rewards/rejected": -33.75482940673828, "step": 1270 }, { "epoch": 0.7906687402799378, "grad_norm": 0.8289233446121216, "learning_rate": 4.09174734900876e-06, "logits/chosen": 1.6525609493255615, "logits/rejected": 4.1350908279418945, "logps/chosen": -319.18353271484375, "logps/rejected": -693.30517578125, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -5.030683517456055, "rewards/margins": 20.20093536376953, "rewards/rejected": -25.231618881225586, "step": 1271 }, { "epoch": 0.7912908242612753, "grad_norm": 1.632172703742981, "learning_rate": 4.090594744121715e-06, "logits/chosen": 3.0184199810028076, "logits/rejected": 4.094862937927246, "logps/chosen": -606.4073486328125, "logps/rejected": -852.579833984375, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -9.487676620483398, "rewards/margins": 18.483928680419922, "rewards/rejected": -27.971603393554688, "step": 1272 }, { "epoch": 0.7919129082426127, "grad_norm": 0.05077657476067543, "learning_rate": 4.08944213923467e-06, "logits/chosen": 3.8291378021240234, "logits/rejected": 3.1013073921203613, "logps/chosen": -732.06884765625, "logps/rejected": -885.6846923828125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -14.131711959838867, "rewards/margins": 19.296707153320312, "rewards/rejected": -33.42841720581055, "step": 1273 }, { "epoch": 0.7925349922239502, "grad_norm": 23.887598037719727, "learning_rate": 4.0882895343476255e-06, "logits/chosen": -0.9498767852783203, "logits/rejected": 1.325081467628479, "logps/chosen": -477.6418762207031, "logps/rejected": -833.086181640625, "loss": 0.4625, "rewards/accuracies": 0.875, "rewards/chosen": -8.639581680297852, "rewards/margins": 16.080799102783203, "rewards/rejected": -24.720382690429688, "step": 1274 }, { "epoch": 0.7931570762052877, "grad_norm": 28.14263153076172, "learning_rate": 4.087136929460581e-06, "logits/chosen": 1.22007417678833, "logits/rejected": 5.139258861541748, "logps/chosen": -536.0076904296875, "logps/rejected": -923.4759521484375, "loss": 0.6636, "rewards/accuracies": 0.875, "rewards/chosen": -11.12144660949707, "rewards/margins": 16.953052520751953, "rewards/rejected": -28.074499130249023, "step": 1275 }, { "epoch": 0.7937791601866252, "grad_norm": 0.00033927810727618635, "learning_rate": 4.085984324573537e-06, "logits/chosen": 1.6169593334197998, "logits/rejected": 4.11783504486084, "logps/chosen": -577.6982421875, "logps/rejected": -1023.56396484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.645487785339355, "rewards/margins": 30.665348052978516, "rewards/rejected": -40.31083297729492, "step": 1276 }, { "epoch": 0.7944012441679627, "grad_norm": 0.0009849730413407087, "learning_rate": 4.084831719686492e-06, "logits/chosen": 5.122714519500732, "logits/rejected": 5.933152675628662, "logps/chosen": -818.3211059570312, "logps/rejected": -1011.8892822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.776374816894531, "rewards/margins": 21.75531768798828, "rewards/rejected": -32.53169250488281, "step": 1277 }, { "epoch": 0.7950233281493001, "grad_norm": 1.222702980041504, "learning_rate": 4.083679114799447e-06, "logits/chosen": 0.6945499777793884, "logits/rejected": 2.474721908569336, "logps/chosen": -430.6312255859375, "logps/rejected": -710.2374267578125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -7.810029983520508, "rewards/margins": 17.72357177734375, "rewards/rejected": -25.533601760864258, "step": 1278 }, { "epoch": 0.7956454121306377, "grad_norm": 0.0002225928328698501, "learning_rate": 4.0825265099124025e-06, "logits/chosen": 0.8473066091537476, "logits/rejected": 5.379339218139648, "logps/chosen": -543.3826293945312, "logps/rejected": -1045.0135498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.165830612182617, "rewards/margins": 25.138229370117188, "rewards/rejected": -33.30405807495117, "step": 1279 }, { "epoch": 0.7962674961119751, "grad_norm": 3.7584469318389893, "learning_rate": 4.081373905025358e-06, "logits/chosen": -2.6725735664367676, "logits/rejected": 2.786515951156616, "logps/chosen": -283.8596496582031, "logps/rejected": -721.12841796875, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": -4.44124698638916, "rewards/margins": 19.681657791137695, "rewards/rejected": -24.122905731201172, "step": 1280 }, { "epoch": 0.7968895800933126, "grad_norm": 0.00032045444822870195, "learning_rate": 4.080221300138313e-06, "logits/chosen": 0.8776124119758606, "logits/rejected": 4.913902282714844, "logps/chosen": -560.6947631835938, "logps/rejected": -1074.2099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.016932487487793, "rewards/margins": 28.982444763183594, "rewards/rejected": -37.99938201904297, "step": 1281 }, { "epoch": 0.7975116640746501, "grad_norm": 0.02013535052537918, "learning_rate": 4.079068695251268e-06, "logits/chosen": 1.071070909500122, "logits/rejected": 3.490199565887451, "logps/chosen": -558.4388427734375, "logps/rejected": -955.5740356445312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.795027732849121, "rewards/margins": 21.521804809570312, "rewards/rejected": -28.31683349609375, "step": 1282 }, { "epoch": 0.7981337480559876, "grad_norm": 0.3003612160682678, "learning_rate": 4.077916090364223e-06, "logits/chosen": 0.02880948781967163, "logits/rejected": 3.635495185852051, "logps/chosen": -549.6759033203125, "logps/rejected": -964.7757568359375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -10.934090614318848, "rewards/margins": 24.060932159423828, "rewards/rejected": -34.995025634765625, "step": 1283 }, { "epoch": 0.798755832037325, "grad_norm": 0.021271033212542534, "learning_rate": 4.076763485477179e-06, "logits/chosen": 0.04239767789840698, "logits/rejected": 2.0946714878082275, "logps/chosen": -625.83837890625, "logps/rejected": -981.59423828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.937152862548828, "rewards/margins": 23.588687896728516, "rewards/rejected": -29.525840759277344, "step": 1284 }, { "epoch": 0.7993779160186625, "grad_norm": 31.318174362182617, "learning_rate": 4.075610880590134e-06, "logits/chosen": 2.3726325035095215, "logits/rejected": 2.115004301071167, "logps/chosen": -538.9212646484375, "logps/rejected": -803.7254638671875, "loss": 1.1857, "rewards/accuracies": 0.875, "rewards/chosen": -11.018959045410156, "rewards/margins": 18.249353408813477, "rewards/rejected": -29.268310546875, "step": 1285 }, { "epoch": 0.8, "grad_norm": 9.438876152038574, "learning_rate": 4.074458275703089e-06, "logits/chosen": -1.8963449001312256, "logits/rejected": 2.997347354888916, "logps/chosen": -343.0146179199219, "logps/rejected": -801.9140014648438, "loss": 0.1191, "rewards/accuracies": 0.875, "rewards/chosen": -7.2926506996154785, "rewards/margins": 18.306407928466797, "rewards/rejected": -25.59906005859375, "step": 1286 }, { "epoch": 0.8006220839813375, "grad_norm": 10.629618644714355, "learning_rate": 4.073305670816044e-06, "logits/chosen": -1.7159764766693115, "logits/rejected": 3.7132809162139893, "logps/chosen": -352.5853271484375, "logps/rejected": -948.1820068359375, "loss": 0.1264, "rewards/accuracies": 0.875, "rewards/chosen": -8.144214630126953, "rewards/margins": 26.92259979248047, "rewards/rejected": -35.06681442260742, "step": 1287 }, { "epoch": 0.801244167962675, "grad_norm": 1.6232259273529053, "learning_rate": 4.0721530659289995e-06, "logits/chosen": 1.8202488422393799, "logits/rejected": 5.256498336791992, "logps/chosen": -496.8153076171875, "logps/rejected": -939.01953125, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -9.51845645904541, "rewards/margins": 27.129898071289062, "rewards/rejected": -36.64834976196289, "step": 1288 }, { "epoch": 0.8018662519440124, "grad_norm": 2.7157328128814697, "learning_rate": 4.071000461041955e-06, "logits/chosen": 1.093205213546753, "logits/rejected": 4.126691818237305, "logps/chosen": -497.1220397949219, "logps/rejected": -887.37060546875, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -7.251227855682373, "rewards/margins": 22.370820999145508, "rewards/rejected": -29.62204933166504, "step": 1289 }, { "epoch": 0.80248833592535, "grad_norm": 0.0017288104863837361, "learning_rate": 4.06984785615491e-06, "logits/chosen": -2.63922381401062, "logits/rejected": 2.14827823638916, "logps/chosen": -384.6424560546875, "logps/rejected": -1019.2603149414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.489599704742432, "rewards/margins": 26.801565170288086, "rewards/rejected": -34.29116439819336, "step": 1290 }, { "epoch": 0.8031104199066874, "grad_norm": 0.3165653347969055, "learning_rate": 4.068695251267866e-06, "logits/chosen": -0.5058521628379822, "logits/rejected": 3.0581393241882324, "logps/chosen": -632.1590576171875, "logps/rejected": -1066.4691162109375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.409887313842773, "rewards/margins": 26.200271606445312, "rewards/rejected": -34.61016082763672, "step": 1291 }, { "epoch": 0.8037325038880249, "grad_norm": 0.0003989443648606539, "learning_rate": 4.067542646380821e-06, "logits/chosen": -0.6932083964347839, "logits/rejected": 2.834740161895752, "logps/chosen": -439.7989196777344, "logps/rejected": -1053.9024658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.509608745574951, "rewards/margins": 31.004009246826172, "rewards/rejected": -36.51361846923828, "step": 1292 }, { "epoch": 0.8043545878693623, "grad_norm": 0.0012794709764420986, "learning_rate": 4.0663900414937765e-06, "logits/chosen": 0.4577113091945648, "logits/rejected": 3.1413795948028564, "logps/chosen": -636.5763549804688, "logps/rejected": -1040.104736328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.54163932800293, "rewards/margins": 28.04217529296875, "rewards/rejected": -39.58381271362305, "step": 1293 }, { "epoch": 0.8049766718506999, "grad_norm": 0.010304873809218407, "learning_rate": 4.065237436606732e-06, "logits/chosen": 0.19404852390289307, "logits/rejected": 4.620615005493164, "logps/chosen": -519.24853515625, "logps/rejected": -1041.069091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.314437866210938, "rewards/margins": 24.46490478515625, "rewards/rejected": -32.77934265136719, "step": 1294 }, { "epoch": 0.8055987558320373, "grad_norm": 28.267431259155273, "learning_rate": 4.064084831719687e-06, "logits/chosen": -1.7566967010498047, "logits/rejected": 2.2776732444763184, "logps/chosen": -419.6002197265625, "logps/rejected": -839.655517578125, "loss": 0.1848, "rewards/accuracies": 0.875, "rewards/chosen": -9.208789825439453, "rewards/margins": 19.25501251220703, "rewards/rejected": -28.463802337646484, "step": 1295 }, { "epoch": 0.8062208398133748, "grad_norm": 37.10252380371094, "learning_rate": 4.062932226832642e-06, "logits/chosen": 2.2791783809661865, "logits/rejected": 4.953285217285156, "logps/chosen": -517.1919555664062, "logps/rejected": -895.48779296875, "loss": 0.4776, "rewards/accuracies": 0.875, "rewards/chosen": -7.959544658660889, "rewards/margins": 20.814197540283203, "rewards/rejected": -28.77374267578125, "step": 1296 }, { "epoch": 0.8068429237947123, "grad_norm": 0.35563036799430847, "learning_rate": 4.061779621945597e-06, "logits/chosen": -0.17489880323410034, "logits/rejected": 3.205916404724121, "logps/chosen": -325.92034912109375, "logps/rejected": -804.7338256835938, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -7.247753143310547, "rewards/margins": 23.912797927856445, "rewards/rejected": -31.16054916381836, "step": 1297 }, { "epoch": 0.8074650077760498, "grad_norm": 21.93829917907715, "learning_rate": 4.060627017058553e-06, "logits/chosen": -0.9875195026397705, "logits/rejected": 2.943293571472168, "logps/chosen": -536.0520629882812, "logps/rejected": -937.1588134765625, "loss": 0.1268, "rewards/accuracies": 0.875, "rewards/chosen": -8.072823524475098, "rewards/margins": 22.72176742553711, "rewards/rejected": -30.794591903686523, "step": 1298 }, { "epoch": 0.8080870917573872, "grad_norm": 5.47617491974961e-05, "learning_rate": 4.059474412171508e-06, "logits/chosen": 1.7214620113372803, "logits/rejected": 4.240743637084961, "logps/chosen": -623.8914184570312, "logps/rejected": -996.2498168945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.60321044921875, "rewards/margins": 24.35324478149414, "rewards/rejected": -33.95645523071289, "step": 1299 }, { "epoch": 0.8087091757387247, "grad_norm": 42.22055435180664, "learning_rate": 4.058321807284463e-06, "logits/chosen": 0.30668437480926514, "logits/rejected": 2.93937087059021, "logps/chosen": -671.924072265625, "logps/rejected": -990.665283203125, "loss": 0.871, "rewards/accuracies": 0.875, "rewards/chosen": -10.068768501281738, "rewards/margins": 17.43556785583496, "rewards/rejected": -27.504337310791016, "step": 1300 }, { "epoch": 0.8093312597200623, "grad_norm": 0.3986184597015381, "learning_rate": 4.057169202397418e-06, "logits/chosen": -1.8530347347259521, "logits/rejected": 2.342902183532715, "logps/chosen": -344.0250549316406, "logps/rejected": -775.0608520507812, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -3.663173198699951, "rewards/margins": 27.073396682739258, "rewards/rejected": -30.736572265625, "step": 1301 }, { "epoch": 0.8099533437013997, "grad_norm": 0.01742498017847538, "learning_rate": 4.0560165975103735e-06, "logits/chosen": -0.03469623625278473, "logits/rejected": 3.8866472244262695, "logps/chosen": -395.10205078125, "logps/rejected": -897.292236328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.772964954376221, "rewards/margins": 22.193988800048828, "rewards/rejected": -29.96695327758789, "step": 1302 }, { "epoch": 0.8105754276827372, "grad_norm": 3.09145289065782e-05, "learning_rate": 4.054863992623329e-06, "logits/chosen": -0.405514657497406, "logits/rejected": 2.8145222663879395, "logps/chosen": -432.71527099609375, "logps/rejected": -894.4420166015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.114578247070312, "rewards/margins": 24.84501838684082, "rewards/rejected": -32.9595947265625, "step": 1303 }, { "epoch": 0.8111975116640746, "grad_norm": 0.15704073011875153, "learning_rate": 4.053711387736284e-06, "logits/chosen": 0.05295020341873169, "logits/rejected": 4.433631420135498, "logps/chosen": -493.4110107421875, "logps/rejected": -917.6148681640625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.961913585662842, "rewards/margins": 22.568294525146484, "rewards/rejected": -30.530208587646484, "step": 1304 }, { "epoch": 0.8118195956454122, "grad_norm": 0.05151224881410599, "learning_rate": 4.05255878284924e-06, "logits/chosen": -1.0337135791778564, "logits/rejected": 2.5338025093078613, "logps/chosen": -373.13446044921875, "logps/rejected": -814.0833740234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.4312238693237305, "rewards/margins": 24.39572525024414, "rewards/rejected": -28.826946258544922, "step": 1305 }, { "epoch": 0.8124416796267496, "grad_norm": 3.894642304658191e-06, "learning_rate": 4.051406177962195e-06, "logits/chosen": 0.11323362588882446, "logits/rejected": 5.9528961181640625, "logps/chosen": -481.096435546875, "logps/rejected": -1122.715087890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.372435092926025, "rewards/margins": 27.15767478942871, "rewards/rejected": -34.53010940551758, "step": 1306 }, { "epoch": 0.8130637636080871, "grad_norm": 1.2801079719793051e-05, "learning_rate": 4.0502535730751505e-06, "logits/chosen": -1.0122219324111938, "logits/rejected": 2.4837958812713623, "logps/chosen": -411.2268981933594, "logps/rejected": -929.159423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.501988410949707, "rewards/margins": 28.270177841186523, "rewards/rejected": -36.77216339111328, "step": 1307 }, { "epoch": 0.8136858475894245, "grad_norm": 0.003478578059002757, "learning_rate": 4.049100968188106e-06, "logits/chosen": -1.201788067817688, "logits/rejected": 2.739528179168701, "logps/chosen": -561.0819091796875, "logps/rejected": -1004.871337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.666325569152832, "rewards/margins": 24.24523162841797, "rewards/rejected": -33.911556243896484, "step": 1308 }, { "epoch": 0.8143079315707621, "grad_norm": 35.751625061035156, "learning_rate": 4.047948363301061e-06, "logits/chosen": 1.2579597234725952, "logits/rejected": 3.93234920501709, "logps/chosen": -611.4188232421875, "logps/rejected": -933.8849487304688, "loss": 0.8951, "rewards/accuracies": 0.875, "rewards/chosen": -12.106334686279297, "rewards/margins": 18.69715118408203, "rewards/rejected": -30.803485870361328, "step": 1309 }, { "epoch": 0.8149300155520995, "grad_norm": 0.08502575755119324, "learning_rate": 4.046795758414016e-06, "logits/chosen": 2.097734212875366, "logits/rejected": 4.242420673370361, "logps/chosen": -593.9769287109375, "logps/rejected": -998.7327880859375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -9.55612564086914, "rewards/margins": 24.9191837310791, "rewards/rejected": -34.47530746459961, "step": 1310 }, { "epoch": 0.815552099533437, "grad_norm": 10.52176284790039, "learning_rate": 4.045643153526971e-06, "logits/chosen": -0.6155394315719604, "logits/rejected": 2.2130019664764404, "logps/chosen": -541.8138427734375, "logps/rejected": -1002.4979858398438, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": -7.962281227111816, "rewards/margins": 29.961563110351562, "rewards/rejected": -37.92384338378906, "step": 1311 }, { "epoch": 0.8161741835147744, "grad_norm": 18.777523040771484, "learning_rate": 4.044490548639927e-06, "logits/chosen": -0.9691512584686279, "logits/rejected": 2.1284027099609375, "logps/chosen": -523.74365234375, "logps/rejected": -879.116943359375, "loss": 0.3622, "rewards/accuracies": 0.875, "rewards/chosen": -7.904512405395508, "rewards/margins": 19.846830368041992, "rewards/rejected": -27.751344680786133, "step": 1312 }, { "epoch": 0.816796267496112, "grad_norm": 30.227996826171875, "learning_rate": 4.043337943752882e-06, "logits/chosen": 0.47516682744026184, "logits/rejected": 2.2081222534179688, "logps/chosen": -482.805908203125, "logps/rejected": -740.0989379882812, "loss": 0.3119, "rewards/accuracies": 0.875, "rewards/chosen": -8.329744338989258, "rewards/margins": 23.253252029418945, "rewards/rejected": -31.582996368408203, "step": 1313 }, { "epoch": 0.8174183514774495, "grad_norm": 0.0792866200208664, "learning_rate": 4.042185338865837e-06, "logits/chosen": -1.4188458919525146, "logits/rejected": 2.5889883041381836, "logps/chosen": -361.36846923828125, "logps/rejected": -814.2896728515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.850963115692139, "rewards/margins": 22.003490447998047, "rewards/rejected": -27.854455947875977, "step": 1314 }, { "epoch": 0.8180404354587869, "grad_norm": 0.00015036317927297205, "learning_rate": 4.041032733978792e-06, "logits/chosen": -2.443631172180176, "logits/rejected": 3.8295273780822754, "logps/chosen": -400.744384765625, "logps/rejected": -1040.559814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.101746082305908, "rewards/margins": 30.8872127532959, "rewards/rejected": -36.98896026611328, "step": 1315 }, { "epoch": 0.8186625194401245, "grad_norm": 0.001937979948706925, "learning_rate": 4.0398801290917475e-06, "logits/chosen": -2.559013843536377, "logits/rejected": 1.6643040180206299, "logps/chosen": -383.789306640625, "logps/rejected": -887.9030151367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.655877113342285, "rewards/margins": 26.765106201171875, "rewards/rejected": -32.420982360839844, "step": 1316 }, { "epoch": 0.8192846034214619, "grad_norm": 0.5322608351707458, "learning_rate": 4.038727524204703e-06, "logits/chosen": 3.073065757751465, "logits/rejected": 3.1308679580688477, "logps/chosen": -611.296142578125, "logps/rejected": -846.6818237304688, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -9.426542282104492, "rewards/margins": 20.219778060913086, "rewards/rejected": -29.646320343017578, "step": 1317 }, { "epoch": 0.8199066874027994, "grad_norm": 0.12200061976909637, "learning_rate": 4.037574919317658e-06, "logits/chosen": 0.1979970932006836, "logits/rejected": 3.523850440979004, "logps/chosen": -410.6728210449219, "logps/rejected": -758.5045166015625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -6.334256172180176, "rewards/margins": 18.89111328125, "rewards/rejected": -25.225370407104492, "step": 1318 }, { "epoch": 0.8205287713841368, "grad_norm": 41.30738067626953, "learning_rate": 4.036422314430613e-06, "logits/chosen": 1.2874574661254883, "logits/rejected": 2.957958936691284, "logps/chosen": -643.2376098632812, "logps/rejected": -870.0418090820312, "loss": 0.5965, "rewards/accuracies": 0.625, "rewards/chosen": -10.196017265319824, "rewards/margins": 16.791812896728516, "rewards/rejected": -26.98782730102539, "step": 1319 }, { "epoch": 0.8211508553654744, "grad_norm": 13.898651123046875, "learning_rate": 4.035269709543569e-06, "logits/chosen": -0.14826762676239014, "logits/rejected": 2.940274238586426, "logps/chosen": -498.494384765625, "logps/rejected": -841.1326904296875, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": -7.115964412689209, "rewards/margins": 20.87997055053711, "rewards/rejected": -27.995935440063477, "step": 1320 }, { "epoch": 0.8217729393468118, "grad_norm": 0.00021476426627486944, "learning_rate": 4.0341171046565245e-06, "logits/chosen": 1.9943870306015015, "logits/rejected": 4.891172885894775, "logps/chosen": -547.0040283203125, "logps/rejected": -987.03564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.015552997589111, "rewards/margins": 29.849987030029297, "rewards/rejected": -33.86553955078125, "step": 1321 }, { "epoch": 0.8223950233281493, "grad_norm": 0.2498874068260193, "learning_rate": 4.03296449976948e-06, "logits/chosen": 0.19681772589683533, "logits/rejected": 2.519896984100342, "logps/chosen": -482.88604736328125, "logps/rejected": -794.16650390625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -5.21995735168457, "rewards/margins": 18.27970314025879, "rewards/rejected": -23.49966049194336, "step": 1322 }, { "epoch": 0.8230171073094867, "grad_norm": 0.48097652196884155, "learning_rate": 4.031811894882435e-06, "logits/chosen": 0.03272548317909241, "logits/rejected": 2.3512604236602783, "logps/chosen": -544.5303955078125, "logps/rejected": -919.6380615234375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -10.7990083694458, "rewards/margins": 20.45400047302246, "rewards/rejected": -31.253009796142578, "step": 1323 }, { "epoch": 0.8236391912908243, "grad_norm": 18.469717025756836, "learning_rate": 4.03065928999539e-06, "logits/chosen": 1.2011947631835938, "logits/rejected": 2.6470980644226074, "logps/chosen": -474.2806091308594, "logps/rejected": -750.458984375, "loss": 0.3039, "rewards/accuracies": 0.875, "rewards/chosen": -5.48334264755249, "rewards/margins": 17.305072784423828, "rewards/rejected": -22.788414001464844, "step": 1324 }, { "epoch": 0.8242612752721618, "grad_norm": 0.008889297023415565, "learning_rate": 4.029506685108345e-06, "logits/chosen": -0.05120176076889038, "logits/rejected": 3.0984973907470703, "logps/chosen": -528.498046875, "logps/rejected": -984.384765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.832124710083008, "rewards/margins": 21.603178024291992, "rewards/rejected": -29.435304641723633, "step": 1325 }, { "epoch": 0.8248833592534992, "grad_norm": 32.134124755859375, "learning_rate": 4.028354080221301e-06, "logits/chosen": 1.6112223863601685, "logits/rejected": 3.236201763153076, "logps/chosen": -515.0714721679688, "logps/rejected": -771.778564453125, "loss": 0.3432, "rewards/accuracies": 0.875, "rewards/chosen": -5.943190574645996, "rewards/margins": 17.906421661376953, "rewards/rejected": -23.849613189697266, "step": 1326 }, { "epoch": 0.8255054432348367, "grad_norm": 0.06956858187913895, "learning_rate": 4.027201475334256e-06, "logits/chosen": 2.438955068588257, "logits/rejected": 3.4251158237457275, "logps/chosen": -608.2529296875, "logps/rejected": -849.7999267578125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.219766616821289, "rewards/margins": 18.12618637084961, "rewards/rejected": -27.34595489501953, "step": 1327 }, { "epoch": 0.8261275272161742, "grad_norm": 2.3918983060866594e-07, "learning_rate": 4.026048870447211e-06, "logits/chosen": 1.0514161586761475, "logits/rejected": 2.963059425354004, "logps/chosen": -494.9385070800781, "logps/rejected": -828.365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.151393890380859, "rewards/margins": 27.91655158996582, "rewards/rejected": -33.06794357299805, "step": 1328 }, { "epoch": 0.8267496111975117, "grad_norm": 7.566370010375977, "learning_rate": 4.024896265560166e-06, "logits/chosen": 0.3350151777267456, "logits/rejected": 3.3889219760894775, "logps/chosen": -504.8259582519531, "logps/rejected": -994.1041259765625, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": -6.070315361022949, "rewards/margins": 25.839262008666992, "rewards/rejected": -31.909576416015625, "step": 1329 }, { "epoch": 0.8273716951788491, "grad_norm": 0.982587993144989, "learning_rate": 4.0237436606731215e-06, "logits/chosen": 0.6158033013343811, "logits/rejected": 4.1185383796691895, "logps/chosen": -592.6993408203125, "logps/rejected": -937.3599853515625, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -9.00186538696289, "rewards/margins": 17.788156509399414, "rewards/rejected": -26.790019989013672, "step": 1330 }, { "epoch": 0.8279937791601866, "grad_norm": 0.037367358803749084, "learning_rate": 4.022591055786077e-06, "logits/chosen": 0.18107163906097412, "logits/rejected": 4.079789638519287, "logps/chosen": -401.80194091796875, "logps/rejected": -841.3532104492188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.279299736022949, "rewards/margins": 18.987918853759766, "rewards/rejected": -24.26721954345703, "step": 1331 }, { "epoch": 0.8286158631415241, "grad_norm": 0.060256477445364, "learning_rate": 4.021438450899032e-06, "logits/chosen": 1.9197179079055786, "logits/rejected": 4.6027984619140625, "logps/chosen": -545.1920166015625, "logps/rejected": -982.877685546875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.839240074157715, "rewards/margins": 21.339420318603516, "rewards/rejected": -27.17866325378418, "step": 1332 }, { "epoch": 0.8292379471228616, "grad_norm": 0.0030993474647402763, "learning_rate": 4.020285846011987e-06, "logits/chosen": 2.615834951400757, "logits/rejected": 4.404464244842529, "logps/chosen": -659.1356811523438, "logps/rejected": -1015.8995971679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.51331901550293, "rewards/margins": 24.97059440612793, "rewards/rejected": -32.483917236328125, "step": 1333 }, { "epoch": 0.829860031104199, "grad_norm": 31.577295303344727, "learning_rate": 4.019133241124943e-06, "logits/chosen": 0.41338852047920227, "logits/rejected": 2.415388584136963, "logps/chosen": -541.7307739257812, "logps/rejected": -757.7950439453125, "loss": 0.4238, "rewards/accuracies": 0.75, "rewards/chosen": -5.685833930969238, "rewards/margins": 14.005095481872559, "rewards/rejected": -19.690927505493164, "step": 1334 }, { "epoch": 0.8304821150855366, "grad_norm": 31.769001007080078, "learning_rate": 4.0179806362378985e-06, "logits/chosen": 0.11144089698791504, "logits/rejected": 4.3588714599609375, "logps/chosen": -499.26263427734375, "logps/rejected": -887.7661743164062, "loss": 0.3706, "rewards/accuracies": 0.875, "rewards/chosen": -5.3436174392700195, "rewards/margins": 19.843544006347656, "rewards/rejected": -25.187162399291992, "step": 1335 }, { "epoch": 0.831104199066874, "grad_norm": 11.622576713562012, "learning_rate": 4.016828031350854e-06, "logits/chosen": 1.747812032699585, "logits/rejected": 3.772061824798584, "logps/chosen": -461.38385009765625, "logps/rejected": -747.3123168945312, "loss": 0.1702, "rewards/accuracies": 0.875, "rewards/chosen": -4.077765941619873, "rewards/margins": 15.385506629943848, "rewards/rejected": -19.463272094726562, "step": 1336 }, { "epoch": 0.8317262830482115, "grad_norm": 1.9541438817977905, "learning_rate": 4.015675426463809e-06, "logits/chosen": -2.268688678741455, "logits/rejected": 1.4469751119613647, "logps/chosen": -414.74066162109375, "logps/rejected": -820.842529296875, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -5.688546657562256, "rewards/margins": 19.483699798583984, "rewards/rejected": -25.1722469329834, "step": 1337 }, { "epoch": 0.832348367029549, "grad_norm": 4.007698589703068e-05, "learning_rate": 4.014522821576764e-06, "logits/chosen": 2.887434482574463, "logits/rejected": 4.472775936126709, "logps/chosen": -543.4795532226562, "logps/rejected": -864.4800415039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.919847011566162, "rewards/margins": 24.65685272216797, "rewards/rejected": -29.576698303222656, "step": 1338 }, { "epoch": 0.8329704510108865, "grad_norm": 0.002910461975261569, "learning_rate": 4.013370216689719e-06, "logits/chosen": 1.7962815761566162, "logits/rejected": 0.9302045106887817, "logps/chosen": -607.7379150390625, "logps/rejected": -662.3109130859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.729726314544678, "rewards/margins": 16.888690948486328, "rewards/rejected": -21.61841583251953, "step": 1339 }, { "epoch": 0.833592534992224, "grad_norm": 2.4401133487117477e-05, "learning_rate": 4.012217611802675e-06, "logits/chosen": 0.7974531650543213, "logits/rejected": 3.5518081188201904, "logps/chosen": -415.49932861328125, "logps/rejected": -802.1265869140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4340407848358154, "rewards/margins": 24.112585067749023, "rewards/rejected": -26.546627044677734, "step": 1340 }, { "epoch": 0.8342146189735614, "grad_norm": 0.8045310974121094, "learning_rate": 4.01106500691563e-06, "logits/chosen": 2.564563274383545, "logits/rejected": 3.640261173248291, "logps/chosen": -601.762451171875, "logps/rejected": -823.6668701171875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -5.859514236450195, "rewards/margins": 20.506380081176758, "rewards/rejected": -26.365894317626953, "step": 1341 }, { "epoch": 0.8348367029548989, "grad_norm": 1.1494208574295044, "learning_rate": 4.009912402028585e-06, "logits/chosen": -0.538671612739563, "logits/rejected": 2.3395137786865234, "logps/chosen": -535.9871826171875, "logps/rejected": -866.9425048828125, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -4.740545749664307, "rewards/margins": 15.756071090698242, "rewards/rejected": -20.49661636352539, "step": 1342 }, { "epoch": 0.8354587869362364, "grad_norm": 8.666638677823357e-06, "learning_rate": 4.00875979714154e-06, "logits/chosen": -1.9475653171539307, "logits/rejected": 2.8373095989227295, "logps/chosen": -410.46331787109375, "logps/rejected": -874.9161987304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.119300365447998, "rewards/margins": 24.03900909423828, "rewards/rejected": -30.158308029174805, "step": 1343 }, { "epoch": 0.8360808709175739, "grad_norm": 35.65961456298828, "learning_rate": 4.0076071922544955e-06, "logits/chosen": 1.6600673198699951, "logits/rejected": 3.5128393173217773, "logps/chosen": -575.7931518554688, "logps/rejected": -894.3558349609375, "loss": 0.4741, "rewards/accuracies": 0.875, "rewards/chosen": -7.155614852905273, "rewards/margins": 18.48762321472168, "rewards/rejected": -25.643238067626953, "step": 1344 }, { "epoch": 0.8367029548989113, "grad_norm": 0.0019475392764434218, "learning_rate": 4.006454587367451e-06, "logits/chosen": -2.3781747817993164, "logits/rejected": 3.6053688526153564, "logps/chosen": -367.33099365234375, "logps/rejected": -857.9221801757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.666664123535156, "rewards/margins": 22.033830642700195, "rewards/rejected": -30.70049476623535, "step": 1345 }, { "epoch": 0.8373250388802488, "grad_norm": 0.07737136632204056, "learning_rate": 4.005301982480406e-06, "logits/chosen": 2.302640676498413, "logits/rejected": 2.480597972869873, "logps/chosen": -547.6039428710938, "logps/rejected": -700.687255859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.121016263961792, "rewards/margins": 14.329965591430664, "rewards/rejected": -17.45098304748535, "step": 1346 }, { "epoch": 0.8379471228615863, "grad_norm": 8.023022651672363, "learning_rate": 4.004149377593361e-06, "logits/chosen": 1.4918807744979858, "logits/rejected": 5.198164939880371, "logps/chosen": -551.644287109375, "logps/rejected": -944.3262939453125, "loss": 0.0563, "rewards/accuracies": 1.0, "rewards/chosen": -5.576295375823975, "rewards/margins": 17.830055236816406, "rewards/rejected": -23.406349182128906, "step": 1347 }, { "epoch": 0.8385692068429238, "grad_norm": 9.650162610341795e-06, "learning_rate": 4.002996772706316e-06, "logits/chosen": -1.5291988849639893, "logits/rejected": 3.2551000118255615, "logps/chosen": -291.2911376953125, "logps/rejected": -817.490966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.394601821899414, "rewards/margins": 24.574989318847656, "rewards/rejected": -30.969589233398438, "step": 1348 }, { "epoch": 0.8391912908242612, "grad_norm": 2.9257236747071147e-05, "learning_rate": 4.0018441678192725e-06, "logits/chosen": -2.260589122772217, "logits/rejected": 2.732025146484375, "logps/chosen": -261.58526611328125, "logps/rejected": -809.49951171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.9103120565414429, "rewards/margins": 25.605756759643555, "rewards/rejected": -27.516071319580078, "step": 1349 }, { "epoch": 0.8398133748055988, "grad_norm": 4.390152753330767e-05, "learning_rate": 4.000691562932228e-06, "logits/chosen": 2.0370168685913086, "logits/rejected": 2.6412336826324463, "logps/chosen": -591.8038330078125, "logps/rejected": -994.63232421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.617895126342773, "rewards/margins": 27.143917083740234, "rewards/rejected": -35.761810302734375, "step": 1350 }, { "epoch": 0.8404354587869363, "grad_norm": 0.0004083360836375505, "learning_rate": 3.999538958045183e-06, "logits/chosen": -2.8748371601104736, "logits/rejected": 3.718611717224121, "logps/chosen": -303.1407470703125, "logps/rejected": -864.2947998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.6751580238342285, "rewards/margins": 20.303796768188477, "rewards/rejected": -25.978954315185547, "step": 1351 }, { "epoch": 0.8410575427682737, "grad_norm": 0.0017260868335142732, "learning_rate": 3.998386353158138e-06, "logits/chosen": 0.10009878873825073, "logits/rejected": 3.7453651428222656, "logps/chosen": -399.87725830078125, "logps/rejected": -858.879638671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.825431823730469, "rewards/margins": 23.454038619995117, "rewards/rejected": -29.279468536376953, "step": 1352 }, { "epoch": 0.8416796267496112, "grad_norm": 18.463964462280273, "learning_rate": 3.997233748271093e-06, "logits/chosen": -0.7138807773590088, "logits/rejected": 3.2711589336395264, "logps/chosen": -509.6426086425781, "logps/rejected": -1053.80029296875, "loss": 0.1009, "rewards/accuracies": 0.875, "rewards/chosen": -4.224634170532227, "rewards/margins": 35.433074951171875, "rewards/rejected": -39.657711029052734, "step": 1353 }, { "epoch": 0.8423017107309487, "grad_norm": 0.012860018759965897, "learning_rate": 3.996081143384049e-06, "logits/chosen": 3.318115711212158, "logits/rejected": 4.649235725402832, "logps/chosen": -688.559326171875, "logps/rejected": -973.8826293945312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.674195766448975, "rewards/margins": 22.215259552001953, "rewards/rejected": -29.889455795288086, "step": 1354 }, { "epoch": 0.8429237947122862, "grad_norm": 2.4849356350387097e-07, "learning_rate": 3.994928538497004e-06, "logits/chosen": -3.6064767837524414, "logits/rejected": 3.288621425628662, "logps/chosen": -288.0089111328125, "logps/rejected": -1011.349609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1988747119903564, "rewards/margins": 35.37522888183594, "rewards/rejected": -36.574100494384766, "step": 1355 }, { "epoch": 0.8435458786936236, "grad_norm": 6.193071365356445, "learning_rate": 3.993775933609959e-06, "logits/chosen": -3.4069361686706543, "logits/rejected": 2.3285210132598877, "logps/chosen": -267.35748291015625, "logps/rejected": -765.3587646484375, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -5.777052402496338, "rewards/margins": 20.48461151123047, "rewards/rejected": -26.26166534423828, "step": 1356 }, { "epoch": 0.8441679626749611, "grad_norm": 2.2144156446302077e-06, "learning_rate": 3.992623328722914e-06, "logits/chosen": -1.2399766445159912, "logits/rejected": 4.118406772613525, "logps/chosen": -283.79296875, "logps/rejected": -785.6634521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.6745691299438477, "rewards/margins": 23.444019317626953, "rewards/rejected": -26.118587493896484, "step": 1357 }, { "epoch": 0.8447900466562986, "grad_norm": 0.5156397819519043, "learning_rate": 3.9914707238358695e-06, "logits/chosen": 0.3204643726348877, "logits/rejected": 2.7228527069091797, "logps/chosen": -560.1771850585938, "logps/rejected": -1028.784912109375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -9.866544723510742, "rewards/margins": 23.669208526611328, "rewards/rejected": -33.53575134277344, "step": 1358 }, { "epoch": 0.8454121306376361, "grad_norm": 0.00017892532923724502, "learning_rate": 3.990318118948825e-06, "logits/chosen": -2.0653839111328125, "logits/rejected": 3.424193859100342, "logps/chosen": -285.17431640625, "logps/rejected": -915.456787109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.734927177429199, "rewards/margins": 29.654401779174805, "rewards/rejected": -34.38932800292969, "step": 1359 }, { "epoch": 0.8460342146189735, "grad_norm": 0.012640786357223988, "learning_rate": 3.98916551406178e-06, "logits/chosen": 2.023895025253296, "logits/rejected": 4.745113849639893, "logps/chosen": -416.2607421875, "logps/rejected": -926.1375732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.5701878070831299, "rewards/margins": 21.946216583251953, "rewards/rejected": -23.516407012939453, "step": 1360 }, { "epoch": 0.846656298600311, "grad_norm": 35.73863983154297, "learning_rate": 3.988012909174735e-06, "logits/chosen": -0.7985638380050659, "logits/rejected": 1.3513717651367188, "logps/chosen": -588.6486206054688, "logps/rejected": -883.7210693359375, "loss": 0.4721, "rewards/accuracies": 0.875, "rewards/chosen": -8.495615005493164, "rewards/margins": 19.96755599975586, "rewards/rejected": -28.463171005249023, "step": 1361 }, { "epoch": 0.8472783825816486, "grad_norm": 3.837876558303833, "learning_rate": 3.98686030428769e-06, "logits/chosen": -4.322826385498047, "logits/rejected": 1.2892265319824219, "logps/chosen": -353.0385437011719, "logps/rejected": -832.596923828125, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -5.767023086547852, "rewards/margins": 19.498014450073242, "rewards/rejected": -25.265037536621094, "step": 1362 }, { "epoch": 0.847900466562986, "grad_norm": 4.668637120630592e-05, "learning_rate": 3.9857076994006465e-06, "logits/chosen": 2.5119447708129883, "logits/rejected": 4.515449047088623, "logps/chosen": -661.9925537109375, "logps/rejected": -1010.0653686523438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.104189872741699, "rewards/margins": 25.521949768066406, "rewards/rejected": -31.626140594482422, "step": 1363 }, { "epoch": 0.8485225505443235, "grad_norm": 26.245492935180664, "learning_rate": 3.984555094513602e-06, "logits/chosen": 0.29489636421203613, "logits/rejected": 3.7235867977142334, "logps/chosen": -556.9881591796875, "logps/rejected": -929.4679565429688, "loss": 0.2556, "rewards/accuracies": 0.875, "rewards/chosen": -8.547537803649902, "rewards/margins": 22.821739196777344, "rewards/rejected": -31.369277954101562, "step": 1364 }, { "epoch": 0.8491446345256609, "grad_norm": 3.216511686332524e-05, "learning_rate": 3.983402489626556e-06, "logits/chosen": 0.3073354959487915, "logits/rejected": 3.334766149520874, "logps/chosen": -539.6654052734375, "logps/rejected": -937.9322509765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.841401100158691, "rewards/margins": 27.675031661987305, "rewards/rejected": -32.51643371582031, "step": 1365 }, { "epoch": 0.8497667185069985, "grad_norm": 19.330078125, "learning_rate": 3.982249884739511e-06, "logits/chosen": 0.2854769229888916, "logits/rejected": 2.968214988708496, "logps/chosen": -527.625244140625, "logps/rejected": -923.3499145507812, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": -11.42944622039795, "rewards/margins": 21.908905029296875, "rewards/rejected": -33.33835220336914, "step": 1366 }, { "epoch": 0.8503888024883359, "grad_norm": 0.020990528166294098, "learning_rate": 3.9810972798524665e-06, "logits/chosen": 1.3086750507354736, "logits/rejected": 3.233398914337158, "logps/chosen": -608.595947265625, "logps/rejected": -1012.1083374023438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.629014015197754, "rewards/margins": 26.847347259521484, "rewards/rejected": -33.47636032104492, "step": 1367 }, { "epoch": 0.8510108864696734, "grad_norm": 0.0055344197899103165, "learning_rate": 3.979944674965422e-06, "logits/chosen": 1.164971947669983, "logits/rejected": 4.53289270401001, "logps/chosen": -633.4853515625, "logps/rejected": -986.58251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.878882884979248, "rewards/margins": 25.460460662841797, "rewards/rejected": -31.339344024658203, "step": 1368 }, { "epoch": 0.8516329704510109, "grad_norm": 0.05928613618016243, "learning_rate": 3.978792070078377e-06, "logits/chosen": 2.948124647140503, "logits/rejected": 3.4460105895996094, "logps/chosen": -740.6045532226562, "logps/rejected": -955.6398315429688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.457581520080566, "rewards/margins": 21.442768096923828, "rewards/rejected": -28.90035057067871, "step": 1369 }, { "epoch": 0.8522550544323484, "grad_norm": 10.205842018127441, "learning_rate": 3.977639465191332e-06, "logits/chosen": 2.0318384170532227, "logits/rejected": 3.432497024536133, "logps/chosen": -605.9002685546875, "logps/rejected": -772.0656127929688, "loss": 0.0569, "rewards/accuracies": 1.0, "rewards/chosen": -3.010197877883911, "rewards/margins": 10.323250770568848, "rewards/rejected": -13.333450317382812, "step": 1370 }, { "epoch": 0.8528771384136858, "grad_norm": 1.9100292921066284, "learning_rate": 3.976486860304287e-06, "logits/chosen": 1.0926796197891235, "logits/rejected": 4.672023296356201, "logps/chosen": -490.7054443359375, "logps/rejected": -892.81591796875, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -8.199094772338867, "rewards/margins": 17.715091705322266, "rewards/rejected": -25.914186477661133, "step": 1371 }, { "epoch": 0.8534992223950233, "grad_norm": 0.035171184688806534, "learning_rate": 3.975334255417243e-06, "logits/chosen": -2.5091309547424316, "logits/rejected": 3.1034936904907227, "logps/chosen": -389.6402893066406, "logps/rejected": -885.12939453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.678465843200684, "rewards/margins": 21.970054626464844, "rewards/rejected": -26.648521423339844, "step": 1372 }, { "epoch": 0.8541213063763609, "grad_norm": 2.5301403999328613, "learning_rate": 3.974181650530199e-06, "logits/chosen": -0.04623675346374512, "logits/rejected": 2.8056583404541016, "logps/chosen": -357.27496337890625, "logps/rejected": -749.208984375, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -4.000709533691406, "rewards/margins": 20.618499755859375, "rewards/rejected": -24.61920928955078, "step": 1373 }, { "epoch": 0.8547433903576983, "grad_norm": 23.893159866333008, "learning_rate": 3.973029045643154e-06, "logits/chosen": 2.059338092803955, "logits/rejected": 3.9922189712524414, "logps/chosen": -662.5907592773438, "logps/rejected": -978.7205810546875, "loss": 0.1882, "rewards/accuracies": 0.875, "rewards/chosen": -6.927433967590332, "rewards/margins": 19.52408218383789, "rewards/rejected": -26.451515197753906, "step": 1374 }, { "epoch": 0.8553654743390358, "grad_norm": 0.0024465518072247505, "learning_rate": 3.971876440756109e-06, "logits/chosen": 0.2747696042060852, "logits/rejected": 4.139282703399658, "logps/chosen": -435.94207763671875, "logps/rejected": -831.924560546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.307514190673828, "rewards/margins": 16.516843795776367, "rewards/rejected": -23.824356079101562, "step": 1375 }, { "epoch": 0.8559875583203732, "grad_norm": 1.0032643871227265e-07, "learning_rate": 3.970723835869064e-06, "logits/chosen": -0.6229652166366577, "logits/rejected": 5.645219326019287, "logps/chosen": -434.7634582519531, "logps/rejected": -1070.2586669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.9480323791503906, "rewards/margins": 32.68678283691406, "rewards/rejected": -35.63481903076172, "step": 1376 }, { "epoch": 0.8566096423017108, "grad_norm": 0.0009331091423518956, "learning_rate": 3.96957123098202e-06, "logits/chosen": 1.4642635583877563, "logits/rejected": 3.1478796005249023, "logps/chosen": -506.8468017578125, "logps/rejected": -809.0045166015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.069552898406982, "rewards/margins": 19.62548828125, "rewards/rejected": -26.69504165649414, "step": 1377 }, { "epoch": 0.8572317262830482, "grad_norm": 0.1357276737689972, "learning_rate": 3.968418626094975e-06, "logits/chosen": 2.1965925693511963, "logits/rejected": 3.75270676612854, "logps/chosen": -529.9581298828125, "logps/rejected": -775.684814453125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -6.251548767089844, "rewards/margins": 17.33985710144043, "rewards/rejected": -23.591405868530273, "step": 1378 }, { "epoch": 0.8578538102643857, "grad_norm": 18.46625518798828, "learning_rate": 3.96726602120793e-06, "logits/chosen": -0.16634273529052734, "logits/rejected": 2.594822883605957, "logps/chosen": -565.5756225585938, "logps/rejected": -926.483642578125, "loss": 0.1594, "rewards/accuracies": 0.875, "rewards/chosen": -6.502753257751465, "rewards/margins": 20.557899475097656, "rewards/rejected": -27.060651779174805, "step": 1379 }, { "epoch": 0.8584758942457231, "grad_norm": 0.018732385709881783, "learning_rate": 3.966113416320885e-06, "logits/chosen": -0.2522784471511841, "logits/rejected": 2.6350338459014893, "logps/chosen": -499.401123046875, "logps/rejected": -913.3438720703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.384436130523682, "rewards/margins": 24.542402267456055, "rewards/rejected": -30.92683982849121, "step": 1380 }, { "epoch": 0.8590979782270607, "grad_norm": 2.273261547088623, "learning_rate": 3.9649608114338405e-06, "logits/chosen": -1.5306123495101929, "logits/rejected": 2.4383885860443115, "logps/chosen": -394.1301574707031, "logps/rejected": -719.6019287109375, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -9.67462158203125, "rewards/margins": 16.036300659179688, "rewards/rejected": -25.710922241210938, "step": 1381 }, { "epoch": 0.8597200622083981, "grad_norm": 0.14083699882030487, "learning_rate": 3.963808206546796e-06, "logits/chosen": 1.4821040630340576, "logits/rejected": 3.562310218811035, "logps/chosen": -630.3826904296875, "logps/rejected": -1096.625732421875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.841997146606445, "rewards/margins": 29.525222778320312, "rewards/rejected": -38.367218017578125, "step": 1382 }, { "epoch": 0.8603421461897356, "grad_norm": 0.49102070927619934, "learning_rate": 3.962655601659751e-06, "logits/chosen": -1.6309912204742432, "logits/rejected": 2.7926807403564453, "logps/chosen": -375.1983947753906, "logps/rejected": -995.9594116210938, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -5.588959693908691, "rewards/margins": 31.954254150390625, "rewards/rejected": -37.543212890625, "step": 1383 }, { "epoch": 0.860964230171073, "grad_norm": 1.9696420431137085, "learning_rate": 3.961502996772706e-06, "logits/chosen": 1.5886688232421875, "logits/rejected": 4.186314582824707, "logps/chosen": -483.3215637207031, "logps/rejected": -956.5115966796875, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -5.810182571411133, "rewards/margins": 23.55510711669922, "rewards/rejected": -29.36528968811035, "step": 1384 }, { "epoch": 0.8615863141524106, "grad_norm": 3.1318552494049072, "learning_rate": 3.960350391885661e-06, "logits/chosen": 1.087369680404663, "logits/rejected": 1.6794929504394531, "logps/chosen": -620.4326171875, "logps/rejected": -852.4151000976562, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -8.948686599731445, "rewards/margins": 20.768470764160156, "rewards/rejected": -29.71715545654297, "step": 1385 }, { "epoch": 0.862208398133748, "grad_norm": 7.456227467628196e-05, "learning_rate": 3.959197786998617e-06, "logits/chosen": -0.2758218050003052, "logits/rejected": 3.6374728679656982, "logps/chosen": -510.7708740234375, "logps/rejected": -986.722412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.5473737716674805, "rewards/margins": 26.101940155029297, "rewards/rejected": -32.649314880371094, "step": 1386 }, { "epoch": 0.8628304821150855, "grad_norm": 0.1047658622264862, "learning_rate": 3.958045182111573e-06, "logits/chosen": 0.1710960417985916, "logits/rejected": 2.208815336227417, "logps/chosen": -611.6002197265625, "logps/rejected": -1039.8455810546875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -9.992301940917969, "rewards/margins": 29.51941680908203, "rewards/rejected": -39.51171875, "step": 1387 }, { "epoch": 0.8634525660964231, "grad_norm": 0.0017270062817260623, "learning_rate": 3.956892577224528e-06, "logits/chosen": 0.7946970462799072, "logits/rejected": 1.492017388343811, "logps/chosen": -634.5062255859375, "logps/rejected": -867.8536376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.7527494430542, "rewards/margins": 20.600440979003906, "rewards/rejected": -31.353191375732422, "step": 1388 }, { "epoch": 0.8640746500777605, "grad_norm": 23.987987518310547, "learning_rate": 3.955739972337483e-06, "logits/chosen": -1.741621494293213, "logits/rejected": 2.635464906692505, "logps/chosen": -403.216552734375, "logps/rejected": -912.2803955078125, "loss": 0.1765, "rewards/accuracies": 0.875, "rewards/chosen": -8.126017570495605, "rewards/margins": 22.75326919555664, "rewards/rejected": -30.879287719726562, "step": 1389 }, { "epoch": 0.864696734059098, "grad_norm": 3.4604811668395996, "learning_rate": 3.954587367450438e-06, "logits/chosen": -1.273258090019226, "logits/rejected": 3.6706578731536865, "logps/chosen": -359.53375244140625, "logps/rejected": -844.9307861328125, "loss": 0.1569, "rewards/accuracies": 0.875, "rewards/chosen": -5.742606163024902, "rewards/margins": 26.624191284179688, "rewards/rejected": -32.366798400878906, "step": 1390 }, { "epoch": 0.8653188180404354, "grad_norm": 8.28245174488984e-05, "learning_rate": 3.953434762563394e-06, "logits/chosen": 0.8197730779647827, "logits/rejected": 3.010805606842041, "logps/chosen": -598.204345703125, "logps/rejected": -1058.3233642578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.552316188812256, "rewards/margins": 30.922182083129883, "rewards/rejected": -37.47449493408203, "step": 1391 }, { "epoch": 0.865940902021773, "grad_norm": 36.1810417175293, "learning_rate": 3.952282157676349e-06, "logits/chosen": 4.309150695800781, "logits/rejected": 6.092033386230469, "logps/chosen": -792.2166137695312, "logps/rejected": -1117.3037109375, "loss": 0.6571, "rewards/accuracies": 0.875, "rewards/chosen": -10.301803588867188, "rewards/margins": 18.393749237060547, "rewards/rejected": -28.695552825927734, "step": 1392 }, { "epoch": 0.8665629860031104, "grad_norm": 7.91036436567083e-05, "learning_rate": 3.951129552789304e-06, "logits/chosen": -0.3365095853805542, "logits/rejected": 3.304396629333496, "logps/chosen": -554.6954345703125, "logps/rejected": -1004.7761840820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.135865211486816, "rewards/margins": 28.570690155029297, "rewards/rejected": -37.70655822753906, "step": 1393 }, { "epoch": 0.8671850699844479, "grad_norm": 8.95252513885498, "learning_rate": 3.949976947902259e-06, "logits/chosen": -1.6988708972930908, "logits/rejected": 2.7335429191589355, "logps/chosen": -381.9712829589844, "logps/rejected": -769.756103515625, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": -6.227147579193115, "rewards/margins": 21.232027053833008, "rewards/rejected": -27.459177017211914, "step": 1394 }, { "epoch": 0.8678071539657853, "grad_norm": 0.00047960656229406595, "learning_rate": 3.9488243430152145e-06, "logits/chosen": 1.272325873374939, "logits/rejected": 4.848588466644287, "logps/chosen": -473.9206237792969, "logps/rejected": -895.236328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.872984886169434, "rewards/margins": 26.865549087524414, "rewards/rejected": -33.73853302001953, "step": 1395 }, { "epoch": 0.8684292379471229, "grad_norm": 32.17039489746094, "learning_rate": 3.94767173812817e-06, "logits/chosen": 1.9787907600402832, "logits/rejected": 5.032547473907471, "logps/chosen": -563.3594360351562, "logps/rejected": -925.163330078125, "loss": 0.8672, "rewards/accuracies": 0.875, "rewards/chosen": -7.483692169189453, "rewards/margins": 18.271421432495117, "rewards/rejected": -25.755115509033203, "step": 1396 }, { "epoch": 0.8690513219284604, "grad_norm": 34.24311447143555, "learning_rate": 3.946519133241125e-06, "logits/chosen": -1.7461861371994019, "logits/rejected": 1.7403262853622437, "logps/chosen": -553.9136352539062, "logps/rejected": -946.935302734375, "loss": 0.2911, "rewards/accuracies": 0.875, "rewards/chosen": -8.655351638793945, "rewards/margins": 22.64749526977539, "rewards/rejected": -31.302845001220703, "step": 1397 }, { "epoch": 0.8696734059097978, "grad_norm": 29.2575740814209, "learning_rate": 3.94536652835408e-06, "logits/chosen": 0.19249913096427917, "logits/rejected": 3.8216664791107178, "logps/chosen": -437.7361145019531, "logps/rejected": -935.0411376953125, "loss": 0.4214, "rewards/accuracies": 0.875, "rewards/chosen": -4.826973915100098, "rewards/margins": 25.423913955688477, "rewards/rejected": -30.250885009765625, "step": 1398 }, { "epoch": 0.8702954898911353, "grad_norm": 28.614089965820312, "learning_rate": 3.944213923467035e-06, "logits/chosen": 1.4811843633651733, "logits/rejected": 4.661951065063477, "logps/chosen": -563.6171875, "logps/rejected": -973.4630126953125, "loss": 0.9317, "rewards/accuracies": 0.875, "rewards/chosen": -4.618762969970703, "rewards/margins": 22.27410888671875, "rewards/rejected": -26.892871856689453, "step": 1399 }, { "epoch": 0.8709175738724728, "grad_norm": 0.4589652717113495, "learning_rate": 3.943061318579991e-06, "logits/chosen": 0.9266824722290039, "logits/rejected": 2.3370745182037354, "logps/chosen": -574.8289184570312, "logps/rejected": -844.5990600585938, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -9.61984920501709, "rewards/margins": 17.409948348999023, "rewards/rejected": -27.029796600341797, "step": 1400 }, { "epoch": 0.8715396578538103, "grad_norm": 27.476842880249023, "learning_rate": 3.941908713692946e-06, "logits/chosen": 1.1406919956207275, "logits/rejected": 3.6667251586914062, "logps/chosen": -604.9240112304688, "logps/rejected": -743.1671142578125, "loss": 0.4692, "rewards/accuracies": 0.875, "rewards/chosen": -8.29455852508545, "rewards/margins": 12.168575286865234, "rewards/rejected": -20.463132858276367, "step": 1401 }, { "epoch": 0.8721617418351477, "grad_norm": 17.606111526489258, "learning_rate": 3.940756108805902e-06, "logits/chosen": 1.7271714210510254, "logits/rejected": 3.771793842315674, "logps/chosen": -625.2269287109375, "logps/rejected": -941.1549072265625, "loss": 0.1015, "rewards/accuracies": 1.0, "rewards/chosen": -7.325542449951172, "rewards/margins": 25.147600173950195, "rewards/rejected": -32.473140716552734, "step": 1402 }, { "epoch": 0.8727838258164852, "grad_norm": 0.0003857784904539585, "learning_rate": 3.939603503918857e-06, "logits/chosen": -1.2490999698638916, "logits/rejected": 2.748063087463379, "logps/chosen": -391.865966796875, "logps/rejected": -883.5145263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.2403740882873535, "rewards/margins": 22.9473876953125, "rewards/rejected": -27.187763214111328, "step": 1403 }, { "epoch": 0.8734059097978227, "grad_norm": 16.675168991088867, "learning_rate": 3.938450899031812e-06, "logits/chosen": -0.7203967571258545, "logits/rejected": 2.4281888008117676, "logps/chosen": -406.6605224609375, "logps/rejected": -737.8378295898438, "loss": 0.147, "rewards/accuracies": 0.875, "rewards/chosen": -4.123994827270508, "rewards/margins": 14.674833297729492, "rewards/rejected": -18.798828125, "step": 1404 }, { "epoch": 0.8740279937791602, "grad_norm": 0.03721201419830322, "learning_rate": 3.937298294144768e-06, "logits/chosen": -1.2863411903381348, "logits/rejected": 5.00859260559082, "logps/chosen": -428.8861083984375, "logps/rejected": -1081.917724609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.048243522644043, "rewards/margins": 28.324411392211914, "rewards/rejected": -35.372657775878906, "step": 1405 }, { "epoch": 0.8746500777604976, "grad_norm": 24.92822265625, "learning_rate": 3.936145689257723e-06, "logits/chosen": 1.4132871627807617, "logits/rejected": 2.852602481842041, "logps/chosen": -564.416748046875, "logps/rejected": -827.8515014648438, "loss": 0.2008, "rewards/accuracies": 0.875, "rewards/chosen": -7.818817138671875, "rewards/margins": 17.714767456054688, "rewards/rejected": -25.533586502075195, "step": 1406 }, { "epoch": 0.8752721617418352, "grad_norm": 6.125887870788574, "learning_rate": 3.934993084370678e-06, "logits/chosen": -1.2724628448486328, "logits/rejected": 3.4561619758605957, "logps/chosen": -458.8982238769531, "logps/rejected": -891.2723388671875, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": -9.694085121154785, "rewards/margins": 23.815200805664062, "rewards/rejected": -33.50928497314453, "step": 1407 }, { "epoch": 0.8758942457231726, "grad_norm": 0.11236572265625, "learning_rate": 3.933840479483633e-06, "logits/chosen": -1.3016149997711182, "logits/rejected": 2.4662973880767822, "logps/chosen": -342.9910583496094, "logps/rejected": -818.015380859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.7304840087890625, "rewards/margins": 26.967788696289062, "rewards/rejected": -33.698272705078125, "step": 1408 }, { "epoch": 0.8765163297045101, "grad_norm": 41.05025100708008, "learning_rate": 3.9326878745965885e-06, "logits/chosen": 0.767713189125061, "logits/rejected": 4.092617511749268, "logps/chosen": -576.6070556640625, "logps/rejected": -1000.982421875, "loss": 0.9999, "rewards/accuracies": 0.875, "rewards/chosen": -9.53645133972168, "rewards/margins": 25.861608505249023, "rewards/rejected": -35.3980598449707, "step": 1409 }, { "epoch": 0.8771384136858476, "grad_norm": 0.00035387437674216926, "learning_rate": 3.931535269709544e-06, "logits/chosen": -0.6754996180534363, "logits/rejected": 4.191483497619629, "logps/chosen": -348.2604675292969, "logps/rejected": -981.4669189453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.8978424072265625, "rewards/margins": 30.909826278686523, "rewards/rejected": -36.80767059326172, "step": 1410 }, { "epoch": 0.8777604976671851, "grad_norm": 0.04639606177806854, "learning_rate": 3.930382664822499e-06, "logits/chosen": 0.5568681955337524, "logits/rejected": 3.2366063594818115, "logps/chosen": -395.332763671875, "logps/rejected": -721.87109375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.578215599060059, "rewards/margins": 20.79397201538086, "rewards/rejected": -26.372188568115234, "step": 1411 }, { "epoch": 0.8783825816485226, "grad_norm": 16.890092849731445, "learning_rate": 3.929230059935454e-06, "logits/chosen": 1.8669103384017944, "logits/rejected": 2.9881675243377686, "logps/chosen": -683.0795288085938, "logps/rejected": -946.185546875, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": -10.376381874084473, "rewards/margins": 20.531028747558594, "rewards/rejected": -30.90740966796875, "step": 1412 }, { "epoch": 0.87900466562986, "grad_norm": 18.696821212768555, "learning_rate": 3.928077455048409e-06, "logits/chosen": 1.8616418838500977, "logits/rejected": 3.8562190532684326, "logps/chosen": -602.8616943359375, "logps/rejected": -1019.068603515625, "loss": 0.2133, "rewards/accuracies": 0.875, "rewards/chosen": -6.290316581726074, "rewards/margins": 27.61571502685547, "rewards/rejected": -33.90603256225586, "step": 1413 }, { "epoch": 0.8796267496111975, "grad_norm": 0.017830608412623405, "learning_rate": 3.926924850161365e-06, "logits/chosen": -1.609785795211792, "logits/rejected": 3.152907609939575, "logps/chosen": -446.86395263671875, "logps/rejected": -1040.036865234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.331204414367676, "rewards/margins": 32.17317199707031, "rewards/rejected": -39.50437545776367, "step": 1414 }, { "epoch": 0.880248833592535, "grad_norm": 0.01888345554471016, "learning_rate": 3.92577224527432e-06, "logits/chosen": 0.08871287107467651, "logits/rejected": 3.6794910430908203, "logps/chosen": -519.1620483398438, "logps/rejected": -920.4866333007812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.557089805603027, "rewards/margins": 16.47603988647461, "rewards/rejected": -25.033130645751953, "step": 1415 }, { "epoch": 0.8808709175738725, "grad_norm": 0.00010415662109153345, "learning_rate": 3.924619640387275e-06, "logits/chosen": -0.11627277731895447, "logits/rejected": 2.4083340167999268, "logps/chosen": -451.6080322265625, "logps/rejected": -901.6516723632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.407066345214844, "rewards/margins": 26.67092514038086, "rewards/rejected": -34.07799530029297, "step": 1416 }, { "epoch": 0.8814930015552099, "grad_norm": 0.021583333611488342, "learning_rate": 3.923467035500231e-06, "logits/chosen": 1.4402005672454834, "logits/rejected": 2.0313756465911865, "logps/chosen": -656.1533813476562, "logps/rejected": -1004.5067138671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.986201286315918, "rewards/margins": 25.95221710205078, "rewards/rejected": -36.938419342041016, "step": 1417 }, { "epoch": 0.8821150855365474, "grad_norm": 0.7738665342330933, "learning_rate": 3.922314430613186e-06, "logits/chosen": 3.2806267738342285, "logits/rejected": 3.2303647994995117, "logps/chosen": -668.647216796875, "logps/rejected": -898.922119140625, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -9.184161186218262, "rewards/margins": 24.993419647216797, "rewards/rejected": -34.17757797241211, "step": 1418 }, { "epoch": 0.882737169517885, "grad_norm": 23.072006225585938, "learning_rate": 3.921161825726142e-06, "logits/chosen": 0.755908727645874, "logits/rejected": 2.168006658554077, "logps/chosen": -565.05810546875, "logps/rejected": -842.9892578125, "loss": 0.4122, "rewards/accuracies": 0.875, "rewards/chosen": -5.335588455200195, "rewards/margins": 20.874221801757812, "rewards/rejected": -26.20981216430664, "step": 1419 }, { "epoch": 0.8833592534992224, "grad_norm": 0.002978462493047118, "learning_rate": 3.920009220839097e-06, "logits/chosen": -0.32014715671539307, "logits/rejected": 1.7139149904251099, "logps/chosen": -513.91455078125, "logps/rejected": -936.3826293945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.227300643920898, "rewards/margins": 27.83060073852539, "rewards/rejected": -32.057899475097656, "step": 1420 }, { "epoch": 0.8839813374805598, "grad_norm": 30.52089500427246, "learning_rate": 3.918856615952052e-06, "logits/chosen": 2.750991106033325, "logits/rejected": 2.399867534637451, "logps/chosen": -659.213623046875, "logps/rejected": -921.962158203125, "loss": 0.2245, "rewards/accuracies": 0.875, "rewards/chosen": -8.95782470703125, "rewards/margins": 23.880416870117188, "rewards/rejected": -32.83824157714844, "step": 1421 }, { "epoch": 0.8846034214618974, "grad_norm": 0.259468138217926, "learning_rate": 3.917704011065007e-06, "logits/chosen": 1.1308553218841553, "logits/rejected": 2.8450875282287598, "logps/chosen": -667.422607421875, "logps/rejected": -841.1114501953125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -9.422941207885742, "rewards/margins": 15.405401229858398, "rewards/rejected": -24.82834243774414, "step": 1422 }, { "epoch": 0.8852255054432349, "grad_norm": 10.124845504760742, "learning_rate": 3.9165514061779625e-06, "logits/chosen": 1.7226544618606567, "logits/rejected": 4.723477840423584, "logps/chosen": -534.2507934570312, "logps/rejected": -866.4068603515625, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": -8.586525917053223, "rewards/margins": 19.207855224609375, "rewards/rejected": -27.79438018798828, "step": 1423 }, { "epoch": 0.8858475894245723, "grad_norm": 0.558952808380127, "learning_rate": 3.915398801290918e-06, "logits/chosen": -0.2260989248752594, "logits/rejected": 2.9631381034851074, "logps/chosen": -496.76678466796875, "logps/rejected": -953.5576782226562, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -8.71063232421875, "rewards/margins": 23.282506942749023, "rewards/rejected": -31.993141174316406, "step": 1424 }, { "epoch": 0.8864696734059098, "grad_norm": 16.527063369750977, "learning_rate": 3.914246196403873e-06, "logits/chosen": 3.2674198150634766, "logits/rejected": 5.55305290222168, "logps/chosen": -580.180908203125, "logps/rejected": -1039.1260986328125, "loss": 0.1108, "rewards/accuracies": 0.875, "rewards/chosen": -10.649737358093262, "rewards/margins": 31.834278106689453, "rewards/rejected": -42.48401641845703, "step": 1425 }, { "epoch": 0.8870917573872473, "grad_norm": 0.042061157524585724, "learning_rate": 3.913093591516828e-06, "logits/chosen": 0.06921637058258057, "logits/rejected": 3.0379743576049805, "logps/chosen": -557.9514770507812, "logps/rejected": -997.2664794921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.284543514251709, "rewards/margins": 23.125932693481445, "rewards/rejected": -30.41047477722168, "step": 1426 }, { "epoch": 0.8877138413685848, "grad_norm": 1.3195871114730835, "learning_rate": 3.911940986629783e-06, "logits/chosen": 1.0034469366073608, "logits/rejected": 3.320380687713623, "logps/chosen": -512.767822265625, "logps/rejected": -833.2984619140625, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -6.851565361022949, "rewards/margins": 18.35439109802246, "rewards/rejected": -25.20595932006836, "step": 1427 }, { "epoch": 0.8883359253499222, "grad_norm": 0.3945462703704834, "learning_rate": 3.910788381742739e-06, "logits/chosen": -1.1836017370224, "logits/rejected": 3.2389326095581055, "logps/chosen": -339.166748046875, "logps/rejected": -919.89599609375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -8.06601333618164, "rewards/margins": 26.293392181396484, "rewards/rejected": -34.359405517578125, "step": 1428 }, { "epoch": 0.8889580093312597, "grad_norm": 5.651898391079158e-06, "learning_rate": 3.909635776855694e-06, "logits/chosen": -2.461566686630249, "logits/rejected": 3.693530321121216, "logps/chosen": -466.4064636230469, "logps/rejected": -1155.3829345703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.28778076171875, "rewards/margins": 35.794368743896484, "rewards/rejected": -43.08214569091797, "step": 1429 }, { "epoch": 0.8895800933125972, "grad_norm": 3.520720565575175e-05, "learning_rate": 3.908483171968649e-06, "logits/chosen": 2.5713906288146973, "logits/rejected": 3.6118719577789307, "logps/chosen": -649.1976318359375, "logps/rejected": -1010.8802490234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.335611343383789, "rewards/margins": 29.236289978027344, "rewards/rejected": -38.571903228759766, "step": 1430 }, { "epoch": 0.8902021772939347, "grad_norm": 0.06428039073944092, "learning_rate": 3.907330567081605e-06, "logits/chosen": -0.25640755891799927, "logits/rejected": 2.151702404022217, "logps/chosen": -601.4017944335938, "logps/rejected": -966.105224609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.176769256591797, "rewards/margins": 27.328582763671875, "rewards/rejected": -35.505348205566406, "step": 1431 }, { "epoch": 0.8908242612752721, "grad_norm": 0.6199120879173279, "learning_rate": 3.90617796219456e-06, "logits/chosen": 2.1671009063720703, "logits/rejected": 3.3351383209228516, "logps/chosen": -604.8416748046875, "logps/rejected": -902.930419921875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -8.571571350097656, "rewards/margins": 23.950511932373047, "rewards/rejected": -32.5220832824707, "step": 1432 }, { "epoch": 0.8914463452566096, "grad_norm": 2.534109115600586, "learning_rate": 3.905025357307516e-06, "logits/chosen": -1.5430231094360352, "logits/rejected": 2.451169490814209, "logps/chosen": -502.9190979003906, "logps/rejected": -919.2320556640625, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -8.408283233642578, "rewards/margins": 18.22637176513672, "rewards/rejected": -26.634654998779297, "step": 1433 }, { "epoch": 0.8920684292379472, "grad_norm": 0.4797965884208679, "learning_rate": 3.903872752420471e-06, "logits/chosen": -0.4005916714668274, "logits/rejected": 3.6412811279296875, "logps/chosen": -618.4359130859375, "logps/rejected": -1001.5445556640625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -11.378423690795898, "rewards/margins": 21.48226547241211, "rewards/rejected": -32.86069107055664, "step": 1434 }, { "epoch": 0.8926905132192846, "grad_norm": 33.51510238647461, "learning_rate": 3.902720147533426e-06, "logits/chosen": 0.43302229046821594, "logits/rejected": 3.2057642936706543, "logps/chosen": -321.2042236328125, "logps/rejected": -509.62615966796875, "loss": 0.4201, "rewards/accuracies": 0.875, "rewards/chosen": -7.494220733642578, "rewards/margins": 10.129779815673828, "rewards/rejected": -17.624000549316406, "step": 1435 }, { "epoch": 0.8933125972006221, "grad_norm": 0.0015452936058863997, "learning_rate": 3.901567542646381e-06, "logits/chosen": 1.8499112129211426, "logits/rejected": 0.8464174866676331, "logps/chosen": -620.2147216796875, "logps/rejected": -714.9520263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.805253028869629, "rewards/margins": 18.28689193725586, "rewards/rejected": -27.092144012451172, "step": 1436 }, { "epoch": 0.8939346811819595, "grad_norm": 22.18968391418457, "learning_rate": 3.9004149377593365e-06, "logits/chosen": 1.2555292844772339, "logits/rejected": 4.441434860229492, "logps/chosen": -566.209228515625, "logps/rejected": -969.3631591796875, "loss": 0.1819, "rewards/accuracies": 0.875, "rewards/chosen": -10.221857070922852, "rewards/margins": 23.401203155517578, "rewards/rejected": -33.6230583190918, "step": 1437 }, { "epoch": 0.8945567651632971, "grad_norm": 0.0015446230536326766, "learning_rate": 3.899262332872292e-06, "logits/chosen": 0.08676552772521973, "logits/rejected": 2.6135525703430176, "logps/chosen": -444.3998107910156, "logps/rejected": -775.56591796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.776384353637695, "rewards/margins": 18.37545394897461, "rewards/rejected": -25.151840209960938, "step": 1438 }, { "epoch": 0.8951788491446345, "grad_norm": 20.202762603759766, "learning_rate": 3.898109727985247e-06, "logits/chosen": -2.568183422088623, "logits/rejected": 2.953566312789917, "logps/chosen": -487.15130615234375, "logps/rejected": -918.3087158203125, "loss": 0.1565, "rewards/accuracies": 0.875, "rewards/chosen": -9.887203216552734, "rewards/margins": 16.490564346313477, "rewards/rejected": -26.377765655517578, "step": 1439 }, { "epoch": 0.895800933125972, "grad_norm": 23.546581268310547, "learning_rate": 3.896957123098202e-06, "logits/chosen": 1.4027090072631836, "logits/rejected": 5.3587188720703125, "logps/chosen": -492.6435546875, "logps/rejected": -915.1826782226562, "loss": 0.2141, "rewards/accuracies": 0.875, "rewards/chosen": -6.819850921630859, "rewards/margins": 22.011383056640625, "rewards/rejected": -28.831233978271484, "step": 1440 }, { "epoch": 0.8964230171073095, "grad_norm": 0.7575953602790833, "learning_rate": 3.895804518211157e-06, "logits/chosen": 1.5283441543579102, "logits/rejected": 4.948976039886475, "logps/chosen": -611.407470703125, "logps/rejected": -1001.5595092773438, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -9.188275337219238, "rewards/margins": 19.500892639160156, "rewards/rejected": -28.689167022705078, "step": 1441 }, { "epoch": 0.897045101088647, "grad_norm": 0.41505351662635803, "learning_rate": 3.894651913324113e-06, "logits/chosen": 1.9158740043640137, "logits/rejected": 3.627244472503662, "logps/chosen": -689.9222412109375, "logps/rejected": -981.9581298828125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -5.818538665771484, "rewards/margins": 17.018362045288086, "rewards/rejected": -22.836902618408203, "step": 1442 }, { "epoch": 0.8976671850699844, "grad_norm": 0.14478036761283875, "learning_rate": 3.893499308437068e-06, "logits/chosen": 0.055893540382385254, "logits/rejected": 2.3074419498443604, "logps/chosen": -507.1900939941406, "logps/rejected": -802.4224853515625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -6.000877380371094, "rewards/margins": 20.082687377929688, "rewards/rejected": -26.08356475830078, "step": 1443 }, { "epoch": 0.8982892690513219, "grad_norm": 3.7201330087555107e-06, "learning_rate": 3.892346703550023e-06, "logits/chosen": -1.7229702472686768, "logits/rejected": 3.9929304122924805, "logps/chosen": -359.07342529296875, "logps/rejected": -1021.4876098632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.652368545532227, "rewards/margins": 32.61327362060547, "rewards/rejected": -40.26564025878906, "step": 1444 }, { "epoch": 0.8989113530326595, "grad_norm": 0.00015392265049740672, "learning_rate": 3.891194098662978e-06, "logits/chosen": -1.7059762477874756, "logits/rejected": 2.676309585571289, "logps/chosen": -373.9096374511719, "logps/rejected": -994.46923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.087657928466797, "rewards/margins": 29.257225036621094, "rewards/rejected": -37.34488296508789, "step": 1445 }, { "epoch": 0.8995334370139969, "grad_norm": 4.5255632400512695, "learning_rate": 3.890041493775934e-06, "logits/chosen": 2.433472156524658, "logits/rejected": 3.5981287956237793, "logps/chosen": -639.4766845703125, "logps/rejected": -950.7479858398438, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": -12.024510383605957, "rewards/margins": 22.969099044799805, "rewards/rejected": -34.99361038208008, "step": 1446 }, { "epoch": 0.9001555209953344, "grad_norm": 24.062063217163086, "learning_rate": 3.88888888888889e-06, "logits/chosen": 3.22426700592041, "logits/rejected": 5.0054030418396, "logps/chosen": -579.1534423828125, "logps/rejected": -883.1697387695312, "loss": 0.1114, "rewards/accuracies": 0.875, "rewards/chosen": -6.831135272979736, "rewards/margins": 21.391578674316406, "rewards/rejected": -28.222713470458984, "step": 1447 }, { "epoch": 0.9007776049766718, "grad_norm": 9.99648982542567e-05, "learning_rate": 3.887736284001845e-06, "logits/chosen": -1.6023995876312256, "logits/rejected": 4.028076648712158, "logps/chosen": -378.3094482421875, "logps/rejected": -946.167724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.24447774887085, "rewards/margins": 29.51643943786621, "rewards/rejected": -36.76091766357422, "step": 1448 }, { "epoch": 0.9013996889580094, "grad_norm": 6.691157341003418, "learning_rate": 3.8865836791148e-06, "logits/chosen": 1.337373971939087, "logits/rejected": 3.3517513275146484, "logps/chosen": -594.4137573242188, "logps/rejected": -1030.775634765625, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": -7.6077680587768555, "rewards/margins": 24.80924415588379, "rewards/rejected": -32.41700744628906, "step": 1449 }, { "epoch": 0.9020217729393468, "grad_norm": 0.0007598842494189739, "learning_rate": 3.885431074227755e-06, "logits/chosen": 0.9249590039253235, "logits/rejected": 2.9459688663482666, "logps/chosen": -374.2164306640625, "logps/rejected": -668.7687377929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.251091480255127, "rewards/margins": 16.075878143310547, "rewards/rejected": -20.32697105407715, "step": 1450 }, { "epoch": 0.9026438569206843, "grad_norm": 0.0011190170189365745, "learning_rate": 3.8842784693407105e-06, "logits/chosen": -0.2251252681016922, "logits/rejected": 3.9785103797912598, "logps/chosen": -447.6703186035156, "logps/rejected": -956.0332641601562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.806117296218872, "rewards/margins": 30.147293090820312, "rewards/rejected": -33.95341110229492, "step": 1451 }, { "epoch": 0.9032659409020217, "grad_norm": 14.530166625976562, "learning_rate": 3.883125864453666e-06, "logits/chosen": 1.0380009412765503, "logits/rejected": 4.930631637573242, "logps/chosen": -557.4342041015625, "logps/rejected": -968.177978515625, "loss": 0.0801, "rewards/accuracies": 1.0, "rewards/chosen": -12.003500938415527, "rewards/margins": 20.134029388427734, "rewards/rejected": -32.13752746582031, "step": 1452 }, { "epoch": 0.9038880248833593, "grad_norm": 33.54362106323242, "learning_rate": 3.881973259566621e-06, "logits/chosen": -1.212352991104126, "logits/rejected": 2.3545548915863037, "logps/chosen": -355.854736328125, "logps/rejected": -823.626220703125, "loss": 0.9964, "rewards/accuracies": 0.875, "rewards/chosen": -6.923611164093018, "rewards/margins": 25.169193267822266, "rewards/rejected": -32.092803955078125, "step": 1453 }, { "epoch": 0.9045101088646967, "grad_norm": 36.934478759765625, "learning_rate": 3.880820654679576e-06, "logits/chosen": -0.6643639802932739, "logits/rejected": 3.543142080307007, "logps/chosen": -580.7192993164062, "logps/rejected": -1017.1659545898438, "loss": 1.1885, "rewards/accuracies": 0.875, "rewards/chosen": -11.022318840026855, "rewards/margins": 23.986433029174805, "rewards/rejected": -35.008750915527344, "step": 1454 }, { "epoch": 0.9051321928460342, "grad_norm": 18.589052200317383, "learning_rate": 3.879668049792531e-06, "logits/chosen": -0.07116609811782837, "logits/rejected": 3.4631710052490234, "logps/chosen": -381.1407470703125, "logps/rejected": -778.381103515625, "loss": 0.0831, "rewards/accuracies": 1.0, "rewards/chosen": -4.6300368309021, "rewards/margins": 24.78908920288086, "rewards/rejected": -29.41912841796875, "step": 1455 }, { "epoch": 0.9057542768273716, "grad_norm": 20.014081954956055, "learning_rate": 3.878515444905487e-06, "logits/chosen": 0.9744256734848022, "logits/rejected": 3.516145944595337, "logps/chosen": -598.3582763671875, "logps/rejected": -893.91845703125, "loss": 0.2065, "rewards/accuracies": 0.875, "rewards/chosen": -10.06954574584961, "rewards/margins": 14.647180557250977, "rewards/rejected": -24.716726303100586, "step": 1456 }, { "epoch": 0.9063763608087092, "grad_norm": 0.7139878869056702, "learning_rate": 3.877362840018442e-06, "logits/chosen": -1.604265570640564, "logits/rejected": 2.56709623336792, "logps/chosen": -441.33447265625, "logps/rejected": -902.9589233398438, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -8.15466594696045, "rewards/margins": 26.56195640563965, "rewards/rejected": -34.71662139892578, "step": 1457 }, { "epoch": 0.9069984447900467, "grad_norm": 7.607209408888593e-05, "learning_rate": 3.876210235131397e-06, "logits/chosen": -0.12832467257976532, "logits/rejected": 3.708998680114746, "logps/chosen": -383.6351013183594, "logps/rejected": -780.0354614257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.427809238433838, "rewards/margins": 17.41489028930664, "rewards/rejected": -21.84269905090332, "step": 1458 }, { "epoch": 0.9076205287713841, "grad_norm": 0.0008937264792621136, "learning_rate": 3.875057630244352e-06, "logits/chosen": -1.8712693452835083, "logits/rejected": 2.955657482147217, "logps/chosen": -354.0270690917969, "logps/rejected": -975.659912109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.730475425720215, "rewards/margins": 28.753705978393555, "rewards/rejected": -35.48418045043945, "step": 1459 }, { "epoch": 0.9082426127527217, "grad_norm": 0.16675208508968353, "learning_rate": 3.873905025357308e-06, "logits/chosen": -1.975113034248352, "logits/rejected": 3.2344610691070557, "logps/chosen": -351.33624267578125, "logps/rejected": -852.4354248046875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.95164680480957, "rewards/margins": 20.913976669311523, "rewards/rejected": -25.865623474121094, "step": 1460 }, { "epoch": 0.9088646967340591, "grad_norm": 0.006307605188339949, "learning_rate": 3.872752420470264e-06, "logits/chosen": 0.9668619632720947, "logits/rejected": 4.2649102210998535, "logps/chosen": -405.6862487792969, "logps/rejected": -878.3807983398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.950449466705322, "rewards/margins": 27.552230834960938, "rewards/rejected": -34.502681732177734, "step": 1461 }, { "epoch": 0.9094867807153966, "grad_norm": 0.003334360895678401, "learning_rate": 3.871599815583219e-06, "logits/chosen": -3.593207836151123, "logits/rejected": 2.388051986694336, "logps/chosen": -346.031982421875, "logps/rejected": -930.6376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.312501907348633, "rewards/margins": 27.05126953125, "rewards/rejected": -32.363773345947266, "step": 1462 }, { "epoch": 0.910108864696734, "grad_norm": 12.493054389953613, "learning_rate": 3.870447210696174e-06, "logits/chosen": 0.8264709711074829, "logits/rejected": 3.445063591003418, "logps/chosen": -599.697998046875, "logps/rejected": -1038.114990234375, "loss": 0.0691, "rewards/accuracies": 1.0, "rewards/chosen": -8.2918062210083, "rewards/margins": 29.403207778930664, "rewards/rejected": -37.69501495361328, "step": 1463 }, { "epoch": 0.9107309486780716, "grad_norm": 5.892303943634033, "learning_rate": 3.869294605809129e-06, "logits/chosen": 2.750220537185669, "logits/rejected": 4.762117385864258, "logps/chosen": -643.5029907226562, "logps/rejected": -941.763427734375, "loss": 0.0254, "rewards/accuracies": 1.0, "rewards/chosen": -6.043200492858887, "rewards/margins": 20.676185607910156, "rewards/rejected": -26.71938705444336, "step": 1464 }, { "epoch": 0.911353032659409, "grad_norm": 0.4347749948501587, "learning_rate": 3.8681420009220845e-06, "logits/chosen": -0.7005534172058105, "logits/rejected": 3.7231321334838867, "logps/chosen": -591.067626953125, "logps/rejected": -1191.287841796875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -13.792040824890137, "rewards/margins": 29.695411682128906, "rewards/rejected": -43.48745346069336, "step": 1465 }, { "epoch": 0.9119751166407465, "grad_norm": 35.06818771362305, "learning_rate": 3.86698939603504e-06, "logits/chosen": -0.31166747212409973, "logits/rejected": 2.879892587661743, "logps/chosen": -587.843505859375, "logps/rejected": -1006.7908325195312, "loss": 0.5774, "rewards/accuracies": 0.875, "rewards/chosen": -9.827130317687988, "rewards/margins": 23.738933563232422, "rewards/rejected": -33.566062927246094, "step": 1466 }, { "epoch": 0.9125972006220839, "grad_norm": 1.881179923657328e-05, "learning_rate": 3.865836791147995e-06, "logits/chosen": 0.7096707820892334, "logits/rejected": 3.158099889755249, "logps/chosen": -459.2037353515625, "logps/rejected": -847.1054077148438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.284489631652832, "rewards/margins": 27.181751251220703, "rewards/rejected": -35.46623992919922, "step": 1467 }, { "epoch": 0.9132192846034215, "grad_norm": 0.25267404317855835, "learning_rate": 3.86468418626095e-06, "logits/chosen": 0.695156991481781, "logits/rejected": 4.628496170043945, "logps/chosen": -482.04547119140625, "logps/rejected": -1053.236083984375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.188310146331787, "rewards/margins": 29.162036895751953, "rewards/rejected": -36.350345611572266, "step": 1468 }, { "epoch": 0.913841368584759, "grad_norm": 45.62656784057617, "learning_rate": 3.863531581373905e-06, "logits/chosen": 0.6176962852478027, "logits/rejected": 4.680180549621582, "logps/chosen": -467.5579833984375, "logps/rejected": -935.610595703125, "loss": 1.2717, "rewards/accuracies": 0.875, "rewards/chosen": -9.91832447052002, "rewards/margins": 21.834449768066406, "rewards/rejected": -31.752771377563477, "step": 1469 }, { "epoch": 0.9144634525660964, "grad_norm": 2.9416592121124268, "learning_rate": 3.862378976486861e-06, "logits/chosen": 1.5381639003753662, "logits/rejected": 2.0583224296569824, "logps/chosen": -576.9505004882812, "logps/rejected": -878.991943359375, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -11.187313079833984, "rewards/margins": 21.081104278564453, "rewards/rejected": -32.26841735839844, "step": 1470 }, { "epoch": 0.9150855365474339, "grad_norm": 29.226638793945312, "learning_rate": 3.861226371599816e-06, "logits/chosen": -0.7698392868041992, "logits/rejected": 2.5437121391296387, "logps/chosen": -477.9600830078125, "logps/rejected": -946.5003051757812, "loss": 0.5726, "rewards/accuracies": 0.875, "rewards/chosen": -8.625056266784668, "rewards/margins": 24.998863220214844, "rewards/rejected": -33.62391662597656, "step": 1471 }, { "epoch": 0.9157076205287714, "grad_norm": 42.338714599609375, "learning_rate": 3.860073766712771e-06, "logits/chosen": -1.9399776458740234, "logits/rejected": 1.7948557138442993, "logps/chosen": -435.4498291015625, "logps/rejected": -864.4039916992188, "loss": 0.588, "rewards/accuracies": 0.875, "rewards/chosen": -9.043302536010742, "rewards/margins": 22.714567184448242, "rewards/rejected": -31.757871627807617, "step": 1472 }, { "epoch": 0.9163297045101089, "grad_norm": 1.9460109967894823e-08, "learning_rate": 3.858921161825726e-06, "logits/chosen": -1.515357255935669, "logits/rejected": 4.758892059326172, "logps/chosen": -382.9659118652344, "logps/rejected": -1205.88232421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.439264297485352, "rewards/margins": 39.36029052734375, "rewards/rejected": -43.799556732177734, "step": 1473 }, { "epoch": 0.9169517884914463, "grad_norm": 0.0579552948474884, "learning_rate": 3.8577685569386815e-06, "logits/chosen": 2.0449562072753906, "logits/rejected": 3.7542054653167725, "logps/chosen": -670.1748657226562, "logps/rejected": -958.8605346679688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.759854316711426, "rewards/margins": 21.060232162475586, "rewards/rejected": -29.820087432861328, "step": 1474 }, { "epoch": 0.9175738724727839, "grad_norm": 0.2062915861606598, "learning_rate": 3.8566159520516376e-06, "logits/chosen": 0.2341582179069519, "logits/rejected": 3.144930124282837, "logps/chosen": -577.846435546875, "logps/rejected": -970.8562622070312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -10.450343132019043, "rewards/margins": 20.801151275634766, "rewards/rejected": -31.251493453979492, "step": 1475 }, { "epoch": 0.9181959564541213, "grad_norm": 0.007952794432640076, "learning_rate": 3.855463347164593e-06, "logits/chosen": 0.30161577463150024, "logits/rejected": 3.102041244506836, "logps/chosen": -563.1438598632812, "logps/rejected": -955.537841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.881620407104492, "rewards/margins": 22.008378982543945, "rewards/rejected": -31.889997482299805, "step": 1476 }, { "epoch": 0.9188180404354588, "grad_norm": 7.2674581907961056e-09, "learning_rate": 3.854310742277548e-06, "logits/chosen": -3.6798012256622314, "logits/rejected": 2.85562801361084, "logps/chosen": -301.4128723144531, "logps/rejected": -986.750244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.19948673248291, "rewards/margins": 34.309539794921875, "rewards/rejected": -38.50902557373047, "step": 1477 }, { "epoch": 0.9194401244167962, "grad_norm": 28.337913513183594, "learning_rate": 3.853158137390503e-06, "logits/chosen": -0.632785439491272, "logits/rejected": 5.028564453125, "logps/chosen": -345.7908935546875, "logps/rejected": -858.96337890625, "loss": 0.5314, "rewards/accuracies": 0.875, "rewards/chosen": -5.487540245056152, "rewards/margins": 20.401901245117188, "rewards/rejected": -25.889440536499023, "step": 1478 }, { "epoch": 0.9200622083981338, "grad_norm": 20.7117919921875, "learning_rate": 3.8520055325034585e-06, "logits/chosen": -2.059865713119507, "logits/rejected": 3.133439540863037, "logps/chosen": -542.2882080078125, "logps/rejected": -954.7108154296875, "loss": 0.1002, "rewards/accuracies": 0.875, "rewards/chosen": -10.028863906860352, "rewards/margins": 21.37855339050293, "rewards/rejected": -31.40741729736328, "step": 1479 }, { "epoch": 0.9206842923794712, "grad_norm": 6.594052314758301, "learning_rate": 3.850852927616414e-06, "logits/chosen": 0.9150658845901489, "logits/rejected": 2.945340156555176, "logps/chosen": -579.0211181640625, "logps/rejected": -941.65283203125, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": -7.479080677032471, "rewards/margins": 20.222869873046875, "rewards/rejected": -27.70195198059082, "step": 1480 }, { "epoch": 0.9213063763608087, "grad_norm": 27.423505783081055, "learning_rate": 3.849700322729369e-06, "logits/chosen": 1.4268765449523926, "logits/rejected": 2.176144599914551, "logps/chosen": -381.7594909667969, "logps/rejected": -644.2747192382812, "loss": 0.8959, "rewards/accuracies": 0.875, "rewards/chosen": -5.049097537994385, "rewards/margins": 18.318859100341797, "rewards/rejected": -23.367958068847656, "step": 1481 }, { "epoch": 0.9219284603421461, "grad_norm": 0.021746966987848282, "learning_rate": 3.848547717842324e-06, "logits/chosen": -0.9644485712051392, "logits/rejected": 0.720867395401001, "logps/chosen": -620.6685791015625, "logps/rejected": -1114.089111328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.114725112915039, "rewards/margins": 27.469085693359375, "rewards/rejected": -37.58380889892578, "step": 1482 }, { "epoch": 0.9225505443234837, "grad_norm": 7.061457381496439e-06, "learning_rate": 3.847395112955279e-06, "logits/chosen": -1.9248368740081787, "logits/rejected": 3.2156076431274414, "logps/chosen": -459.8319396972656, "logps/rejected": -1093.714599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.586408615112305, "rewards/margins": 35.24261474609375, "rewards/rejected": -46.82902526855469, "step": 1483 }, { "epoch": 0.9231726283048212, "grad_norm": 1.888171027530916e-05, "learning_rate": 3.846242508068235e-06, "logits/chosen": 2.8101515769958496, "logits/rejected": 4.89240837097168, "logps/chosen": -612.6300659179688, "logps/rejected": -977.2906494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.669929504394531, "rewards/margins": 28.796072006225586, "rewards/rejected": -36.465999603271484, "step": 1484 }, { "epoch": 0.9237947122861586, "grad_norm": 1.2461988262657542e-05, "learning_rate": 3.84508990318119e-06, "logits/chosen": 1.7340483665466309, "logits/rejected": 2.461452007293701, "logps/chosen": -687.24658203125, "logps/rejected": -1035.656005859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.96574592590332, "rewards/margins": 32.33479690551758, "rewards/rejected": -39.30054473876953, "step": 1485 }, { "epoch": 0.9244167962674961, "grad_norm": 41.07564926147461, "learning_rate": 3.843937298294145e-06, "logits/chosen": 1.1576273441314697, "logits/rejected": 3.3022708892822266, "logps/chosen": -500.56304931640625, "logps/rejected": -773.8848876953125, "loss": 0.6384, "rewards/accuracies": 0.875, "rewards/chosen": -12.655431747436523, "rewards/margins": 11.647870063781738, "rewards/rejected": -24.303302764892578, "step": 1486 }, { "epoch": 0.9250388802488336, "grad_norm": 0.00020040707022417337, "learning_rate": 3.8427846934071e-06, "logits/chosen": 0.566953718662262, "logits/rejected": 5.4430155754089355, "logps/chosen": -459.21337890625, "logps/rejected": -1017.710205078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.9802422523498535, "rewards/margins": 29.258119583129883, "rewards/rejected": -34.238365173339844, "step": 1487 }, { "epoch": 0.9256609642301711, "grad_norm": 1.0271210670471191, "learning_rate": 3.8416320885200555e-06, "logits/chosen": 0.3819788098335266, "logits/rejected": 4.326751708984375, "logps/chosen": -546.4390869140625, "logps/rejected": -1124.6964111328125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -9.056602478027344, "rewards/margins": 34.257320404052734, "rewards/rejected": -43.31392288208008, "step": 1488 }, { "epoch": 0.9262830482115085, "grad_norm": 1.6125093679875135e-05, "learning_rate": 3.8404794836330116e-06, "logits/chosen": -0.38977789878845215, "logits/rejected": 3.8898396492004395, "logps/chosen": -374.86932373046875, "logps/rejected": -950.9157104492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.441466331481934, "rewards/margins": 26.366893768310547, "rewards/rejected": -32.8083610534668, "step": 1489 }, { "epoch": 0.926905132192846, "grad_norm": 0.35164663195610046, "learning_rate": 3.839326878745967e-06, "logits/chosen": 0.4527741074562073, "logits/rejected": 4.156494140625, "logps/chosen": -468.0583801269531, "logps/rejected": -877.856201171875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -7.627846717834473, "rewards/margins": 23.385486602783203, "rewards/rejected": -31.013330459594727, "step": 1490 }, { "epoch": 0.9275272161741835, "grad_norm": 0.315719336271286, "learning_rate": 3.838174273858922e-06, "logits/chosen": 0.0138932466506958, "logits/rejected": 2.9424490928649902, "logps/chosen": -483.2838134765625, "logps/rejected": -983.056396484375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -9.161341667175293, "rewards/margins": 27.726139068603516, "rewards/rejected": -36.887481689453125, "step": 1491 }, { "epoch": 0.928149300155521, "grad_norm": 4.0229817386716604e-05, "learning_rate": 3.837021668971877e-06, "logits/chosen": -1.5416885614395142, "logits/rejected": 2.365922689437866, "logps/chosen": -419.181884765625, "logps/rejected": -830.7940063476562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.081663131713867, "rewards/margins": 18.408069610595703, "rewards/rejected": -24.48973274230957, "step": 1492 }, { "epoch": 0.9287713841368584, "grad_norm": 1.0233517969027162e-05, "learning_rate": 3.8358690640848325e-06, "logits/chosen": -1.0449734926223755, "logits/rejected": 2.454464912414551, "logps/chosen": -428.5576477050781, "logps/rejected": -974.4542236328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.8700408935546875, "rewards/margins": 29.465404510498047, "rewards/rejected": -34.335445404052734, "step": 1493 }, { "epoch": 0.929393468118196, "grad_norm": 0.00012100357707822695, "learning_rate": 3.834716459197788e-06, "logits/chosen": -2.1408612728118896, "logits/rejected": 2.4791533946990967, "logps/chosen": -360.2967834472656, "logps/rejected": -843.7381591796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.2425336837768555, "rewards/margins": 23.199928283691406, "rewards/rejected": -27.442462921142578, "step": 1494 }, { "epoch": 0.9300155520995335, "grad_norm": 0.26367151737213135, "learning_rate": 3.833563854310743e-06, "logits/chosen": 1.8193261623382568, "logits/rejected": 3.4168057441711426, "logps/chosen": -586.12158203125, "logps/rejected": -1028.549072265625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -15.226459503173828, "rewards/margins": 24.45671844482422, "rewards/rejected": -39.68317794799805, "step": 1495 }, { "epoch": 0.9306376360808709, "grad_norm": 3.7611985206604004, "learning_rate": 3.832411249423698e-06, "logits/chosen": -1.5775599479675293, "logits/rejected": 1.8469675779342651, "logps/chosen": -503.709716796875, "logps/rejected": -838.383544921875, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": -7.134449481964111, "rewards/margins": 11.143117904663086, "rewards/rejected": -18.27756690979004, "step": 1496 }, { "epoch": 0.9312597200622084, "grad_norm": 0.019973743706941605, "learning_rate": 3.831258644536653e-06, "logits/chosen": 0.1289931833744049, "logits/rejected": 4.267576217651367, "logps/chosen": -532.5215454101562, "logps/rejected": -971.3292846679688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.559263229370117, "rewards/margins": 21.614898681640625, "rewards/rejected": -31.174163818359375, "step": 1497 }, { "epoch": 0.9318818040435459, "grad_norm": 28.844974517822266, "learning_rate": 3.830106039649609e-06, "logits/chosen": -1.243943691253662, "logits/rejected": 4.444502830505371, "logps/chosen": -427.8436279296875, "logps/rejected": -1023.44580078125, "loss": 0.3356, "rewards/accuracies": 0.875, "rewards/chosen": -7.57684326171875, "rewards/margins": 24.601680755615234, "rewards/rejected": -32.178524017333984, "step": 1498 }, { "epoch": 0.9325038880248834, "grad_norm": 0.011191684752702713, "learning_rate": 3.828953434762564e-06, "logits/chosen": 1.799675464630127, "logits/rejected": 2.868252754211426, "logps/chosen": -638.8121948242188, "logps/rejected": -927.4824829101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.192627906799316, "rewards/margins": 24.738967895507812, "rewards/rejected": -32.93159484863281, "step": 1499 }, { "epoch": 0.9331259720062208, "grad_norm": 0.0029204629827290773, "learning_rate": 3.827800829875519e-06, "logits/chosen": -1.8656450510025024, "logits/rejected": 4.460055351257324, "logps/chosen": -455.1031799316406, "logps/rejected": -1073.0537109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.718934535980225, "rewards/margins": 23.455421447753906, "rewards/rejected": -30.174358367919922, "step": 1500 }, { "epoch": 0.9337480559875583, "grad_norm": 25.7710018157959, "learning_rate": 3.826648224988474e-06, "logits/chosen": 1.2761621475219727, "logits/rejected": 3.3277039527893066, "logps/chosen": -503.96881103515625, "logps/rejected": -754.3949584960938, "loss": 0.2218, "rewards/accuracies": 0.875, "rewards/chosen": -10.906332015991211, "rewards/margins": 16.754884719848633, "rewards/rejected": -27.66121482849121, "step": 1501 }, { "epoch": 0.9343701399688958, "grad_norm": 0.059097982943058014, "learning_rate": 3.8254956201014295e-06, "logits/chosen": 1.6621503829956055, "logits/rejected": 3.892489433288574, "logps/chosen": -591.111328125, "logps/rejected": -929.8556518554688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.30999231338501, "rewards/margins": 23.505043029785156, "rewards/rejected": -29.815034866333008, "step": 1502 }, { "epoch": 0.9349922239502333, "grad_norm": 4.284040187485516e-06, "learning_rate": 3.824343015214385e-06, "logits/chosen": 0.3832207918167114, "logits/rejected": 2.669308662414551, "logps/chosen": -499.63031005859375, "logps/rejected": -856.37939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.689759254455566, "rewards/margins": 24.031463623046875, "rewards/rejected": -33.721221923828125, "step": 1503 }, { "epoch": 0.9356143079315707, "grad_norm": 9.027886699186638e-05, "learning_rate": 3.82319041032734e-06, "logits/chosen": -2.296435594558716, "logits/rejected": 3.5077037811279297, "logps/chosen": -212.14871215820312, "logps/rejected": -839.968505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.725435733795166, "rewards/margins": 32.51136016845703, "rewards/rejected": -34.236793518066406, "step": 1504 }, { "epoch": 0.9362363919129082, "grad_norm": 0.11467395722866058, "learning_rate": 3.822037805440295e-06, "logits/chosen": 1.1032499074935913, "logits/rejected": 3.6253840923309326, "logps/chosen": -658.2777099609375, "logps/rejected": -1048.64013671875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -10.855783462524414, "rewards/margins": 22.73939323425293, "rewards/rejected": -33.595176696777344, "step": 1505 }, { "epoch": 0.9368584758942458, "grad_norm": 17.38050651550293, "learning_rate": 3.82088520055325e-06, "logits/chosen": 0.569591224193573, "logits/rejected": 2.40620493888855, "logps/chosen": -527.115234375, "logps/rejected": -804.364013671875, "loss": 0.1106, "rewards/accuracies": 0.875, "rewards/chosen": -8.470925331115723, "rewards/margins": 20.458789825439453, "rewards/rejected": -28.929716110229492, "step": 1506 }, { "epoch": 0.9374805598755832, "grad_norm": 2.645406084411661e-06, "learning_rate": 3.819732595666206e-06, "logits/chosen": 1.03667151927948, "logits/rejected": 2.616377592086792, "logps/chosen": -580.8767700195312, "logps/rejected": -976.0105590820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.645763397216797, "rewards/margins": 31.013835906982422, "rewards/rejected": -38.65959930419922, "step": 1507 }, { "epoch": 0.9381026438569207, "grad_norm": 0.3618963658809662, "learning_rate": 3.818579990779161e-06, "logits/chosen": 1.1675472259521484, "logits/rejected": 2.6586809158325195, "logps/chosen": -697.2937622070312, "logps/rejected": -1109.1123046875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.925567626953125, "rewards/margins": 25.126672744750977, "rewards/rejected": -32.05223846435547, "step": 1508 }, { "epoch": 0.9387247278382581, "grad_norm": 0.0021237938199192286, "learning_rate": 3.817427385892116e-06, "logits/chosen": -2.1299657821655273, "logits/rejected": 1.384620189666748, "logps/chosen": -426.447509765625, "logps/rejected": -796.8826904296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.058237552642822, "rewards/margins": 20.120403289794922, "rewards/rejected": -25.17864227294922, "step": 1509 }, { "epoch": 0.9393468118195957, "grad_norm": 3.2634453773498535, "learning_rate": 3.816274781005071e-06, "logits/chosen": -0.11459943652153015, "logits/rejected": 3.2260124683380127, "logps/chosen": -421.47979736328125, "logps/rejected": -808.63720703125, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -7.018412113189697, "rewards/margins": 19.303525924682617, "rewards/rejected": -26.321937561035156, "step": 1510 }, { "epoch": 0.9399688958009331, "grad_norm": 0.0048073939979076385, "learning_rate": 3.8151221761180265e-06, "logits/chosen": 1.1135034561157227, "logits/rejected": 4.2871599197387695, "logps/chosen": -391.2454833984375, "logps/rejected": -800.1884155273438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.755030632019043, "rewards/margins": 24.084712982177734, "rewards/rejected": -29.83974266052246, "step": 1511 }, { "epoch": 0.9405909797822706, "grad_norm": 3.793792963027954, "learning_rate": 3.813969571230982e-06, "logits/chosen": 2.290614604949951, "logits/rejected": 3.953888177871704, "logps/chosen": -527.8026123046875, "logps/rejected": -860.7406005859375, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -6.779989719390869, "rewards/margins": 21.989734649658203, "rewards/rejected": -28.769725799560547, "step": 1512 }, { "epoch": 0.9412130637636081, "grad_norm": 2.0059680537087843e-05, "learning_rate": 3.8128169663439374e-06, "logits/chosen": -0.5008499622344971, "logits/rejected": 3.417922019958496, "logps/chosen": -479.7211608886719, "logps/rejected": -1060.9326171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.907632827758789, "rewards/margins": 34.93695068359375, "rewards/rejected": -43.84458541870117, "step": 1513 }, { "epoch": 0.9418351477449456, "grad_norm": 28.518451690673828, "learning_rate": 3.8116643614568926e-06, "logits/chosen": 2.51342511177063, "logits/rejected": 3.519273281097412, "logps/chosen": -710.60791015625, "logps/rejected": -995.9493408203125, "loss": 0.1775, "rewards/accuracies": 0.875, "rewards/chosen": -8.318286895751953, "rewards/margins": 25.26766586303711, "rewards/rejected": -33.58595275878906, "step": 1514 }, { "epoch": 0.942457231726283, "grad_norm": 30.555561065673828, "learning_rate": 3.810511756569848e-06, "logits/chosen": -1.0337927341461182, "logits/rejected": 4.793352127075195, "logps/chosen": -364.50323486328125, "logps/rejected": -910.0731811523438, "loss": 1.2915, "rewards/accuracies": 0.875, "rewards/chosen": -4.789246559143066, "rewards/margins": 18.14818000793457, "rewards/rejected": -22.93742561340332, "step": 1515 }, { "epoch": 0.9430793157076205, "grad_norm": 5.560061300258212e-09, "learning_rate": 3.809359151682803e-06, "logits/chosen": 1.3392835855484009, "logits/rejected": 4.3378682136535645, "logps/chosen": -580.5053100585938, "logps/rejected": -1080.9197998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.431815147399902, "rewards/margins": 34.50389099121094, "rewards/rejected": -42.935707092285156, "step": 1516 }, { "epoch": 0.943701399688958, "grad_norm": 0.00020487657457124442, "learning_rate": 3.8082065467957587e-06, "logits/chosen": -1.4015358686447144, "logits/rejected": 1.7667157649993896, "logps/chosen": -367.30902099609375, "logps/rejected": -756.1085815429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.350221633911133, "rewards/margins": 23.439151763916016, "rewards/rejected": -28.789371490478516, "step": 1517 }, { "epoch": 0.9443234836702955, "grad_norm": 29.253721237182617, "learning_rate": 3.807053941908714e-06, "logits/chosen": -1.1160175800323486, "logits/rejected": 3.391474723815918, "logps/chosen": -453.40234375, "logps/rejected": -814.8978271484375, "loss": 0.301, "rewards/accuracies": 0.875, "rewards/chosen": -4.987330913543701, "rewards/margins": 16.134384155273438, "rewards/rejected": -21.121715545654297, "step": 1518 }, { "epoch": 0.944945567651633, "grad_norm": 1.373146414756775, "learning_rate": 3.805901337021669e-06, "logits/chosen": 1.4696600437164307, "logits/rejected": 1.907806396484375, "logps/chosen": -587.392333984375, "logps/rejected": -867.681884765625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -7.671820640563965, "rewards/margins": 20.09083366394043, "rewards/rejected": -27.762653350830078, "step": 1519 }, { "epoch": 0.9455676516329704, "grad_norm": 0.1505119949579239, "learning_rate": 3.8047487321346244e-06, "logits/chosen": -0.6864757537841797, "logits/rejected": 2.311563014984131, "logps/chosen": -537.08984375, "logps/rejected": -973.3675537109375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.800401210784912, "rewards/margins": 29.207828521728516, "rewards/rejected": -37.00823211669922, "step": 1520 }, { "epoch": 0.946189735614308, "grad_norm": 7.776718848617747e-05, "learning_rate": 3.8035961272475796e-06, "logits/chosen": 1.3452345132827759, "logits/rejected": 3.4356236457824707, "logps/chosen": -501.22119140625, "logps/rejected": -830.6551513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.098999500274658, "rewards/margins": 22.240488052368164, "rewards/rejected": -28.339487075805664, "step": 1521 }, { "epoch": 0.9468118195956454, "grad_norm": 0.03515046462416649, "learning_rate": 3.802443522360535e-06, "logits/chosen": -2.4966351985931396, "logits/rejected": 4.5510101318359375, "logps/chosen": -336.7701416015625, "logps/rejected": -1093.436767578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.726394176483154, "rewards/margins": 35.122657775878906, "rewards/rejected": -41.84905242919922, "step": 1522 }, { "epoch": 0.9474339035769829, "grad_norm": 4.1857827454805374e-05, "learning_rate": 3.80129091747349e-06, "logits/chosen": 0.1419128179550171, "logits/rejected": 4.158993244171143, "logps/chosen": -575.00390625, "logps/rejected": -1083.60546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.428834915161133, "rewards/margins": 30.497568130493164, "rewards/rejected": -41.92639923095703, "step": 1523 }, { "epoch": 0.9480559875583203, "grad_norm": 0.005177459679543972, "learning_rate": 3.8001383125864457e-06, "logits/chosen": 0.1375989019870758, "logits/rejected": 2.984259605407715, "logps/chosen": -510.2491455078125, "logps/rejected": -893.9998779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.213672637939453, "rewards/margins": 26.06291961669922, "rewards/rejected": -32.27659225463867, "step": 1524 }, { "epoch": 0.9486780715396579, "grad_norm": 37.03826904296875, "learning_rate": 3.798985707699401e-06, "logits/chosen": 0.5167028307914734, "logits/rejected": 3.305917739868164, "logps/chosen": -637.629150390625, "logps/rejected": -1092.0506591796875, "loss": 1.057, "rewards/accuracies": 0.875, "rewards/chosen": -12.100252151489258, "rewards/margins": 23.94793128967285, "rewards/rejected": -36.04818344116211, "step": 1525 }, { "epoch": 0.9493001555209953, "grad_norm": 0.01407893281430006, "learning_rate": 3.797833102812356e-06, "logits/chosen": -1.060408115386963, "logits/rejected": 4.336411952972412, "logps/chosen": -326.1235046386719, "logps/rejected": -876.7889404296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.4298095703125, "rewards/margins": 23.324308395385742, "rewards/rejected": -28.754119873046875, "step": 1526 }, { "epoch": 0.9499222395023328, "grad_norm": 43.05733871459961, "learning_rate": 3.7966804979253114e-06, "logits/chosen": -0.4274927079677582, "logits/rejected": 4.001701354980469, "logps/chosen": -360.96649169921875, "logps/rejected": -897.93408203125, "loss": 1.0622, "rewards/accuracies": 0.875, "rewards/chosen": -4.60791540145874, "rewards/margins": 26.19590950012207, "rewards/rejected": -30.80382537841797, "step": 1527 }, { "epoch": 0.9505443234836704, "grad_norm": 0.0020403736270964146, "learning_rate": 3.7955278930382666e-06, "logits/chosen": 0.3815556764602661, "logits/rejected": 4.2986369132995605, "logps/chosen": -537.9657592773438, "logps/rejected": -1013.0636596679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.577686309814453, "rewards/margins": 27.301551818847656, "rewards/rejected": -35.87923812866211, "step": 1528 }, { "epoch": 0.9511664074650078, "grad_norm": 0.0007894797599874437, "learning_rate": 3.794375288151222e-06, "logits/chosen": 0.7989105582237244, "logits/rejected": 4.2252912521362305, "logps/chosen": -527.30322265625, "logps/rejected": -973.3763427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.929349899291992, "rewards/margins": 28.399248123168945, "rewards/rejected": -36.32859802246094, "step": 1529 }, { "epoch": 0.9517884914463453, "grad_norm": 0.00019009016978088766, "learning_rate": 3.793222683264177e-06, "logits/chosen": 1.2304333448410034, "logits/rejected": 4.029991626739502, "logps/chosen": -571.3868408203125, "logps/rejected": -926.7520141601562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.0893449783325195, "rewards/margins": 19.844104766845703, "rewards/rejected": -26.933452606201172, "step": 1530 }, { "epoch": 0.9524105754276827, "grad_norm": 0.0637696161866188, "learning_rate": 3.7920700783771323e-06, "logits/chosen": 2.5368003845214844, "logits/rejected": 4.378127574920654, "logps/chosen": -632.891845703125, "logps/rejected": -1011.991455078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.220358848571777, "rewards/margins": 26.87921714782715, "rewards/rejected": -35.09957504272461, "step": 1531 }, { "epoch": 0.9530326594090203, "grad_norm": 0.38544222712516785, "learning_rate": 3.790917473490088e-06, "logits/chosen": -0.9029761552810669, "logits/rejected": 3.0724494457244873, "logps/chosen": -549.3706665039062, "logps/rejected": -1025.44677734375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -8.191251754760742, "rewards/margins": 22.814775466918945, "rewards/rejected": -31.006027221679688, "step": 1532 }, { "epoch": 0.9536547433903577, "grad_norm": 34.8115234375, "learning_rate": 3.789764868603043e-06, "logits/chosen": 1.5727530717849731, "logits/rejected": 5.6106414794921875, "logps/chosen": -443.1806640625, "logps/rejected": -793.2718505859375, "loss": 0.4784, "rewards/accuracies": 0.875, "rewards/chosen": -6.441493511199951, "rewards/margins": 17.451435089111328, "rewards/rejected": -23.892925262451172, "step": 1533 }, { "epoch": 0.9542768273716952, "grad_norm": 0.00016758311539888382, "learning_rate": 3.7886122637159984e-06, "logits/chosen": 1.5388946533203125, "logits/rejected": 4.52772331237793, "logps/chosen": -596.45068359375, "logps/rejected": -1042.272216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.116270065307617, "rewards/margins": 28.782930374145508, "rewards/rejected": -36.899200439453125, "step": 1534 }, { "epoch": 0.9548989113530326, "grad_norm": 0.010228978469967842, "learning_rate": 3.7874596588289536e-06, "logits/chosen": 2.3814992904663086, "logits/rejected": 3.1295394897460938, "logps/chosen": -621.3070068359375, "logps/rejected": -937.0552368164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.042771816253662, "rewards/margins": 26.261520385742188, "rewards/rejected": -32.304290771484375, "step": 1535 }, { "epoch": 0.9555209953343702, "grad_norm": 0.0004916785983368754, "learning_rate": 3.786307053941909e-06, "logits/chosen": -0.32576048374176025, "logits/rejected": 2.7030093669891357, "logps/chosen": -467.4586181640625, "logps/rejected": -823.8536376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.294271945953369, "rewards/margins": 24.553098678588867, "rewards/rejected": -31.847370147705078, "step": 1536 }, { "epoch": 0.9561430793157076, "grad_norm": 2.831015110015869, "learning_rate": 3.785154449054864e-06, "logits/chosen": 0.12055912613868713, "logits/rejected": 2.6946396827697754, "logps/chosen": -471.73199462890625, "logps/rejected": -810.045166015625, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -7.989734649658203, "rewards/margins": 22.072750091552734, "rewards/rejected": -30.062484741210938, "step": 1537 }, { "epoch": 0.9567651632970451, "grad_norm": 0.1458519995212555, "learning_rate": 3.7840018441678193e-06, "logits/chosen": -1.1733816862106323, "logits/rejected": 3.1834568977355957, "logps/chosen": -345.3623352050781, "logps/rejected": -824.790771484375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.023584842681885, "rewards/margins": 25.241973876953125, "rewards/rejected": -30.26555824279785, "step": 1538 }, { "epoch": 0.9573872472783825, "grad_norm": 0.05512907728552818, "learning_rate": 3.782849239280775e-06, "logits/chosen": -1.629906177520752, "logits/rejected": 0.8318213820457458, "logps/chosen": -447.51483154296875, "logps/rejected": -814.4088134765625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.437944412231445, "rewards/margins": 23.343429565429688, "rewards/rejected": -29.7813720703125, "step": 1539 }, { "epoch": 0.9580093312597201, "grad_norm": 3.436262545619684e-07, "learning_rate": 3.78169663439373e-06, "logits/chosen": -0.798626184463501, "logits/rejected": 3.8584299087524414, "logps/chosen": -544.6287841796875, "logps/rejected": -1154.15478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.424917697906494, "rewards/margins": 31.494213104248047, "rewards/rejected": -37.919132232666016, "step": 1540 }, { "epoch": 0.9586314152410575, "grad_norm": 5.306674779603782e-07, "learning_rate": 3.7805440295066854e-06, "logits/chosen": -1.1544137001037598, "logits/rejected": 4.800648212432861, "logps/chosen": -271.69183349609375, "logps/rejected": -902.1212768554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.7454826831817627, "rewards/margins": 29.29214096069336, "rewards/rejected": -32.037620544433594, "step": 1541 }, { "epoch": 0.959253499222395, "grad_norm": 7.686992168426514, "learning_rate": 3.7793914246196406e-06, "logits/chosen": 0.2284245491027832, "logits/rejected": 2.3884739875793457, "logps/chosen": -577.6343994140625, "logps/rejected": -819.865234375, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": -7.3329057693481445, "rewards/margins": 20.446739196777344, "rewards/rejected": -27.779645919799805, "step": 1542 }, { "epoch": 0.9598755832037325, "grad_norm": 0.03158849850296974, "learning_rate": 3.778238819732596e-06, "logits/chosen": -3.3085033893585205, "logits/rejected": 2.410858154296875, "logps/chosen": -369.4300537109375, "logps/rejected": -1009.00634765625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.852804660797119, "rewards/margins": 29.572834014892578, "rewards/rejected": -33.42564010620117, "step": 1543 }, { "epoch": 0.96049766718507, "grad_norm": 0.8256038427352905, "learning_rate": 3.777086214845551e-06, "logits/chosen": 0.9084221720695496, "logits/rejected": 3.3126614093780518, "logps/chosen": -562.0577392578125, "logps/rejected": -833.6492309570312, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -6.350132942199707, "rewards/margins": 18.237110137939453, "rewards/rejected": -24.587242126464844, "step": 1544 }, { "epoch": 0.9611197511664075, "grad_norm": 0.02136695571243763, "learning_rate": 3.7759336099585063e-06, "logits/chosen": -0.20459318161010742, "logits/rejected": 3.8194947242736816, "logps/chosen": -334.81689453125, "logps/rejected": -757.7464599609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.10609769821167, "rewards/margins": 27.78612518310547, "rewards/rejected": -29.892223358154297, "step": 1545 }, { "epoch": 0.9617418351477449, "grad_norm": 0.021653296425938606, "learning_rate": 3.774781005071462e-06, "logits/chosen": 0.6639248132705688, "logits/rejected": 4.86984920501709, "logps/chosen": -418.23291015625, "logps/rejected": -902.3069458007812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.3227763175964355, "rewards/margins": 27.793251037597656, "rewards/rejected": -34.11602783203125, "step": 1546 }, { "epoch": 0.9623639191290825, "grad_norm": 0.8193944692611694, "learning_rate": 3.773628400184417e-06, "logits/chosen": -1.013139009475708, "logits/rejected": 1.9905011653900146, "logps/chosen": -330.7326354980469, "logps/rejected": -710.2229614257812, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -3.1681528091430664, "rewards/margins": 18.948850631713867, "rewards/rejected": -22.11700439453125, "step": 1547 }, { "epoch": 0.9629860031104199, "grad_norm": 0.0021697860211133957, "learning_rate": 3.7724757952973724e-06, "logits/chosen": -0.6302704215049744, "logits/rejected": 3.592517614364624, "logps/chosen": -392.3840026855469, "logps/rejected": -913.2039794921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.25330924987793, "rewards/margins": 28.252193450927734, "rewards/rejected": -32.50550079345703, "step": 1548 }, { "epoch": 0.9636080870917574, "grad_norm": 0.0017832452431321144, "learning_rate": 3.7713231904103276e-06, "logits/chosen": 2.577538013458252, "logits/rejected": 4.379067420959473, "logps/chosen": -711.4339599609375, "logps/rejected": -950.3943481445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.664702415466309, "rewards/margins": 21.427143096923828, "rewards/rejected": -28.091846466064453, "step": 1549 }, { "epoch": 0.9642301710730948, "grad_norm": 0.010014387778937817, "learning_rate": 3.770170585523283e-06, "logits/chosen": 1.4134823083877563, "logits/rejected": 3.6276726722717285, "logps/chosen": -564.8404541015625, "logps/rejected": -985.745849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.489076614379883, "rewards/margins": 26.61454963684082, "rewards/rejected": -34.1036262512207, "step": 1550 }, { "epoch": 0.9648522550544324, "grad_norm": 0.14533917605876923, "learning_rate": 3.769017980636238e-06, "logits/chosen": 1.69158935546875, "logits/rejected": 4.186570644378662, "logps/chosen": -550.7415771484375, "logps/rejected": -960.3980712890625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -7.251543998718262, "rewards/margins": 28.392061233520508, "rewards/rejected": -35.64360046386719, "step": 1551 }, { "epoch": 0.9654743390357698, "grad_norm": 3.336298704147339, "learning_rate": 3.7678653757491933e-06, "logits/chosen": -2.7250447273254395, "logits/rejected": 2.549415111541748, "logps/chosen": -451.37615966796875, "logps/rejected": -1036.66748046875, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -7.669151306152344, "rewards/margins": 29.32394027709961, "rewards/rejected": -36.99309158325195, "step": 1552 }, { "epoch": 0.9660964230171073, "grad_norm": 0.28479263186454773, "learning_rate": 3.766712770862149e-06, "logits/chosen": 1.810473084449768, "logits/rejected": 2.76468563079834, "logps/chosen": -551.802734375, "logps/rejected": -898.9547119140625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -9.322225570678711, "rewards/margins": 22.87482452392578, "rewards/rejected": -32.197052001953125, "step": 1553 }, { "epoch": 0.9667185069984447, "grad_norm": 0.0004268392804078758, "learning_rate": 3.765560165975104e-06, "logits/chosen": 2.188840866088867, "logits/rejected": 4.608028411865234, "logps/chosen": -606.5416870117188, "logps/rejected": -998.5819091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.792022705078125, "rewards/margins": 22.136215209960938, "rewards/rejected": -31.928239822387695, "step": 1554 }, { "epoch": 0.9673405909797823, "grad_norm": 0.03662179782986641, "learning_rate": 3.7644075610880594e-06, "logits/chosen": 1.1045016050338745, "logits/rejected": 4.016812324523926, "logps/chosen": -576.162353515625, "logps/rejected": -997.515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.6547722816467285, "rewards/margins": 24.034767150878906, "rewards/rejected": -31.68954086303711, "step": 1555 }, { "epoch": 0.9679626749611198, "grad_norm": 0.42050519585609436, "learning_rate": 3.7632549562010146e-06, "logits/chosen": 1.1344008445739746, "logits/rejected": 3.8797502517700195, "logps/chosen": -490.56060791015625, "logps/rejected": -869.088134765625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -9.672945022583008, "rewards/margins": 23.849864959716797, "rewards/rejected": -33.52280807495117, "step": 1556 }, { "epoch": 0.9685847589424572, "grad_norm": 8.319174230564386e-05, "learning_rate": 3.76210235131397e-06, "logits/chosen": -1.3127119541168213, "logits/rejected": 2.8727219104766846, "logps/chosen": -462.66583251953125, "logps/rejected": -927.9498901367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.315878391265869, "rewards/margins": 25.330734252929688, "rewards/rejected": -31.6466121673584, "step": 1557 }, { "epoch": 0.9692068429237947, "grad_norm": 24.195049285888672, "learning_rate": 3.760949746426925e-06, "logits/chosen": 3.776907444000244, "logits/rejected": 5.161801338195801, "logps/chosen": -647.3763427734375, "logps/rejected": -823.6619873046875, "loss": 0.2038, "rewards/accuracies": 0.875, "rewards/chosen": -8.529730796813965, "rewards/margins": 16.111204147338867, "rewards/rejected": -24.640933990478516, "step": 1558 }, { "epoch": 0.9698289269051322, "grad_norm": 0.14263981580734253, "learning_rate": 3.7597971415398803e-06, "logits/chosen": 0.6714162826538086, "logits/rejected": 3.141258716583252, "logps/chosen": -566.3853149414062, "logps/rejected": -1052.81103515625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -10.304349899291992, "rewards/margins": 30.776416778564453, "rewards/rejected": -41.08076858520508, "step": 1559 }, { "epoch": 0.9704510108864697, "grad_norm": 0.06349503993988037, "learning_rate": 3.7586445366528355e-06, "logits/chosen": 3.69173526763916, "logits/rejected": 4.2286505699157715, "logps/chosen": -663.752685546875, "logps/rejected": -996.7243041992188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.244875907897949, "rewards/margins": 24.53765296936035, "rewards/rejected": -31.782527923583984, "step": 1560 }, { "epoch": 0.9710730948678071, "grad_norm": 7.525384902954102, "learning_rate": 3.757491931765791e-06, "logits/chosen": -0.16161316633224487, "logits/rejected": 0.01133960485458374, "logps/chosen": -527.05859375, "logps/rejected": -715.83984375, "loss": 0.0335, "rewards/accuracies": 1.0, "rewards/chosen": -5.3956475257873535, "rewards/margins": 16.443391799926758, "rewards/rejected": -21.839038848876953, "step": 1561 }, { "epoch": 0.9716951788491446, "grad_norm": 0.00021120811288710684, "learning_rate": 3.7563393268787464e-06, "logits/chosen": -0.11896657943725586, "logits/rejected": 3.7753396034240723, "logps/chosen": -415.9942626953125, "logps/rejected": -981.8436279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.215848445892334, "rewards/margins": 32.04558563232422, "rewards/rejected": -37.261436462402344, "step": 1562 }, { "epoch": 0.9723172628304821, "grad_norm": 7.325221538543701, "learning_rate": 3.7551867219917016e-06, "logits/chosen": 0.15939027070999146, "logits/rejected": 1.709261417388916, "logps/chosen": -630.5030517578125, "logps/rejected": -840.1243286132812, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": -13.626913070678711, "rewards/margins": 15.032745361328125, "rewards/rejected": -28.659658432006836, "step": 1563 }, { "epoch": 0.9729393468118196, "grad_norm": 0.1605205535888672, "learning_rate": 3.754034117104657e-06, "logits/chosen": 1.4628174304962158, "logits/rejected": 4.365293502807617, "logps/chosen": -595.6699829101562, "logps/rejected": -947.2667236328125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.8566484451293945, "rewards/margins": 21.407180786132812, "rewards/rejected": -29.26382827758789, "step": 1564 }, { "epoch": 0.973561430793157, "grad_norm": 2.649874448776245, "learning_rate": 3.752881512217612e-06, "logits/chosen": -4.495944976806641, "logits/rejected": 2.500077247619629, "logps/chosen": -270.2780456542969, "logps/rejected": -841.184326171875, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -3.3575921058654785, "rewards/margins": 22.65192413330078, "rewards/rejected": -26.009517669677734, "step": 1565 }, { "epoch": 0.9741835147744946, "grad_norm": 0.049845147877931595, "learning_rate": 3.7517289073305673e-06, "logits/chosen": 1.8272593021392822, "logits/rejected": 3.8699135780334473, "logps/chosen": -470.1072692871094, "logps/rejected": -900.577392578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.356943130493164, "rewards/margins": 22.48064422607422, "rewards/rejected": -28.83758544921875, "step": 1566 }, { "epoch": 0.9748055987558321, "grad_norm": 0.03664658963680267, "learning_rate": 3.7505763024435225e-06, "logits/chosen": 2.496447801589966, "logits/rejected": 3.1999642848968506, "logps/chosen": -694.1896362304688, "logps/rejected": -1058.082275390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.669995307922363, "rewards/margins": 21.938507080078125, "rewards/rejected": -32.60850143432617, "step": 1567 }, { "epoch": 0.9754276827371695, "grad_norm": 1.3099258467264008e-05, "learning_rate": 3.749423697556478e-06, "logits/chosen": 1.693497896194458, "logits/rejected": 3.8283023834228516, "logps/chosen": -559.7762451171875, "logps/rejected": -944.8941040039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.410549163818359, "rewards/margins": 24.579004287719727, "rewards/rejected": -31.989551544189453, "step": 1568 }, { "epoch": 0.976049766718507, "grad_norm": 0.0269145630300045, "learning_rate": 3.7482710926694334e-06, "logits/chosen": 1.4291343688964844, "logits/rejected": 2.6044325828552246, "logps/chosen": -597.8721923828125, "logps/rejected": -844.9185180664062, "loss": 0.0866, "rewards/accuracies": 1.0, "rewards/chosen": -7.501439571380615, "rewards/margins": 19.320228576660156, "rewards/rejected": -26.821666717529297, "step": 1569 }, { "epoch": 0.9766718506998445, "grad_norm": 11.221540451049805, "learning_rate": 3.7471184877823886e-06, "logits/chosen": -2.356743335723877, "logits/rejected": 0.9585628509521484, "logps/chosen": -409.2689514160156, "logps/rejected": -786.9153442382812, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": -3.4099254608154297, "rewards/margins": 18.11174201965332, "rewards/rejected": -21.52166748046875, "step": 1570 }, { "epoch": 0.977293934681182, "grad_norm": 0.6764800548553467, "learning_rate": 3.745965882895344e-06, "logits/chosen": -0.759571373462677, "logits/rejected": 2.580238103866577, "logps/chosen": -517.606689453125, "logps/rejected": -927.9739379882812, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -6.846333980560303, "rewards/margins": 23.722732543945312, "rewards/rejected": -30.56906509399414, "step": 1571 }, { "epoch": 0.9779160186625194, "grad_norm": 0.004113756585866213, "learning_rate": 3.744813278008299e-06, "logits/chosen": 0.08926576375961304, "logits/rejected": 3.5595028400421143, "logps/chosen": -414.646240234375, "logps/rejected": -768.23193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.433351039886475, "rewards/margins": 15.794235229492188, "rewards/rejected": -23.22758674621582, "step": 1572 }, { "epoch": 0.9785381026438569, "grad_norm": 28.806289672851562, "learning_rate": 3.7436606731212543e-06, "logits/chosen": -3.4051802158355713, "logits/rejected": 4.699017524719238, "logps/chosen": -343.5263366699219, "logps/rejected": -1104.3868408203125, "loss": 0.5611, "rewards/accuracies": 0.875, "rewards/chosen": -8.073864936828613, "rewards/margins": 31.18024253845215, "rewards/rejected": -39.25410461425781, "step": 1573 }, { "epoch": 0.9791601866251944, "grad_norm": 13.794449806213379, "learning_rate": 3.7425080682342095e-06, "logits/chosen": 1.9040722846984863, "logits/rejected": 2.7815659046173096, "logps/chosen": -543.7911376953125, "logps/rejected": -792.92333984375, "loss": 0.0745, "rewards/accuracies": 1.0, "rewards/chosen": -8.056597709655762, "rewards/margins": 22.866260528564453, "rewards/rejected": -30.92285919189453, "step": 1574 }, { "epoch": 0.9797822706065319, "grad_norm": 1.1686525344848633, "learning_rate": 3.741355463347165e-06, "logits/chosen": -0.2909442186355591, "logits/rejected": 2.8880224227905273, "logps/chosen": -557.0026245117188, "logps/rejected": -963.135498046875, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -8.541722297668457, "rewards/margins": 24.39960289001465, "rewards/rejected": -32.941322326660156, "step": 1575 }, { "epoch": 0.9804043545878693, "grad_norm": 19.24813461303711, "learning_rate": 3.7402028584601204e-06, "logits/chosen": 2.339660406112671, "logits/rejected": 2.4095168113708496, "logps/chosen": -645.5527954101562, "logps/rejected": -958.1924438476562, "loss": 0.1106, "rewards/accuracies": 1.0, "rewards/chosen": -11.453535079956055, "rewards/margins": 22.29770278930664, "rewards/rejected": -33.75123977661133, "step": 1576 }, { "epoch": 0.9810264385692068, "grad_norm": 0.04252305626869202, "learning_rate": 3.7390502535730756e-06, "logits/chosen": -0.9952183961868286, "logits/rejected": 2.1234028339385986, "logps/chosen": -267.9193420410156, "logps/rejected": -676.8464965820312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.890249252319336, "rewards/margins": 26.59477996826172, "rewards/rejected": -29.485031127929688, "step": 1577 }, { "epoch": 0.9816485225505444, "grad_norm": 0.010037853382527828, "learning_rate": 3.737897648686031e-06, "logits/chosen": -0.8864223957061768, "logits/rejected": 3.127655267715454, "logps/chosen": -308.0932312011719, "logps/rejected": -827.228271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.313598871231079, "rewards/margins": 30.997886657714844, "rewards/rejected": -34.311485290527344, "step": 1578 }, { "epoch": 0.9822706065318818, "grad_norm": 7.811158866388723e-05, "learning_rate": 3.736745043798986e-06, "logits/chosen": -2.4733104705810547, "logits/rejected": 2.745809555053711, "logps/chosen": -557.5784912109375, "logps/rejected": -1105.34423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.980599880218506, "rewards/margins": 29.99527931213379, "rewards/rejected": -35.97587966918945, "step": 1579 }, { "epoch": 0.9828926905132193, "grad_norm": 27.442073822021484, "learning_rate": 3.7355924389119413e-06, "logits/chosen": -0.202871173620224, "logits/rejected": 2.683882713317871, "logps/chosen": -478.58892822265625, "logps/rejected": -933.044677734375, "loss": 0.217, "rewards/accuracies": 0.875, "rewards/chosen": -6.578217506408691, "rewards/margins": 20.964635848999023, "rewards/rejected": -27.5428524017334, "step": 1580 }, { "epoch": 0.9835147744945568, "grad_norm": 0.003448404837399721, "learning_rate": 3.7344398340248965e-06, "logits/chosen": -0.1981586217880249, "logits/rejected": 3.0170540809631348, "logps/chosen": -482.3427429199219, "logps/rejected": -849.2074584960938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.227471828460693, "rewards/margins": 25.559865951538086, "rewards/rejected": -32.78733825683594, "step": 1581 }, { "epoch": 0.9841368584758943, "grad_norm": 25.31926918029785, "learning_rate": 3.7332872291378517e-06, "logits/chosen": 1.5486270189285278, "logits/rejected": 3.3535120487213135, "logps/chosen": -537.72119140625, "logps/rejected": -846.4741821289062, "loss": 0.3553, "rewards/accuracies": 0.875, "rewards/chosen": -9.907859802246094, "rewards/margins": 20.8604736328125, "rewards/rejected": -30.768333435058594, "step": 1582 }, { "epoch": 0.9847589424572317, "grad_norm": 0.2907392680644989, "learning_rate": 3.7321346242508073e-06, "logits/chosen": 2.166731119155884, "logits/rejected": 3.08122181892395, "logps/chosen": -654.4718017578125, "logps/rejected": -1042.1151123046875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -9.967266082763672, "rewards/margins": 25.029521942138672, "rewards/rejected": -34.99678421020508, "step": 1583 }, { "epoch": 0.9853810264385692, "grad_norm": 48.268089294433594, "learning_rate": 3.7309820193637626e-06, "logits/chosen": 0.6497130990028381, "logits/rejected": 3.1703836917877197, "logps/chosen": -576.513916015625, "logps/rejected": -972.1256103515625, "loss": 0.5888, "rewards/accuracies": 0.875, "rewards/chosen": -13.059791564941406, "rewards/margins": 23.662952423095703, "rewards/rejected": -36.72274398803711, "step": 1584 }, { "epoch": 0.9860031104199067, "grad_norm": 0.12340130656957626, "learning_rate": 3.729829414476718e-06, "logits/chosen": -1.3608407974243164, "logits/rejected": 3.115481376647949, "logps/chosen": -305.034423828125, "logps/rejected": -866.042724609375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.087643623352051, "rewards/margins": 27.852447509765625, "rewards/rejected": -32.94009017944336, "step": 1585 }, { "epoch": 0.9866251944012442, "grad_norm": 0.01151037123054266, "learning_rate": 3.728676809589673e-06, "logits/chosen": -0.01191103458404541, "logits/rejected": 4.217233180999756, "logps/chosen": -447.7720031738281, "logps/rejected": -1023.5369873046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.726734161376953, "rewards/margins": 28.46622657775879, "rewards/rejected": -34.19295883178711, "step": 1586 }, { "epoch": 0.9872472783825816, "grad_norm": 8.350014013558393e-08, "learning_rate": 3.7275242047026282e-06, "logits/chosen": -1.0805532932281494, "logits/rejected": 2.784482479095459, "logps/chosen": -338.6806640625, "logps/rejected": -867.11376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.449554443359375, "rewards/margins": 31.410503387451172, "rewards/rejected": -37.86006164550781, "step": 1587 }, { "epoch": 0.9878693623639191, "grad_norm": 0.25290653109550476, "learning_rate": 3.7263715998155835e-06, "logits/chosen": -1.5539438724517822, "logits/rejected": 4.337480545043945, "logps/chosen": -450.3958740234375, "logps/rejected": -1068.654052734375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -7.97259521484375, "rewards/margins": 27.88930320739746, "rewards/rejected": -35.86189651489258, "step": 1588 }, { "epoch": 0.9884914463452567, "grad_norm": 1.1513815678654282e-07, "learning_rate": 3.7252189949285387e-06, "logits/chosen": -0.6780804395675659, "logits/rejected": 3.5184125900268555, "logps/chosen": -455.1690368652344, "logps/rejected": -1012.3525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.412100791931152, "rewards/margins": 34.602020263671875, "rewards/rejected": -44.01411819458008, "step": 1589 }, { "epoch": 0.9891135303265941, "grad_norm": 0.03904994949698448, "learning_rate": 3.7240663900414943e-06, "logits/chosen": 0.6309325695037842, "logits/rejected": 2.775806427001953, "logps/chosen": -541.5093383789062, "logps/rejected": -961.15380859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.032151222229004, "rewards/margins": 29.603219985961914, "rewards/rejected": -36.635372161865234, "step": 1590 }, { "epoch": 0.9897356143079316, "grad_norm": 2.7226502652411e-07, "learning_rate": 3.7229137851544496e-06, "logits/chosen": -1.4838294982910156, "logits/rejected": 3.9604568481445312, "logps/chosen": -425.91741943359375, "logps/rejected": -1060.175048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.890406608581543, "rewards/margins": 33.74592208862305, "rewards/rejected": -41.636329650878906, "step": 1591 }, { "epoch": 0.990357698289269, "grad_norm": 0.00038821130874566734, "learning_rate": 3.721761180267405e-06, "logits/chosen": 3.0226492881774902, "logits/rejected": 3.96284818649292, "logps/chosen": -622.4625244140625, "logps/rejected": -963.4630126953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.260305404663086, "rewards/margins": 29.045948028564453, "rewards/rejected": -36.306251525878906, "step": 1592 }, { "epoch": 0.9909797822706066, "grad_norm": 21.46782684326172, "learning_rate": 3.72060857538036e-06, "logits/chosen": 0.38420045375823975, "logits/rejected": 2.7506988048553467, "logps/chosen": -505.051025390625, "logps/rejected": -888.744384765625, "loss": 0.1871, "rewards/accuracies": 0.875, "rewards/chosen": -9.530044555664062, "rewards/margins": 24.28890037536621, "rewards/rejected": -33.818946838378906, "step": 1593 }, { "epoch": 0.991601866251944, "grad_norm": 24.086999893188477, "learning_rate": 3.7194559704933152e-06, "logits/chosen": 2.784050464630127, "logits/rejected": 4.437070369720459, "logps/chosen": -740.00634765625, "logps/rejected": -1098.108642578125, "loss": 0.1265, "rewards/accuracies": 0.875, "rewards/chosen": -12.069332122802734, "rewards/margins": 26.843494415283203, "rewards/rejected": -38.91282653808594, "step": 1594 }, { "epoch": 0.9922239502332815, "grad_norm": 0.003621672512963414, "learning_rate": 3.7183033656062705e-06, "logits/chosen": -1.716835379600525, "logits/rejected": 4.3016862869262695, "logps/chosen": -372.40069580078125, "logps/rejected": -949.9512329101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.385269165039062, "rewards/margins": 27.320537567138672, "rewards/rejected": -36.705806732177734, "step": 1595 }, { "epoch": 0.9928460342146189, "grad_norm": 7.639461994171143, "learning_rate": 3.7171507607192257e-06, "logits/chosen": 2.2062644958496094, "logits/rejected": 3.4240951538085938, "logps/chosen": -706.8009033203125, "logps/rejected": -994.1138305664062, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": -8.54970932006836, "rewards/margins": 16.25775718688965, "rewards/rejected": -24.807464599609375, "step": 1596 }, { "epoch": 0.9934681181959565, "grad_norm": 6.353683090765117e-08, "learning_rate": 3.7159981558321813e-06, "logits/chosen": -0.1960483193397522, "logits/rejected": 2.1023592948913574, "logps/chosen": -530.5663452148438, "logps/rejected": -991.4660034179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.981578826904297, "rewards/margins": 29.86503028869629, "rewards/rejected": -39.84661102294922, "step": 1597 }, { "epoch": 0.9940902021772939, "grad_norm": 18.419109344482422, "learning_rate": 3.7148455509451366e-06, "logits/chosen": 0.2364797592163086, "logits/rejected": 2.677727460861206, "logps/chosen": -561.347412109375, "logps/rejected": -885.8209838867188, "loss": 0.1108, "rewards/accuracies": 0.875, "rewards/chosen": -8.789037704467773, "rewards/margins": 22.291229248046875, "rewards/rejected": -31.080265045166016, "step": 1598 }, { "epoch": 0.9947122861586314, "grad_norm": 0.12117072939872742, "learning_rate": 3.713692946058092e-06, "logits/chosen": 1.5356340408325195, "logits/rejected": 3.313687801361084, "logps/chosen": -543.3634643554688, "logps/rejected": -787.9395751953125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -10.35412883758545, "rewards/margins": 17.061670303344727, "rewards/rejected": -27.415800094604492, "step": 1599 }, { "epoch": 0.995334370139969, "grad_norm": 0.0006846533506177366, "learning_rate": 3.712540341171047e-06, "logits/chosen": -0.7549853324890137, "logits/rejected": 3.749081611633301, "logps/chosen": -353.02239990234375, "logps/rejected": -909.6260986328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.173841953277588, "rewards/margins": 24.962791442871094, "rewards/rejected": -31.136634826660156, "step": 1600 }, { "epoch": 0.9959564541213064, "grad_norm": 39.99483108520508, "learning_rate": 3.7113877362840022e-06, "logits/chosen": 1.2029293775558472, "logits/rejected": 3.3627114295959473, "logps/chosen": -645.314697265625, "logps/rejected": -1047.4544677734375, "loss": 0.669, "rewards/accuracies": 0.875, "rewards/chosen": -9.72260856628418, "rewards/margins": 27.31245994567871, "rewards/rejected": -37.035064697265625, "step": 1601 }, { "epoch": 0.9965785381026439, "grad_norm": 49.874942779541016, "learning_rate": 3.7102351313969575e-06, "logits/chosen": 3.284135103225708, "logits/rejected": 4.036801815032959, "logps/chosen": -687.740478515625, "logps/rejected": -827.6663818359375, "loss": 0.9122, "rewards/accuracies": 0.75, "rewards/chosen": -6.97233247756958, "rewards/margins": 11.839042663574219, "rewards/rejected": -18.81137466430664, "step": 1602 }, { "epoch": 0.9972006220839813, "grad_norm": 0.06565108150243759, "learning_rate": 3.7090825265099127e-06, "logits/chosen": -0.45381850004196167, "logits/rejected": 4.070195198059082, "logps/chosen": -459.11358642578125, "logps/rejected": -935.2645874023438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.786318778991699, "rewards/margins": 21.61214256286621, "rewards/rejected": -28.398460388183594, "step": 1603 }, { "epoch": 0.9978227060653189, "grad_norm": 0.16008152067661285, "learning_rate": 3.7079299216228683e-06, "logits/chosen": 2.136279582977295, "logits/rejected": 4.350179672241211, "logps/chosen": -650.0744018554688, "logps/rejected": -1078.3236083984375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.329696655273438, "rewards/margins": 26.71537208557129, "rewards/rejected": -35.04507064819336, "step": 1604 }, { "epoch": 0.9984447900466563, "grad_norm": 32.31972122192383, "learning_rate": 3.7067773167358236e-06, "logits/chosen": 1.3715128898620605, "logits/rejected": 4.225821495056152, "logps/chosen": -534.2005615234375, "logps/rejected": -910.2581787109375, "loss": 0.6488, "rewards/accuracies": 0.875, "rewards/chosen": -9.50748348236084, "rewards/margins": 17.401165008544922, "rewards/rejected": -26.908649444580078, "step": 1605 }, { "epoch": 0.9990668740279938, "grad_norm": 0.004076329059898853, "learning_rate": 3.705624711848779e-06, "logits/chosen": 0.3134296238422394, "logits/rejected": 2.1142117977142334, "logps/chosen": -437.3398132324219, "logps/rejected": -795.037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.274627685546875, "rewards/margins": 22.731220245361328, "rewards/rejected": -32.0058479309082, "step": 1606 }, { "epoch": 0.9996889580093312, "grad_norm": 6.227400263014715e-06, "learning_rate": 3.704472106961734e-06, "logits/chosen": -0.9058216214179993, "logits/rejected": 4.364875793457031, "logps/chosen": -474.5135498046875, "logps/rejected": -1069.215576171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.431096076965332, "rewards/margins": 33.49390411376953, "rewards/rejected": -40.92500305175781, "step": 1607 }, { "epoch": 1.0003110419906687, "grad_norm": 34.91269302368164, "learning_rate": 3.7033195020746892e-06, "logits/chosen": 1.389174461364746, "logits/rejected": 4.184999942779541, "logps/chosen": -401.0694580078125, "logps/rejected": -723.718994140625, "loss": 0.7746, "rewards/accuracies": 0.875, "rewards/chosen": -7.645594596862793, "rewards/margins": 20.324663162231445, "rewards/rejected": -27.970258712768555, "step": 1608 }, { "epoch": 1.0009331259720062, "grad_norm": 0.023849627003073692, "learning_rate": 3.7021668971876445e-06, "logits/chosen": -0.6653918027877808, "logits/rejected": 3.9303970336914062, "logps/chosen": -543.9305419921875, "logps/rejected": -1017.9217529296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.253396511077881, "rewards/margins": 29.039730072021484, "rewards/rejected": -35.293128967285156, "step": 1609 }, { "epoch": 1.0015552099533438, "grad_norm": 5.411730307969265e-05, "learning_rate": 3.7010142923005997e-06, "logits/chosen": -0.11129330098628998, "logits/rejected": 3.716736316680908, "logps/chosen": -332.31414794921875, "logps/rejected": -754.6508178710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.006335735321045, "rewards/margins": 23.23809814453125, "rewards/rejected": -29.24443244934082, "step": 1610 }, { "epoch": 1.0021772939346811, "grad_norm": 0.9929457306861877, "learning_rate": 3.699861687413555e-06, "logits/chosen": 0.5281230211257935, "logits/rejected": 3.874539852142334, "logps/chosen": -519.0057373046875, "logps/rejected": -1060.0328369140625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -7.74343204498291, "rewards/margins": 31.822721481323242, "rewards/rejected": -39.56615447998047, "step": 1611 }, { "epoch": 1.0027993779160187, "grad_norm": 1.0761016607284546, "learning_rate": 3.6987090825265106e-06, "logits/chosen": -0.8074854612350464, "logits/rejected": 4.119511127471924, "logps/chosen": -493.4097900390625, "logps/rejected": -956.7243041992188, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -7.330320358276367, "rewards/margins": 21.3006591796875, "rewards/rejected": -28.630979537963867, "step": 1612 }, { "epoch": 1.003421461897356, "grad_norm": 0.0010593491606414318, "learning_rate": 3.6975564776394658e-06, "logits/chosen": -1.6595462560653687, "logits/rejected": 3.404412269592285, "logps/chosen": -308.19329833984375, "logps/rejected": -925.4331665039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.133266448974609, "rewards/margins": 31.814796447753906, "rewards/rejected": -36.948062896728516, "step": 1613 }, { "epoch": 1.0040435458786936, "grad_norm": 0.1719919741153717, "learning_rate": 3.696403872752421e-06, "logits/chosen": 1.4374184608459473, "logits/rejected": 4.8389716148376465, "logps/chosen": -508.57843017578125, "logps/rejected": -896.6202392578125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.462780475616455, "rewards/margins": 22.043060302734375, "rewards/rejected": -27.505840301513672, "step": 1614 }, { "epoch": 1.0046656298600312, "grad_norm": 0.00047982099931687117, "learning_rate": 3.6952512678653762e-06, "logits/chosen": -0.11289626359939575, "logits/rejected": 5.051069259643555, "logps/chosen": -386.2294921875, "logps/rejected": -910.8284301757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.725472450256348, "rewards/margins": 25.839317321777344, "rewards/rejected": -33.564788818359375, "step": 1615 }, { "epoch": 1.0052877138413685, "grad_norm": 0.684409499168396, "learning_rate": 3.6940986629783315e-06, "logits/chosen": 1.061985969543457, "logits/rejected": 1.488252878189087, "logps/chosen": -473.2418212890625, "logps/rejected": -788.3059692382812, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -9.849893569946289, "rewards/margins": 23.360258102416992, "rewards/rejected": -33.21015167236328, "step": 1616 }, { "epoch": 1.005909797822706, "grad_norm": 0.01652875542640686, "learning_rate": 3.6929460580912867e-06, "logits/chosen": -1.0540319681167603, "logits/rejected": 2.3286192417144775, "logps/chosen": -477.8441162109375, "logps/rejected": -959.3941040039062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.00267219543457, "rewards/margins": 23.850139617919922, "rewards/rejected": -32.85280990600586, "step": 1617 }, { "epoch": 1.0065318818040436, "grad_norm": 2.5337561737615033e-07, "learning_rate": 3.691793453204242e-06, "logits/chosen": -0.5124354362487793, "logits/rejected": 4.590569019317627, "logps/chosen": -465.9842834472656, "logps/rejected": -1202.935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.984279632568359, "rewards/margins": 38.834861755371094, "rewards/rejected": -45.81914138793945, "step": 1618 }, { "epoch": 1.007153965785381, "grad_norm": 2.6867403984069824, "learning_rate": 3.6906408483171976e-06, "logits/chosen": 0.7521177530288696, "logits/rejected": 5.3029465675354, "logps/chosen": -548.4725341796875, "logps/rejected": -1041.0906982421875, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -9.513587951660156, "rewards/margins": 21.835956573486328, "rewards/rejected": -31.34954261779785, "step": 1619 }, { "epoch": 1.0077760497667185, "grad_norm": 0.004673125222325325, "learning_rate": 3.6894882434301528e-06, "logits/chosen": 0.1786353588104248, "logits/rejected": 3.4529829025268555, "logps/chosen": -371.00592041015625, "logps/rejected": -769.5985717773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.893185615539551, "rewards/margins": 24.657033920288086, "rewards/rejected": -31.550220489501953, "step": 1620 }, { "epoch": 1.008398133748056, "grad_norm": 0.0009540282189846039, "learning_rate": 3.688335638543108e-06, "logits/chosen": -0.4206286072731018, "logits/rejected": 2.911111831665039, "logps/chosen": -376.7239990234375, "logps/rejected": -911.2799072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.087729454040527, "rewards/margins": 30.201828002929688, "rewards/rejected": -35.28955841064453, "step": 1621 }, { "epoch": 1.0090202177293934, "grad_norm": 5.449791206046939e-05, "learning_rate": 3.6871830336560632e-06, "logits/chosen": 1.2042280435562134, "logits/rejected": 5.087825775146484, "logps/chosen": -480.3478698730469, "logps/rejected": -862.7271118164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.974275588989258, "rewards/margins": 22.0208797454834, "rewards/rejected": -27.995153427124023, "step": 1622 }, { "epoch": 1.009642301710731, "grad_norm": 19.46409034729004, "learning_rate": 3.6860304287690185e-06, "logits/chosen": 3.0977351665496826, "logits/rejected": 2.2187840938568115, "logps/chosen": -602.4797973632812, "logps/rejected": -827.6873168945312, "loss": 0.1975, "rewards/accuracies": 0.875, "rewards/chosen": -9.518498420715332, "rewards/margins": 18.173076629638672, "rewards/rejected": -27.691574096679688, "step": 1623 }, { "epoch": 1.0102643856920683, "grad_norm": 14.768061637878418, "learning_rate": 3.6848778238819737e-06, "logits/chosen": -0.5324491858482361, "logits/rejected": 3.6249537467956543, "logps/chosen": -440.9612121582031, "logps/rejected": -942.3751831054688, "loss": 0.1718, "rewards/accuracies": 0.875, "rewards/chosen": -10.06298828125, "rewards/margins": 22.763708114624023, "rewards/rejected": -32.826698303222656, "step": 1624 }, { "epoch": 1.010886469673406, "grad_norm": 0.0009539787424728274, "learning_rate": 3.683725218994929e-06, "logits/chosen": -2.3274078369140625, "logits/rejected": 4.436672687530518, "logps/chosen": -315.9355773925781, "logps/rejected": -1044.0924072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.2964043617248535, "rewards/margins": 34.74640655517578, "rewards/rejected": -39.042808532714844, "step": 1625 }, { "epoch": 1.0115085536547435, "grad_norm": 0.07945874333381653, "learning_rate": 3.6825726141078846e-06, "logits/chosen": 0.2944844663143158, "logits/rejected": 3.081148624420166, "logps/chosen": -613.04931640625, "logps/rejected": -1093.5924072265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.141210556030273, "rewards/margins": 28.32501983642578, "rewards/rejected": -40.46623229980469, "step": 1626 }, { "epoch": 1.0121306376360808, "grad_norm": 7.767158649585326e-07, "learning_rate": 3.6814200092208398e-06, "logits/chosen": 0.2727533280849457, "logits/rejected": 2.755222797393799, "logps/chosen": -553.1746826171875, "logps/rejected": -915.7510375976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.928220748901367, "rewards/margins": 24.597822189331055, "rewards/rejected": -30.52604103088379, "step": 1627 }, { "epoch": 1.0127527216174184, "grad_norm": 1.0329980850219727, "learning_rate": 3.680267404333795e-06, "logits/chosen": 0.5199713110923767, "logits/rejected": 3.451162815093994, "logps/chosen": -573.7638549804688, "logps/rejected": -1039.6966552734375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -10.805469512939453, "rewards/margins": 24.30244255065918, "rewards/rejected": -35.10791015625, "step": 1628 }, { "epoch": 1.013374805598756, "grad_norm": 5.4592834203504026e-05, "learning_rate": 3.6791147994467502e-06, "logits/chosen": -0.6406729221343994, "logits/rejected": 3.3449177742004395, "logps/chosen": -435.84527587890625, "logps/rejected": -979.3944091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.502603530883789, "rewards/margins": 30.276081085205078, "rewards/rejected": -39.7786865234375, "step": 1629 }, { "epoch": 1.0139968895800933, "grad_norm": 23.031963348388672, "learning_rate": 3.6779621945597055e-06, "logits/chosen": 2.074857473373413, "logits/rejected": 4.605471611022949, "logps/chosen": -614.27490234375, "logps/rejected": -1021.327880859375, "loss": 0.151, "rewards/accuracies": 0.875, "rewards/chosen": -7.717360019683838, "rewards/margins": 22.947689056396484, "rewards/rejected": -30.66505241394043, "step": 1630 }, { "epoch": 1.0146189735614308, "grad_norm": 28.341402053833008, "learning_rate": 3.6768095896726607e-06, "logits/chosen": 0.06842297315597534, "logits/rejected": 3.198674440383911, "logps/chosen": -429.0621643066406, "logps/rejected": -785.5848388671875, "loss": 0.3921, "rewards/accuracies": 0.875, "rewards/chosen": -6.456113815307617, "rewards/margins": 12.912420272827148, "rewards/rejected": -19.368534088134766, "step": 1631 }, { "epoch": 1.0152410575427682, "grad_norm": 0.001189008355140686, "learning_rate": 3.675656984785616e-06, "logits/chosen": 1.5726920366287231, "logits/rejected": 4.682127952575684, "logps/chosen": -489.81280517578125, "logps/rejected": -987.9769897460938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.758681297302246, "rewards/margins": 29.487350463867188, "rewards/rejected": -36.24603271484375, "step": 1632 }, { "epoch": 1.0158631415241057, "grad_norm": 0.0009915231494233012, "learning_rate": 3.674504379898571e-06, "logits/chosen": -0.04778742790222168, "logits/rejected": 3.202317237854004, "logps/chosen": -492.9322509765625, "logps/rejected": -1061.7415771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.842962265014648, "rewards/margins": 33.83513259887695, "rewards/rejected": -42.67809295654297, "step": 1633 }, { "epoch": 1.0164852255054433, "grad_norm": 0.020902851596474648, "learning_rate": 3.6733517750115268e-06, "logits/chosen": 1.1798291206359863, "logits/rejected": 3.560368299484253, "logps/chosen": -612.0977783203125, "logps/rejected": -1176.016357421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.882665157318115, "rewards/margins": 33.82012176513672, "rewards/rejected": -40.70278549194336, "step": 1634 }, { "epoch": 1.0171073094867806, "grad_norm": 1.8183759450912476, "learning_rate": 3.672199170124482e-06, "logits/chosen": 2.5154342651367188, "logits/rejected": 2.281287670135498, "logps/chosen": -606.6002197265625, "logps/rejected": -919.1095581054688, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -11.341569900512695, "rewards/margins": 22.567405700683594, "rewards/rejected": -33.908973693847656, "step": 1635 }, { "epoch": 1.0177293934681182, "grad_norm": 2.0949930679137196e-10, "learning_rate": 3.671046565237437e-06, "logits/chosen": 1.1360574960708618, "logits/rejected": 3.4395339488983154, "logps/chosen": -571.862548828125, "logps/rejected": -1135.98388671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.192299842834473, "rewards/margins": 36.66862106323242, "rewards/rejected": -44.86091995239258, "step": 1636 }, { "epoch": 1.0183514774494558, "grad_norm": 2.616175413131714, "learning_rate": 3.669893960350392e-06, "logits/chosen": -0.20007240772247314, "logits/rejected": 3.70963716506958, "logps/chosen": -440.68817138671875, "logps/rejected": -851.4669189453125, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -7.582137107849121, "rewards/margins": 23.428585052490234, "rewards/rejected": -31.010723114013672, "step": 1637 }, { "epoch": 1.018973561430793, "grad_norm": 0.011836409568786621, "learning_rate": 3.6687413554633473e-06, "logits/chosen": 1.411272406578064, "logits/rejected": 3.4393310546875, "logps/chosen": -656.6214599609375, "logps/rejected": -992.0446166992188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.456025123596191, "rewards/margins": 30.13931655883789, "rewards/rejected": -34.595340728759766, "step": 1638 }, { "epoch": 1.0195956454121307, "grad_norm": 0.24777275323867798, "learning_rate": 3.6675887505763025e-06, "logits/chosen": -0.15388274192810059, "logits/rejected": 0.12217582017183304, "logps/chosen": -670.36572265625, "logps/rejected": -854.996337890625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -9.72994613647461, "rewards/margins": 20.67226219177246, "rewards/rejected": -30.402206420898438, "step": 1639 }, { "epoch": 1.0202177293934682, "grad_norm": 0.6539616584777832, "learning_rate": 3.6664361456892577e-06, "logits/chosen": 0.7842258214950562, "logits/rejected": 3.279789924621582, "logps/chosen": -583.931640625, "logps/rejected": -1048.6573486328125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -7.54281759262085, "rewards/margins": 29.190086364746094, "rewards/rejected": -36.73290252685547, "step": 1640 }, { "epoch": 1.0208398133748056, "grad_norm": 0.44214633107185364, "learning_rate": 3.665283540802213e-06, "logits/chosen": 2.359795331954956, "logits/rejected": 3.069380760192871, "logps/chosen": -659.0450439453125, "logps/rejected": -957.125732421875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -8.856813430786133, "rewards/margins": 23.456443786621094, "rewards/rejected": -32.313255310058594, "step": 1641 }, { "epoch": 1.0214618973561431, "grad_norm": 0.011518875136971474, "learning_rate": 3.664130935915168e-06, "logits/chosen": 3.3660264015197754, "logits/rejected": 4.603273391723633, "logps/chosen": -709.8883056640625, "logps/rejected": -965.43896484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.867781639099121, "rewards/margins": 22.071300506591797, "rewards/rejected": -30.939083099365234, "step": 1642 }, { "epoch": 1.0220839813374805, "grad_norm": 0.0006374081131070852, "learning_rate": 3.662978331028124e-06, "logits/chosen": -0.2983798682689667, "logits/rejected": 4.081971168518066, "logps/chosen": -327.46661376953125, "logps/rejected": -816.51806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.7871956825256348, "rewards/margins": 23.058956146240234, "rewards/rejected": -26.846153259277344, "step": 1643 }, { "epoch": 1.022706065318818, "grad_norm": 0.633435845375061, "learning_rate": 3.661825726141079e-06, "logits/chosen": -1.1860074996948242, "logits/rejected": 2.1363470554351807, "logps/chosen": -608.3544311523438, "logps/rejected": -986.3404541015625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -7.85249662399292, "rewards/margins": 27.040531158447266, "rewards/rejected": -34.893028259277344, "step": 1644 }, { "epoch": 1.0233281493001556, "grad_norm": 0.00012436254473868757, "learning_rate": 3.6606731212540342e-06, "logits/chosen": 0.9427947998046875, "logits/rejected": 2.397813081741333, "logps/chosen": -606.085693359375, "logps/rejected": -1083.9486083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.637308120727539, "rewards/margins": 34.03984069824219, "rewards/rejected": -43.677146911621094, "step": 1645 }, { "epoch": 1.023950233281493, "grad_norm": 2.4389617465203628e-05, "learning_rate": 3.6595205163669895e-06, "logits/chosen": 1.5660756826400757, "logits/rejected": 2.541382312774658, "logps/chosen": -635.6356201171875, "logps/rejected": -952.6004028320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.896754264831543, "rewards/margins": 22.892070770263672, "rewards/rejected": -33.78882598876953, "step": 1646 }, { "epoch": 1.0245723172628305, "grad_norm": 0.00010650245530996472, "learning_rate": 3.6583679114799447e-06, "logits/chosen": -1.616473913192749, "logits/rejected": 2.7548515796661377, "logps/chosen": -359.27069091796875, "logps/rejected": -915.1429443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.845514297485352, "rewards/margins": 27.412410736083984, "rewards/rejected": -32.2579231262207, "step": 1647 }, { "epoch": 1.025194401244168, "grad_norm": 0.15823419392108917, "learning_rate": 3.6572153065929e-06, "logits/chosen": 1.3099300861358643, "logits/rejected": 5.212904930114746, "logps/chosen": -537.3043823242188, "logps/rejected": -903.5443725585938, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -9.36880111694336, "rewards/margins": 16.472347259521484, "rewards/rejected": -25.841148376464844, "step": 1648 }, { "epoch": 1.0258164852255054, "grad_norm": 6.6609704845177475e-06, "learning_rate": 3.656062701705855e-06, "logits/chosen": 1.4306132793426514, "logits/rejected": 3.6266322135925293, "logps/chosen": -444.43133544921875, "logps/rejected": -858.3787841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.317989349365234, "rewards/margins": 30.24403190612793, "rewards/rejected": -34.56201934814453, "step": 1649 }, { "epoch": 1.026438569206843, "grad_norm": 0.005263295955955982, "learning_rate": 3.654910096818811e-06, "logits/chosen": 0.2962612509727478, "logits/rejected": 3.644659996032715, "logps/chosen": -574.5421752929688, "logps/rejected": -990.66162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.722488403320312, "rewards/margins": 26.464990615844727, "rewards/rejected": -36.187477111816406, "step": 1650 }, { "epoch": 1.0270606531881805, "grad_norm": 0.08479459583759308, "learning_rate": 3.653757491931766e-06, "logits/chosen": -0.08327645063400269, "logits/rejected": 3.943711519241333, "logps/chosen": -545.4989013671875, "logps/rejected": -958.6614990234375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.588180541992188, "rewards/margins": 22.37979507446289, "rewards/rejected": -30.96797752380371, "step": 1651 }, { "epoch": 1.0276827371695179, "grad_norm": 3.4717464814093546e-07, "learning_rate": 3.6526048870447212e-06, "logits/chosen": -1.4334676265716553, "logits/rejected": 3.306443452835083, "logps/chosen": -395.6757507324219, "logps/rejected": -912.210205078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.050688743591309, "rewards/margins": 27.281938552856445, "rewards/rejected": -31.332626342773438, "step": 1652 }, { "epoch": 1.0283048211508554, "grad_norm": 0.04955174773931503, "learning_rate": 3.6514522821576765e-06, "logits/chosen": 0.1008802056312561, "logits/rejected": 2.2651162147521973, "logps/chosen": -557.874267578125, "logps/rejected": -865.3643798828125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.999269962310791, "rewards/margins": 18.858959197998047, "rewards/rejected": -26.85822868347168, "step": 1653 }, { "epoch": 1.0289269051321928, "grad_norm": 24.3441162109375, "learning_rate": 3.6502996772706317e-06, "logits/chosen": 0.04064282774925232, "logits/rejected": 3.0165932178497314, "logps/chosen": -483.45697021484375, "logps/rejected": -857.9864501953125, "loss": 0.2728, "rewards/accuracies": 0.875, "rewards/chosen": -5.3655900955200195, "rewards/margins": 18.789987564086914, "rewards/rejected": -24.155576705932617, "step": 1654 }, { "epoch": 1.0295489891135303, "grad_norm": 54.452606201171875, "learning_rate": 3.649147072383587e-06, "logits/chosen": -1.2691631317138672, "logits/rejected": 1.3112772703170776, "logps/chosen": -548.9378051757812, "logps/rejected": -790.1439208984375, "loss": 1.3937, "rewards/accuracies": 0.75, "rewards/chosen": -8.830726623535156, "rewards/margins": 15.580018043518066, "rewards/rejected": -24.410743713378906, "step": 1655 }, { "epoch": 1.0301710730948679, "grad_norm": 1.790919542312622, "learning_rate": 3.647994467496542e-06, "logits/chosen": -1.0112364292144775, "logits/rejected": 0.8678668737411499, "logps/chosen": -378.75933837890625, "logps/rejected": -695.4180908203125, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -7.413599967956543, "rewards/margins": 19.858667373657227, "rewards/rejected": -27.272268295288086, "step": 1656 }, { "epoch": 1.0307931570762052, "grad_norm": 3.013064088008832e-06, "learning_rate": 3.6468418626094974e-06, "logits/chosen": -3.579735279083252, "logits/rejected": 0.7812553644180298, "logps/chosen": -366.45904541015625, "logps/rejected": -877.666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.8892035484313965, "rewards/margins": 25.258516311645508, "rewards/rejected": -33.14772033691406, "step": 1657 }, { "epoch": 1.0314152410575428, "grad_norm": 0.20314611494541168, "learning_rate": 3.645689257722453e-06, "logits/chosen": 0.3334569036960602, "logits/rejected": 2.605698347091675, "logps/chosen": -470.14068603515625, "logps/rejected": -883.865478515625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -9.243642807006836, "rewards/margins": 25.31780242919922, "rewards/rejected": -34.56144714355469, "step": 1658 }, { "epoch": 1.0320373250388803, "grad_norm": 4.763148542252793e-09, "learning_rate": 3.6445366528354082e-06, "logits/chosen": -0.9464988112449646, "logits/rejected": 3.2320268154144287, "logps/chosen": -583.88232421875, "logps/rejected": -1142.690673828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.410359382629395, "rewards/margins": 33.93184280395508, "rewards/rejected": -45.342201232910156, "step": 1659 }, { "epoch": 1.0326594090202177, "grad_norm": 3.7136945724487305, "learning_rate": 3.6433840479483635e-06, "logits/chosen": 0.5876110792160034, "logits/rejected": 3.5559744834899902, "logps/chosen": -435.9635009765625, "logps/rejected": -796.0913696289062, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -7.127996444702148, "rewards/margins": 24.278839111328125, "rewards/rejected": -31.406837463378906, "step": 1660 }, { "epoch": 1.0332814930015553, "grad_norm": 6.777904033660889, "learning_rate": 3.6422314430613187e-06, "logits/chosen": 2.48170804977417, "logits/rejected": 4.341867923736572, "logps/chosen": -720.7596435546875, "logps/rejected": -1063.3670654296875, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": -10.72669506072998, "rewards/margins": 22.279956817626953, "rewards/rejected": -33.00665283203125, "step": 1661 }, { "epoch": 1.0339035769828926, "grad_norm": 31.819677352905273, "learning_rate": 3.641078838174274e-06, "logits/chosen": 0.8430761694908142, "logits/rejected": 2.8223979473114014, "logps/chosen": -605.6489868164062, "logps/rejected": -917.00927734375, "loss": 0.6779, "rewards/accuracies": 0.875, "rewards/chosen": -9.731306076049805, "rewards/margins": 23.4366455078125, "rewards/rejected": -33.16795349121094, "step": 1662 }, { "epoch": 1.0345256609642302, "grad_norm": 0.5945461392402649, "learning_rate": 3.639926233287229e-06, "logits/chosen": 1.1385351419448853, "logits/rejected": 3.092228412628174, "logps/chosen": -531.9583740234375, "logps/rejected": -990.5872802734375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -9.152070045471191, "rewards/margins": 28.98786163330078, "rewards/rejected": -38.139930725097656, "step": 1663 }, { "epoch": 1.0351477449455677, "grad_norm": 0.0029474389739334583, "learning_rate": 3.6387736284001844e-06, "logits/chosen": 2.3572731018066406, "logits/rejected": 3.6039628982543945, "logps/chosen": -572.6138916015625, "logps/rejected": -994.5134887695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.076838493347168, "rewards/margins": 30.933841705322266, "rewards/rejected": -37.01068115234375, "step": 1664 }, { "epoch": 1.035769828926905, "grad_norm": 30.376829147338867, "learning_rate": 3.63762102351314e-06, "logits/chosen": -0.8222250938415527, "logits/rejected": 3.142303943634033, "logps/chosen": -321.65899658203125, "logps/rejected": -653.8709716796875, "loss": 0.4004, "rewards/accuracies": 0.875, "rewards/chosen": -7.204082489013672, "rewards/margins": 15.132912635803223, "rewards/rejected": -22.33699607849121, "step": 1665 }, { "epoch": 1.0363919129082426, "grad_norm": 0.008387638255953789, "learning_rate": 3.6364684186260952e-06, "logits/chosen": 3.4995431900024414, "logits/rejected": 5.597842216491699, "logps/chosen": -749.5764770507812, "logps/rejected": -1146.525146484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.341553211212158, "rewards/margins": 25.269424438476562, "rewards/rejected": -31.610977172851562, "step": 1666 }, { "epoch": 1.0370139968895802, "grad_norm": 25.438154220581055, "learning_rate": 3.6353158137390505e-06, "logits/chosen": 0.6028562784194946, "logits/rejected": 4.041049003601074, "logps/chosen": -510.02557373046875, "logps/rejected": -868.3908081054688, "loss": 0.1768, "rewards/accuracies": 0.875, "rewards/chosen": -7.452632427215576, "rewards/margins": 18.50664520263672, "rewards/rejected": -25.959278106689453, "step": 1667 }, { "epoch": 1.0376360808709175, "grad_norm": 7.822577953338623, "learning_rate": 3.6341632088520057e-06, "logits/chosen": -3.6562254428863525, "logits/rejected": 0.9931538701057434, "logps/chosen": -330.18707275390625, "logps/rejected": -787.677734375, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": -4.651616096496582, "rewards/margins": 20.370330810546875, "rewards/rejected": -25.021947860717773, "step": 1668 }, { "epoch": 1.038258164852255, "grad_norm": 4.106832981109619, "learning_rate": 3.633010603964961e-06, "logits/chosen": -0.2826972007751465, "logits/rejected": 3.079453468322754, "logps/chosen": -547.0927734375, "logps/rejected": -959.0612182617188, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": -9.533456802368164, "rewards/margins": 22.27680206298828, "rewards/rejected": -31.810256958007812, "step": 1669 }, { "epoch": 1.0388802488335926, "grad_norm": 4.633700370788574, "learning_rate": 3.631857999077916e-06, "logits/chosen": -0.9866232872009277, "logits/rejected": 3.467489719390869, "logps/chosen": -432.56134033203125, "logps/rejected": -881.6924438476562, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -7.791879177093506, "rewards/margins": 22.992813110351562, "rewards/rejected": -30.784690856933594, "step": 1670 }, { "epoch": 1.03950233281493, "grad_norm": 1.7924748659133911, "learning_rate": 3.6307053941908714e-06, "logits/chosen": 1.0629141330718994, "logits/rejected": 3.884521722793579, "logps/chosen": -445.9981994628906, "logps/rejected": -854.4365844726562, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -6.354894638061523, "rewards/margins": 24.29794692993164, "rewards/rejected": -30.652841567993164, "step": 1671 }, { "epoch": 1.0401244167962675, "grad_norm": 0.001384186209179461, "learning_rate": 3.629552789303827e-06, "logits/chosen": 1.3695205450057983, "logits/rejected": 4.970004081726074, "logps/chosen": -430.239501953125, "logps/rejected": -870.437744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.031839370727539, "rewards/margins": 24.595035552978516, "rewards/rejected": -29.626874923706055, "step": 1672 }, { "epoch": 1.0407465007776049, "grad_norm": 0.32689252495765686, "learning_rate": 3.6284001844167822e-06, "logits/chosen": -0.18244385719299316, "logits/rejected": 2.874420166015625, "logps/chosen": -481.1783447265625, "logps/rejected": -967.6571044921875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -7.099664688110352, "rewards/margins": 25.957063674926758, "rewards/rejected": -33.05672836303711, "step": 1673 }, { "epoch": 1.0413685847589425, "grad_norm": 28.884672164916992, "learning_rate": 3.6272475795297375e-06, "logits/chosen": -0.16030162572860718, "logits/rejected": 3.1493980884552, "logps/chosen": -402.1942443847656, "logps/rejected": -800.2150268554688, "loss": 0.4067, "rewards/accuracies": 0.875, "rewards/chosen": -6.243395805358887, "rewards/margins": 16.908164978027344, "rewards/rejected": -23.151561737060547, "step": 1674 }, { "epoch": 1.04199066874028, "grad_norm": 0.03992212936282158, "learning_rate": 3.6260949746426927e-06, "logits/chosen": 0.6602403521537781, "logits/rejected": 3.8015923500061035, "logps/chosen": -679.17236328125, "logps/rejected": -1099.7989501953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.684561729431152, "rewards/margins": 26.821062088012695, "rewards/rejected": -36.50562286376953, "step": 1675 }, { "epoch": 1.0426127527216174, "grad_norm": 34.57837677001953, "learning_rate": 3.624942369755648e-06, "logits/chosen": 0.16994890570640564, "logits/rejected": 1.8535521030426025, "logps/chosen": -510.3432922363281, "logps/rejected": -723.0161743164062, "loss": 0.6064, "rewards/accuracies": 0.875, "rewards/chosen": -4.824977874755859, "rewards/margins": 18.82767105102539, "rewards/rejected": -23.652650833129883, "step": 1676 }, { "epoch": 1.043234836702955, "grad_norm": 4.1895599365234375, "learning_rate": 3.623789764868603e-06, "logits/chosen": -0.6701554656028748, "logits/rejected": 4.756260871887207, "logps/chosen": -426.80133056640625, "logps/rejected": -956.4772338867188, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -4.39846658706665, "rewards/margins": 22.706832885742188, "rewards/rejected": -27.105300903320312, "step": 1677 }, { "epoch": 1.0438569206842925, "grad_norm": 0.04196924343705177, "learning_rate": 3.6226371599815584e-06, "logits/chosen": 0.7270650267601013, "logits/rejected": 1.8944915533065796, "logps/chosen": -611.7488403320312, "logps/rejected": -976.1044921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.306270599365234, "rewards/margins": 25.221054077148438, "rewards/rejected": -32.52732467651367, "step": 1678 }, { "epoch": 1.0444790046656298, "grad_norm": 2.5633633136749268, "learning_rate": 3.621484555094514e-06, "logits/chosen": 2.270385503768921, "logits/rejected": 5.121720314025879, "logps/chosen": -519.3915405273438, "logps/rejected": -906.0528564453125, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": -9.927509307861328, "rewards/margins": 23.416658401489258, "rewards/rejected": -33.34416580200195, "step": 1679 }, { "epoch": 1.0451010886469674, "grad_norm": 0.004999854601919651, "learning_rate": 3.6203319502074692e-06, "logits/chosen": -0.6083928346633911, "logits/rejected": 3.2550253868103027, "logps/chosen": -405.646484375, "logps/rejected": -903.578857421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4245429039001465, "rewards/margins": 26.661523818969727, "rewards/rejected": -29.08606719970703, "step": 1680 }, { "epoch": 1.0457231726283047, "grad_norm": 4.50816260126885e-05, "learning_rate": 3.6191793453204245e-06, "logits/chosen": -0.9008429050445557, "logits/rejected": 1.5069544315338135, "logps/chosen": -469.40972900390625, "logps/rejected": -793.4561157226562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.385979652404785, "rewards/margins": 20.223291397094727, "rewards/rejected": -26.609272003173828, "step": 1681 }, { "epoch": 1.0463452566096423, "grad_norm": 7.654258728027344, "learning_rate": 3.6180267404333797e-06, "logits/chosen": -0.44024014472961426, "logits/rejected": 1.088015079498291, "logps/chosen": -698.7821044921875, "logps/rejected": -1008.3251953125, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": -9.793458938598633, "rewards/margins": 23.57197380065918, "rewards/rejected": -33.36543273925781, "step": 1682 }, { "epoch": 1.0469673405909798, "grad_norm": 0.00012983712076675147, "learning_rate": 3.616874135546335e-06, "logits/chosen": -2.612804412841797, "logits/rejected": 1.900937557220459, "logps/chosen": -354.28045654296875, "logps/rejected": -788.4923706054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.724903106689453, "rewards/margins": 25.43047523498535, "rewards/rejected": -30.155378341674805, "step": 1683 }, { "epoch": 1.0475894245723172, "grad_norm": 0.007622862234711647, "learning_rate": 3.61572153065929e-06, "logits/chosen": 1.4981238842010498, "logits/rejected": 3.9929189682006836, "logps/chosen": -509.0369567871094, "logps/rejected": -943.208740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.061955451965332, "rewards/margins": 24.140560150146484, "rewards/rejected": -33.2025146484375, "step": 1684 }, { "epoch": 1.0482115085536547, "grad_norm": 25.75448226928711, "learning_rate": 3.6145689257722454e-06, "logits/chosen": 2.079399585723877, "logits/rejected": 2.769184112548828, "logps/chosen": -719.6904907226562, "logps/rejected": -1111.2332763671875, "loss": 0.1932, "rewards/accuracies": 0.875, "rewards/chosen": -11.611705780029297, "rewards/margins": 29.136491775512695, "rewards/rejected": -40.748199462890625, "step": 1685 }, { "epoch": 1.0488335925349923, "grad_norm": 6.400043275789358e-06, "learning_rate": 3.6134163208852006e-06, "logits/chosen": 0.7596876621246338, "logits/rejected": 2.1933321952819824, "logps/chosen": -483.5215759277344, "logps/rejected": -831.635009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.2516350746154785, "rewards/margins": 24.35067367553711, "rewards/rejected": -30.602312088012695, "step": 1686 }, { "epoch": 1.0494556765163296, "grad_norm": 7.343322067754343e-06, "learning_rate": 3.6122637159981562e-06, "logits/chosen": -2.8173489570617676, "logits/rejected": 3.3061482906341553, "logps/chosen": -377.90399169921875, "logps/rejected": -955.3590698242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.850915908813477, "rewards/margins": 30.7342529296875, "rewards/rejected": -35.585166931152344, "step": 1687 }, { "epoch": 1.0500777604976672, "grad_norm": 0.28369632363319397, "learning_rate": 3.6111111111111115e-06, "logits/chosen": 1.0880281925201416, "logits/rejected": 3.659137725830078, "logps/chosen": -371.4967346191406, "logps/rejected": -714.4507446289062, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -4.510320663452148, "rewards/margins": 24.008285522460938, "rewards/rejected": -28.518606185913086, "step": 1688 }, { "epoch": 1.0506998444790048, "grad_norm": 0.8588642477989197, "learning_rate": 3.6099585062240667e-06, "logits/chosen": 0.7619567513465881, "logits/rejected": 3.8341445922851562, "logps/chosen": -413.8572998046875, "logps/rejected": -709.1582641601562, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -7.043118953704834, "rewards/margins": 17.45237922668457, "rewards/rejected": -24.495498657226562, "step": 1689 }, { "epoch": 1.0513219284603421, "grad_norm": 0.015677358955144882, "learning_rate": 3.608805901337022e-06, "logits/chosen": -1.9144474267959595, "logits/rejected": 4.085642337799072, "logps/chosen": -452.252197265625, "logps/rejected": -1200.351318359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.386390686035156, "rewards/margins": 35.35425567626953, "rewards/rejected": -48.74065017700195, "step": 1690 }, { "epoch": 1.0519440124416797, "grad_norm": 0.030642762780189514, "learning_rate": 3.607653296449977e-06, "logits/chosen": -3.1585073471069336, "logits/rejected": -0.3589048981666565, "logps/chosen": -465.46905517578125, "logps/rejected": -877.997314453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.650146484375, "rewards/margins": 23.00937271118164, "rewards/rejected": -32.65951919555664, "step": 1691 }, { "epoch": 1.052566096423017, "grad_norm": 0.0374862477183342, "learning_rate": 3.6065006915629324e-06, "logits/chosen": -0.11048626899719238, "logits/rejected": 3.808638095855713, "logps/chosen": -430.2825622558594, "logps/rejected": -957.116943359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.346963882446289, "rewards/margins": 26.467803955078125, "rewards/rejected": -33.81476974487305, "step": 1692 }, { "epoch": 1.0531881804043546, "grad_norm": 0.0897771343588829, "learning_rate": 3.6053480866758876e-06, "logits/chosen": -0.038357075303792953, "logits/rejected": 1.691274642944336, "logps/chosen": -655.2930908203125, "logps/rejected": -917.3342895507812, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -10.112953186035156, "rewards/margins": 17.01007843017578, "rewards/rejected": -27.123031616210938, "step": 1693 }, { "epoch": 1.0538102643856921, "grad_norm": 0.006796684116125107, "learning_rate": 3.6041954817888432e-06, "logits/chosen": -0.4090842008590698, "logits/rejected": 0.5765342712402344, "logps/chosen": -350.09661865234375, "logps/rejected": -703.912353515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.090215682983398, "rewards/margins": 24.583942413330078, "rewards/rejected": -33.674156188964844, "step": 1694 }, { "epoch": 1.0544323483670295, "grad_norm": 2.254205355711747e-05, "learning_rate": 3.6030428769017985e-06, "logits/chosen": -1.0816428661346436, "logits/rejected": 3.604936361312866, "logps/chosen": -298.3710021972656, "logps/rejected": -861.7861938476562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.984401702880859, "rewards/margins": 27.764816284179688, "rewards/rejected": -33.74921798706055, "step": 1695 }, { "epoch": 1.055054432348367, "grad_norm": 9.45249485084787e-05, "learning_rate": 3.6018902720147537e-06, "logits/chosen": 0.5957896113395691, "logits/rejected": 4.191173553466797, "logps/chosen": -351.27838134765625, "logps/rejected": -839.6342163085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.228531837463379, "rewards/margins": 27.103302001953125, "rewards/rejected": -32.33183288574219, "step": 1696 }, { "epoch": 1.0556765163297046, "grad_norm": 1.1835973262786865, "learning_rate": 3.600737667127709e-06, "logits/chosen": -0.7743030786514282, "logits/rejected": 3.2317123413085938, "logps/chosen": -473.1207275390625, "logps/rejected": -919.996826171875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -10.106958389282227, "rewards/margins": 20.596567153930664, "rewards/rejected": -30.70352554321289, "step": 1697 }, { "epoch": 1.056298600311042, "grad_norm": 0.09530337899923325, "learning_rate": 3.599585062240664e-06, "logits/chosen": 0.988264799118042, "logits/rejected": 4.399881839752197, "logps/chosen": -569.2879028320312, "logps/rejected": -964.9010009765625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.983223915100098, "rewards/margins": 26.619644165039062, "rewards/rejected": -33.602867126464844, "step": 1698 }, { "epoch": 1.0569206842923795, "grad_norm": 0.04010099917650223, "learning_rate": 3.5984324573536193e-06, "logits/chosen": 1.159104585647583, "logits/rejected": 4.351685523986816, "logps/chosen": -564.5264892578125, "logps/rejected": -908.9020385742188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.197347640991211, "rewards/margins": 18.77018928527832, "rewards/rejected": -30.96753692626953, "step": 1699 }, { "epoch": 1.0575427682737168, "grad_norm": 0.00016707685426808894, "learning_rate": 3.5972798524665746e-06, "logits/chosen": -0.8789552450180054, "logits/rejected": 2.5933098793029785, "logps/chosen": -367.5953369140625, "logps/rejected": -794.2536010742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.060921669006348, "rewards/margins": 25.133541107177734, "rewards/rejected": -31.1944637298584, "step": 1700 }, { "epoch": 1.0581648522550544, "grad_norm": 0.0005692985141649842, "learning_rate": 3.5961272475795302e-06, "logits/chosen": 1.1418954133987427, "logits/rejected": 4.3281450271606445, "logps/chosen": -504.8585205078125, "logps/rejected": -1039.9560546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.611676216125488, "rewards/margins": 33.13249206542969, "rewards/rejected": -38.74416732788086, "step": 1701 }, { "epoch": 1.058786936236392, "grad_norm": 25.801237106323242, "learning_rate": 3.5949746426924854e-06, "logits/chosen": 0.6158846616744995, "logits/rejected": 3.2959022521972656, "logps/chosen": -553.5155639648438, "logps/rejected": -928.0843505859375, "loss": 0.2626, "rewards/accuracies": 0.875, "rewards/chosen": -8.362363815307617, "rewards/margins": 24.19908905029297, "rewards/rejected": -32.56145477294922, "step": 1702 }, { "epoch": 1.0594090202177293, "grad_norm": 0.4599364101886749, "learning_rate": 3.5938220378054407e-06, "logits/chosen": -2.900252342224121, "logits/rejected": 1.8551745414733887, "logps/chosen": -292.502197265625, "logps/rejected": -844.1199951171875, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -5.056050777435303, "rewards/margins": 27.006755828857422, "rewards/rejected": -32.062808990478516, "step": 1703 }, { "epoch": 1.0600311041990669, "grad_norm": 1.3544723515224177e-05, "learning_rate": 3.592669432918396e-06, "logits/chosen": 1.1340935230255127, "logits/rejected": 4.3828887939453125, "logps/chosen": -554.423828125, "logps/rejected": -1023.113525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.717758178710938, "rewards/margins": 26.753938674926758, "rewards/rejected": -38.47169876098633, "step": 1704 }, { "epoch": 1.0606531881804044, "grad_norm": 0.0009403342264704406, "learning_rate": 3.591516828031351e-06, "logits/chosen": 3.1635279655456543, "logits/rejected": 3.546613931655884, "logps/chosen": -670.4608154296875, "logps/rejected": -1008.8966064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.73913288116455, "rewards/margins": 24.537559509277344, "rewards/rejected": -39.27669143676758, "step": 1705 }, { "epoch": 1.0612752721617418, "grad_norm": 0.11844262480735779, "learning_rate": 3.5903642231443063e-06, "logits/chosen": -1.9967526197433472, "logits/rejected": -0.313257098197937, "logps/chosen": -472.6290588378906, "logps/rejected": -765.5889892578125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -10.75210189819336, "rewards/margins": 18.136417388916016, "rewards/rejected": -28.888519287109375, "step": 1706 }, { "epoch": 1.0618973561430793, "grad_norm": 11.051610946655273, "learning_rate": 3.5892116182572616e-06, "logits/chosen": 2.81465744972229, "logits/rejected": 3.082577705383301, "logps/chosen": -748.4796142578125, "logps/rejected": -1038.966064453125, "loss": 0.0563, "rewards/accuracies": 1.0, "rewards/chosen": -11.436058044433594, "rewards/margins": 26.616634368896484, "rewards/rejected": -38.05268859863281, "step": 1707 }, { "epoch": 1.062519440124417, "grad_norm": 0.0004929989227093756, "learning_rate": 3.588059013370217e-06, "logits/chosen": -0.6941190958023071, "logits/rejected": 4.581844329833984, "logps/chosen": -417.68316650390625, "logps/rejected": -1064.1141357421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.887452125549316, "rewards/margins": 31.676464080810547, "rewards/rejected": -39.56391906738281, "step": 1708 }, { "epoch": 1.0631415241057542, "grad_norm": 0.06487241387367249, "learning_rate": 3.5869064084831724e-06, "logits/chosen": -1.3791570663452148, "logits/rejected": 3.591864585876465, "logps/chosen": -283.37664794921875, "logps/rejected": -892.575927734375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.048565864562988, "rewards/margins": 31.034317016601562, "rewards/rejected": -36.082881927490234, "step": 1709 }, { "epoch": 1.0637636080870918, "grad_norm": 1.2701135347015224e-05, "learning_rate": 3.5857538035961277e-06, "logits/chosen": -3.2900099754333496, "logits/rejected": 2.401799201965332, "logps/chosen": -363.16302490234375, "logps/rejected": -969.2822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.9275922775268555, "rewards/margins": 31.4039306640625, "rewards/rejected": -37.33152389526367, "step": 1710 }, { "epoch": 1.0643856920684291, "grad_norm": 1.2971514706805465e-06, "learning_rate": 3.584601198709083e-06, "logits/chosen": 0.3413795232772827, "logits/rejected": 4.562448501586914, "logps/chosen": -470.3218994140625, "logps/rejected": -1056.247314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.446603775024414, "rewards/margins": 33.22271728515625, "rewards/rejected": -41.66931915283203, "step": 1711 }, { "epoch": 1.0650077760497667, "grad_norm": 0.004861316177994013, "learning_rate": 3.583448593822038e-06, "logits/chosen": 0.6895210146903992, "logits/rejected": 3.1214401721954346, "logps/chosen": -584.0718383789062, "logps/rejected": -966.9913940429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.424615859985352, "rewards/margins": 28.374963760375977, "rewards/rejected": -38.799583435058594, "step": 1712 }, { "epoch": 1.0656298600311043, "grad_norm": 1.0662657022476196, "learning_rate": 3.5822959889349933e-06, "logits/chosen": 0.13189710676670074, "logits/rejected": 3.2180283069610596, "logps/chosen": -615.065185546875, "logps/rejected": -1035.4063720703125, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -11.89202880859375, "rewards/margins": 25.611513137817383, "rewards/rejected": -37.503543853759766, "step": 1713 }, { "epoch": 1.0662519440124416, "grad_norm": 0.06123606860637665, "learning_rate": 3.5811433840479486e-06, "logits/chosen": 0.0728345513343811, "logits/rejected": 2.7082386016845703, "logps/chosen": -618.5989990234375, "logps/rejected": -1061.659423828125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.811964988708496, "rewards/margins": 26.231304168701172, "rewards/rejected": -34.043270111083984, "step": 1714 }, { "epoch": 1.0668740279937792, "grad_norm": 0.47655490040779114, "learning_rate": 3.579990779160904e-06, "logits/chosen": -0.4628029465675354, "logits/rejected": 3.919542074203491, "logps/chosen": -585.7105712890625, "logps/rejected": -1029.9808349609375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -13.386682510375977, "rewards/margins": 21.445314407348633, "rewards/rejected": -34.83199691772461, "step": 1715 }, { "epoch": 1.0674961119751167, "grad_norm": 0.0005064127617515624, "learning_rate": 3.5788381742738594e-06, "logits/chosen": -0.7007519006729126, "logits/rejected": 3.573007583618164, "logps/chosen": -193.0349578857422, "logps/rejected": -660.3382568359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.492222547531128, "rewards/margins": 21.314695358276367, "rewards/rejected": -24.806917190551758, "step": 1716 }, { "epoch": 1.068118195956454, "grad_norm": 0.0011604634346440434, "learning_rate": 3.5776855693868147e-06, "logits/chosen": -0.47038382291793823, "logits/rejected": 3.3902623653411865, "logps/chosen": -441.37841796875, "logps/rejected": -1046.822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.868171691894531, "rewards/margins": 31.64148712158203, "rewards/rejected": -39.50965881347656, "step": 1717 }, { "epoch": 1.0687402799377916, "grad_norm": 2.1240060329437256, "learning_rate": 3.57653296449977e-06, "logits/chosen": 0.4144650101661682, "logits/rejected": 3.6955389976501465, "logps/chosen": -497.20538330078125, "logps/rejected": -924.2775268554688, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -9.828954696655273, "rewards/margins": 21.07897186279297, "rewards/rejected": -30.90792465209961, "step": 1718 }, { "epoch": 1.069362363919129, "grad_norm": 47.840118408203125, "learning_rate": 3.575380359612725e-06, "logits/chosen": 1.5221973657608032, "logits/rejected": 4.614435195922852, "logps/chosen": -642.9663696289062, "logps/rejected": -1006.8882446289062, "loss": 1.9259, "rewards/accuracies": 0.875, "rewards/chosen": -10.217392921447754, "rewards/margins": 26.158397674560547, "rewards/rejected": -36.375789642333984, "step": 1719 }, { "epoch": 1.0699844479004665, "grad_norm": 0.0057447003200650215, "learning_rate": 3.5742277547256803e-06, "logits/chosen": -1.6053816080093384, "logits/rejected": 3.832913398742676, "logps/chosen": -497.01678466796875, "logps/rejected": -1073.96142578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.212861061096191, "rewards/margins": 27.787372589111328, "rewards/rejected": -38.00023651123047, "step": 1720 }, { "epoch": 1.070606531881804, "grad_norm": 0.00011915850336663425, "learning_rate": 3.5730751498386356e-06, "logits/chosen": 2.358245611190796, "logits/rejected": 3.673961877822876, "logps/chosen": -703.9837646484375, "logps/rejected": -1114.497802734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.243408203125, "rewards/margins": 32.560333251953125, "rewards/rejected": -43.803741455078125, "step": 1721 }, { "epoch": 1.0712286158631414, "grad_norm": 0.002361687133088708, "learning_rate": 3.571922544951591e-06, "logits/chosen": 0.057686299085617065, "logits/rejected": 4.59140682220459, "logps/chosen": -405.74224853515625, "logps/rejected": -1008.8343505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.059910774230957, "rewards/margins": 30.22999382019043, "rewards/rejected": -37.28990173339844, "step": 1722 }, { "epoch": 1.071850699844479, "grad_norm": 0.14930595457553864, "learning_rate": 3.5707699400645464e-06, "logits/chosen": 0.9290033578872681, "logits/rejected": 3.07366943359375, "logps/chosen": -557.7803955078125, "logps/rejected": -876.728759765625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.407896041870117, "rewards/margins": 18.68692398071289, "rewards/rejected": -27.09482192993164, "step": 1723 }, { "epoch": 1.0724727838258166, "grad_norm": 1.5788064899879828e-07, "learning_rate": 3.5696173351775017e-06, "logits/chosen": 3.5497095584869385, "logits/rejected": 4.858210563659668, "logps/chosen": -683.814208984375, "logps/rejected": -1127.4013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.025189399719238, "rewards/margins": 35.34825134277344, "rewards/rejected": -47.37343978881836, "step": 1724 }, { "epoch": 1.073094867807154, "grad_norm": 5.755589008331299, "learning_rate": 3.568464730290457e-06, "logits/chosen": 1.5815749168395996, "logits/rejected": 2.7180745601654053, "logps/chosen": -666.0305786132812, "logps/rejected": -947.807373046875, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -11.947914123535156, "rewards/margins": 19.84267234802246, "rewards/rejected": -31.790584564208984, "step": 1725 }, { "epoch": 1.0737169517884915, "grad_norm": 0.0012952083488926291, "learning_rate": 3.567312125403412e-06, "logits/chosen": -1.330794095993042, "logits/rejected": 4.020459175109863, "logps/chosen": -424.281494140625, "logps/rejected": -1116.1873779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.352400779724121, "rewards/margins": 33.686038970947266, "rewards/rejected": -41.03843688964844, "step": 1726 }, { "epoch": 1.074339035769829, "grad_norm": 0.009987055324018002, "learning_rate": 3.5661595205163673e-06, "logits/chosen": 1.846252202987671, "logits/rejected": 3.3140053749084473, "logps/chosen": -703.4176025390625, "logps/rejected": -1091.683837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.896821022033691, "rewards/margins": 30.977752685546875, "rewards/rejected": -42.87457275390625, "step": 1727 }, { "epoch": 1.0749611197511664, "grad_norm": 0.7924360632896423, "learning_rate": 3.5650069156293226e-06, "logits/chosen": -0.5128199458122253, "logits/rejected": 2.4827842712402344, "logps/chosen": -590.5065307617188, "logps/rejected": -997.1490478515625, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -9.042410850524902, "rewards/margins": 26.186431884765625, "rewards/rejected": -35.228843688964844, "step": 1728 }, { "epoch": 1.075583203732504, "grad_norm": 0.13733063638210297, "learning_rate": 3.5638543107422778e-06, "logits/chosen": 2.4186601638793945, "logits/rejected": 3.949766159057617, "logps/chosen": -703.2379760742188, "logps/rejected": -1119.8372802734375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -10.43460464477539, "rewards/margins": 30.021484375, "rewards/rejected": -40.45608901977539, "step": 1729 }, { "epoch": 1.0762052877138413, "grad_norm": 2.674889628906385e-06, "learning_rate": 3.5627017058552334e-06, "logits/chosen": 1.08864426612854, "logits/rejected": 3.4560294151306152, "logps/chosen": -562.939208984375, "logps/rejected": -1042.669677734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.059733390808105, "rewards/margins": 28.375022888183594, "rewards/rejected": -36.434757232666016, "step": 1730 }, { "epoch": 1.0768273716951788, "grad_norm": 0.05376862734556198, "learning_rate": 3.5615491009681887e-06, "logits/chosen": 1.5392053127288818, "logits/rejected": 4.284621238708496, "logps/chosen": -443.6164855957031, "logps/rejected": -851.0294799804688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.83315372467041, "rewards/margins": 21.720170974731445, "rewards/rejected": -28.553325653076172, "step": 1731 }, { "epoch": 1.0774494556765164, "grad_norm": 7.555571210104972e-05, "learning_rate": 3.560396496081144e-06, "logits/chosen": -1.2423949241638184, "logits/rejected": 2.552158832550049, "logps/chosen": -444.58331298828125, "logps/rejected": -878.7606201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.577056407928467, "rewards/margins": 29.53194808959961, "rewards/rejected": -37.109004974365234, "step": 1732 }, { "epoch": 1.0780715396578537, "grad_norm": 0.08181095123291016, "learning_rate": 3.559243891194099e-06, "logits/chosen": 1.7129337787628174, "logits/rejected": 3.8680801391601562, "logps/chosen": -610.336669921875, "logps/rejected": -952.7198486328125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.26831579208374, "rewards/margins": 18.939006805419922, "rewards/rejected": -26.20732307434082, "step": 1733 }, { "epoch": 1.0786936236391913, "grad_norm": 0.15843388438224792, "learning_rate": 3.5580912863070543e-06, "logits/chosen": 1.7182284593582153, "logits/rejected": 3.604914903640747, "logps/chosen": -526.2333984375, "logps/rejected": -879.4224853515625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.743529319763184, "rewards/margins": 24.613826751708984, "rewards/rejected": -33.357357025146484, "step": 1734 }, { "epoch": 1.0793157076205289, "grad_norm": 0.1478980928659439, "learning_rate": 3.5569386814200096e-06, "logits/chosen": -3.3239331245422363, "logits/rejected": 2.918520450592041, "logps/chosen": -378.73895263671875, "logps/rejected": -867.5570068359375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.589836120605469, "rewards/margins": 21.376054763793945, "rewards/rejected": -29.96588897705078, "step": 1735 }, { "epoch": 1.0799377916018662, "grad_norm": 39.71168518066406, "learning_rate": 3.5557860765329648e-06, "logits/chosen": -0.2881968021392822, "logits/rejected": 2.109957695007324, "logps/chosen": -569.751220703125, "logps/rejected": -1004.1986083984375, "loss": 0.5705, "rewards/accuracies": 0.875, "rewards/chosen": -11.916030883789062, "rewards/margins": 23.231706619262695, "rewards/rejected": -35.14773941040039, "step": 1736 }, { "epoch": 1.0805598755832038, "grad_norm": 37.14696502685547, "learning_rate": 3.55463347164592e-06, "logits/chosen": -0.9090088605880737, "logits/rejected": 3.865264415740967, "logps/chosen": -479.4161682128906, "logps/rejected": -1054.5419921875, "loss": 0.7049, "rewards/accuracies": 0.875, "rewards/chosen": -8.393864631652832, "rewards/margins": 24.767200469970703, "rewards/rejected": -33.16106414794922, "step": 1737 }, { "epoch": 1.081181959564541, "grad_norm": 0.012204733677208424, "learning_rate": 3.5534808667588757e-06, "logits/chosen": 1.1127996444702148, "logits/rejected": 3.1084322929382324, "logps/chosen": -617.5895385742188, "logps/rejected": -993.505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.355681419372559, "rewards/margins": 26.18058204650879, "rewards/rejected": -34.53626251220703, "step": 1738 }, { "epoch": 1.0818040435458787, "grad_norm": 1.4145355635264423e-05, "learning_rate": 3.552328261871831e-06, "logits/chosen": 0.434572696685791, "logits/rejected": 2.9870080947875977, "logps/chosen": -521.6600341796875, "logps/rejected": -987.229248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.352119445800781, "rewards/margins": 28.358474731445312, "rewards/rejected": -35.710594177246094, "step": 1739 }, { "epoch": 1.0824261275272162, "grad_norm": 0.00888131745159626, "learning_rate": 3.551175656984786e-06, "logits/chosen": -1.785019874572754, "logits/rejected": 3.1531331539154053, "logps/chosen": -370.5480651855469, "logps/rejected": -925.7297973632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.955384731292725, "rewards/margins": 20.881046295166016, "rewards/rejected": -25.8364315032959, "step": 1740 }, { "epoch": 1.0830482115085536, "grad_norm": 52.913543701171875, "learning_rate": 3.5500230520977413e-06, "logits/chosen": 0.27387386560440063, "logits/rejected": 2.122863292694092, "logps/chosen": -518.5341186523438, "logps/rejected": -820.9344482421875, "loss": 1.1936, "rewards/accuracies": 0.875, "rewards/chosen": -8.488155364990234, "rewards/margins": 17.69202423095703, "rewards/rejected": -26.180179595947266, "step": 1741 }, { "epoch": 1.0836702954898911, "grad_norm": 0.0007935499306768179, "learning_rate": 3.5488704472106966e-06, "logits/chosen": 0.46489566564559937, "logits/rejected": 2.1978495121002197, "logps/chosen": -661.927734375, "logps/rejected": -1023.1362915039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.307547569274902, "rewards/margins": 29.69561004638672, "rewards/rejected": -39.00315475463867, "step": 1742 }, { "epoch": 1.0842923794712287, "grad_norm": 0.004735798109322786, "learning_rate": 3.5477178423236518e-06, "logits/chosen": -0.2975190281867981, "logits/rejected": 4.6708784103393555, "logps/chosen": -363.0914306640625, "logps/rejected": -1009.577880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.831550121307373, "rewards/margins": 28.918603897094727, "rewards/rejected": -32.750152587890625, "step": 1743 }, { "epoch": 1.084914463452566, "grad_norm": 2.3199174404144287, "learning_rate": 3.546565237436607e-06, "logits/chosen": 1.5427138805389404, "logits/rejected": 4.5314202308654785, "logps/chosen": -506.8985900878906, "logps/rejected": -889.401611328125, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -7.435576915740967, "rewards/margins": 23.069461822509766, "rewards/rejected": -30.505037307739258, "step": 1744 }, { "epoch": 1.0855365474339036, "grad_norm": 25.25603485107422, "learning_rate": 3.5454126325495627e-06, "logits/chosen": 1.1176072359085083, "logits/rejected": 2.8879714012145996, "logps/chosen": -601.0845336914062, "logps/rejected": -935.34716796875, "loss": 0.1484, "rewards/accuracies": 0.875, "rewards/chosen": -4.336665153503418, "rewards/margins": 25.64136505126953, "rewards/rejected": -29.978031158447266, "step": 1745 }, { "epoch": 1.0861586314152412, "grad_norm": 0.0002335595345357433, "learning_rate": 3.544260027662518e-06, "logits/chosen": -0.33439940214157104, "logits/rejected": 3.145282745361328, "logps/chosen": -417.33099365234375, "logps/rejected": -870.7086791992188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.937946319580078, "rewards/margins": 25.857038497924805, "rewards/rejected": -29.794986724853516, "step": 1746 }, { "epoch": 1.0867807153965785, "grad_norm": 37.67134094238281, "learning_rate": 3.543107422775473e-06, "logits/chosen": 2.3051233291625977, "logits/rejected": 5.67755126953125, "logps/chosen": -426.3350830078125, "logps/rejected": -745.7735595703125, "loss": 1.4902, "rewards/accuracies": 0.875, "rewards/chosen": -5.041298866271973, "rewards/margins": 14.599910736083984, "rewards/rejected": -19.641210556030273, "step": 1747 }, { "epoch": 1.087402799377916, "grad_norm": 27.38837432861328, "learning_rate": 3.5419548178884283e-06, "logits/chosen": 1.921449065208435, "logits/rejected": 3.4902946949005127, "logps/chosen": -623.1732788085938, "logps/rejected": -968.2666015625, "loss": 0.156, "rewards/accuracies": 0.875, "rewards/chosen": -9.072044372558594, "rewards/margins": 21.721195220947266, "rewards/rejected": -30.793241500854492, "step": 1748 }, { "epoch": 1.0880248833592534, "grad_norm": 0.005325262900441885, "learning_rate": 3.5408022130013836e-06, "logits/chosen": -0.21698921918869019, "logits/rejected": 1.6183581352233887, "logps/chosen": -479.52520751953125, "logps/rejected": -765.736083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.421666145324707, "rewards/margins": 21.242387771606445, "rewards/rejected": -25.664051055908203, "step": 1749 }, { "epoch": 1.088646967340591, "grad_norm": 29.812423706054688, "learning_rate": 3.5396496081143388e-06, "logits/chosen": 2.655641555786133, "logits/rejected": 3.9962892532348633, "logps/chosen": -595.0345458984375, "logps/rejected": -831.300537109375, "loss": 0.324, "rewards/accuracies": 0.875, "rewards/chosen": -8.994194030761719, "rewards/margins": 15.970343589782715, "rewards/rejected": -24.96453857421875, "step": 1750 }, { "epoch": 1.0892690513219285, "grad_norm": 0.00018733121396508068, "learning_rate": 3.538497003227294e-06, "logits/chosen": 2.5199174880981445, "logits/rejected": 4.3708176612854, "logps/chosen": -560.2379760742188, "logps/rejected": -865.1304931640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.548035621643066, "rewards/margins": 25.743770599365234, "rewards/rejected": -30.291807174682617, "step": 1751 }, { "epoch": 1.0898911353032659, "grad_norm": 0.00782470591366291, "learning_rate": 3.5373443983402496e-06, "logits/chosen": -0.2565594017505646, "logits/rejected": 1.5670526027679443, "logps/chosen": -402.8641662597656, "logps/rejected": -719.1917724609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.71183967590332, "rewards/margins": 18.037317276000977, "rewards/rejected": -22.749156951904297, "step": 1752 }, { "epoch": 1.0905132192846034, "grad_norm": 6.2403564453125, "learning_rate": 3.536191793453205e-06, "logits/chosen": 1.9860265254974365, "logits/rejected": 3.790701389312744, "logps/chosen": -570.9795532226562, "logps/rejected": -994.5543212890625, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": -6.19058895111084, "rewards/margins": 28.27306365966797, "rewards/rejected": -34.463653564453125, "step": 1753 }, { "epoch": 1.091135303265941, "grad_norm": 0.0002669897803571075, "learning_rate": 3.53503918856616e-06, "logits/chosen": -1.019551396369934, "logits/rejected": 3.5526912212371826, "logps/chosen": -352.00616455078125, "logps/rejected": -853.19921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6608293056488037, "rewards/margins": 24.306198120117188, "rewards/rejected": -27.967025756835938, "step": 1754 }, { "epoch": 1.0917573872472783, "grad_norm": 0.5336105227470398, "learning_rate": 3.5338865836791153e-06, "logits/chosen": 0.23992976546287537, "logits/rejected": 4.342520713806152, "logps/chosen": -516.0511474609375, "logps/rejected": -1050.3311767578125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -4.2335124015808105, "rewards/margins": 25.32655906677246, "rewards/rejected": -29.560073852539062, "step": 1755 }, { "epoch": 1.092379471228616, "grad_norm": 21.57744026184082, "learning_rate": 3.5327339787920705e-06, "logits/chosen": -4.463540554046631, "logits/rejected": 0.9967468976974487, "logps/chosen": -313.4986267089844, "logps/rejected": -840.9525146484375, "loss": 0.1341, "rewards/accuracies": 0.875, "rewards/chosen": -5.787725448608398, "rewards/margins": 21.620792388916016, "rewards/rejected": -27.408519744873047, "step": 1756 }, { "epoch": 1.0930015552099532, "grad_norm": 0.03455764427781105, "learning_rate": 3.5315813739050258e-06, "logits/chosen": -0.07754494249820709, "logits/rejected": 2.7861900329589844, "logps/chosen": -501.96295166015625, "logps/rejected": -887.67822265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.042675971984863, "rewards/margins": 20.873672485351562, "rewards/rejected": -28.91634750366211, "step": 1757 }, { "epoch": 1.0936236391912908, "grad_norm": 0.006460077594965696, "learning_rate": 3.530428769017981e-06, "logits/chosen": -0.666114330291748, "logits/rejected": 3.3996047973632812, "logps/chosen": -448.8841247558594, "logps/rejected": -914.3242797851562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.196425914764404, "rewards/margins": 28.60602378845215, "rewards/rejected": -33.802452087402344, "step": 1758 }, { "epoch": 1.0942457231726284, "grad_norm": 0.28905969858169556, "learning_rate": 3.5292761641309362e-06, "logits/chosen": 2.0526375770568848, "logits/rejected": 4.210762977600098, "logps/chosen": -552.855712890625, "logps/rejected": -991.937255859375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -6.942243576049805, "rewards/margins": 27.496761322021484, "rewards/rejected": -34.439002990722656, "step": 1759 }, { "epoch": 1.0948678071539657, "grad_norm": 3.9752464294433594, "learning_rate": 3.528123559243892e-06, "logits/chosen": 0.26489585638046265, "logits/rejected": 4.406553745269775, "logps/chosen": -454.697021484375, "logps/rejected": -943.942138671875, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": -3.372507095336914, "rewards/margins": 25.299306869506836, "rewards/rejected": -28.67181396484375, "step": 1760 }, { "epoch": 1.0954898911353033, "grad_norm": 0.10441815853118896, "learning_rate": 3.526970954356847e-06, "logits/chosen": -0.5542750954627991, "logits/rejected": 1.8777621984481812, "logps/chosen": -410.62689208984375, "logps/rejected": -739.4208984375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.8234503269195557, "rewards/margins": 20.886167526245117, "rewards/rejected": -24.709617614746094, "step": 1761 }, { "epoch": 1.0961119751166408, "grad_norm": 0.13274742662906647, "learning_rate": 3.5258183494698023e-06, "logits/chosen": 0.48963046073913574, "logits/rejected": 4.257356643676758, "logps/chosen": -567.223388671875, "logps/rejected": -1009.8548583984375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.456854343414307, "rewards/margins": 22.50156593322754, "rewards/rejected": -29.95842170715332, "step": 1762 }, { "epoch": 1.0967340590979782, "grad_norm": 3.6613287193176802e-06, "learning_rate": 3.5246657445827575e-06, "logits/chosen": 2.103224754333496, "logits/rejected": 3.8015027046203613, "logps/chosen": -587.1085205078125, "logps/rejected": -925.3865966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.701694965362549, "rewards/margins": 27.56255340576172, "rewards/rejected": -32.26424789428711, "step": 1763 }, { "epoch": 1.0973561430793157, "grad_norm": 0.017274130135774612, "learning_rate": 3.5235131396957128e-06, "logits/chosen": 0.8076609373092651, "logits/rejected": 4.674169063568115, "logps/chosen": -450.8760986328125, "logps/rejected": -957.8709716796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.7160773277282715, "rewards/margins": 27.142183303833008, "rewards/rejected": -34.85826110839844, "step": 1764 }, { "epoch": 1.0979782270606533, "grad_norm": 0.0009655518224462867, "learning_rate": 3.522360534808668e-06, "logits/chosen": 0.0774463415145874, "logits/rejected": 2.751673460006714, "logps/chosen": -576.5747680664062, "logps/rejected": -1003.846923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.677048683166504, "rewards/margins": 28.99536895751953, "rewards/rejected": -38.67241668701172, "step": 1765 }, { "epoch": 1.0986003110419906, "grad_norm": 0.015237356536090374, "learning_rate": 3.5212079299216232e-06, "logits/chosen": -0.023343242704868317, "logits/rejected": 3.3576998710632324, "logps/chosen": -518.5059204101562, "logps/rejected": -920.527099609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.050875663757324, "rewards/margins": 23.865657806396484, "rewards/rejected": -29.91653060913086, "step": 1766 }, { "epoch": 1.0992223950233282, "grad_norm": 33.6476936340332, "learning_rate": 3.520055325034579e-06, "logits/chosen": -0.05070209503173828, "logits/rejected": 4.106517791748047, "logps/chosen": -587.09130859375, "logps/rejected": -1130.2176513671875, "loss": 0.4541, "rewards/accuracies": 0.875, "rewards/chosen": -7.864964485168457, "rewards/margins": 25.942523956298828, "rewards/rejected": -33.80748748779297, "step": 1767 }, { "epoch": 1.0998444790046655, "grad_norm": 8.172510570148006e-05, "learning_rate": 3.518902720147534e-06, "logits/chosen": 1.3177427053451538, "logits/rejected": 2.9541220664978027, "logps/chosen": -635.0423583984375, "logps/rejected": -1084.8924560546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.324959754943848, "rewards/margins": 28.07250213623047, "rewards/rejected": -40.3974609375, "step": 1768 }, { "epoch": 1.100466562986003, "grad_norm": 0.029781443998217583, "learning_rate": 3.5177501152604893e-06, "logits/chosen": -0.6049349904060364, "logits/rejected": 3.1094350814819336, "logps/chosen": -471.39276123046875, "logps/rejected": -894.5453491210938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.47968864440918, "rewards/margins": 23.859102249145508, "rewards/rejected": -30.338787078857422, "step": 1769 }, { "epoch": 1.1010886469673407, "grad_norm": 4.44408655166626, "learning_rate": 3.5165975103734445e-06, "logits/chosen": 0.391635537147522, "logits/rejected": 2.7561874389648438, "logps/chosen": -515.3228149414062, "logps/rejected": -884.0820922851562, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": -8.186380386352539, "rewards/margins": 16.5716495513916, "rewards/rejected": -24.758028030395508, "step": 1770 }, { "epoch": 1.101710730948678, "grad_norm": 0.00016401683387812227, "learning_rate": 3.5154449054863993e-06, "logits/chosen": 0.2484072893857956, "logits/rejected": 3.562471628189087, "logps/chosen": -561.8585815429688, "logps/rejected": -1126.3258056640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.577777862548828, "rewards/margins": 31.752361297607422, "rewards/rejected": -42.33013916015625, "step": 1771 }, { "epoch": 1.1023328149300156, "grad_norm": 6.723960268573137e-06, "learning_rate": 3.5142923005993546e-06, "logits/chosen": -0.14800924062728882, "logits/rejected": 4.816242218017578, "logps/chosen": -402.42181396484375, "logps/rejected": -910.867431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.692221641540527, "rewards/margins": 23.456363677978516, "rewards/rejected": -31.14858627319336, "step": 1772 }, { "epoch": 1.1029548989113531, "grad_norm": 8.856857311911881e-05, "learning_rate": 3.51313969571231e-06, "logits/chosen": 0.9645895957946777, "logits/rejected": 3.0167455673217773, "logps/chosen": -675.44091796875, "logps/rejected": -964.310791015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.163752555847168, "rewards/margins": 26.749027252197266, "rewards/rejected": -34.91278076171875, "step": 1773 }, { "epoch": 1.1035769828926905, "grad_norm": 25.04258918762207, "learning_rate": 3.511987090825265e-06, "logits/chosen": -0.3559788465499878, "logits/rejected": 2.854271411895752, "logps/chosen": -573.7998657226562, "logps/rejected": -945.9089965820312, "loss": 0.311, "rewards/accuracies": 0.875, "rewards/chosen": -11.527469635009766, "rewards/margins": 23.415010452270508, "rewards/rejected": -34.942481994628906, "step": 1774 }, { "epoch": 1.104199066874028, "grad_norm": 0.0004723109886981547, "learning_rate": 3.5108344859382202e-06, "logits/chosen": 0.11427342891693115, "logits/rejected": 2.153625011444092, "logps/chosen": -475.01019287109375, "logps/rejected": -929.59228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.516587257385254, "rewards/margins": 28.553447723388672, "rewards/rejected": -34.07003402709961, "step": 1775 }, { "epoch": 1.1048211508553654, "grad_norm": 27.63475227355957, "learning_rate": 3.509681881051176e-06, "logits/chosen": 3.1944663524627686, "logits/rejected": 3.0088977813720703, "logps/chosen": -710.120849609375, "logps/rejected": -900.2386474609375, "loss": 0.3799, "rewards/accuracies": 0.875, "rewards/chosen": -7.0547027587890625, "rewards/margins": 22.974761962890625, "rewards/rejected": -30.029464721679688, "step": 1776 }, { "epoch": 1.105443234836703, "grad_norm": 26.656400680541992, "learning_rate": 3.508529276164131e-06, "logits/chosen": 1.5964024066925049, "logits/rejected": 3.5252232551574707, "logps/chosen": -550.2457885742188, "logps/rejected": -974.29296875, "loss": 0.2396, "rewards/accuracies": 0.875, "rewards/chosen": -8.295560836791992, "rewards/margins": 23.832962036132812, "rewards/rejected": -32.12852096557617, "step": 1777 }, { "epoch": 1.1060653188180405, "grad_norm": 0.04470802843570709, "learning_rate": 3.5073766712770863e-06, "logits/chosen": 1.9093613624572754, "logits/rejected": 3.0826358795166016, "logps/chosen": -693.091552734375, "logps/rejected": -931.0435180664062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.754042625427246, "rewards/margins": 23.099082946777344, "rewards/rejected": -34.853126525878906, "step": 1778 }, { "epoch": 1.1066874027993778, "grad_norm": 0.3163846433162689, "learning_rate": 3.5062240663900416e-06, "logits/chosen": -1.5664516687393188, "logits/rejected": 2.738992214202881, "logps/chosen": -354.3194885253906, "logps/rejected": -797.6347045898438, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.3946616649627686, "rewards/margins": 20.815555572509766, "rewards/rejected": -24.21021842956543, "step": 1779 }, { "epoch": 1.1073094867807154, "grad_norm": 32.263362884521484, "learning_rate": 3.505071461502997e-06, "logits/chosen": 0.2788873314857483, "logits/rejected": 3.975221633911133, "logps/chosen": -567.035888671875, "logps/rejected": -931.5633544921875, "loss": 0.5395, "rewards/accuracies": 0.875, "rewards/chosen": -7.741927146911621, "rewards/margins": 21.19263458251953, "rewards/rejected": -28.93456268310547, "step": 1780 }, { "epoch": 1.107931570762053, "grad_norm": 0.0009872695663943887, "learning_rate": 3.503918856615952e-06, "logits/chosen": 0.8488771915435791, "logits/rejected": 1.7789608240127563, "logps/chosen": -445.4822692871094, "logps/rejected": -737.8762817382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.745743274688721, "rewards/margins": 21.692352294921875, "rewards/rejected": -27.438098907470703, "step": 1781 }, { "epoch": 1.1085536547433903, "grad_norm": 0.0002924785949289799, "learning_rate": 3.5027662517289072e-06, "logits/chosen": 0.3400723934173584, "logits/rejected": 3.4457757472991943, "logps/chosen": -407.4837646484375, "logps/rejected": -932.68603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.976365089416504, "rewards/margins": 27.15723419189453, "rewards/rejected": -32.13359832763672, "step": 1782 }, { "epoch": 1.1091757387247279, "grad_norm": 1.0559955626376905e-05, "learning_rate": 3.5016136468418625e-06, "logits/chosen": -1.7059123516082764, "logits/rejected": 3.296755790710449, "logps/chosen": -360.22650146484375, "logps/rejected": -902.837158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.966699123382568, "rewards/margins": 26.81707000732422, "rewards/rejected": -34.78376770019531, "step": 1783 }, { "epoch": 1.1097978227060654, "grad_norm": 5.097897529602051, "learning_rate": 3.500461041954818e-06, "logits/chosen": 1.7751491069793701, "logits/rejected": 3.033304452896118, "logps/chosen": -599.8610229492188, "logps/rejected": -737.4515380859375, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": -7.404728412628174, "rewards/margins": 13.427257537841797, "rewards/rejected": -20.831985473632812, "step": 1784 }, { "epoch": 1.1104199066874028, "grad_norm": 0.36752787232398987, "learning_rate": 3.4993084370677733e-06, "logits/chosen": 0.8674039244651794, "logits/rejected": 3.6712758541107178, "logps/chosen": -614.0510864257812, "logps/rejected": -977.8643798828125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -9.517800331115723, "rewards/margins": 21.28858757019043, "rewards/rejected": -30.80638885498047, "step": 1785 }, { "epoch": 1.1110419906687403, "grad_norm": 7.078679686856049e-07, "learning_rate": 3.4981558321807286e-06, "logits/chosen": 1.386345386505127, "logits/rejected": 3.601700782775879, "logps/chosen": -534.5748291015625, "logps/rejected": -879.3649291992188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.662722587585449, "rewards/margins": 22.788742065429688, "rewards/rejected": -27.45146369934082, "step": 1786 }, { "epoch": 1.1116640746500777, "grad_norm": 0.01572837121784687, "learning_rate": 3.4970032272936838e-06, "logits/chosen": 1.7985820770263672, "logits/rejected": 2.970552682876587, "logps/chosen": -631.8236694335938, "logps/rejected": -1082.2847900390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.598766803741455, "rewards/margins": 28.447805404663086, "rewards/rejected": -36.04656982421875, "step": 1787 }, { "epoch": 1.1122861586314152, "grad_norm": 0.0024721594527363777, "learning_rate": 3.495850622406639e-06, "logits/chosen": -1.2049006223678589, "logits/rejected": 3.378148317337036, "logps/chosen": -365.1925964355469, "logps/rejected": -871.2085571289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.7494306564331055, "rewards/margins": 27.50840187072754, "rewards/rejected": -32.25783157348633, "step": 1788 }, { "epoch": 1.1129082426127528, "grad_norm": 0.0008935022633522749, "learning_rate": 3.4946980175195942e-06, "logits/chosen": -1.4913283586502075, "logits/rejected": 3.5099782943725586, "logps/chosen": -498.25164794921875, "logps/rejected": -1032.5533447265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.7877275943756104, "rewards/margins": 30.057601928710938, "rewards/rejected": -31.84532928466797, "step": 1789 }, { "epoch": 1.1135303265940901, "grad_norm": 0.45027968287467957, "learning_rate": 3.4935454126325495e-06, "logits/chosen": 0.22388708591461182, "logits/rejected": 4.521572113037109, "logps/chosen": -420.34503173828125, "logps/rejected": -930.3162841796875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -6.024784088134766, "rewards/margins": 24.955102920532227, "rewards/rejected": -30.97988510131836, "step": 1790 }, { "epoch": 1.1141524105754277, "grad_norm": 16.772062301635742, "learning_rate": 3.492392807745505e-06, "logits/chosen": 1.6867791414260864, "logits/rejected": 2.316291332244873, "logps/chosen": -534.1275024414062, "logps/rejected": -781.1571044921875, "loss": 0.0837, "rewards/accuracies": 1.0, "rewards/chosen": -9.479506492614746, "rewards/margins": 22.093637466430664, "rewards/rejected": -31.573143005371094, "step": 1791 }, { "epoch": 1.1147744945567652, "grad_norm": 2.7089431285858154, "learning_rate": 3.4912402028584603e-06, "logits/chosen": -1.008533000946045, "logits/rejected": 2.780308246612549, "logps/chosen": -537.2943725585938, "logps/rejected": -1070.39501953125, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -8.919048309326172, "rewards/margins": 26.721981048583984, "rewards/rejected": -35.641029357910156, "step": 1792 }, { "epoch": 1.1153965785381026, "grad_norm": 0.09999026358127594, "learning_rate": 3.4900875979714156e-06, "logits/chosen": 0.9722707271575928, "logits/rejected": 2.4060893058776855, "logps/chosen": -599.2125244140625, "logps/rejected": -940.9347534179688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.931610584259033, "rewards/margins": 24.689077377319336, "rewards/rejected": -31.62068748474121, "step": 1793 }, { "epoch": 1.1160186625194402, "grad_norm": 0.0830949917435646, "learning_rate": 3.4889349930843708e-06, "logits/chosen": 1.924202799797058, "logits/rejected": 3.6883726119995117, "logps/chosen": -553.5717163085938, "logps/rejected": -947.3374633789062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.690497398376465, "rewards/margins": 26.857397079467773, "rewards/rejected": -32.54789733886719, "step": 1794 }, { "epoch": 1.1166407465007775, "grad_norm": 2.811249032674823e-06, "learning_rate": 3.487782388197326e-06, "logits/chosen": -1.5614333152770996, "logits/rejected": 1.486207365989685, "logps/chosen": -335.30596923828125, "logps/rejected": -901.7238159179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.4471044540405273, "rewards/margins": 29.40472984313965, "rewards/rejected": -32.85183334350586, "step": 1795 }, { "epoch": 1.117262830482115, "grad_norm": 0.00018924751202575862, "learning_rate": 3.4866297833102812e-06, "logits/chosen": -1.4683024883270264, "logits/rejected": 5.106406211853027, "logps/chosen": -420.83184814453125, "logps/rejected": -1123.9346923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.011943817138672, "rewards/margins": 36.865928649902344, "rewards/rejected": -42.877872467041016, "step": 1796 }, { "epoch": 1.1178849144634526, "grad_norm": 0.023054877296090126, "learning_rate": 3.4854771784232365e-06, "logits/chosen": -0.9309723973274231, "logits/rejected": 4.368443965911865, "logps/chosen": -336.5071716308594, "logps/rejected": -876.109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.8464765548706055, "rewards/margins": 22.422441482543945, "rewards/rejected": -27.268917083740234, "step": 1797 }, { "epoch": 1.11850699844479, "grad_norm": 0.00115344044752419, "learning_rate": 3.484324573536192e-06, "logits/chosen": -1.9685173034667969, "logits/rejected": 4.014041423797607, "logps/chosen": -358.6552734375, "logps/rejected": -1036.917724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.902338981628418, "rewards/margins": 30.619861602783203, "rewards/rejected": -35.52220153808594, "step": 1798 }, { "epoch": 1.1191290824261275, "grad_norm": 29.647974014282227, "learning_rate": 3.4831719686491473e-06, "logits/chosen": 0.758859395980835, "logits/rejected": 3.8699092864990234, "logps/chosen": -597.4893798828125, "logps/rejected": -891.358154296875, "loss": 0.1947, "rewards/accuracies": 0.875, "rewards/chosen": -8.706351280212402, "rewards/margins": 18.08136749267578, "rewards/rejected": -26.7877197265625, "step": 1799 }, { "epoch": 1.119751166407465, "grad_norm": 1.148080173152266e-05, "learning_rate": 3.4820193637621026e-06, "logits/chosen": -1.1118121147155762, "logits/rejected": 4.116186141967773, "logps/chosen": -457.7972106933594, "logps/rejected": -1084.82958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.23808479309082, "rewards/margins": 31.739526748657227, "rewards/rejected": -37.97761154174805, "step": 1800 }, { "epoch": 1.1203732503888024, "grad_norm": 0.008707517758011818, "learning_rate": 3.4808667588750578e-06, "logits/chosen": -0.4364929497241974, "logits/rejected": 3.4837522506713867, "logps/chosen": -444.2327575683594, "logps/rejected": -858.1317749023438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.276412963867188, "rewards/margins": 18.92943572998047, "rewards/rejected": -29.205846786499023, "step": 1801 }, { "epoch": 1.12099533437014, "grad_norm": 11.017598152160645, "learning_rate": 3.479714153988013e-06, "logits/chosen": -1.9811372756958008, "logits/rejected": 2.453932523727417, "logps/chosen": -341.76824951171875, "logps/rejected": -778.7388916015625, "loss": 0.2198, "rewards/accuracies": 0.875, "rewards/chosen": -4.120153903961182, "rewards/margins": 24.861141204833984, "rewards/rejected": -28.98129653930664, "step": 1802 }, { "epoch": 1.1216174183514775, "grad_norm": 0.12693174183368683, "learning_rate": 3.4785615491009682e-06, "logits/chosen": 1.7608217000961304, "logits/rejected": 4.857548713684082, "logps/chosen": -597.873291015625, "logps/rejected": -1057.2740478515625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.650867462158203, "rewards/margins": 31.496370315551758, "rewards/rejected": -40.147239685058594, "step": 1803 }, { "epoch": 1.1222395023328149, "grad_norm": 20.185020446777344, "learning_rate": 3.4774089442139235e-06, "logits/chosen": 1.1995744705200195, "logits/rejected": 2.085767984390259, "logps/chosen": -576.979248046875, "logps/rejected": -812.4219970703125, "loss": 0.1261, "rewards/accuracies": 1.0, "rewards/chosen": -5.986093521118164, "rewards/margins": 15.25129508972168, "rewards/rejected": -21.237390518188477, "step": 1804 }, { "epoch": 1.1228615863141524, "grad_norm": 0.004299265798181295, "learning_rate": 3.476256339326879e-06, "logits/chosen": 1.5160009860992432, "logits/rejected": 4.4680047035217285, "logps/chosen": -594.9562377929688, "logps/rejected": -942.6099243164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.65675163269043, "rewards/margins": 21.242027282714844, "rewards/rejected": -32.89877700805664, "step": 1805 }, { "epoch": 1.1234836702954898, "grad_norm": 8.741262435913086, "learning_rate": 3.4751037344398343e-06, "logits/chosen": -1.0180206298828125, "logits/rejected": 4.231414318084717, "logps/chosen": -440.399169921875, "logps/rejected": -888.4819946289062, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": -4.514710426330566, "rewards/margins": 19.439733505249023, "rewards/rejected": -23.954442977905273, "step": 1806 }, { "epoch": 1.1241057542768274, "grad_norm": 0.0063452026806771755, "learning_rate": 3.4739511295527896e-06, "logits/chosen": 2.629770278930664, "logits/rejected": 3.7640302181243896, "logps/chosen": -643.5936279296875, "logps/rejected": -977.7257080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.321480751037598, "rewards/margins": 25.136545181274414, "rewards/rejected": -32.45802688598633, "step": 1807 }, { "epoch": 1.124727838258165, "grad_norm": 5.7119975090026855, "learning_rate": 3.4727985246657448e-06, "logits/chosen": -0.7087704539299011, "logits/rejected": 2.7257461547851562, "logps/chosen": -487.17840576171875, "logps/rejected": -791.4234619140625, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": -5.3370232582092285, "rewards/margins": 14.614947319030762, "rewards/rejected": -19.95197105407715, "step": 1808 }, { "epoch": 1.1253499222395023, "grad_norm": 0.023234574124217033, "learning_rate": 3.4716459197787e-06, "logits/chosen": 0.3555334806442261, "logits/rejected": 2.6853690147399902, "logps/chosen": -598.6488037109375, "logps/rejected": -956.1478271484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.7893404960632324, "rewards/margins": 23.659915924072266, "rewards/rejected": -26.449256896972656, "step": 1809 }, { "epoch": 1.1259720062208398, "grad_norm": 0.003041701391339302, "learning_rate": 3.4704933148916552e-06, "logits/chosen": 2.4652957916259766, "logits/rejected": 4.392395973205566, "logps/chosen": -628.3650512695312, "logps/rejected": -973.9765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.876049041748047, "rewards/margins": 28.1997127532959, "rewards/rejected": -35.07575988769531, "step": 1810 }, { "epoch": 1.1265940902021774, "grad_norm": 0.15895365178585052, "learning_rate": 3.4693407100046105e-06, "logits/chosen": -0.8710925579071045, "logits/rejected": 2.300516128540039, "logps/chosen": -419.5194396972656, "logps/rejected": -712.6929321289062, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.929962158203125, "rewards/margins": 13.017057418823242, "rewards/rejected": -15.947019577026367, "step": 1811 }, { "epoch": 1.1272161741835147, "grad_norm": 12.767523765563965, "learning_rate": 3.4681881051175657e-06, "logits/chosen": -1.348296880722046, "logits/rejected": 3.6018869876861572, "logps/chosen": -453.49639892578125, "logps/rejected": -975.9755859375, "loss": 0.0632, "rewards/accuracies": 1.0, "rewards/chosen": -5.690885066986084, "rewards/margins": 29.5626220703125, "rewards/rejected": -35.253509521484375, "step": 1812 }, { "epoch": 1.1278382581648523, "grad_norm": 0.8971028923988342, "learning_rate": 3.4670355002305213e-06, "logits/chosen": -0.9237762689590454, "logits/rejected": 2.268069267272949, "logps/chosen": -493.8834228515625, "logps/rejected": -852.326416015625, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -3.8636460304260254, "rewards/margins": 20.109888076782227, "rewards/rejected": -23.973533630371094, "step": 1813 }, { "epoch": 1.1284603421461896, "grad_norm": 9.861696243286133, "learning_rate": 3.4658828953434765e-06, "logits/chosen": -1.5349409580230713, "logits/rejected": 3.1588385105133057, "logps/chosen": -334.5981140136719, "logps/rejected": -751.074462890625, "loss": 0.0537, "rewards/accuracies": 1.0, "rewards/chosen": -4.684638977050781, "rewards/margins": 21.330123901367188, "rewards/rejected": -26.0147647857666, "step": 1814 }, { "epoch": 1.1290824261275272, "grad_norm": 0.10593397915363312, "learning_rate": 3.4647302904564318e-06, "logits/chosen": -0.2577982544898987, "logits/rejected": 1.7699002027511597, "logps/chosen": -580.86181640625, "logps/rejected": -899.0047607421875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.7143940925598145, "rewards/margins": 23.71632194519043, "rewards/rejected": -31.430715560913086, "step": 1815 }, { "epoch": 1.1297045101088647, "grad_norm": 0.03057820163667202, "learning_rate": 3.463577685569387e-06, "logits/chosen": 2.243663787841797, "logits/rejected": 4.455940246582031, "logps/chosen": -518.3104248046875, "logps/rejected": -925.2927856445312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.088144302368164, "rewards/margins": 24.08509063720703, "rewards/rejected": -30.173236846923828, "step": 1816 }, { "epoch": 1.130326594090202, "grad_norm": 0.29756349325180054, "learning_rate": 3.4624250806823422e-06, "logits/chosen": 1.827619194984436, "logits/rejected": 3.9997007846832275, "logps/chosen": -611.642578125, "logps/rejected": -1030.654296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -6.628411293029785, "rewards/margins": 25.694616317749023, "rewards/rejected": -32.323028564453125, "step": 1817 }, { "epoch": 1.1309486780715396, "grad_norm": 0.008640704676508904, "learning_rate": 3.4612724757952974e-06, "logits/chosen": 1.3161953687667847, "logits/rejected": 3.7667105197906494, "logps/chosen": -409.8922119140625, "logps/rejected": -770.370849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.131985187530518, "rewards/margins": 24.006284713745117, "rewards/rejected": -30.138267517089844, "step": 1818 }, { "epoch": 1.1315707620528772, "grad_norm": 0.05921082943677902, "learning_rate": 3.4601198709082527e-06, "logits/chosen": 2.588874101638794, "logits/rejected": 2.696937084197998, "logps/chosen": -628.57568359375, "logps/rejected": -915.2703857421875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.494431495666504, "rewards/margins": 29.17119026184082, "rewards/rejected": -37.66562271118164, "step": 1819 }, { "epoch": 1.1321928460342146, "grad_norm": 18.75832748413086, "learning_rate": 3.4589672660212083e-06, "logits/chosen": -0.10499536991119385, "logits/rejected": 2.8624749183654785, "logps/chosen": -397.4913330078125, "logps/rejected": -832.5264892578125, "loss": 0.0655, "rewards/accuracies": 1.0, "rewards/chosen": -6.647233486175537, "rewards/margins": 26.839122772216797, "rewards/rejected": -33.486358642578125, "step": 1820 }, { "epoch": 1.1328149300155521, "grad_norm": 0.5162553191184998, "learning_rate": 3.4578146611341635e-06, "logits/chosen": -0.5121818780899048, "logits/rejected": 4.498852252960205, "logps/chosen": -433.69439697265625, "logps/rejected": -1014.330322265625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.915709972381592, "rewards/margins": 24.85306167602539, "rewards/rejected": -30.76877212524414, "step": 1821 }, { "epoch": 1.1334370139968897, "grad_norm": 16.169828414916992, "learning_rate": 3.4566620562471188e-06, "logits/chosen": -2.0768256187438965, "logits/rejected": 3.2809956073760986, "logps/chosen": -361.49493408203125, "logps/rejected": -873.9121704101562, "loss": 0.1493, "rewards/accuracies": 0.875, "rewards/chosen": -1.6932404041290283, "rewards/margins": 18.26701545715332, "rewards/rejected": -19.960254669189453, "step": 1822 }, { "epoch": 1.134059097978227, "grad_norm": 2.1423237323760986, "learning_rate": 3.455509451360074e-06, "logits/chosen": 1.41920804977417, "logits/rejected": 4.007068634033203, "logps/chosen": -520.28076171875, "logps/rejected": -855.2392578125, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -5.254186153411865, "rewards/margins": 23.409135818481445, "rewards/rejected": -28.6633243560791, "step": 1823 }, { "epoch": 1.1346811819595646, "grad_norm": 40.92685317993164, "learning_rate": 3.4543568464730292e-06, "logits/chosen": -0.6169062852859497, "logits/rejected": 0.6554781198501587, "logps/chosen": -348.8257141113281, "logps/rejected": -584.2406005859375, "loss": 1.5563, "rewards/accuracies": 0.75, "rewards/chosen": -6.060636043548584, "rewards/margins": 13.35194206237793, "rewards/rejected": -19.412578582763672, "step": 1824 }, { "epoch": 1.1353032659409021, "grad_norm": 25.226947784423828, "learning_rate": 3.4532042415859844e-06, "logits/chosen": -2.031526565551758, "logits/rejected": 0.9291627407073975, "logps/chosen": -439.3551025390625, "logps/rejected": -834.6775512695312, "loss": 0.2199, "rewards/accuracies": 0.875, "rewards/chosen": -4.6891093254089355, "rewards/margins": 20.3415584564209, "rewards/rejected": -25.03066635131836, "step": 1825 }, { "epoch": 1.1359253499222395, "grad_norm": 0.34828466176986694, "learning_rate": 3.4520516366989397e-06, "logits/chosen": -0.45988568663597107, "logits/rejected": 4.3835344314575195, "logps/chosen": -523.087890625, "logps/rejected": -990.68603515625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -5.440042495727539, "rewards/margins": 24.660259246826172, "rewards/rejected": -30.100299835205078, "step": 1826 }, { "epoch": 1.136547433903577, "grad_norm": 0.009784924797713757, "learning_rate": 3.4508990318118953e-06, "logits/chosen": 1.2427332401275635, "logits/rejected": 5.04477596282959, "logps/chosen": -479.31439208984375, "logps/rejected": -879.8087158203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.576506614685059, "rewards/margins": 22.55929183959961, "rewards/rejected": -31.135799407958984, "step": 1827 }, { "epoch": 1.1371695178849144, "grad_norm": 0.010587329044938087, "learning_rate": 3.4497464269248505e-06, "logits/chosen": 1.4068046808242798, "logits/rejected": 4.196283340454102, "logps/chosen": -501.2727355957031, "logps/rejected": -985.1683349609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.856316566467285, "rewards/margins": 28.490116119384766, "rewards/rejected": -34.346435546875, "step": 1828 }, { "epoch": 1.137791601866252, "grad_norm": 0.047132086008787155, "learning_rate": 3.4485938220378058e-06, "logits/chosen": 2.6061911582946777, "logits/rejected": 3.4840335845947266, "logps/chosen": -646.5637817382812, "logps/rejected": -892.6546630859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.78953742980957, "rewards/margins": 22.944318771362305, "rewards/rejected": -28.733856201171875, "step": 1829 }, { "epoch": 1.1384136858475895, "grad_norm": 0.00011491310579003766, "learning_rate": 3.447441217150761e-06, "logits/chosen": 3.098417043685913, "logits/rejected": 4.135062217712402, "logps/chosen": -601.7989501953125, "logps/rejected": -902.9705810546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.078056335449219, "rewards/margins": 26.534677505493164, "rewards/rejected": -32.612735748291016, "step": 1830 }, { "epoch": 1.1390357698289268, "grad_norm": 4.412220001220703, "learning_rate": 3.4462886122637162e-06, "logits/chosen": 1.1942940950393677, "logits/rejected": 2.7071847915649414, "logps/chosen": -393.26165771484375, "logps/rejected": -777.9138793945312, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": -4.756586074829102, "rewards/margins": 20.164287567138672, "rewards/rejected": -24.920875549316406, "step": 1831 }, { "epoch": 1.1396578538102644, "grad_norm": 0.0028339342679828405, "learning_rate": 3.4451360073766714e-06, "logits/chosen": 0.46175873279571533, "logits/rejected": 4.249823570251465, "logps/chosen": -444.9114074707031, "logps/rejected": -845.7836303710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.111077308654785, "rewards/margins": 18.86326026916504, "rewards/rejected": -22.97433853149414, "step": 1832 }, { "epoch": 1.1402799377916017, "grad_norm": 21.523834228515625, "learning_rate": 3.4439834024896267e-06, "logits/chosen": 0.8463013172149658, "logits/rejected": 3.7445366382598877, "logps/chosen": -480.28466796875, "logps/rejected": -760.3237915039062, "loss": 0.1944, "rewards/accuracies": 0.875, "rewards/chosen": -6.874420166015625, "rewards/margins": 18.69571304321289, "rewards/rejected": -25.570131301879883, "step": 1833 }, { "epoch": 1.1409020217729393, "grad_norm": 0.8458882570266724, "learning_rate": 3.442830797602582e-06, "logits/chosen": 1.6432691812515259, "logits/rejected": 4.076677322387695, "logps/chosen": -539.7843017578125, "logps/rejected": -844.2291259765625, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -5.506782531738281, "rewards/margins": 13.569256782531738, "rewards/rejected": -19.076038360595703, "step": 1834 }, { "epoch": 1.1415241057542769, "grad_norm": 0.08397660404443741, "learning_rate": 3.4416781927155375e-06, "logits/chosen": -0.7760798335075378, "logits/rejected": 3.8760299682617188, "logps/chosen": -462.271484375, "logps/rejected": -995.9507446289062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.9000444412231445, "rewards/margins": 27.665067672729492, "rewards/rejected": -32.56511306762695, "step": 1835 }, { "epoch": 1.1421461897356142, "grad_norm": 30.610496520996094, "learning_rate": 3.4405255878284928e-06, "logits/chosen": 1.3444571495056152, "logits/rejected": 2.9977803230285645, "logps/chosen": -615.3402709960938, "logps/rejected": -886.0873413085938, "loss": 0.2224, "rewards/accuracies": 0.875, "rewards/chosen": -6.009075164794922, "rewards/margins": 18.689491271972656, "rewards/rejected": -24.698566436767578, "step": 1836 }, { "epoch": 1.1427682737169518, "grad_norm": 0.008456461131572723, "learning_rate": 3.439372982941448e-06, "logits/chosen": 0.7336775064468384, "logits/rejected": 1.3986623287200928, "logps/chosen": -559.4346923828125, "logps/rejected": -851.5731201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.957766056060791, "rewards/margins": 24.573040008544922, "rewards/rejected": -31.530807495117188, "step": 1837 }, { "epoch": 1.1433903576982893, "grad_norm": 0.3551645576953888, "learning_rate": 3.4382203780544032e-06, "logits/chosen": -1.7950788736343384, "logits/rejected": 0.5798860788345337, "logps/chosen": -513.7240600585938, "logps/rejected": -909.632568359375, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -5.312442779541016, "rewards/margins": 27.34845733642578, "rewards/rejected": -32.66089630126953, "step": 1838 }, { "epoch": 1.1440124416796267, "grad_norm": 0.0067468322813510895, "learning_rate": 3.4370677731673584e-06, "logits/chosen": -0.35771840810775757, "logits/rejected": 1.6661145687103271, "logps/chosen": -544.712890625, "logps/rejected": -888.5681762695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.579408168792725, "rewards/margins": 24.83259391784668, "rewards/rejected": -29.412002563476562, "step": 1839 }, { "epoch": 1.1446345256609642, "grad_norm": 0.06160581111907959, "learning_rate": 3.4359151682803137e-06, "logits/chosen": 1.519280195236206, "logits/rejected": 4.197678565979004, "logps/chosen": -560.5360107421875, "logps/rejected": -958.4043579101562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.82316780090332, "rewards/margins": 23.101886749267578, "rewards/rejected": -28.925052642822266, "step": 1840 }, { "epoch": 1.1452566096423018, "grad_norm": 0.05726956948637962, "learning_rate": 3.434762563393269e-06, "logits/chosen": -1.2474236488342285, "logits/rejected": 3.9748692512512207, "logps/chosen": -507.4255065917969, "logps/rejected": -922.804443359375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.531657218933105, "rewards/margins": 16.375091552734375, "rewards/rejected": -24.90674591064453, "step": 1841 }, { "epoch": 1.1458786936236391, "grad_norm": 0.13852474093437195, "learning_rate": 3.4336099585062245e-06, "logits/chosen": -4.124720573425293, "logits/rejected": 1.3218566179275513, "logps/chosen": -302.812255859375, "logps/rejected": -713.38720703125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.925245761871338, "rewards/margins": 17.303882598876953, "rewards/rejected": -21.229129791259766, "step": 1842 }, { "epoch": 1.1465007776049767, "grad_norm": 0.4648420810699463, "learning_rate": 3.4324573536191798e-06, "logits/chosen": -1.682816982269287, "logits/rejected": 1.9900355339050293, "logps/chosen": -366.4227600097656, "logps/rejected": -789.5729370117188, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -5.685972213745117, "rewards/margins": 22.045013427734375, "rewards/rejected": -27.730985641479492, "step": 1843 }, { "epoch": 1.1471228615863143, "grad_norm": 0.00197424809448421, "learning_rate": 3.431304748732135e-06, "logits/chosen": 2.2375340461730957, "logits/rejected": 3.6597280502319336, "logps/chosen": -543.592529296875, "logps/rejected": -841.5507202148438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5524721145629883, "rewards/margins": 21.644725799560547, "rewards/rejected": -25.19719696044922, "step": 1844 }, { "epoch": 1.1477449455676516, "grad_norm": 0.0012396638048812747, "learning_rate": 3.43015214384509e-06, "logits/chosen": 0.10678932815790176, "logits/rejected": 2.318019390106201, "logps/chosen": -518.6724853515625, "logps/rejected": -880.78759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.50995397567749, "rewards/margins": 25.387535095214844, "rewards/rejected": -29.89748764038086, "step": 1845 }, { "epoch": 1.1483670295489892, "grad_norm": 29.93779945373535, "learning_rate": 3.4289995389580454e-06, "logits/chosen": -0.42190590500831604, "logits/rejected": 3.944661855697632, "logps/chosen": -453.431640625, "logps/rejected": -858.2440185546875, "loss": 0.3281, "rewards/accuracies": 0.875, "rewards/chosen": -7.201959133148193, "rewards/margins": 17.08089256286621, "rewards/rejected": -24.282852172851562, "step": 1846 }, { "epoch": 1.1489891135303265, "grad_norm": 0.016757098957896233, "learning_rate": 3.4278469340710007e-06, "logits/chosen": -0.9321748614311218, "logits/rejected": 2.595136880874634, "logps/chosen": -430.6005859375, "logps/rejected": -932.3299560546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.7968358993530273, "rewards/margins": 21.861114501953125, "rewards/rejected": -25.657949447631836, "step": 1847 }, { "epoch": 1.149611197511664, "grad_norm": 0.001826676307246089, "learning_rate": 3.426694329183956e-06, "logits/chosen": -1.5225608348846436, "logits/rejected": 3.197697162628174, "logps/chosen": -296.51165771484375, "logps/rejected": -825.5092163085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.746575117111206, "rewards/margins": 27.655593872070312, "rewards/rejected": -29.40216827392578, "step": 1848 }, { "epoch": 1.1502332814930016, "grad_norm": 0.0038855511229485273, "learning_rate": 3.4255417242969115e-06, "logits/chosen": -2.6227917671203613, "logits/rejected": 0.8911744356155396, "logps/chosen": -415.6127014160156, "logps/rejected": -1022.6885375976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.036016464233398, "rewards/margins": 29.356048583984375, "rewards/rejected": -37.392066955566406, "step": 1849 }, { "epoch": 1.150855365474339, "grad_norm": 30.342679977416992, "learning_rate": 3.4243891194098668e-06, "logits/chosen": 1.395837426185608, "logits/rejected": 2.462480306625366, "logps/chosen": -599.7311401367188, "logps/rejected": -745.6565551757812, "loss": 0.6978, "rewards/accuracies": 0.875, "rewards/chosen": -6.861150741577148, "rewards/margins": 15.441508293151855, "rewards/rejected": -22.302658081054688, "step": 1850 }, { "epoch": 1.1514774494556765, "grad_norm": 10.274881362915039, "learning_rate": 3.423236514522822e-06, "logits/chosen": 3.5531959533691406, "logits/rejected": 3.5783658027648926, "logps/chosen": -626.026123046875, "logps/rejected": -849.3629150390625, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": -9.679840087890625, "rewards/margins": 17.586177825927734, "rewards/rejected": -27.26601791381836, "step": 1851 }, { "epoch": 1.1520995334370139, "grad_norm": 0.01562790386378765, "learning_rate": 3.422083909635777e-06, "logits/chosen": 1.6220135688781738, "logits/rejected": 3.173398971557617, "logps/chosen": -565.2597045898438, "logps/rejected": -797.6683349609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.182493209838867, "rewards/margins": 15.873281478881836, "rewards/rejected": -22.055774688720703, "step": 1852 }, { "epoch": 1.1527216174183514, "grad_norm": 0.02775227278470993, "learning_rate": 3.4209313047487324e-06, "logits/chosen": 0.7502641677856445, "logits/rejected": 4.694336414337158, "logps/chosen": -500.84454345703125, "logps/rejected": -958.1452026367188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.311468124389648, "rewards/margins": 25.65319061279297, "rewards/rejected": -30.964656829833984, "step": 1853 }, { "epoch": 1.153343701399689, "grad_norm": 0.004239192698150873, "learning_rate": 3.4197786998616877e-06, "logits/chosen": 1.1329727172851562, "logits/rejected": 4.304278373718262, "logps/chosen": -465.35028076171875, "logps/rejected": -992.7420043945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.760903358459473, "rewards/margins": 27.45376968383789, "rewards/rejected": -33.21466827392578, "step": 1854 }, { "epoch": 1.1539657853810263, "grad_norm": 36.458282470703125, "learning_rate": 3.418626094974643e-06, "logits/chosen": 0.939897894859314, "logits/rejected": 3.1979446411132812, "logps/chosen": -555.7550659179688, "logps/rejected": -973.84814453125, "loss": 0.5565, "rewards/accuracies": 0.875, "rewards/chosen": -2.7667133808135986, "rewards/margins": 26.34735107421875, "rewards/rejected": -29.114065170288086, "step": 1855 }, { "epoch": 1.154587869362364, "grad_norm": 0.10218100994825363, "learning_rate": 3.4174734900875985e-06, "logits/chosen": 2.230012893676758, "logits/rejected": 1.621048927307129, "logps/chosen": -566.50341796875, "logps/rejected": -781.8897094726562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.9751152992248535, "rewards/margins": 17.062904357910156, "rewards/rejected": -24.038021087646484, "step": 1856 }, { "epoch": 1.1552099533437015, "grad_norm": 3.060608833038714e-07, "learning_rate": 3.4163208852005538e-06, "logits/chosen": 0.6016669273376465, "logits/rejected": 2.2606773376464844, "logps/chosen": -630.81396484375, "logps/rejected": -930.8638916015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.966922760009766, "rewards/margins": 28.5898380279541, "rewards/rejected": -39.5567626953125, "step": 1857 }, { "epoch": 1.1558320373250388, "grad_norm": 7.1231184005737305, "learning_rate": 3.415168280313509e-06, "logits/chosen": -0.9377905130386353, "logits/rejected": 2.894376277923584, "logps/chosen": -480.42840576171875, "logps/rejected": -777.5912475585938, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": -5.04439115524292, "rewards/margins": 14.577075958251953, "rewards/rejected": -19.6214656829834, "step": 1858 }, { "epoch": 1.1564541213063764, "grad_norm": 14.957558631896973, "learning_rate": 3.414015675426464e-06, "logits/chosen": 1.6655893325805664, "logits/rejected": 4.472172260284424, "logps/chosen": -559.6851196289062, "logps/rejected": -858.532958984375, "loss": 0.139, "rewards/accuracies": 0.875, "rewards/chosen": -7.541600227355957, "rewards/margins": 15.499042510986328, "rewards/rejected": -23.04064178466797, "step": 1859 }, { "epoch": 1.157076205287714, "grad_norm": 0.31053587794303894, "learning_rate": 3.4128630705394194e-06, "logits/chosen": 2.565433979034424, "logits/rejected": 3.6672003269195557, "logps/chosen": -529.284912109375, "logps/rejected": -770.8364868164062, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -6.860030174255371, "rewards/margins": 14.366354942321777, "rewards/rejected": -21.22638511657715, "step": 1860 }, { "epoch": 1.1576982892690513, "grad_norm": 0.01324005238711834, "learning_rate": 3.4117104656523747e-06, "logits/chosen": -0.9546438455581665, "logits/rejected": 3.5789406299591064, "logps/chosen": -428.26800537109375, "logps/rejected": -955.6097412109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.1026291847229, "rewards/margins": 27.42490005493164, "rewards/rejected": -32.52752685546875, "step": 1861 }, { "epoch": 1.1583203732503888, "grad_norm": 2.0153255453347896e-10, "learning_rate": 3.41055786076533e-06, "logits/chosen": 0.16465669870376587, "logits/rejected": 4.746971130371094, "logps/chosen": -483.0347595214844, "logps/rejected": -1162.955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.803442478179932, "rewards/margins": 39.28862380981445, "rewards/rejected": -46.092063903808594, "step": 1862 }, { "epoch": 1.1589424572317264, "grad_norm": 1.7264849816456262e-07, "learning_rate": 3.409405255878285e-06, "logits/chosen": -1.883517861366272, "logits/rejected": 3.036733388900757, "logps/chosen": -385.6026611328125, "logps/rejected": -886.4443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.124665260314941, "rewards/margins": 31.6492977142334, "rewards/rejected": -35.773963928222656, "step": 1863 }, { "epoch": 1.1595645412130637, "grad_norm": 0.017328694462776184, "learning_rate": 3.4082526509912408e-06, "logits/chosen": -1.9856454133987427, "logits/rejected": 4.194329261779785, "logps/chosen": -361.50079345703125, "logps/rejected": -1048.82275390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.6735262870788574, "rewards/margins": 32.9036750793457, "rewards/rejected": -36.57720184326172, "step": 1864 }, { "epoch": 1.1601866251944013, "grad_norm": 0.021417386829853058, "learning_rate": 3.407100046104196e-06, "logits/chosen": -2.5190682411193848, "logits/rejected": 1.7409802675247192, "logps/chosen": -420.822998046875, "logps/rejected": -982.8463134765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.529491424560547, "rewards/margins": 26.406007766723633, "rewards/rejected": -35.93549728393555, "step": 1865 }, { "epoch": 1.1608087091757386, "grad_norm": 2.4923558157752268e-05, "learning_rate": 3.405947441217151e-06, "logits/chosen": -1.5282564163208008, "logits/rejected": 1.6944283246994019, "logps/chosen": -333.0899353027344, "logps/rejected": -779.9032592773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.1282002925872803, "rewards/margins": 28.565494537353516, "rewards/rejected": -31.693696975708008, "step": 1866 }, { "epoch": 1.1614307931570762, "grad_norm": 8.579933166503906, "learning_rate": 3.4047948363301064e-06, "logits/chosen": 0.9272339940071106, "logits/rejected": 3.0300166606903076, "logps/chosen": -622.0635986328125, "logps/rejected": -927.7623291015625, "loss": 0.0467, "rewards/accuracies": 1.0, "rewards/chosen": -6.8049421310424805, "rewards/margins": 20.665813446044922, "rewards/rejected": -27.470754623413086, "step": 1867 }, { "epoch": 1.1620528771384138, "grad_norm": 0.16499559581279755, "learning_rate": 3.4036422314430616e-06, "logits/chosen": 0.13868463039398193, "logits/rejected": 1.8279064893722534, "logps/chosen": -570.3772583007812, "logps/rejected": -917.9130249023438, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.679691314697266, "rewards/margins": 21.65399742126465, "rewards/rejected": -30.333688735961914, "step": 1868 }, { "epoch": 1.162674961119751, "grad_norm": 4.282171249389648, "learning_rate": 3.402489626556017e-06, "logits/chosen": 1.3641959428787231, "logits/rejected": 2.3401095867156982, "logps/chosen": -680.9366455078125, "logps/rejected": -977.0753173828125, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -8.871400833129883, "rewards/margins": 18.216270446777344, "rewards/rejected": -27.087669372558594, "step": 1869 }, { "epoch": 1.1632970451010887, "grad_norm": 0.30975544452667236, "learning_rate": 3.401337021668972e-06, "logits/chosen": -0.5252646207809448, "logits/rejected": 1.5963172912597656, "logps/chosen": -546.5020751953125, "logps/rejected": -842.185546875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -7.25322151184082, "rewards/margins": 20.49350929260254, "rewards/rejected": -27.746732711791992, "step": 1870 }, { "epoch": 1.163919129082426, "grad_norm": 0.0698472410440445, "learning_rate": 3.4001844167819277e-06, "logits/chosen": -0.9682607054710388, "logits/rejected": 3.4010274410247803, "logps/chosen": -492.4614562988281, "logps/rejected": -1111.92236328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.805222511291504, "rewards/margins": 35.77224349975586, "rewards/rejected": -41.57746505737305, "step": 1871 }, { "epoch": 1.1645412130637636, "grad_norm": 18.648984909057617, "learning_rate": 3.399031811894883e-06, "logits/chosen": 0.6838645935058594, "logits/rejected": 2.589054584503174, "logps/chosen": -500.4499816894531, "logps/rejected": -837.994384765625, "loss": 0.1269, "rewards/accuracies": 0.875, "rewards/chosen": -7.229727745056152, "rewards/margins": 16.818801879882812, "rewards/rejected": -24.04853057861328, "step": 1872 }, { "epoch": 1.1651632970451011, "grad_norm": 7.5405064308142755e-06, "learning_rate": 3.397879207007838e-06, "logits/chosen": -1.3973257541656494, "logits/rejected": 0.9450278878211975, "logps/chosen": -473.623779296875, "logps/rejected": -899.1165771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.336153268814087, "rewards/margins": 30.845565795898438, "rewards/rejected": -34.18171691894531, "step": 1873 }, { "epoch": 1.1657853810264385, "grad_norm": 0.12114159017801285, "learning_rate": 3.3967266021207934e-06, "logits/chosen": 3.0597760677337646, "logits/rejected": 4.02023983001709, "logps/chosen": -646.0111083984375, "logps/rejected": -886.9459228515625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.5245513916015625, "rewards/margins": 20.911964416503906, "rewards/rejected": -28.43651580810547, "step": 1874 }, { "epoch": 1.166407465007776, "grad_norm": 1.1679210662841797, "learning_rate": 3.3955739972337486e-06, "logits/chosen": 0.360098659992218, "logits/rejected": 2.741698741912842, "logps/chosen": -627.6326904296875, "logps/rejected": -949.013671875, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -10.880720138549805, "rewards/margins": 21.968727111816406, "rewards/rejected": -32.849449157714844, "step": 1875 }, { "epoch": 1.1670295489891136, "grad_norm": 20.22886085510254, "learning_rate": 3.394421392346704e-06, "logits/chosen": 1.9235471487045288, "logits/rejected": 4.863920211791992, "logps/chosen": -573.7352294921875, "logps/rejected": -1049.0943603515625, "loss": 0.221, "rewards/accuracies": 0.875, "rewards/chosen": -9.95592212677002, "rewards/margins": 27.50440788269043, "rewards/rejected": -37.4603271484375, "step": 1876 }, { "epoch": 1.167651632970451, "grad_norm": 1.4595121683669277e-05, "learning_rate": 3.393268787459659e-06, "logits/chosen": -2.712231397628784, "logits/rejected": 1.803051471710205, "logps/chosen": -349.8004150390625, "logps/rejected": -857.34228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.230327129364014, "rewards/margins": 29.79707145690918, "rewards/rejected": -36.02739715576172, "step": 1877 }, { "epoch": 1.1682737169517885, "grad_norm": 22.88018035888672, "learning_rate": 3.3921161825726147e-06, "logits/chosen": 1.118056297302246, "logits/rejected": 3.7516231536865234, "logps/chosen": -641.5447998046875, "logps/rejected": -1088.7965087890625, "loss": 0.3303, "rewards/accuracies": 0.875, "rewards/chosen": -8.877915382385254, "rewards/margins": 25.538475036621094, "rewards/rejected": -34.4163932800293, "step": 1878 }, { "epoch": 1.168895800933126, "grad_norm": 0.9193662405014038, "learning_rate": 3.39096357768557e-06, "logits/chosen": 1.4894222021102905, "logits/rejected": 4.886639595031738, "logps/chosen": -579.7659301757812, "logps/rejected": -1037.610595703125, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -14.003030776977539, "rewards/margins": 23.57931137084961, "rewards/rejected": -37.58234405517578, "step": 1879 }, { "epoch": 1.1695178849144634, "grad_norm": 8.398724555969238, "learning_rate": 3.389810972798525e-06, "logits/chosen": 1.810590147972107, "logits/rejected": 4.150803089141846, "logps/chosen": -519.09326171875, "logps/rejected": -977.6038818359375, "loss": 0.0846, "rewards/accuracies": 1.0, "rewards/chosen": -5.65777587890625, "rewards/margins": 32.098777770996094, "rewards/rejected": -37.756553649902344, "step": 1880 }, { "epoch": 1.170139968895801, "grad_norm": 0.16134734451770782, "learning_rate": 3.3886583679114804e-06, "logits/chosen": -2.5285017490386963, "logits/rejected": 2.1500158309936523, "logps/chosen": -376.177001953125, "logps/rejected": -864.7513427734375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -9.267976760864258, "rewards/margins": 24.168720245361328, "rewards/rejected": -33.43669891357422, "step": 1881 }, { "epoch": 1.1707620528771385, "grad_norm": 0.00012305942072998732, "learning_rate": 3.3875057630244356e-06, "logits/chosen": -0.5643349885940552, "logits/rejected": 3.5905985832214355, "logps/chosen": -445.88201904296875, "logps/rejected": -1024.126708984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.768947601318359, "rewards/margins": 31.707120895385742, "rewards/rejected": -38.47606658935547, "step": 1882 }, { "epoch": 1.1713841368584759, "grad_norm": 0.018285546451807022, "learning_rate": 3.386353158137391e-06, "logits/chosen": -1.7855366468429565, "logits/rejected": 2.1956353187561035, "logps/chosen": -345.2109375, "logps/rejected": -812.0648193359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.950940132141113, "rewards/margins": 26.879283905029297, "rewards/rejected": -33.830223083496094, "step": 1883 }, { "epoch": 1.1720062208398134, "grad_norm": 5.884623169549741e-05, "learning_rate": 3.385200553250346e-06, "logits/chosen": -0.8889130353927612, "logits/rejected": 4.409315586090088, "logps/chosen": -434.36590576171875, "logps/rejected": -1086.90625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.007776737213135, "rewards/margins": 31.600605010986328, "rewards/rejected": -37.60838317871094, "step": 1884 }, { "epoch": 1.1726283048211508, "grad_norm": 0.03610111027956009, "learning_rate": 3.3840479483633013e-06, "logits/chosen": 0.03821098804473877, "logits/rejected": 1.735025405883789, "logps/chosen": -539.9411010742188, "logps/rejected": -877.2424926757812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.749122619628906, "rewards/margins": 24.988826751708984, "rewards/rejected": -31.737951278686523, "step": 1885 }, { "epoch": 1.1732503888024883, "grad_norm": 2.511057937226724e-05, "learning_rate": 3.382895343476257e-06, "logits/chosen": -0.6976618766784668, "logits/rejected": 5.024380683898926, "logps/chosen": -392.182861328125, "logps/rejected": -1032.8702392578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.2829113006591797, "rewards/margins": 29.962074279785156, "rewards/rejected": -33.24498748779297, "step": 1886 }, { "epoch": 1.173872472783826, "grad_norm": 0.3009835183620453, "learning_rate": 3.381742738589212e-06, "logits/chosen": 0.5339366793632507, "logits/rejected": 3.618192195892334, "logps/chosen": -487.1822204589844, "logps/rejected": -911.4076538085938, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -6.047733306884766, "rewards/margins": 20.27420425415039, "rewards/rejected": -26.321937561035156, "step": 1887 }, { "epoch": 1.1744945567651632, "grad_norm": 13.114173889160156, "learning_rate": 3.3805901337021674e-06, "logits/chosen": -0.8705597519874573, "logits/rejected": 3.0176239013671875, "logps/chosen": -430.07037353515625, "logps/rejected": -851.323486328125, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": -6.196186065673828, "rewards/margins": 23.563753128051758, "rewards/rejected": -29.75994110107422, "step": 1888 }, { "epoch": 1.1751166407465008, "grad_norm": 15.261913299560547, "learning_rate": 3.3794375288151226e-06, "logits/chosen": -1.91538667678833, "logits/rejected": 3.901822090148926, "logps/chosen": -502.11639404296875, "logps/rejected": -1063.5262451171875, "loss": 0.0575, "rewards/accuracies": 1.0, "rewards/chosen": -10.721534729003906, "rewards/margins": 28.344356536865234, "rewards/rejected": -39.065895080566406, "step": 1889 }, { "epoch": 1.1757387247278381, "grad_norm": 0.00018090769299305975, "learning_rate": 3.378284923928078e-06, "logits/chosen": -2.1836063861846924, "logits/rejected": 3.5650699138641357, "logps/chosen": -461.0611572265625, "logps/rejected": -1103.5301513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.961669921875, "rewards/margins": 30.819530487060547, "rewards/rejected": -39.78120040893555, "step": 1890 }, { "epoch": 1.1763608087091757, "grad_norm": 6.663813591003418, "learning_rate": 3.377132319041033e-06, "logits/chosen": -0.012041866779327393, "logits/rejected": 3.2969517707824707, "logps/chosen": -520.4405517578125, "logps/rejected": -839.4365844726562, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": -7.451179504394531, "rewards/margins": 17.711742401123047, "rewards/rejected": -25.162921905517578, "step": 1891 }, { "epoch": 1.1769828926905133, "grad_norm": 0.0020499967504292727, "learning_rate": 3.3759797141539883e-06, "logits/chosen": 0.5658305883407593, "logits/rejected": 2.8545844554901123, "logps/chosen": -587.4124755859375, "logps/rejected": -868.9716796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.186939239501953, "rewards/margins": 18.542457580566406, "rewards/rejected": -28.72939682006836, "step": 1892 }, { "epoch": 1.1776049766718506, "grad_norm": 0.056425973773002625, "learning_rate": 3.374827109266944e-06, "logits/chosen": 1.9432047605514526, "logits/rejected": 5.135171890258789, "logps/chosen": -534.0836181640625, "logps/rejected": -919.96484375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.417974472045898, "rewards/margins": 22.20659828186035, "rewards/rejected": -30.62457275390625, "step": 1893 }, { "epoch": 1.1782270606531882, "grad_norm": 5.256790609564632e-05, "learning_rate": 3.373674504379899e-06, "logits/chosen": 0.011599451303482056, "logits/rejected": 2.3024370670318604, "logps/chosen": -513.41259765625, "logps/rejected": -879.0580444335938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.419273376464844, "rewards/margins": 26.65741539001465, "rewards/rejected": -35.076690673828125, "step": 1894 }, { "epoch": 1.1788491446345257, "grad_norm": 0.29891255497932434, "learning_rate": 3.3725218994928544e-06, "logits/chosen": -1.412156581878662, "logits/rejected": 1.3909084796905518, "logps/chosen": -525.9658203125, "logps/rejected": -961.8241577148438, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.266643524169922, "rewards/margins": 25.69481086730957, "rewards/rejected": -31.961456298828125, "step": 1895 }, { "epoch": 1.179471228615863, "grad_norm": 6.161828517913818, "learning_rate": 3.3713692946058096e-06, "logits/chosen": -0.9599625468254089, "logits/rejected": 3.51469087600708, "logps/chosen": -455.3974609375, "logps/rejected": -972.5230102539062, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": -6.252844333648682, "rewards/margins": 24.31687355041504, "rewards/rejected": -30.569719314575195, "step": 1896 }, { "epoch": 1.1800933125972006, "grad_norm": 16.62723159790039, "learning_rate": 3.370216689718765e-06, "logits/chosen": 0.051849544048309326, "logits/rejected": 2.560445785522461, "logps/chosen": -513.8330078125, "logps/rejected": -841.5488891601562, "loss": 0.1151, "rewards/accuracies": 0.875, "rewards/chosen": -7.442409992218018, "rewards/margins": 20.9028377532959, "rewards/rejected": -28.34524917602539, "step": 1897 }, { "epoch": 1.1807153965785382, "grad_norm": 0.0016077188774943352, "learning_rate": 3.36906408483172e-06, "logits/chosen": -0.7170097231864929, "logits/rejected": 4.153207778930664, "logps/chosen": -476.2669982910156, "logps/rejected": -1010.2525634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.360616683959961, "rewards/margins": 26.90864372253418, "rewards/rejected": -33.269264221191406, "step": 1898 }, { "epoch": 1.1813374805598755, "grad_norm": 0.009510945528745651, "learning_rate": 3.3679114799446753e-06, "logits/chosen": 1.9269828796386719, "logits/rejected": 4.278368949890137, "logps/chosen": -580.8291625976562, "logps/rejected": -914.4641723632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.361428260803223, "rewards/margins": 25.89552116394043, "rewards/rejected": -36.25695037841797, "step": 1899 }, { "epoch": 1.181959564541213, "grad_norm": 0.16183213889598846, "learning_rate": 3.366758875057631e-06, "logits/chosen": -1.8444448709487915, "logits/rejected": 3.2564399242401123, "logps/chosen": -402.34051513671875, "logps/rejected": -958.4459228515625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -5.02139139175415, "rewards/margins": 23.725345611572266, "rewards/rejected": -28.746734619140625, "step": 1900 }, { "epoch": 1.1825816485225507, "grad_norm": 9.202930328910952e-08, "learning_rate": 3.365606270170586e-06, "logits/chosen": 0.11875700950622559, "logits/rejected": 4.793677806854248, "logps/chosen": -453.58819580078125, "logps/rejected": -1055.4208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.494197845458984, "rewards/margins": 36.492919921875, "rewards/rejected": -42.98711395263672, "step": 1901 }, { "epoch": 1.183203732503888, "grad_norm": 0.012897444888949394, "learning_rate": 3.3644536652835414e-06, "logits/chosen": -0.3415604829788208, "logits/rejected": 3.5608906745910645, "logps/chosen": -443.36767578125, "logps/rejected": -957.35888671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.884511947631836, "rewards/margins": 33.34904479980469, "rewards/rejected": -43.233558654785156, "step": 1902 }, { "epoch": 1.1838258164852256, "grad_norm": 0.0005375120672397316, "learning_rate": 3.3633010603964966e-06, "logits/chosen": -1.8501484394073486, "logits/rejected": 3.881671905517578, "logps/chosen": -455.2761535644531, "logps/rejected": -1120.222900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.41702651977539, "rewards/margins": 33.15281677246094, "rewards/rejected": -41.569847106933594, "step": 1903 }, { "epoch": 1.184447900466563, "grad_norm": 1.0201901197433472, "learning_rate": 3.362148455509452e-06, "logits/chosen": 0.9012954235076904, "logits/rejected": 3.5016984939575195, "logps/chosen": -504.5726623535156, "logps/rejected": -815.4268798828125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -9.356898307800293, "rewards/margins": 18.643299102783203, "rewards/rejected": -28.00019645690918, "step": 1904 }, { "epoch": 1.1850699844479005, "grad_norm": 2.634040594100952, "learning_rate": 3.360995850622407e-06, "logits/chosen": 1.6875536441802979, "logits/rejected": 4.519953727722168, "logps/chosen": -580.7654418945312, "logps/rejected": -1029.687744140625, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -10.505790710449219, "rewards/margins": 27.653100967407227, "rewards/rejected": -38.15888977050781, "step": 1905 }, { "epoch": 1.185692068429238, "grad_norm": 27.720535278320312, "learning_rate": 3.3598432457353623e-06, "logits/chosen": 2.4027135372161865, "logits/rejected": 3.5298256874084473, "logps/chosen": -580.3115844726562, "logps/rejected": -849.4248046875, "loss": 0.6196, "rewards/accuracies": 0.875, "rewards/chosen": -10.953683853149414, "rewards/margins": 22.228294372558594, "rewards/rejected": -33.181976318359375, "step": 1906 }, { "epoch": 1.1863141524105754, "grad_norm": 0.0001605092256795615, "learning_rate": 3.358690640848317e-06, "logits/chosen": 1.2494337558746338, "logits/rejected": 3.3936190605163574, "logps/chosen": -534.42236328125, "logps/rejected": -982.61865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.883479118347168, "rewards/margins": 31.28057861328125, "rewards/rejected": -38.164058685302734, "step": 1907 }, { "epoch": 1.186936236391913, "grad_norm": 4.926142196381988e-07, "learning_rate": 3.3575380359612723e-06, "logits/chosen": -1.3139442205429077, "logits/rejected": 2.4188690185546875, "logps/chosen": -367.416015625, "logps/rejected": -865.8370361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.059207916259766, "rewards/margins": 31.45437240600586, "rewards/rejected": -40.513580322265625, "step": 1908 }, { "epoch": 1.1875583203732503, "grad_norm": 0.0003280296514276415, "learning_rate": 3.3563854310742276e-06, "logits/chosen": -0.26524052023887634, "logits/rejected": 2.9963579177856445, "logps/chosen": -569.2260131835938, "logps/rejected": -1066.4149169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.808199882507324, "rewards/margins": 28.098133087158203, "rewards/rejected": -35.906333923339844, "step": 1909 }, { "epoch": 1.1881804043545878, "grad_norm": 0.014919915236532688, "learning_rate": 3.355232826187183e-06, "logits/chosen": 2.3412349224090576, "logits/rejected": 4.641690254211426, "logps/chosen": -598.1677856445312, "logps/rejected": -1043.271240234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.240728378295898, "rewards/margins": 25.26788330078125, "rewards/rejected": -32.50861358642578, "step": 1910 }, { "epoch": 1.1888024883359254, "grad_norm": 2.226551581996361e-10, "learning_rate": 3.3540802213001384e-06, "logits/chosen": 3.548205852508545, "logits/rejected": 3.9359588623046875, "logps/chosen": -690.6002807617188, "logps/rejected": -1156.86572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.449170112609863, "rewards/margins": 35.990379333496094, "rewards/rejected": -48.439544677734375, "step": 1911 }, { "epoch": 1.1894245723172627, "grad_norm": 0.0006320227403193712, "learning_rate": 3.3529276164130937e-06, "logits/chosen": 2.9576265811920166, "logits/rejected": 1.8555573225021362, "logps/chosen": -705.2396240234375, "logps/rejected": -898.2821044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.850976943969727, "rewards/margins": 17.06894874572754, "rewards/rejected": -29.9199275970459, "step": 1912 }, { "epoch": 1.1900466562986003, "grad_norm": 0.008909384720027447, "learning_rate": 3.351775011526049e-06, "logits/chosen": -3.500988006591797, "logits/rejected": 3.632211446762085, "logps/chosen": -272.4637756347656, "logps/rejected": -955.69091796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.5750045776367188, "rewards/margins": 33.8862419128418, "rewards/rejected": -37.461246490478516, "step": 1913 }, { "epoch": 1.1906687402799379, "grad_norm": 52.2058219909668, "learning_rate": 3.350622406639004e-06, "logits/chosen": 0.48060905933380127, "logits/rejected": 2.518648147583008, "logps/chosen": -588.2604370117188, "logps/rejected": -919.8138427734375, "loss": 0.5477, "rewards/accuracies": 0.875, "rewards/chosen": -12.316235542297363, "rewards/margins": 21.10009765625, "rewards/rejected": -33.41633605957031, "step": 1914 }, { "epoch": 1.1912908242612752, "grad_norm": 0.004597253166139126, "learning_rate": 3.3494698017519593e-06, "logits/chosen": -0.6631978750228882, "logits/rejected": 3.513437271118164, "logps/chosen": -484.0467224121094, "logps/rejected": -1010.7188720703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.64289665222168, "rewards/margins": 30.839702606201172, "rewards/rejected": -39.48259735107422, "step": 1915 }, { "epoch": 1.1919129082426128, "grad_norm": 0.00033162301406264305, "learning_rate": 3.3483171968649146e-06, "logits/chosen": 1.9419931173324585, "logits/rejected": 4.727841377258301, "logps/chosen": -546.1142578125, "logps/rejected": -1015.4229736328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.172233581542969, "rewards/margins": 29.4285888671875, "rewards/rejected": -37.60082244873047, "step": 1916 }, { "epoch": 1.1925349922239503, "grad_norm": 2.137709617614746, "learning_rate": 3.34716459197787e-06, "logits/chosen": 1.4818284511566162, "logits/rejected": 4.6252312660217285, "logps/chosen": -506.4788513183594, "logps/rejected": -930.1427612304688, "loss": 0.0853, "rewards/accuracies": 1.0, "rewards/chosen": -10.089163780212402, "rewards/margins": 22.086009979248047, "rewards/rejected": -32.175174713134766, "step": 1917 }, { "epoch": 1.1931570762052877, "grad_norm": 0.007987787947058678, "learning_rate": 3.3460119870908254e-06, "logits/chosen": 0.12977349758148193, "logits/rejected": 1.7750893831253052, "logps/chosen": -578.8599853515625, "logps/rejected": -882.8643188476562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.8722429275512695, "rewards/margins": 24.44931411743164, "rewards/rejected": -31.321556091308594, "step": 1918 }, { "epoch": 1.1937791601866252, "grad_norm": 0.645113468170166, "learning_rate": 3.3448593822037807e-06, "logits/chosen": -1.326491117477417, "logits/rejected": 2.144299268722534, "logps/chosen": -461.71722412109375, "logps/rejected": -866.579833984375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -7.663753986358643, "rewards/margins": 25.251678466796875, "rewards/rejected": -32.91543197631836, "step": 1919 }, { "epoch": 1.1944012441679628, "grad_norm": 0.18914903700351715, "learning_rate": 3.343706777316736e-06, "logits/chosen": 3.1509156227111816, "logits/rejected": 4.0067291259765625, "logps/chosen": -537.1878662109375, "logps/rejected": -864.8395385742188, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -10.262554168701172, "rewards/margins": 20.671634674072266, "rewards/rejected": -30.93419075012207, "step": 1920 }, { "epoch": 1.1950233281493001, "grad_norm": 0.03683840483427048, "learning_rate": 3.342554172429691e-06, "logits/chosen": -2.908719062805176, "logits/rejected": 1.6505014896392822, "logps/chosen": -387.63677978515625, "logps/rejected": -857.525390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.601930618286133, "rewards/margins": 25.352493286132812, "rewards/rejected": -34.95442199707031, "step": 1921 }, { "epoch": 1.1956454121306377, "grad_norm": 2.0521059036254883, "learning_rate": 3.3414015675426463e-06, "logits/chosen": -0.8097438812255859, "logits/rejected": 2.9124770164489746, "logps/chosen": -482.3450927734375, "logps/rejected": -1000.04248046875, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -6.175515651702881, "rewards/margins": 22.42913246154785, "rewards/rejected": -28.60464859008789, "step": 1922 }, { "epoch": 1.196267496111975, "grad_norm": 0.00045063262223266065, "learning_rate": 3.3402489626556016e-06, "logits/chosen": 1.2557860612869263, "logits/rejected": 3.385568380355835, "logps/chosen": -530.5950927734375, "logps/rejected": -909.8157958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.130796432495117, "rewards/margins": 27.050382614135742, "rewards/rejected": -35.18117904663086, "step": 1923 }, { "epoch": 1.1968895800933126, "grad_norm": 2.0517676446729638e-08, "learning_rate": 3.339096357768557e-06, "logits/chosen": -0.006276607513427734, "logits/rejected": 3.593689441680908, "logps/chosen": -383.19757080078125, "logps/rejected": -871.0137939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.104961395263672, "rewards/margins": 32.07358169555664, "rewards/rejected": -36.17854309082031, "step": 1924 }, { "epoch": 1.1975116640746502, "grad_norm": 0.0016482784412801266, "learning_rate": 3.3379437528815124e-06, "logits/chosen": 0.5384297370910645, "logits/rejected": 3.214430809020996, "logps/chosen": -566.6683349609375, "logps/rejected": -984.3301391601562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.574440002441406, "rewards/margins": 23.82293701171875, "rewards/rejected": -33.397377014160156, "step": 1925 }, { "epoch": 1.1981337480559875, "grad_norm": 27.448835372924805, "learning_rate": 3.3367911479944676e-06, "logits/chosen": 0.10056604444980621, "logits/rejected": 3.597458839416504, "logps/chosen": -456.06005859375, "logps/rejected": -866.9276123046875, "loss": 0.571, "rewards/accuracies": 0.875, "rewards/chosen": -6.5373406410217285, "rewards/margins": 24.73653221130371, "rewards/rejected": -31.27387237548828, "step": 1926 }, { "epoch": 1.198755832037325, "grad_norm": 34.07020950317383, "learning_rate": 3.335638543107423e-06, "logits/chosen": -0.8443750143051147, "logits/rejected": 2.3590760231018066, "logps/chosen": -502.44146728515625, "logps/rejected": -899.0219116210938, "loss": 1.0365, "rewards/accuracies": 0.875, "rewards/chosen": -4.6267242431640625, "rewards/margins": 19.69811248779297, "rewards/rejected": -24.32483673095703, "step": 1927 }, { "epoch": 1.1993779160186626, "grad_norm": 0.00026728963712230325, "learning_rate": 3.334485938220378e-06, "logits/chosen": 0.41143798828125, "logits/rejected": 3.4771077632904053, "logps/chosen": -672.9883422851562, "logps/rejected": -1171.4521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.810882568359375, "rewards/margins": 35.20409393310547, "rewards/rejected": -43.01497268676758, "step": 1928 }, { "epoch": 1.2, "grad_norm": 2.0605654071914614e-08, "learning_rate": 3.3333333333333333e-06, "logits/chosen": 0.14419078826904297, "logits/rejected": 3.9184060096740723, "logps/chosen": -515.4486694335938, "logps/rejected": -1113.77783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.166099548339844, "rewards/margins": 39.462459564208984, "rewards/rejected": -47.628562927246094, "step": 1929 }, { "epoch": 1.2006220839813375, "grad_norm": 0.08832728117704391, "learning_rate": 3.3321807284462885e-06, "logits/chosen": 0.09780023992061615, "logits/rejected": 4.137458801269531, "logps/chosen": -517.83642578125, "logps/rejected": -946.2546997070312, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -9.975343704223633, "rewards/margins": 27.883960723876953, "rewards/rejected": -37.85930633544922, "step": 1930 }, { "epoch": 1.2012441679626749, "grad_norm": 0.004210162442177534, "learning_rate": 3.331028123559244e-06, "logits/chosen": -1.5383262634277344, "logits/rejected": 4.4926958084106445, "logps/chosen": -306.3968505859375, "logps/rejected": -1023.9912109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.66239857673645, "rewards/margins": 36.54762268066406, "rewards/rejected": -40.21002197265625, "step": 1931 }, { "epoch": 1.2018662519440124, "grad_norm": 0.007443973794579506, "learning_rate": 3.3298755186721994e-06, "logits/chosen": 0.7929803133010864, "logits/rejected": 4.572413921356201, "logps/chosen": -468.37957763671875, "logps/rejected": -869.6012573242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.426292896270752, "rewards/margins": 21.67800521850586, "rewards/rejected": -27.104297637939453, "step": 1932 }, { "epoch": 1.20248833592535, "grad_norm": 0.05187014490365982, "learning_rate": 3.3287229137851546e-06, "logits/chosen": 0.013190984725952148, "logits/rejected": 2.615093231201172, "logps/chosen": -338.7560729980469, "logps/rejected": -766.6841430664062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.282043933868408, "rewards/margins": 25.907127380371094, "rewards/rejected": -30.189170837402344, "step": 1933 }, { "epoch": 1.2031104199066873, "grad_norm": 0.5828655958175659, "learning_rate": 3.32757030889811e-06, "logits/chosen": 0.724718451499939, "logits/rejected": 3.5598180294036865, "logps/chosen": -377.14056396484375, "logps/rejected": -799.251708984375, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -5.795290946960449, "rewards/margins": 22.91904067993164, "rewards/rejected": -28.714330673217773, "step": 1934 }, { "epoch": 1.2037325038880249, "grad_norm": 0.00023630520445294678, "learning_rate": 3.326417704011065e-06, "logits/chosen": -0.6954635381698608, "logits/rejected": 3.3336634635925293, "logps/chosen": -451.3642578125, "logps/rejected": -940.421142578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.785922050476074, "rewards/margins": 25.902555465698242, "rewards/rejected": -30.688478469848633, "step": 1935 }, { "epoch": 1.2043545878693624, "grad_norm": 0.0010673885699361563, "learning_rate": 3.3252650991240203e-06, "logits/chosen": -1.998305082321167, "logits/rejected": 3.031818389892578, "logps/chosen": -428.25927734375, "logps/rejected": -1018.7134399414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.27209186553955, "rewards/margins": 32.9398307800293, "rewards/rejected": -43.2119255065918, "step": 1936 }, { "epoch": 1.2049766718506998, "grad_norm": 1.6674846410751343, "learning_rate": 3.3241124942369755e-06, "logits/chosen": -1.2776638269424438, "logits/rejected": 3.502741813659668, "logps/chosen": -463.2667236328125, "logps/rejected": -874.223876953125, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -4.820012092590332, "rewards/margins": 20.7093505859375, "rewards/rejected": -25.529361724853516, "step": 1937 }, { "epoch": 1.2055987558320373, "grad_norm": 0.02726488746702671, "learning_rate": 3.3229598893499308e-06, "logits/chosen": 0.24503111839294434, "logits/rejected": 4.084878921508789, "logps/chosen": -399.0093994140625, "logps/rejected": -825.203369140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.999037742614746, "rewards/margins": 23.390485763549805, "rewards/rejected": -29.3895263671875, "step": 1938 }, { "epoch": 1.206220839813375, "grad_norm": 0.00054695934522897, "learning_rate": 3.3218072844628864e-06, "logits/chosen": 2.297356128692627, "logits/rejected": 3.3070321083068848, "logps/chosen": -587.7952880859375, "logps/rejected": -1102.398193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.056290626525879, "rewards/margins": 34.74806213378906, "rewards/rejected": -45.804351806640625, "step": 1939 }, { "epoch": 1.2068429237947123, "grad_norm": 2.3318307399749756, "learning_rate": 3.3206546795758416e-06, "logits/chosen": 0.43444839119911194, "logits/rejected": 2.4421451091766357, "logps/chosen": -514.5410766601562, "logps/rejected": -874.9398803710938, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -10.247170448303223, "rewards/margins": 22.54619026184082, "rewards/rejected": -32.793357849121094, "step": 1940 }, { "epoch": 1.2074650077760498, "grad_norm": 0.00011700214236043394, "learning_rate": 3.319502074688797e-06, "logits/chosen": 0.9871399402618408, "logits/rejected": 3.244748592376709, "logps/chosen": -435.0483703613281, "logps/rejected": -841.668701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.72576904296875, "rewards/margins": 29.82848358154297, "rewards/rejected": -34.55425262451172, "step": 1941 }, { "epoch": 1.2080870917573872, "grad_norm": 2.920883893966675, "learning_rate": 3.318349469801752e-06, "logits/chosen": -0.21452677249908447, "logits/rejected": 4.4931230545043945, "logps/chosen": -367.2193603515625, "logps/rejected": -1028.98046875, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": -6.970757484436035, "rewards/margins": 35.071495056152344, "rewards/rejected": -42.04225158691406, "step": 1942 }, { "epoch": 1.2087091757387247, "grad_norm": 0.03972639888525009, "learning_rate": 3.3171968649147073e-06, "logits/chosen": 0.8004956245422363, "logits/rejected": 3.5307986736297607, "logps/chosen": -599.837890625, "logps/rejected": -1062.3994140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.409638404846191, "rewards/margins": 28.89748764038086, "rewards/rejected": -40.307125091552734, "step": 1943 }, { "epoch": 1.2093312597200623, "grad_norm": 0.03882686793804169, "learning_rate": 3.3160442600276625e-06, "logits/chosen": 3.156482458114624, "logits/rejected": 3.9867730140686035, "logps/chosen": -716.43994140625, "logps/rejected": -1105.8594970703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.058252334594727, "rewards/margins": 29.100160598754883, "rewards/rejected": -39.15841293334961, "step": 1944 }, { "epoch": 1.2099533437013996, "grad_norm": 0.02182290330529213, "learning_rate": 3.3148916551406178e-06, "logits/chosen": -0.6206588745117188, "logits/rejected": 3.366884231567383, "logps/chosen": -562.1417236328125, "logps/rejected": -1199.1522216796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.860138893127441, "rewards/margins": 36.03993225097656, "rewards/rejected": -42.90007019042969, "step": 1945 }, { "epoch": 1.2105754276827372, "grad_norm": 0.00022302680008579046, "learning_rate": 3.3137390502535734e-06, "logits/chosen": 3.027329444885254, "logits/rejected": 4.128240585327148, "logps/chosen": -577.2239990234375, "logps/rejected": -878.79150390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.890042304992676, "rewards/margins": 27.70443344116211, "rewards/rejected": -37.59447479248047, "step": 1946 }, { "epoch": 1.2111975116640747, "grad_norm": 0.008291305974125862, "learning_rate": 3.3125864453665286e-06, "logits/chosen": 0.27542710304260254, "logits/rejected": 4.529088973999023, "logps/chosen": -476.2554931640625, "logps/rejected": -1009.677490234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.14692497253418, "rewards/margins": 31.135059356689453, "rewards/rejected": -43.281982421875, "step": 1947 }, { "epoch": 1.211819595645412, "grad_norm": 0.00021095202828291804, "learning_rate": 3.311433840479484e-06, "logits/chosen": 1.0152184963226318, "logits/rejected": 4.031658172607422, "logps/chosen": -593.0845336914062, "logps/rejected": -943.8492431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.655757904052734, "rewards/margins": 27.139101028442383, "rewards/rejected": -33.794857025146484, "step": 1948 }, { "epoch": 1.2124416796267496, "grad_norm": 0.08369395136833191, "learning_rate": 3.310281235592439e-06, "logits/chosen": -1.5338207483291626, "logits/rejected": 3.6264917850494385, "logps/chosen": -494.798828125, "logps/rejected": -950.973388671875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -11.699420928955078, "rewards/margins": 25.905492782592773, "rewards/rejected": -37.60491180419922, "step": 1949 }, { "epoch": 1.213063763608087, "grad_norm": 2.5580344200134277, "learning_rate": 3.3091286307053943e-06, "logits/chosen": 0.913398265838623, "logits/rejected": 3.289531707763672, "logps/chosen": -484.4707946777344, "logps/rejected": -813.2208251953125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -11.331765174865723, "rewards/margins": 19.53480339050293, "rewards/rejected": -30.86656951904297, "step": 1950 }, { "epoch": 1.2136858475894245, "grad_norm": 0.050781961530447006, "learning_rate": 3.3079760258183495e-06, "logits/chosen": 0.06513766199350357, "logits/rejected": 3.91764760017395, "logps/chosen": -417.3397521972656, "logps/rejected": -851.27978515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.4588847160339355, "rewards/margins": 18.167699813842773, "rewards/rejected": -25.626585006713867, "step": 1951 }, { "epoch": 1.2143079315707621, "grad_norm": 13.09532356262207, "learning_rate": 3.3068234209313048e-06, "logits/chosen": -0.8919419646263123, "logits/rejected": 3.8915514945983887, "logps/chosen": -444.9429626464844, "logps/rejected": -1011.6834106445312, "loss": 0.047, "rewards/accuracies": 1.0, "rewards/chosen": -9.357451438903809, "rewards/margins": 25.754016876220703, "rewards/rejected": -35.11146926879883, "step": 1952 }, { "epoch": 1.2149300155520995, "grad_norm": 0.04654877260327339, "learning_rate": 3.3056708160442604e-06, "logits/chosen": -1.6564157009124756, "logits/rejected": 2.609309673309326, "logps/chosen": -329.3642578125, "logps/rejected": -767.1329956054688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.011837959289551, "rewards/margins": 18.270030975341797, "rewards/rejected": -22.281869888305664, "step": 1953 }, { "epoch": 1.215552099533437, "grad_norm": 1.1246953010559082, "learning_rate": 3.3045182111572156e-06, "logits/chosen": 1.621370792388916, "logits/rejected": 4.352920055389404, "logps/chosen": -653.8741455078125, "logps/rejected": -1105.3143310546875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -14.642403602600098, "rewards/margins": 29.800912857055664, "rewards/rejected": -44.44331741333008, "step": 1954 }, { "epoch": 1.2161741835147746, "grad_norm": 0.0006063711480237544, "learning_rate": 3.303365606270171e-06, "logits/chosen": -0.577562153339386, "logits/rejected": 4.346451759338379, "logps/chosen": -534.3248291015625, "logps/rejected": -1122.94580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.442867279052734, "rewards/margins": 32.12034606933594, "rewards/rejected": -43.56321334838867, "step": 1955 }, { "epoch": 1.216796267496112, "grad_norm": 1.4868415594100952, "learning_rate": 3.302213001383126e-06, "logits/chosen": 1.4747681617736816, "logits/rejected": 3.2828774452209473, "logps/chosen": -673.1659545898438, "logps/rejected": -1035.533203125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -13.805810928344727, "rewards/margins": 22.413631439208984, "rewards/rejected": -36.21944046020508, "step": 1956 }, { "epoch": 1.2174183514774495, "grad_norm": 0.013984655030071735, "learning_rate": 3.3010603964960813e-06, "logits/chosen": 2.234647512435913, "logits/rejected": 3.8705615997314453, "logps/chosen": -657.6099243164062, "logps/rejected": -959.9219970703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.698150634765625, "rewards/margins": 24.213985443115234, "rewards/rejected": -34.912139892578125, "step": 1957 }, { "epoch": 1.218040435458787, "grad_norm": 30.901208877563477, "learning_rate": 3.2999077916090365e-06, "logits/chosen": -0.880326509475708, "logits/rejected": 3.0242815017700195, "logps/chosen": -587.2315673828125, "logps/rejected": -973.1295776367188, "loss": 0.1456, "rewards/accuracies": 0.875, "rewards/chosen": -9.446496963500977, "rewards/margins": 20.663053512573242, "rewards/rejected": -30.10955047607422, "step": 1958 }, { "epoch": 1.2186625194401244, "grad_norm": 0.33974507451057434, "learning_rate": 3.2987551867219918e-06, "logits/chosen": 2.2866456508636475, "logits/rejected": 3.3971104621887207, "logps/chosen": -551.7151489257812, "logps/rejected": -930.0809326171875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -11.42250919342041, "rewards/margins": 25.322158813476562, "rewards/rejected": -36.744667053222656, "step": 1959 }, { "epoch": 1.219284603421462, "grad_norm": 0.031288594007492065, "learning_rate": 3.297602581834947e-06, "logits/chosen": -1.6816962957382202, "logits/rejected": 1.167089819908142, "logps/chosen": -536.8770751953125, "logps/rejected": -976.789306640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.510819435119629, "rewards/margins": 25.894309997558594, "rewards/rejected": -33.405128479003906, "step": 1960 }, { "epoch": 1.2199066874027993, "grad_norm": 0.018396716564893723, "learning_rate": 3.2964499769479026e-06, "logits/chosen": 2.6608729362487793, "logits/rejected": 3.563199520111084, "logps/chosen": -745.041748046875, "logps/rejected": -1060.1583251953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.359828948974609, "rewards/margins": 28.92093276977539, "rewards/rejected": -36.28076171875, "step": 1961 }, { "epoch": 1.2205287713841368, "grad_norm": 0.0005018580704927444, "learning_rate": 3.295297372060858e-06, "logits/chosen": -1.0413039922714233, "logits/rejected": 2.514838457107544, "logps/chosen": -389.07940673828125, "logps/rejected": -1057.867919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.18116283416748, "rewards/margins": 42.0729866027832, "rewards/rejected": -52.254150390625, "step": 1962 }, { "epoch": 1.2211508553654744, "grad_norm": 4.92830122311716e-06, "learning_rate": 3.294144767173813e-06, "logits/chosen": 0.7838652729988098, "logits/rejected": 3.6844985485076904, "logps/chosen": -555.4747314453125, "logps/rejected": -1020.4698486328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.917938232421875, "rewards/margins": 30.011611938476562, "rewards/rejected": -40.92955017089844, "step": 1963 }, { "epoch": 1.2217729393468117, "grad_norm": 0.00024959229631349444, "learning_rate": 3.2929921622867683e-06, "logits/chosen": -0.8331981897354126, "logits/rejected": 2.0492069721221924, "logps/chosen": -357.7832946777344, "logps/rejected": -851.5213623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.565192222595215, "rewards/margins": 34.37034606933594, "rewards/rejected": -38.93553924560547, "step": 1964 }, { "epoch": 1.2223950233281493, "grad_norm": 10.026619911193848, "learning_rate": 3.2918395573997235e-06, "logits/chosen": 1.6410657167434692, "logits/rejected": 3.0660173892974854, "logps/chosen": -495.8389892578125, "logps/rejected": -751.0767822265625, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": -7.825559616088867, "rewards/margins": 16.955659866333008, "rewards/rejected": -24.781219482421875, "step": 1965 }, { "epoch": 1.2230171073094869, "grad_norm": 0.003237862139940262, "learning_rate": 3.2906869525126788e-06, "logits/chosen": -0.04731714725494385, "logits/rejected": 1.0842527151107788, "logps/chosen": -526.9979858398438, "logps/rejected": -806.66064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.047800064086914, "rewards/margins": 18.901748657226562, "rewards/rejected": -28.949548721313477, "step": 1966 }, { "epoch": 1.2236391912908242, "grad_norm": 0.037597037851810455, "learning_rate": 3.289534347625634e-06, "logits/chosen": 1.0335030555725098, "logits/rejected": 3.654738426208496, "logps/chosen": -643.566162109375, "logps/rejected": -1147.51953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.852840423583984, "rewards/margins": 31.7597599029541, "rewards/rejected": -42.61259841918945, "step": 1967 }, { "epoch": 1.2242612752721618, "grad_norm": 1.8747437934507616e-05, "learning_rate": 3.2883817427385896e-06, "logits/chosen": 0.3725661039352417, "logits/rejected": 2.0947065353393555, "logps/chosen": -600.5690307617188, "logps/rejected": -1068.793212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.553552627563477, "rewards/margins": 31.938674926757812, "rewards/rejected": -42.492225646972656, "step": 1968 }, { "epoch": 1.2248833592534991, "grad_norm": 0.0005407995195128024, "learning_rate": 3.287229137851545e-06, "logits/chosen": -1.3575810194015503, "logits/rejected": 1.826256513595581, "logps/chosen": -450.8987121582031, "logps/rejected": -844.3551635742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.845052719116211, "rewards/margins": 22.745574951171875, "rewards/rejected": -27.59062957763672, "step": 1969 }, { "epoch": 1.2255054432348367, "grad_norm": 0.02516097202897072, "learning_rate": 3.2860765329645e-06, "logits/chosen": -0.5253548622131348, "logits/rejected": 2.288137435913086, "logps/chosen": -445.89776611328125, "logps/rejected": -847.6441650390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.445370674133301, "rewards/margins": 23.657224655151367, "rewards/rejected": -31.102596282958984, "step": 1970 }, { "epoch": 1.2261275272161742, "grad_norm": 0.004528042860329151, "learning_rate": 3.2849239280774553e-06, "logits/chosen": -0.6673039197921753, "logits/rejected": 4.811821937561035, "logps/chosen": -388.5225524902344, "logps/rejected": -1090.91552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.815102577209473, "rewards/margins": 32.85487365722656, "rewards/rejected": -40.66997528076172, "step": 1971 }, { "epoch": 1.2267496111975116, "grad_norm": 27.828004837036133, "learning_rate": 3.2837713231904105e-06, "logits/chosen": -0.22079074382781982, "logits/rejected": 1.5481947660446167, "logps/chosen": -481.0959167480469, "logps/rejected": -800.5830078125, "loss": 0.3633, "rewards/accuracies": 0.875, "rewards/chosen": -9.814485549926758, "rewards/margins": 17.76893424987793, "rewards/rejected": -27.583419799804688, "step": 1972 }, { "epoch": 1.2273716951788491, "grad_norm": 1.7662297295828466e-06, "learning_rate": 3.2826187183033658e-06, "logits/chosen": -1.9919651746749878, "logits/rejected": 3.906245231628418, "logps/chosen": -400.87591552734375, "logps/rejected": -1072.261962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.9460577964782715, "rewards/margins": 34.99480438232422, "rewards/rejected": -41.94086456298828, "step": 1973 }, { "epoch": 1.2279937791601867, "grad_norm": 36.82212829589844, "learning_rate": 3.281466113416321e-06, "logits/chosen": 1.6460797786712646, "logits/rejected": 4.4644365310668945, "logps/chosen": -531.8225708007812, "logps/rejected": -957.0294189453125, "loss": 1.1201, "rewards/accuracies": 0.875, "rewards/chosen": -10.023094177246094, "rewards/margins": 26.032180786132812, "rewards/rejected": -36.055274963378906, "step": 1974 }, { "epoch": 1.228615863141524, "grad_norm": 0.004306244198232889, "learning_rate": 3.2803135085292766e-06, "logits/chosen": 0.5558570027351379, "logits/rejected": 1.4123010635375977, "logps/chosen": -666.03662109375, "logps/rejected": -970.247314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.882766723632812, "rewards/margins": 25.52187156677246, "rewards/rejected": -39.404640197753906, "step": 1975 }, { "epoch": 1.2292379471228616, "grad_norm": 2.109964370727539, "learning_rate": 3.279160903642232e-06, "logits/chosen": -0.5743099451065063, "logits/rejected": 1.87367844581604, "logps/chosen": -483.983154296875, "logps/rejected": -847.1797485351562, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -7.363353729248047, "rewards/margins": 26.407093048095703, "rewards/rejected": -33.77044677734375, "step": 1976 }, { "epoch": 1.2298600311041992, "grad_norm": 0.10898027569055557, "learning_rate": 3.278008298755187e-06, "logits/chosen": 1.553083062171936, "logits/rejected": 2.812044143676758, "logps/chosen": -544.963134765625, "logps/rejected": -787.267578125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.337335586547852, "rewards/margins": 15.037760734558105, "rewards/rejected": -23.375097274780273, "step": 1977 }, { "epoch": 1.2304821150855365, "grad_norm": 1.6577892303466797, "learning_rate": 3.2768556938681423e-06, "logits/chosen": -0.6032044887542725, "logits/rejected": 2.0228164196014404, "logps/chosen": -337.15655517578125, "logps/rejected": -684.0770874023438, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": -4.9896721839904785, "rewards/margins": 17.63288688659668, "rewards/rejected": -22.622560501098633, "step": 1978 }, { "epoch": 1.231104199066874, "grad_norm": 0.0017263826448470354, "learning_rate": 3.2757030889810975e-06, "logits/chosen": -0.22079506516456604, "logits/rejected": 4.21293830871582, "logps/chosen": -463.0474853515625, "logps/rejected": -1015.839111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.130630016326904, "rewards/margins": 31.987415313720703, "rewards/rejected": -38.118045806884766, "step": 1979 }, { "epoch": 1.2317262830482114, "grad_norm": 0.0001258711126865819, "learning_rate": 3.2745504840940528e-06, "logits/chosen": 0.06448210775852203, "logits/rejected": 3.2480227947235107, "logps/chosen": -513.82177734375, "logps/rejected": -953.921630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.06885051727295, "rewards/margins": 28.184560775756836, "rewards/rejected": -38.25341033935547, "step": 1980 }, { "epoch": 1.232348367029549, "grad_norm": 0.003937113098800182, "learning_rate": 3.273397879207008e-06, "logits/chosen": -0.053459495306015015, "logits/rejected": 2.767993927001953, "logps/chosen": -453.16973876953125, "logps/rejected": -907.441162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.106377601623535, "rewards/margins": 22.306228637695312, "rewards/rejected": -31.41260528564453, "step": 1981 }, { "epoch": 1.2329704510108865, "grad_norm": 2.3660552501678467, "learning_rate": 3.2722452743199636e-06, "logits/chosen": 0.03287597745656967, "logits/rejected": 2.237107753753662, "logps/chosen": -492.45489501953125, "logps/rejected": -861.8848876953125, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -11.626747131347656, "rewards/margins": 24.731849670410156, "rewards/rejected": -36.35859680175781, "step": 1982 }, { "epoch": 1.2335925349922239, "grad_norm": 41.7318115234375, "learning_rate": 3.271092669432919e-06, "logits/chosen": 0.4554020166397095, "logits/rejected": 3.25327730178833, "logps/chosen": -608.2691650390625, "logps/rejected": -1019.5993041992188, "loss": 1.0557, "rewards/accuracies": 0.875, "rewards/chosen": -11.943293571472168, "rewards/margins": 25.79165267944336, "rewards/rejected": -37.734947204589844, "step": 1983 }, { "epoch": 1.2342146189735614, "grad_norm": 0.2129361927509308, "learning_rate": 3.269940064545874e-06, "logits/chosen": -0.07215934991836548, "logits/rejected": 2.628373384475708, "logps/chosen": -618.8433227539062, "logps/rejected": -1128.62060546875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.97716236114502, "rewards/margins": 31.908266067504883, "rewards/rejected": -40.88542938232422, "step": 1984 }, { "epoch": 1.234836702954899, "grad_norm": 25.82171058654785, "learning_rate": 3.2687874596588293e-06, "logits/chosen": 1.560760736465454, "logits/rejected": 2.470482587814331, "logps/chosen": -685.6170654296875, "logps/rejected": -946.1788330078125, "loss": 0.2324, "rewards/accuracies": 0.875, "rewards/chosen": -8.528305053710938, "rewards/margins": 20.159406661987305, "rewards/rejected": -28.687711715698242, "step": 1985 }, { "epoch": 1.2354587869362363, "grad_norm": 0.010561628267168999, "learning_rate": 3.2676348547717845e-06, "logits/chosen": 0.25866788625717163, "logits/rejected": 2.066354751586914, "logps/chosen": -548.3967895507812, "logps/rejected": -968.6322021484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.815115928649902, "rewards/margins": 26.51719856262207, "rewards/rejected": -39.332313537597656, "step": 1986 }, { "epoch": 1.236080870917574, "grad_norm": 39.62969207763672, "learning_rate": 3.2664822498847397e-06, "logits/chosen": 1.6390573978424072, "logits/rejected": 3.757746696472168, "logps/chosen": -676.7535400390625, "logps/rejected": -942.8658447265625, "loss": 0.7047, "rewards/accuracies": 0.75, "rewards/chosen": -8.365790367126465, "rewards/margins": 15.35942268371582, "rewards/rejected": -23.72521209716797, "step": 1987 }, { "epoch": 1.2367029548989112, "grad_norm": 2.783613681793213, "learning_rate": 3.265329644997695e-06, "logits/chosen": 1.4804677963256836, "logits/rejected": 2.917877197265625, "logps/chosen": -530.9738159179688, "logps/rejected": -870.9774169921875, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -7.374899387359619, "rewards/margins": 26.20675277709961, "rewards/rejected": -33.5816535949707, "step": 1988 }, { "epoch": 1.2373250388802488, "grad_norm": 0.20069032907485962, "learning_rate": 3.26417704011065e-06, "logits/chosen": 0.017816901206970215, "logits/rejected": 3.8825366497039795, "logps/chosen": -483.43939208984375, "logps/rejected": -911.643798828125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -5.31286096572876, "rewards/margins": 21.46490478515625, "rewards/rejected": -26.77776527404785, "step": 1989 }, { "epoch": 1.2379471228615864, "grad_norm": 0.6583080887794495, "learning_rate": 3.263024435223606e-06, "logits/chosen": 3.9078497886657715, "logits/rejected": 3.138378381729126, "logps/chosen": -599.2481079101562, "logps/rejected": -749.1458740234375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -8.0813570022583, "rewards/margins": 19.185405731201172, "rewards/rejected": -27.26676368713379, "step": 1990 }, { "epoch": 1.2385692068429237, "grad_norm": 0.01927441544830799, "learning_rate": 3.261871830336561e-06, "logits/chosen": 2.222565174102783, "logits/rejected": 4.206331729888916, "logps/chosen": -617.39990234375, "logps/rejected": -958.8414306640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.376999378204346, "rewards/margins": 23.93924331665039, "rewards/rejected": -28.316242218017578, "step": 1991 }, { "epoch": 1.2391912908242613, "grad_norm": 3.1013777256011963, "learning_rate": 3.2607192254495163e-06, "logits/chosen": -1.4709454774856567, "logits/rejected": 1.9248082637786865, "logps/chosen": -505.6230773925781, "logps/rejected": -811.3922119140625, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -10.807146072387695, "rewards/margins": 13.65088939666748, "rewards/rejected": -24.45803451538086, "step": 1992 }, { "epoch": 1.2398133748055988, "grad_norm": 25.103227615356445, "learning_rate": 3.2595666205624715e-06, "logits/chosen": 0.7945823073387146, "logits/rejected": 4.353921413421631, "logps/chosen": -557.3357543945312, "logps/rejected": -973.616943359375, "loss": 0.3781, "rewards/accuracies": 0.875, "rewards/chosen": -8.110764503479004, "rewards/margins": 25.25164222717285, "rewards/rejected": -33.362403869628906, "step": 1993 }, { "epoch": 1.2404354587869362, "grad_norm": 6.928995571797714e-05, "learning_rate": 3.2584140156754267e-06, "logits/chosen": -1.121807336807251, "logits/rejected": 3.6673836708068848, "logps/chosen": -458.9044189453125, "logps/rejected": -1019.62060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.898647785186768, "rewards/margins": 24.41216278076172, "rewards/rejected": -30.31081199645996, "step": 1994 }, { "epoch": 1.2410575427682737, "grad_norm": 0.7680820226669312, "learning_rate": 3.257261410788382e-06, "logits/chosen": -1.2524921894073486, "logits/rejected": 0.39433813095092773, "logps/chosen": -464.68634033203125, "logps/rejected": -828.046142578125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -6.972171306610107, "rewards/margins": 24.00033187866211, "rewards/rejected": -30.97249984741211, "step": 1995 }, { "epoch": 1.2416796267496113, "grad_norm": 4.20468268202967e-06, "learning_rate": 3.256108805901337e-06, "logits/chosen": 1.3287625312805176, "logits/rejected": 2.1750547885894775, "logps/chosen": -659.4127197265625, "logps/rejected": -1102.378662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.000626564025879, "rewards/margins": 29.679622650146484, "rewards/rejected": -42.68025207519531, "step": 1996 }, { "epoch": 1.2423017107309486, "grad_norm": 29.853729248046875, "learning_rate": 3.254956201014293e-06, "logits/chosen": -2.877234697341919, "logits/rejected": 3.3251304626464844, "logps/chosen": -391.4083557128906, "logps/rejected": -1055.4921875, "loss": 0.318, "rewards/accuracies": 0.875, "rewards/chosen": -6.726519584655762, "rewards/margins": 30.075284957885742, "rewards/rejected": -36.80180358886719, "step": 1997 }, { "epoch": 1.2429237947122862, "grad_norm": 0.00010009088146034628, "learning_rate": 3.253803596127248e-06, "logits/chosen": 0.9821017384529114, "logits/rejected": 3.299887180328369, "logps/chosen": -577.6727294921875, "logps/rejected": -1019.5888671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.769001007080078, "rewards/margins": 27.716604232788086, "rewards/rejected": -33.48560333251953, "step": 1998 }, { "epoch": 1.2435458786936238, "grad_norm": 1.9293725927127525e-05, "learning_rate": 3.2526509912402033e-06, "logits/chosen": -0.9194085001945496, "logits/rejected": 1.708883285522461, "logps/chosen": -453.50048828125, "logps/rejected": -840.7008056640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.381048202514648, "rewards/margins": 25.372150421142578, "rewards/rejected": -30.753196716308594, "step": 1999 }, { "epoch": 1.244167962674961, "grad_norm": 0.05470879748463631, "learning_rate": 3.2514983863531585e-06, "logits/chosen": -2.1505768299102783, "logits/rejected": 3.283268928527832, "logps/chosen": -362.8547668457031, "logps/rejected": -830.670166015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.526420593261719, "rewards/margins": 21.226747512817383, "rewards/rejected": -27.7531681060791, "step": 2000 }, { "epoch": 1.2447900466562987, "grad_norm": 3.966289520263672, "learning_rate": 3.2503457814661137e-06, "logits/chosen": -2.3252477645874023, "logits/rejected": 1.5557923316955566, "logps/chosen": -499.74041748046875, "logps/rejected": -917.0076904296875, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": -10.588899612426758, "rewards/margins": 19.9823055267334, "rewards/rejected": -30.571205139160156, "step": 2001 }, { "epoch": 1.245412130637636, "grad_norm": 0.007073727436363697, "learning_rate": 3.249193176579069e-06, "logits/chosen": -2.3345444202423096, "logits/rejected": 0.8063172698020935, "logps/chosen": -391.07745361328125, "logps/rejected": -796.2643432617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.851896286010742, "rewards/margins": 20.569223403930664, "rewards/rejected": -28.421119689941406, "step": 2002 }, { "epoch": 1.2460342146189736, "grad_norm": 8.042104309424758e-05, "learning_rate": 3.248040571692024e-06, "logits/chosen": -0.987544596195221, "logits/rejected": 2.0049033164978027, "logps/chosen": -505.7494201660156, "logps/rejected": -1006.3981323242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.076696872711182, "rewards/margins": 24.556446075439453, "rewards/rejected": -31.633142471313477, "step": 2003 }, { "epoch": 1.2466562986003111, "grad_norm": 7.321867145514051e-15, "learning_rate": 3.24688796680498e-06, "logits/chosen": -0.5694581270217896, "logits/rejected": 3.1910927295684814, "logps/chosen": -467.1485595703125, "logps/rejected": -1130.46142578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.254088401794434, "rewards/margins": 44.00873565673828, "rewards/rejected": -48.26282501220703, "step": 2004 }, { "epoch": 1.2472783825816485, "grad_norm": 7.13050667400239e-06, "learning_rate": 3.245735361917935e-06, "logits/chosen": -0.6603339910507202, "logits/rejected": 2.0062170028686523, "logps/chosen": -396.538818359375, "logps/rejected": -916.5963134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.292914867401123, "rewards/margins": 31.59789276123047, "rewards/rejected": -35.89080810546875, "step": 2005 }, { "epoch": 1.247900466562986, "grad_norm": 0.019461452960968018, "learning_rate": 3.2445827570308903e-06, "logits/chosen": -2.6960866451263428, "logits/rejected": 3.560029983520508, "logps/chosen": -458.8998107910156, "logps/rejected": -1042.880859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.916061401367188, "rewards/margins": 28.842018127441406, "rewards/rejected": -38.75807571411133, "step": 2006 }, { "epoch": 1.2485225505443234, "grad_norm": 3.5628645420074463, "learning_rate": 3.2434301521438455e-06, "logits/chosen": 1.0278129577636719, "logits/rejected": 1.1622637510299683, "logps/chosen": -554.2152099609375, "logps/rejected": -736.61376953125, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -6.3448333740234375, "rewards/margins": 14.856306076049805, "rewards/rejected": -21.201141357421875, "step": 2007 }, { "epoch": 1.249144634525661, "grad_norm": 0.07135644555091858, "learning_rate": 3.2422775472568007e-06, "logits/chosen": 1.733124017715454, "logits/rejected": 4.378563404083252, "logps/chosen": -640.43896484375, "logps/rejected": -1003.464599609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.873009204864502, "rewards/margins": 19.59971046447754, "rewards/rejected": -25.472719192504883, "step": 2008 }, { "epoch": 1.2497667185069985, "grad_norm": 7.073273877722386e-07, "learning_rate": 3.241124942369756e-06, "logits/chosen": 0.5708224773406982, "logits/rejected": 3.2701048851013184, "logps/chosen": -563.2235717773438, "logps/rejected": -1009.9588012695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.070895195007324, "rewards/margins": 29.836795806884766, "rewards/rejected": -35.907691955566406, "step": 2009 }, { "epoch": 1.2503888024883358, "grad_norm": 0.0315566249191761, "learning_rate": 3.239972337482711e-06, "logits/chosen": 1.7904354333877563, "logits/rejected": 1.5742547512054443, "logps/chosen": -606.237060546875, "logps/rejected": -813.814208984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.600082874298096, "rewards/margins": 21.077608108520508, "rewards/rejected": -27.677692413330078, "step": 2010 }, { "epoch": 1.2510108864696734, "grad_norm": 14.575484275817871, "learning_rate": 3.2388197325956664e-06, "logits/chosen": 0.15945017337799072, "logits/rejected": 2.334524154663086, "logps/chosen": -357.0718078613281, "logps/rejected": -729.5298461914062, "loss": 0.1145, "rewards/accuracies": 0.875, "rewards/chosen": -0.7053083777427673, "rewards/margins": 26.521766662597656, "rewards/rejected": -27.227073669433594, "step": 2011 }, { "epoch": 1.251632970451011, "grad_norm": 4.577037543640472e-05, "learning_rate": 3.237667127708622e-06, "logits/chosen": -0.05641406774520874, "logits/rejected": 3.0679242610931396, "logps/chosen": -470.065185546875, "logps/rejected": -942.2273559570312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.128070831298828, "rewards/margins": 29.991792678833008, "rewards/rejected": -38.11986541748047, "step": 2012 }, { "epoch": 1.2522550544323483, "grad_norm": 36.018550872802734, "learning_rate": 3.2365145228215773e-06, "logits/chosen": 3.5689821243286133, "logits/rejected": 2.6732563972473145, "logps/chosen": -845.01318359375, "logps/rejected": -962.8516845703125, "loss": 1.7042, "rewards/accuracies": 0.875, "rewards/chosen": -12.484785079956055, "rewards/margins": 17.063976287841797, "rewards/rejected": -29.54875946044922, "step": 2013 }, { "epoch": 1.2528771384136859, "grad_norm": 4.052655640407465e-06, "learning_rate": 3.2353619179345325e-06, "logits/chosen": 1.3075627088546753, "logits/rejected": 5.013179779052734, "logps/chosen": -513.4508056640625, "logps/rejected": -1001.1782836914062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.508722305297852, "rewards/margins": 29.324068069458008, "rewards/rejected": -37.83279037475586, "step": 2014 }, { "epoch": 1.2534992223950234, "grad_norm": 0.396076500415802, "learning_rate": 3.2342093130474877e-06, "logits/chosen": -0.08282530307769775, "logits/rejected": 2.28505802154541, "logps/chosen": -409.0604248046875, "logps/rejected": -751.6499633789062, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -5.9696044921875, "rewards/margins": 19.8177547454834, "rewards/rejected": -25.7873592376709, "step": 2015 }, { "epoch": 1.2541213063763608, "grad_norm": 22.250051498413086, "learning_rate": 3.233056708160443e-06, "logits/chosen": 2.5599923133850098, "logits/rejected": 2.784825325012207, "logps/chosen": -707.2561645507812, "logps/rejected": -902.107666015625, "loss": 0.1687, "rewards/accuracies": 0.875, "rewards/chosen": -7.913710594177246, "rewards/margins": 21.890016555786133, "rewards/rejected": -29.803726196289062, "step": 2016 }, { "epoch": 1.2547433903576983, "grad_norm": 3.3847265967779094e-07, "learning_rate": 3.231904103273398e-06, "logits/chosen": -0.3941129446029663, "logits/rejected": 2.676504135131836, "logps/chosen": -527.0136108398438, "logps/rejected": -865.6759033203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.5630974769592285, "rewards/margins": 25.08944320678711, "rewards/rejected": -29.652542114257812, "step": 2017 }, { "epoch": 1.255365474339036, "grad_norm": 1.4325063228607178, "learning_rate": 3.2307514983863534e-06, "logits/chosen": -0.9161220192909241, "logits/rejected": 2.9123919010162354, "logps/chosen": -465.49871826171875, "logps/rejected": -955.76611328125, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -5.465087890625, "rewards/margins": 24.841506958007812, "rewards/rejected": -30.306594848632812, "step": 2018 }, { "epoch": 1.2559875583203732, "grad_norm": 0.0013318108394742012, "learning_rate": 3.229598893499309e-06, "logits/chosen": 3.270078659057617, "logits/rejected": 3.45019268989563, "logps/chosen": -590.9571533203125, "logps/rejected": -902.8822021484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.184449672698975, "rewards/margins": 22.32142448425293, "rewards/rejected": -29.505874633789062, "step": 2019 }, { "epoch": 1.2566096423017108, "grad_norm": 0.0002312797005288303, "learning_rate": 3.2284462886122643e-06, "logits/chosen": 1.7641077041625977, "logits/rejected": 3.2676451206207275, "logps/chosen": -574.0552978515625, "logps/rejected": -958.4447631835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.362613677978516, "rewards/margins": 22.630130767822266, "rewards/rejected": -28.99274444580078, "step": 2020 }, { "epoch": 1.2572317262830481, "grad_norm": 16.431461334228516, "learning_rate": 3.2272936837252195e-06, "logits/chosen": 1.4729183912277222, "logits/rejected": 4.335656642913818, "logps/chosen": -618.861572265625, "logps/rejected": -970.4544677734375, "loss": 0.1198, "rewards/accuracies": 0.875, "rewards/chosen": -2.866339683532715, "rewards/margins": 20.629886627197266, "rewards/rejected": -23.496227264404297, "step": 2021 }, { "epoch": 1.2578538102643857, "grad_norm": 2.040773868560791, "learning_rate": 3.2261410788381747e-06, "logits/chosen": 0.7448826432228088, "logits/rejected": 3.73989200592041, "logps/chosen": -474.6575012207031, "logps/rejected": -845.82080078125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -5.902726173400879, "rewards/margins": 24.317073822021484, "rewards/rejected": -30.219797134399414, "step": 2022 }, { "epoch": 1.258475894245723, "grad_norm": 0.001415720907971263, "learning_rate": 3.22498847395113e-06, "logits/chosen": -3.1525511741638184, "logits/rejected": 3.938098430633545, "logps/chosen": -251.63796997070312, "logps/rejected": -826.9188842773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.161817789077759, "rewards/margins": 26.74517059326172, "rewards/rejected": -28.906986236572266, "step": 2023 }, { "epoch": 1.2590979782270606, "grad_norm": 2.495192766189575, "learning_rate": 3.223835869064085e-06, "logits/chosen": 1.8843852281570435, "logits/rejected": 3.638036012649536, "logps/chosen": -599.0542602539062, "logps/rejected": -920.1378784179688, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -7.660336494445801, "rewards/margins": 21.504348754882812, "rewards/rejected": -29.164684295654297, "step": 2024 }, { "epoch": 1.2597200622083982, "grad_norm": 0.10371783375740051, "learning_rate": 3.2226832641770404e-06, "logits/chosen": -3.003751754760742, "logits/rejected": 2.675790548324585, "logps/chosen": -427.89923095703125, "logps/rejected": -978.2462158203125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.760486125946045, "rewards/margins": 27.6287841796875, "rewards/rejected": -32.38926696777344, "step": 2025 }, { "epoch": 1.2603421461897355, "grad_norm": 12.984769821166992, "learning_rate": 3.221530659289996e-06, "logits/chosen": 0.3211978077888489, "logits/rejected": 2.3837990760803223, "logps/chosen": -542.7652587890625, "logps/rejected": -1028.8651123046875, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": -5.041831970214844, "rewards/margins": 29.209022521972656, "rewards/rejected": -34.2508544921875, "step": 2026 }, { "epoch": 1.260964230171073, "grad_norm": 0.0963178277015686, "learning_rate": 3.2203780544029513e-06, "logits/chosen": 1.8055074214935303, "logits/rejected": 4.252056121826172, "logps/chosen": -537.7628173828125, "logps/rejected": -976.2918701171875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -8.120723724365234, "rewards/margins": 26.42084503173828, "rewards/rejected": -34.54157257080078, "step": 2027 }, { "epoch": 1.2615863141524106, "grad_norm": 0.031602680683135986, "learning_rate": 3.2192254495159065e-06, "logits/chosen": 0.5145872235298157, "logits/rejected": 3.8570258617401123, "logps/chosen": -490.4646301269531, "logps/rejected": -889.964599609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.190765857696533, "rewards/margins": 19.79231071472168, "rewards/rejected": -23.983078002929688, "step": 2028 }, { "epoch": 1.262208398133748, "grad_norm": 3.2892661094665527, "learning_rate": 3.2180728446288617e-06, "logits/chosen": 2.532592296600342, "logits/rejected": 2.6494524478912354, "logps/chosen": -501.0936279296875, "logps/rejected": -779.4171752929688, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": -11.345937728881836, "rewards/margins": 20.526443481445312, "rewards/rejected": -31.872379302978516, "step": 2029 }, { "epoch": 1.2628304821150855, "grad_norm": 0.1048092469573021, "learning_rate": 3.216920239741817e-06, "logits/chosen": 0.8485032320022583, "logits/rejected": 2.9057791233062744, "logps/chosen": -572.03857421875, "logps/rejected": -965.0180053710938, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.7810775637626648, "rewards/margins": 28.03656005859375, "rewards/rejected": -28.817636489868164, "step": 2030 }, { "epoch": 1.263452566096423, "grad_norm": 0.04916305094957352, "learning_rate": 3.215767634854772e-06, "logits/chosen": 2.4191346168518066, "logits/rejected": 5.252843379974365, "logps/chosen": -629.9830932617188, "logps/rejected": -1004.4718627929688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.305938243865967, "rewards/margins": 24.819896697998047, "rewards/rejected": -31.125835418701172, "step": 2031 }, { "epoch": 1.2640746500777604, "grad_norm": 4.426191298989579e-05, "learning_rate": 3.2146150299677274e-06, "logits/chosen": -0.6191399097442627, "logits/rejected": 2.3437392711639404, "logps/chosen": -436.3004150390625, "logps/rejected": -949.069580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.960773468017578, "rewards/margins": 28.112300872802734, "rewards/rejected": -37.07307052612305, "step": 2032 }, { "epoch": 1.264696734059098, "grad_norm": 0.016834422945976257, "learning_rate": 3.213462425080683e-06, "logits/chosen": 1.7167022228240967, "logits/rejected": 2.2886548042297363, "logps/chosen": -659.6353759765625, "logps/rejected": -891.0438842773438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.525177001953125, "rewards/margins": 16.286649703979492, "rewards/rejected": -26.811824798583984, "step": 2033 }, { "epoch": 1.2653188180404356, "grad_norm": 0.1283029019832611, "learning_rate": 3.2123098201936383e-06, "logits/chosen": -2.6232731342315674, "logits/rejected": 2.2754905223846436, "logps/chosen": -330.6903381347656, "logps/rejected": -941.403564453125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.9191455841064453, "rewards/margins": 31.921056747436523, "rewards/rejected": -33.84020233154297, "step": 2034 }, { "epoch": 1.265940902021773, "grad_norm": 2.896956357290037e-05, "learning_rate": 3.2111572153065935e-06, "logits/chosen": 0.8080716729164124, "logits/rejected": 2.0364251136779785, "logps/chosen": -618.687255859375, "logps/rejected": -909.19580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.720874309539795, "rewards/margins": 23.601850509643555, "rewards/rejected": -29.32272720336914, "step": 2035 }, { "epoch": 1.2665629860031105, "grad_norm": 0.4231943190097809, "learning_rate": 3.2100046104195487e-06, "logits/chosen": 0.10635495185852051, "logits/rejected": 2.2073330879211426, "logps/chosen": -602.9811401367188, "logps/rejected": -936.6310424804688, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -11.511858940124512, "rewards/margins": 18.06280517578125, "rewards/rejected": -29.574663162231445, "step": 2036 }, { "epoch": 1.267185069984448, "grad_norm": 0.10137329250574112, "learning_rate": 3.208852005532504e-06, "logits/chosen": 0.14811724424362183, "logits/rejected": 3.2426066398620605, "logps/chosen": -468.83148193359375, "logps/rejected": -961.1331176757812, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.441210746765137, "rewards/margins": 29.445789337158203, "rewards/rejected": -35.887001037597656, "step": 2037 }, { "epoch": 1.2678071539657854, "grad_norm": 0.0012554771965369582, "learning_rate": 3.207699400645459e-06, "logits/chosen": -1.054654598236084, "logits/rejected": 2.8366193771362305, "logps/chosen": -340.777099609375, "logps/rejected": -852.3719482421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.394537925720215, "rewards/margins": 27.311038970947266, "rewards/rejected": -31.705581665039062, "step": 2038 }, { "epoch": 1.268429237947123, "grad_norm": 0.002670434070751071, "learning_rate": 3.2065467957584144e-06, "logits/chosen": 0.28375720977783203, "logits/rejected": 4.042474269866943, "logps/chosen": -600.4718627929688, "logps/rejected": -1158.3646240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.513896942138672, "rewards/margins": 30.927602767944336, "rewards/rejected": -37.44150161743164, "step": 2039 }, { "epoch": 1.2690513219284603, "grad_norm": 3.808347901212983e-05, "learning_rate": 3.2053941908713696e-06, "logits/chosen": 1.5050125122070312, "logits/rejected": 4.318361759185791, "logps/chosen": -664.9024658203125, "logps/rejected": -1053.49609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.091887950897217, "rewards/margins": 28.934024810791016, "rewards/rejected": -35.025909423828125, "step": 2040 }, { "epoch": 1.2696734059097978, "grad_norm": 0.009404631331562996, "learning_rate": 3.2042415859843253e-06, "logits/chosen": -1.170180320739746, "logits/rejected": 2.024054765701294, "logps/chosen": -408.93048095703125, "logps/rejected": -817.9940185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.462571620941162, "rewards/margins": 18.640281677246094, "rewards/rejected": -25.102855682373047, "step": 2041 }, { "epoch": 1.2702954898911352, "grad_norm": 1.738207538437564e-05, "learning_rate": 3.2030889810972796e-06, "logits/chosen": 2.111595392227173, "logits/rejected": 4.570797443389893, "logps/chosen": -592.1729125976562, "logps/rejected": -1171.582763671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.525852203369141, "rewards/margins": 41.40595626831055, "rewards/rejected": -48.93180847167969, "step": 2042 }, { "epoch": 1.2709175738724727, "grad_norm": 1.4109896421432495, "learning_rate": 3.2019363762102353e-06, "logits/chosen": 0.4325029253959656, "logits/rejected": 3.582610607147217, "logps/chosen": -439.2037658691406, "logps/rejected": -752.7412109375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -5.916749477386475, "rewards/margins": 15.604118347167969, "rewards/rejected": -21.5208683013916, "step": 2043 }, { "epoch": 1.2715396578538103, "grad_norm": 0.002375447889789939, "learning_rate": 3.2007837713231905e-06, "logits/chosen": 0.35185641050338745, "logits/rejected": 2.958373785018921, "logps/chosen": -416.5084228515625, "logps/rejected": -772.28125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.487848281860352, "rewards/margins": 21.232372283935547, "rewards/rejected": -26.72022247314453, "step": 2044 }, { "epoch": 1.2721617418351476, "grad_norm": 0.1174873635172844, "learning_rate": 3.1996311664361457e-06, "logits/chosen": -0.5361908078193665, "logits/rejected": 3.4455199241638184, "logps/chosen": -454.63323974609375, "logps/rejected": -859.8018798828125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.4316816329956055, "rewards/margins": 16.827882766723633, "rewards/rejected": -21.259565353393555, "step": 2045 }, { "epoch": 1.2727838258164852, "grad_norm": 0.022169746458530426, "learning_rate": 3.198478561549101e-06, "logits/chosen": 1.50562584400177, "logits/rejected": 2.5826327800750732, "logps/chosen": -629.9881591796875, "logps/rejected": -940.7921142578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.677554607391357, "rewards/margins": 23.858470916748047, "rewards/rejected": -29.536026000976562, "step": 2046 }, { "epoch": 1.2734059097978228, "grad_norm": 0.00024391288752667606, "learning_rate": 3.197325956662056e-06, "logits/chosen": -0.16936111450195312, "logits/rejected": 3.6262879371643066, "logps/chosen": -429.77386474609375, "logps/rejected": -943.689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.290872573852539, "rewards/margins": 28.654075622558594, "rewards/rejected": -32.944950103759766, "step": 2047 }, { "epoch": 1.27402799377916, "grad_norm": 0.09606008976697922, "learning_rate": 3.1961733517750114e-06, "logits/chosen": -1.716057538986206, "logits/rejected": 2.7073190212249756, "logps/chosen": -440.507080078125, "logps/rejected": -845.6986083984375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.6731438636779785, "rewards/margins": 22.331708908081055, "rewards/rejected": -28.004854202270508, "step": 2048 }, { "epoch": 1.2746500777604977, "grad_norm": 0.04162227362394333, "learning_rate": 3.1950207468879666e-06, "logits/chosen": -2.305849075317383, "logits/rejected": 3.7094991207122803, "logps/chosen": -292.4382019042969, "logps/rejected": -912.4019775390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.582938194274902, "rewards/margins": 30.001415252685547, "rewards/rejected": -34.5843505859375, "step": 2049 }, { "epoch": 1.2752721617418352, "grad_norm": 7.29586124420166, "learning_rate": 3.1938681420009223e-06, "logits/chosen": 0.8130073547363281, "logits/rejected": 2.7934420108795166, "logps/chosen": -439.5497131347656, "logps/rejected": -721.2268676757812, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": -9.392171859741211, "rewards/margins": 15.094999313354492, "rewards/rejected": -24.487171173095703, "step": 2050 }, { "epoch": 1.2758942457231726, "grad_norm": 0.008065351285040379, "learning_rate": 3.1927155371138775e-06, "logits/chosen": 1.056755781173706, "logits/rejected": 4.707803726196289, "logps/chosen": -500.83953857421875, "logps/rejected": -960.3767700195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.4384870529174805, "rewards/margins": 24.951709747314453, "rewards/rejected": -31.390199661254883, "step": 2051 }, { "epoch": 1.2765163297045101, "grad_norm": 43.08720779418945, "learning_rate": 3.1915629322268327e-06, "logits/chosen": 2.101077079772949, "logits/rejected": 3.4182801246643066, "logps/chosen": -597.8524169921875, "logps/rejected": -868.7628784179688, "loss": 1.6764, "rewards/accuracies": 0.875, "rewards/chosen": -6.14328670501709, "rewards/margins": 16.80899429321289, "rewards/rejected": -22.952281951904297, "step": 2052 }, { "epoch": 1.2771384136858477, "grad_norm": 0.9350723624229431, "learning_rate": 3.190410327339788e-06, "logits/chosen": 0.829081118106842, "logits/rejected": 3.3746399879455566, "logps/chosen": -456.2769470214844, "logps/rejected": -832.5189208984375, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -7.106916427612305, "rewards/margins": 25.035396575927734, "rewards/rejected": -32.142311096191406, "step": 2053 }, { "epoch": 1.277760497667185, "grad_norm": 1.491163730621338, "learning_rate": 3.189257722452743e-06, "logits/chosen": -0.5811960101127625, "logits/rejected": 3.256408214569092, "logps/chosen": -448.40478515625, "logps/rejected": -795.5897827148438, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -6.50742769241333, "rewards/margins": 18.371578216552734, "rewards/rejected": -24.879005432128906, "step": 2054 }, { "epoch": 1.2783825816485226, "grad_norm": 0.00022475777950603515, "learning_rate": 3.1881051175656984e-06, "logits/chosen": -2.1531898975372314, "logits/rejected": 1.8872551918029785, "logps/chosen": -377.0928039550781, "logps/rejected": -985.4249267578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.719932556152344, "rewards/margins": 31.08294677734375, "rewards/rejected": -37.802879333496094, "step": 2055 }, { "epoch": 1.2790046656298601, "grad_norm": 0.0981423407793045, "learning_rate": 3.1869525126786536e-06, "logits/chosen": 0.9427670240402222, "logits/rejected": 3.304424285888672, "logps/chosen": -557.6295166015625, "logps/rejected": -969.7200927734375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.98606014251709, "rewards/margins": 25.926830291748047, "rewards/rejected": -32.91289138793945, "step": 2056 }, { "epoch": 1.2796267496111975, "grad_norm": 0.7095301151275635, "learning_rate": 3.1857999077916093e-06, "logits/chosen": -1.1271562576293945, "logits/rejected": 3.539489984512329, "logps/chosen": -500.1939697265625, "logps/rejected": -919.2050170898438, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -4.236118316650391, "rewards/margins": 19.31410026550293, "rewards/rejected": -23.550216674804688, "step": 2057 }, { "epoch": 1.280248833592535, "grad_norm": 0.01117491815239191, "learning_rate": 3.1846473029045645e-06, "logits/chosen": 0.3624744415283203, "logits/rejected": 2.193741798400879, "logps/chosen": -464.18096923828125, "logps/rejected": -894.273681640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.566399574279785, "rewards/margins": 26.122310638427734, "rewards/rejected": -32.68871307373047, "step": 2058 }, { "epoch": 1.2808709175738724, "grad_norm": 0.6809660196304321, "learning_rate": 3.1834946980175197e-06, "logits/chosen": -0.000639304518699646, "logits/rejected": 1.8177388906478882, "logps/chosen": -540.7205810546875, "logps/rejected": -900.8763427734375, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -10.371767044067383, "rewards/margins": 22.083099365234375, "rewards/rejected": -32.45486831665039, "step": 2059 }, { "epoch": 1.28149300155521, "grad_norm": 0.001697477768175304, "learning_rate": 3.182342093130475e-06, "logits/chosen": -0.06266975402832031, "logits/rejected": 3.489635467529297, "logps/chosen": -470.419677734375, "logps/rejected": -1003.0845947265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.994956970214844, "rewards/margins": 27.230785369873047, "rewards/rejected": -35.22574234008789, "step": 2060 }, { "epoch": 1.2821150855365475, "grad_norm": 0.18616902828216553, "learning_rate": 3.18118948824343e-06, "logits/chosen": 1.1356843709945679, "logits/rejected": 4.710424423217773, "logps/chosen": -583.5643920898438, "logps/rejected": -959.2979125976562, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.797614097595215, "rewards/margins": 23.102134704589844, "rewards/rejected": -27.89974594116211, "step": 2061 }, { "epoch": 1.2827371695178849, "grad_norm": 0.047538481652736664, "learning_rate": 3.1800368833563854e-06, "logits/chosen": -1.8425778150558472, "logits/rejected": 2.8900485038757324, "logps/chosen": -361.5750732421875, "logps/rejected": -856.4237670898438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.557888984680176, "rewards/margins": 26.233121871948242, "rewards/rejected": -31.791013717651367, "step": 2062 }, { "epoch": 1.2833592534992224, "grad_norm": 0.30076029896736145, "learning_rate": 3.1788842784693406e-06, "logits/chosen": 1.5041686296463013, "logits/rejected": 3.6929330825805664, "logps/chosen": -591.1065673828125, "logps/rejected": -993.000244140625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -8.452116012573242, "rewards/margins": 27.180265426635742, "rewards/rejected": -35.632381439208984, "step": 2063 }, { "epoch": 1.2839813374805598, "grad_norm": 0.7477201819419861, "learning_rate": 3.177731673582296e-06, "logits/chosen": -0.05225837230682373, "logits/rejected": 4.111126899719238, "logps/chosen": -477.04266357421875, "logps/rejected": -881.5646362304688, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -6.437117099761963, "rewards/margins": 25.369081497192383, "rewards/rejected": -31.80620002746582, "step": 2064 }, { "epoch": 1.2846034214618973, "grad_norm": 3.26867825606314e-06, "learning_rate": 3.1765790686952515e-06, "logits/chosen": 0.5494682788848877, "logits/rejected": 3.7316155433654785, "logps/chosen": -527.177001953125, "logps/rejected": -987.5484619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.781964302062988, "rewards/margins": 29.342601776123047, "rewards/rejected": -36.12456512451172, "step": 2065 }, { "epoch": 1.2852255054432349, "grad_norm": 1.0592074431770016e-05, "learning_rate": 3.1754264638082067e-06, "logits/chosen": 1.2520337104797363, "logits/rejected": 5.836635589599609, "logps/chosen": -480.50396728515625, "logps/rejected": -1135.91796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.007936000823975, "rewards/margins": 32.592288970947266, "rewards/rejected": -36.600223541259766, "step": 2066 }, { "epoch": 1.2858475894245722, "grad_norm": 2.0127735137939453, "learning_rate": 3.174273858921162e-06, "logits/chosen": 2.7770450115203857, "logits/rejected": 3.0169568061828613, "logps/chosen": -601.7747802734375, "logps/rejected": -867.8875732421875, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -7.067495346069336, "rewards/margins": 22.419506072998047, "rewards/rejected": -29.487001419067383, "step": 2067 }, { "epoch": 1.2864696734059098, "grad_norm": 0.13479429483413696, "learning_rate": 3.173121254034117e-06, "logits/chosen": 1.5869368314743042, "logits/rejected": 4.738080978393555, "logps/chosen": -596.6153564453125, "logps/rejected": -1134.51904296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -9.250545501708984, "rewards/margins": 30.297420501708984, "rewards/rejected": -39.5479621887207, "step": 2068 }, { "epoch": 1.2870917573872473, "grad_norm": 6.155482769012451, "learning_rate": 3.1719686491470724e-06, "logits/chosen": -1.9126255512237549, "logits/rejected": 1.934863805770874, "logps/chosen": -307.49249267578125, "logps/rejected": -780.62939453125, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": -5.279009819030762, "rewards/margins": 22.31053924560547, "rewards/rejected": -27.589548110961914, "step": 2069 }, { "epoch": 1.2877138413685847, "grad_norm": 0.27120184898376465, "learning_rate": 3.1708160442600276e-06, "logits/chosen": 2.8405706882476807, "logits/rejected": 4.2547526359558105, "logps/chosen": -458.87255859375, "logps/rejected": -801.9660034179688, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -5.665666580200195, "rewards/margins": 23.629499435424805, "rewards/rejected": -29.295169830322266, "step": 2070 }, { "epoch": 1.2883359253499223, "grad_norm": 36.809181213378906, "learning_rate": 3.169663439372983e-06, "logits/chosen": -3.2923147678375244, "logits/rejected": 3.318225383758545, "logps/chosen": -266.0099182128906, "logps/rejected": -859.7506103515625, "loss": 0.9804, "rewards/accuracies": 0.875, "rewards/chosen": -7.480389595031738, "rewards/margins": 24.918094635009766, "rewards/rejected": -32.39848327636719, "step": 2071 }, { "epoch": 1.2889580093312598, "grad_norm": 0.0019073631847277284, "learning_rate": 3.1685108344859385e-06, "logits/chosen": 2.2889578342437744, "logits/rejected": 3.743168354034424, "logps/chosen": -539.8269653320312, "logps/rejected": -848.6666870117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.808744430541992, "rewards/margins": 21.052974700927734, "rewards/rejected": -26.86172103881836, "step": 2072 }, { "epoch": 1.2895800933125972, "grad_norm": 0.00458146259188652, "learning_rate": 3.1673582295988937e-06, "logits/chosen": -0.6084625720977783, "logits/rejected": 2.6312525272369385, "logps/chosen": -308.7406005859375, "logps/rejected": -774.83544921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.150224208831787, "rewards/margins": 26.105060577392578, "rewards/rejected": -31.255285263061523, "step": 2073 }, { "epoch": 1.2902021772939347, "grad_norm": 7.325793266296387, "learning_rate": 3.166205624711849e-06, "logits/chosen": 0.36518922448158264, "logits/rejected": 1.6637272834777832, "logps/chosen": -547.444580078125, "logps/rejected": -901.6878662109375, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": -8.453319549560547, "rewards/margins": 28.862653732299805, "rewards/rejected": -37.315975189208984, "step": 2074 }, { "epoch": 1.2908242612752723, "grad_norm": 25.79266357421875, "learning_rate": 3.165053019824804e-06, "logits/chosen": -0.021634042263031006, "logits/rejected": 2.2119054794311523, "logps/chosen": -526.3079223632812, "logps/rejected": -965.0336303710938, "loss": 0.2514, "rewards/accuracies": 0.875, "rewards/chosen": -8.492626190185547, "rewards/margins": 30.373445510864258, "rewards/rejected": -38.86606979370117, "step": 2075 }, { "epoch": 1.2914463452566096, "grad_norm": 0.0180280189961195, "learning_rate": 3.1639004149377594e-06, "logits/chosen": 2.165846824645996, "logits/rejected": 4.180533409118652, "logps/chosen": -627.2410278320312, "logps/rejected": -1168.7398681640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.25662899017334, "rewards/margins": 32.789207458496094, "rewards/rejected": -44.045833587646484, "step": 2076 }, { "epoch": 1.2920684292379472, "grad_norm": 0.0044320556335151196, "learning_rate": 3.1627478100507146e-06, "logits/chosen": -1.9515249729156494, "logits/rejected": 2.808650016784668, "logps/chosen": -472.1334228515625, "logps/rejected": -1063.12548828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.644009590148926, "rewards/margins": 31.643199920654297, "rewards/rejected": -42.287208557128906, "step": 2077 }, { "epoch": 1.2926905132192845, "grad_norm": 0.0055711762979626656, "learning_rate": 3.16159520516367e-06, "logits/chosen": 0.9135769605636597, "logits/rejected": 3.6809005737304688, "logps/chosen": -445.4312744140625, "logps/rejected": -779.880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.659487247467041, "rewards/margins": 16.80797004699707, "rewards/rejected": -24.467456817626953, "step": 2078 }, { "epoch": 1.293312597200622, "grad_norm": 2.7579030990600586, "learning_rate": 3.1604426002766255e-06, "logits/chosen": -2.535278797149658, "logits/rejected": 2.1650233268737793, "logps/chosen": -480.51025390625, "logps/rejected": -1007.449462890625, "loss": 0.0858, "rewards/accuracies": 1.0, "rewards/chosen": -11.463523864746094, "rewards/margins": 27.31024742126465, "rewards/rejected": -38.773773193359375, "step": 2079 }, { "epoch": 1.2939346811819596, "grad_norm": 1.0793917226692429e-07, "learning_rate": 3.1592899953895807e-06, "logits/chosen": 1.1380003690719604, "logits/rejected": 4.078077793121338, "logps/chosen": -603.9254150390625, "logps/rejected": -1046.822998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.813261032104492, "rewards/margins": 32.562355041503906, "rewards/rejected": -41.375614166259766, "step": 2080 }, { "epoch": 1.294556765163297, "grad_norm": 0.0008141865837387741, "learning_rate": 3.158137390502536e-06, "logits/chosen": 0.8326452970504761, "logits/rejected": 2.2659921646118164, "logps/chosen": -609.6137084960938, "logps/rejected": -971.3287963867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.248361587524414, "rewards/margins": 27.811277389526367, "rewards/rejected": -40.05963897705078, "step": 2081 }, { "epoch": 1.2951788491446345, "grad_norm": 34.80479049682617, "learning_rate": 3.156984785615491e-06, "logits/chosen": 1.3409345149993896, "logits/rejected": 3.883260726928711, "logps/chosen": -566.5382080078125, "logps/rejected": -915.5721435546875, "loss": 1.045, "rewards/accuracies": 0.875, "rewards/chosen": -9.75674057006836, "rewards/margins": 24.255878448486328, "rewards/rejected": -34.01261901855469, "step": 2082 }, { "epoch": 1.2958009331259719, "grad_norm": 0.1624457836151123, "learning_rate": 3.1558321807284464e-06, "logits/chosen": 1.185449481010437, "logits/rejected": 1.4735817909240723, "logps/chosen": -448.03564453125, "logps/rejected": -671.8951416015625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -5.0474700927734375, "rewards/margins": 20.61285400390625, "rewards/rejected": -25.66032600402832, "step": 2083 }, { "epoch": 1.2964230171073094, "grad_norm": 5.242217957857065e-05, "learning_rate": 3.1546795758414016e-06, "logits/chosen": -3.8523991107940674, "logits/rejected": 2.827188491821289, "logps/chosen": -233.0387725830078, "logps/rejected": -933.7574462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.143452167510986, "rewards/margins": 33.942142486572266, "rewards/rejected": -39.085594177246094, "step": 2084 }, { "epoch": 1.297045101088647, "grad_norm": 3.0982849352767516e-07, "learning_rate": 3.153526970954357e-06, "logits/chosen": 2.814451217651367, "logits/rejected": 5.035799026489258, "logps/chosen": -613.6903076171875, "logps/rejected": -1098.580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.112676620483398, "rewards/margins": 32.56719207763672, "rewards/rejected": -42.679874420166016, "step": 2085 }, { "epoch": 1.2976671850699844, "grad_norm": 0.005450894124805927, "learning_rate": 3.152374366067312e-06, "logits/chosen": -0.5371922254562378, "logits/rejected": 3.8709936141967773, "logps/chosen": -378.77783203125, "logps/rejected": -962.4693603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.809730529785156, "rewards/margins": 30.568540573120117, "rewards/rejected": -37.378273010253906, "step": 2086 }, { "epoch": 1.298289269051322, "grad_norm": 4.169853687286377, "learning_rate": 3.1512217611802677e-06, "logits/chosen": 0.9278308153152466, "logits/rejected": 4.105837345123291, "logps/chosen": -345.00360107421875, "logps/rejected": -775.925048828125, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": -5.518424987792969, "rewards/margins": 26.84881591796875, "rewards/rejected": -32.36724090576172, "step": 2087 }, { "epoch": 1.2989113530326595, "grad_norm": 0.3264186978340149, "learning_rate": 3.150069156293223e-06, "logits/chosen": 1.8526872396469116, "logits/rejected": 3.779759168624878, "logps/chosen": -618.3271484375, "logps/rejected": -1036.8548583984375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -9.674715995788574, "rewards/margins": 28.104290008544922, "rewards/rejected": -37.77900695800781, "step": 2088 }, { "epoch": 1.2995334370139968, "grad_norm": 7.297713756561279, "learning_rate": 3.148916551406178e-06, "logits/chosen": 1.2116997241973877, "logits/rejected": 2.26175594329834, "logps/chosen": -562.7557983398438, "logps/rejected": -918.6259155273438, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": -7.19647216796875, "rewards/margins": 26.213180541992188, "rewards/rejected": -33.40965270996094, "step": 2089 }, { "epoch": 1.3001555209953344, "grad_norm": 0.004468827974051237, "learning_rate": 3.1477639465191334e-06, "logits/chosen": -1.1220444440841675, "logits/rejected": 3.41408109664917, "logps/chosen": -421.54791259765625, "logps/rejected": -948.6719970703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.602625846862793, "rewards/margins": 27.19068145751953, "rewards/rejected": -33.793304443359375, "step": 2090 }, { "epoch": 1.300777604976672, "grad_norm": 0.007508592680096626, "learning_rate": 3.1466113416320886e-06, "logits/chosen": -0.01274651288986206, "logits/rejected": 2.5498158931732178, "logps/chosen": -466.1578369140625, "logps/rejected": -758.73583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.210990905761719, "rewards/margins": 18.896699905395508, "rewards/rejected": -28.107690811157227, "step": 2091 }, { "epoch": 1.3013996889580093, "grad_norm": 2.6341097354888916, "learning_rate": 3.145458736745044e-06, "logits/chosen": 1.0074734687805176, "logits/rejected": 3.3519134521484375, "logps/chosen": -627.6580810546875, "logps/rejected": -977.8060302734375, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -7.129983425140381, "rewards/margins": 24.711997985839844, "rewards/rejected": -31.841981887817383, "step": 2092 }, { "epoch": 1.3020217729393468, "grad_norm": 0.10764219611883163, "learning_rate": 3.144306131857999e-06, "logits/chosen": -2.0879392623901367, "logits/rejected": 1.9700127840042114, "logps/chosen": -452.25634765625, "logps/rejected": -1075.676025390625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.861340522766113, "rewards/margins": 39.19294738769531, "rewards/rejected": -47.054283142089844, "step": 2093 }, { "epoch": 1.3026438569206844, "grad_norm": 0.0006069698138162494, "learning_rate": 3.1431535269709547e-06, "logits/chosen": -0.44354957342147827, "logits/rejected": 3.16389536857605, "logps/chosen": -556.4852294921875, "logps/rejected": -1139.32373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.457627296447754, "rewards/margins": 37.76030349731445, "rewards/rejected": -48.217933654785156, "step": 2094 }, { "epoch": 1.3032659409020217, "grad_norm": 1.178689956665039, "learning_rate": 3.14200092208391e-06, "logits/chosen": -0.7547582387924194, "logits/rejected": 3.3931565284729004, "logps/chosen": -524.9329833984375, "logps/rejected": -1077.71826171875, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -11.029295921325684, "rewards/margins": 28.463600158691406, "rewards/rejected": -39.492897033691406, "step": 2095 }, { "epoch": 1.3038880248833593, "grad_norm": 14.788235664367676, "learning_rate": 3.140848317196865e-06, "logits/chosen": -0.28261828422546387, "logits/rejected": 1.6292152404785156, "logps/chosen": -532.36962890625, "logps/rejected": -860.1507568359375, "loss": 0.4315, "rewards/accuracies": 0.875, "rewards/chosen": -7.120588779449463, "rewards/margins": 19.741498947143555, "rewards/rejected": -26.86208724975586, "step": 2096 }, { "epoch": 1.3045101088646969, "grad_norm": 30.001577377319336, "learning_rate": 3.1396957123098204e-06, "logits/chosen": 1.9691777229309082, "logits/rejected": 3.434938430786133, "logps/chosen": -684.923828125, "logps/rejected": -1013.4309692382812, "loss": 0.6869, "rewards/accuracies": 0.875, "rewards/chosen": -12.90506362915039, "rewards/margins": 24.885982513427734, "rewards/rejected": -37.79104232788086, "step": 2097 }, { "epoch": 1.3051321928460342, "grad_norm": 0.1871831715106964, "learning_rate": 3.1385431074227756e-06, "logits/chosen": 2.1880619525909424, "logits/rejected": 1.8200218677520752, "logps/chosen": -679.3081665039062, "logps/rejected": -855.6882934570312, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -12.308370590209961, "rewards/margins": 20.151187896728516, "rewards/rejected": -32.45956039428711, "step": 2098 }, { "epoch": 1.3057542768273718, "grad_norm": 0.2509006857872009, "learning_rate": 3.137390502535731e-06, "logits/chosen": 1.5480281114578247, "logits/rejected": 2.9846410751342773, "logps/chosen": -634.6478881835938, "logps/rejected": -973.6765747070312, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -10.49044418334961, "rewards/margins": 22.26751708984375, "rewards/rejected": -32.757957458496094, "step": 2099 }, { "epoch": 1.3063763608087091, "grad_norm": 0.05295734107494354, "learning_rate": 3.136237897648686e-06, "logits/chosen": 2.5485236644744873, "logits/rejected": 3.1285977363586426, "logps/chosen": -528.86865234375, "logps/rejected": -914.2305297851562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.828814506530762, "rewards/margins": 29.736709594726562, "rewards/rejected": -37.565521240234375, "step": 2100 }, { "epoch": 1.3069984447900467, "grad_norm": 0.00011729019752237946, "learning_rate": 3.1350852927616417e-06, "logits/chosen": 1.2746424674987793, "logits/rejected": 4.064460754394531, "logps/chosen": -570.7022705078125, "logps/rejected": -1051.89892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.794236660003662, "rewards/margins": 36.453895568847656, "rewards/rejected": -44.24812698364258, "step": 2101 }, { "epoch": 1.307620528771384, "grad_norm": 0.013876068405807018, "learning_rate": 3.133932687874597e-06, "logits/chosen": 1.3518435955047607, "logits/rejected": 1.6746224164962769, "logps/chosen": -567.3868408203125, "logps/rejected": -852.9873657226562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.235878944396973, "rewards/margins": 25.40787696838379, "rewards/rejected": -37.64375686645508, "step": 2102 }, { "epoch": 1.3082426127527216, "grad_norm": 0.7705238461494446, "learning_rate": 3.132780082987552e-06, "logits/chosen": 0.999545156955719, "logits/rejected": 4.350939750671387, "logps/chosen": -371.6129150390625, "logps/rejected": -756.1754150390625, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -5.465909481048584, "rewards/margins": 21.080066680908203, "rewards/rejected": -26.545976638793945, "step": 2103 }, { "epoch": 1.3088646967340591, "grad_norm": 1.206705927848816, "learning_rate": 3.1316274781005074e-06, "logits/chosen": 0.7771980166435242, "logits/rejected": 2.899505853652954, "logps/chosen": -496.0504455566406, "logps/rejected": -950.00341796875, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -6.855134010314941, "rewards/margins": 29.67104721069336, "rewards/rejected": -36.52618408203125, "step": 2104 }, { "epoch": 1.3094867807153965, "grad_norm": 0.00644198153167963, "learning_rate": 3.1304748732134626e-06, "logits/chosen": 0.1656215786933899, "logits/rejected": 4.405486106872559, "logps/chosen": -458.61248779296875, "logps/rejected": -1091.378662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.653053283691406, "rewards/margins": 34.8778076171875, "rewards/rejected": -45.530860900878906, "step": 2105 }, { "epoch": 1.310108864696734, "grad_norm": 7.407296657562256, "learning_rate": 3.129322268326418e-06, "logits/chosen": -0.6084730625152588, "logits/rejected": 3.4701526165008545, "logps/chosen": -413.8167419433594, "logps/rejected": -819.1229858398438, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": -8.87021255493164, "rewards/margins": 20.633195877075195, "rewards/rejected": -29.503406524658203, "step": 2106 }, { "epoch": 1.3107309486780716, "grad_norm": 9.020928700920194e-06, "learning_rate": 3.128169663439373e-06, "logits/chosen": -0.03172177076339722, "logits/rejected": 2.916116237640381, "logps/chosen": -515.1375122070312, "logps/rejected": -1125.2947998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.552846908569336, "rewards/margins": 36.16227722167969, "rewards/rejected": -48.715126037597656, "step": 2107 }, { "epoch": 1.311353032659409, "grad_norm": 0.9298233389854431, "learning_rate": 3.1270170585523287e-06, "logits/chosen": 2.7141852378845215, "logits/rejected": 1.5818345546722412, "logps/chosen": -704.413818359375, "logps/rejected": -827.8814697265625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -10.471658706665039, "rewards/margins": 18.957651138305664, "rewards/rejected": -29.429309844970703, "step": 2108 }, { "epoch": 1.3119751166407465, "grad_norm": 0.10091857612133026, "learning_rate": 3.125864453665284e-06, "logits/chosen": 2.0753657817840576, "logits/rejected": 2.2879538536071777, "logps/chosen": -658.8688354492188, "logps/rejected": -899.2041015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -13.374801635742188, "rewards/margins": 26.37302017211914, "rewards/rejected": -39.74782180786133, "step": 2109 }, { "epoch": 1.312597200622084, "grad_norm": 0.4145754873752594, "learning_rate": 3.124711848778239e-06, "logits/chosen": -0.41198205947875977, "logits/rejected": 3.2668869495391846, "logps/chosen": -540.3209838867188, "logps/rejected": -990.2476806640625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -9.664708137512207, "rewards/margins": 28.692848205566406, "rewards/rejected": -38.35755920410156, "step": 2110 }, { "epoch": 1.3132192846034214, "grad_norm": 0.03950352221727371, "learning_rate": 3.1235592438911944e-06, "logits/chosen": 2.346403121948242, "logits/rejected": 3.843334436416626, "logps/chosen": -665.7584838867188, "logps/rejected": -1120.620361328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -13.084131240844727, "rewards/margins": 28.030925750732422, "rewards/rejected": -41.11505889892578, "step": 2111 }, { "epoch": 1.313841368584759, "grad_norm": 0.01574692316353321, "learning_rate": 3.1224066390041496e-06, "logits/chosen": 0.11658996343612671, "logits/rejected": 2.8681399822235107, "logps/chosen": -657.4076538085938, "logps/rejected": -1048.323486328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.495312690734863, "rewards/margins": 27.018035888671875, "rewards/rejected": -34.51334762573242, "step": 2112 }, { "epoch": 1.3144634525660965, "grad_norm": 0.24601449072360992, "learning_rate": 3.121254034117105e-06, "logits/chosen": -1.321711540222168, "logits/rejected": 0.7253201603889465, "logps/chosen": -473.1822509765625, "logps/rejected": -928.5146484375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -6.915327548980713, "rewards/margins": 29.833887100219727, "rewards/rejected": -36.74921417236328, "step": 2113 }, { "epoch": 1.3150855365474339, "grad_norm": 6.262076567509212e-06, "learning_rate": 3.12010142923006e-06, "logits/chosen": 2.3087410926818848, "logits/rejected": 4.198649883270264, "logps/chosen": -656.1389770507812, "logps/rejected": -1184.9796142578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.827698707580566, "rewards/margins": 35.55638885498047, "rewards/rejected": -48.38408660888672, "step": 2114 }, { "epoch": 1.3157076205287714, "grad_norm": 0.3786371648311615, "learning_rate": 3.1189488243430153e-06, "logits/chosen": 1.2803771495819092, "logits/rejected": 3.8872060775756836, "logps/chosen": -535.787841796875, "logps/rejected": -1055.6212158203125, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -7.432448863983154, "rewards/margins": 30.718673706054688, "rewards/rejected": -38.151123046875, "step": 2115 }, { "epoch": 1.316329704510109, "grad_norm": 0.00028373984969221056, "learning_rate": 3.117796219455971e-06, "logits/chosen": 0.8943437933921814, "logits/rejected": 2.8500447273254395, "logps/chosen": -649.5738525390625, "logps/rejected": -1057.764892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.27314567565918, "rewards/margins": 30.976238250732422, "rewards/rejected": -42.24938201904297, "step": 2116 }, { "epoch": 1.3169517884914463, "grad_norm": 24.275968551635742, "learning_rate": 3.116643614568926e-06, "logits/chosen": -0.27282553911209106, "logits/rejected": 1.769002914428711, "logps/chosen": -585.8446044921875, "logps/rejected": -944.812255859375, "loss": 0.3463, "rewards/accuracies": 0.875, "rewards/chosen": -11.907154083251953, "rewards/margins": 23.340641021728516, "rewards/rejected": -35.24779510498047, "step": 2117 }, { "epoch": 1.317573872472784, "grad_norm": 0.0063476222567260265, "learning_rate": 3.1154910096818814e-06, "logits/chosen": -0.006000339984893799, "logits/rejected": 3.46754789352417, "logps/chosen": -503.677734375, "logps/rejected": -1099.6572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.680063247680664, "rewards/margins": 30.042665481567383, "rewards/rejected": -37.72273254394531, "step": 2118 }, { "epoch": 1.3181959564541212, "grad_norm": 5.0704827308654785, "learning_rate": 3.1143384047948366e-06, "logits/chosen": -0.21951770782470703, "logits/rejected": 1.3027235269546509, "logps/chosen": -553.710693359375, "logps/rejected": -946.5357055664062, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": -7.494546890258789, "rewards/margins": 23.37753677368164, "rewards/rejected": -30.872081756591797, "step": 2119 }, { "epoch": 1.3188180404354588, "grad_norm": 0.000527180265635252, "learning_rate": 3.113185799907792e-06, "logits/chosen": 0.8772927522659302, "logits/rejected": 5.24975061416626, "logps/chosen": -441.1733093261719, "logps/rejected": -981.8777465820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.961986541748047, "rewards/margins": 30.325220108032227, "rewards/rejected": -36.287208557128906, "step": 2120 }, { "epoch": 1.3194401244167961, "grad_norm": 6.20682158114505e-06, "learning_rate": 3.112033195020747e-06, "logits/chosen": 1.3164989948272705, "logits/rejected": 2.95279598236084, "logps/chosen": -661.6497802734375, "logps/rejected": -1091.271728515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.640320777893066, "rewards/margins": 34.3283576965332, "rewards/rejected": -46.96868133544922, "step": 2121 }, { "epoch": 1.3200622083981337, "grad_norm": 1.8734179735183716, "learning_rate": 3.1108805901337023e-06, "logits/chosen": 1.941436767578125, "logits/rejected": 1.3771246671676636, "logps/chosen": -673.208251953125, "logps/rejected": -903.9393920898438, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -10.356344223022461, "rewards/margins": 22.33736801147461, "rewards/rejected": -32.69371032714844, "step": 2122 }, { "epoch": 1.3206842923794713, "grad_norm": 1.8553708287072368e-05, "learning_rate": 3.109727985246658e-06, "logits/chosen": -3.6621267795562744, "logits/rejected": 0.9122580289840698, "logps/chosen": -348.76800537109375, "logps/rejected": -936.0460815429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.951865196228027, "rewards/margins": 31.755855560302734, "rewards/rejected": -40.70772171020508, "step": 2123 }, { "epoch": 1.3213063763608086, "grad_norm": 0.3098766505718231, "learning_rate": 3.108575380359613e-06, "logits/chosen": 0.4674542546272278, "logits/rejected": 2.539536714553833, "logps/chosen": -612.90576171875, "logps/rejected": -1040.7362060546875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -12.947325706481934, "rewards/margins": 32.84162521362305, "rewards/rejected": -45.78894805908203, "step": 2124 }, { "epoch": 1.3219284603421462, "grad_norm": 0.00044224029988981783, "learning_rate": 3.1074227754725684e-06, "logits/chosen": -2.9238626956939697, "logits/rejected": 2.93160343170166, "logps/chosen": -186.1780548095703, "logps/rejected": -841.7645874023438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.421435356140137, "rewards/margins": 32.04075622558594, "rewards/rejected": -36.462196350097656, "step": 2125 }, { "epoch": 1.3225505443234837, "grad_norm": 0.8636406064033508, "learning_rate": 3.1062701705855236e-06, "logits/chosen": 2.3500113487243652, "logits/rejected": 3.3730363845825195, "logps/chosen": -606.7012939453125, "logps/rejected": -1062.3568115234375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -8.823650360107422, "rewards/margins": 30.047855377197266, "rewards/rejected": -38.87150573730469, "step": 2126 }, { "epoch": 1.323172628304821, "grad_norm": 0.0007554941112175584, "learning_rate": 3.105117565698479e-06, "logits/chosen": -1.9400715827941895, "logits/rejected": 3.186816692352295, "logps/chosen": -492.09014892578125, "logps/rejected": -1054.965087890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.473559856414795, "rewards/margins": 26.097862243652344, "rewards/rejected": -32.57142639160156, "step": 2127 }, { "epoch": 1.3237947122861586, "grad_norm": 38.66023635864258, "learning_rate": 3.103964960811434e-06, "logits/chosen": 2.6481308937072754, "logits/rejected": 1.8837988376617432, "logps/chosen": -702.989501953125, "logps/rejected": -984.9061889648438, "loss": 0.4483, "rewards/accuracies": 0.875, "rewards/chosen": -11.173105239868164, "rewards/margins": 24.799110412597656, "rewards/rejected": -35.97221374511719, "step": 2128 }, { "epoch": 1.3244167962674962, "grad_norm": 0.9219430685043335, "learning_rate": 3.1028123559243893e-06, "logits/chosen": 0.7498334646224976, "logits/rejected": 4.0075788497924805, "logps/chosen": -537.6915893554688, "logps/rejected": -943.332275390625, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -8.623586654663086, "rewards/margins": 25.398311614990234, "rewards/rejected": -34.02189636230469, "step": 2129 }, { "epoch": 1.3250388802488335, "grad_norm": 1.8907319088157237e-07, "learning_rate": 3.101659751037345e-06, "logits/chosen": -0.394875168800354, "logits/rejected": 4.241382598876953, "logps/chosen": -342.15899658203125, "logps/rejected": -979.307861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.465579986572266, "rewards/margins": 36.069435119628906, "rewards/rejected": -43.53501510620117, "step": 2130 }, { "epoch": 1.325660964230171, "grad_norm": 1.4963295459747314, "learning_rate": 3.1005071461503e-06, "logits/chosen": 3.006678581237793, "logits/rejected": 4.0752854347229, "logps/chosen": -644.6529541015625, "logps/rejected": -862.7306518554688, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -13.216716766357422, "rewards/margins": 15.411653518676758, "rewards/rejected": -28.628368377685547, "step": 2131 }, { "epoch": 1.3262830482115087, "grad_norm": 0.0001043678421410732, "learning_rate": 3.0993545412632554e-06, "logits/chosen": -0.3442641496658325, "logits/rejected": 3.0216991901397705, "logps/chosen": -512.071533203125, "logps/rejected": -1114.519287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.777825355529785, "rewards/margins": 38.027313232421875, "rewards/rejected": -44.805137634277344, "step": 2132 }, { "epoch": 1.326905132192846, "grad_norm": 0.0006537793087773025, "learning_rate": 3.0982019363762106e-06, "logits/chosen": 3.7730226516723633, "logits/rejected": 4.204200744628906, "logps/chosen": -813.5851440429688, "logps/rejected": -1144.5123291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.097002983093262, "rewards/margins": 32.316932678222656, "rewards/rejected": -47.41393280029297, "step": 2133 }, { "epoch": 1.3275272161741836, "grad_norm": 0.019565237686038017, "learning_rate": 3.097049331489166e-06, "logits/chosen": -2.0026626586914062, "logits/rejected": 2.547441005706787, "logps/chosen": -440.7296142578125, "logps/rejected": -934.6153564453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.003240585327148, "rewards/margins": 26.617525100708008, "rewards/rejected": -35.620765686035156, "step": 2134 }, { "epoch": 1.3281493001555211, "grad_norm": 3.7515264921239577e-06, "learning_rate": 3.095896726602121e-06, "logits/chosen": 0.791428804397583, "logits/rejected": 4.45504093170166, "logps/chosen": -493.91546630859375, "logps/rejected": -1070.8453369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.852304458618164, "rewards/margins": 29.491228103637695, "rewards/rejected": -40.34353256225586, "step": 2135 }, { "epoch": 1.3287713841368585, "grad_norm": 0.002371502574533224, "learning_rate": 3.0947441217150763e-06, "logits/chosen": 2.1727709770202637, "logits/rejected": 1.9977059364318848, "logps/chosen": -679.185302734375, "logps/rejected": -975.188720703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.928152084350586, "rewards/margins": 27.553071975708008, "rewards/rejected": -38.481224060058594, "step": 2136 }, { "epoch": 1.329393468118196, "grad_norm": 0.56076979637146, "learning_rate": 3.0935915168280315e-06, "logits/chosen": -0.7301803827285767, "logits/rejected": 3.738680124282837, "logps/chosen": -526.186279296875, "logps/rejected": -996.2095947265625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -10.729106903076172, "rewards/margins": 23.9970703125, "rewards/rejected": -34.72617721557617, "step": 2137 }, { "epoch": 1.3300155520995334, "grad_norm": 0.00125672179274261, "learning_rate": 3.092438911940987e-06, "logits/chosen": 0.5604817867279053, "logits/rejected": 4.069615840911865, "logps/chosen": -403.9148254394531, "logps/rejected": -942.8673706054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.626916885375977, "rewards/margins": 32.32008361816406, "rewards/rejected": -39.946998596191406, "step": 2138 }, { "epoch": 1.330637636080871, "grad_norm": 34.205047607421875, "learning_rate": 3.0912863070539424e-06, "logits/chosen": 0.6298574209213257, "logits/rejected": 2.4317867755889893, "logps/chosen": -627.36767578125, "logps/rejected": -1017.8082885742188, "loss": 0.7327, "rewards/accuracies": 0.875, "rewards/chosen": -10.587239265441895, "rewards/margins": 23.058414459228516, "rewards/rejected": -33.645652770996094, "step": 2139 }, { "epoch": 1.3312597200622083, "grad_norm": 0.0010638857493177056, "learning_rate": 3.0901337021668976e-06, "logits/chosen": -0.36331796646118164, "logits/rejected": 3.6444251537323, "logps/chosen": -563.85693359375, "logps/rejected": -1249.5921630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.383662223815918, "rewards/margins": 41.731117248535156, "rewards/rejected": -53.114776611328125, "step": 2140 }, { "epoch": 1.3318818040435458, "grad_norm": 13.8467435836792, "learning_rate": 3.088981097279853e-06, "logits/chosen": 0.953113317489624, "logits/rejected": 2.9334874153137207, "logps/chosen": -620.501220703125, "logps/rejected": -894.0372314453125, "loss": 0.1024, "rewards/accuracies": 0.875, "rewards/chosen": -9.037420272827148, "rewards/margins": 14.888923645019531, "rewards/rejected": -23.92634391784668, "step": 2141 }, { "epoch": 1.3325038880248834, "grad_norm": 2.1303920220816508e-05, "learning_rate": 3.087828492392808e-06, "logits/chosen": -2.658367156982422e-05, "logits/rejected": 4.296390056610107, "logps/chosen": -483.7230224609375, "logps/rejected": -1070.751708984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.992851257324219, "rewards/margins": 26.837543487548828, "rewards/rejected": -38.83039474487305, "step": 2142 }, { "epoch": 1.3331259720062207, "grad_norm": 0.026947133243083954, "learning_rate": 3.0866758875057633e-06, "logits/chosen": 0.3194420337677002, "logits/rejected": 3.87138032913208, "logps/chosen": -482.6922607421875, "logps/rejected": -992.799560546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.412920951843262, "rewards/margins": 28.275657653808594, "rewards/rejected": -37.68857955932617, "step": 2143 }, { "epoch": 1.3337480559875583, "grad_norm": 0.0005353665328584611, "learning_rate": 3.0855232826187185e-06, "logits/chosen": 2.0463151931762695, "logits/rejected": 3.850377082824707, "logps/chosen": -562.7142944335938, "logps/rejected": -857.8176879882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.175778865814209, "rewards/margins": 25.96051025390625, "rewards/rejected": -33.13629150390625, "step": 2144 }, { "epoch": 1.3343701399688959, "grad_norm": 0.0006094225682318211, "learning_rate": 3.084370677731674e-06, "logits/chosen": -2.2982234954833984, "logits/rejected": 2.2199718952178955, "logps/chosen": -375.00616455078125, "logps/rejected": -919.08740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.9371609687805176, "rewards/margins": 26.919570922851562, "rewards/rejected": -30.85672950744629, "step": 2145 }, { "epoch": 1.3349922239502332, "grad_norm": 0.0008352459408342838, "learning_rate": 3.0832180728446294e-06, "logits/chosen": -1.811676263809204, "logits/rejected": 3.494459867477417, "logps/chosen": -418.7713317871094, "logps/rejected": -1007.1888427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.297712326049805, "rewards/margins": 27.783876419067383, "rewards/rejected": -35.08158874511719, "step": 2146 }, { "epoch": 1.3356143079315708, "grad_norm": 0.03766888752579689, "learning_rate": 3.0820654679575846e-06, "logits/chosen": -0.016537785530090332, "logits/rejected": 3.032536506652832, "logps/chosen": -383.07684326171875, "logps/rejected": -800.5468139648438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.415544033050537, "rewards/margins": 24.178037643432617, "rewards/rejected": -30.593582153320312, "step": 2147 }, { "epoch": 1.3362363919129083, "grad_norm": 1.5738648176193237, "learning_rate": 3.08091286307054e-06, "logits/chosen": 1.2972499132156372, "logits/rejected": 3.6453158855438232, "logps/chosen": -636.6319580078125, "logps/rejected": -1055.734130859375, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -10.79327392578125, "rewards/margins": 32.944515228271484, "rewards/rejected": -43.73778533935547, "step": 2148 }, { "epoch": 1.3368584758942457, "grad_norm": 0.9019051194190979, "learning_rate": 3.079760258183495e-06, "logits/chosen": -0.11323332786560059, "logits/rejected": 4.938979148864746, "logps/chosen": -527.7478637695312, "logps/rejected": -1196.08447265625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -13.183340072631836, "rewards/margins": 36.72466278076172, "rewards/rejected": -49.90800476074219, "step": 2149 }, { "epoch": 1.3374805598755832, "grad_norm": 0.0704350695014, "learning_rate": 3.0786076532964503e-06, "logits/chosen": -2.881014823913574, "logits/rejected": 1.5524474382400513, "logps/chosen": -409.4421081542969, "logps/rejected": -975.3092651367188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.2625885009765625, "rewards/margins": 32.731468200683594, "rewards/rejected": -39.994056701660156, "step": 2150 }, { "epoch": 1.3381026438569208, "grad_norm": 0.001212852424941957, "learning_rate": 3.0774550484094055e-06, "logits/chosen": -3.0680320262908936, "logits/rejected": 2.949629306793213, "logps/chosen": -383.16021728515625, "logps/rejected": -1086.001708984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.776760101318359, "rewards/margins": 36.22696304321289, "rewards/rejected": -44.00372314453125, "step": 2151 }, { "epoch": 1.3387247278382581, "grad_norm": 4.288447856903076, "learning_rate": 3.076302443522361e-06, "logits/chosen": -0.5125648379325867, "logits/rejected": 2.735118865966797, "logps/chosen": -511.6773986816406, "logps/rejected": -967.4842529296875, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": -8.024867057800293, "rewards/margins": 30.234031677246094, "rewards/rejected": -38.25889587402344, "step": 2152 }, { "epoch": 1.3393468118195957, "grad_norm": 0.00015137945592869073, "learning_rate": 3.0751498386353164e-06, "logits/chosen": 0.21446776390075684, "logits/rejected": 1.9205048084259033, "logps/chosen": -533.0106201171875, "logps/rejected": -891.5570068359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.088725566864014, "rewards/margins": 26.396310806274414, "rewards/rejected": -33.48503494262695, "step": 2153 }, { "epoch": 1.3399688958009333, "grad_norm": 0.18598617613315582, "learning_rate": 3.0739972337482716e-06, "logits/chosen": -0.05675274133682251, "logits/rejected": 4.564008712768555, "logps/chosen": -496.07373046875, "logps/rejected": -1063.553955078125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.981276988983154, "rewards/margins": 31.187776565551758, "rewards/rejected": -39.16905212402344, "step": 2154 }, { "epoch": 1.3405909797822706, "grad_norm": 12.508747100830078, "learning_rate": 3.072844628861227e-06, "logits/chosen": 2.6598260402679443, "logits/rejected": 3.2036752700805664, "logps/chosen": -674.0693969726562, "logps/rejected": -987.4519653320312, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": -12.833343505859375, "rewards/margins": 25.156932830810547, "rewards/rejected": -37.990272521972656, "step": 2155 }, { "epoch": 1.3412130637636082, "grad_norm": 0.00395676214247942, "learning_rate": 3.071692023974182e-06, "logits/chosen": 2.0138673782348633, "logits/rejected": 3.2959272861480713, "logps/chosen": -664.734375, "logps/rejected": -1090.194580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.462017059326172, "rewards/margins": 29.33136749267578, "rewards/rejected": -39.79338836669922, "step": 2156 }, { "epoch": 1.3418351477449455, "grad_norm": 0.0016368265496566892, "learning_rate": 3.0705394190871373e-06, "logits/chosen": 2.031752347946167, "logits/rejected": 3.387629270553589, "logps/chosen": -595.8892211914062, "logps/rejected": -908.4937744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.228726387023926, "rewards/margins": 29.409114837646484, "rewards/rejected": -40.637840270996094, "step": 2157 }, { "epoch": 1.342457231726283, "grad_norm": 0.004761462565511465, "learning_rate": 3.0693868142000925e-06, "logits/chosen": -0.8495367765426636, "logits/rejected": 1.2057145833969116, "logps/chosen": -457.0412292480469, "logps/rejected": -904.3390502929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.196920394897461, "rewards/margins": 26.140365600585938, "rewards/rejected": -34.33728790283203, "step": 2158 }, { "epoch": 1.3430793157076204, "grad_norm": 0.00018156910664401948, "learning_rate": 3.068234209313048e-06, "logits/chosen": 0.3538955748081207, "logits/rejected": 2.821544647216797, "logps/chosen": -508.57733154296875, "logps/rejected": -989.5989990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.294981002807617, "rewards/margins": 30.52140998840332, "rewards/rejected": -37.81639099121094, "step": 2159 }, { "epoch": 1.343701399688958, "grad_norm": 0.0011041401885449886, "learning_rate": 3.0670816044260034e-06, "logits/chosen": 2.2572083473205566, "logits/rejected": 4.512560844421387, "logps/chosen": -734.511474609375, "logps/rejected": -1160.5986328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.02027702331543, "rewards/margins": 28.191675186157227, "rewards/rejected": -41.211952209472656, "step": 2160 }, { "epoch": 1.3443234836702955, "grad_norm": 0.02089731954038143, "learning_rate": 3.0659289995389586e-06, "logits/chosen": 1.5649492740631104, "logits/rejected": 4.971779823303223, "logps/chosen": -527.366943359375, "logps/rejected": -990.109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.970163345336914, "rewards/margins": 29.34923553466797, "rewards/rejected": -37.31939697265625, "step": 2161 }, { "epoch": 1.3449455676516329, "grad_norm": 0.01778881810605526, "learning_rate": 3.064776394651914e-06, "logits/chosen": -0.14575600624084473, "logits/rejected": 3.174208641052246, "logps/chosen": -595.9559326171875, "logps/rejected": -1114.135498046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.597963333129883, "rewards/margins": 32.00989532470703, "rewards/rejected": -44.60785675048828, "step": 2162 }, { "epoch": 1.3455676516329704, "grad_norm": 0.06671002507209778, "learning_rate": 3.063623789764869e-06, "logits/chosen": 0.020184874534606934, "logits/rejected": 0.91424161195755, "logps/chosen": -740.4108276367188, "logps/rejected": -928.4967651367188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -16.36511993408203, "rewards/margins": 15.735336303710938, "rewards/rejected": -32.10045623779297, "step": 2163 }, { "epoch": 1.346189735614308, "grad_norm": 5.080242156982422, "learning_rate": 3.0624711848778243e-06, "logits/chosen": -0.9118480682373047, "logits/rejected": 3.6349895000457764, "logps/chosen": -563.9638671875, "logps/rejected": -1191.065185546875, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": -11.759162902832031, "rewards/margins": 34.771461486816406, "rewards/rejected": -46.53062057495117, "step": 2164 }, { "epoch": 1.3468118195956453, "grad_norm": 5.798954589408822e-06, "learning_rate": 3.0613185799907795e-06, "logits/chosen": 0.6963211894035339, "logits/rejected": 3.031797409057617, "logps/chosen": -366.27276611328125, "logps/rejected": -834.5653076171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.694557189941406, "rewards/margins": 26.892271041870117, "rewards/rejected": -31.58682632446289, "step": 2165 }, { "epoch": 1.347433903576983, "grad_norm": 31.725595474243164, "learning_rate": 3.0601659751037347e-06, "logits/chosen": 2.5356428623199463, "logits/rejected": 5.059682846069336, "logps/chosen": -436.6680908203125, "logps/rejected": -781.17529296875, "loss": 0.3965, "rewards/accuracies": 0.75, "rewards/chosen": -8.763218879699707, "rewards/margins": 19.586204528808594, "rewards/rejected": -28.349422454833984, "step": 2166 }, { "epoch": 1.3480559875583205, "grad_norm": 4.612741947174072, "learning_rate": 3.0590133702166904e-06, "logits/chosen": 0.2573816180229187, "logits/rejected": 2.5070643424987793, "logps/chosen": -609.2000122070312, "logps/rejected": -941.3594970703125, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -7.779262065887451, "rewards/margins": 19.19269561767578, "rewards/rejected": -26.971956253051758, "step": 2167 }, { "epoch": 1.3486780715396578, "grad_norm": 0.3579530715942383, "learning_rate": 3.0578607653296456e-06, "logits/chosen": 0.7475700378417969, "logits/rejected": 2.640726327896118, "logps/chosen": -698.0152587890625, "logps/rejected": -1077.4521484375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -13.386947631835938, "rewards/margins": 27.110260009765625, "rewards/rejected": -40.49720764160156, "step": 2168 }, { "epoch": 1.3493001555209954, "grad_norm": 0.0044502979144454, "learning_rate": 3.056708160442601e-06, "logits/chosen": -0.43040260672569275, "logits/rejected": 2.405294895172119, "logps/chosen": -550.3878173828125, "logps/rejected": -1116.6385498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.990240097045898, "rewards/margins": 32.76185607910156, "rewards/rejected": -43.752098083496094, "step": 2169 }, { "epoch": 1.349922239502333, "grad_norm": 0.044596899300813675, "learning_rate": 3.055555555555556e-06, "logits/chosen": 2.8219692707061768, "logits/rejected": 3.6726090908050537, "logps/chosen": -737.0827026367188, "logps/rejected": -1036.586181640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -13.100625038146973, "rewards/margins": 22.17905616760254, "rewards/rejected": -35.27967834472656, "step": 2170 }, { "epoch": 1.3505443234836703, "grad_norm": 3.109398312517442e-05, "learning_rate": 3.0544029506685113e-06, "logits/chosen": 0.12296566367149353, "logits/rejected": 3.4189023971557617, "logps/chosen": -581.9413452148438, "logps/rejected": -1092.4742431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.76758098602295, "rewards/margins": 31.79904556274414, "rewards/rejected": -44.566627502441406, "step": 2171 }, { "epoch": 1.3511664074650078, "grad_norm": 6.899026629980654e-05, "learning_rate": 3.0532503457814665e-06, "logits/chosen": -0.5253732204437256, "logits/rejected": 2.619370460510254, "logps/chosen": -349.04693603515625, "logps/rejected": -896.3671264648438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.893255233764648, "rewards/margins": 34.224151611328125, "rewards/rejected": -40.117408752441406, "step": 2172 }, { "epoch": 1.3517884914463454, "grad_norm": 17.062660217285156, "learning_rate": 3.0520977408944217e-06, "logits/chosen": 0.9396522045135498, "logits/rejected": 2.4779770374298096, "logps/chosen": -412.9204406738281, "logps/rejected": -755.146728515625, "loss": 0.1, "rewards/accuracies": 0.875, "rewards/chosen": -5.589811325073242, "rewards/margins": 23.087783813476562, "rewards/rejected": -28.677597045898438, "step": 2173 }, { "epoch": 1.3524105754276827, "grad_norm": 0.1276179552078247, "learning_rate": 3.0509451360073774e-06, "logits/chosen": -2.0715835094451904, "logits/rejected": 2.1000967025756836, "logps/chosen": -390.0458068847656, "logps/rejected": -878.71826171875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -8.325183868408203, "rewards/margins": 26.3515625, "rewards/rejected": -34.67674255371094, "step": 2174 }, { "epoch": 1.3530326594090203, "grad_norm": 0.5153205990791321, "learning_rate": 3.0497925311203326e-06, "logits/chosen": -1.6525423526763916, "logits/rejected": 3.016981601715088, "logps/chosen": -440.17242431640625, "logps/rejected": -1001.20947265625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -8.2114839553833, "rewards/margins": 30.331432342529297, "rewards/rejected": -38.54291534423828, "step": 2175 }, { "epoch": 1.3536547433903576, "grad_norm": 1.2170327863714192e-05, "learning_rate": 3.048639926233288e-06, "logits/chosen": -1.190338373184204, "logits/rejected": 1.2228800058364868, "logps/chosen": -388.32745361328125, "logps/rejected": -796.8060913085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.56209659576416, "rewards/margins": 26.223255157470703, "rewards/rejected": -31.78535270690918, "step": 2176 }, { "epoch": 1.3542768273716952, "grad_norm": 36.89498519897461, "learning_rate": 3.047487321346243e-06, "logits/chosen": -1.2961982488632202, "logits/rejected": 2.1595730781555176, "logps/chosen": -483.5260009765625, "logps/rejected": -938.668701171875, "loss": 0.3598, "rewards/accuracies": 0.875, "rewards/chosen": -11.170467376708984, "rewards/margins": 22.213472366333008, "rewards/rejected": -33.383941650390625, "step": 2177 }, { "epoch": 1.3548989113530325, "grad_norm": 0.0018010102212429047, "learning_rate": 3.046334716459198e-06, "logits/chosen": 1.577118158340454, "logits/rejected": 4.508591651916504, "logps/chosen": -512.4389038085938, "logps/rejected": -965.859619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.248443603515625, "rewards/margins": 28.630136489868164, "rewards/rejected": -42.878578186035156, "step": 2178 }, { "epoch": 1.35552099533437, "grad_norm": 0.009161030873656273, "learning_rate": 3.045182111572153e-06, "logits/chosen": -1.5993404388427734, "logits/rejected": 1.1790560483932495, "logps/chosen": -384.3905334472656, "logps/rejected": -770.4791259765625, "loss": 0.0866, "rewards/accuracies": 0.875, "rewards/chosen": -5.886144638061523, "rewards/margins": 26.454479217529297, "rewards/rejected": -32.34062194824219, "step": 2179 }, { "epoch": 1.3561430793157077, "grad_norm": 0.004069062415510416, "learning_rate": 3.0440295066851083e-06, "logits/chosen": 2.374175548553467, "logits/rejected": 4.294281959533691, "logps/chosen": -632.2175903320312, "logps/rejected": -1098.2437744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.008170127868652, "rewards/margins": 35.68601989746094, "rewards/rejected": -49.69418716430664, "step": 2180 }, { "epoch": 1.356765163297045, "grad_norm": 4.699228286743164, "learning_rate": 3.0428769017980635e-06, "logits/chosen": 1.171891450881958, "logits/rejected": 3.271820545196533, "logps/chosen": -440.36456298828125, "logps/rejected": -847.413818359375, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": -9.948079109191895, "rewards/margins": 21.653432846069336, "rewards/rejected": -31.601512908935547, "step": 2181 }, { "epoch": 1.3573872472783826, "grad_norm": 0.0018416978418827057, "learning_rate": 3.0417242969110187e-06, "logits/chosen": -1.469376802444458, "logits/rejected": 4.174241065979004, "logps/chosen": -350.88006591796875, "logps/rejected": -962.7076416015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.410130500793457, "rewards/margins": 26.457324981689453, "rewards/rejected": -33.867454528808594, "step": 2182 }, { "epoch": 1.3580093312597201, "grad_norm": 14.689421653747559, "learning_rate": 3.0405716920239744e-06, "logits/chosen": -1.2712656259536743, "logits/rejected": 2.44496488571167, "logps/chosen": -562.570556640625, "logps/rejected": -999.324462890625, "loss": 0.0671, "rewards/accuracies": 1.0, "rewards/chosen": -8.03051471710205, "rewards/margins": 25.950546264648438, "rewards/rejected": -33.98106002807617, "step": 2183 }, { "epoch": 1.3586314152410575, "grad_norm": 6.13081192568643e-06, "learning_rate": 3.0394190871369296e-06, "logits/chosen": -0.9931305646896362, "logits/rejected": 3.7858331203460693, "logps/chosen": -382.7630310058594, "logps/rejected": -1043.0816650390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.340179443359375, "rewards/margins": 38.07415771484375, "rewards/rejected": -45.414337158203125, "step": 2184 }, { "epoch": 1.359253499222395, "grad_norm": 0.6616275310516357, "learning_rate": 3.038266482249885e-06, "logits/chosen": -1.0988692045211792, "logits/rejected": 2.3249878883361816, "logps/chosen": -549.3353271484375, "logps/rejected": -909.2433471679688, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -9.345589637756348, "rewards/margins": 23.204883575439453, "rewards/rejected": -32.550472259521484, "step": 2185 }, { "epoch": 1.3598755832037326, "grad_norm": 0.0020644681062549353, "learning_rate": 3.03711387736284e-06, "logits/chosen": -0.09544289112091064, "logits/rejected": 2.8298497200012207, "logps/chosen": -523.290771484375, "logps/rejected": -1006.2669067382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.667964935302734, "rewards/margins": 35.17736053466797, "rewards/rejected": -43.8453254699707, "step": 2186 }, { "epoch": 1.36049766718507, "grad_norm": 0.0008767215767875314, "learning_rate": 3.0359612724757953e-06, "logits/chosen": 0.11832370609045029, "logits/rejected": 2.8614234924316406, "logps/chosen": -575.4088134765625, "logps/rejected": -970.757080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.645832061767578, "rewards/margins": 28.8430233001709, "rewards/rejected": -37.48885726928711, "step": 2187 }, { "epoch": 1.3611197511664075, "grad_norm": 0.10958057641983032, "learning_rate": 3.0348086675887505e-06, "logits/chosen": -0.055881351232528687, "logits/rejected": 3.112879514694214, "logps/chosen": -474.2701110839844, "logps/rejected": -868.6067504882812, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.301572322845459, "rewards/margins": 23.456018447875977, "rewards/rejected": -29.757593154907227, "step": 2188 }, { "epoch": 1.361741835147745, "grad_norm": 14.898518562316895, "learning_rate": 3.0336560627017057e-06, "logits/chosen": 2.1643824577331543, "logits/rejected": 4.367372512817383, "logps/chosen": -676.5453491210938, "logps/rejected": -1104.552001953125, "loss": 0.0652, "rewards/accuracies": 1.0, "rewards/chosen": -12.872103691101074, "rewards/margins": 28.61599349975586, "rewards/rejected": -41.48809814453125, "step": 2189 }, { "epoch": 1.3623639191290824, "grad_norm": 12.740869522094727, "learning_rate": 3.032503457814661e-06, "logits/chosen": 0.9304056167602539, "logits/rejected": 4.536799430847168, "logps/chosen": -492.8013610839844, "logps/rejected": -1083.08544921875, "loss": 0.0633, "rewards/accuracies": 1.0, "rewards/chosen": -7.854243755340576, "rewards/margins": 35.24501419067383, "rewards/rejected": -43.0992546081543, "step": 2190 }, { "epoch": 1.36298600311042, "grad_norm": 6.474887868535006e-06, "learning_rate": 3.0313508529276166e-06, "logits/chosen": 0.4632197618484497, "logits/rejected": 3.4487247467041016, "logps/chosen": -522.9790649414062, "logps/rejected": -915.0444946289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.24857234954834, "rewards/margins": 23.84241485595703, "rewards/rejected": -32.09098434448242, "step": 2191 }, { "epoch": 1.3636080870917575, "grad_norm": 0.3108627498149872, "learning_rate": 3.030198248040572e-06, "logits/chosen": 1.1985173225402832, "logits/rejected": 3.053633451461792, "logps/chosen": -631.7606201171875, "logps/rejected": -1198.51416015625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -13.875808715820312, "rewards/margins": 36.570396423339844, "rewards/rejected": -50.446205139160156, "step": 2192 }, { "epoch": 1.3642301710730949, "grad_norm": 0.0007350373198278248, "learning_rate": 3.029045643153527e-06, "logits/chosen": -1.5113818645477295, "logits/rejected": 0.9358669519424438, "logps/chosen": -529.31591796875, "logps/rejected": -954.6470947265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.916925430297852, "rewards/margins": 34.9250602722168, "rewards/rejected": -42.841983795166016, "step": 2193 }, { "epoch": 1.3648522550544324, "grad_norm": 0.00030861847335472703, "learning_rate": 3.0278930382664823e-06, "logits/chosen": -0.05734395980834961, "logits/rejected": 2.462827682495117, "logps/chosen": -528.8324584960938, "logps/rejected": -969.40625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.957712173461914, "rewards/margins": 32.38273620605469, "rewards/rejected": -45.340450286865234, "step": 2194 }, { "epoch": 1.3654743390357698, "grad_norm": 4.3081145122414455e-05, "learning_rate": 3.0267404333794375e-06, "logits/chosen": -0.979878306388855, "logits/rejected": 4.725745677947998, "logps/chosen": -534.77880859375, "logps/rejected": -1191.40234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.238239288330078, "rewards/margins": 26.146331787109375, "rewards/rejected": -34.38457107543945, "step": 2195 }, { "epoch": 1.3660964230171073, "grad_norm": 0.001050711958669126, "learning_rate": 3.0255878284923927e-06, "logits/chosen": 2.1874237060546875, "logits/rejected": 2.7432210445404053, "logps/chosen": -759.297119140625, "logps/rejected": -1180.341064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.559810638427734, "rewards/margins": 32.7918586730957, "rewards/rejected": -43.35166931152344, "step": 2196 }, { "epoch": 1.3667185069984447, "grad_norm": 0.07033015042543411, "learning_rate": 3.024435223605348e-06, "logits/chosen": 0.10440731048583984, "logits/rejected": 2.772296190261841, "logps/chosen": -421.4175109863281, "logps/rejected": -813.867919921875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.586875915527344, "rewards/margins": 25.79877281188965, "rewards/rejected": -33.385650634765625, "step": 2197 }, { "epoch": 1.3673405909797822, "grad_norm": 0.018981391564011574, "learning_rate": 3.0232826187183036e-06, "logits/chosen": 0.21775102615356445, "logits/rejected": 3.7370357513427734, "logps/chosen": -497.3377990722656, "logps/rejected": -1062.53076171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.430828094482422, "rewards/margins": 30.548934936523438, "rewards/rejected": -39.97976303100586, "step": 2198 }, { "epoch": 1.3679626749611198, "grad_norm": 0.007361208088696003, "learning_rate": 3.022130013831259e-06, "logits/chosen": 1.6204208135604858, "logits/rejected": 4.520246505737305, "logps/chosen": -634.2261962890625, "logps/rejected": -1120.288818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.430303573608398, "rewards/margins": 32.23867416381836, "rewards/rejected": -42.668975830078125, "step": 2199 }, { "epoch": 1.3685847589424571, "grad_norm": 0.0001982577668968588, "learning_rate": 3.020977408944214e-06, "logits/chosen": 2.498847007751465, "logits/rejected": 4.539471626281738, "logps/chosen": -700.4442138671875, "logps/rejected": -987.4329833984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.197793960571289, "rewards/margins": 21.45416259765625, "rewards/rejected": -34.65195846557617, "step": 2200 }, { "epoch": 1.3692068429237947, "grad_norm": 6.363242732732033e-08, "learning_rate": 3.0198248040571693e-06, "logits/chosen": 1.5000331401824951, "logits/rejected": 2.548959732055664, "logps/chosen": -638.2463989257812, "logps/rejected": -1020.43310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.709440231323242, "rewards/margins": 31.604660034179688, "rewards/rejected": -44.3140983581543, "step": 2201 }, { "epoch": 1.3698289269051322, "grad_norm": 0.0008398335776291788, "learning_rate": 3.0186721991701245e-06, "logits/chosen": -0.9833596348762512, "logits/rejected": 4.07463264465332, "logps/chosen": -460.53094482421875, "logps/rejected": -1072.9697265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.126919746398926, "rewards/margins": 28.681936264038086, "rewards/rejected": -38.80885696411133, "step": 2202 }, { "epoch": 1.3704510108864696, "grad_norm": 0.11784781515598297, "learning_rate": 3.0175195942830797e-06, "logits/chosen": -1.933821678161621, "logits/rejected": 2.0009396076202393, "logps/chosen": -407.39422607421875, "logps/rejected": -861.2830810546875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.528697967529297, "rewards/margins": 26.151098251342773, "rewards/rejected": -33.67979431152344, "step": 2203 }, { "epoch": 1.3710730948678072, "grad_norm": 0.1820145547389984, "learning_rate": 3.016366989396035e-06, "logits/chosen": -2.080866813659668, "logits/rejected": 3.588825225830078, "logps/chosen": -406.08258056640625, "logps/rejected": -1154.141357421875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -9.055073738098145, "rewards/margins": 35.42332077026367, "rewards/rejected": -44.478397369384766, "step": 2204 }, { "epoch": 1.3716951788491447, "grad_norm": 0.861481785774231, "learning_rate": 3.0152143845089906e-06, "logits/chosen": 1.8138333559036255, "logits/rejected": 2.679360866546631, "logps/chosen": -702.68603515625, "logps/rejected": -949.3597412109375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -11.829734802246094, "rewards/margins": 21.387401580810547, "rewards/rejected": -33.217140197753906, "step": 2205 }, { "epoch": 1.372317262830482, "grad_norm": 3.076730763496016e-07, "learning_rate": 3.014061779621946e-06, "logits/chosen": 0.05164957046508789, "logits/rejected": 2.419743537902832, "logps/chosen": -531.583984375, "logps/rejected": -1120.018798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.508309364318848, "rewards/margins": 40.142372131347656, "rewards/rejected": -50.65068054199219, "step": 2206 }, { "epoch": 1.3729393468118196, "grad_norm": 3.052052761631785e-07, "learning_rate": 3.012909174734901e-06, "logits/chosen": 0.6604109406471252, "logits/rejected": 3.84415340423584, "logps/chosen": -599.694091796875, "logps/rejected": -1072.8603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.875895977020264, "rewards/margins": 30.14890480041504, "rewards/rejected": -37.024803161621094, "step": 2207 }, { "epoch": 1.3735614307931572, "grad_norm": 0.050999533385038376, "learning_rate": 3.0117565698478563e-06, "logits/chosen": 0.33914634585380554, "logits/rejected": 4.714583396911621, "logps/chosen": -534.7384033203125, "logps/rejected": -1071.54931640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.826058387756348, "rewards/margins": 28.858766555786133, "rewards/rejected": -38.68482208251953, "step": 2208 }, { "epoch": 1.3741835147744945, "grad_norm": 2.4684137315489352e-05, "learning_rate": 3.0106039649608115e-06, "logits/chosen": 1.701751947402954, "logits/rejected": 3.701206684112549, "logps/chosen": -615.7250366210938, "logps/rejected": -1081.4918212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.088967323303223, "rewards/margins": 31.6005802154541, "rewards/rejected": -43.68954849243164, "step": 2209 }, { "epoch": 1.374805598755832, "grad_norm": 4.353829363007433e-12, "learning_rate": 3.0094513600737667e-06, "logits/chosen": 0.5934591293334961, "logits/rejected": 3.640597343444824, "logps/chosen": -633.1799926757812, "logps/rejected": -1189.581298828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.416163444519043, "rewards/margins": 41.63801574707031, "rewards/rejected": -54.054176330566406, "step": 2210 }, { "epoch": 1.3754276827371696, "grad_norm": 3.4501149654388428, "learning_rate": 3.008298755186722e-06, "logits/chosen": 0.6663314700126648, "logits/rejected": 3.7698488235473633, "logps/chosen": -592.5361938476562, "logps/rejected": -1084.20751953125, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": -10.985772132873535, "rewards/margins": 28.351715087890625, "rewards/rejected": -39.33749008178711, "step": 2211 }, { "epoch": 1.376049766718507, "grad_norm": 1.214347004890442, "learning_rate": 3.007146150299677e-06, "logits/chosen": -0.8623265027999878, "logits/rejected": 1.3673484325408936, "logps/chosen": -475.67144775390625, "logps/rejected": -960.241943359375, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -11.342723846435547, "rewards/margins": 32.329654693603516, "rewards/rejected": -43.67237854003906, "step": 2212 }, { "epoch": 1.3766718506998445, "grad_norm": 3.2612618383609515e-07, "learning_rate": 3.005993545412633e-06, "logits/chosen": -4.33530330657959, "logits/rejected": 2.7184455394744873, "logps/chosen": -312.84344482421875, "logps/rejected": -1029.19580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.19746732711792, "rewards/margins": 33.2601318359375, "rewards/rejected": -39.457603454589844, "step": 2213 }, { "epoch": 1.3772939346811819, "grad_norm": 0.0015476691769436002, "learning_rate": 3.004840940525588e-06, "logits/chosen": -2.4688735008239746, "logits/rejected": 1.4806026220321655, "logps/chosen": -441.9232482910156, "logps/rejected": -934.3365478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.864378929138184, "rewards/margins": 26.260881423950195, "rewards/rejected": -33.12525939941406, "step": 2214 }, { "epoch": 1.3779160186625194, "grad_norm": 0.014643407426774502, "learning_rate": 3.0036883356385433e-06, "logits/chosen": 3.419661521911621, "logits/rejected": 4.806012153625488, "logps/chosen": -704.5573120117188, "logps/rejected": -1011.6292724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.705732345581055, "rewards/margins": 21.165294647216797, "rewards/rejected": -36.871028900146484, "step": 2215 }, { "epoch": 1.3785381026438568, "grad_norm": 8.841948509216309, "learning_rate": 3.0025357307514985e-06, "logits/chosen": -0.05596870183944702, "logits/rejected": 1.6367905139923096, "logps/chosen": -547.1561889648438, "logps/rejected": -957.1211547851562, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": -9.281091690063477, "rewards/margins": 22.269014358520508, "rewards/rejected": -31.55010986328125, "step": 2216 }, { "epoch": 1.3791601866251944, "grad_norm": 51.19075012207031, "learning_rate": 3.0013831258644537e-06, "logits/chosen": 0.614906907081604, "logits/rejected": 3.123727798461914, "logps/chosen": -648.155029296875, "logps/rejected": -1151.25146484375, "loss": 2.2017, "rewards/accuracies": 0.875, "rewards/chosen": -11.067476272583008, "rewards/margins": 29.206144332885742, "rewards/rejected": -40.273624420166016, "step": 2217 }, { "epoch": 1.379782270606532, "grad_norm": 29.52722930908203, "learning_rate": 3.000230520977409e-06, "logits/chosen": -0.36638355255126953, "logits/rejected": 2.1038315296173096, "logps/chosen": -515.594970703125, "logps/rejected": -849.2973022460938, "loss": 0.3916, "rewards/accuracies": 0.875, "rewards/chosen": -8.794394493103027, "rewards/margins": 23.82428741455078, "rewards/rejected": -32.618682861328125, "step": 2218 }, { "epoch": 1.3804043545878693, "grad_norm": 1.017388105392456, "learning_rate": 2.999077916090364e-06, "logits/chosen": 2.244713306427002, "logits/rejected": 2.9050168991088867, "logps/chosen": -666.0272216796875, "logps/rejected": -969.8905029296875, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -12.936464309692383, "rewards/margins": 22.269237518310547, "rewards/rejected": -35.20570373535156, "step": 2219 }, { "epoch": 1.3810264385692068, "grad_norm": 0.00017945458239410073, "learning_rate": 2.99792531120332e-06, "logits/chosen": 0.7347161173820496, "logits/rejected": 3.7560269832611084, "logps/chosen": -568.7210083007812, "logps/rejected": -1096.6600341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.303465843200684, "rewards/margins": 33.7734375, "rewards/rejected": -41.076904296875, "step": 2220 }, { "epoch": 1.3816485225505444, "grad_norm": 0.021343346685171127, "learning_rate": 2.996772706316275e-06, "logits/chosen": 1.6646852493286133, "logits/rejected": 4.067060470581055, "logps/chosen": -628.833740234375, "logps/rejected": -937.606689453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.903544425964355, "rewards/margins": 19.44011688232422, "rewards/rejected": -31.34366226196289, "step": 2221 }, { "epoch": 1.3822706065318817, "grad_norm": 0.30674031376838684, "learning_rate": 2.9956201014292303e-06, "logits/chosen": 2.5471351146698, "logits/rejected": 2.4241745471954346, "logps/chosen": -661.6102294921875, "logps/rejected": -892.48095703125, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -10.633338928222656, "rewards/margins": 27.561359405517578, "rewards/rejected": -38.1947021484375, "step": 2222 }, { "epoch": 1.3828926905132193, "grad_norm": 0.3236640989780426, "learning_rate": 2.9944674965421855e-06, "logits/chosen": -1.1065866947174072, "logits/rejected": 1.563366413116455, "logps/chosen": -471.33837890625, "logps/rejected": -941.787109375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -10.373132705688477, "rewards/margins": 27.12007713317871, "rewards/rejected": -37.49320983886719, "step": 2223 }, { "epoch": 1.3835147744945568, "grad_norm": 1.721951961517334, "learning_rate": 2.9933148916551407e-06, "logits/chosen": 2.6324095726013184, "logits/rejected": 3.3818371295928955, "logps/chosen": -613.0682983398438, "logps/rejected": -748.551025390625, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -14.490086555480957, "rewards/margins": 12.575876235961914, "rewards/rejected": -27.065963745117188, "step": 2224 }, { "epoch": 1.3841368584758942, "grad_norm": 0.0003514946438372135, "learning_rate": 2.992162286768096e-06, "logits/chosen": -1.2207633256912231, "logits/rejected": 2.467137336730957, "logps/chosen": -384.5999450683594, "logps/rejected": -804.305908203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.229488372802734, "rewards/margins": 23.717012405395508, "rewards/rejected": -31.946500778198242, "step": 2225 }, { "epoch": 1.3847589424572317, "grad_norm": 7.888747692108154, "learning_rate": 2.991009681881051e-06, "logits/chosen": 0.040084779262542725, "logits/rejected": 4.101715087890625, "logps/chosen": -499.83514404296875, "logps/rejected": -1046.8726806640625, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": -9.456901550292969, "rewards/margins": 27.2252254486084, "rewards/rejected": -36.682125091552734, "step": 2226 }, { "epoch": 1.3853810264385693, "grad_norm": 0.031036915257573128, "learning_rate": 2.989857076994007e-06, "logits/chosen": -3.403718948364258, "logits/rejected": 2.1908183097839355, "logps/chosen": -233.61251831054688, "logps/rejected": -785.1973876953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.809695720672607, "rewards/margins": 24.013778686523438, "rewards/rejected": -30.823474884033203, "step": 2227 }, { "epoch": 1.3860031104199066, "grad_norm": 0.4252808392047882, "learning_rate": 2.988704472106962e-06, "logits/chosen": 0.7094401717185974, "logits/rejected": 1.9696271419525146, "logps/chosen": -528.63037109375, "logps/rejected": -785.7152099609375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -7.194915294647217, "rewards/margins": 25.390228271484375, "rewards/rejected": -32.58514404296875, "step": 2228 }, { "epoch": 1.3866251944012442, "grad_norm": 0.004287133924663067, "learning_rate": 2.9875518672199173e-06, "logits/chosen": -1.7506518363952637, "logits/rejected": 1.3359475135803223, "logps/chosen": -471.49822998046875, "logps/rejected": -945.1353759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.74657154083252, "rewards/margins": 30.829132080078125, "rewards/rejected": -39.57570266723633, "step": 2229 }, { "epoch": 1.3872472783825818, "grad_norm": 0.012623314745724201, "learning_rate": 2.9863992623328725e-06, "logits/chosen": -0.06425750255584717, "logits/rejected": 2.667785167694092, "logps/chosen": -570.6763916015625, "logps/rejected": -1034.1314697265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.794634819030762, "rewards/margins": 22.76612091064453, "rewards/rejected": -34.560752868652344, "step": 2230 }, { "epoch": 1.3878693623639191, "grad_norm": 0.05512861907482147, "learning_rate": 2.9852466574458277e-06, "logits/chosen": 0.703946590423584, "logits/rejected": 4.535602569580078, "logps/chosen": -480.8987121582031, "logps/rejected": -936.260986328125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.488350868225098, "rewards/margins": 27.323925018310547, "rewards/rejected": -35.81227493286133, "step": 2231 }, { "epoch": 1.3884914463452567, "grad_norm": 2.850918008334702e-06, "learning_rate": 2.984094052558783e-06, "logits/chosen": -3.057971715927124, "logits/rejected": 4.859375, "logps/chosen": -320.9295959472656, "logps/rejected": -1208.96435546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.41721248626709, "rewards/margins": 38.850830078125, "rewards/rejected": -45.26803970336914, "step": 2232 }, { "epoch": 1.389113530326594, "grad_norm": 13.858604431152344, "learning_rate": 2.982941447671738e-06, "logits/chosen": -0.5668371319770813, "logits/rejected": 5.378746509552002, "logps/chosen": -516.4816284179688, "logps/rejected": -1168.6309814453125, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": -11.723061561584473, "rewards/margins": 32.517276763916016, "rewards/rejected": -44.24034118652344, "step": 2233 }, { "epoch": 1.3897356143079316, "grad_norm": 0.018682241439819336, "learning_rate": 2.981788842784694e-06, "logits/chosen": -0.5560173392295837, "logits/rejected": 4.388740062713623, "logps/chosen": -478.714111328125, "logps/rejected": -1018.8369140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.856765270233154, "rewards/margins": 26.526676177978516, "rewards/rejected": -34.38343811035156, "step": 2234 }, { "epoch": 1.390357698289269, "grad_norm": 0.00024261536600533873, "learning_rate": 2.980636237897649e-06, "logits/chosen": 0.7792164087295532, "logits/rejected": 4.847288131713867, "logps/chosen": -501.6375732421875, "logps/rejected": -1087.42626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.4685640335083, "rewards/margins": 34.76447296142578, "rewards/rejected": -44.23303985595703, "step": 2235 }, { "epoch": 1.3909797822706065, "grad_norm": 0.3464057147502899, "learning_rate": 2.9794836330106043e-06, "logits/chosen": 1.542374849319458, "logits/rejected": 1.6514575481414795, "logps/chosen": -570.9237670898438, "logps/rejected": -815.7724609375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -8.963167190551758, "rewards/margins": 26.82428741455078, "rewards/rejected": -35.78745651245117, "step": 2236 }, { "epoch": 1.391601866251944, "grad_norm": 8.877638816833496, "learning_rate": 2.9783310281235595e-06, "logits/chosen": 2.5546209812164307, "logits/rejected": 2.9814341068267822, "logps/chosen": -670.451171875, "logps/rejected": -997.579833984375, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": -12.581817626953125, "rewards/margins": 26.853092193603516, "rewards/rejected": -39.434906005859375, "step": 2237 }, { "epoch": 1.3922239502332814, "grad_norm": 6.948227405548096, "learning_rate": 2.9771784232365147e-06, "logits/chosen": 1.135973334312439, "logits/rejected": 3.1979124546051025, "logps/chosen": -561.4859619140625, "logps/rejected": -965.5455322265625, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": -4.59466552734375, "rewards/margins": 27.393587112426758, "rewards/rejected": -31.988248825073242, "step": 2238 }, { "epoch": 1.392846034214619, "grad_norm": 0.023626163601875305, "learning_rate": 2.97602581834947e-06, "logits/chosen": 1.0914082527160645, "logits/rejected": 2.423067569732666, "logps/chosen": -627.2195434570312, "logps/rejected": -938.776123046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.695022583007812, "rewards/margins": 28.72723960876465, "rewards/rejected": -40.42226028442383, "step": 2239 }, { "epoch": 1.3934681181959565, "grad_norm": 0.019772088155150414, "learning_rate": 2.974873213462425e-06, "logits/chosen": 1.4812792539596558, "logits/rejected": 3.1909940242767334, "logps/chosen": -725.069091796875, "logps/rejected": -1122.466796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.017687797546387, "rewards/margins": 28.807287216186523, "rewards/rejected": -40.824974060058594, "step": 2240 }, { "epoch": 1.3940902021772938, "grad_norm": 16.312362670898438, "learning_rate": 2.9737206085753804e-06, "logits/chosen": 0.18421989679336548, "logits/rejected": 1.6313143968582153, "logps/chosen": -666.5517578125, "logps/rejected": -926.6065063476562, "loss": 0.0804, "rewards/accuracies": 1.0, "rewards/chosen": -9.068418502807617, "rewards/margins": 20.901081085205078, "rewards/rejected": -29.969497680664062, "step": 2241 }, { "epoch": 1.3947122861586314, "grad_norm": 16.657617568969727, "learning_rate": 2.972568003688336e-06, "logits/chosen": -0.5953693389892578, "logits/rejected": 1.4768779277801514, "logps/chosen": -424.75750732421875, "logps/rejected": -771.7853393554688, "loss": 0.0893, "rewards/accuracies": 0.875, "rewards/chosen": -7.179218769073486, "rewards/margins": 23.26014518737793, "rewards/rejected": -30.43936538696289, "step": 2242 }, { "epoch": 1.395334370139969, "grad_norm": 1.2527492376790406e-09, "learning_rate": 2.9714153988012913e-06, "logits/chosen": -1.4872426986694336, "logits/rejected": 3.9436469078063965, "logps/chosen": -395.34527587890625, "logps/rejected": -1169.1190185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.897050857543945, "rewards/margins": 39.89738082885742, "rewards/rejected": -48.79443359375, "step": 2243 }, { "epoch": 1.3959564541213063, "grad_norm": 1.4284208260662012e-09, "learning_rate": 2.9702627939142465e-06, "logits/chosen": 2.7672107219696045, "logits/rejected": 4.6261372566223145, "logps/chosen": -765.1670532226562, "logps/rejected": -1209.5228271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.569221496582031, "rewards/margins": 39.72999572753906, "rewards/rejected": -48.299217224121094, "step": 2244 }, { "epoch": 1.3965785381026439, "grad_norm": 32.85478973388672, "learning_rate": 2.9691101890272017e-06, "logits/chosen": -2.1204891204833984, "logits/rejected": 2.243065595626831, "logps/chosen": -384.271728515625, "logps/rejected": -898.5313720703125, "loss": 1.0508, "rewards/accuracies": 0.875, "rewards/chosen": -13.96297836303711, "rewards/margins": 19.47112274169922, "rewards/rejected": -33.43409729003906, "step": 2245 }, { "epoch": 1.3972006220839814, "grad_norm": 5.930408477783203, "learning_rate": 2.967957584140157e-06, "logits/chosen": 0.24247993528842926, "logits/rejected": 4.5272369384765625, "logps/chosen": -508.9815368652344, "logps/rejected": -1157.3570556640625, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": -9.666451454162598, "rewards/margins": 38.191650390625, "rewards/rejected": -47.85810089111328, "step": 2246 }, { "epoch": 1.3978227060653188, "grad_norm": 9.793144272407517e-05, "learning_rate": 2.966804979253112e-06, "logits/chosen": 0.7921026945114136, "logits/rejected": 4.247562885284424, "logps/chosen": -560.756591796875, "logps/rejected": -1108.50927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.537893295288086, "rewards/margins": 34.58842468261719, "rewards/rejected": -43.126312255859375, "step": 2247 }, { "epoch": 1.3984447900466563, "grad_norm": 0.8745850324630737, "learning_rate": 2.9656523743660674e-06, "logits/chosen": 0.8842285871505737, "logits/rejected": 2.2051475048065186, "logps/chosen": -557.5842895507812, "logps/rejected": -865.9134521484375, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -10.312744140625, "rewards/margins": 24.951759338378906, "rewards/rejected": -35.264503479003906, "step": 2248 }, { "epoch": 1.399066874027994, "grad_norm": 0.029392994940280914, "learning_rate": 2.964499769479023e-06, "logits/chosen": 1.2681587934494019, "logits/rejected": 4.473254203796387, "logps/chosen": -582.9454956054688, "logps/rejected": -1216.732666015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.867018699645996, "rewards/margins": 34.12983322143555, "rewards/rejected": -45.996849060058594, "step": 2249 }, { "epoch": 1.3996889580093312, "grad_norm": 0.3804221451282501, "learning_rate": 2.9633471645919783e-06, "logits/chosen": -1.6605898141860962, "logits/rejected": 3.4152517318725586, "logps/chosen": -508.3944091796875, "logps/rejected": -1163.526611328125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -13.227392196655273, "rewards/margins": 37.94681167602539, "rewards/rejected": -51.1742057800293, "step": 2250 }, { "epoch": 1.4003110419906688, "grad_norm": 2.7944657698952824e-10, "learning_rate": 2.9621945597049335e-06, "logits/chosen": -1.5698888301849365, "logits/rejected": 3.95448637008667, "logps/chosen": -407.74139404296875, "logps/rejected": -1109.8446044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.576188087463379, "rewards/margins": 38.68739318847656, "rewards/rejected": -45.263580322265625, "step": 2251 }, { "epoch": 1.4009331259720061, "grad_norm": 1.1104753017425537, "learning_rate": 2.9610419548178887e-06, "logits/chosen": 0.010650875978171825, "logits/rejected": 1.6669846773147583, "logps/chosen": -573.3395385742188, "logps/rejected": -972.3338623046875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -9.772941589355469, "rewards/margins": 27.10225486755371, "rewards/rejected": -36.87519454956055, "step": 2252 }, { "epoch": 1.4015552099533437, "grad_norm": 0.0018733566394075751, "learning_rate": 2.959889349930844e-06, "logits/chosen": 3.3101394176483154, "logits/rejected": 5.083858013153076, "logps/chosen": -556.2260131835938, "logps/rejected": -927.8904418945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.198858261108398, "rewards/margins": 26.321956634521484, "rewards/rejected": -34.52081298828125, "step": 2253 }, { "epoch": 1.402177293934681, "grad_norm": 0.9949904084205627, "learning_rate": 2.958736745043799e-06, "logits/chosen": 0.13389703631401062, "logits/rejected": 2.4801292419433594, "logps/chosen": -541.17041015625, "logps/rejected": -893.1530151367188, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -11.718971252441406, "rewards/margins": 24.820383071899414, "rewards/rejected": -36.53935241699219, "step": 2254 }, { "epoch": 1.4027993779160186, "grad_norm": 1.041222731146263e-06, "learning_rate": 2.9575841401567544e-06, "logits/chosen": 0.26652705669403076, "logits/rejected": 4.539582252502441, "logps/chosen": -585.6783447265625, "logps/rejected": -1185.69580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.327654838562012, "rewards/margins": 33.012325286865234, "rewards/rejected": -45.33998107910156, "step": 2255 }, { "epoch": 1.4034214618973562, "grad_norm": 2.361824044783134e-06, "learning_rate": 2.95643153526971e-06, "logits/chosen": 0.5165107846260071, "logits/rejected": 3.6015896797180176, "logps/chosen": -619.5923461914062, "logps/rejected": -1163.2398681640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.083619117736816, "rewards/margins": 40.63835144042969, "rewards/rejected": -50.72196960449219, "step": 2256 }, { "epoch": 1.4040435458786935, "grad_norm": 2.7180160486750538e-06, "learning_rate": 2.9552789303826653e-06, "logits/chosen": -1.3893684148788452, "logits/rejected": 3.102480888366699, "logps/chosen": -550.6753540039062, "logps/rejected": -1261.0013427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.005228042602539, "rewards/margins": 39.15861511230469, "rewards/rejected": -53.16384506225586, "step": 2257 }, { "epoch": 1.404665629860031, "grad_norm": 0.007969174534082413, "learning_rate": 2.9541263254956205e-06, "logits/chosen": 2.2342042922973633, "logits/rejected": 2.6852080821990967, "logps/chosen": -699.7215576171875, "logps/rejected": -965.824462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.429476737976074, "rewards/margins": 26.209827423095703, "rewards/rejected": -40.639305114746094, "step": 2258 }, { "epoch": 1.4052877138413686, "grad_norm": 3.5950510209659114e-05, "learning_rate": 2.9529737206085757e-06, "logits/chosen": 1.4159845113754272, "logits/rejected": 3.3737049102783203, "logps/chosen": -601.9073486328125, "logps/rejected": -980.881103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.271068572998047, "rewards/margins": 29.318340301513672, "rewards/rejected": -41.58940887451172, "step": 2259 }, { "epoch": 1.405909797822706, "grad_norm": 0.0010115044424310327, "learning_rate": 2.951821115721531e-06, "logits/chosen": 0.3141559064388275, "logits/rejected": 3.4854958057403564, "logps/chosen": -485.7015686035156, "logps/rejected": -900.7958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.961186408996582, "rewards/margins": 25.101022720336914, "rewards/rejected": -34.06221008300781, "step": 2260 }, { "epoch": 1.4065318818040435, "grad_norm": 0.007804466411471367, "learning_rate": 2.950668510834486e-06, "logits/chosen": 0.22595328092575073, "logits/rejected": 3.9139413833618164, "logps/chosen": -598.7637939453125, "logps/rejected": -1206.5079345703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.31296443939209, "rewards/margins": 35.79632568359375, "rewards/rejected": -44.10929489135742, "step": 2261 }, { "epoch": 1.407153965785381, "grad_norm": 29.254375457763672, "learning_rate": 2.9495159059474414e-06, "logits/chosen": -2.053928852081299, "logits/rejected": 3.762558698654175, "logps/chosen": -494.0701904296875, "logps/rejected": -1236.5478515625, "loss": 0.2308, "rewards/accuracies": 0.875, "rewards/chosen": -10.962211608886719, "rewards/margins": 37.9639778137207, "rewards/rejected": -48.926185607910156, "step": 2262 }, { "epoch": 1.4077760497667184, "grad_norm": 0.03585294261574745, "learning_rate": 2.9483633010603966e-06, "logits/chosen": 2.6447174549102783, "logits/rejected": 3.4780094623565674, "logps/chosen": -561.7684326171875, "logps/rejected": -900.8305053710938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -13.810640335083008, "rewards/margins": 23.371841430664062, "rewards/rejected": -37.18247985839844, "step": 2263 }, { "epoch": 1.408398133748056, "grad_norm": 9.315655915997922e-05, "learning_rate": 2.9472106961733522e-06, "logits/chosen": -0.3833075761795044, "logits/rejected": 2.9224395751953125, "logps/chosen": -433.1549377441406, "logps/rejected": -898.0396728515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.635376930236816, "rewards/margins": 28.6442928314209, "rewards/rejected": -37.27967071533203, "step": 2264 }, { "epoch": 1.4090202177293936, "grad_norm": 0.024829663336277008, "learning_rate": 2.9460580912863075e-06, "logits/chosen": -1.8885233402252197, "logits/rejected": 1.543205976486206, "logps/chosen": -534.0154418945312, "logps/rejected": -1084.5830078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.128021240234375, "rewards/margins": 30.66397476196289, "rewards/rejected": -41.7919921875, "step": 2265 }, { "epoch": 1.409642301710731, "grad_norm": 0.21217453479766846, "learning_rate": 2.9449054863992627e-06, "logits/chosen": -0.9531707763671875, "logits/rejected": 2.394754409790039, "logps/chosen": -617.4303588867188, "logps/rejected": -1086.7529296875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.659804344177246, "rewards/margins": 28.16089630126953, "rewards/rejected": -35.820701599121094, "step": 2266 }, { "epoch": 1.4102643856920685, "grad_norm": 1.5152208106883336e-05, "learning_rate": 2.943752881512218e-06, "logits/chosen": -0.8151946067810059, "logits/rejected": 3.052356004714966, "logps/chosen": -365.26904296875, "logps/rejected": -995.769775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.043517112731934, "rewards/margins": 35.24461364746094, "rewards/rejected": -40.28812789916992, "step": 2267 }, { "epoch": 1.410886469673406, "grad_norm": 21.003597259521484, "learning_rate": 2.942600276625173e-06, "logits/chosen": 1.9905128479003906, "logits/rejected": 4.831020355224609, "logps/chosen": -658.3651123046875, "logps/rejected": -1095.9422607421875, "loss": 0.1103, "rewards/accuracies": 0.875, "rewards/chosen": -9.824920654296875, "rewards/margins": 30.134084701538086, "rewards/rejected": -39.95900344848633, "step": 2268 }, { "epoch": 1.4115085536547434, "grad_norm": 0.0016211661277338862, "learning_rate": 2.9414476717381284e-06, "logits/chosen": 0.7444226741790771, "logits/rejected": 4.623043060302734, "logps/chosen": -614.892333984375, "logps/rejected": -1186.7662353515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.60790729522705, "rewards/margins": 33.84346008300781, "rewards/rejected": -43.45137023925781, "step": 2269 }, { "epoch": 1.412130637636081, "grad_norm": 0.00010323274182155728, "learning_rate": 2.9402950668510836e-06, "logits/chosen": 1.018390417098999, "logits/rejected": 4.094212532043457, "logps/chosen": -384.023681640625, "logps/rejected": -894.3253173828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.826488494873047, "rewards/margins": 29.77437400817871, "rewards/rejected": -37.60086441040039, "step": 2270 }, { "epoch": 1.4127527216174183, "grad_norm": 22.7961483001709, "learning_rate": 2.9391424619640392e-06, "logits/chosen": -0.5670567154884338, "logits/rejected": 0.5044700503349304, "logps/chosen": -561.7535400390625, "logps/rejected": -897.264892578125, "loss": 0.0895, "rewards/accuracies": 0.875, "rewards/chosen": -10.388435363769531, "rewards/margins": 25.081241607666016, "rewards/rejected": -35.46967697143555, "step": 2271 }, { "epoch": 1.4133748055987558, "grad_norm": 2.3609633445739746, "learning_rate": 2.9379898570769945e-06, "logits/chosen": 0.33859747648239136, "logits/rejected": 4.584858417510986, "logps/chosen": -459.1435546875, "logps/rejected": -949.4611206054688, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -10.196245193481445, "rewards/margins": 22.02639389038086, "rewards/rejected": -32.22264099121094, "step": 2272 }, { "epoch": 1.4139968895800932, "grad_norm": 0.01303979940712452, "learning_rate": 2.9368372521899497e-06, "logits/chosen": 0.0018563270568847656, "logits/rejected": 2.0656213760375977, "logps/chosen": -534.7927856445312, "logps/rejected": -904.1492919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.918089389801025, "rewards/margins": 28.851530075073242, "rewards/rejected": -35.76961898803711, "step": 2273 }, { "epoch": 1.4146189735614307, "grad_norm": 31.436941146850586, "learning_rate": 2.935684647302905e-06, "logits/chosen": 2.9472086429595947, "logits/rejected": 3.870821237564087, "logps/chosen": -646.843505859375, "logps/rejected": -1069.711669921875, "loss": 0.3968, "rewards/accuracies": 0.875, "rewards/chosen": -13.343791007995605, "rewards/margins": 31.182849884033203, "rewards/rejected": -44.526641845703125, "step": 2274 }, { "epoch": 1.4152410575427683, "grad_norm": 2.008999217650853e-06, "learning_rate": 2.93453204241586e-06, "logits/chosen": -2.290863037109375, "logits/rejected": 2.3186676502227783, "logps/chosen": -367.4084777832031, "logps/rejected": -951.1954345703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.120453357696533, "rewards/margins": 29.2360782623291, "rewards/rejected": -34.356529235839844, "step": 2275 }, { "epoch": 1.4158631415241056, "grad_norm": 0.04517332464456558, "learning_rate": 2.9333794375288154e-06, "logits/chosen": -0.20349359512329102, "logits/rejected": 1.8677873611450195, "logps/chosen": -428.3921203613281, "logps/rejected": -739.129150390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.071450710296631, "rewards/margins": 20.734577178955078, "rewards/rejected": -27.8060302734375, "step": 2276 }, { "epoch": 1.4164852255054432, "grad_norm": 0.04162021726369858, "learning_rate": 2.9322268326417706e-06, "logits/chosen": 0.4728636145591736, "logits/rejected": 3.5390806198120117, "logps/chosen": -489.19610595703125, "logps/rejected": -1045.33056640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.421915054321289, "rewards/margins": 34.723148345947266, "rewards/rejected": -43.14506149291992, "step": 2277 }, { "epoch": 1.4171073094867808, "grad_norm": 0.0008894521743059158, "learning_rate": 2.9310742277547262e-06, "logits/chosen": -0.6772791147232056, "logits/rejected": 2.196185350418091, "logps/chosen": -344.69866943359375, "logps/rejected": -848.551025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.582999229431152, "rewards/margins": 31.66111183166504, "rewards/rejected": -37.244110107421875, "step": 2278 }, { "epoch": 1.417729393468118, "grad_norm": 6.018635811955164e-09, "learning_rate": 2.9299216228676815e-06, "logits/chosen": 0.21843993663787842, "logits/rejected": 4.459475517272949, "logps/chosen": -431.35394287109375, "logps/rejected": -998.3802490234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.352250099182129, "rewards/margins": 32.244117736816406, "rewards/rejected": -42.59636688232422, "step": 2279 }, { "epoch": 1.4183514774494557, "grad_norm": 2.377005512244068e-05, "learning_rate": 2.9287690179806367e-06, "logits/chosen": 0.881413459777832, "logits/rejected": 3.4689924716949463, "logps/chosen": -461.482666015625, "logps/rejected": -943.070068359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.424365997314453, "rewards/margins": 29.584659576416016, "rewards/rejected": -38.00902557373047, "step": 2280 }, { "epoch": 1.4189735614307932, "grad_norm": 3.8007092371117324e-05, "learning_rate": 2.927616413093592e-06, "logits/chosen": 0.7045872211456299, "logits/rejected": 2.315516710281372, "logps/chosen": -603.4990234375, "logps/rejected": -907.1836547851562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.995718955993652, "rewards/margins": 25.444580078125, "rewards/rejected": -34.44029998779297, "step": 2281 }, { "epoch": 1.4195956454121306, "grad_norm": 42.845584869384766, "learning_rate": 2.926463808206547e-06, "logits/chosen": 0.6564313173294067, "logits/rejected": 2.4238367080688477, "logps/chosen": -567.2952880859375, "logps/rejected": -792.8505859375, "loss": 0.7587, "rewards/accuracies": 0.875, "rewards/chosen": -9.14471435546875, "rewards/margins": 19.311264038085938, "rewards/rejected": -28.455978393554688, "step": 2282 }, { "epoch": 1.4202177293934681, "grad_norm": 2.9977618964949215e-07, "learning_rate": 2.9253112033195024e-06, "logits/chosen": -2.066939115524292, "logits/rejected": 4.052614212036133, "logps/chosen": -263.885498046875, "logps/rejected": -1071.652099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.650771141052246, "rewards/margins": 40.807212829589844, "rewards/rejected": -45.457984924316406, "step": 2283 }, { "epoch": 1.4208398133748057, "grad_norm": 49.3410530090332, "learning_rate": 2.9241585984324576e-06, "logits/chosen": 2.585322856903076, "logits/rejected": 4.387059211730957, "logps/chosen": -590.73486328125, "logps/rejected": -903.32080078125, "loss": 0.9985, "rewards/accuracies": 0.75, "rewards/chosen": -9.027231216430664, "rewards/margins": 23.2886905670166, "rewards/rejected": -32.315921783447266, "step": 2284 }, { "epoch": 1.421461897356143, "grad_norm": 0.18115392327308655, "learning_rate": 2.9230059935454132e-06, "logits/chosen": 0.1543576568365097, "logits/rejected": 2.600271701812744, "logps/chosen": -613.9583740234375, "logps/rejected": -1017.9857788085938, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.766103744506836, "rewards/margins": 22.681135177612305, "rewards/rejected": -31.447237014770508, "step": 2285 }, { "epoch": 1.4220839813374806, "grad_norm": 0.007985980249941349, "learning_rate": 2.9218533886583685e-06, "logits/chosen": 0.11727690696716309, "logits/rejected": 1.414735198020935, "logps/chosen": -555.1712036132812, "logps/rejected": -842.997314453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.974403381347656, "rewards/margins": 23.567520141601562, "rewards/rejected": -33.54192352294922, "step": 2286 }, { "epoch": 1.4227060653188182, "grad_norm": 0.11219903826713562, "learning_rate": 2.9207007837713237e-06, "logits/chosen": 1.3696763515472412, "logits/rejected": 3.7046830654144287, "logps/chosen": -551.7432250976562, "logps/rejected": -996.8988647460938, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -11.567296981811523, "rewards/margins": 30.2374267578125, "rewards/rejected": -41.804725646972656, "step": 2287 }, { "epoch": 1.4233281493001555, "grad_norm": 0.0010436129523441195, "learning_rate": 2.919548178884279e-06, "logits/chosen": -1.4171717166900635, "logits/rejected": 2.6286206245422363, "logps/chosen": -534.484130859375, "logps/rejected": -1125.8145751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.40506362915039, "rewards/margins": 30.196651458740234, "rewards/rejected": -38.601715087890625, "step": 2288 }, { "epoch": 1.423950233281493, "grad_norm": 7.079758506733924e-05, "learning_rate": 2.918395573997234e-06, "logits/chosen": 0.5658026933670044, "logits/rejected": 3.566821575164795, "logps/chosen": -626.707275390625, "logps/rejected": -1013.3258056640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.590632438659668, "rewards/margins": 25.71282958984375, "rewards/rejected": -34.303462982177734, "step": 2289 }, { "epoch": 1.4245723172628304, "grad_norm": 0.00018804903083946556, "learning_rate": 2.9172429691101894e-06, "logits/chosen": 1.4859108924865723, "logits/rejected": 4.400835990905762, "logps/chosen": -660.5794677734375, "logps/rejected": -1268.1781005859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.740732192993164, "rewards/margins": 40.89760971069336, "rewards/rejected": -50.638336181640625, "step": 2290 }, { "epoch": 1.425194401244168, "grad_norm": 1.8972766399383545, "learning_rate": 2.9160903642231446e-06, "logits/chosen": 0.980305016040802, "logits/rejected": 3.3063576221466064, "logps/chosen": -612.8570556640625, "logps/rejected": -1093.697998046875, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -8.83894157409668, "rewards/margins": 33.148048400878906, "rewards/rejected": -41.98699188232422, "step": 2291 }, { "epoch": 1.4258164852255055, "grad_norm": 1.6478937864303589, "learning_rate": 2.9149377593361e-06, "logits/chosen": -0.0209181010723114, "logits/rejected": 3.4129858016967773, "logps/chosen": -578.6953735351562, "logps/rejected": -907.123046875, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -6.504940986633301, "rewards/margins": 20.077247619628906, "rewards/rejected": -26.582189559936523, "step": 2292 }, { "epoch": 1.4264385692068429, "grad_norm": 0.07732792943716049, "learning_rate": 2.9137851544490555e-06, "logits/chosen": -0.7631025314331055, "logits/rejected": 4.451732635498047, "logps/chosen": -446.68280029296875, "logps/rejected": -1024.406494140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.762255668640137, "rewards/margins": 31.770727157592773, "rewards/rejected": -40.532981872558594, "step": 2293 }, { "epoch": 1.4270606531881804, "grad_norm": 4.2958705307682976e-05, "learning_rate": 2.9126325495620107e-06, "logits/chosen": 0.22674143314361572, "logits/rejected": 3.4945809841156006, "logps/chosen": -500.32562255859375, "logps/rejected": -1025.9798583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.780029296875, "rewards/margins": 27.635435104370117, "rewards/rejected": -36.41546630859375, "step": 2294 }, { "epoch": 1.4276827371695178, "grad_norm": 42.8979377746582, "learning_rate": 2.911479944674966e-06, "logits/chosen": -0.6848071217536926, "logits/rejected": 2.201697587966919, "logps/chosen": -536.333251953125, "logps/rejected": -903.2261962890625, "loss": 0.6641, "rewards/accuracies": 0.875, "rewards/chosen": -9.489903450012207, "rewards/margins": 22.598129272460938, "rewards/rejected": -32.08803176879883, "step": 2295 }, { "epoch": 1.4283048211508553, "grad_norm": 0.030604323372244835, "learning_rate": 2.910327339787921e-06, "logits/chosen": -0.04041486978530884, "logits/rejected": 2.474832057952881, "logps/chosen": -581.4979248046875, "logps/rejected": -952.0086059570312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.742212295532227, "rewards/margins": 24.62486457824707, "rewards/rejected": -33.36707305908203, "step": 2296 }, { "epoch": 1.428926905132193, "grad_norm": 9.851161166807287e-07, "learning_rate": 2.9091747349008764e-06, "logits/chosen": 0.3778786063194275, "logits/rejected": 1.9946703910827637, "logps/chosen": -456.3149108886719, "logps/rejected": -777.9240112304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.893744468688965, "rewards/margins": 26.18206787109375, "rewards/rejected": -30.075809478759766, "step": 2297 }, { "epoch": 1.4295489891135302, "grad_norm": 0.0991005226969719, "learning_rate": 2.9080221300138316e-06, "logits/chosen": 2.706174373626709, "logits/rejected": 4.151269912719727, "logps/chosen": -758.4832763671875, "logps/rejected": -1072.541015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -8.922719955444336, "rewards/margins": 25.964866638183594, "rewards/rejected": -34.88758850097656, "step": 2298 }, { "epoch": 1.4301710730948678, "grad_norm": 0.07812916487455368, "learning_rate": 2.906869525126787e-06, "logits/chosen": -0.47702324390411377, "logits/rejected": 3.2301926612854004, "logps/chosen": -474.9299011230469, "logps/rejected": -996.20556640625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.653593063354492, "rewards/margins": 23.850337982177734, "rewards/rejected": -33.503929138183594, "step": 2299 }, { "epoch": 1.4307931570762054, "grad_norm": 0.00029026303673163056, "learning_rate": 2.9057169202397425e-06, "logits/chosen": -0.5452579259872437, "logits/rejected": 4.23468542098999, "logps/chosen": -490.8875427246094, "logps/rejected": -1042.40869140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.9663872718811035, "rewards/margins": 32.5419807434082, "rewards/rejected": -39.50836944580078, "step": 2300 }, { "epoch": 1.4314152410575427, "grad_norm": 30.1119384765625, "learning_rate": 2.9045643153526977e-06, "logits/chosen": -2.7672839164733887, "logits/rejected": 2.42836594581604, "logps/chosen": -380.5882568359375, "logps/rejected": -959.0693359375, "loss": 0.2293, "rewards/accuracies": 0.875, "rewards/chosen": -8.109725952148438, "rewards/margins": 24.500015258789062, "rewards/rejected": -32.6097412109375, "step": 2301 }, { "epoch": 1.4320373250388803, "grad_norm": 4.354133129119873, "learning_rate": 2.903411710465653e-06, "logits/chosen": 1.5805208683013916, "logits/rejected": 3.6228115558624268, "logps/chosen": -873.7197265625, "logps/rejected": -1153.635498046875, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -15.920944213867188, "rewards/margins": 16.957725524902344, "rewards/rejected": -32.87866973876953, "step": 2302 }, { "epoch": 1.4326594090202178, "grad_norm": 0.0239068903028965, "learning_rate": 2.902259105578608e-06, "logits/chosen": -2.2274341583251953, "logits/rejected": 0.8469206094741821, "logps/chosen": -358.734130859375, "logps/rejected": -833.1475219726562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.413700580596924, "rewards/margins": 25.46902084350586, "rewards/rejected": -32.882720947265625, "step": 2303 }, { "epoch": 1.4332814930015552, "grad_norm": 0.7325589656829834, "learning_rate": 2.9011065006915634e-06, "logits/chosen": -0.23840701580047607, "logits/rejected": 2.6510872840881348, "logps/chosen": -502.33648681640625, "logps/rejected": -865.9973754882812, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -9.252693176269531, "rewards/margins": 16.553138732910156, "rewards/rejected": -25.805830001831055, "step": 2304 }, { "epoch": 1.4339035769828927, "grad_norm": 2.370480537414551, "learning_rate": 2.8999538958045186e-06, "logits/chosen": -0.7831411361694336, "logits/rejected": 3.6987407207489014, "logps/chosen": -384.1199951171875, "logps/rejected": -955.3935546875, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": -9.02347469329834, "rewards/margins": 23.749711990356445, "rewards/rejected": -32.77318572998047, "step": 2305 }, { "epoch": 1.4345256609642303, "grad_norm": 0.9532006978988647, "learning_rate": 2.898801290917474e-06, "logits/chosen": 1.7668182849884033, "logits/rejected": 4.3650007247924805, "logps/chosen": -664.02099609375, "logps/rejected": -1065.537353515625, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -8.304007530212402, "rewards/margins": 27.521072387695312, "rewards/rejected": -35.82508087158203, "step": 2306 }, { "epoch": 1.4351477449455676, "grad_norm": 0.30767178535461426, "learning_rate": 2.8976486860304295e-06, "logits/chosen": 3.121201515197754, "logits/rejected": 3.2817912101745605, "logps/chosen": -619.9653930664062, "logps/rejected": -893.48779296875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -9.179896354675293, "rewards/margins": 22.813602447509766, "rewards/rejected": -31.993501663208008, "step": 2307 }, { "epoch": 1.4357698289269052, "grad_norm": 2.5073416054510744e-06, "learning_rate": 2.8964960811433847e-06, "logits/chosen": 0.9362043142318726, "logits/rejected": 3.051908493041992, "logps/chosen": -525.4351196289062, "logps/rejected": -1072.8572998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.091745376586914, "rewards/margins": 34.52039337158203, "rewards/rejected": -43.61213684082031, "step": 2308 }, { "epoch": 1.4363919129082425, "grad_norm": 1.418189525604248, "learning_rate": 2.89534347625634e-06, "logits/chosen": 2.421281576156616, "logits/rejected": 4.215733051300049, "logps/chosen": -625.8364868164062, "logps/rejected": -989.9420776367188, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -9.793773651123047, "rewards/margins": 25.186986923217773, "rewards/rejected": -34.98076248168945, "step": 2309 }, { "epoch": 1.43701399688958, "grad_norm": 0.0014669963857159019, "learning_rate": 2.894190871369295e-06, "logits/chosen": 0.5397623181343079, "logits/rejected": 3.229332447052002, "logps/chosen": -356.3837585449219, "logps/rejected": -761.350830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.371195316314697, "rewards/margins": 19.48046875, "rewards/rejected": -26.85166358947754, "step": 2310 }, { "epoch": 1.4376360808709177, "grad_norm": 0.5468123555183411, "learning_rate": 2.8930382664822504e-06, "logits/chosen": 2.6708908081054688, "logits/rejected": 3.487905502319336, "logps/chosen": -646.0723876953125, "logps/rejected": -917.741943359375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -8.835456848144531, "rewards/margins": 22.48293113708496, "rewards/rejected": -31.31838607788086, "step": 2311 }, { "epoch": 1.438258164852255, "grad_norm": 0.00034044316271319985, "learning_rate": 2.8918856615952056e-06, "logits/chosen": 0.7117612361907959, "logits/rejected": 4.120718479156494, "logps/chosen": -488.3861389160156, "logps/rejected": -1005.724853515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.154720306396484, "rewards/margins": 28.170984268188477, "rewards/rejected": -40.325706481933594, "step": 2312 }, { "epoch": 1.4388802488335926, "grad_norm": 0.001113589503802359, "learning_rate": 2.890733056708161e-06, "logits/chosen": -0.4939787685871124, "logits/rejected": 3.5605859756469727, "logps/chosen": -464.910888671875, "logps/rejected": -1003.0372924804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.48483657836914, "rewards/margins": 29.896778106689453, "rewards/rejected": -38.381614685058594, "step": 2313 }, { "epoch": 1.43950233281493, "grad_norm": 7.407980918884277, "learning_rate": 2.8895804518211156e-06, "logits/chosen": 2.0751850605010986, "logits/rejected": 3.233450412750244, "logps/chosen": -584.488037109375, "logps/rejected": -939.03076171875, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": -11.532438278198242, "rewards/margins": 26.578365325927734, "rewards/rejected": -38.11080551147461, "step": 2314 }, { "epoch": 1.4401244167962675, "grad_norm": 0.00016063018119893968, "learning_rate": 2.888427846934071e-06, "logits/chosen": -3.001732110977173, "logits/rejected": 0.1540934443473816, "logps/chosen": -333.6917724609375, "logps/rejected": -832.804443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.484884738922119, "rewards/margins": 26.101276397705078, "rewards/rejected": -33.58616256713867, "step": 2315 }, { "epoch": 1.440746500777605, "grad_norm": 0.0016659300308674574, "learning_rate": 2.887275242047026e-06, "logits/chosen": 0.07998377084732056, "logits/rejected": 4.144624710083008, "logps/chosen": -367.6268310546875, "logps/rejected": -895.8948974609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.288524150848389, "rewards/margins": 32.77602767944336, "rewards/rejected": -40.064552307128906, "step": 2316 }, { "epoch": 1.4413685847589424, "grad_norm": 0.0014177010161802173, "learning_rate": 2.8861226371599817e-06, "logits/chosen": -2.2402052879333496, "logits/rejected": 4.038663864135742, "logps/chosen": -441.38323974609375, "logps/rejected": -1127.3525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.360583782196045, "rewards/margins": 32.027435302734375, "rewards/rejected": -39.388023376464844, "step": 2317 }, { "epoch": 1.44199066874028, "grad_norm": 0.0028884029015898705, "learning_rate": 2.884970032272937e-06, "logits/chosen": 1.2739320993423462, "logits/rejected": 4.6315460205078125, "logps/chosen": -595.5584716796875, "logps/rejected": -1157.019775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.700339317321777, "rewards/margins": 33.080440521240234, "rewards/rejected": -41.780784606933594, "step": 2318 }, { "epoch": 1.4426127527216175, "grad_norm": 0.28165727853775024, "learning_rate": 2.883817427385892e-06, "logits/chosen": 1.276587724685669, "logits/rejected": 3.1559951305389404, "logps/chosen": -727.439453125, "logps/rejected": -1092.28369140625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -10.861395835876465, "rewards/margins": 26.8516845703125, "rewards/rejected": -37.71308135986328, "step": 2319 }, { "epoch": 1.4432348367029548, "grad_norm": 0.00015634715964552015, "learning_rate": 2.8826648224988474e-06, "logits/chosen": -0.27201998233795166, "logits/rejected": 4.20585823059082, "logps/chosen": -564.0928955078125, "logps/rejected": -1196.4190673828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.63508415222168, "rewards/margins": 36.40064239501953, "rewards/rejected": -47.03572463989258, "step": 2320 }, { "epoch": 1.4438569206842924, "grad_norm": 59.58882522583008, "learning_rate": 2.8815122176118026e-06, "logits/chosen": 0.428855836391449, "logits/rejected": 2.460395336151123, "logps/chosen": -533.8165283203125, "logps/rejected": -808.375244140625, "loss": 1.2072, "rewards/accuracies": 0.875, "rewards/chosen": -12.576982498168945, "rewards/margins": 16.39482879638672, "rewards/rejected": -28.971811294555664, "step": 2321 }, { "epoch": 1.44447900466563, "grad_norm": 0.0001684689341345802, "learning_rate": 2.880359612724758e-06, "logits/chosen": 1.0388171672821045, "logits/rejected": 1.971908450126648, "logps/chosen": -688.4964599609375, "logps/rejected": -1014.7935180664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.04210090637207, "rewards/margins": 26.734935760498047, "rewards/rejected": -38.777034759521484, "step": 2322 }, { "epoch": 1.4451010886469673, "grad_norm": 2.0579311239998788e-05, "learning_rate": 2.879207007837713e-06, "logits/chosen": -0.8547032475471497, "logits/rejected": 3.58233380317688, "logps/chosen": -504.6190490722656, "logps/rejected": -1175.1273193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.368605613708496, "rewards/margins": 39.4583740234375, "rewards/rejected": -47.82698059082031, "step": 2323 }, { "epoch": 1.4457231726283049, "grad_norm": 1.1889311224422272e-07, "learning_rate": 2.8780544029506687e-06, "logits/chosen": 0.19701743125915527, "logits/rejected": 2.2015702724456787, "logps/chosen": -473.6476745605469, "logps/rejected": -878.5316772460938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.067363739013672, "rewards/margins": 26.477081298828125, "rewards/rejected": -35.54444122314453, "step": 2324 }, { "epoch": 1.4463452566096424, "grad_norm": 49.5518798828125, "learning_rate": 2.876901798063624e-06, "logits/chosen": 2.072795867919922, "logits/rejected": 5.290923118591309, "logps/chosen": -674.4661254882812, "logps/rejected": -1051.1849365234375, "loss": 2.2263, "rewards/accuracies": 0.875, "rewards/chosen": -13.104040145874023, "rewards/margins": 23.05379295349121, "rewards/rejected": -36.157833099365234, "step": 2325 }, { "epoch": 1.4469673405909798, "grad_norm": 1.7055537700653076, "learning_rate": 2.875749193176579e-06, "logits/chosen": 2.415715456008911, "logits/rejected": 3.685410499572754, "logps/chosen": -636.4686889648438, "logps/rejected": -1046.666015625, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -12.71038818359375, "rewards/margins": 28.964778900146484, "rewards/rejected": -41.67516326904297, "step": 2326 }, { "epoch": 1.4475894245723173, "grad_norm": 2.623699799642054e-07, "learning_rate": 2.8745965882895344e-06, "logits/chosen": -0.019688010215759277, "logits/rejected": 1.9653702974319458, "logps/chosen": -652.8423461914062, "logps/rejected": -1085.557373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.358902931213379, "rewards/margins": 37.280296325683594, "rewards/rejected": -48.63920211791992, "step": 2327 }, { "epoch": 1.4482115085536549, "grad_norm": 0.5979498624801636, "learning_rate": 2.8734439834024896e-06, "logits/chosen": 1.2116461992263794, "logits/rejected": 4.837029457092285, "logps/chosen": -471.36785888671875, "logps/rejected": -965.8245849609375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -6.9149322509765625, "rewards/margins": 30.018962860107422, "rewards/rejected": -36.93389892578125, "step": 2328 }, { "epoch": 1.4488335925349922, "grad_norm": 7.315420447184806e-08, "learning_rate": 2.872291378515445e-06, "logits/chosen": 1.088128924369812, "logits/rejected": 3.6922049522399902, "logps/chosen": -553.4324951171875, "logps/rejected": -1014.8348999023438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.668688774108887, "rewards/margins": 28.717819213867188, "rewards/rejected": -39.38650894165039, "step": 2329 }, { "epoch": 1.4494556765163298, "grad_norm": 7.4477925300598145, "learning_rate": 2.8711387736284e-06, "logits/chosen": 0.9433234333992004, "logits/rejected": 2.2451412677764893, "logps/chosen": -598.5908203125, "logps/rejected": -948.7597045898438, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": -12.012178421020508, "rewards/margins": 24.346149444580078, "rewards/rejected": -36.35832595825195, "step": 2330 }, { "epoch": 1.4500777604976671, "grad_norm": 0.0051433308981359005, "learning_rate": 2.8699861687413557e-06, "logits/chosen": -1.3141753673553467, "logits/rejected": 3.2086472511291504, "logps/chosen": -493.376953125, "logps/rejected": -1056.125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.405976295471191, "rewards/margins": 25.69782066345215, "rewards/rejected": -35.10379409790039, "step": 2331 }, { "epoch": 1.4506998444790047, "grad_norm": 0.0010937333572655916, "learning_rate": 2.868833563854311e-06, "logits/chosen": 1.9706964492797852, "logits/rejected": 4.621064186096191, "logps/chosen": -528.7137451171875, "logps/rejected": -984.925048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.555310249328613, "rewards/margins": 31.49054718017578, "rewards/rejected": -41.045860290527344, "step": 2332 }, { "epoch": 1.451321928460342, "grad_norm": 1.4169393580232281e-05, "learning_rate": 2.867680958967266e-06, "logits/chosen": -0.5840214490890503, "logits/rejected": 2.413980007171631, "logps/chosen": -461.7684020996094, "logps/rejected": -960.8614501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.614622116088867, "rewards/margins": 30.289093017578125, "rewards/rejected": -36.903717041015625, "step": 2333 }, { "epoch": 1.4519440124416796, "grad_norm": 29.227153778076172, "learning_rate": 2.8665283540802214e-06, "logits/chosen": -1.9500712156295776, "logits/rejected": 2.810960292816162, "logps/chosen": -483.628662109375, "logps/rejected": -1260.641357421875, "loss": 0.7312, "rewards/accuracies": 0.875, "rewards/chosen": -12.263227462768555, "rewards/margins": 42.335426330566406, "rewards/rejected": -54.598655700683594, "step": 2334 }, { "epoch": 1.4525660964230172, "grad_norm": 0.21363773941993713, "learning_rate": 2.8653757491931766e-06, "logits/chosen": -1.4968746900558472, "logits/rejected": 4.372405052185059, "logps/chosen": -456.53900146484375, "logps/rejected": -1048.5850830078125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -6.634637355804443, "rewards/margins": 28.53285789489746, "rewards/rejected": -35.16749572753906, "step": 2335 }, { "epoch": 1.4531881804043545, "grad_norm": 0.19996534287929535, "learning_rate": 2.864223144306132e-06, "logits/chosen": -0.618783712387085, "logits/rejected": 2.2022411823272705, "logps/chosen": -461.9523010253906, "logps/rejected": -760.0111083984375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.399519920349121, "rewards/margins": 19.92711639404297, "rewards/rejected": -27.326637268066406, "step": 2336 }, { "epoch": 1.453810264385692, "grad_norm": 0.21690642833709717, "learning_rate": 2.863070539419087e-06, "logits/chosen": 0.9195233583450317, "logits/rejected": 3.0173544883728027, "logps/chosen": -619.1990966796875, "logps/rejected": -891.9837036132812, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -11.781450271606445, "rewards/margins": 20.42620849609375, "rewards/rejected": -32.20765686035156, "step": 2337 }, { "epoch": 1.4544323483670296, "grad_norm": 0.003023615339770913, "learning_rate": 2.8619179345320423e-06, "logits/chosen": -0.1441906839609146, "logits/rejected": 4.001985549926758, "logps/chosen": -365.0175476074219, "logps/rejected": -927.1415405273438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.58237075805664, "rewards/margins": 24.85255241394043, "rewards/rejected": -33.43492126464844, "step": 2338 }, { "epoch": 1.455054432348367, "grad_norm": 0.0015379984397441149, "learning_rate": 2.860765329644998e-06, "logits/chosen": 3.7588369846343994, "logits/rejected": 4.855842113494873, "logps/chosen": -703.90234375, "logps/rejected": -1005.2516479492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.806050300598145, "rewards/margins": 25.48776626586914, "rewards/rejected": -35.29381561279297, "step": 2339 }, { "epoch": 1.4556765163297045, "grad_norm": 0.0017797609325498343, "learning_rate": 2.859612724757953e-06, "logits/chosen": 0.6893689036369324, "logits/rejected": 2.4680938720703125, "logps/chosen": -492.37890625, "logps/rejected": -1000.2102661132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.166633605957031, "rewards/margins": 31.155921936035156, "rewards/rejected": -40.32255172729492, "step": 2340 }, { "epoch": 1.456298600311042, "grad_norm": 9.442226655664854e-06, "learning_rate": 2.8584601198709084e-06, "logits/chosen": -2.1514220237731934, "logits/rejected": 3.0572056770324707, "logps/chosen": -334.06134033203125, "logps/rejected": -919.5453491210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.310533046722412, "rewards/margins": 28.655559539794922, "rewards/rejected": -34.96609115600586, "step": 2341 }, { "epoch": 1.4569206842923794, "grad_norm": 4.326364040374756, "learning_rate": 2.8573075149838636e-06, "logits/chosen": 3.9358277320861816, "logits/rejected": 4.717712879180908, "logps/chosen": -596.55810546875, "logps/rejected": -774.9546508789062, "loss": 0.0755, "rewards/accuracies": 1.0, "rewards/chosen": -7.885163307189941, "rewards/margins": 19.06470489501953, "rewards/rejected": -26.949867248535156, "step": 2342 }, { "epoch": 1.457542768273717, "grad_norm": 0.00039138575084507465, "learning_rate": 2.856154910096819e-06, "logits/chosen": 1.4116837978363037, "logits/rejected": 3.5277867317199707, "logps/chosen": -538.282958984375, "logps/rejected": -876.333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.566531658172607, "rewards/margins": 24.43173599243164, "rewards/rejected": -31.99827003479004, "step": 2343 }, { "epoch": 1.4581648522550545, "grad_norm": 2.0726189613342285, "learning_rate": 2.855002305209774e-06, "logits/chosen": 2.4261767864227295, "logits/rejected": 4.365809440612793, "logps/chosen": -493.1600646972656, "logps/rejected": -836.7879028320312, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -7.41954231262207, "rewards/margins": 22.17431640625, "rewards/rejected": -29.593856811523438, "step": 2344 }, { "epoch": 1.4587869362363919, "grad_norm": 0.3835785388946533, "learning_rate": 2.8538497003227293e-06, "logits/chosen": -0.7283627390861511, "logits/rejected": 2.8423304557800293, "logps/chosen": -544.6996459960938, "logps/rejected": -972.6627197265625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -6.42279577255249, "rewards/margins": 25.784692764282227, "rewards/rejected": -32.207489013671875, "step": 2345 }, { "epoch": 1.4594090202177294, "grad_norm": 0.3456924855709076, "learning_rate": 2.852697095435685e-06, "logits/chosen": 1.5055906772613525, "logits/rejected": 3.6082897186279297, "logps/chosen": -594.1642456054688, "logps/rejected": -965.9072875976562, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -8.597909927368164, "rewards/margins": 24.235122680664062, "rewards/rejected": -32.83303451538086, "step": 2346 }, { "epoch": 1.460031104199067, "grad_norm": 0.37873873114585876, "learning_rate": 2.85154449054864e-06, "logits/chosen": 0.9466390609741211, "logits/rejected": 3.1475629806518555, "logps/chosen": -460.72149658203125, "logps/rejected": -825.5015869140625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -8.996981620788574, "rewards/margins": 24.10140609741211, "rewards/rejected": -33.098388671875, "step": 2347 }, { "epoch": 1.4606531881804043, "grad_norm": 9.690855979919434, "learning_rate": 2.8503918856615954e-06, "logits/chosen": -2.765359401702881, "logits/rejected": 1.0825361013412476, "logps/chosen": -454.8573303222656, "logps/rejected": -916.665283203125, "loss": 0.0908, "rewards/accuracies": 1.0, "rewards/chosen": -7.064667224884033, "rewards/margins": 24.96207618713379, "rewards/rejected": -32.0267448425293, "step": 2348 }, { "epoch": 1.461275272161742, "grad_norm": 0.14686737954616547, "learning_rate": 2.8492392807745506e-06, "logits/chosen": -0.8677732944488525, "logits/rejected": 2.7581467628479004, "logps/chosen": -310.7578125, "logps/rejected": -701.1024780273438, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -4.958008289337158, "rewards/margins": 18.651893615722656, "rewards/rejected": -23.609901428222656, "step": 2349 }, { "epoch": 1.4618973561430793, "grad_norm": 0.001136072096414864, "learning_rate": 2.848086675887506e-06, "logits/chosen": 1.9676111936569214, "logits/rejected": 4.360522270202637, "logps/chosen": -757.5195922851562, "logps/rejected": -1163.509521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.460611343383789, "rewards/margins": 27.386371612548828, "rewards/rejected": -41.84698486328125, "step": 2350 }, { "epoch": 1.4625194401244168, "grad_norm": 0.0009974318090826273, "learning_rate": 2.846934071000461e-06, "logits/chosen": 2.166215419769287, "logits/rejected": 4.188321113586426, "logps/chosen": -620.139892578125, "logps/rejected": -986.1962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.4574174880981445, "rewards/margins": 23.120960235595703, "rewards/rejected": -30.578380584716797, "step": 2351 }, { "epoch": 1.4631415241057542, "grad_norm": 1.2499357461929321, "learning_rate": 2.8457814661134163e-06, "logits/chosen": 0.13246458768844604, "logits/rejected": 2.2372920513153076, "logps/chosen": -439.4615173339844, "logps/rejected": -887.6943359375, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -8.784399032592773, "rewards/margins": 30.512107849121094, "rewards/rejected": -39.2965087890625, "step": 2352 }, { "epoch": 1.4637636080870917, "grad_norm": 35.474639892578125, "learning_rate": 2.844628861226372e-06, "logits/chosen": 1.5007219314575195, "logits/rejected": 3.3255417346954346, "logps/chosen": -552.41259765625, "logps/rejected": -902.187255859375, "loss": 0.5765, "rewards/accuracies": 0.875, "rewards/chosen": -6.2433037757873535, "rewards/margins": 23.953628540039062, "rewards/rejected": -30.19693374633789, "step": 2353 }, { "epoch": 1.4643856920684293, "grad_norm": 0.017408454790711403, "learning_rate": 2.843476256339327e-06, "logits/chosen": -4.301645278930664, "logits/rejected": 0.49649691581726074, "logps/chosen": -350.7183532714844, "logps/rejected": -843.7274169921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.945578575134277, "rewards/margins": 28.868030548095703, "rewards/rejected": -33.8136100769043, "step": 2354 }, { "epoch": 1.4650077760497666, "grad_norm": 3.2871270179748535, "learning_rate": 2.8423236514522824e-06, "logits/chosen": 0.1433262825012207, "logits/rejected": 1.8936108350753784, "logps/chosen": -549.3184814453125, "logps/rejected": -852.3326416015625, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -11.849102973937988, "rewards/margins": 22.03537940979004, "rewards/rejected": -33.88447952270508, "step": 2355 }, { "epoch": 1.4656298600311042, "grad_norm": 3.6574772821040824e-05, "learning_rate": 2.8411710465652376e-06, "logits/chosen": -2.466205596923828, "logits/rejected": 2.333536148071289, "logps/chosen": -354.087890625, "logps/rejected": -1063.402099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.740349292755127, "rewards/margins": 38.1416015625, "rewards/rejected": -44.881954193115234, "step": 2356 }, { "epoch": 1.4662519440124417, "grad_norm": 0.0006809753249399364, "learning_rate": 2.840018441678193e-06, "logits/chosen": 1.7734744548797607, "logits/rejected": 4.814844131469727, "logps/chosen": -519.6259155273438, "logps/rejected": -1013.4092407226562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.913437843322754, "rewards/margins": 28.07283592224121, "rewards/rejected": -35.98627471923828, "step": 2357 }, { "epoch": 1.466874027993779, "grad_norm": 0.11370241641998291, "learning_rate": 2.838865836791148e-06, "logits/chosen": 1.595003366470337, "logits/rejected": 3.9001097679138184, "logps/chosen": -567.801513671875, "logps/rejected": -948.3878784179688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.882749557495117, "rewards/margins": 24.700820922851562, "rewards/rejected": -31.583572387695312, "step": 2358 }, { "epoch": 1.4674961119751166, "grad_norm": 18.743146896362305, "learning_rate": 2.8377132319041033e-06, "logits/chosen": 0.6721723079681396, "logits/rejected": 3.64046049118042, "logps/chosen": -472.2651672363281, "logps/rejected": -921.5863037109375, "loss": 0.1213, "rewards/accuracies": 0.875, "rewards/chosen": -8.745011329650879, "rewards/margins": 23.231292724609375, "rewards/rejected": -31.976303100585938, "step": 2359 }, { "epoch": 1.4681181959564542, "grad_norm": 0.0008493398199789226, "learning_rate": 2.836560627017059e-06, "logits/chosen": 0.07488232851028442, "logits/rejected": 2.289384603500366, "logps/chosen": -517.2086181640625, "logps/rejected": -871.466064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.47705602645874, "rewards/margins": 22.173673629760742, "rewards/rejected": -29.65073013305664, "step": 2360 }, { "epoch": 1.4687402799377915, "grad_norm": 0.00484496122226119, "learning_rate": 2.835408022130014e-06, "logits/chosen": -0.6169958114624023, "logits/rejected": 3.451166868209839, "logps/chosen": -461.492431640625, "logps/rejected": -915.6463623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.851531028747559, "rewards/margins": 24.448511123657227, "rewards/rejected": -33.30004119873047, "step": 2361 }, { "epoch": 1.469362363919129, "grad_norm": 0.004142931196838617, "learning_rate": 2.8342554172429694e-06, "logits/chosen": 2.3081841468811035, "logits/rejected": 2.309966564178467, "logps/chosen": -713.6710205078125, "logps/rejected": -979.0992431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.44482421875, "rewards/margins": 25.762096405029297, "rewards/rejected": -35.2069206237793, "step": 2362 }, { "epoch": 1.4699844479004667, "grad_norm": 22.25066566467285, "learning_rate": 2.8331028123559246e-06, "logits/chosen": -2.046438694000244, "logits/rejected": 2.8966598510742188, "logps/chosen": -330.8453369140625, "logps/rejected": -708.985595703125, "loss": 0.1171, "rewards/accuracies": 0.875, "rewards/chosen": -6.468657493591309, "rewards/margins": 15.020978927612305, "rewards/rejected": -21.489635467529297, "step": 2363 }, { "epoch": 1.470606531881804, "grad_norm": 0.6886146664619446, "learning_rate": 2.83195020746888e-06, "logits/chosen": 0.38471102714538574, "logits/rejected": 3.2078094482421875, "logps/chosen": -600.56396484375, "logps/rejected": -885.523193359375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -9.067018508911133, "rewards/margins": 15.421733856201172, "rewards/rejected": -24.488752365112305, "step": 2364 }, { "epoch": 1.4712286158631416, "grad_norm": 0.13812090456485748, "learning_rate": 2.830797602581835e-06, "logits/chosen": 0.7420355081558228, "logits/rejected": 1.6655001640319824, "logps/chosen": -575.5955810546875, "logps/rejected": -820.6859130859375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -10.355046272277832, "rewards/margins": 20.670995712280273, "rewards/rejected": -31.02604103088379, "step": 2365 }, { "epoch": 1.4718506998444791, "grad_norm": 4.970563531969674e-05, "learning_rate": 2.8296449976947903e-06, "logits/chosen": -0.8490327596664429, "logits/rejected": 3.9662766456604004, "logps/chosen": -535.6664428710938, "logps/rejected": -1070.893310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.740767478942871, "rewards/margins": 26.906150817871094, "rewards/rejected": -36.64691925048828, "step": 2366 }, { "epoch": 1.4724727838258165, "grad_norm": 0.0006937950383871794, "learning_rate": 2.8284923928077455e-06, "logits/chosen": -0.45125436782836914, "logits/rejected": 2.927877902984619, "logps/chosen": -407.20904541015625, "logps/rejected": -850.866943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.919216156005859, "rewards/margins": 28.38515281677246, "rewards/rejected": -34.30436706542969, "step": 2367 }, { "epoch": 1.473094867807154, "grad_norm": 4.14547061920166, "learning_rate": 2.827339787920701e-06, "logits/chosen": 1.7581843137741089, "logits/rejected": 3.994076728820801, "logps/chosen": -676.138671875, "logps/rejected": -961.73974609375, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": -11.300919532775879, "rewards/margins": 17.93962860107422, "rewards/rejected": -29.24054527282715, "step": 2368 }, { "epoch": 1.4737169517884914, "grad_norm": 7.2804209594323765e-06, "learning_rate": 2.8261871830336564e-06, "logits/chosen": 0.9338542222976685, "logits/rejected": 2.5756583213806152, "logps/chosen": -636.443603515625, "logps/rejected": -1058.41796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.650700569152832, "rewards/margins": 31.83884048461914, "rewards/rejected": -40.489540100097656, "step": 2369 }, { "epoch": 1.474339035769829, "grad_norm": 0.13495945930480957, "learning_rate": 2.8250345781466116e-06, "logits/chosen": -0.1379644274711609, "logits/rejected": 4.099135398864746, "logps/chosen": -435.06072998046875, "logps/rejected": -1011.3875732421875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.849440574645996, "rewards/margins": 29.290332794189453, "rewards/rejected": -34.139774322509766, "step": 2370 }, { "epoch": 1.4749611197511663, "grad_norm": 34.87321853637695, "learning_rate": 2.823881973259567e-06, "logits/chosen": 0.5016416907310486, "logits/rejected": 3.4392948150634766, "logps/chosen": -574.3455810546875, "logps/rejected": -914.9444580078125, "loss": 0.8498, "rewards/accuracies": 0.875, "rewards/chosen": -9.708126068115234, "rewards/margins": 23.428733825683594, "rewards/rejected": -33.136863708496094, "step": 2371 }, { "epoch": 1.4755832037325038, "grad_norm": 4.791884862243023e-07, "learning_rate": 2.822729368372522e-06, "logits/chosen": 2.531529188156128, "logits/rejected": 3.9152517318725586, "logps/chosen": -632.7254638671875, "logps/rejected": -985.106201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.28458309173584, "rewards/margins": 27.402944564819336, "rewards/rejected": -34.687530517578125, "step": 2372 }, { "epoch": 1.4762052877138414, "grad_norm": 0.00026815864839591086, "learning_rate": 2.8215767634854773e-06, "logits/chosen": 0.4294940233230591, "logits/rejected": 2.460738182067871, "logps/chosen": -562.446044921875, "logps/rejected": -992.881591796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.796026229858398, "rewards/margins": 30.108896255493164, "rewards/rejected": -37.90492248535156, "step": 2373 }, { "epoch": 1.4768273716951787, "grad_norm": 16.857969284057617, "learning_rate": 2.8204241585984325e-06, "logits/chosen": 0.3010619282722473, "logits/rejected": 2.009814739227295, "logps/chosen": -580.1434326171875, "logps/rejected": -995.7860107421875, "loss": 0.1045, "rewards/accuracies": 0.875, "rewards/chosen": -11.051149368286133, "rewards/margins": 28.202312469482422, "rewards/rejected": -39.25346374511719, "step": 2374 }, { "epoch": 1.4774494556765163, "grad_norm": 0.00010196808580076322, "learning_rate": 2.819271553711388e-06, "logits/chosen": 0.31497353315353394, "logits/rejected": 2.954072952270508, "logps/chosen": -638.5074462890625, "logps/rejected": -1035.220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.442541122436523, "rewards/margins": 24.339162826538086, "rewards/rejected": -36.781707763671875, "step": 2375 }, { "epoch": 1.4780715396578539, "grad_norm": 16.672515869140625, "learning_rate": 2.8181189488243434e-06, "logits/chosen": 0.9463157057762146, "logits/rejected": 2.8403208255767822, "logps/chosen": -544.5169677734375, "logps/rejected": -791.8751831054688, "loss": 0.0848, "rewards/accuracies": 1.0, "rewards/chosen": -7.374601364135742, "rewards/margins": 18.103866577148438, "rewards/rejected": -25.47846794128418, "step": 2376 }, { "epoch": 1.4786936236391912, "grad_norm": 0.0009514009580016136, "learning_rate": 2.8169663439372986e-06, "logits/chosen": 0.3465360999107361, "logits/rejected": 2.0250134468078613, "logps/chosen": -541.4112548828125, "logps/rejected": -1036.6761474609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.47393798828125, "rewards/margins": 31.21034049987793, "rewards/rejected": -43.68428039550781, "step": 2377 }, { "epoch": 1.4793157076205288, "grad_norm": 0.003587897401303053, "learning_rate": 2.815813739050254e-06, "logits/chosen": 0.45701614022254944, "logits/rejected": 4.645224571228027, "logps/chosen": -472.3916931152344, "logps/rejected": -922.6441650390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.343234539031982, "rewards/margins": 19.303733825683594, "rewards/rejected": -26.646968841552734, "step": 2378 }, { "epoch": 1.4799377916018663, "grad_norm": 1.2445507049560547, "learning_rate": 2.814661134163209e-06, "logits/chosen": 0.3036624789237976, "logits/rejected": 4.275981426239014, "logps/chosen": -469.282958984375, "logps/rejected": -1064.331298828125, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -8.483030319213867, "rewards/margins": 35.476776123046875, "rewards/rejected": -43.95980453491211, "step": 2379 }, { "epoch": 1.4805598755832037, "grad_norm": 6.064235549274599e-07, "learning_rate": 2.8135085292761642e-06, "logits/chosen": -0.9457730054855347, "logits/rejected": 3.1594090461730957, "logps/chosen": -401.4171142578125, "logps/rejected": -926.5023803710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.529729127883911, "rewards/margins": 27.780242919921875, "rewards/rejected": -30.309972763061523, "step": 2380 }, { "epoch": 1.4811819595645412, "grad_norm": 1.4774746894836426, "learning_rate": 2.8123559243891195e-06, "logits/chosen": -1.9062919616699219, "logits/rejected": 4.1249895095825195, "logps/chosen": -458.60137939453125, "logps/rejected": -1110.5926513671875, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -9.662704467773438, "rewards/margins": 28.99850845336914, "rewards/rejected": -38.66121292114258, "step": 2381 }, { "epoch": 1.4818040435458788, "grad_norm": 0.015088031068444252, "learning_rate": 2.811203319502075e-06, "logits/chosen": -0.5311957597732544, "logits/rejected": 0.9081388115882874, "logps/chosen": -559.6624755859375, "logps/rejected": -943.973876953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.011931419372559, "rewards/margins": 30.31785774230957, "rewards/rejected": -39.32978820800781, "step": 2382 }, { "epoch": 1.4824261275272161, "grad_norm": 0.1541348546743393, "learning_rate": 2.8100507146150303e-06, "logits/chosen": 0.4068309962749481, "logits/rejected": 2.128772735595703, "logps/chosen": -577.6309204101562, "logps/rejected": -1012.070556640625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.201326370239258, "rewards/margins": 29.582557678222656, "rewards/rejected": -37.78388214111328, "step": 2383 }, { "epoch": 1.4830482115085537, "grad_norm": 9.980174464629954e-10, "learning_rate": 2.8088981097279856e-06, "logits/chosen": -1.31401789188385, "logits/rejected": 2.629791021347046, "logps/chosen": -518.3517456054688, "logps/rejected": -1158.66015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.827581405639648, "rewards/margins": 32.846656799316406, "rewards/rejected": -46.67423629760742, "step": 2384 }, { "epoch": 1.4836702954898913, "grad_norm": 54.381248474121094, "learning_rate": 2.807745504840941e-06, "logits/chosen": -0.3003752827644348, "logits/rejected": 3.1268844604492188, "logps/chosen": -413.068603515625, "logps/rejected": -895.3505859375, "loss": 1.0718, "rewards/accuracies": 0.875, "rewards/chosen": -8.030431747436523, "rewards/margins": 25.447994232177734, "rewards/rejected": -33.478424072265625, "step": 2385 }, { "epoch": 1.4842923794712286, "grad_norm": 19.62732696533203, "learning_rate": 2.806592899953896e-06, "logits/chosen": 0.8926951885223389, "logits/rejected": 2.025569438934326, "logps/chosen": -564.5762329101562, "logps/rejected": -894.4605712890625, "loss": 0.0952, "rewards/accuracies": 0.875, "rewards/chosen": -10.947629928588867, "rewards/margins": 26.661720275878906, "rewards/rejected": -37.609352111816406, "step": 2386 }, { "epoch": 1.4849144634525662, "grad_norm": 0.08074700832366943, "learning_rate": 2.8054402950668512e-06, "logits/chosen": -2.5688016414642334, "logits/rejected": 2.8136441707611084, "logps/chosen": -407.8417053222656, "logps/rejected": -908.9759521484375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.713031768798828, "rewards/margins": 24.314756393432617, "rewards/rejected": -30.027786254882812, "step": 2387 }, { "epoch": 1.4855365474339035, "grad_norm": 6.001190211435414e-09, "learning_rate": 2.8042876901798065e-06, "logits/chosen": 0.46303045749664307, "logits/rejected": 4.923661231994629, "logps/chosen": -580.6307373046875, "logps/rejected": -1194.9835205078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.170421600341797, "rewards/margins": 36.211002349853516, "rewards/rejected": -45.38142395019531, "step": 2388 }, { "epoch": 1.486158631415241, "grad_norm": 3.6265132427215576, "learning_rate": 2.8031350852927617e-06, "logits/chosen": 0.7024605870246887, "logits/rejected": 3.2725701332092285, "logps/chosen": -553.1900024414062, "logps/rejected": -950.600830078125, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -6.3401360511779785, "rewards/margins": 20.61620330810547, "rewards/rejected": -26.956340789794922, "step": 2389 }, { "epoch": 1.4867807153965784, "grad_norm": 29.599863052368164, "learning_rate": 2.8019824804057173e-06, "logits/chosen": 0.4895651340484619, "logits/rejected": 2.5251917839050293, "logps/chosen": -574.4183959960938, "logps/rejected": -955.8038330078125, "loss": 0.1737, "rewards/accuracies": 0.875, "rewards/chosen": -13.665485382080078, "rewards/margins": 21.212478637695312, "rewards/rejected": -34.877960205078125, "step": 2390 }, { "epoch": 1.487402799377916, "grad_norm": 0.0006918342551216483, "learning_rate": 2.8008298755186726e-06, "logits/chosen": -0.6636053323745728, "logits/rejected": 2.680243968963623, "logps/chosen": -415.1027526855469, "logps/rejected": -902.0926513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.417555809020996, "rewards/margins": 26.878646850585938, "rewards/rejected": -37.296199798583984, "step": 2391 }, { "epoch": 1.4880248833592535, "grad_norm": 0.006915316917002201, "learning_rate": 2.799677270631628e-06, "logits/chosen": -3.2200422286987305, "logits/rejected": 0.09259417653083801, "logps/chosen": -331.16986083984375, "logps/rejected": -823.7048950195312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.002388954162598, "rewards/margins": 33.96484375, "rewards/rejected": -40.96723556518555, "step": 2392 }, { "epoch": 1.4886469673405909, "grad_norm": 22.634033203125, "learning_rate": 2.798524665744583e-06, "logits/chosen": -2.392181873321533, "logits/rejected": 3.8128175735473633, "logps/chosen": -507.95855712890625, "logps/rejected": -1193.1182861328125, "loss": 0.1397, "rewards/accuracies": 0.875, "rewards/chosen": -6.660463809967041, "rewards/margins": 31.40123748779297, "rewards/rejected": -38.061702728271484, "step": 2393 }, { "epoch": 1.4892690513219284, "grad_norm": 8.997950553894043, "learning_rate": 2.7973720608575382e-06, "logits/chosen": -0.7331479787826538, "logits/rejected": 4.469404697418213, "logps/chosen": -514.8806762695312, "logps/rejected": -1097.8677978515625, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": -7.211360931396484, "rewards/margins": 28.35735321044922, "rewards/rejected": -35.5687141418457, "step": 2394 }, { "epoch": 1.489891135303266, "grad_norm": 13.998196601867676, "learning_rate": 2.7962194559704935e-06, "logits/chosen": 1.6632962226867676, "logits/rejected": 5.319121360778809, "logps/chosen": -552.4025268554688, "logps/rejected": -1164.188720703125, "loss": 0.0704, "rewards/accuracies": 1.0, "rewards/chosen": -10.791094779968262, "rewards/margins": 33.89095687866211, "rewards/rejected": -44.68205261230469, "step": 2395 }, { "epoch": 1.4905132192846033, "grad_norm": 39.58431625366211, "learning_rate": 2.7950668510834487e-06, "logits/chosen": -0.7600829005241394, "logits/rejected": 4.01608419418335, "logps/chosen": -558.3819580078125, "logps/rejected": -1057.7418212890625, "loss": 0.2663, "rewards/accuracies": 0.875, "rewards/chosen": -6.818014621734619, "rewards/margins": 28.740753173828125, "rewards/rejected": -35.55876922607422, "step": 2396 }, { "epoch": 1.491135303265941, "grad_norm": 0.0052307723090052605, "learning_rate": 2.7939142461964043e-06, "logits/chosen": -0.843339204788208, "logits/rejected": 2.3956422805786133, "logps/chosen": -650.19873046875, "logps/rejected": -1152.5457763671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.485787391662598, "rewards/margins": 31.080204010009766, "rewards/rejected": -45.56599426269531, "step": 2397 }, { "epoch": 1.4917573872472785, "grad_norm": 6.963534815440653e-08, "learning_rate": 2.7927616413093596e-06, "logits/chosen": -2.2730250358581543, "logits/rejected": 2.440248489379883, "logps/chosen": -517.8671875, "logps/rejected": -1140.94873046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.223237037658691, "rewards/margins": 35.51857376098633, "rewards/rejected": -46.74181365966797, "step": 2398 }, { "epoch": 1.4923794712286158, "grad_norm": 11.254664421081543, "learning_rate": 2.791609036422315e-06, "logits/chosen": 0.6596075892448425, "logits/rejected": 2.2641704082489014, "logps/chosen": -540.0404052734375, "logps/rejected": -805.3961181640625, "loss": 0.1092, "rewards/accuracies": 0.875, "rewards/chosen": -9.902894973754883, "rewards/margins": 15.764644622802734, "rewards/rejected": -25.667539596557617, "step": 2399 }, { "epoch": 1.4930015552099534, "grad_norm": 14.557866096496582, "learning_rate": 2.79045643153527e-06, "logits/chosen": 0.24826712906360626, "logits/rejected": 2.864927291870117, "logps/chosen": -531.9868774414062, "logps/rejected": -907.9967041015625, "loss": 0.123, "rewards/accuracies": 0.875, "rewards/chosen": -7.463442802429199, "rewards/margins": 25.134521484375, "rewards/rejected": -32.59796142578125, "step": 2400 }, { "epoch": 1.493623639191291, "grad_norm": 17.49602699279785, "learning_rate": 2.7893038266482252e-06, "logits/chosen": 1.243196725845337, "logits/rejected": 4.108986854553223, "logps/chosen": -593.2510986328125, "logps/rejected": -947.8580322265625, "loss": 0.0633, "rewards/accuracies": 1.0, "rewards/chosen": -10.749677658081055, "rewards/margins": 21.538341522216797, "rewards/rejected": -32.288021087646484, "step": 2401 }, { "epoch": 1.4942457231726283, "grad_norm": 9.584007329976885e-07, "learning_rate": 2.7881512217611805e-06, "logits/chosen": 2.3788280487060547, "logits/rejected": 3.814729690551758, "logps/chosen": -558.6822509765625, "logps/rejected": -915.1446533203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.254256248474121, "rewards/margins": 29.16737937927246, "rewards/rejected": -38.42163848876953, "step": 2402 }, { "epoch": 1.4948678071539658, "grad_norm": 0.0007437972817569971, "learning_rate": 2.7869986168741357e-06, "logits/chosen": -2.4514694213867188, "logits/rejected": 2.2673568725585938, "logps/chosen": -317.77044677734375, "logps/rejected": -846.106689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.250749588012695, "rewards/margins": 26.648483276367188, "rewards/rejected": -31.899234771728516, "step": 2403 }, { "epoch": 1.4954898911353034, "grad_norm": 2.4952751118689775e-05, "learning_rate": 2.7858460119870913e-06, "logits/chosen": 0.03027331829071045, "logits/rejected": 3.5894298553466797, "logps/chosen": -426.458251953125, "logps/rejected": -831.8810424804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.599156379699707, "rewards/margins": 19.644079208374023, "rewards/rejected": -28.243236541748047, "step": 2404 }, { "epoch": 1.4961119751166407, "grad_norm": 0.021602489054203033, "learning_rate": 2.7846934071000466e-06, "logits/chosen": -0.24435366690158844, "logits/rejected": 2.2700464725494385, "logps/chosen": -568.7886962890625, "logps/rejected": -955.20751953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.295473098754883, "rewards/margins": 25.87757682800293, "rewards/rejected": -38.17304992675781, "step": 2405 }, { "epoch": 1.4967340590979783, "grad_norm": 49.02490997314453, "learning_rate": 2.7835408022130018e-06, "logits/chosen": -0.45938390493392944, "logits/rejected": 1.8616430759429932, "logps/chosen": -490.0686340332031, "logps/rejected": -981.237060546875, "loss": 0.4472, "rewards/accuracies": 0.875, "rewards/chosen": -8.393717765808105, "rewards/margins": 27.700580596923828, "rewards/rejected": -36.094303131103516, "step": 2406 }, { "epoch": 1.4973561430793156, "grad_norm": 1.6386419534683228, "learning_rate": 2.782388197325957e-06, "logits/chosen": -0.5539921522140503, "logits/rejected": 2.1543989181518555, "logps/chosen": -465.9166259765625, "logps/rejected": -878.362548828125, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -6.275735378265381, "rewards/margins": 26.394481658935547, "rewards/rejected": -32.67021560668945, "step": 2407 }, { "epoch": 1.4979782270606532, "grad_norm": 0.004477610811591148, "learning_rate": 2.7812355924389122e-06, "logits/chosen": -0.5151447057723999, "logits/rejected": 3.5004169940948486, "logps/chosen": -492.4542236328125, "logps/rejected": -857.807861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.714780807495117, "rewards/margins": 21.09079360961914, "rewards/rejected": -31.805574417114258, "step": 2408 }, { "epoch": 1.4986003110419905, "grad_norm": 0.049627888947725296, "learning_rate": 2.7800829875518675e-06, "logits/chosen": 2.9913344383239746, "logits/rejected": 3.396819591522217, "logps/chosen": -797.1928100585938, "logps/rejected": -1062.549072265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.408965110778809, "rewards/margins": 27.537857055664062, "rewards/rejected": -39.94682312011719, "step": 2409 }, { "epoch": 1.499222395023328, "grad_norm": 0.9994378089904785, "learning_rate": 2.7789303826648227e-06, "logits/chosen": -0.025385677814483643, "logits/rejected": 2.2981178760528564, "logps/chosen": -588.76025390625, "logps/rejected": -896.0594482421875, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -11.430543899536133, "rewards/margins": 17.48183250427246, "rewards/rejected": -28.912376403808594, "step": 2410 }, { "epoch": 1.4998444790046657, "grad_norm": 3.7340683937072754, "learning_rate": 2.7777777777777783e-06, "logits/chosen": -2.0321695804595947, "logits/rejected": 1.818973183631897, "logps/chosen": -518.645751953125, "logps/rejected": -1016.1298217773438, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -9.84179973602295, "rewards/margins": 25.71894645690918, "rewards/rejected": -35.56074523925781, "step": 2411 }, { "epoch": 1.500466562986003, "grad_norm": 0.8598123788833618, "learning_rate": 2.7766251728907336e-06, "logits/chosen": 2.643171787261963, "logits/rejected": 3.5609869956970215, "logps/chosen": -689.77197265625, "logps/rejected": -989.2908935546875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -13.747472763061523, "rewards/margins": 24.951494216918945, "rewards/rejected": -38.69896697998047, "step": 2412 }, { "epoch": 1.5010886469673406, "grad_norm": 0.0394817516207695, "learning_rate": 2.7754725680036888e-06, "logits/chosen": -0.7106583714485168, "logits/rejected": 1.8233951330184937, "logps/chosen": -603.9002685546875, "logps/rejected": -1166.8470458984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.380064964294434, "rewards/margins": 37.42149353027344, "rewards/rejected": -46.80155944824219, "step": 2413 }, { "epoch": 1.5017107309486781, "grad_norm": 0.15796667337417603, "learning_rate": 2.774319963116644e-06, "logits/chosen": 0.41977572441101074, "logits/rejected": 2.346480369567871, "logps/chosen": -530.1688232421875, "logps/rejected": -940.331298828125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -10.626280784606934, "rewards/margins": 26.990314483642578, "rewards/rejected": -37.61659622192383, "step": 2414 }, { "epoch": 1.5023328149300155, "grad_norm": 35.47513198852539, "learning_rate": 2.7731673582295992e-06, "logits/chosen": 0.3368704319000244, "logits/rejected": 2.681093454360962, "logps/chosen": -392.08807373046875, "logps/rejected": -673.9613647460938, "loss": 0.343, "rewards/accuracies": 0.875, "rewards/chosen": -8.13431453704834, "rewards/margins": 20.258684158325195, "rewards/rejected": -28.39299964904785, "step": 2415 }, { "epoch": 1.502954898911353, "grad_norm": 0.008840728551149368, "learning_rate": 2.7720147533425545e-06, "logits/chosen": 1.6901668310165405, "logits/rejected": 3.677003860473633, "logps/chosen": -688.974609375, "logps/rejected": -1004.6647338867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.885746002197266, "rewards/margins": 22.908794403076172, "rewards/rejected": -33.79454040527344, "step": 2416 }, { "epoch": 1.5035769828926906, "grad_norm": 0.00016085940296761692, "learning_rate": 2.7708621484555097e-06, "logits/chosen": -0.6000373363494873, "logits/rejected": 2.4729881286621094, "logps/chosen": -447.89678955078125, "logps/rejected": -982.514892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.45212173461914, "rewards/margins": 33.85790252685547, "rewards/rejected": -42.310020446777344, "step": 2417 }, { "epoch": 1.504199066874028, "grad_norm": 6.961882172618061e-05, "learning_rate": 2.769709543568465e-06, "logits/chosen": 1.9195138216018677, "logits/rejected": 3.281461715698242, "logps/chosen": -664.70751953125, "logps/rejected": -1080.923095703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.098590850830078, "rewards/margins": 28.69293212890625, "rewards/rejected": -38.79152297973633, "step": 2418 }, { "epoch": 1.5048211508553655, "grad_norm": 0.03905373439192772, "learning_rate": 2.7685569386814206e-06, "logits/chosen": 0.5288473963737488, "logits/rejected": 2.606009006500244, "logps/chosen": -588.6915283203125, "logps/rejected": -1013.2470092773438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.714137077331543, "rewards/margins": 31.635147094726562, "rewards/rejected": -41.34928512573242, "step": 2419 }, { "epoch": 1.505443234836703, "grad_norm": 51.066200256347656, "learning_rate": 2.7674043337943758e-06, "logits/chosen": 0.3447500765323639, "logits/rejected": 2.5495049953460693, "logps/chosen": -701.102294921875, "logps/rejected": -1111.52685546875, "loss": 0.7873, "rewards/accuracies": 0.875, "rewards/chosen": -15.652717590332031, "rewards/margins": 34.65187072753906, "rewards/rejected": -50.30458450317383, "step": 2420 }, { "epoch": 1.5060653188180404, "grad_norm": 4.544334411621094, "learning_rate": 2.766251728907331e-06, "logits/chosen": -0.7580356001853943, "logits/rejected": 2.633445978164673, "logps/chosen": -559.7318725585938, "logps/rejected": -1010.6101684570312, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -10.76829719543457, "rewards/margins": 29.79266357421875, "rewards/rejected": -40.56096267700195, "step": 2421 }, { "epoch": 1.506687402799378, "grad_norm": 0.5855236649513245, "learning_rate": 2.7650991240202862e-06, "logits/chosen": 1.4826244115829468, "logits/rejected": 5.066828727722168, "logps/chosen": -537.537353515625, "logps/rejected": -894.7590942382812, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -9.093587875366211, "rewards/margins": 23.76456642150879, "rewards/rejected": -32.858154296875, "step": 2422 }, { "epoch": 1.5073094867807155, "grad_norm": 7.518645617210495e-08, "learning_rate": 2.7639465191332415e-06, "logits/chosen": 0.8941356539726257, "logits/rejected": 3.034396171569824, "logps/chosen": -535.9212646484375, "logps/rejected": -1033.3935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.346717834472656, "rewards/margins": 33.61785125732422, "rewards/rejected": -41.964569091796875, "step": 2423 }, { "epoch": 1.5079315707620529, "grad_norm": 2.77945876121521, "learning_rate": 2.7627939142461967e-06, "logits/chosen": 0.5460835695266724, "logits/rejected": 4.352009296417236, "logps/chosen": -500.4707946777344, "logps/rejected": -1004.465576171875, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -9.690991401672363, "rewards/margins": 25.8135986328125, "rewards/rejected": -35.50458908081055, "step": 2424 }, { "epoch": 1.5085536547433902, "grad_norm": 9.093402475457424e-09, "learning_rate": 2.761641309359152e-06, "logits/chosen": -3.632844924926758, "logits/rejected": 3.0162880420684814, "logps/chosen": -412.83270263671875, "logps/rejected": -1213.8994140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.727034091949463, "rewards/margins": 44.87699890136719, "rewards/rejected": -51.604034423828125, "step": 2425 }, { "epoch": 1.509175738724728, "grad_norm": 0.19694973528385162, "learning_rate": 2.7604887044721076e-06, "logits/chosen": 0.878905177116394, "logits/rejected": 3.315678596496582, "logps/chosen": -607.8718872070312, "logps/rejected": -988.4095458984375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -10.330745697021484, "rewards/margins": 26.40803337097168, "rewards/rejected": -36.7387809753418, "step": 2426 }, { "epoch": 1.5097978227060653, "grad_norm": 1.872478060249705e-05, "learning_rate": 2.7593360995850628e-06, "logits/chosen": -0.5195698738098145, "logits/rejected": 1.6151765584945679, "logps/chosen": -601.8635864257812, "logps/rejected": -897.02099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.940296173095703, "rewards/margins": 25.409744262695312, "rewards/rejected": -35.350040435791016, "step": 2427 }, { "epoch": 1.5104199066874027, "grad_norm": 35.12553024291992, "learning_rate": 2.758183494698018e-06, "logits/chosen": -0.8151949644088745, "logits/rejected": 2.569201707839966, "logps/chosen": -508.86138916015625, "logps/rejected": -909.9303588867188, "loss": 0.4958, "rewards/accuracies": 0.875, "rewards/chosen": -10.214035034179688, "rewards/margins": 23.001964569091797, "rewards/rejected": -33.215999603271484, "step": 2428 }, { "epoch": 1.5110419906687402, "grad_norm": 3.7998273372650146, "learning_rate": 2.7570308898109732e-06, "logits/chosen": 1.9752259254455566, "logits/rejected": 4.596410274505615, "logps/chosen": -713.7828369140625, "logps/rejected": -1202.2008056640625, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -11.797355651855469, "rewards/margins": 27.169727325439453, "rewards/rejected": -38.96708297729492, "step": 2429 }, { "epoch": 1.5116640746500778, "grad_norm": 27.40531349182129, "learning_rate": 2.7558782849239285e-06, "logits/chosen": 2.0464298725128174, "logits/rejected": 4.918285369873047, "logps/chosen": -549.6773681640625, "logps/rejected": -924.9179077148438, "loss": 0.1815, "rewards/accuracies": 0.875, "rewards/chosen": -11.397310256958008, "rewards/margins": 23.025856018066406, "rewards/rejected": -34.42316436767578, "step": 2430 }, { "epoch": 1.5122861586314151, "grad_norm": 8.02556037902832, "learning_rate": 2.7547256800368837e-06, "logits/chosen": 0.1492936760187149, "logits/rejected": 1.9480422735214233, "logps/chosen": -600.2176513671875, "logps/rejected": -991.26806640625, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": -9.710877418518066, "rewards/margins": 25.57059097290039, "rewards/rejected": -35.28146743774414, "step": 2431 }, { "epoch": 1.5129082426127527, "grad_norm": 0.0011324110673740506, "learning_rate": 2.753573075149839e-06, "logits/chosen": -2.1036806106567383, "logits/rejected": 2.825481414794922, "logps/chosen": -573.4466552734375, "logps/rejected": -1138.679931640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.326138496398926, "rewards/margins": 28.386465072631836, "rewards/rejected": -42.71260452270508, "step": 2432 }, { "epoch": 1.5135303265940903, "grad_norm": 0.0010255653178319335, "learning_rate": 2.7524204702627945e-06, "logits/chosen": 2.479342460632324, "logits/rejected": 4.253589630126953, "logps/chosen": -537.177001953125, "logps/rejected": -864.4534301757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.309935569763184, "rewards/margins": 23.93538475036621, "rewards/rejected": -32.24531936645508, "step": 2433 }, { "epoch": 1.5141524105754276, "grad_norm": 1.3664502773735876e-07, "learning_rate": 2.7512678653757498e-06, "logits/chosen": -2.0502357482910156, "logits/rejected": 3.057553768157959, "logps/chosen": -561.5479125976562, "logps/rejected": -1400.86767578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.96968936920166, "rewards/margins": 49.919158935546875, "rewards/rejected": -59.888851165771484, "step": 2434 }, { "epoch": 1.5147744945567652, "grad_norm": 1.4722614878337481e-06, "learning_rate": 2.750115260488705e-06, "logits/chosen": 0.7835877537727356, "logits/rejected": 3.7953410148620605, "logps/chosen": -514.9024047851562, "logps/rejected": -1092.03271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.49221420288086, "rewards/margins": 32.406341552734375, "rewards/rejected": -40.898555755615234, "step": 2435 }, { "epoch": 1.5153965785381027, "grad_norm": 0.000481670256704092, "learning_rate": 2.7489626556016602e-06, "logits/chosen": 1.926286220550537, "logits/rejected": 4.122426986694336, "logps/chosen": -615.120361328125, "logps/rejected": -1095.592529296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.4006242752075195, "rewards/margins": 35.62769317626953, "rewards/rejected": -42.0283203125, "step": 2436 }, { "epoch": 1.51601866251944, "grad_norm": 0.13158220052719116, "learning_rate": 2.7478100507146154e-06, "logits/chosen": 2.438539505004883, "logits/rejected": 3.7112181186676025, "logps/chosen": -540.5350952148438, "logps/rejected": -866.4390869140625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.910808563232422, "rewards/margins": 23.664012908935547, "rewards/rejected": -30.574819564819336, "step": 2437 }, { "epoch": 1.5166407465007776, "grad_norm": 1.392264485359192, "learning_rate": 2.7466574458275707e-06, "logits/chosen": 1.2793818712234497, "logits/rejected": 3.091960906982422, "logps/chosen": -674.7471313476562, "logps/rejected": -1019.5752563476562, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -16.084941864013672, "rewards/margins": 23.3023624420166, "rewards/rejected": -39.387306213378906, "step": 2438 }, { "epoch": 1.5172628304821152, "grad_norm": 22.623865127563477, "learning_rate": 2.745504840940526e-06, "logits/chosen": 0.5542949438095093, "logits/rejected": 1.8071708679199219, "logps/chosen": -590.0474243164062, "logps/rejected": -921.1709594726562, "loss": 0.2623, "rewards/accuracies": 0.875, "rewards/chosen": -13.720653533935547, "rewards/margins": 23.469879150390625, "rewards/rejected": -37.19053649902344, "step": 2439 }, { "epoch": 1.5178849144634525, "grad_norm": 6.497164577012882e-05, "learning_rate": 2.744352236053481e-06, "logits/chosen": 0.5079193115234375, "logits/rejected": 2.893798351287842, "logps/chosen": -563.1099853515625, "logps/rejected": -988.0438232421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.849778175354004, "rewards/margins": 30.442718505859375, "rewards/rejected": -42.29249572753906, "step": 2440 }, { "epoch": 1.51850699844479, "grad_norm": 3.525198221206665, "learning_rate": 2.7431996311664368e-06, "logits/chosen": 1.9516997337341309, "logits/rejected": 2.452531337738037, "logps/chosen": -795.6206665039062, "logps/rejected": -1220.8349609375, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -15.373698234558105, "rewards/margins": 30.371505737304688, "rewards/rejected": -45.745201110839844, "step": 2441 }, { "epoch": 1.5191290824261277, "grad_norm": 0.2017599195241928, "learning_rate": 2.742047026279392e-06, "logits/chosen": -0.42940008640289307, "logits/rejected": 2.5336532592773438, "logps/chosen": -372.24713134765625, "logps/rejected": -885.3436279296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -7.162657737731934, "rewards/margins": 28.138328552246094, "rewards/rejected": -35.300987243652344, "step": 2442 }, { "epoch": 1.519751166407465, "grad_norm": 7.490428970413632e-07, "learning_rate": 2.7408944213923472e-06, "logits/chosen": 0.3331374526023865, "logits/rejected": 3.346038818359375, "logps/chosen": -534.9227294921875, "logps/rejected": -993.5345458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.115105628967285, "rewards/margins": 27.73911476135254, "rewards/rejected": -38.854217529296875, "step": 2443 }, { "epoch": 1.5203732503888023, "grad_norm": 0.21721665561199188, "learning_rate": 2.7397418165053024e-06, "logits/chosen": 0.08835101127624512, "logits/rejected": 1.9229445457458496, "logps/chosen": -594.24609375, "logps/rejected": -945.4306640625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -9.304940223693848, "rewards/margins": 23.95694351196289, "rewards/rejected": -33.261878967285156, "step": 2444 }, { "epoch": 1.5209953343701401, "grad_norm": 0.0029869587160646915, "learning_rate": 2.7385892116182577e-06, "logits/chosen": -0.9350767135620117, "logits/rejected": 2.4976651668548584, "logps/chosen": -440.30133056640625, "logps/rejected": -1002.122314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.273009300231934, "rewards/margins": 29.780683517456055, "rewards/rejected": -39.05369186401367, "step": 2445 }, { "epoch": 1.5216174183514775, "grad_norm": 0.018518727272748947, "learning_rate": 2.737436606731213e-06, "logits/chosen": -1.7265733480453491, "logits/rejected": 3.20708966255188, "logps/chosen": -387.7981872558594, "logps/rejected": -950.6136474609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.578176498413086, "rewards/margins": 31.040130615234375, "rewards/rejected": -39.618309020996094, "step": 2446 }, { "epoch": 1.5222395023328148, "grad_norm": 1.1724967956542969, "learning_rate": 2.736284001844168e-06, "logits/chosen": 0.09461307525634766, "logits/rejected": 3.0579278469085693, "logps/chosen": -470.50006103515625, "logps/rejected": -860.4100341796875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -8.176168441772461, "rewards/margins": 23.08479118347168, "rewards/rejected": -31.26095962524414, "step": 2447 }, { "epoch": 1.5228615863141524, "grad_norm": 0.5167633891105652, "learning_rate": 2.7351313969571238e-06, "logits/chosen": 1.8386075496673584, "logits/rejected": 2.3939671516418457, "logps/chosen": -635.957763671875, "logps/rejected": -812.0810546875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -13.093191146850586, "rewards/margins": 19.466598510742188, "rewards/rejected": -32.55978775024414, "step": 2448 }, { "epoch": 1.52348367029549, "grad_norm": 0.06474962085485458, "learning_rate": 2.733978792070078e-06, "logits/chosen": 0.4039455056190491, "logits/rejected": 3.9968762397766113, "logps/chosen": -536.2996826171875, "logps/rejected": -1120.7918701171875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.25309944152832, "rewards/margins": 29.0009822845459, "rewards/rejected": -36.25408172607422, "step": 2449 }, { "epoch": 1.5241057542768273, "grad_norm": 1.1716054359567352e-05, "learning_rate": 2.732826187183034e-06, "logits/chosen": 2.6479225158691406, "logits/rejected": 3.409113645553589, "logps/chosen": -719.8601684570312, "logps/rejected": -981.4459228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.668313980102539, "rewards/margins": 25.581754684448242, "rewards/rejected": -36.250064849853516, "step": 2450 }, { "epoch": 1.5247278382581648, "grad_norm": 0.0022444785572588444, "learning_rate": 2.731673582295989e-06, "logits/chosen": 0.8789412975311279, "logits/rejected": 1.1855617761611938, "logps/chosen": -638.7889404296875, "logps/rejected": -830.7925415039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.223556518554688, "rewards/margins": 21.04960060119629, "rewards/rejected": -33.273155212402344, "step": 2451 }, { "epoch": 1.5253499222395024, "grad_norm": 1.4134977845969843e-06, "learning_rate": 2.7305209774089442e-06, "logits/chosen": -0.17751866579055786, "logits/rejected": 3.011488437652588, "logps/chosen": -519.5287475585938, "logps/rejected": -1034.9293212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.822946548461914, "rewards/margins": 32.01959228515625, "rewards/rejected": -45.8425407409668, "step": 2452 }, { "epoch": 1.5259720062208397, "grad_norm": 1.7072126823336475e-08, "learning_rate": 2.7293683725218995e-06, "logits/chosen": -0.6224250793457031, "logits/rejected": 3.233564853668213, "logps/chosen": -584.4635620117188, "logps/rejected": -1117.71728515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.048900604248047, "rewards/margins": 32.317787170410156, "rewards/rejected": -41.36669158935547, "step": 2453 }, { "epoch": 1.5265940902021773, "grad_norm": 1.3031844900979195e-05, "learning_rate": 2.7282157676348547e-06, "logits/chosen": 3.390267848968506, "logits/rejected": 4.746077537536621, "logps/chosen": -832.5244140625, "logps/rejected": -1280.4586181640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.035869598388672, "rewards/margins": 36.55662155151367, "rewards/rejected": -52.592491149902344, "step": 2454 }, { "epoch": 1.5272161741835149, "grad_norm": 0.4124351441860199, "learning_rate": 2.72706316274781e-06, "logits/chosen": 1.1203548908233643, "logits/rejected": 1.8139748573303223, "logps/chosen": -593.9185791015625, "logps/rejected": -884.8736572265625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -7.489469528198242, "rewards/margins": 25.796775817871094, "rewards/rejected": -33.2862434387207, "step": 2455 }, { "epoch": 1.5278382581648522, "grad_norm": 2.913459062576294, "learning_rate": 2.725910557860765e-06, "logits/chosen": 1.5062038898468018, "logits/rejected": 3.94240665435791, "logps/chosen": -485.2527770996094, "logps/rejected": -734.7945556640625, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -9.377772331237793, "rewards/margins": 17.07063865661621, "rewards/rejected": -26.448410034179688, "step": 2456 }, { "epoch": 1.5284603421461898, "grad_norm": 1.1580379009246826, "learning_rate": 2.724757952973721e-06, "logits/chosen": -1.4506340026855469, "logits/rejected": 1.637460708618164, "logps/chosen": -512.544921875, "logps/rejected": -956.2432861328125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -8.50003719329834, "rewards/margins": 24.926979064941406, "rewards/rejected": -33.42701721191406, "step": 2457 }, { "epoch": 1.5290824261275273, "grad_norm": 0.837179958820343, "learning_rate": 2.723605348086676e-06, "logits/chosen": -2.831498622894287, "logits/rejected": 1.7902092933654785, "logps/chosen": -421.67547607421875, "logps/rejected": -812.24609375, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -9.50699234008789, "rewards/margins": 21.368635177612305, "rewards/rejected": -30.875625610351562, "step": 2458 }, { "epoch": 1.5297045101088647, "grad_norm": 0.04172046482563019, "learning_rate": 2.7224527431996312e-06, "logits/chosen": -3.2224364280700684, "logits/rejected": 3.2389562129974365, "logps/chosen": -271.430419921875, "logps/rejected": -872.8142700195312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.220527648925781, "rewards/margins": 31.60157012939453, "rewards/rejected": -37.82209777832031, "step": 2459 }, { "epoch": 1.5303265940902022, "grad_norm": 0.00036741772782988846, "learning_rate": 2.7213001383125865e-06, "logits/chosen": -0.7951878905296326, "logits/rejected": 2.4324963092803955, "logps/chosen": -371.46630859375, "logps/rejected": -739.7484741210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.226638317108154, "rewards/margins": 19.29977798461914, "rewards/rejected": -25.526412963867188, "step": 2460 }, { "epoch": 1.5309486780715398, "grad_norm": 33.964141845703125, "learning_rate": 2.7201475334255417e-06, "logits/chosen": 1.0293166637420654, "logits/rejected": 3.4800872802734375, "logps/chosen": -537.6439819335938, "logps/rejected": -1003.99462890625, "loss": 0.4305, "rewards/accuracies": 0.875, "rewards/chosen": -8.282417297363281, "rewards/margins": 34.82405090332031, "rewards/rejected": -43.10646438598633, "step": 2461 }, { "epoch": 1.5315707620528771, "grad_norm": 0.007006136234849691, "learning_rate": 2.718994928538497e-06, "logits/chosen": 0.6648842692375183, "logits/rejected": -0.3263876438140869, "logps/chosen": -736.0245361328125, "logps/rejected": -854.2238159179688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.421032905578613, "rewards/margins": 23.76239585876465, "rewards/rejected": -31.183427810668945, "step": 2462 }, { "epoch": 1.5321928460342145, "grad_norm": 0.023654131218791008, "learning_rate": 2.717842323651452e-06, "logits/chosen": -0.5091513991355896, "logits/rejected": 2.5750484466552734, "logps/chosen": -364.62310791015625, "logps/rejected": -830.571044921875, "loss": 0.0866, "rewards/accuracies": 0.875, "rewards/chosen": -7.813291549682617, "rewards/margins": 25.79483413696289, "rewards/rejected": -33.608123779296875, "step": 2463 }, { "epoch": 1.5328149300155522, "grad_norm": 2.9299342713784426e-05, "learning_rate": 2.7166897187644074e-06, "logits/chosen": -0.37343019247055054, "logits/rejected": 1.7793900966644287, "logps/chosen": -414.75274658203125, "logps/rejected": -872.8060913085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.2207536697387695, "rewards/margins": 34.36773681640625, "rewards/rejected": -40.58848571777344, "step": 2464 }, { "epoch": 1.5334370139968896, "grad_norm": 0.12796840071678162, "learning_rate": 2.715537113877363e-06, "logits/chosen": -0.22504746913909912, "logits/rejected": 2.651764392852783, "logps/chosen": -532.3455810546875, "logps/rejected": -983.6930541992188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -9.367204666137695, "rewards/margins": 27.68272590637207, "rewards/rejected": -37.049930572509766, "step": 2465 }, { "epoch": 1.534059097978227, "grad_norm": 22.160669326782227, "learning_rate": 2.7143845089903182e-06, "logits/chosen": -0.8312405943870544, "logits/rejected": 1.0451204776763916, "logps/chosen": -529.1278076171875, "logps/rejected": -887.6492919921875, "loss": 0.2574, "rewards/accuracies": 0.875, "rewards/chosen": -12.91517162322998, "rewards/margins": 27.073076248168945, "rewards/rejected": -39.988250732421875, "step": 2466 }, { "epoch": 1.5346811819595645, "grad_norm": 0.0007610557368025184, "learning_rate": 2.7132319041032735e-06, "logits/chosen": -1.3976175785064697, "logits/rejected": 3.0815062522888184, "logps/chosen": -397.5416259765625, "logps/rejected": -866.3143310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.541437149047852, "rewards/margins": 28.345874786376953, "rewards/rejected": -37.88731384277344, "step": 2467 }, { "epoch": 1.535303265940902, "grad_norm": 0.014254836365580559, "learning_rate": 2.7120792992162287e-06, "logits/chosen": -0.7080047130584717, "logits/rejected": 2.4305598735809326, "logps/chosen": -506.60711669921875, "logps/rejected": -929.2200927734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.860275268554688, "rewards/margins": 21.704132080078125, "rewards/rejected": -34.56440734863281, "step": 2468 }, { "epoch": 1.5359253499222394, "grad_norm": 0.08221913874149323, "learning_rate": 2.710926694329184e-06, "logits/chosen": -0.03373962268233299, "logits/rejected": 2.4926323890686035, "logps/chosen": -512.6092529296875, "logps/rejected": -846.0359497070312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.146074295043945, "rewards/margins": 20.820676803588867, "rewards/rejected": -27.96674919128418, "step": 2469 }, { "epoch": 1.536547433903577, "grad_norm": 0.9291184544563293, "learning_rate": 2.709774089442139e-06, "logits/chosen": 2.647587537765503, "logits/rejected": 3.6963562965393066, "logps/chosen": -655.6061401367188, "logps/rejected": -922.4675903320312, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -10.961267471313477, "rewards/margins": 19.87137222290039, "rewards/rejected": -30.8326416015625, "step": 2470 }, { "epoch": 1.5371695178849145, "grad_norm": 0.0023878749925643206, "learning_rate": 2.7086214845550944e-06, "logits/chosen": -0.08294537663459778, "logits/rejected": 3.287048578262329, "logps/chosen": -385.4952392578125, "logps/rejected": -815.072021484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.807106018066406, "rewards/margins": 21.306137084960938, "rewards/rejected": -30.113243103027344, "step": 2471 }, { "epoch": 1.5377916018662519, "grad_norm": 0.7342345714569092, "learning_rate": 2.70746887966805e-06, "logits/chosen": 0.5753480195999146, "logits/rejected": 3.7676868438720703, "logps/chosen": -441.8882751464844, "logps/rejected": -854.6257934570312, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -8.644134521484375, "rewards/margins": 24.062170028686523, "rewards/rejected": -32.70630645751953, "step": 2472 }, { "epoch": 1.5384136858475894, "grad_norm": 27.15131187438965, "learning_rate": 2.7063162747810052e-06, "logits/chosen": 1.2550212144851685, "logits/rejected": 2.9827208518981934, "logps/chosen": -442.3372802734375, "logps/rejected": -729.8902587890625, "loss": 0.1199, "rewards/accuracies": 0.875, "rewards/chosen": -10.713069915771484, "rewards/margins": 15.846502304077148, "rewards/rejected": -26.5595703125, "step": 2473 }, { "epoch": 1.539035769828927, "grad_norm": 0.022780759260058403, "learning_rate": 2.7051636698939605e-06, "logits/chosen": 0.1612262725830078, "logits/rejected": 4.642159938812256, "logps/chosen": -553.856689453125, "logps/rejected": -1031.6358642578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.07167911529541, "rewards/margins": 21.338481903076172, "rewards/rejected": -30.4101619720459, "step": 2474 }, { "epoch": 1.5396578538102643, "grad_norm": 3.3649816266745347e-10, "learning_rate": 2.7040110650069157e-06, "logits/chosen": -3.152029275894165, "logits/rejected": 1.914430856704712, "logps/chosen": -467.9855651855469, "logps/rejected": -1212.340576171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.553437232971191, "rewards/margins": 41.90711975097656, "rewards/rejected": -53.46055603027344, "step": 2475 }, { "epoch": 1.5402799377916019, "grad_norm": 3.6554156395141035e-06, "learning_rate": 2.702858460119871e-06, "logits/chosen": 2.662741184234619, "logits/rejected": 4.871638298034668, "logps/chosen": -587.4205322265625, "logps/rejected": -1073.63818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.387510299682617, "rewards/margins": 34.884681701660156, "rewards/rejected": -44.272193908691406, "step": 2476 }, { "epoch": 1.5409020217729394, "grad_norm": 2.009286880493164, "learning_rate": 2.701705855232826e-06, "logits/chosen": 2.2703564167022705, "logits/rejected": 2.3993289470672607, "logps/chosen": -774.0054931640625, "logps/rejected": -1000.2762451171875, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -17.877897262573242, "rewards/margins": 19.677772521972656, "rewards/rejected": -37.555667877197266, "step": 2477 }, { "epoch": 1.5415241057542768, "grad_norm": 0.000742044416256249, "learning_rate": 2.7005532503457814e-06, "logits/chosen": 0.48302167654037476, "logits/rejected": 4.2788166999816895, "logps/chosen": -413.0840148925781, "logps/rejected": -951.339111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.892075538635254, "rewards/margins": 32.75634765625, "rewards/rejected": -41.64841842651367, "step": 2478 }, { "epoch": 1.5421461897356143, "grad_norm": 1.013292738605287e-08, "learning_rate": 2.699400645458737e-06, "logits/chosen": 0.2683802843093872, "logits/rejected": 2.4963221549987793, "logps/chosen": -575.1116943359375, "logps/rejected": -1050.36376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.652692794799805, "rewards/margins": 36.25019454956055, "rewards/rejected": -45.90288543701172, "step": 2479 }, { "epoch": 1.542768273716952, "grad_norm": 6.289964949246496e-05, "learning_rate": 2.6982480405716922e-06, "logits/chosen": -0.9888131618499756, "logits/rejected": 3.092804193496704, "logps/chosen": -430.79046630859375, "logps/rejected": -926.0091552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.098524570465088, "rewards/margins": 26.761119842529297, "rewards/rejected": -32.859642028808594, "step": 2480 }, { "epoch": 1.5433903576982893, "grad_norm": 0.0013569953152909875, "learning_rate": 2.6970954356846475e-06, "logits/chosen": -0.7292362451553345, "logits/rejected": 1.174302101135254, "logps/chosen": -625.8265991210938, "logps/rejected": -1065.358154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.744695663452148, "rewards/margins": 28.380962371826172, "rewards/rejected": -39.12565994262695, "step": 2481 }, { "epoch": 1.5440124416796266, "grad_norm": 0.002672493224963546, "learning_rate": 2.6959428307976027e-06, "logits/chosen": 1.855053186416626, "logits/rejected": 3.0483992099761963, "logps/chosen": -619.2630615234375, "logps/rejected": -1022.6942138671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.304459571838379, "rewards/margins": 29.886611938476562, "rewards/rejected": -40.191070556640625, "step": 2482 }, { "epoch": 1.5446345256609644, "grad_norm": 5.420322486315854e-05, "learning_rate": 2.694790225910558e-06, "logits/chosen": -1.8340150117874146, "logits/rejected": 3.607595920562744, "logps/chosen": -360.5854187011719, "logps/rejected": -1015.0472412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.2733869552612305, "rewards/margins": 32.604515075683594, "rewards/rejected": -36.87790298461914, "step": 2483 }, { "epoch": 1.5452566096423017, "grad_norm": 0.0011562893632799387, "learning_rate": 2.693637621023513e-06, "logits/chosen": -2.7047109603881836, "logits/rejected": 3.4061014652252197, "logps/chosen": -437.25604248046875, "logps/rejected": -1212.28857421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.983414649963379, "rewards/margins": 38.062217712402344, "rewards/rejected": -46.04563522338867, "step": 2484 }, { "epoch": 1.545878693623639, "grad_norm": 1.156277176050935e-05, "learning_rate": 2.6924850161364684e-06, "logits/chosen": -1.4244234561920166, "logits/rejected": 4.303253173828125, "logps/chosen": -339.56890869140625, "logps/rejected": -957.9353637695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.711836814880371, "rewards/margins": 32.08358383178711, "rewards/rejected": -36.7954216003418, "step": 2485 }, { "epoch": 1.5465007776049766, "grad_norm": 1.7454749468015507e-07, "learning_rate": 2.691332411249424e-06, "logits/chosen": 1.288722276687622, "logits/rejected": 2.1820712089538574, "logps/chosen": -599.7692260742188, "logps/rejected": -972.705810546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.94448471069336, "rewards/margins": 29.24241065979004, "rewards/rejected": -40.18689727783203, "step": 2486 }, { "epoch": 1.5471228615863142, "grad_norm": 23.994794845581055, "learning_rate": 2.6901798063623792e-06, "logits/chosen": 0.03151065111160278, "logits/rejected": 2.8123786449432373, "logps/chosen": -593.8890991210938, "logps/rejected": -979.3603515625, "loss": 0.1584, "rewards/accuracies": 0.875, "rewards/chosen": -10.582526206970215, "rewards/margins": 21.869956970214844, "rewards/rejected": -32.452484130859375, "step": 2487 }, { "epoch": 1.5477449455676515, "grad_norm": 9.391483146714563e-09, "learning_rate": 2.6890272014753345e-06, "logits/chosen": 0.1529102921485901, "logits/rejected": 3.4798989295959473, "logps/chosen": -538.7216186523438, "logps/rejected": -1052.495361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.785987854003906, "rewards/margins": 36.01325988769531, "rewards/rejected": -44.79924774169922, "step": 2488 }, { "epoch": 1.548367029548989, "grad_norm": 12.872499465942383, "learning_rate": 2.6878745965882897e-06, "logits/chosen": -3.528820037841797, "logits/rejected": 2.9935293197631836, "logps/chosen": -178.8984832763672, "logps/rejected": -872.1593017578125, "loss": 0.1691, "rewards/accuracies": 0.875, "rewards/chosen": -3.5141162872314453, "rewards/margins": 30.633007049560547, "rewards/rejected": -34.14712142944336, "step": 2489 }, { "epoch": 1.5489891135303266, "grad_norm": 28.490171432495117, "learning_rate": 2.686721991701245e-06, "logits/chosen": -0.568493127822876, "logits/rejected": 1.1433310508728027, "logps/chosen": -645.1004638671875, "logps/rejected": -984.1376342773438, "loss": 0.1649, "rewards/accuracies": 0.875, "rewards/chosen": -6.578904151916504, "rewards/margins": 25.876937866210938, "rewards/rejected": -32.45584487915039, "step": 2490 }, { "epoch": 1.549611197511664, "grad_norm": 9.463408470153809, "learning_rate": 2.6855693868142e-06, "logits/chosen": -1.7124032974243164, "logits/rejected": 1.8130053281784058, "logps/chosen": -408.69232177734375, "logps/rejected": -863.88818359375, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": -11.570595741271973, "rewards/margins": 27.266246795654297, "rewards/rejected": -38.83684539794922, "step": 2491 }, { "epoch": 1.5502332814930015, "grad_norm": 5.1917506738163866e-08, "learning_rate": 2.6844167819271554e-06, "logits/chosen": 0.7790787220001221, "logits/rejected": 3.551978588104248, "logps/chosen": -534.1652221679688, "logps/rejected": -1087.25048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.447636604309082, "rewards/margins": 35.59351348876953, "rewards/rejected": -41.0411491394043, "step": 2492 }, { "epoch": 1.550855365474339, "grad_norm": 0.009952358901500702, "learning_rate": 2.6832641770401106e-06, "logits/chosen": -2.5790116786956787, "logits/rejected": 0.9824144840240479, "logps/chosen": -362.249267578125, "logps/rejected": -817.4982299804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.19536018371582, "rewards/margins": 20.45709800720215, "rewards/rejected": -25.65245819091797, "step": 2493 }, { "epoch": 1.5514774494556764, "grad_norm": 0.006530395243316889, "learning_rate": 2.6821115721530662e-06, "logits/chosen": -1.5302128791809082, "logits/rejected": 1.6881933212280273, "logps/chosen": -605.1315307617188, "logps/rejected": -1031.45849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.122232437133789, "rewards/margins": 30.1520938873291, "rewards/rejected": -39.27432632446289, "step": 2494 }, { "epoch": 1.552099533437014, "grad_norm": 0.45922571420669556, "learning_rate": 2.6809589672660214e-06, "logits/chosen": -0.26042747497558594, "logits/rejected": 4.046167373657227, "logps/chosen": -430.70587158203125, "logps/rejected": -943.998779296875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -7.969429016113281, "rewards/margins": 24.93759536743164, "rewards/rejected": -32.90702438354492, "step": 2495 }, { "epoch": 1.5527216174183516, "grad_norm": 9.564102219883353e-05, "learning_rate": 2.6798063623789767e-06, "logits/chosen": 1.4396380186080933, "logits/rejected": 2.8060150146484375, "logps/chosen": -577.88134765625, "logps/rejected": -895.8618774414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.6483793258667, "rewards/margins": 23.74332046508789, "rewards/rejected": -33.391700744628906, "step": 2496 }, { "epoch": 1.553343701399689, "grad_norm": 7.810917468376033e-10, "learning_rate": 2.678653757491932e-06, "logits/chosen": 2.06510066986084, "logits/rejected": 4.934891700744629, "logps/chosen": -564.6988525390625, "logps/rejected": -1128.0897216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.261550903320312, "rewards/margins": 31.091615676879883, "rewards/rejected": -41.35316848754883, "step": 2497 }, { "epoch": 1.5539657853810265, "grad_norm": 0.0008484581485390663, "learning_rate": 2.677501152604887e-06, "logits/chosen": -2.6576812267303467, "logits/rejected": 2.4716832637786865, "logps/chosen": -291.2125244140625, "logps/rejected": -789.5697631835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.161551475524902, "rewards/margins": 22.731517791748047, "rewards/rejected": -26.893070220947266, "step": 2498 }, { "epoch": 1.554587869362364, "grad_norm": 0.0002879021340049803, "learning_rate": 2.6763485477178423e-06, "logits/chosen": 0.3646266460418701, "logits/rejected": 3.86550235748291, "logps/chosen": -518.783935546875, "logps/rejected": -969.0770263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.659425258636475, "rewards/margins": 29.202505111694336, "rewards/rejected": -35.86193084716797, "step": 2499 }, { "epoch": 1.5552099533437014, "grad_norm": 57.68034362792969, "learning_rate": 2.6751959428307976e-06, "logits/chosen": -0.09455686807632446, "logits/rejected": 2.160449743270874, "logps/chosen": -619.3577270507812, "logps/rejected": -954.7877197265625, "loss": 0.9767, "rewards/accuracies": 0.875, "rewards/chosen": -10.064574241638184, "rewards/margins": 24.755352020263672, "rewards/rejected": -34.819923400878906, "step": 2500 }, { "epoch": 1.5558320373250387, "grad_norm": 29.71711540222168, "learning_rate": 2.6740433379437532e-06, "logits/chosen": 0.9189615249633789, "logits/rejected": 3.5247128009796143, "logps/chosen": -403.5267639160156, "logps/rejected": -714.1881713867188, "loss": 0.6582, "rewards/accuracies": 0.875, "rewards/chosen": -9.70235824584961, "rewards/margins": 14.166807174682617, "rewards/rejected": -23.869165420532227, "step": 2501 }, { "epoch": 1.5564541213063765, "grad_norm": 2.8825539288845903e-07, "learning_rate": 2.6728907330567084e-06, "logits/chosen": -1.2510281801223755, "logits/rejected": 4.012646675109863, "logps/chosen": -478.6166687011719, "logps/rejected": -1164.77587890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.776491165161133, "rewards/margins": 35.69042205810547, "rewards/rejected": -43.46691131591797, "step": 2502 }, { "epoch": 1.5570762052877138, "grad_norm": 0.05821898207068443, "learning_rate": 2.6717381281696637e-06, "logits/chosen": 2.096285343170166, "logits/rejected": 2.3310656547546387, "logps/chosen": -583.2084350585938, "logps/rejected": -805.5694580078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.98892593383789, "rewards/margins": 24.44229507446289, "rewards/rejected": -34.43122100830078, "step": 2503 }, { "epoch": 1.5576982892690512, "grad_norm": 82.08670806884766, "learning_rate": 2.670585523282619e-06, "logits/chosen": -0.4332718551158905, "logits/rejected": 4.304185390472412, "logps/chosen": -328.9471130371094, "logps/rejected": -675.8569946289062, "loss": 0.544, "rewards/accuracies": 0.875, "rewards/chosen": -7.469512939453125, "rewards/margins": 16.837844848632812, "rewards/rejected": -24.307355880737305, "step": 2504 }, { "epoch": 1.558320373250389, "grad_norm": 18.179847717285156, "learning_rate": 2.669432918395574e-06, "logits/chosen": 1.644677758216858, "logits/rejected": 4.1040730476379395, "logps/chosen": -453.4781799316406, "logps/rejected": -896.974853515625, "loss": 0.1027, "rewards/accuracies": 0.875, "rewards/chosen": -12.000197410583496, "rewards/margins": 27.992036819458008, "rewards/rejected": -39.99223327636719, "step": 2505 }, { "epoch": 1.5589424572317263, "grad_norm": 0.9634746313095093, "learning_rate": 2.6682803135085293e-06, "logits/chosen": -0.5393826961517334, "logits/rejected": 3.3745839595794678, "logps/chosen": -550.09814453125, "logps/rejected": -988.7060546875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -9.148344039916992, "rewards/margins": 24.866161346435547, "rewards/rejected": -34.01450729370117, "step": 2506 }, { "epoch": 1.5595645412130636, "grad_norm": 36.72014617919922, "learning_rate": 2.6671277086214846e-06, "logits/chosen": 2.456897258758545, "logits/rejected": 1.9290426969528198, "logps/chosen": -667.2633056640625, "logps/rejected": -770.6574096679688, "loss": 0.2178, "rewards/accuracies": 0.875, "rewards/chosen": -9.517938613891602, "rewards/margins": 16.047889709472656, "rewards/rejected": -25.565826416015625, "step": 2507 }, { "epoch": 1.5601866251944012, "grad_norm": 0.004348399117588997, "learning_rate": 2.6659751037344402e-06, "logits/chosen": -1.0786043405532837, "logits/rejected": 4.487873077392578, "logps/chosen": -415.68646240234375, "logps/rejected": -1152.6275634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.8546223640441895, "rewards/margins": 38.18126678466797, "rewards/rejected": -43.035888671875, "step": 2508 }, { "epoch": 1.5608087091757388, "grad_norm": 37.94123458862305, "learning_rate": 2.6648224988473954e-06, "logits/chosen": 0.8928492069244385, "logits/rejected": 2.8876805305480957, "logps/chosen": -651.49755859375, "logps/rejected": -904.5684814453125, "loss": 0.767, "rewards/accuracies": 0.875, "rewards/chosen": -10.068072319030762, "rewards/margins": 11.500648498535156, "rewards/rejected": -21.5687198638916, "step": 2509 }, { "epoch": 1.5614307931570761, "grad_norm": 0.11989522725343704, "learning_rate": 2.6636698939603507e-06, "logits/chosen": -1.96262526512146, "logits/rejected": 2.2925920486450195, "logps/chosen": -503.9068908691406, "logps/rejected": -1025.4134521484375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.269535064697266, "rewards/margins": 29.875804901123047, "rewards/rejected": -41.14533996582031, "step": 2510 }, { "epoch": 1.5620528771384137, "grad_norm": 0.36885467171669006, "learning_rate": 2.662517289073306e-06, "logits/chosen": -0.9184327125549316, "logits/rejected": 2.790034294128418, "logps/chosen": -406.4637145996094, "logps/rejected": -781.6912231445312, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -4.292177677154541, "rewards/margins": 23.567413330078125, "rewards/rejected": -27.859588623046875, "step": 2511 }, { "epoch": 1.5626749611197512, "grad_norm": 0.2901647686958313, "learning_rate": 2.661364684186261e-06, "logits/chosen": -0.9580250978469849, "logits/rejected": 2.781895160675049, "logps/chosen": -444.0966796875, "logps/rejected": -925.6182250976562, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -8.922597885131836, "rewards/margins": 26.735271453857422, "rewards/rejected": -35.657867431640625, "step": 2512 }, { "epoch": 1.5632970451010886, "grad_norm": 0.2968682050704956, "learning_rate": 2.6602120792992163e-06, "logits/chosen": -0.31334781646728516, "logits/rejected": 3.522827625274658, "logps/chosen": -569.567138671875, "logps/rejected": -1027.34765625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -10.26767349243164, "rewards/margins": 28.82740592956543, "rewards/rejected": -39.09507751464844, "step": 2513 }, { "epoch": 1.5639191290824261, "grad_norm": 1.1502807140350342, "learning_rate": 2.6590594744121716e-06, "logits/chosen": 0.24424254894256592, "logits/rejected": 2.969736099243164, "logps/chosen": -649.736328125, "logps/rejected": -989.1493530273438, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -5.951969623565674, "rewards/margins": 20.423797607421875, "rewards/rejected": -26.37576675415039, "step": 2514 }, { "epoch": 1.5645412130637637, "grad_norm": 20.15816879272461, "learning_rate": 2.657906869525127e-06, "logits/chosen": -1.0590736865997314, "logits/rejected": 2.900858163833618, "logps/chosen": -347.2210693359375, "logps/rejected": -889.9716796875, "loss": 0.1205, "rewards/accuracies": 0.875, "rewards/chosen": -7.696262836456299, "rewards/margins": 23.77630615234375, "rewards/rejected": -31.47256851196289, "step": 2515 }, { "epoch": 1.565163297045101, "grad_norm": 1.8690065145492554, "learning_rate": 2.6567542646380824e-06, "logits/chosen": -2.203958034515381, "logits/rejected": 3.374868631362915, "logps/chosen": -402.5797424316406, "logps/rejected": -912.735107421875, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -6.828686237335205, "rewards/margins": 24.617942810058594, "rewards/rejected": -31.44662857055664, "step": 2516 }, { "epoch": 1.5657853810264386, "grad_norm": 8.570445061195642e-06, "learning_rate": 2.6556016597510377e-06, "logits/chosen": 0.857057511806488, "logits/rejected": 4.9921956062316895, "logps/chosen": -502.5346374511719, "logps/rejected": -1055.5001220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.580801486968994, "rewards/margins": 30.187623977661133, "rewards/rejected": -35.76842498779297, "step": 2517 }, { "epoch": 1.5664074650077762, "grad_norm": 1.9115601901376067e-07, "learning_rate": 2.654449054863993e-06, "logits/chosen": 1.3328273296356201, "logits/rejected": 2.9461498260498047, "logps/chosen": -550.345947265625, "logps/rejected": -998.7139892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.301694869995117, "rewards/margins": 32.61174011230469, "rewards/rejected": -41.91343688964844, "step": 2518 }, { "epoch": 1.5670295489891135, "grad_norm": 0.005492346826940775, "learning_rate": 2.653296449976948e-06, "logits/chosen": -2.038207769393921, "logits/rejected": 2.8221802711486816, "logps/chosen": -496.6783447265625, "logps/rejected": -1076.4049072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.93287467956543, "rewards/margins": 31.873998641967773, "rewards/rejected": -41.80687713623047, "step": 2519 }, { "epoch": 1.5676516329704508, "grad_norm": 31.12177848815918, "learning_rate": 2.6521438450899033e-06, "logits/chosen": 2.806741237640381, "logits/rejected": 3.6385693550109863, "logps/chosen": -586.6148681640625, "logps/rejected": -871.7191162109375, "loss": 0.9491, "rewards/accuracies": 0.875, "rewards/chosen": -11.129293441772461, "rewards/margins": 19.070999145507812, "rewards/rejected": -30.200294494628906, "step": 2520 }, { "epoch": 1.5682737169517886, "grad_norm": 39.04676055908203, "learning_rate": 2.6509912402028586e-06, "logits/chosen": 2.220421314239502, "logits/rejected": 4.0747294425964355, "logps/chosen": -629.5321655273438, "logps/rejected": -880.70751953125, "loss": 1.7587, "rewards/accuracies": 0.875, "rewards/chosen": -11.306745529174805, "rewards/margins": 16.847389221191406, "rewards/rejected": -28.15413475036621, "step": 2521 }, { "epoch": 1.568895800933126, "grad_norm": 0.3181990087032318, "learning_rate": 2.6498386353158138e-06, "logits/chosen": 3.5077221393585205, "logits/rejected": 4.509550094604492, "logps/chosen": -669.1798706054688, "logps/rejected": -964.48974609375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -11.942821502685547, "rewards/margins": 23.189241409301758, "rewards/rejected": -35.13206481933594, "step": 2522 }, { "epoch": 1.5695178849144633, "grad_norm": 0.04694477468729019, "learning_rate": 2.6486860304287694e-06, "logits/chosen": -1.1715391874313354, "logits/rejected": 2.6850781440734863, "logps/chosen": -528.1036376953125, "logps/rejected": -1065.68701171875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.915897369384766, "rewards/margins": 33.24100875854492, "rewards/rejected": -40.15690612792969, "step": 2523 }, { "epoch": 1.570139968895801, "grad_norm": 1.0720814458409222e-07, "learning_rate": 2.6475334255417247e-06, "logits/chosen": -1.3283451795578003, "logits/rejected": 2.9149234294891357, "logps/chosen": -557.9683837890625, "logps/rejected": -1072.668701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.794567108154297, "rewards/margins": 29.346282958984375, "rewards/rejected": -38.14085006713867, "step": 2524 }, { "epoch": 1.5707620528771384, "grad_norm": 9.388706416757486e-07, "learning_rate": 2.64638082065468e-06, "logits/chosen": 0.9473726749420166, "logits/rejected": 3.064026355743408, "logps/chosen": -638.7322998046875, "logps/rejected": -964.4024658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.972917556762695, "rewards/margins": 27.92718505859375, "rewards/rejected": -33.90010070800781, "step": 2525 }, { "epoch": 1.5713841368584758, "grad_norm": 0.046334926038980484, "learning_rate": 2.645228215767635e-06, "logits/chosen": 0.19112294912338257, "logits/rejected": 2.3191397190093994, "logps/chosen": -578.079833984375, "logps/rejected": -947.2376098632812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.907577037811279, "rewards/margins": 25.607810974121094, "rewards/rejected": -33.51538848876953, "step": 2526 }, { "epoch": 1.5720062208398133, "grad_norm": 4.616542816162109, "learning_rate": 2.6440756108805903e-06, "logits/chosen": 2.2481207847595215, "logits/rejected": 2.875807762145996, "logps/chosen": -721.6137084960938, "logps/rejected": -1072.4210205078125, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -11.013412475585938, "rewards/margins": 27.015226364135742, "rewards/rejected": -38.02864074707031, "step": 2527 }, { "epoch": 1.572628304821151, "grad_norm": 0.07997529953718185, "learning_rate": 2.6429230059935456e-06, "logits/chosen": 0.14837861061096191, "logits/rejected": 3.7757630348205566, "logps/chosen": -531.468994140625, "logps/rejected": -1102.809326171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.239072799682617, "rewards/margins": 31.316261291503906, "rewards/rejected": -40.55533218383789, "step": 2528 }, { "epoch": 1.5732503888024882, "grad_norm": 0.1496405154466629, "learning_rate": 2.6417704011065008e-06, "logits/chosen": -1.293025016784668, "logits/rejected": 4.773388385772705, "logps/chosen": -350.036376953125, "logps/rejected": -869.2445068359375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -5.083723545074463, "rewards/margins": 17.739789962768555, "rewards/rejected": -22.823514938354492, "step": 2529 }, { "epoch": 1.5738724727838258, "grad_norm": 0.05532778799533844, "learning_rate": 2.6406177962194564e-06, "logits/chosen": 0.01579982042312622, "logits/rejected": 2.789395332336426, "logps/chosen": -499.70263671875, "logps/rejected": -814.7089233398438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.12386417388916, "rewards/margins": 21.891815185546875, "rewards/rejected": -27.015682220458984, "step": 2530 }, { "epoch": 1.5744945567651634, "grad_norm": 8.699598402017727e-05, "learning_rate": 2.6394651913324117e-06, "logits/chosen": 2.4416699409484863, "logits/rejected": 5.013975620269775, "logps/chosen": -410.20770263671875, "logps/rejected": -835.2908935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.419612884521484, "rewards/margins": 23.25516128540039, "rewards/rejected": -29.67477798461914, "step": 2531 }, { "epoch": 1.5751166407465007, "grad_norm": 0.0021628057584166527, "learning_rate": 2.638312586445367e-06, "logits/chosen": -0.6670438051223755, "logits/rejected": 3.6675217151641846, "logps/chosen": -384.84075927734375, "logps/rejected": -905.62646484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.705150604248047, "rewards/margins": 26.648361206054688, "rewards/rejected": -35.353511810302734, "step": 2532 }, { "epoch": 1.5757387247278383, "grad_norm": 0.00378401973284781, "learning_rate": 2.637159981558322e-06, "logits/chosen": 1.4846572875976562, "logits/rejected": 2.9995572566986084, "logps/chosen": -725.524658203125, "logps/rejected": -958.9528198242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.081862926483154, "rewards/margins": 21.75735855102539, "rewards/rejected": -28.839221954345703, "step": 2533 }, { "epoch": 1.5763608087091758, "grad_norm": 5.04378604888916, "learning_rate": 2.6360073766712773e-06, "logits/chosen": 1.3019071817398071, "logits/rejected": 4.482569217681885, "logps/chosen": -533.6031494140625, "logps/rejected": -985.0313110351562, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -7.342979907989502, "rewards/margins": 23.088260650634766, "rewards/rejected": -30.431243896484375, "step": 2534 }, { "epoch": 1.5769828926905132, "grad_norm": 8.794490895525087e-06, "learning_rate": 2.6348547717842326e-06, "logits/chosen": 1.8717117309570312, "logits/rejected": 3.6646323204040527, "logps/chosen": -630.3969116210938, "logps/rejected": -1010.4031982421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.561976909637451, "rewards/margins": 26.8465633392334, "rewards/rejected": -33.408538818359375, "step": 2535 }, { "epoch": 1.5776049766718507, "grad_norm": 2.2234980860957876e-05, "learning_rate": 2.6337021668971878e-06, "logits/chosen": -0.8583243489265442, "logits/rejected": 3.619313955307007, "logps/chosen": -408.3638610839844, "logps/rejected": -999.8567504882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.225172996520996, "rewards/margins": 28.894039154052734, "rewards/rejected": -37.11920928955078, "step": 2536 }, { "epoch": 1.5782270606531883, "grad_norm": 0.012370456010103226, "learning_rate": 2.6325495620101434e-06, "logits/chosen": -1.1759496927261353, "logits/rejected": 2.753152370452881, "logps/chosen": -449.21856689453125, "logps/rejected": -964.8666381835938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.146978378295898, "rewards/margins": 27.28769302368164, "rewards/rejected": -34.43467330932617, "step": 2537 }, { "epoch": 1.5788491446345256, "grad_norm": 0.048277225345373154, "learning_rate": 2.6313969571230987e-06, "logits/chosen": -1.496280550956726, "logits/rejected": 2.0494561195373535, "logps/chosen": -304.0895080566406, "logps/rejected": -812.8955688476562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.053216934204102, "rewards/margins": 28.015701293945312, "rewards/rejected": -34.06891632080078, "step": 2538 }, { "epoch": 1.579471228615863, "grad_norm": 0.0003371371713001281, "learning_rate": 2.630244352236054e-06, "logits/chosen": -0.8229646682739258, "logits/rejected": 0.5232251286506653, "logps/chosen": -396.1567077636719, "logps/rejected": -720.4967041015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.08257007598877, "rewards/margins": 23.936092376708984, "rewards/rejected": -32.01866149902344, "step": 2539 }, { "epoch": 1.5800933125972008, "grad_norm": 0.001109063159674406, "learning_rate": 2.629091747349009e-06, "logits/chosen": 1.09111750125885, "logits/rejected": 3.661069631576538, "logps/chosen": -433.0865478515625, "logps/rejected": -773.2035522460938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.496565818786621, "rewards/margins": 25.523311614990234, "rewards/rejected": -34.01987838745117, "step": 2540 }, { "epoch": 1.580715396578538, "grad_norm": 2.954153751488775e-05, "learning_rate": 2.6279391424619643e-06, "logits/chosen": 1.3929088115692139, "logits/rejected": 3.76798152923584, "logps/chosen": -631.7744140625, "logps/rejected": -977.82080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.168238639831543, "rewards/margins": 28.04422950744629, "rewards/rejected": -34.21247100830078, "step": 2541 }, { "epoch": 1.5813374805598754, "grad_norm": 3.4777238368988037, "learning_rate": 2.6267865375749196e-06, "logits/chosen": 0.6678463220596313, "logits/rejected": 3.8288257122039795, "logps/chosen": -564.5045166015625, "logps/rejected": -951.9814453125, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -7.784692764282227, "rewards/margins": 19.069561004638672, "rewards/rejected": -26.854251861572266, "step": 2542 }, { "epoch": 1.5819595645412132, "grad_norm": 0.014230997301638126, "learning_rate": 2.6256339326878748e-06, "logits/chosen": 0.6849037408828735, "logits/rejected": 3.4208898544311523, "logps/chosen": -505.7391662597656, "logps/rejected": -984.9599609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.177289962768555, "rewards/margins": 24.551013946533203, "rewards/rejected": -32.728302001953125, "step": 2543 }, { "epoch": 1.5825816485225506, "grad_norm": 0.00025358598213642836, "learning_rate": 2.62448132780083e-06, "logits/chosen": -2.680936574935913, "logits/rejected": 0.6628610491752625, "logps/chosen": -421.4416198730469, "logps/rejected": -838.618408203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.892261028289795, "rewards/margins": 24.422157287597656, "rewards/rejected": -29.31441879272461, "step": 2544 }, { "epoch": 1.583203732503888, "grad_norm": 0.2544800937175751, "learning_rate": 2.6233287229137857e-06, "logits/chosen": 1.9035508632659912, "logits/rejected": 2.245786428451538, "logps/chosen": -658.3436889648438, "logps/rejected": -877.4723510742188, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -10.054585456848145, "rewards/margins": 20.301048278808594, "rewards/rejected": -30.355634689331055, "step": 2545 }, { "epoch": 1.5838258164852255, "grad_norm": 0.07018294930458069, "learning_rate": 2.622176118026741e-06, "logits/chosen": 0.526107668876648, "logits/rejected": 3.307961940765381, "logps/chosen": -544.033203125, "logps/rejected": -850.1934204101562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.851484298706055, "rewards/margins": 23.675559997558594, "rewards/rejected": -29.52704620361328, "step": 2546 }, { "epoch": 1.584447900466563, "grad_norm": 0.0002997084229718894, "learning_rate": 2.621023513139696e-06, "logits/chosen": -0.7003446221351624, "logits/rejected": 3.4765868186950684, "logps/chosen": -441.3431091308594, "logps/rejected": -953.63671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.216616153717041, "rewards/margins": 24.402572631835938, "rewards/rejected": -31.61918830871582, "step": 2547 }, { "epoch": 1.5850699844479004, "grad_norm": 0.0024575558491051197, "learning_rate": 2.6198709082526513e-06, "logits/chosen": 0.7880875468254089, "logits/rejected": 4.125187397003174, "logps/chosen": -468.6260070800781, "logps/rejected": -896.840087890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.1893792152404785, "rewards/margins": 22.992290496826172, "rewards/rejected": -29.181671142578125, "step": 2548 }, { "epoch": 1.585692068429238, "grad_norm": 0.0008942119893617928, "learning_rate": 2.6187183033656065e-06, "logits/chosen": 1.5010969638824463, "logits/rejected": 3.093844175338745, "logps/chosen": -624.7542724609375, "logps/rejected": -999.3758544921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.038987159729004, "rewards/margins": 24.880168914794922, "rewards/rejected": -32.919158935546875, "step": 2549 }, { "epoch": 1.5863141524105755, "grad_norm": 0.01182605978101492, "learning_rate": 2.6175656984785618e-06, "logits/chosen": -2.0643815994262695, "logits/rejected": 2.822075366973877, "logps/chosen": -418.46539306640625, "logps/rejected": -1023.8269653320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.2684173583984375, "rewards/margins": 29.310277938842773, "rewards/rejected": -35.578697204589844, "step": 2550 }, { "epoch": 1.5869362363919128, "grad_norm": 0.20442859828472137, "learning_rate": 2.616413093591517e-06, "logits/chosen": 1.9439702033996582, "logits/rejected": 3.6782374382019043, "logps/chosen": -584.3154296875, "logps/rejected": -1012.0755615234375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -6.359824180603027, "rewards/margins": 26.538776397705078, "rewards/rejected": -32.89860153198242, "step": 2551 }, { "epoch": 1.5875583203732504, "grad_norm": 4.9473346734885126e-05, "learning_rate": 2.6152604887044726e-06, "logits/chosen": -0.7652897834777832, "logits/rejected": 3.000626802444458, "logps/chosen": -460.4172058105469, "logps/rejected": -990.1380615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.667954921722412, "rewards/margins": 30.455583572387695, "rewards/rejected": -38.123538970947266, "step": 2552 }, { "epoch": 1.588180404354588, "grad_norm": 0.3778168559074402, "learning_rate": 2.614107883817428e-06, "logits/chosen": 0.441825270652771, "logits/rejected": 3.4026546478271484, "logps/chosen": -420.4871520996094, "logps/rejected": -755.599365234375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -6.588260650634766, "rewards/margins": 17.710298538208008, "rewards/rejected": -24.298559188842773, "step": 2553 }, { "epoch": 1.5888024883359253, "grad_norm": 0.08141383528709412, "learning_rate": 2.612955278930383e-06, "logits/chosen": 0.27586179971694946, "logits/rejected": 3.2958290576934814, "logps/chosen": -504.78094482421875, "logps/rejected": -861.496337890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.5833024978637695, "rewards/margins": 24.973854064941406, "rewards/rejected": -29.55715560913086, "step": 2554 }, { "epoch": 1.5894245723172629, "grad_norm": 0.0002683989005163312, "learning_rate": 2.6118026740433383e-06, "logits/chosen": 0.30600738525390625, "logits/rejected": 3.9830586910247803, "logps/chosen": -449.1526794433594, "logps/rejected": -920.802490234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.895366191864014, "rewards/margins": 22.761844635009766, "rewards/rejected": -30.657211303710938, "step": 2555 }, { "epoch": 1.5900466562986004, "grad_norm": 0.4529314339160919, "learning_rate": 2.6106500691562935e-06, "logits/chosen": 0.1846950650215149, "logits/rejected": 1.4357441663742065, "logps/chosen": -549.7300415039062, "logps/rejected": -770.741943359375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -7.734827041625977, "rewards/margins": 17.410337448120117, "rewards/rejected": -25.14516830444336, "step": 2556 }, { "epoch": 1.5906687402799378, "grad_norm": 0.0962265357375145, "learning_rate": 2.6094974642692488e-06, "logits/chosen": 1.6746344566345215, "logits/rejected": 3.797276496887207, "logps/chosen": -772.9541625976562, "logps/rejected": -1070.3515625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.528155326843262, "rewards/margins": 22.276142120361328, "rewards/rejected": -27.804298400878906, "step": 2557 }, { "epoch": 1.5912908242612753, "grad_norm": 0.037062421441078186, "learning_rate": 2.608344859382204e-06, "logits/chosen": 1.5409696102142334, "logits/rejected": 3.426913261413574, "logps/chosen": -573.5651245117188, "logps/rejected": -867.2743530273438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.677513122558594, "rewards/margins": 21.017486572265625, "rewards/rejected": -28.69500160217285, "step": 2558 }, { "epoch": 1.591912908242613, "grad_norm": 0.07313279807567596, "learning_rate": 2.6071922544951596e-06, "logits/chosen": 1.7334628105163574, "logits/rejected": 2.109466075897217, "logps/chosen": -693.92529296875, "logps/rejected": -962.532470703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -13.267555236816406, "rewards/margins": 23.251060485839844, "rewards/rejected": -36.51861572265625, "step": 2559 }, { "epoch": 1.5925349922239502, "grad_norm": 0.02839500457048416, "learning_rate": 2.606039649608115e-06, "logits/chosen": -3.9982075691223145, "logits/rejected": 2.8284850120544434, "logps/chosen": -334.660888671875, "logps/rejected": -1039.389404296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.840045690536499, "rewards/margins": 29.952611923217773, "rewards/rejected": -33.792659759521484, "step": 2560 }, { "epoch": 1.5931570762052876, "grad_norm": 0.5425875782966614, "learning_rate": 2.60488704472107e-06, "logits/chosen": 2.1814193725585938, "logits/rejected": 4.387759685516357, "logps/chosen": -606.7564697265625, "logps/rejected": -958.6481323242188, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -9.972704887390137, "rewards/margins": 20.99521255493164, "rewards/rejected": -30.96791648864746, "step": 2561 }, { "epoch": 1.5937791601866254, "grad_norm": 0.09757669270038605, "learning_rate": 2.6037344398340253e-06, "logits/chosen": -1.089568018913269, "logits/rejected": 3.5683395862579346, "logps/chosen": -398.23394775390625, "logps/rejected": -1009.205810546875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.4523613452911377, "rewards/margins": 32.37748718261719, "rewards/rejected": -35.82984924316406, "step": 2562 }, { "epoch": 1.5944012441679627, "grad_norm": 0.0016663366695865989, "learning_rate": 2.6025818349469805e-06, "logits/chosen": 2.186467170715332, "logits/rejected": 4.869716644287109, "logps/chosen": -648.9400634765625, "logps/rejected": -1087.5924072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.644506454467773, "rewards/margins": 29.386463165283203, "rewards/rejected": -41.03097152709961, "step": 2563 }, { "epoch": 1.5950233281493, "grad_norm": 41.84226989746094, "learning_rate": 2.6014292300599358e-06, "logits/chosen": 2.0851328372955322, "logits/rejected": 2.8250632286071777, "logps/chosen": -555.81494140625, "logps/rejected": -855.7350463867188, "loss": 0.2556, "rewards/accuracies": 0.875, "rewards/chosen": -9.315744400024414, "rewards/margins": 22.709354400634766, "rewards/rejected": -32.02510070800781, "step": 2564 }, { "epoch": 1.5956454121306376, "grad_norm": 3.323015334899537e-05, "learning_rate": 2.600276625172891e-06, "logits/chosen": 1.2171071767807007, "logits/rejected": 4.171212196350098, "logps/chosen": -536.38916015625, "logps/rejected": -1118.4737548828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.4384260177612305, "rewards/margins": 34.14727783203125, "rewards/rejected": -41.58570861816406, "step": 2565 }, { "epoch": 1.5962674961119752, "grad_norm": 7.136670112609863, "learning_rate": 2.5991240202858462e-06, "logits/chosen": -3.851240634918213, "logits/rejected": 1.4777321815490723, "logps/chosen": -300.756591796875, "logps/rejected": -792.384765625, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": -7.0310492515563965, "rewards/margins": 17.15809440612793, "rewards/rejected": -24.189144134521484, "step": 2566 }, { "epoch": 1.5968895800933125, "grad_norm": 0.03129857778549194, "learning_rate": 2.597971415398802e-06, "logits/chosen": -0.48873722553253174, "logits/rejected": 4.124366283416748, "logps/chosen": -465.2950439453125, "logps/rejected": -994.5694580078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.589336395263672, "rewards/margins": 25.836570739746094, "rewards/rejected": -36.425907135009766, "step": 2567 }, { "epoch": 1.59751166407465, "grad_norm": 1.2106145732104778e-05, "learning_rate": 2.596818810511757e-06, "logits/chosen": 1.2782320976257324, "logits/rejected": 2.5213370323181152, "logps/chosen": -537.9226684570312, "logps/rejected": -860.0325317382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.05621337890625, "rewards/margins": 26.37557601928711, "rewards/rejected": -32.43178939819336, "step": 2568 }, { "epoch": 1.5981337480559876, "grad_norm": 0.0014095701044425368, "learning_rate": 2.5956662056247123e-06, "logits/chosen": -1.6308845281600952, "logits/rejected": 1.9259669780731201, "logps/chosen": -571.4871826171875, "logps/rejected": -1069.501708984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.04758358001709, "rewards/margins": 34.30439758300781, "rewards/rejected": -43.35198211669922, "step": 2569 }, { "epoch": 1.598755832037325, "grad_norm": 0.000554277969058603, "learning_rate": 2.5945136007376675e-06, "logits/chosen": -0.8610889315605164, "logits/rejected": 3.7429866790771484, "logps/chosen": -375.9157409667969, "logps/rejected": -974.5948486328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.605518341064453, "rewards/margins": 28.76226043701172, "rewards/rejected": -34.36778259277344, "step": 2570 }, { "epoch": 1.5993779160186625, "grad_norm": 3.883296813000925e-05, "learning_rate": 2.5933609958506228e-06, "logits/chosen": -2.023681640625, "logits/rejected": 4.468254566192627, "logps/chosen": -376.07720947265625, "logps/rejected": -1144.0804443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.148895263671875, "rewards/margins": 39.1790771484375, "rewards/rejected": -44.327972412109375, "step": 2571 }, { "epoch": 1.6, "grad_norm": 0.05322521552443504, "learning_rate": 2.592208390963578e-06, "logits/chosen": -2.8253278732299805, "logits/rejected": 2.3830480575561523, "logps/chosen": -418.5565185546875, "logps/rejected": -1005.6823120117188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.197831630706787, "rewards/margins": 26.426393508911133, "rewards/rejected": -29.624225616455078, "step": 2572 }, { "epoch": 1.6006220839813374, "grad_norm": 11.513082504272461, "learning_rate": 2.5910557860765332e-06, "logits/chosen": -1.173416018486023, "logits/rejected": 1.7636840343475342, "logps/chosen": -539.8458862304688, "logps/rejected": -1030.737060546875, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": -5.969292640686035, "rewards/margins": 24.555206298828125, "rewards/rejected": -30.524499893188477, "step": 2573 }, { "epoch": 1.601244167962675, "grad_norm": 0.017018593847751617, "learning_rate": 2.589903181189489e-06, "logits/chosen": -2.8795456886291504, "logits/rejected": 1.4251694679260254, "logps/chosen": -391.2342224121094, "logps/rejected": -964.24072265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.388416290283203, "rewards/margins": 28.631431579589844, "rewards/rejected": -35.01984405517578, "step": 2574 }, { "epoch": 1.6018662519440126, "grad_norm": 32.19961166381836, "learning_rate": 2.588750576302444e-06, "logits/chosen": -1.7024846076965332, "logits/rejected": 3.0102920532226562, "logps/chosen": -383.92584228515625, "logps/rejected": -823.80712890625, "loss": 0.7058, "rewards/accuracies": 0.875, "rewards/chosen": -6.452936172485352, "rewards/margins": 15.71203899383545, "rewards/rejected": -22.164974212646484, "step": 2575 }, { "epoch": 1.60248833592535, "grad_norm": 9.193151527142618e-06, "learning_rate": 2.5875979714153993e-06, "logits/chosen": -0.4175698757171631, "logits/rejected": 4.685636520385742, "logps/chosen": -467.2003173828125, "logps/rejected": -1059.797607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.230841159820557, "rewards/margins": 32.583526611328125, "rewards/rejected": -39.81436538696289, "step": 2576 }, { "epoch": 1.6031104199066875, "grad_norm": 0.6161825656890869, "learning_rate": 2.5864453665283545e-06, "logits/chosen": -0.6858171224594116, "logits/rejected": 2.824033737182617, "logps/chosen": -561.7140502929688, "logps/rejected": -998.8251342773438, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -4.853142738342285, "rewards/margins": 32.505619049072266, "rewards/rejected": -37.3587646484375, "step": 2577 }, { "epoch": 1.603732503888025, "grad_norm": 26.230451583862305, "learning_rate": 2.5852927616413098e-06, "logits/chosen": 0.6289217472076416, "logits/rejected": 3.2599964141845703, "logps/chosen": -524.7599487304688, "logps/rejected": -822.0818481445312, "loss": 0.1951, "rewards/accuracies": 0.875, "rewards/chosen": -6.978344917297363, "rewards/margins": 17.70925521850586, "rewards/rejected": -24.687599182128906, "step": 2578 }, { "epoch": 1.6043545878693624, "grad_norm": 4.318963320315561e-08, "learning_rate": 2.584140156754265e-06, "logits/chosen": -2.402571439743042, "logits/rejected": 3.2633743286132812, "logps/chosen": -379.7547302246094, "logps/rejected": -1089.806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.327663421630859, "rewards/margins": 37.135555267333984, "rewards/rejected": -44.463218688964844, "step": 2579 }, { "epoch": 1.6049766718506997, "grad_norm": 0.19019412994384766, "learning_rate": 2.58298755186722e-06, "logits/chosen": 2.633239984512329, "logits/rejected": 4.513375282287598, "logps/chosen": -677.1762084960938, "logps/rejected": -1026.538330078125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -11.649065017700195, "rewards/margins": 23.47222328186035, "rewards/rejected": -35.12129211425781, "step": 2580 }, { "epoch": 1.6055987558320375, "grad_norm": 0.10817208886146545, "learning_rate": 2.581834946980176e-06, "logits/chosen": -1.4973026514053345, "logits/rejected": 2.0379388332366943, "logps/chosen": -446.043212890625, "logps/rejected": -862.3677368164062, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.164411544799805, "rewards/margins": 23.581241607666016, "rewards/rejected": -30.74565315246582, "step": 2581 }, { "epoch": 1.6062208398133748, "grad_norm": 0.1274665892124176, "learning_rate": 2.580682342093131e-06, "logits/chosen": -0.8409813642501831, "logits/rejected": 2.7474489212036133, "logps/chosen": -393.14324951171875, "logps/rejected": -764.0924072265625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.663969039916992, "rewards/margins": 22.817779541015625, "rewards/rejected": -31.48175048828125, "step": 2582 }, { "epoch": 1.6068429237947122, "grad_norm": 0.03009512461721897, "learning_rate": 2.5795297372060863e-06, "logits/chosen": 0.9661927819252014, "logits/rejected": 3.4408862590789795, "logps/chosen": -397.1964416503906, "logps/rejected": -772.956787109375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.04042911529541, "rewards/margins": 22.691429138183594, "rewards/rejected": -28.731861114501953, "step": 2583 }, { "epoch": 1.6074650077760497, "grad_norm": 0.24635007977485657, "learning_rate": 2.5783771323190415e-06, "logits/chosen": -1.9829556941986084, "logits/rejected": 3.6001546382904053, "logps/chosen": -356.6353759765625, "logps/rejected": -788.8740234375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.533819198608398, "rewards/margins": 16.42125701904297, "rewards/rejected": -21.955076217651367, "step": 2584 }, { "epoch": 1.6080870917573873, "grad_norm": 29.279674530029297, "learning_rate": 2.5772245274319963e-06, "logits/chosen": 1.2748240232467651, "logits/rejected": 2.8580141067504883, "logps/chosen": -573.7198486328125, "logps/rejected": -938.4467163085938, "loss": 0.3834, "rewards/accuracies": 0.875, "rewards/chosen": -8.002640724182129, "rewards/margins": 25.643817901611328, "rewards/rejected": -33.646461486816406, "step": 2585 }, { "epoch": 1.6087091757387246, "grad_norm": 23.010272979736328, "learning_rate": 2.5760719225449516e-06, "logits/chosen": 1.743993878364563, "logits/rejected": 3.330801248550415, "logps/chosen": -648.9251098632812, "logps/rejected": -1052.299560546875, "loss": 0.2618, "rewards/accuracies": 0.875, "rewards/chosen": -8.065129280090332, "rewards/margins": 26.94137954711914, "rewards/rejected": -35.00651168823242, "step": 2586 }, { "epoch": 1.6093312597200622, "grad_norm": 1.394989226355392e-07, "learning_rate": 2.5749193176579068e-06, "logits/chosen": 2.410019636154175, "logits/rejected": 4.149812698364258, "logps/chosen": -608.6632690429688, "logps/rejected": -938.4107666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.392426490783691, "rewards/margins": 26.489665985107422, "rewards/rejected": -34.8820915222168, "step": 2587 }, { "epoch": 1.6099533437013998, "grad_norm": 0.0016567111015319824, "learning_rate": 2.573766712770862e-06, "logits/chosen": -1.261190414428711, "logits/rejected": 3.9460859298706055, "logps/chosen": -293.0774841308594, "logps/rejected": -949.5228271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.228933334350586, "rewards/margins": 29.283653259277344, "rewards/rejected": -35.51258850097656, "step": 2588 }, { "epoch": 1.610575427682737, "grad_norm": 0.02118654176592827, "learning_rate": 2.5726141078838172e-06, "logits/chosen": -0.7358344197273254, "logits/rejected": 3.6361420154571533, "logps/chosen": -437.2724609375, "logps/rejected": -1011.6895751953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.786621570587158, "rewards/margins": 28.29082679748535, "rewards/rejected": -36.077449798583984, "step": 2589 }, { "epoch": 1.6111975116640747, "grad_norm": 34.734352111816406, "learning_rate": 2.5714615029967725e-06, "logits/chosen": 1.2995983362197876, "logits/rejected": 3.4828128814697266, "logps/chosen": -684.9298095703125, "logps/rejected": -985.8946533203125, "loss": 0.2266, "rewards/accuracies": 0.875, "rewards/chosen": -8.939543724060059, "rewards/margins": 19.27617073059082, "rewards/rejected": -28.215713500976562, "step": 2590 }, { "epoch": 1.6118195956454122, "grad_norm": 0.23033146560192108, "learning_rate": 2.570308898109728e-06, "logits/chosen": -2.0494282245635986, "logits/rejected": 2.8577654361724854, "logps/chosen": -346.23577880859375, "logps/rejected": -907.2470092773438, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -3.1096112728118896, "rewards/margins": 27.401147842407227, "rewards/rejected": -30.510757446289062, "step": 2591 }, { "epoch": 1.6124416796267496, "grad_norm": 0.8053516745567322, "learning_rate": 2.5691562932226833e-06, "logits/chosen": -1.868981957435608, "logits/rejected": 3.8824474811553955, "logps/chosen": -460.58026123046875, "logps/rejected": -1142.6533203125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -7.4744157791137695, "rewards/margins": 29.066810607910156, "rewards/rejected": -36.541229248046875, "step": 2592 }, { "epoch": 1.6130637636080871, "grad_norm": 4.806786060333252, "learning_rate": 2.5680036883356386e-06, "logits/chosen": 1.3408687114715576, "logits/rejected": 0.943589448928833, "logps/chosen": -657.0845336914062, "logps/rejected": -760.260009765625, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -8.541203498840332, "rewards/margins": 18.67476463317871, "rewards/rejected": -27.215970993041992, "step": 2593 }, { "epoch": 1.6136858475894247, "grad_norm": 7.727682532276958e-05, "learning_rate": 2.5668510834485938e-06, "logits/chosen": -0.061612486839294434, "logits/rejected": 3.8385701179504395, "logps/chosen": -550.5499267578125, "logps/rejected": -1054.3939208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.10532808303833, "rewards/margins": 26.229053497314453, "rewards/rejected": -33.334381103515625, "step": 2594 }, { "epoch": 1.614307931570762, "grad_norm": 0.4082733392715454, "learning_rate": 2.565698478561549e-06, "logits/chosen": -1.452492117881775, "logits/rejected": 1.2722666263580322, "logps/chosen": -538.8363647460938, "logps/rejected": -978.8455200195312, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -11.732166290283203, "rewards/margins": 27.385570526123047, "rewards/rejected": -39.11773681640625, "step": 2595 }, { "epoch": 1.6149300155520996, "grad_norm": 0.04512008652091026, "learning_rate": 2.5645458736745042e-06, "logits/chosen": 0.16232812404632568, "logits/rejected": 2.1995935440063477, "logps/chosen": -695.5032348632812, "logps/rejected": -1052.70947265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.921582221984863, "rewards/margins": 25.841278076171875, "rewards/rejected": -38.762855529785156, "step": 2596 }, { "epoch": 1.6155520995334371, "grad_norm": 0.00561953941360116, "learning_rate": 2.5633932687874595e-06, "logits/chosen": 0.4279603064060211, "logits/rejected": 3.640723943710327, "logps/chosen": -489.49615478515625, "logps/rejected": -974.6058349609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.134950160980225, "rewards/margins": 29.59377098083496, "rewards/rejected": -34.728721618652344, "step": 2597 }, { "epoch": 1.6161741835147745, "grad_norm": 0.0004975904012098908, "learning_rate": 2.562240663900415e-06, "logits/chosen": 2.388624668121338, "logits/rejected": 3.4154815673828125, "logps/chosen": -653.7855224609375, "logps/rejected": -907.9586181640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.451631546020508, "rewards/margins": 22.841678619384766, "rewards/rejected": -33.293312072753906, "step": 2598 }, { "epoch": 1.6167962674961118, "grad_norm": 2.29254254469069e-11, "learning_rate": 2.5610880590133703e-06, "logits/chosen": -2.7268898487091064, "logits/rejected": 3.9294936656951904, "logps/chosen": -445.59332275390625, "logps/rejected": -1288.5296630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.320531845092773, "rewards/margins": 41.719940185546875, "rewards/rejected": -47.04046630859375, "step": 2599 }, { "epoch": 1.6174183514774496, "grad_norm": 0.00903357844799757, "learning_rate": 2.5599354541263256e-06, "logits/chosen": -2.188351631164551, "logits/rejected": 3.1762428283691406, "logps/chosen": -226.08502197265625, "logps/rejected": -817.2967529296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.105920791625977, "rewards/margins": 27.778106689453125, "rewards/rejected": -31.884029388427734, "step": 2600 }, { "epoch": 1.618040435458787, "grad_norm": 0.022304048761725426, "learning_rate": 2.5587828492392808e-06, "logits/chosen": -1.54436457157135, "logits/rejected": 3.401952028274536, "logps/chosen": -393.1194152832031, "logps/rejected": -939.4529418945312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.2694783210754395, "rewards/margins": 23.11556625366211, "rewards/rejected": -27.38504409790039, "step": 2601 }, { "epoch": 1.6186625194401243, "grad_norm": 0.47571098804473877, "learning_rate": 2.557630244352236e-06, "logits/chosen": 0.1825639009475708, "logits/rejected": 3.720527172088623, "logps/chosen": -609.8028564453125, "logps/rejected": -1145.288818359375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -7.5856475830078125, "rewards/margins": 31.391204833984375, "rewards/rejected": -38.97685241699219, "step": 2602 }, { "epoch": 1.6192846034214619, "grad_norm": 0.07786351442337036, "learning_rate": 2.5564776394651912e-06, "logits/chosen": 0.14193564653396606, "logits/rejected": 1.8101928234100342, "logps/chosen": -526.883056640625, "logps/rejected": -821.777587890625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.243754863739014, "rewards/margins": 23.589351654052734, "rewards/rejected": -29.833106994628906, "step": 2603 }, { "epoch": 1.6199066874027994, "grad_norm": 13.48662281036377, "learning_rate": 2.5553250345781465e-06, "logits/chosen": 0.29353073239326477, "logits/rejected": 1.9300470352172852, "logps/chosen": -537.5771484375, "logps/rejected": -801.8447265625, "loss": 0.0731, "rewards/accuracies": 1.0, "rewards/chosen": -8.729201316833496, "rewards/margins": 18.486591339111328, "rewards/rejected": -27.21579360961914, "step": 2604 }, { "epoch": 1.6205287713841368, "grad_norm": 14.752580642700195, "learning_rate": 2.554172429691102e-06, "logits/chosen": 0.7777003645896912, "logits/rejected": 3.255272388458252, "logps/chosen": -503.46063232421875, "logps/rejected": -869.5552978515625, "loss": 0.0813, "rewards/accuracies": 1.0, "rewards/chosen": -5.107442378997803, "rewards/margins": 21.075592041015625, "rewards/rejected": -26.183034896850586, "step": 2605 }, { "epoch": 1.6211508553654743, "grad_norm": 6.323744310066104e-05, "learning_rate": 2.5530198248040573e-06, "logits/chosen": 1.3028900623321533, "logits/rejected": 3.118048667907715, "logps/chosen": -682.3069458007812, "logps/rejected": -1054.4993896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.212747573852539, "rewards/margins": 24.12259292602539, "rewards/rejected": -35.3353385925293, "step": 2606 }, { "epoch": 1.6217729393468119, "grad_norm": 3.8181777000427246, "learning_rate": 2.5518672199170125e-06, "logits/chosen": 2.5419797897338867, "logits/rejected": 3.7514326572418213, "logps/chosen": -671.3524780273438, "logps/rejected": -910.8513793945312, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": -8.456694602966309, "rewards/margins": 18.08182144165039, "rewards/rejected": -26.538516998291016, "step": 2607 }, { "epoch": 1.6223950233281492, "grad_norm": 0.0002522819268051535, "learning_rate": 2.5507146150299678e-06, "logits/chosen": 3.79829478263855, "logits/rejected": 3.902897357940674, "logps/chosen": -660.8634033203125, "logps/rejected": -977.2813720703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.231990814208984, "rewards/margins": 29.680944442749023, "rewards/rejected": -37.91293716430664, "step": 2608 }, { "epoch": 1.6230171073094868, "grad_norm": 0.04555211216211319, "learning_rate": 2.549562010142923e-06, "logits/chosen": -1.7312321662902832, "logits/rejected": 0.7760089635848999, "logps/chosen": -510.12109375, "logps/rejected": -881.8781127929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.821007251739502, "rewards/margins": 24.35039520263672, "rewards/rejected": -29.171401977539062, "step": 2609 }, { "epoch": 1.6236391912908243, "grad_norm": 4.4340089933037063e-10, "learning_rate": 2.5484094052558782e-06, "logits/chosen": 1.1520129442214966, "logits/rejected": 5.368868350982666, "logps/chosen": -535.7080078125, "logps/rejected": -1051.622802734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.899171352386475, "rewards/margins": 31.874183654785156, "rewards/rejected": -38.77335739135742, "step": 2610 }, { "epoch": 1.6242612752721617, "grad_norm": 0.0008103394648060203, "learning_rate": 2.5472568003688334e-06, "logits/chosen": -0.27327293157577515, "logits/rejected": 3.8780975341796875, "logps/chosen": -451.706787109375, "logps/rejected": -1007.6486206054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.71866226196289, "rewards/margins": 31.797969818115234, "rewards/rejected": -40.516632080078125, "step": 2611 }, { "epoch": 1.6248833592534992, "grad_norm": 2.2073519229888916, "learning_rate": 2.546104195481789e-06, "logits/chosen": -1.2786414623260498, "logits/rejected": 2.140981674194336, "logps/chosen": -542.6721801757812, "logps/rejected": -953.30615234375, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -9.801692962646484, "rewards/margins": 23.804019927978516, "rewards/rejected": -33.605712890625, "step": 2612 }, { "epoch": 1.6255054432348368, "grad_norm": 0.0158245787024498, "learning_rate": 2.5449515905947443e-06, "logits/chosen": -0.201128751039505, "logits/rejected": 2.1382110118865967, "logps/chosen": -388.56170654296875, "logps/rejected": -689.6513061523438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.2910847663879395, "rewards/margins": 19.573322296142578, "rewards/rejected": -22.86440658569336, "step": 2613 }, { "epoch": 1.6261275272161742, "grad_norm": 0.0037320577539503574, "learning_rate": 2.5437989857076995e-06, "logits/chosen": -1.5668559074401855, "logits/rejected": 3.3568079471588135, "logps/chosen": -420.1706848144531, "logps/rejected": -986.5573120117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.247982025146484, "rewards/margins": 28.099079132080078, "rewards/rejected": -33.34706115722656, "step": 2614 }, { "epoch": 1.6267496111975117, "grad_norm": 0.9762111902236938, "learning_rate": 2.5426463808206548e-06, "logits/chosen": -0.7459107041358948, "logits/rejected": 3.1922781467437744, "logps/chosen": -568.686767578125, "logps/rejected": -1133.5242919921875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -9.407186508178711, "rewards/margins": 23.96875, "rewards/rejected": -33.375938415527344, "step": 2615 }, { "epoch": 1.6273716951788493, "grad_norm": 0.19420433044433594, "learning_rate": 2.54149377593361e-06, "logits/chosen": 0.6126387119293213, "logits/rejected": 2.9637625217437744, "logps/chosen": -562.302978515625, "logps/rejected": -1003.8385009765625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -10.56039047241211, "rewards/margins": 21.085357666015625, "rewards/rejected": -31.6457462310791, "step": 2616 }, { "epoch": 1.6279937791601866, "grad_norm": 0.011737792752683163, "learning_rate": 2.5403411710465652e-06, "logits/chosen": 2.8269572257995605, "logits/rejected": 3.9817934036254883, "logps/chosen": -689.129150390625, "logps/rejected": -986.1431884765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.855367660522461, "rewards/margins": 21.466367721557617, "rewards/rejected": -31.321735382080078, "step": 2617 }, { "epoch": 1.628615863141524, "grad_norm": 3.9934420585632324, "learning_rate": 2.5391885661595204e-06, "logits/chosen": 1.5089527368545532, "logits/rejected": 2.406381130218506, "logps/chosen": -523.5970458984375, "logps/rejected": -897.9669799804688, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": -7.596930027008057, "rewards/margins": 27.403316497802734, "rewards/rejected": -35.000244140625, "step": 2618 }, { "epoch": 1.6292379471228617, "grad_norm": 0.024353763088583946, "learning_rate": 2.5380359612724757e-06, "logits/chosen": 2.6592142581939697, "logits/rejected": 4.7618513107299805, "logps/chosen": -617.1805419921875, "logps/rejected": -967.5352783203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.64953327178955, "rewards/margins": 27.00077247619629, "rewards/rejected": -36.650306701660156, "step": 2619 }, { "epoch": 1.629860031104199, "grad_norm": 0.06505515426397324, "learning_rate": 2.5368833563854313e-06, "logits/chosen": 2.7376463413238525, "logits/rejected": 3.918849468231201, "logps/chosen": -727.3867797851562, "logps/rejected": -1086.908447265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.635740280151367, "rewards/margins": 27.26062774658203, "rewards/rejected": -38.896366119384766, "step": 2620 }, { "epoch": 1.6304821150855364, "grad_norm": 0.014469513669610023, "learning_rate": 2.5357307514983865e-06, "logits/chosen": -0.44161003828048706, "logits/rejected": 2.54278564453125, "logps/chosen": -645.5219116210938, "logps/rejected": -1192.2489013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.379627227783203, "rewards/margins": 30.334142684936523, "rewards/rejected": -39.71377182006836, "step": 2621 }, { "epoch": 1.631104199066874, "grad_norm": 0.0001569920714246109, "learning_rate": 2.5345781466113418e-06, "logits/chosen": 0.8897266387939453, "logits/rejected": 3.378631353378296, "logps/chosen": -578.5534057617188, "logps/rejected": -1006.815673828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.458053588867188, "rewards/margins": 26.728559494018555, "rewards/rejected": -35.186614990234375, "step": 2622 }, { "epoch": 1.6317262830482115, "grad_norm": 2.3231365048559383e-05, "learning_rate": 2.533425541724297e-06, "logits/chosen": -1.3021610975265503, "logits/rejected": 1.9647281169891357, "logps/chosen": -413.54571533203125, "logps/rejected": -846.652099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.324384689331055, "rewards/margins": 26.152280807495117, "rewards/rejected": -34.47666549682617, "step": 2623 }, { "epoch": 1.6323483670295489, "grad_norm": 0.0006543318158946931, "learning_rate": 2.5322729368372522e-06, "logits/chosen": 2.5640270709991455, "logits/rejected": 4.1576032638549805, "logps/chosen": -714.5624389648438, "logps/rejected": -1073.593017578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.278242111206055, "rewards/margins": 27.065113067626953, "rewards/rejected": -36.343353271484375, "step": 2624 }, { "epoch": 1.6329704510108864, "grad_norm": 0.13595524430274963, "learning_rate": 2.5311203319502074e-06, "logits/chosen": 1.7532628774642944, "logits/rejected": 3.9578118324279785, "logps/chosen": -714.988037109375, "logps/rejected": -1059.7442626953125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -12.232544898986816, "rewards/margins": 24.83914566040039, "rewards/rejected": -37.07168960571289, "step": 2625 }, { "epoch": 1.633592534992224, "grad_norm": 0.005432057660073042, "learning_rate": 2.5299677270631627e-06, "logits/chosen": -1.5166964530944824, "logits/rejected": 2.6830894947052, "logps/chosen": -349.53704833984375, "logps/rejected": -1010.3942260742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.998991966247559, "rewards/margins": 37.300437927246094, "rewards/rejected": -43.29943084716797, "step": 2626 }, { "epoch": 1.6342146189735614, "grad_norm": 0.2473265379667282, "learning_rate": 2.5288151221761183e-06, "logits/chosen": -2.057335615158081, "logits/rejected": 3.281980514526367, "logps/chosen": -425.85528564453125, "logps/rejected": -965.0843505859375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -10.48233699798584, "rewards/margins": 27.600360870361328, "rewards/rejected": -38.082698822021484, "step": 2627 }, { "epoch": 1.634836702954899, "grad_norm": 0.0459589809179306, "learning_rate": 2.5276625172890735e-06, "logits/chosen": 1.5313283205032349, "logits/rejected": 3.6381654739379883, "logps/chosen": -675.596923828125, "logps/rejected": -1058.517578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.262785911560059, "rewards/margins": 25.024362564086914, "rewards/rejected": -34.287147521972656, "step": 2628 }, { "epoch": 1.6354587869362365, "grad_norm": 15.045350074768066, "learning_rate": 2.5265099124020288e-06, "logits/chosen": -1.2002232074737549, "logits/rejected": 1.7366091012954712, "logps/chosen": -623.9976806640625, "logps/rejected": -1112.90966796875, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": -8.058524131774902, "rewards/margins": 33.36068344116211, "rewards/rejected": -41.419212341308594, "step": 2629 }, { "epoch": 1.6360808709175738, "grad_norm": 0.17426888644695282, "learning_rate": 2.525357307514984e-06, "logits/chosen": -0.4459741711616516, "logits/rejected": 1.9978004693984985, "logps/chosen": -593.0241088867188, "logps/rejected": -930.87646484375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -9.513236999511719, "rewards/margins": 23.07077407836914, "rewards/rejected": -32.58401107788086, "step": 2630 }, { "epoch": 1.6367029548989114, "grad_norm": 0.032606858760118484, "learning_rate": 2.5242047026279392e-06, "logits/chosen": 2.1993825435638428, "logits/rejected": 2.9777932167053223, "logps/chosen": -734.6822509765625, "logps/rejected": -1108.458984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.36622142791748, "rewards/margins": 29.997440338134766, "rewards/rejected": -42.36366271972656, "step": 2631 }, { "epoch": 1.637325038880249, "grad_norm": 0.2596385180950165, "learning_rate": 2.5230520977408944e-06, "logits/chosen": -0.051331907510757446, "logits/rejected": 4.525763988494873, "logps/chosen": -351.0687561035156, "logps/rejected": -884.1585693359375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.428440570831299, "rewards/margins": 27.59353256225586, "rewards/rejected": -31.02197265625, "step": 2632 }, { "epoch": 1.6379471228615863, "grad_norm": 2.970715045928955, "learning_rate": 2.5218994928538497e-06, "logits/chosen": -2.0356454849243164, "logits/rejected": 3.7598862648010254, "logps/chosen": -416.25897216796875, "logps/rejected": -1123.38330078125, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -6.475407600402832, "rewards/margins": 33.963050842285156, "rewards/rejected": -40.43846130371094, "step": 2633 }, { "epoch": 1.6385692068429238, "grad_norm": 0.053677089512348175, "learning_rate": 2.5207468879668053e-06, "logits/chosen": 3.2241604328155518, "logits/rejected": 5.152410507202148, "logps/chosen": -655.5747680664062, "logps/rejected": -1016.7356567382812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.689812660217285, "rewards/margins": 28.69457244873047, "rewards/rejected": -35.38438415527344, "step": 2634 }, { "epoch": 1.6391912908242614, "grad_norm": 29.179723739624023, "learning_rate": 2.5195942830797605e-06, "logits/chosen": 3.355369806289673, "logits/rejected": 4.142183303833008, "logps/chosen": -637.4825439453125, "logps/rejected": -853.29833984375, "loss": 0.2064, "rewards/accuracies": 0.875, "rewards/chosen": -9.102502822875977, "rewards/margins": 18.019689559936523, "rewards/rejected": -27.1221923828125, "step": 2635 }, { "epoch": 1.6398133748055987, "grad_norm": 2.7533092498779297, "learning_rate": 2.5184416781927158e-06, "logits/chosen": 0.7892777919769287, "logits/rejected": 2.5396130084991455, "logps/chosen": -529.934326171875, "logps/rejected": -895.4093017578125, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -9.260340690612793, "rewards/margins": 23.903886795043945, "rewards/rejected": -33.16422653198242, "step": 2636 }, { "epoch": 1.640435458786936, "grad_norm": 0.0008083415450528264, "learning_rate": 2.517289073305671e-06, "logits/chosen": -1.9454562664031982, "logits/rejected": 4.325829982757568, "logps/chosen": -366.5877990722656, "logps/rejected": -1101.1285400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.116681098937988, "rewards/margins": 35.933876037597656, "rewards/rejected": -41.050559997558594, "step": 2637 }, { "epoch": 1.6410575427682739, "grad_norm": 0.020745502784848213, "learning_rate": 2.516136468418626e-06, "logits/chosen": -0.43713444471359253, "logits/rejected": 4.419112205505371, "logps/chosen": -330.8953857421875, "logps/rejected": -810.3660888671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.58512020111084, "rewards/margins": 23.428924560546875, "rewards/rejected": -28.0140438079834, "step": 2638 }, { "epoch": 1.6416796267496112, "grad_norm": 1.2889726349385455e-05, "learning_rate": 2.5149838635315814e-06, "logits/chosen": -0.5015338659286499, "logits/rejected": 2.5819554328918457, "logps/chosen": -532.0968017578125, "logps/rejected": -1029.7630615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.803220748901367, "rewards/margins": 35.672523498535156, "rewards/rejected": -42.475738525390625, "step": 2639 }, { "epoch": 1.6423017107309485, "grad_norm": 0.001544899307191372, "learning_rate": 2.5138312586445367e-06, "logits/chosen": 0.0655108094215393, "logits/rejected": 3.8382391929626465, "logps/chosen": -442.2047119140625, "logps/rejected": -894.7894287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.998608589172363, "rewards/margins": 24.21469497680664, "rewards/rejected": -32.21330261230469, "step": 2640 }, { "epoch": 1.6429237947122861, "grad_norm": 0.00020783714717254043, "learning_rate": 2.512678653757492e-06, "logits/chosen": -3.7620840072631836, "logits/rejected": 3.1680679321289062, "logps/chosen": -407.64263916015625, "logps/rejected": -1092.920654296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.393203735351562, "rewards/margins": 35.05226135253906, "rewards/rejected": -43.445465087890625, "step": 2641 }, { "epoch": 1.6435458786936237, "grad_norm": 15.702348709106445, "learning_rate": 2.5115260488704475e-06, "logits/chosen": 1.081176519393921, "logits/rejected": 3.1471967697143555, "logps/chosen": -568.99365234375, "logps/rejected": -961.87890625, "loss": 0.085, "rewards/accuracies": 1.0, "rewards/chosen": -9.569413185119629, "rewards/margins": 24.506071090698242, "rewards/rejected": -34.07548522949219, "step": 2642 }, { "epoch": 1.644167962674961, "grad_norm": 1.2208596672280692e-05, "learning_rate": 2.5103734439834028e-06, "logits/chosen": -0.700672447681427, "logits/rejected": 1.970914602279663, "logps/chosen": -495.0232849121094, "logps/rejected": -855.750244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.2229461669921875, "rewards/margins": 29.222091674804688, "rewards/rejected": -35.445037841796875, "step": 2643 }, { "epoch": 1.6447900466562986, "grad_norm": 0.570807695388794, "learning_rate": 2.509220839096358e-06, "logits/chosen": 1.094158411026001, "logits/rejected": 1.5800312757492065, "logps/chosen": -552.4247436523438, "logps/rejected": -832.7394409179688, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -5.968006134033203, "rewards/margins": 25.20488166809082, "rewards/rejected": -31.172887802124023, "step": 2644 }, { "epoch": 1.6454121306376361, "grad_norm": 5.437176878331229e-05, "learning_rate": 2.508068234209313e-06, "logits/chosen": -0.7897230982780457, "logits/rejected": 3.430896282196045, "logps/chosen": -423.3076171875, "logps/rejected": -1002.4334716796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.253915309906006, "rewards/margins": 30.8887882232666, "rewards/rejected": -36.1427001953125, "step": 2645 }, { "epoch": 1.6460342146189735, "grad_norm": 22.25408172607422, "learning_rate": 2.5069156293222684e-06, "logits/chosen": -0.08154647052288055, "logits/rejected": 2.993760585784912, "logps/chosen": -472.1348876953125, "logps/rejected": -845.7042846679688, "loss": 0.1614, "rewards/accuracies": 0.875, "rewards/chosen": -8.82140064239502, "rewards/margins": 18.36347770690918, "rewards/rejected": -27.18487548828125, "step": 2646 }, { "epoch": 1.646656298600311, "grad_norm": 26.578590393066406, "learning_rate": 2.5057630244352237e-06, "logits/chosen": 0.6960867643356323, "logits/rejected": 4.30970573425293, "logps/chosen": -570.57421875, "logps/rejected": -988.6622924804688, "loss": 0.1848, "rewards/accuracies": 0.875, "rewards/chosen": -12.502416610717773, "rewards/margins": 23.228342056274414, "rewards/rejected": -35.73075866699219, "step": 2647 }, { "epoch": 1.6472783825816486, "grad_norm": 2.429492235183716, "learning_rate": 2.504610419548179e-06, "logits/chosen": 0.29884105920791626, "logits/rejected": 3.7831315994262695, "logps/chosen": -520.1424560546875, "logps/rejected": -938.1561279296875, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -7.274698257446289, "rewards/margins": 23.346614837646484, "rewards/rejected": -30.621315002441406, "step": 2648 }, { "epoch": 1.647900466562986, "grad_norm": 0.22774247825145721, "learning_rate": 2.5034578146611345e-06, "logits/chosen": -0.22361783683300018, "logits/rejected": 4.840500354766846, "logps/chosen": -470.9745178222656, "logps/rejected": -1212.837158203125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.539072036743164, "rewards/margins": 39.26922607421875, "rewards/rejected": -47.80830383300781, "step": 2649 }, { "epoch": 1.6485225505443235, "grad_norm": 0.0011797187617048621, "learning_rate": 2.5023052097740898e-06, "logits/chosen": 0.7142741680145264, "logits/rejected": 3.0429601669311523, "logps/chosen": -618.66357421875, "logps/rejected": -1096.073974609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.319652557373047, "rewards/margins": 34.73102569580078, "rewards/rejected": -44.05067825317383, "step": 2650 }, { "epoch": 1.649144634525661, "grad_norm": 26.476112365722656, "learning_rate": 2.501152604887045e-06, "logits/chosen": 2.0527851581573486, "logits/rejected": 4.556172847747803, "logps/chosen": -514.275146484375, "logps/rejected": -912.59033203125, "loss": 0.5968, "rewards/accuracies": 0.875, "rewards/chosen": -8.415748596191406, "rewards/margins": 24.877079010009766, "rewards/rejected": -33.292823791503906, "step": 2651 }, { "epoch": 1.6497667185069984, "grad_norm": 2.9838106632232666, "learning_rate": 2.5e-06, "logits/chosen": 1.1987992525100708, "logits/rejected": 2.6071865558624268, "logps/chosen": -553.8780517578125, "logps/rejected": -1002.5031127929688, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": -10.879100799560547, "rewards/margins": 28.60495376586914, "rewards/rejected": -39.48405456542969, "step": 2652 }, { "epoch": 1.650388802488336, "grad_norm": 4.720027936855331e-05, "learning_rate": 2.4988473951129554e-06, "logits/chosen": 0.23911744356155396, "logits/rejected": 4.47409725189209, "logps/chosen": -397.372802734375, "logps/rejected": -932.7152099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.55997085571289, "rewards/margins": 27.874908447265625, "rewards/rejected": -37.43488311767578, "step": 2653 }, { "epoch": 1.6510108864696735, "grad_norm": 9.448405265808105, "learning_rate": 2.4976947902259107e-06, "logits/chosen": 1.5364556312561035, "logits/rejected": 3.421743392944336, "logps/chosen": -592.2041015625, "logps/rejected": -891.9800415039062, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": -7.87186861038208, "rewards/margins": 19.73807716369629, "rewards/rejected": -27.60994529724121, "step": 2654 }, { "epoch": 1.6516329704510109, "grad_norm": 0.0007873836439102888, "learning_rate": 2.496542185338866e-06, "logits/chosen": 2.8756916522979736, "logits/rejected": 3.758998394012451, "logps/chosen": -642.5400390625, "logps/rejected": -866.63671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.303086280822754, "rewards/margins": 21.229276657104492, "rewards/rejected": -26.532363891601562, "step": 2655 }, { "epoch": 1.6522550544323482, "grad_norm": 1.6545012613278232e-06, "learning_rate": 2.4953895804518215e-06, "logits/chosen": -3.027890682220459, "logits/rejected": 2.6755690574645996, "logps/chosen": -334.79962158203125, "logps/rejected": -1057.5010986328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5030996799468994, "rewards/margins": 34.03456115722656, "rewards/rejected": -37.537662506103516, "step": 2656 }, { "epoch": 1.652877138413686, "grad_norm": 0.04323378950357437, "learning_rate": 2.4942369755647768e-06, "logits/chosen": 0.9792221784591675, "logits/rejected": 4.009556770324707, "logps/chosen": -543.080322265625, "logps/rejected": -979.8177490234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.486639022827148, "rewards/margins": 25.986400604248047, "rewards/rejected": -33.47304153442383, "step": 2657 }, { "epoch": 1.6534992223950233, "grad_norm": 23.02494239807129, "learning_rate": 2.493084370677732e-06, "logits/chosen": -3.052624464035034, "logits/rejected": 1.8419592380523682, "logps/chosen": -465.3800354003906, "logps/rejected": -1080.67822265625, "loss": 0.5405, "rewards/accuracies": 0.875, "rewards/chosen": -9.965513229370117, "rewards/margins": 28.879623413085938, "rewards/rejected": -38.84513854980469, "step": 2658 }, { "epoch": 1.6541213063763607, "grad_norm": 0.21242384612560272, "learning_rate": 2.491931765790687e-06, "logits/chosen": -0.4600183367729187, "logits/rejected": 3.5004982948303223, "logps/chosen": -311.2583312988281, "logps/rejected": -733.16455078125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.480689525604248, "rewards/margins": 22.707656860351562, "rewards/rejected": -28.188343048095703, "step": 2659 }, { "epoch": 1.6547433903576982, "grad_norm": 17.66877555847168, "learning_rate": 2.4907791609036424e-06, "logits/chosen": -1.8398863077163696, "logits/rejected": 2.277012348175049, "logps/chosen": -525.8780517578125, "logps/rejected": -1049.6021728515625, "loss": 0.0864, "rewards/accuracies": 1.0, "rewards/chosen": -10.22800350189209, "rewards/margins": 26.594133377075195, "rewards/rejected": -36.82213592529297, "step": 2660 }, { "epoch": 1.6553654743390358, "grad_norm": 0.014850149862468243, "learning_rate": 2.4896265560165977e-06, "logits/chosen": 1.2716033458709717, "logits/rejected": 4.249362945556641, "logps/chosen": -592.276611328125, "logps/rejected": -952.0533447265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.303014755249023, "rewards/margins": 26.30194091796875, "rewards/rejected": -35.604957580566406, "step": 2661 }, { "epoch": 1.6559875583203731, "grad_norm": 2.063870670099277e-05, "learning_rate": 2.488473951129553e-06, "logits/chosen": 1.3646904230117798, "logits/rejected": 3.710188388824463, "logps/chosen": -458.9728088378906, "logps/rejected": -885.9873046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.571175575256348, "rewards/margins": 28.973085403442383, "rewards/rejected": -37.54425811767578, "step": 2662 }, { "epoch": 1.6566096423017107, "grad_norm": 3.696420431137085, "learning_rate": 2.4873213462425085e-06, "logits/chosen": -0.5779656767845154, "logits/rejected": 2.220045328140259, "logps/chosen": -499.6197814941406, "logps/rejected": -768.114013671875, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -10.123802185058594, "rewards/margins": 11.481657028198242, "rewards/rejected": -21.60546112060547, "step": 2663 }, { "epoch": 1.6572317262830483, "grad_norm": 13.726298332214355, "learning_rate": 2.4861687413554637e-06, "logits/chosen": 2.0920047760009766, "logits/rejected": 2.3564300537109375, "logps/chosen": -493.8480224609375, "logps/rejected": -653.603271484375, "loss": 0.0679, "rewards/accuracies": 1.0, "rewards/chosen": -3.378700017929077, "rewards/margins": 18.358959197998047, "rewards/rejected": -21.737659454345703, "step": 2664 }, { "epoch": 1.6578538102643856, "grad_norm": 11.536592483520508, "learning_rate": 2.485016136468419e-06, "logits/chosen": 1.7251183986663818, "logits/rejected": 3.7293882369995117, "logps/chosen": -574.5499267578125, "logps/rejected": -829.0257568359375, "loss": 0.0732, "rewards/accuracies": 1.0, "rewards/chosen": -6.837594985961914, "rewards/margins": 15.651803970336914, "rewards/rejected": -22.489398956298828, "step": 2665 }, { "epoch": 1.6584758942457232, "grad_norm": 9.621564865112305, "learning_rate": 2.483863531581374e-06, "logits/chosen": -1.3720073699951172, "logits/rejected": 2.4826555252075195, "logps/chosen": -432.24261474609375, "logps/rejected": -852.5631103515625, "loss": 0.0731, "rewards/accuracies": 1.0, "rewards/chosen": -7.460178375244141, "rewards/margins": 20.206809997558594, "rewards/rejected": -27.666988372802734, "step": 2666 }, { "epoch": 1.6590979782270607, "grad_norm": 12.04741096496582, "learning_rate": 2.4827109266943294e-06, "logits/chosen": -0.24524717032909393, "logits/rejected": 4.3620710372924805, "logps/chosen": -528.10302734375, "logps/rejected": -1026.803466796875, "loss": 0.101, "rewards/accuracies": 0.875, "rewards/chosen": -9.096864700317383, "rewards/margins": 21.553415298461914, "rewards/rejected": -30.650278091430664, "step": 2667 }, { "epoch": 1.659720062208398, "grad_norm": 0.01621832698583603, "learning_rate": 2.4815583218072846e-06, "logits/chosen": 0.9092066287994385, "logits/rejected": 4.583636283874512, "logps/chosen": -399.865966796875, "logps/rejected": -795.4120483398438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.871487617492676, "rewards/margins": 22.655391693115234, "rewards/rejected": -28.526880264282227, "step": 2668 }, { "epoch": 1.6603421461897356, "grad_norm": 0.0032031771261245012, "learning_rate": 2.48040571692024e-06, "logits/chosen": 1.1127208471298218, "logits/rejected": 2.875021457672119, "logps/chosen": -594.210693359375, "logps/rejected": -1003.0892944335938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.634833335876465, "rewards/margins": 27.350683212280273, "rewards/rejected": -37.98551559448242, "step": 2669 }, { "epoch": 1.6609642301710732, "grad_norm": 2.2323959569803264e-07, "learning_rate": 2.479253112033195e-06, "logits/chosen": -0.4398040771484375, "logits/rejected": 2.9848246574401855, "logps/chosen": -536.8930053710938, "logps/rejected": -1051.15185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.939126968383789, "rewards/margins": 30.918827056884766, "rewards/rejected": -37.85795593261719, "step": 2670 }, { "epoch": 1.6615863141524105, "grad_norm": 1.4484525918960571, "learning_rate": 2.4781005071461507e-06, "logits/chosen": -0.9866859316825867, "logits/rejected": 3.3200957775115967, "logps/chosen": -473.4273681640625, "logps/rejected": -996.3983154296875, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -6.616700172424316, "rewards/margins": 25.130720138549805, "rewards/rejected": -31.747419357299805, "step": 2671 }, { "epoch": 1.662208398133748, "grad_norm": 0.07499787956476212, "learning_rate": 2.476947902259106e-06, "logits/chosen": 0.25226056575775146, "logits/rejected": 4.216619968414307, "logps/chosen": -366.23028564453125, "logps/rejected": -829.1866455078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.613929748535156, "rewards/margins": 22.871925354003906, "rewards/rejected": -27.485855102539062, "step": 2672 }, { "epoch": 1.6628304821150857, "grad_norm": 8.993841038318351e-05, "learning_rate": 2.475795297372061e-06, "logits/chosen": -1.5489999055862427, "logits/rejected": 2.7130203247070312, "logps/chosen": -463.1144714355469, "logps/rejected": -911.884765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.4509150981903076, "rewards/margins": 21.476350784301758, "rewards/rejected": -24.927265167236328, "step": 2673 }, { "epoch": 1.663452566096423, "grad_norm": 18.458757400512695, "learning_rate": 2.4746426924850164e-06, "logits/chosen": 1.1556620597839355, "logits/rejected": 2.756657838821411, "logps/chosen": -557.0864868164062, "logps/rejected": -773.4407348632812, "loss": 0.1201, "rewards/accuracies": 0.875, "rewards/chosen": -3.7018070220947266, "rewards/margins": 15.727760314941406, "rewards/rejected": -19.4295654296875, "step": 2674 }, { "epoch": 1.6640746500777603, "grad_norm": 14.458096504211426, "learning_rate": 2.4734900875979716e-06, "logits/chosen": 2.844008684158325, "logits/rejected": 2.5082483291625977, "logps/chosen": -716.7899169921875, "logps/rejected": -886.850341796875, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": -8.73927116394043, "rewards/margins": 21.637420654296875, "rewards/rejected": -30.376689910888672, "step": 2675 }, { "epoch": 1.6646967340590981, "grad_norm": 9.570556640625, "learning_rate": 2.472337482710927e-06, "logits/chosen": 2.666499614715576, "logits/rejected": 5.031964302062988, "logps/chosen": -681.7529907226562, "logps/rejected": -1033.83203125, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": -9.99263858795166, "rewards/margins": 21.586830139160156, "rewards/rejected": -31.579469680786133, "step": 2676 }, { "epoch": 1.6653188180404355, "grad_norm": 0.020050466060638428, "learning_rate": 2.471184877823882e-06, "logits/chosen": -0.3638615012168884, "logits/rejected": 3.2650585174560547, "logps/chosen": -377.27508544921875, "logps/rejected": -859.4393920898438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.265905380249023, "rewards/margins": 25.773866653442383, "rewards/rejected": -31.039772033691406, "step": 2677 }, { "epoch": 1.6659409020217728, "grad_norm": 1.8163225945500017e-07, "learning_rate": 2.4700322729368377e-06, "logits/chosen": -0.49591100215911865, "logits/rejected": 3.8754587173461914, "logps/chosen": -543.1971435546875, "logps/rejected": -1206.58251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.626020431518555, "rewards/margins": 37.646209716796875, "rewards/rejected": -50.27223205566406, "step": 2678 }, { "epoch": 1.6665629860031104, "grad_norm": 0.0008349265553988516, "learning_rate": 2.468879668049793e-06, "logits/chosen": 0.5656048059463501, "logits/rejected": 0.990628719329834, "logps/chosen": -471.417236328125, "logps/rejected": -761.2012329101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.046195983886719, "rewards/margins": 21.776756286621094, "rewards/rejected": -29.822952270507812, "step": 2679 }, { "epoch": 1.667185069984448, "grad_norm": 0.4058059751987457, "learning_rate": 2.467727063162748e-06, "logits/chosen": -0.5524758696556091, "logits/rejected": 2.338270664215088, "logps/chosen": -484.1187744140625, "logps/rejected": -918.57861328125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -4.968391418457031, "rewards/margins": 26.87673568725586, "rewards/rejected": -31.84512710571289, "step": 2680 }, { "epoch": 1.6678071539657853, "grad_norm": 0.0031695840880274773, "learning_rate": 2.4665744582757034e-06, "logits/chosen": 0.9984191656112671, "logits/rejected": 3.44390606880188, "logps/chosen": -575.0689086914062, "logps/rejected": -1073.3946533203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.461224555969238, "rewards/margins": 33.61181640625, "rewards/rejected": -38.07304000854492, "step": 2681 }, { "epoch": 1.6684292379471228, "grad_norm": 1.2835079132855753e-07, "learning_rate": 2.4654218533886586e-06, "logits/chosen": 0.9636375904083252, "logits/rejected": 3.6253345012664795, "logps/chosen": -727.8519287109375, "logps/rejected": -1174.5240478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.085047721862793, "rewards/margins": 32.849456787109375, "rewards/rejected": -41.93450164794922, "step": 2682 }, { "epoch": 1.6690513219284604, "grad_norm": 0.014250471256673336, "learning_rate": 2.464269248501614e-06, "logits/chosen": -0.09940612316131592, "logits/rejected": 3.5841808319091797, "logps/chosen": -389.5401611328125, "logps/rejected": -736.5833740234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.445569038391113, "rewards/margins": 18.353742599487305, "rewards/rejected": -22.799312591552734, "step": 2683 }, { "epoch": 1.6696734059097977, "grad_norm": 0.008063657209277153, "learning_rate": 2.463116643614569e-06, "logits/chosen": 0.6216780543327332, "logits/rejected": 0.8254268765449524, "logps/chosen": -548.0035400390625, "logps/rejected": -790.623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.906831741333008, "rewards/margins": 23.314350128173828, "rewards/rejected": -32.22118377685547, "step": 2684 }, { "epoch": 1.6702954898911353, "grad_norm": 0.0015282687963917851, "learning_rate": 2.4619640387275247e-06, "logits/chosen": 0.9856696128845215, "logits/rejected": 4.668768882751465, "logps/chosen": -539.461669921875, "logps/rejected": -1043.248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.886101722717285, "rewards/margins": 27.577598571777344, "rewards/rejected": -36.46370315551758, "step": 2685 }, { "epoch": 1.6709175738724729, "grad_norm": 4.341231822967529, "learning_rate": 2.4608114338404795e-06, "logits/chosen": 1.3563194274902344, "logits/rejected": 3.6629714965820312, "logps/chosen": -544.7206420898438, "logps/rejected": -966.00830078125, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": -12.258726119995117, "rewards/margins": 26.233715057373047, "rewards/rejected": -38.4924430847168, "step": 2686 }, { "epoch": 1.6715396578538102, "grad_norm": 0.00014986857422627509, "learning_rate": 2.4596588289534348e-06, "logits/chosen": -1.0385351181030273, "logits/rejected": 3.6127541065216064, "logps/chosen": -399.8589172363281, "logps/rejected": -1015.7762451171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.090690612792969, "rewards/margins": 34.113670349121094, "rewards/rejected": -40.20436096191406, "step": 2687 }, { "epoch": 1.6721617418351478, "grad_norm": 0.012877055443823338, "learning_rate": 2.45850622406639e-06, "logits/chosen": 0.6718133091926575, "logits/rejected": 2.5280394554138184, "logps/chosen": -479.4302978515625, "logps/rejected": -812.8926391601562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.698501586914062, "rewards/margins": 20.51024055480957, "rewards/rejected": -29.208742141723633, "step": 2688 }, { "epoch": 1.6727838258164853, "grad_norm": 0.006198279093950987, "learning_rate": 2.4573536191793452e-06, "logits/chosen": 0.344510555267334, "logits/rejected": 3.332859992980957, "logps/chosen": -612.3021240234375, "logps/rejected": -1001.0293579101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.828897476196289, "rewards/margins": 25.03092384338379, "rewards/rejected": -34.85982131958008, "step": 2689 }, { "epoch": 1.6734059097978227, "grad_norm": 10.438737869262695, "learning_rate": 2.456201014292301e-06, "logits/chosen": -2.088998794555664, "logits/rejected": 4.074522972106934, "logps/chosen": -356.7595520019531, "logps/rejected": -990.0142211914062, "loss": 0.06, "rewards/accuracies": 1.0, "rewards/chosen": -4.425658226013184, "rewards/margins": 36.38076400756836, "rewards/rejected": -40.80641555786133, "step": 2690 }, { "epoch": 1.6740279937791602, "grad_norm": 0.02665124461054802, "learning_rate": 2.455048409405256e-06, "logits/chosen": 0.4028273820877075, "logits/rejected": 1.4046893119812012, "logps/chosen": -633.2741088867188, "logps/rejected": -923.4190673828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.680094242095947, "rewards/margins": 26.50547981262207, "rewards/rejected": -34.18557357788086, "step": 2691 }, { "epoch": 1.6746500777604978, "grad_norm": 0.074191614985466, "learning_rate": 2.4538958045182113e-06, "logits/chosen": -1.7957860231399536, "logits/rejected": 2.5487489700317383, "logps/chosen": -488.90869140625, "logps/rejected": -1007.2379760742188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.99670696258545, "rewards/margins": 27.577075958251953, "rewards/rejected": -36.57378387451172, "step": 2692 }, { "epoch": 1.6752721617418351, "grad_norm": 45.34721374511719, "learning_rate": 2.4527431996311665e-06, "logits/chosen": 2.733384132385254, "logits/rejected": 4.8411126136779785, "logps/chosen": -693.03857421875, "logps/rejected": -1121.6904296875, "loss": 0.3278, "rewards/accuracies": 0.875, "rewards/chosen": -7.350863456726074, "rewards/margins": 31.43549346923828, "rewards/rejected": -38.78635787963867, "step": 2693 }, { "epoch": 1.6758942457231725, "grad_norm": 0.1667236089706421, "learning_rate": 2.4515905947441218e-06, "logits/chosen": -1.2241672277450562, "logits/rejected": 3.4006261825561523, "logps/chosen": -472.51177978515625, "logps/rejected": -1073.1248779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.828851222991943, "rewards/margins": 29.12144660949707, "rewards/rejected": -36.950294494628906, "step": 2694 }, { "epoch": 1.6765163297045103, "grad_norm": 4.75455162813887e-05, "learning_rate": 2.450437989857077e-06, "logits/chosen": -0.4841141402721405, "logits/rejected": 1.1751519441604614, "logps/chosen": -469.4749755859375, "logps/rejected": -779.6455688476562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.378645896911621, "rewards/margins": 25.129650115966797, "rewards/rejected": -31.5082950592041, "step": 2695 }, { "epoch": 1.6771384136858476, "grad_norm": 0.00032651741639710963, "learning_rate": 2.449285384970032e-06, "logits/chosen": -0.706736147403717, "logits/rejected": 3.2579140663146973, "logps/chosen": -420.28155517578125, "logps/rejected": -1104.028564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.8741631507873535, "rewards/margins": 39.49601364135742, "rewards/rejected": -42.37017822265625, "step": 2696 }, { "epoch": 1.677760497667185, "grad_norm": 0.0009474342223256826, "learning_rate": 2.448132780082988e-06, "logits/chosen": 0.9678953289985657, "logits/rejected": 3.6299071311950684, "logps/chosen": -473.0794372558594, "logps/rejected": -827.8023681640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.318397521972656, "rewards/margins": 26.759435653686523, "rewards/rejected": -35.07783508300781, "step": 2697 }, { "epoch": 1.6783825816485225, "grad_norm": 1.3759820376435528e-06, "learning_rate": 2.446980175195943e-06, "logits/chosen": -0.22475984692573547, "logits/rejected": 4.250744819641113, "logps/chosen": -539.4481201171875, "logps/rejected": -1178.7088623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.235435485839844, "rewards/margins": 36.99909210205078, "rewards/rejected": -46.234527587890625, "step": 2698 }, { "epoch": 1.67900466562986, "grad_norm": 0.0020632497034966946, "learning_rate": 2.4458275703088983e-06, "logits/chosen": -0.722686767578125, "logits/rejected": 1.4689903259277344, "logps/chosen": -516.1554565429688, "logps/rejected": -869.0540771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.267215728759766, "rewards/margins": 19.979875564575195, "rewards/rejected": -28.247089385986328, "step": 2699 }, { "epoch": 1.6796267496111974, "grad_norm": 0.0010660639964044094, "learning_rate": 2.4446749654218535e-06, "logits/chosen": -1.3368785381317139, "logits/rejected": 2.3945975303649902, "logps/chosen": -339.3318176269531, "logps/rejected": -753.3385009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.133460998535156, "rewards/margins": 23.696168899536133, "rewards/rejected": -30.829631805419922, "step": 2700 }, { "epoch": 1.680248833592535, "grad_norm": 2.6615917682647705, "learning_rate": 2.4435223605348088e-06, "logits/chosen": -0.7420621514320374, "logits/rejected": 3.047687530517578, "logps/chosen": -508.665283203125, "logps/rejected": -948.4867553710938, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": -8.16337776184082, "rewards/margins": 25.176620483398438, "rewards/rejected": -33.339996337890625, "step": 2701 }, { "epoch": 1.6808709175738725, "grad_norm": 0.4478207230567932, "learning_rate": 2.442369755647764e-06, "logits/chosen": -2.0094776153564453, "logits/rejected": 4.143111228942871, "logps/chosen": -497.0942077636719, "logps/rejected": -1188.542724609375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -8.448010444641113, "rewards/margins": 30.105907440185547, "rewards/rejected": -38.553916931152344, "step": 2702 }, { "epoch": 1.6814930015552099, "grad_norm": 21.752195358276367, "learning_rate": 2.441217150760719e-06, "logits/chosen": 0.8991308212280273, "logits/rejected": 4.451169013977051, "logps/chosen": -482.6705627441406, "logps/rejected": -925.1748046875, "loss": 0.253, "rewards/accuracies": 0.875, "rewards/chosen": -5.439286231994629, "rewards/margins": 27.2860050201416, "rewards/rejected": -32.72529220581055, "step": 2703 }, { "epoch": 1.6821150855365474, "grad_norm": 0.10334199666976929, "learning_rate": 2.4400645458736744e-06, "logits/chosen": -1.01015305519104, "logits/rejected": 2.3958823680877686, "logps/chosen": -373.493896484375, "logps/rejected": -759.80517578125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -6.089910507202148, "rewards/margins": 15.053412437438965, "rewards/rejected": -21.14332389831543, "step": 2704 }, { "epoch": 1.682737169517885, "grad_norm": 3.677429676055908, "learning_rate": 2.43891194098663e-06, "logits/chosen": 2.817261219024658, "logits/rejected": 5.3078293800354, "logps/chosen": -631.369140625, "logps/rejected": -1031.2825927734375, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -9.647318840026855, "rewards/margins": 25.45352554321289, "rewards/rejected": -35.10084533691406, "step": 2705 }, { "epoch": 1.6833592534992223, "grad_norm": 0.00020113641221541911, "learning_rate": 2.4377593360995853e-06, "logits/chosen": -1.9404137134552002, "logits/rejected": 1.4332096576690674, "logps/chosen": -344.89678955078125, "logps/rejected": -818.8267822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.624368190765381, "rewards/margins": 24.991334915161133, "rewards/rejected": -32.61570358276367, "step": 2706 }, { "epoch": 1.68398133748056, "grad_norm": 33.51328659057617, "learning_rate": 2.4366067312125405e-06, "logits/chosen": 0.8392902612686157, "logits/rejected": 3.4847161769866943, "logps/chosen": -521.1593017578125, "logps/rejected": -924.17236328125, "loss": 0.5981, "rewards/accuracies": 0.875, "rewards/chosen": -7.168278694152832, "rewards/margins": 27.528364181518555, "rewards/rejected": -34.6966438293457, "step": 2707 }, { "epoch": 1.6846034214618975, "grad_norm": 9.50869591398451e-12, "learning_rate": 2.4354541263254958e-06, "logits/chosen": -1.568687915802002, "logits/rejected": 4.22953462600708, "logps/chosen": -385.8027038574219, "logps/rejected": -1056.0511474609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.476365566253662, "rewards/margins": 36.176177978515625, "rewards/rejected": -40.65254211425781, "step": 2708 }, { "epoch": 1.6852255054432348, "grad_norm": 4.177229881286621, "learning_rate": 2.434301521438451e-06, "logits/chosen": -2.9155402183532715, "logits/rejected": -0.22680601477622986, "logps/chosen": -460.20953369140625, "logps/rejected": -860.6627807617188, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": -8.720498085021973, "rewards/margins": 23.418338775634766, "rewards/rejected": -32.13883972167969, "step": 2709 }, { "epoch": 1.6858475894245724, "grad_norm": 0.007003345992416143, "learning_rate": 2.433148916551406e-06, "logits/chosen": -2.622126579284668, "logits/rejected": 2.777906656265259, "logps/chosen": -300.8578186035156, "logps/rejected": -924.3973388671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.914675712585449, "rewards/margins": 33.299461364746094, "rewards/rejected": -41.214141845703125, "step": 2710 }, { "epoch": 1.68646967340591, "grad_norm": 8.189059257507324, "learning_rate": 2.4319963116643614e-06, "logits/chosen": -1.0699472427368164, "logits/rejected": 2.6137197017669678, "logps/chosen": -341.54754638671875, "logps/rejected": -804.4307861328125, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": -6.407834529876709, "rewards/margins": 24.09420394897461, "rewards/rejected": -30.502038955688477, "step": 2711 }, { "epoch": 1.6870917573872473, "grad_norm": 1.527038335800171, "learning_rate": 2.430843706777317e-06, "logits/chosen": 0.6405558586120605, "logits/rejected": 3.023784875869751, "logps/chosen": -500.26556396484375, "logps/rejected": -883.9600219726562, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -8.72463607788086, "rewards/margins": 21.619857788085938, "rewards/rejected": -30.344491958618164, "step": 2712 }, { "epoch": 1.6877138413685846, "grad_norm": 4.083947715116665e-05, "learning_rate": 2.4296911018902723e-06, "logits/chosen": -1.9219568967819214, "logits/rejected": 4.124215126037598, "logps/chosen": -245.7783660888672, "logps/rejected": -898.5387573242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.52880859375, "rewards/margins": 25.843339920043945, "rewards/rejected": -28.372146606445312, "step": 2713 }, { "epoch": 1.6883359253499224, "grad_norm": 0.034476667642593384, "learning_rate": 2.4285384970032275e-06, "logits/chosen": -0.8637582659721375, "logits/rejected": 3.500040054321289, "logps/chosen": -500.3390197753906, "logps/rejected": -1043.5906982421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.800796508789062, "rewards/margins": 25.491060256958008, "rewards/rejected": -34.29185485839844, "step": 2714 }, { "epoch": 1.6889580093312597, "grad_norm": 0.021597065031528473, "learning_rate": 2.4273858921161828e-06, "logits/chosen": 0.38649308681488037, "logits/rejected": 3.550788402557373, "logps/chosen": -556.5277099609375, "logps/rejected": -993.5703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.913545608520508, "rewards/margins": 27.199634552001953, "rewards/rejected": -33.113182067871094, "step": 2715 }, { "epoch": 1.689580093312597, "grad_norm": 1.3178468179830816e-05, "learning_rate": 2.426233287229138e-06, "logits/chosen": 0.597633421421051, "logits/rejected": 3.9349923133850098, "logps/chosen": -514.322265625, "logps/rejected": -1022.6780395507812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.5213823318481445, "rewards/margins": 34.16990280151367, "rewards/rejected": -41.691287994384766, "step": 2716 }, { "epoch": 1.6902021772939346, "grad_norm": 0.0011442669201642275, "learning_rate": 2.425080682342093e-06, "logits/chosen": -2.8341634273529053, "logits/rejected": 0.4455097019672394, "logps/chosen": -428.048828125, "logps/rejected": -1065.8172607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.660099983215332, "rewards/margins": 35.430763244628906, "rewards/rejected": -46.09086227416992, "step": 2717 }, { "epoch": 1.6908242612752722, "grad_norm": 33.787418365478516, "learning_rate": 2.4239280774550484e-06, "logits/chosen": -2.904046058654785, "logits/rejected": 4.311407566070557, "logps/chosen": -310.52142333984375, "logps/rejected": -1213.693115234375, "loss": 0.4385, "rewards/accuracies": 0.875, "rewards/chosen": -4.004347801208496, "rewards/margins": 42.72428512573242, "rewards/rejected": -46.728633880615234, "step": 2718 }, { "epoch": 1.6914463452566095, "grad_norm": 0.011313035152852535, "learning_rate": 2.422775472568004e-06, "logits/chosen": 1.6661875247955322, "logits/rejected": 4.304853439331055, "logps/chosen": -686.5845947265625, "logps/rejected": -1155.77490234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.166810989379883, "rewards/margins": 20.23712921142578, "rewards/rejected": -28.40393829345703, "step": 2719 }, { "epoch": 1.692068429237947, "grad_norm": 0.2662597894668579, "learning_rate": 2.4216228676809593e-06, "logits/chosen": -1.647039532661438, "logits/rejected": 3.3279147148132324, "logps/chosen": -423.85504150390625, "logps/rejected": -987.908447265625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -6.690751075744629, "rewards/margins": 33.81493377685547, "rewards/rejected": -40.50568389892578, "step": 2720 }, { "epoch": 1.6926905132192847, "grad_norm": 32.57624816894531, "learning_rate": 2.4204702627939145e-06, "logits/chosen": 2.2442877292633057, "logits/rejected": 4.182276248931885, "logps/chosen": -627.66650390625, "logps/rejected": -963.42333984375, "loss": 0.571, "rewards/accuracies": 0.875, "rewards/chosen": -10.049556732177734, "rewards/margins": 19.074588775634766, "rewards/rejected": -29.1241455078125, "step": 2721 }, { "epoch": 1.693312597200622, "grad_norm": 7.951205770950764e-05, "learning_rate": 2.4193176579068697e-06, "logits/chosen": -0.030267775058746338, "logits/rejected": 3.133117914199829, "logps/chosen": -561.3271484375, "logps/rejected": -1074.7281494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.656089782714844, "rewards/margins": 34.97183609008789, "rewards/rejected": -44.627925872802734, "step": 2722 }, { "epoch": 1.6939346811819596, "grad_norm": 2.12374210357666, "learning_rate": 2.418165053019825e-06, "logits/chosen": -1.011715054512024, "logits/rejected": 4.0569586753845215, "logps/chosen": -334.70367431640625, "logps/rejected": -818.4017333984375, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -5.086916923522949, "rewards/margins": 20.90202522277832, "rewards/rejected": -25.988941192626953, "step": 2723 }, { "epoch": 1.6945567651632971, "grad_norm": 0.5422797799110413, "learning_rate": 2.41701244813278e-06, "logits/chosen": 2.399137020111084, "logits/rejected": 3.993281602859497, "logps/chosen": -638.6915283203125, "logps/rejected": -1012.2086181640625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -8.717671394348145, "rewards/margins": 25.65497398376465, "rewards/rejected": -34.372642517089844, "step": 2724 }, { "epoch": 1.6951788491446345, "grad_norm": 0.0035224133171141148, "learning_rate": 2.4158598432457354e-06, "logits/chosen": 2.443415641784668, "logits/rejected": 3.793722629547119, "logps/chosen": -642.5726318359375, "logps/rejected": -1068.98974609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.700204849243164, "rewards/margins": 30.676729202270508, "rewards/rejected": -38.37693405151367, "step": 2725 }, { "epoch": 1.695800933125972, "grad_norm": 0.0012602354399859905, "learning_rate": 2.414707238358691e-06, "logits/chosen": -2.665707588195801, "logits/rejected": 3.0817317962646484, "logps/chosen": -313.6740417480469, "logps/rejected": -992.2149658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.802724838256836, "rewards/margins": 29.130760192871094, "rewards/rejected": -36.9334831237793, "step": 2726 }, { "epoch": 1.6964230171073096, "grad_norm": 0.08769652247428894, "learning_rate": 2.4135546334716463e-06, "logits/chosen": 2.0320892333984375, "logits/rejected": 4.716381549835205, "logps/chosen": -652.5599975585938, "logps/rejected": -1002.2882080078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.021942138671875, "rewards/margins": 24.541967391967773, "rewards/rejected": -35.56391143798828, "step": 2727 }, { "epoch": 1.697045101088647, "grad_norm": 0.00015202997019514441, "learning_rate": 2.4124020285846015e-06, "logits/chosen": -0.2173956036567688, "logits/rejected": 3.3320794105529785, "logps/chosen": -304.7808532714844, "logps/rejected": -714.9826049804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.9645328521728516, "rewards/margins": 22.323930740356445, "rewards/rejected": -26.288463592529297, "step": 2728 }, { "epoch": 1.6976671850699845, "grad_norm": 5.48606014251709, "learning_rate": 2.4112494236975567e-06, "logits/chosen": 2.7743520736694336, "logits/rejected": 2.2740674018859863, "logps/chosen": -813.4549560546875, "logps/rejected": -881.2877197265625, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -14.876110076904297, "rewards/margins": 16.42871856689453, "rewards/rejected": -31.304828643798828, "step": 2729 }, { "epoch": 1.698289269051322, "grad_norm": 0.05634181573987007, "learning_rate": 2.410096818810512e-06, "logits/chosen": 0.47589847445487976, "logits/rejected": 4.360468864440918, "logps/chosen": -400.2265625, "logps/rejected": -927.064453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.384169578552246, "rewards/margins": 31.364051818847656, "rewards/rejected": -40.74821853637695, "step": 2730 }, { "epoch": 1.6989113530326594, "grad_norm": 38.13288497924805, "learning_rate": 2.408944213923467e-06, "logits/chosen": -0.2454141080379486, "logits/rejected": 1.3477165699005127, "logps/chosen": -612.4234619140625, "logps/rejected": -934.793212890625, "loss": 0.6544, "rewards/accuracies": 0.875, "rewards/chosen": -13.737934112548828, "rewards/margins": 18.501102447509766, "rewards/rejected": -32.239036560058594, "step": 2731 }, { "epoch": 1.6995334370139967, "grad_norm": 0.011911272071301937, "learning_rate": 2.4077916090364224e-06, "logits/chosen": 2.55433988571167, "logits/rejected": 4.622102737426758, "logps/chosen": -639.3505249023438, "logps/rejected": -1007.958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.38956069946289, "rewards/margins": 23.871074676513672, "rewards/rejected": -33.26063537597656, "step": 2732 }, { "epoch": 1.7001555209953345, "grad_norm": 4.579018059303053e-05, "learning_rate": 2.4066390041493776e-06, "logits/chosen": 1.351335883140564, "logits/rejected": 4.477890968322754, "logps/chosen": -523.9299926757812, "logps/rejected": -966.3905029296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.234797477722168, "rewards/margins": 29.752262115478516, "rewards/rejected": -38.987060546875, "step": 2733 }, { "epoch": 1.7007776049766719, "grad_norm": 13.588505744934082, "learning_rate": 2.4054863992623333e-06, "logits/chosen": 1.3708151578903198, "logits/rejected": 2.2401483058929443, "logps/chosen": -545.5647583007812, "logps/rejected": -833.3558349609375, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": -10.524633407592773, "rewards/margins": 26.133468627929688, "rewards/rejected": -36.658103942871094, "step": 2734 }, { "epoch": 1.7013996889580092, "grad_norm": 42.44413757324219, "learning_rate": 2.4043337943752885e-06, "logits/chosen": -1.0746033191680908, "logits/rejected": 2.049215793609619, "logps/chosen": -386.47906494140625, "logps/rejected": -853.283203125, "loss": 0.5919, "rewards/accuracies": 0.875, "rewards/chosen": -7.945468902587891, "rewards/margins": 23.645946502685547, "rewards/rejected": -31.591413497924805, "step": 2735 }, { "epoch": 1.702021772939347, "grad_norm": 0.10011202841997147, "learning_rate": 2.4031811894882437e-06, "logits/chosen": -0.11232280731201172, "logits/rejected": 2.443506956100464, "logps/chosen": -652.34033203125, "logps/rejected": -1059.8670654296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -13.735544204711914, "rewards/margins": 28.015451431274414, "rewards/rejected": -41.75099563598633, "step": 2736 }, { "epoch": 1.7026438569206843, "grad_norm": 6.48001766204834, "learning_rate": 2.402028584601199e-06, "logits/chosen": 0.16253745555877686, "logits/rejected": 3.0159144401550293, "logps/chosen": -469.88543701171875, "logps/rejected": -903.1436767578125, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": -7.632349014282227, "rewards/margins": 28.997150421142578, "rewards/rejected": -36.62950134277344, "step": 2737 }, { "epoch": 1.7032659409020217, "grad_norm": 0.21324293315410614, "learning_rate": 2.400875979714154e-06, "logits/chosen": 0.3878498673439026, "logits/rejected": 3.4972376823425293, "logps/chosen": -622.3283081054688, "logps/rejected": -1015.4915161132812, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -13.050553321838379, "rewards/margins": 21.196529388427734, "rewards/rejected": -34.2470817565918, "step": 2738 }, { "epoch": 1.7038880248833592, "grad_norm": 35.615596771240234, "learning_rate": 2.3997233748271094e-06, "logits/chosen": -1.6907641887664795, "logits/rejected": 1.8408029079437256, "logps/chosen": -509.0423583984375, "logps/rejected": -1010.310546875, "loss": 0.356, "rewards/accuracies": 0.875, "rewards/chosen": -8.704755783081055, "rewards/margins": 23.956729888916016, "rewards/rejected": -32.66148376464844, "step": 2739 }, { "epoch": 1.7045101088646968, "grad_norm": 0.3253224790096283, "learning_rate": 2.3985707699400646e-06, "logits/chosen": -0.516049325466156, "logits/rejected": 2.495758533477783, "logps/chosen": -390.4578857421875, "logps/rejected": -747.568115234375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -9.596771240234375, "rewards/margins": 21.8277587890625, "rewards/rejected": -31.424530029296875, "step": 2740 }, { "epoch": 1.7051321928460341, "grad_norm": 0.36244937777519226, "learning_rate": 2.3974181650530203e-06, "logits/chosen": -1.3871111869812012, "logits/rejected": 4.006772994995117, "logps/chosen": -278.1317443847656, "logps/rejected": -832.1084594726562, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -6.295748710632324, "rewards/margins": 28.429582595825195, "rewards/rejected": -34.72533416748047, "step": 2741 }, { "epoch": 1.7057542768273717, "grad_norm": 0.0001849048276199028, "learning_rate": 2.3962655601659755e-06, "logits/chosen": -1.333700180053711, "logits/rejected": 2.9659676551818848, "logps/chosen": -408.71209716796875, "logps/rejected": -999.737060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.499065399169922, "rewards/margins": 31.57623863220215, "rewards/rejected": -37.0753059387207, "step": 2742 }, { "epoch": 1.7063763608087092, "grad_norm": 37.42914962768555, "learning_rate": 2.3951129552789307e-06, "logits/chosen": 0.06055879592895508, "logits/rejected": 0.357525110244751, "logps/chosen": -564.270751953125, "logps/rejected": -872.297119140625, "loss": 1.0488, "rewards/accuracies": 0.875, "rewards/chosen": -14.182369232177734, "rewards/margins": 22.586198806762695, "rewards/rejected": -36.76856994628906, "step": 2743 }, { "epoch": 1.7069984447900466, "grad_norm": 0.03632102906703949, "learning_rate": 2.393960350391886e-06, "logits/chosen": -2.4644315242767334, "logits/rejected": 4.031020164489746, "logps/chosen": -366.7651062011719, "logps/rejected": -1044.26123046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.470329284667969, "rewards/margins": 32.28361511230469, "rewards/rejected": -39.753944396972656, "step": 2744 }, { "epoch": 1.7076205287713841, "grad_norm": 2.2521191567648202e-05, "learning_rate": 2.392807745504841e-06, "logits/chosen": -0.43236926198005676, "logits/rejected": 2.319559335708618, "logps/chosen": -581.4365234375, "logps/rejected": -1093.714111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.591827392578125, "rewards/margins": 35.70243835449219, "rewards/rejected": -53.29426193237305, "step": 2745 }, { "epoch": 1.7082426127527217, "grad_norm": 23.961801528930664, "learning_rate": 2.3916551406177964e-06, "logits/chosen": -2.98201847076416, "logits/rejected": 3.573429584503174, "logps/chosen": -358.71954345703125, "logps/rejected": -1015.135009765625, "loss": 0.1875, "rewards/accuracies": 0.875, "rewards/chosen": -6.754096508026123, "rewards/margins": 30.13132667541504, "rewards/rejected": -36.88542175292969, "step": 2746 }, { "epoch": 1.708864696734059, "grad_norm": 0.001443828223273158, "learning_rate": 2.3905025357307516e-06, "logits/chosen": 1.7244963645935059, "logits/rejected": 4.353095531463623, "logps/chosen": -530.1221313476562, "logps/rejected": -912.9786376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.947088241577148, "rewards/margins": 25.049970626831055, "rewards/rejected": -31.997058868408203, "step": 2747 }, { "epoch": 1.7094867807153966, "grad_norm": 0.13827162981033325, "learning_rate": 2.3893499308437073e-06, "logits/chosen": 1.7216299772262573, "logits/rejected": 4.234375, "logps/chosen": -632.4540405273438, "logps/rejected": -1099.68310546875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -11.171640396118164, "rewards/margins": 25.345455169677734, "rewards/rejected": -36.517093658447266, "step": 2748 }, { "epoch": 1.7101088646967342, "grad_norm": 6.640329360961914, "learning_rate": 2.3881973259566625e-06, "logits/chosen": -0.3900489807128906, "logits/rejected": 4.642871379852295, "logps/chosen": -533.6476440429688, "logps/rejected": -1143.8360595703125, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": -8.386316299438477, "rewards/margins": 27.875993728637695, "rewards/rejected": -36.262306213378906, "step": 2749 }, { "epoch": 1.7107309486780715, "grad_norm": 0.018535036593675613, "learning_rate": 2.3870447210696177e-06, "logits/chosen": -0.20886629819869995, "logits/rejected": 3.061354160308838, "logps/chosen": -448.07391357421875, "logps/rejected": -983.8624877929688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.103137016296387, "rewards/margins": 32.060157775878906, "rewards/rejected": -41.16329574584961, "step": 2750 }, { "epoch": 1.7113530326594089, "grad_norm": 0.004795982502400875, "learning_rate": 2.385892116182573e-06, "logits/chosen": 2.007289171218872, "logits/rejected": 3.2250893115997314, "logps/chosen": -746.975341796875, "logps/rejected": -1015.2783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.418895721435547, "rewards/margins": 21.668270111083984, "rewards/rejected": -34.08716583251953, "step": 2751 }, { "epoch": 1.7119751166407466, "grad_norm": 56.460609436035156, "learning_rate": 2.384739511295528e-06, "logits/chosen": 1.8035542964935303, "logits/rejected": 3.231412172317505, "logps/chosen": -719.7908935546875, "logps/rejected": -941.8114624023438, "loss": 0.6946, "rewards/accuracies": 0.875, "rewards/chosen": -14.651267051696777, "rewards/margins": 18.337669372558594, "rewards/rejected": -32.98893737792969, "step": 2752 }, { "epoch": 1.712597200622084, "grad_norm": 1.7252594261663035e-05, "learning_rate": 2.3835869064084834e-06, "logits/chosen": 2.040581703186035, "logits/rejected": 3.0739097595214844, "logps/chosen": -595.2687377929688, "logps/rejected": -847.3399658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.129035949707031, "rewards/margins": 23.222198486328125, "rewards/rejected": -32.35123062133789, "step": 2753 }, { "epoch": 1.7132192846034213, "grad_norm": 0.00926581397652626, "learning_rate": 2.3824343015214386e-06, "logits/chosen": 0.37493985891342163, "logits/rejected": 3.2370142936706543, "logps/chosen": -436.7816162109375, "logps/rejected": -841.2584838867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.670589447021484, "rewards/margins": 23.545188903808594, "rewards/rejected": -34.21577835083008, "step": 2754 }, { "epoch": 1.713841368584759, "grad_norm": 0.05381428450345993, "learning_rate": 2.381281696634394e-06, "logits/chosen": 1.032725214958191, "logits/rejected": 3.626753091812134, "logps/chosen": -596.6863403320312, "logps/rejected": -1086.4066162109375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -10.142266273498535, "rewards/margins": 33.237281799316406, "rewards/rejected": -43.379547119140625, "step": 2755 }, { "epoch": 1.7144634525660964, "grad_norm": 43.99714279174805, "learning_rate": 2.380129091747349e-06, "logits/chosen": 2.4638986587524414, "logits/rejected": 1.1975902318954468, "logps/chosen": -824.5519409179688, "logps/rejected": -935.77685546875, "loss": 0.5373, "rewards/accuracies": 0.875, "rewards/chosen": -18.057632446289062, "rewards/margins": 17.896528244018555, "rewards/rejected": -35.95416259765625, "step": 2756 }, { "epoch": 1.7150855365474338, "grad_norm": 2.8142278097220697e-05, "learning_rate": 2.3789764868603043e-06, "logits/chosen": -0.084682896733284, "logits/rejected": 4.622895240783691, "logps/chosen": -513.091552734375, "logps/rejected": -1204.34423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.010409355163574, "rewards/margins": 35.475242614746094, "rewards/rejected": -46.48564910888672, "step": 2757 }, { "epoch": 1.7157076205287713, "grad_norm": 5.099128657093388e-07, "learning_rate": 2.3778238819732595e-06, "logits/chosen": 1.8338658809661865, "logits/rejected": 3.0492074489593506, "logps/chosen": -530.6513671875, "logps/rejected": -860.2197265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.946867942810059, "rewards/margins": 25.688880920410156, "rewards/rejected": -35.63574981689453, "step": 2758 }, { "epoch": 1.716329704510109, "grad_norm": 0.05735553056001663, "learning_rate": 2.3766712770862148e-06, "logits/chosen": 0.5481299161911011, "logits/rejected": 1.9903779029846191, "logps/chosen": -658.283935546875, "logps/rejected": -961.1427001953125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -10.964715003967285, "rewards/margins": 20.74061393737793, "rewards/rejected": -31.70532989501953, "step": 2759 }, { "epoch": 1.7169517884914463, "grad_norm": 1.0057343757807757e-07, "learning_rate": 2.3755186721991704e-06, "logits/chosen": -2.410283088684082, "logits/rejected": 1.853485345840454, "logps/chosen": -428.63189697265625, "logps/rejected": -974.8028564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.190696716308594, "rewards/margins": 34.52707290649414, "rewards/rejected": -40.7177734375, "step": 2760 }, { "epoch": 1.7175738724727838, "grad_norm": 0.04589163884520531, "learning_rate": 2.3743660673121256e-06, "logits/chosen": 0.30356281995773315, "logits/rejected": 2.435214042663574, "logps/chosen": -465.5194091796875, "logps/rejected": -819.4593505859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.391858100891113, "rewards/margins": 19.34056282043457, "rewards/rejected": -26.732418060302734, "step": 2761 }, { "epoch": 1.7181959564541214, "grad_norm": 0.19371838867664337, "learning_rate": 2.373213462425081e-06, "logits/chosen": 1.2789655923843384, "logits/rejected": 3.6189393997192383, "logps/chosen": -504.8437805175781, "logps/rejected": -906.9939575195312, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -11.996271133422852, "rewards/margins": 23.702014923095703, "rewards/rejected": -35.69828796386719, "step": 2762 }, { "epoch": 1.7188180404354587, "grad_norm": 0.0007012194837443531, "learning_rate": 2.372060857538036e-06, "logits/chosen": -2.735713481903076, "logits/rejected": 1.4813268184661865, "logps/chosen": -385.73150634765625, "logps/rejected": -942.9842529296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.717445373535156, "rewards/margins": 34.88406753540039, "rewards/rejected": -42.60151290893555, "step": 2763 }, { "epoch": 1.7194401244167963, "grad_norm": 0.005390803795307875, "learning_rate": 2.3709082526509913e-06, "logits/chosen": 0.9662128686904907, "logits/rejected": 1.7179337739944458, "logps/chosen": -537.8453369140625, "logps/rejected": -877.6395263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.155290603637695, "rewards/margins": 24.83075714111328, "rewards/rejected": -35.98604965209961, "step": 2764 }, { "epoch": 1.7200622083981338, "grad_norm": 0.04520804435014725, "learning_rate": 2.3697556477639465e-06, "logits/chosen": 0.6427794694900513, "logits/rejected": 3.6675214767456055, "logps/chosen": -479.03680419921875, "logps/rejected": -872.9364013671875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.6263628005981445, "rewards/margins": 19.985591888427734, "rewards/rejected": -26.611953735351562, "step": 2765 }, { "epoch": 1.7206842923794712, "grad_norm": 0.061935242265462875, "learning_rate": 2.3686030428769018e-06, "logits/chosen": -0.41115838289260864, "logits/rejected": 4.158937454223633, "logps/chosen": -559.28076171875, "logps/rejected": -1320.9888916015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -14.832966804504395, "rewards/margins": 41.016700744628906, "rewards/rejected": -55.84966278076172, "step": 2766 }, { "epoch": 1.7213063763608087, "grad_norm": 0.0008467906154692173, "learning_rate": 2.367450437989857e-06, "logits/chosen": 2.045358180999756, "logits/rejected": 2.9397809505462646, "logps/chosen": -575.7149658203125, "logps/rejected": -990.0404052734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.39871883392334, "rewards/margins": 32.345008850097656, "rewards/rejected": -42.74372863769531, "step": 2767 }, { "epoch": 1.7219284603421463, "grad_norm": 0.035182323306798935, "learning_rate": 2.3662978331028126e-06, "logits/chosen": -1.1947062015533447, "logits/rejected": 3.6915230751037598, "logps/chosen": -364.73077392578125, "logps/rejected": -939.2349853515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.385026693344116, "rewards/margins": 26.972509384155273, "rewards/rejected": -30.357534408569336, "step": 2768 }, { "epoch": 1.7225505443234836, "grad_norm": 0.0013195689534768462, "learning_rate": 2.365145228215768e-06, "logits/chosen": 0.15518411993980408, "logits/rejected": 2.3073744773864746, "logps/chosen": -576.5823364257812, "logps/rejected": -856.1112060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.871328353881836, "rewards/margins": 18.359949111938477, "rewards/rejected": -27.231277465820312, "step": 2769 }, { "epoch": 1.723172628304821, "grad_norm": 4.51564359664917, "learning_rate": 2.363992623328723e-06, "logits/chosen": 0.6936973929405212, "logits/rejected": 3.6317267417907715, "logps/chosen": -504.4945983886719, "logps/rejected": -747.7782592773438, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": -8.503636360168457, "rewards/margins": 18.5909423828125, "rewards/rejected": -27.09457778930664, "step": 2770 }, { "epoch": 1.7237947122861588, "grad_norm": 0.011435436084866524, "learning_rate": 2.3628400184416783e-06, "logits/chosen": -2.0588650703430176, "logits/rejected": 1.1596896648406982, "logps/chosen": -426.8011474609375, "logps/rejected": -997.5834350585938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.577409267425537, "rewards/margins": 33.40705871582031, "rewards/rejected": -40.984466552734375, "step": 2771 }, { "epoch": 1.724416796267496, "grad_norm": 0.796137273311615, "learning_rate": 2.3616874135546335e-06, "logits/chosen": -0.6903742551803589, "logits/rejected": 3.3889074325561523, "logps/chosen": -453.154541015625, "logps/rejected": -941.4500732421875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -9.413330078125, "rewards/margins": 27.53225326538086, "rewards/rejected": -36.94558334350586, "step": 2772 }, { "epoch": 1.7250388802488335, "grad_norm": 0.00024519202997907996, "learning_rate": 2.3605348086675888e-06, "logits/chosen": 2.3214950561523438, "logits/rejected": 3.429938316345215, "logps/chosen": -568.1884765625, "logps/rejected": -873.7118530273438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.79106330871582, "rewards/margins": 25.14430046081543, "rewards/rejected": -32.93536376953125, "step": 2773 }, { "epoch": 1.7256609642301712, "grad_norm": 9.089024388231337e-06, "learning_rate": 2.359382203780544e-06, "logits/chosen": 3.637298822402954, "logits/rejected": 3.8487226963043213, "logps/chosen": -681.5171508789062, "logps/rejected": -1018.5064086914062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.541778564453125, "rewards/margins": 26.06352996826172, "rewards/rejected": -33.605308532714844, "step": 2774 }, { "epoch": 1.7262830482115086, "grad_norm": 2.3466477394104004, "learning_rate": 2.3582295988934996e-06, "logits/chosen": 1.3259943723678589, "logits/rejected": 4.5670976638793945, "logps/chosen": -652.5491943359375, "logps/rejected": -1064.20263671875, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -11.579160690307617, "rewards/margins": 26.290414810180664, "rewards/rejected": -37.86957550048828, "step": 2775 }, { "epoch": 1.726905132192846, "grad_norm": 0.16569431126117706, "learning_rate": 2.357076994006455e-06, "logits/chosen": -0.6097656488418579, "logits/rejected": 3.4430322647094727, "logps/chosen": -518.7841796875, "logps/rejected": -1055.17041015625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -9.744987487792969, "rewards/margins": 32.98575973510742, "rewards/rejected": -42.73074722290039, "step": 2776 }, { "epoch": 1.7275272161741835, "grad_norm": 0.09507567435503006, "learning_rate": 2.35592438911941e-06, "logits/chosen": -0.14290493726730347, "logits/rejected": 3.131923198699951, "logps/chosen": -484.4637756347656, "logps/rejected": -944.8180541992188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -9.113192558288574, "rewards/margins": 27.75663948059082, "rewards/rejected": -36.869834899902344, "step": 2777 }, { "epoch": 1.728149300155521, "grad_norm": 0.20066502690315247, "learning_rate": 2.3547717842323653e-06, "logits/chosen": 2.386807441711426, "logits/rejected": 3.2670209407806396, "logps/chosen": -634.5061645507812, "logps/rejected": -960.57421875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -13.417230606079102, "rewards/margins": 28.866121292114258, "rewards/rejected": -42.28335189819336, "step": 2778 }, { "epoch": 1.7287713841368584, "grad_norm": 27.666940689086914, "learning_rate": 2.3536191793453205e-06, "logits/chosen": 0.8378705978393555, "logits/rejected": 2.626453161239624, "logps/chosen": -686.529541015625, "logps/rejected": -1111.91162109375, "loss": 0.3747, "rewards/accuracies": 0.875, "rewards/chosen": -14.910959243774414, "rewards/margins": 30.519216537475586, "rewards/rejected": -45.430179595947266, "step": 2779 }, { "epoch": 1.729393468118196, "grad_norm": 4.822820663452148, "learning_rate": 2.3524665744582757e-06, "logits/chosen": 1.7815725803375244, "logits/rejected": 3.440255880355835, "logps/chosen": -554.303955078125, "logps/rejected": -881.9509887695312, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -9.29763412475586, "rewards/margins": 22.683753967285156, "rewards/rejected": -31.981388092041016, "step": 2780 }, { "epoch": 1.7300155520995335, "grad_norm": 0.04911397397518158, "learning_rate": 2.351313969571231e-06, "logits/chosen": -0.22427219152450562, "logits/rejected": 2.9939990043640137, "logps/chosen": -644.0833740234375, "logps/rejected": -1102.349365234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -13.63404369354248, "rewards/margins": 25.59818458557129, "rewards/rejected": -39.23223114013672, "step": 2781 }, { "epoch": 1.7306376360808708, "grad_norm": 2.280266046524048, "learning_rate": 2.3501613646841866e-06, "logits/chosen": -3.042137384414673, "logits/rejected": 2.958921432495117, "logps/chosen": -434.674072265625, "logps/rejected": -1080.346923828125, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -11.471439361572266, "rewards/margins": 34.801605224609375, "rewards/rejected": -46.27304458618164, "step": 2782 }, { "epoch": 1.7312597200622084, "grad_norm": 34.36720275878906, "learning_rate": 2.349008759797142e-06, "logits/chosen": -0.14768370985984802, "logits/rejected": 2.648454427719116, "logps/chosen": -579.7083129882812, "logps/rejected": -1024.5987548828125, "loss": 0.2468, "rewards/accuracies": 0.875, "rewards/chosen": -10.255849838256836, "rewards/margins": 27.46382713317871, "rewards/rejected": -37.71967697143555, "step": 2783 }, { "epoch": 1.731881804043546, "grad_norm": 1.1667402759485412e-06, "learning_rate": 2.347856154910097e-06, "logits/chosen": -3.273422956466675, "logits/rejected": 3.320072650909424, "logps/chosen": -313.364990234375, "logps/rejected": -1038.2227783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.1106157302856445, "rewards/margins": 32.74595642089844, "rewards/rejected": -38.856571197509766, "step": 2784 }, { "epoch": 1.7325038880248833, "grad_norm": 31.996156692504883, "learning_rate": 2.3467035500230523e-06, "logits/chosen": -1.5053670406341553, "logits/rejected": 1.9649426937103271, "logps/chosen": -432.020751953125, "logps/rejected": -886.978271484375, "loss": 0.4382, "rewards/accuracies": 0.875, "rewards/chosen": -6.486809253692627, "rewards/margins": 24.155384063720703, "rewards/rejected": -30.642192840576172, "step": 2785 }, { "epoch": 1.7331259720062209, "grad_norm": 2.6820787752512842e-05, "learning_rate": 2.3455509451360075e-06, "logits/chosen": -1.8410687446594238, "logits/rejected": 4.440993309020996, "logps/chosen": -359.7074279785156, "logps/rejected": -1087.0943603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.158833980560303, "rewards/margins": 34.07766342163086, "rewards/rejected": -40.23649978637695, "step": 2786 }, { "epoch": 1.7337480559875584, "grad_norm": 0.758760929107666, "learning_rate": 2.3443983402489627e-06, "logits/chosen": -1.3878371715545654, "logits/rejected": 2.5684714317321777, "logps/chosen": -493.8824768066406, "logps/rejected": -898.0422973632812, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -7.0507612228393555, "rewards/margins": 23.280902862548828, "rewards/rejected": -30.3316650390625, "step": 2787 }, { "epoch": 1.7343701399688958, "grad_norm": 1.4930262565612793, "learning_rate": 2.343245735361918e-06, "logits/chosen": -0.7004250884056091, "logits/rejected": 3.0102808475494385, "logps/chosen": -538.3475341796875, "logps/rejected": -971.8416748046875, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -8.780963897705078, "rewards/margins": 19.83646583557129, "rewards/rejected": -28.617431640625, "step": 2788 }, { "epoch": 1.7349922239502333, "grad_norm": 0.019264360889792442, "learning_rate": 2.3420931304748736e-06, "logits/chosen": -0.9465036392211914, "logits/rejected": 4.264335632324219, "logps/chosen": -327.2490234375, "logps/rejected": -816.83544921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.298092842102051, "rewards/margins": 21.538646697998047, "rewards/rejected": -25.836740493774414, "step": 2789 }, { "epoch": 1.735614307931571, "grad_norm": 0.00041238003177568316, "learning_rate": 2.340940525587829e-06, "logits/chosen": -0.6670334339141846, "logits/rejected": 1.9680249691009521, "logps/chosen": -429.46240234375, "logps/rejected": -969.256103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.832972049713135, "rewards/margins": 31.79166030883789, "rewards/rejected": -39.6246337890625, "step": 2790 }, { "epoch": 1.7362363919129082, "grad_norm": 6.324278831481934, "learning_rate": 2.339787920700784e-06, "logits/chosen": 2.0768911838531494, "logits/rejected": 2.6396563053131104, "logps/chosen": -616.1771240234375, "logps/rejected": -860.7713012695312, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": -8.11803913116455, "rewards/margins": 17.913665771484375, "rewards/rejected": -26.031702041625977, "step": 2791 }, { "epoch": 1.7368584758942456, "grad_norm": 0.00336162350140512, "learning_rate": 2.3386353158137393e-06, "logits/chosen": 0.6120978593826294, "logits/rejected": 2.5396242141723633, "logps/chosen": -566.498046875, "logps/rejected": -1051.150390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.51701831817627, "rewards/margins": 32.585655212402344, "rewards/rejected": -42.10266876220703, "step": 2792 }, { "epoch": 1.7374805598755834, "grad_norm": 0.03778179734945297, "learning_rate": 2.3374827109266945e-06, "logits/chosen": -1.0540292263031006, "logits/rejected": 2.4589219093322754, "logps/chosen": -366.5378723144531, "logps/rejected": -811.3502197265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.14518404006958, "rewards/margins": 30.593111038208008, "rewards/rejected": -37.73829650878906, "step": 2793 }, { "epoch": 1.7381026438569207, "grad_norm": 2.461298942565918, "learning_rate": 2.3363301060396497e-06, "logits/chosen": 2.8499398231506348, "logits/rejected": 4.111413478851318, "logps/chosen": -592.90869140625, "logps/rejected": -959.670166015625, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -10.37989616394043, "rewards/margins": 23.281414031982422, "rewards/rejected": -33.661312103271484, "step": 2794 }, { "epoch": 1.738724727838258, "grad_norm": 0.03133802488446236, "learning_rate": 2.335177501152605e-06, "logits/chosen": -0.18570584058761597, "logits/rejected": 2.8326315879821777, "logps/chosen": -496.2568664550781, "logps/rejected": -912.7903442382812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.508269309997559, "rewards/margins": 27.135391235351562, "rewards/rejected": -35.64365768432617, "step": 2795 }, { "epoch": 1.7393468118195956, "grad_norm": 0.04039645567536354, "learning_rate": 2.33402489626556e-06, "logits/chosen": -1.090985894203186, "logits/rejected": 3.0338008403778076, "logps/chosen": -468.44683837890625, "logps/rejected": -1013.5341186523438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.679754257202148, "rewards/margins": 33.22745132446289, "rewards/rejected": -40.907203674316406, "step": 2796 }, { "epoch": 1.7399688958009332, "grad_norm": 0.003894314868375659, "learning_rate": 2.332872291378516e-06, "logits/chosen": 0.48016834259033203, "logits/rejected": 0.8828748464584351, "logps/chosen": -588.974853515625, "logps/rejected": -899.0206298828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.459451675415039, "rewards/margins": 24.44631576538086, "rewards/rejected": -32.90576934814453, "step": 2797 }, { "epoch": 1.7405909797822705, "grad_norm": 4.184944191365503e-05, "learning_rate": 2.331719686491471e-06, "logits/chosen": -2.460817337036133, "logits/rejected": 2.706566333770752, "logps/chosen": -324.81427001953125, "logps/rejected": -978.3182373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.0268778800964355, "rewards/margins": 31.31508445739746, "rewards/rejected": -37.34196090698242, "step": 2798 }, { "epoch": 1.741213063763608, "grad_norm": 0.0037377155385911465, "learning_rate": 2.3305670816044263e-06, "logits/chosen": -2.2354516983032227, "logits/rejected": 3.795807123184204, "logps/chosen": -327.38311767578125, "logps/rejected": -927.3829956054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.096700191497803, "rewards/margins": 28.419921875, "rewards/rejected": -33.516624450683594, "step": 2799 }, { "epoch": 1.7418351477449456, "grad_norm": 0.01131622213870287, "learning_rate": 2.3294144767173815e-06, "logits/chosen": 0.8565537929534912, "logits/rejected": 3.1421103477478027, "logps/chosen": -564.3128051757812, "logps/rejected": -958.5903930664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.258453369140625, "rewards/margins": 24.025192260742188, "rewards/rejected": -33.28364562988281, "step": 2800 }, { "epoch": 1.742457231726283, "grad_norm": 0.9999305605888367, "learning_rate": 2.3282618718303367e-06, "logits/chosen": 0.04175460338592529, "logits/rejected": 3.4293456077575684, "logps/chosen": -473.6128845214844, "logps/rejected": -915.9506225585938, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -10.805809020996094, "rewards/margins": 23.354106903076172, "rewards/rejected": -34.15991973876953, "step": 2801 }, { "epoch": 1.7430793157076205, "grad_norm": 0.37862786650657654, "learning_rate": 2.327109266943292e-06, "logits/chosen": 2.9525222778320312, "logits/rejected": 5.147446155548096, "logps/chosen": -635.0414428710938, "logps/rejected": -1057.5360107421875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -8.820849418640137, "rewards/margins": 24.550661087036133, "rewards/rejected": -33.37151336669922, "step": 2802 }, { "epoch": 1.743701399688958, "grad_norm": 37.64432907104492, "learning_rate": 2.325956662056247e-06, "logits/chosen": 1.7595758438110352, "logits/rejected": 3.8820478916168213, "logps/chosen": -615.9675903320312, "logps/rejected": -1010.1990966796875, "loss": 0.4328, "rewards/accuracies": 0.875, "rewards/chosen": -11.145980834960938, "rewards/margins": 27.22187042236328, "rewards/rejected": -38.36784744262695, "step": 2803 }, { "epoch": 1.7443234836702954, "grad_norm": 0.122687928378582, "learning_rate": 2.324804057169203e-06, "logits/chosen": -1.2267993688583374, "logits/rejected": 4.899316310882568, "logps/chosen": -335.3673095703125, "logps/rejected": -985.9779052734375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.552623748779297, "rewards/margins": 32.96437072753906, "rewards/rejected": -38.51699447631836, "step": 2804 }, { "epoch": 1.744945567651633, "grad_norm": 0.007713071536272764, "learning_rate": 2.323651452282158e-06, "logits/chosen": -1.778456449508667, "logits/rejected": 0.725517749786377, "logps/chosen": -432.5553894042969, "logps/rejected": -830.3568115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.305593490600586, "rewards/margins": 23.346324920654297, "rewards/rejected": -31.651918411254883, "step": 2805 }, { "epoch": 1.7455676516329706, "grad_norm": 4.1068889800044417e-07, "learning_rate": 2.3224988473951133e-06, "logits/chosen": -0.16772279143333435, "logits/rejected": 2.112006425857544, "logps/chosen": -525.9559326171875, "logps/rejected": -964.70751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.622228622436523, "rewards/margins": 28.970535278320312, "rewards/rejected": -38.5927619934082, "step": 2806 }, { "epoch": 1.746189735614308, "grad_norm": 1.9309496565256268e-05, "learning_rate": 2.3213462425080685e-06, "logits/chosen": -0.6594813466072083, "logits/rejected": 3.5559253692626953, "logps/chosen": -404.66748046875, "logps/rejected": -869.7897338867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.164316177368164, "rewards/margins": 29.088703155517578, "rewards/rejected": -36.253021240234375, "step": 2807 }, { "epoch": 1.7468118195956455, "grad_norm": 34.09669494628906, "learning_rate": 2.3201936376210237e-06, "logits/chosen": 3.4891278743743896, "logits/rejected": 4.456305503845215, "logps/chosen": -647.476318359375, "logps/rejected": -957.98681640625, "loss": 0.5016, "rewards/accuracies": 0.875, "rewards/chosen": -12.324559211730957, "rewards/margins": 23.308271408081055, "rewards/rejected": -35.63282775878906, "step": 2808 }, { "epoch": 1.747433903576983, "grad_norm": 1.8752028942108154, "learning_rate": 2.319041032733979e-06, "logits/chosen": -1.88791024684906, "logits/rejected": 2.6826634407043457, "logps/chosen": -335.170654296875, "logps/rejected": -904.3746337890625, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -5.0012407302856445, "rewards/margins": 25.391626358032227, "rewards/rejected": -30.392868041992188, "step": 2809 }, { "epoch": 1.7480559875583204, "grad_norm": 0.08498809486627579, "learning_rate": 2.317888427846934e-06, "logits/chosen": 2.0526623725891113, "logits/rejected": 3.381930351257324, "logps/chosen": -548.2327880859375, "logps/rejected": -890.7827758789062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -10.699272155761719, "rewards/margins": 24.447311401367188, "rewards/rejected": -35.146583557128906, "step": 2810 }, { "epoch": 1.7486780715396577, "grad_norm": 0.1443328857421875, "learning_rate": 2.31673582295989e-06, "logits/chosen": 2.4920990467071533, "logits/rejected": 4.312612533569336, "logps/chosen": -722.9078979492188, "logps/rejected": -1082.167724609375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -14.12423324584961, "rewards/margins": 30.722488403320312, "rewards/rejected": -44.84672546386719, "step": 2811 }, { "epoch": 1.7493001555209955, "grad_norm": 0.00010770368680823594, "learning_rate": 2.315583218072845e-06, "logits/chosen": -0.9948385953903198, "logits/rejected": 1.3194243907928467, "logps/chosen": -566.56201171875, "logps/rejected": -934.32080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.929027080535889, "rewards/margins": 26.289602279663086, "rewards/rejected": -33.2186279296875, "step": 2812 }, { "epoch": 1.7499222395023328, "grad_norm": 46.462738037109375, "learning_rate": 2.3144306131858003e-06, "logits/chosen": 3.6122045516967773, "logits/rejected": 3.540928840637207, "logps/chosen": -630.3472900390625, "logps/rejected": -921.0729370117188, "loss": 0.8578, "rewards/accuracies": 0.875, "rewards/chosen": -13.057168960571289, "rewards/margins": 23.821197509765625, "rewards/rejected": -36.87836456298828, "step": 2813 }, { "epoch": 1.7505443234836702, "grad_norm": 9.17495059967041, "learning_rate": 2.3132780082987555e-06, "logits/chosen": -0.8413869142532349, "logits/rejected": 2.3990821838378906, "logps/chosen": -383.0685729980469, "logps/rejected": -798.2875366210938, "loss": 0.0812, "rewards/accuracies": 1.0, "rewards/chosen": -11.45396900177002, "rewards/margins": 21.561416625976562, "rewards/rejected": -33.015384674072266, "step": 2814 }, { "epoch": 1.7511664074650077, "grad_norm": 4.03289270401001, "learning_rate": 2.3121254034117107e-06, "logits/chosen": 2.9473519325256348, "logits/rejected": 3.943220853805542, "logps/chosen": -617.802490234375, "logps/rejected": -904.656982421875, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": -5.362736701965332, "rewards/margins": 22.11973762512207, "rewards/rejected": -27.48247718811035, "step": 2815 }, { "epoch": 1.7517884914463453, "grad_norm": 22.0452938079834, "learning_rate": 2.310972798524666e-06, "logits/chosen": 1.4050730466842651, "logits/rejected": 4.609858512878418, "logps/chosen": -589.161376953125, "logps/rejected": -979.752685546875, "loss": 0.1047, "rewards/accuracies": 0.875, "rewards/chosen": -8.366984367370605, "rewards/margins": 24.246492385864258, "rewards/rejected": -32.61347579956055, "step": 2816 }, { "epoch": 1.7524105754276826, "grad_norm": 3.8451528549194336, "learning_rate": 2.309820193637621e-06, "logits/chosen": 0.7891393303871155, "logits/rejected": 4.272323131561279, "logps/chosen": -428.27484130859375, "logps/rejected": -965.9637451171875, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": -5.005939483642578, "rewards/margins": 30.316749572753906, "rewards/rejected": -35.32268524169922, "step": 2817 }, { "epoch": 1.7530326594090202, "grad_norm": 0.0004849430697504431, "learning_rate": 2.3086675887505764e-06, "logits/chosen": 2.0776607990264893, "logits/rejected": 4.065606117248535, "logps/chosen": -484.43988037109375, "logps/rejected": -874.045654296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.818570137023926, "rewards/margins": 25.894359588623047, "rewards/rejected": -33.712928771972656, "step": 2818 }, { "epoch": 1.7536547433903578, "grad_norm": 0.0005741657223552465, "learning_rate": 2.307514983863532e-06, "logits/chosen": 0.3438476622104645, "logits/rejected": 3.1698288917541504, "logps/chosen": -536.3983764648438, "logps/rejected": -1008.376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.154132843017578, "rewards/margins": 29.072219848632812, "rewards/rejected": -36.22635269165039, "step": 2819 }, { "epoch": 1.754276827371695, "grad_norm": 0.0035577313974499702, "learning_rate": 2.3063623789764873e-06, "logits/chosen": 0.8933321237564087, "logits/rejected": 3.381267547607422, "logps/chosen": -666.1956787109375, "logps/rejected": -1114.95361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.84673547744751, "rewards/margins": 30.009090423583984, "rewards/rejected": -37.8558235168457, "step": 2820 }, { "epoch": 1.7548989113530327, "grad_norm": 34.45783615112305, "learning_rate": 2.3052097740894425e-06, "logits/chosen": 0.24750453233718872, "logits/rejected": 2.9766762256622314, "logps/chosen": -476.5849304199219, "logps/rejected": -839.9825439453125, "loss": 0.7048, "rewards/accuracies": 0.875, "rewards/chosen": -9.244043350219727, "rewards/margins": 19.534896850585938, "rewards/rejected": -28.778940200805664, "step": 2821 }, { "epoch": 1.7555209953343702, "grad_norm": 0.3499009609222412, "learning_rate": 2.3040571692023973e-06, "logits/chosen": 0.20748788118362427, "logits/rejected": 1.776287317276001, "logps/chosen": -447.55999755859375, "logps/rejected": -747.993408203125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.3524370193481445, "rewards/margins": 24.67609214782715, "rewards/rejected": -29.028528213500977, "step": 2822 }, { "epoch": 1.7561430793157076, "grad_norm": 0.004271762445569038, "learning_rate": 2.302904564315353e-06, "logits/chosen": 0.4916335642337799, "logits/rejected": 1.6427289247512817, "logps/chosen": -535.5093994140625, "logps/rejected": -822.4710693359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.049504280090332, "rewards/margins": 23.968944549560547, "rewards/rejected": -32.01844787597656, "step": 2823 }, { "epoch": 1.7567651632970451, "grad_norm": 0.1598803550004959, "learning_rate": 2.301751959428308e-06, "logits/chosen": -0.5362465381622314, "logits/rejected": 1.9070513248443604, "logps/chosen": -549.949462890625, "logps/rejected": -816.508544921875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.776954174041748, "rewards/margins": 20.22863006591797, "rewards/rejected": -28.005582809448242, "step": 2824 }, { "epoch": 1.7573872472783827, "grad_norm": 3.6965129375457764, "learning_rate": 2.3005993545412634e-06, "logits/chosen": -0.49294036626815796, "logits/rejected": 2.3970351219177246, "logps/chosen": -467.9210205078125, "logps/rejected": -754.7039184570312, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": -9.430497169494629, "rewards/margins": 14.597742080688477, "rewards/rejected": -24.028240203857422, "step": 2825 }, { "epoch": 1.75800933125972, "grad_norm": 0.037016257643699646, "learning_rate": 2.2994467496542186e-06, "logits/chosen": 1.0786442756652832, "logits/rejected": 3.0117878913879395, "logps/chosen": -633.4241943359375, "logps/rejected": -975.4942626953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.774829864501953, "rewards/margins": 25.39168930053711, "rewards/rejected": -34.16651916503906, "step": 2826 }, { "epoch": 1.7586314152410576, "grad_norm": 0.03342762216925621, "learning_rate": 2.298294144767174e-06, "logits/chosen": -1.3810304403305054, "logits/rejected": 2.7755300998687744, "logps/chosen": -399.886474609375, "logps/rejected": -855.031494140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.9426751136779785, "rewards/margins": 23.406360626220703, "rewards/rejected": -30.349037170410156, "step": 2827 }, { "epoch": 1.7592534992223952, "grad_norm": 0.0004126617859583348, "learning_rate": 2.297141539880129e-06, "logits/chosen": -3.138012409210205, "logits/rejected": 0.9484289884567261, "logps/chosen": -409.16693115234375, "logps/rejected": -1011.06591796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.72064208984375, "rewards/margins": 36.936763763427734, "rewards/rejected": -40.65740203857422, "step": 2828 }, { "epoch": 1.7598755832037325, "grad_norm": 2.3251948732649907e-05, "learning_rate": 2.2959889349930843e-06, "logits/chosen": 1.5177479982376099, "logits/rejected": 4.266093730926514, "logps/chosen": -490.1104736328125, "logps/rejected": -894.9434814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.51591968536377, "rewards/margins": 26.004505157470703, "rewards/rejected": -34.520423889160156, "step": 2829 }, { "epoch": 1.7604976671850698, "grad_norm": 0.47916364669799805, "learning_rate": 2.2948363301060395e-06, "logits/chosen": 0.8490583300590515, "logits/rejected": 3.6363015174865723, "logps/chosen": -456.613525390625, "logps/rejected": -754.70166015625, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -6.169865608215332, "rewards/margins": 18.686918258666992, "rewards/rejected": -24.85678482055664, "step": 2830 }, { "epoch": 1.7611197511664076, "grad_norm": 0.026581356301903725, "learning_rate": 2.293683725218995e-06, "logits/chosen": 1.8229038715362549, "logits/rejected": 3.4620201587677, "logps/chosen": -599.924560546875, "logps/rejected": -994.2450561523438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.186615943908691, "rewards/margins": 31.1335506439209, "rewards/rejected": -39.320167541503906, "step": 2831 }, { "epoch": 1.761741835147745, "grad_norm": 17.492971420288086, "learning_rate": 2.2925311203319504e-06, "logits/chosen": 1.6246124505996704, "logits/rejected": 4.336266040802002, "logps/chosen": -587.4801635742188, "logps/rejected": -970.076904296875, "loss": 0.0843, "rewards/accuracies": 1.0, "rewards/chosen": -12.888565063476562, "rewards/margins": 20.833152770996094, "rewards/rejected": -33.721717834472656, "step": 2832 }, { "epoch": 1.7623639191290823, "grad_norm": 0.0129725756123662, "learning_rate": 2.2913785154449056e-06, "logits/chosen": 0.6306408643722534, "logits/rejected": 3.076450824737549, "logps/chosen": -573.4195556640625, "logps/rejected": -956.2833251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.969671726226807, "rewards/margins": 26.17241859436035, "rewards/rejected": -34.14208984375, "step": 2833 }, { "epoch": 1.7629860031104199, "grad_norm": 31.76062774658203, "learning_rate": 2.290225910557861e-06, "logits/chosen": 2.5573315620422363, "logits/rejected": 1.8043968677520752, "logps/chosen": -538.5514526367188, "logps/rejected": -797.49755859375, "loss": 0.3282, "rewards/accuracies": 0.875, "rewards/chosen": -6.024632453918457, "rewards/margins": 24.634727478027344, "rewards/rejected": -30.659358978271484, "step": 2834 }, { "epoch": 1.7636080870917574, "grad_norm": 35.535221099853516, "learning_rate": 2.289073305670816e-06, "logits/chosen": 1.011389970779419, "logits/rejected": 3.940976619720459, "logps/chosen": -593.7369384765625, "logps/rejected": -1046.2943115234375, "loss": 0.4384, "rewards/accuracies": 0.875, "rewards/chosen": -9.93221664428711, "rewards/margins": 26.355077743530273, "rewards/rejected": -36.287296295166016, "step": 2835 }, { "epoch": 1.7642301710730948, "grad_norm": 0.001468529924750328, "learning_rate": 2.2879207007837713e-06, "logits/chosen": 2.8411571979522705, "logits/rejected": 3.874950647354126, "logps/chosen": -710.3514404296875, "logps/rejected": -1009.4497680664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.734901428222656, "rewards/margins": 25.648292541503906, "rewards/rejected": -36.38319396972656, "step": 2836 }, { "epoch": 1.7648522550544323, "grad_norm": 0.00015201901260297745, "learning_rate": 2.2867680958967265e-06, "logits/chosen": -0.3207424283027649, "logits/rejected": 1.9411276578903198, "logps/chosen": -592.6459350585938, "logps/rejected": -1078.9656982421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.434858322143555, "rewards/margins": 30.316349029541016, "rewards/rejected": -39.75120544433594, "step": 2837 }, { "epoch": 1.76547433903577, "grad_norm": 2.018131971359253, "learning_rate": 2.285615491009682e-06, "logits/chosen": 0.6795270442962646, "logits/rejected": 2.601144790649414, "logps/chosen": -646.558837890625, "logps/rejected": -951.0234375, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -10.77408218383789, "rewards/margins": 15.429933547973633, "rewards/rejected": -26.204017639160156, "step": 2838 }, { "epoch": 1.7660964230171072, "grad_norm": 0.20884829759597778, "learning_rate": 2.2844628861226374e-06, "logits/chosen": -1.0158138275146484, "logits/rejected": 2.9894943237304688, "logps/chosen": -364.9871826171875, "logps/rejected": -824.8662109375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.625598430633545, "rewards/margins": 23.467449188232422, "rewards/rejected": -28.093048095703125, "step": 2839 }, { "epoch": 1.7667185069984448, "grad_norm": 15.738641738891602, "learning_rate": 2.2833102812355926e-06, "logits/chosen": -1.3841941356658936, "logits/rejected": 2.4583823680877686, "logps/chosen": -478.6824951171875, "logps/rejected": -981.4010009765625, "loss": 0.0995, "rewards/accuracies": 0.875, "rewards/chosen": -5.98832368850708, "rewards/margins": 32.27064895629883, "rewards/rejected": -38.25897216796875, "step": 2840 }, { "epoch": 1.7673405909797824, "grad_norm": 36.28713607788086, "learning_rate": 2.282157676348548e-06, "logits/chosen": 2.22945237159729, "logits/rejected": 3.3262252807617188, "logps/chosen": -731.6435546875, "logps/rejected": -985.1890869140625, "loss": 0.512, "rewards/accuracies": 0.875, "rewards/chosen": -9.859464645385742, "rewards/margins": 26.305404663085938, "rewards/rejected": -36.16486358642578, "step": 2841 }, { "epoch": 1.7679626749611197, "grad_norm": 0.7895492911338806, "learning_rate": 2.281005071461503e-06, "logits/chosen": 0.8218408226966858, "logits/rejected": 3.196619987487793, "logps/chosen": -571.4409790039062, "logps/rejected": -920.848388671875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -8.964709281921387, "rewards/margins": 18.337387084960938, "rewards/rejected": -27.30209732055664, "step": 2842 }, { "epoch": 1.7685847589424573, "grad_norm": 7.703816890716553, "learning_rate": 2.2798524665744583e-06, "logits/chosen": -1.191411018371582, "logits/rejected": 2.2585504055023193, "logps/chosen": -438.60821533203125, "logps/rejected": -823.6568603515625, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": -7.269340515136719, "rewards/margins": 19.628704071044922, "rewards/rejected": -26.89804458618164, "step": 2843 }, { "epoch": 1.7692068429237948, "grad_norm": 0.982222318649292, "learning_rate": 2.2786998616874135e-06, "logits/chosen": 1.8104430437088013, "logits/rejected": 5.031914234161377, "logps/chosen": -563.5733032226562, "logps/rejected": -995.634765625, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -7.055357456207275, "rewards/margins": 30.996875762939453, "rewards/rejected": -38.05223083496094, "step": 2844 }, { "epoch": 1.7698289269051322, "grad_norm": 13.193146705627441, "learning_rate": 2.277547256800369e-06, "logits/chosen": -0.04513192176818848, "logits/rejected": 5.120041370391846, "logps/chosen": -335.22283935546875, "logps/rejected": -888.48193359375, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": -3.7368974685668945, "rewards/margins": 23.650060653686523, "rewards/rejected": -27.386959075927734, "step": 2845 }, { "epoch": 1.7704510108864697, "grad_norm": 0.0032247763592749834, "learning_rate": 2.2763946519133244e-06, "logits/chosen": -0.46913138031959534, "logits/rejected": 2.5801992416381836, "logps/chosen": -450.40704345703125, "logps/rejected": -831.6661987304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.872275352478027, "rewards/margins": 24.720630645751953, "rewards/rejected": -33.5929069519043, "step": 2846 }, { "epoch": 1.7710730948678073, "grad_norm": 0.027071982622146606, "learning_rate": 2.2752420470262796e-06, "logits/chosen": 0.2968297600746155, "logits/rejected": 4.050349235534668, "logps/chosen": -465.64208984375, "logps/rejected": -938.214111328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.954277038574219, "rewards/margins": 26.31043243408203, "rewards/rejected": -33.26470947265625, "step": 2847 }, { "epoch": 1.7716951788491446, "grad_norm": 4.982742662917872e-09, "learning_rate": 2.274089442139235e-06, "logits/chosen": 0.42632216215133667, "logits/rejected": 2.5242867469787598, "logps/chosen": -637.7935180664062, "logps/rejected": -1124.866455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.32619857788086, "rewards/margins": 33.32003402709961, "rewards/rejected": -46.64623260498047, "step": 2848 }, { "epoch": 1.772317262830482, "grad_norm": 0.4207907021045685, "learning_rate": 2.27293683725219e-06, "logits/chosen": -1.1957554817199707, "logits/rejected": 3.567960262298584, "logps/chosen": -490.2234191894531, "logps/rejected": -1094.930419921875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -9.037872314453125, "rewards/margins": 35.78264617919922, "rewards/rejected": -44.820518493652344, "step": 2849 }, { "epoch": 1.7729393468118197, "grad_norm": 2.5869648456573486, "learning_rate": 2.2717842323651453e-06, "logits/chosen": 1.9972779750823975, "logits/rejected": 4.186806678771973, "logps/chosen": -749.8709716796875, "logps/rejected": -1108.9453125, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -12.551399230957031, "rewards/margins": 24.46184539794922, "rewards/rejected": -37.01324462890625, "step": 2850 }, { "epoch": 1.773561430793157, "grad_norm": 34.92661666870117, "learning_rate": 2.2706316274781005e-06, "logits/chosen": 0.05643177032470703, "logits/rejected": 1.2682291269302368, "logps/chosen": -476.22760009765625, "logps/rejected": -719.2870483398438, "loss": 0.6597, "rewards/accuracies": 0.875, "rewards/chosen": -9.579065322875977, "rewards/margins": 18.019031524658203, "rewards/rejected": -27.598098754882812, "step": 2851 }, { "epoch": 1.7741835147744944, "grad_norm": 0.0038406543899327517, "learning_rate": 2.269479022591056e-06, "logits/chosen": 0.727659285068512, "logits/rejected": 3.0023317337036133, "logps/chosen": -624.4636840820312, "logps/rejected": -1009.10302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.819724082946777, "rewards/margins": 27.708839416503906, "rewards/rejected": -34.528564453125, "step": 2852 }, { "epoch": 1.774805598755832, "grad_norm": 0.0031099789775907993, "learning_rate": 2.2683264177040114e-06, "logits/chosen": -1.9862315654754639, "logits/rejected": 2.229929208755493, "logps/chosen": -434.7628479003906, "logps/rejected": -976.1802978515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.795281410217285, "rewards/margins": 25.33068084716797, "rewards/rejected": -30.12596321105957, "step": 2853 }, { "epoch": 1.7754276827371696, "grad_norm": 0.014582900330424309, "learning_rate": 2.2671738128169666e-06, "logits/chosen": -1.2985285520553589, "logits/rejected": 3.906200885772705, "logps/chosen": -451.67852783203125, "logps/rejected": -1184.2401123046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.578330039978027, "rewards/margins": 40.39623260498047, "rewards/rejected": -46.97456359863281, "step": 2854 }, { "epoch": 1.776049766718507, "grad_norm": 0.0002128417690983042, "learning_rate": 2.266021207929922e-06, "logits/chosen": 1.6485921144485474, "logits/rejected": 2.5614266395568848, "logps/chosen": -751.6290893554688, "logps/rejected": -1172.1341552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.660265922546387, "rewards/margins": 33.579036712646484, "rewards/rejected": -49.23930358886719, "step": 2855 }, { "epoch": 1.7766718506998445, "grad_norm": 0.3063058853149414, "learning_rate": 2.264868603042877e-06, "logits/chosen": 0.09333009272813797, "logits/rejected": 3.864189624786377, "logps/chosen": -471.91925048828125, "logps/rejected": -956.8831176757812, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -9.546854972839355, "rewards/margins": 28.44036865234375, "rewards/rejected": -37.987220764160156, "step": 2856 }, { "epoch": 1.777293934681182, "grad_norm": 0.1482161283493042, "learning_rate": 2.2637159981558323e-06, "logits/chosen": 1.1796531677246094, "logits/rejected": 3.714764356613159, "logps/chosen": -590.8246459960938, "logps/rejected": -972.3810424804688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.212776184082031, "rewards/margins": 24.3189697265625, "rewards/rejected": -32.53174591064453, "step": 2857 }, { "epoch": 1.7779160186625194, "grad_norm": 43.31864547729492, "learning_rate": 2.2625633932687875e-06, "logits/chosen": 1.916248083114624, "logits/rejected": 3.472468376159668, "logps/chosen": -631.3201293945312, "logps/rejected": -899.087890625, "loss": 0.7806, "rewards/accuracies": 0.875, "rewards/chosen": -11.413410186767578, "rewards/margins": 16.619535446166992, "rewards/rejected": -28.03294563293457, "step": 2858 }, { "epoch": 1.778538102643857, "grad_norm": 1.5456307664862834e-05, "learning_rate": 2.2614107883817427e-06, "logits/chosen": -1.0370193719863892, "logits/rejected": 3.6356005668640137, "logps/chosen": -546.611572265625, "logps/rejected": -1105.85595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.225702285766602, "rewards/margins": 32.230499267578125, "rewards/rejected": -43.456199645996094, "step": 2859 }, { "epoch": 1.7791601866251945, "grad_norm": 0.20345354080200195, "learning_rate": 2.2602581834946984e-06, "logits/chosen": 1.2066192626953125, "logits/rejected": 4.191374778747559, "logps/chosen": -637.443359375, "logps/rejected": -1108.2515869140625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -9.954766273498535, "rewards/margins": 28.498451232910156, "rewards/rejected": -38.453216552734375, "step": 2860 }, { "epoch": 1.7797822706065318, "grad_norm": 1.7547857761383057, "learning_rate": 2.2591055786076536e-06, "logits/chosen": -0.753386378288269, "logits/rejected": 3.332653045654297, "logps/chosen": -511.5549011230469, "logps/rejected": -918.64013671875, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -5.331133842468262, "rewards/margins": 20.307628631591797, "rewards/rejected": -25.638763427734375, "step": 2861 }, { "epoch": 1.7804043545878694, "grad_norm": 0.1727510541677475, "learning_rate": 2.257952973720609e-06, "logits/chosen": -1.888128399848938, "logits/rejected": 2.080550193786621, "logps/chosen": -435.1989440917969, "logps/rejected": -920.7476806640625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.537738800048828, "rewards/margins": 24.849666595458984, "rewards/rejected": -32.38740539550781, "step": 2862 }, { "epoch": 1.781026438569207, "grad_norm": 1.2717350728053134e-07, "learning_rate": 2.256800368833564e-06, "logits/chosen": 0.18511630594730377, "logits/rejected": 3.744037628173828, "logps/chosen": -495.83209228515625, "logps/rejected": -1016.9049072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.336804389953613, "rewards/margins": 28.20050621032715, "rewards/rejected": -37.53730773925781, "step": 2863 }, { "epoch": 1.7816485225505443, "grad_norm": 10.340002059936523, "learning_rate": 2.2556477639465193e-06, "logits/chosen": 2.4061026573181152, "logits/rejected": 3.0818214416503906, "logps/chosen": -627.86572265625, "logps/rejected": -845.392333984375, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": -10.683109283447266, "rewards/margins": 19.959367752075195, "rewards/rejected": -30.642478942871094, "step": 2864 }, { "epoch": 1.7822706065318819, "grad_norm": 1.0460494756698608, "learning_rate": 2.2544951590594745e-06, "logits/chosen": 1.9467957019805908, "logits/rejected": 3.067915439605713, "logps/chosen": -678.021484375, "logps/rejected": -1033.32421875, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -7.412694931030273, "rewards/margins": 28.45948600769043, "rewards/rejected": -35.87217712402344, "step": 2865 }, { "epoch": 1.7828926905132194, "grad_norm": 0.0009686941630207002, "learning_rate": 2.2533425541724297e-06, "logits/chosen": -0.48395222425460815, "logits/rejected": 2.3878097534179688, "logps/chosen": -477.0278015136719, "logps/rejected": -884.94580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.3412346839904785, "rewards/margins": 25.06332015991211, "rewards/rejected": -32.40455627441406, "step": 2866 }, { "epoch": 1.7835147744945568, "grad_norm": 1.9869872331619263, "learning_rate": 2.2521899492853854e-06, "logits/chosen": -0.11884996294975281, "logits/rejected": 4.540460586547852, "logps/chosen": -442.8230895996094, "logps/rejected": -1045.760986328125, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -7.427252769470215, "rewards/margins": 27.349628448486328, "rewards/rejected": -34.776878356933594, "step": 2867 }, { "epoch": 1.784136858475894, "grad_norm": 5.745584964752197, "learning_rate": 2.2510373443983406e-06, "logits/chosen": -0.665834903717041, "logits/rejected": 2.0192759037017822, "logps/chosen": -589.6995849609375, "logps/rejected": -1074.5523681640625, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": -7.3974528312683105, "rewards/margins": 25.565839767456055, "rewards/rejected": -32.96329116821289, "step": 2868 }, { "epoch": 1.7847589424572319, "grad_norm": 0.004010713193565607, "learning_rate": 2.249884739511296e-06, "logits/chosen": 0.010303676128387451, "logits/rejected": 4.516445159912109, "logps/chosen": -394.00146484375, "logps/rejected": -921.01611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.836312770843506, "rewards/margins": 24.702938079833984, "rewards/rejected": -28.539249420166016, "step": 2869 }, { "epoch": 1.7853810264385692, "grad_norm": 0.0002325717214262113, "learning_rate": 2.248732134624251e-06, "logits/chosen": 0.8579131364822388, "logits/rejected": 4.581618785858154, "logps/chosen": -591.1415405273438, "logps/rejected": -1070.2159423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.982126235961914, "rewards/margins": 30.17152214050293, "rewards/rejected": -42.153648376464844, "step": 2870 }, { "epoch": 1.7860031104199066, "grad_norm": 0.08633695542812347, "learning_rate": 2.2475795297372063e-06, "logits/chosen": 2.029080390930176, "logits/rejected": 4.249919414520264, "logps/chosen": -643.3477172851562, "logps/rejected": -1013.6303100585938, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -9.15836238861084, "rewards/margins": 28.098350524902344, "rewards/rejected": -37.256710052490234, "step": 2871 }, { "epoch": 1.7866251944012441, "grad_norm": 0.13996653258800507, "learning_rate": 2.2464269248501615e-06, "logits/chosen": -0.4132803678512573, "logits/rejected": 3.320643424987793, "logps/chosen": -524.7666015625, "logps/rejected": -1031.8675537109375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.282973289489746, "rewards/margins": 30.53451156616211, "rewards/rejected": -36.81748580932617, "step": 2872 }, { "epoch": 1.7872472783825817, "grad_norm": 0.4101029932498932, "learning_rate": 2.2452743199631167e-06, "logits/chosen": 1.6613099575042725, "logits/rejected": 4.061172008514404, "logps/chosen": -518.1925048828125, "logps/rejected": -979.4036865234375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.9535813331604, "rewards/margins": 31.052183151245117, "rewards/rejected": -38.005767822265625, "step": 2873 }, { "epoch": 1.787869362363919, "grad_norm": 0.15557296574115753, "learning_rate": 2.2441217150760724e-06, "logits/chosen": 1.591304063796997, "logits/rejected": 0.4381207227706909, "logps/chosen": -690.9009399414062, "logps/rejected": -943.1353149414062, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -11.526205062866211, "rewards/margins": 27.176612854003906, "rewards/rejected": -38.70281982421875, "step": 2874 }, { "epoch": 1.7884914463452566, "grad_norm": 16.046310424804688, "learning_rate": 2.2429691101890276e-06, "logits/chosen": 2.2174692153930664, "logits/rejected": 4.0461297035217285, "logps/chosen": -761.5391845703125, "logps/rejected": -1035.3385009765625, "loss": 0.063, "rewards/accuracies": 1.0, "rewards/chosen": -10.166966438293457, "rewards/margins": 18.014434814453125, "rewards/rejected": -28.181400299072266, "step": 2875 }, { "epoch": 1.7891135303265941, "grad_norm": 1.2151827812194824, "learning_rate": 2.241816505301983e-06, "logits/chosen": 0.9016231298446655, "logits/rejected": 3.3903098106384277, "logps/chosen": -545.459716796875, "logps/rejected": -949.28466796875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -7.686226844787598, "rewards/margins": 25.988252639770508, "rewards/rejected": -33.67448043823242, "step": 2876 }, { "epoch": 1.7897356143079315, "grad_norm": 0.03736743703484535, "learning_rate": 2.240663900414938e-06, "logits/chosen": 0.6556278467178345, "logits/rejected": 2.400874137878418, "logps/chosen": -350.68231201171875, "logps/rejected": -622.5861206054688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.515726089477539, "rewards/margins": 19.54558753967285, "rewards/rejected": -25.06131362915039, "step": 2877 }, { "epoch": 1.790357698289269, "grad_norm": 7.153614569688216e-05, "learning_rate": 2.2395112955278933e-06, "logits/chosen": -0.9888482093811035, "logits/rejected": 3.460984230041504, "logps/chosen": -356.6788330078125, "logps/rejected": -922.9345703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.053175449371338, "rewards/margins": 25.149484634399414, "rewards/rejected": -30.202659606933594, "step": 2878 }, { "epoch": 1.7909797822706066, "grad_norm": 4.209404869470745e-05, "learning_rate": 2.2383586906408485e-06, "logits/chosen": 0.8455231189727783, "logits/rejected": 4.013948440551758, "logps/chosen": -500.7359619140625, "logps/rejected": -1064.920166015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.406131744384766, "rewards/margins": 30.79470443725586, "rewards/rejected": -38.200836181640625, "step": 2879 }, { "epoch": 1.791601866251944, "grad_norm": 0.6016518473625183, "learning_rate": 2.2372060857538037e-06, "logits/chosen": -1.218041181564331, "logits/rejected": 1.918828010559082, "logps/chosen": -378.580810546875, "logps/rejected": -893.1238403320312, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -5.370937824249268, "rewards/margins": 26.3651065826416, "rewards/rejected": -31.736042022705078, "step": 2880 }, { "epoch": 1.7922239502332815, "grad_norm": 0.06199437752366066, "learning_rate": 2.236053480866759e-06, "logits/chosen": 1.2523444890975952, "logits/rejected": 3.574066162109375, "logps/chosen": -564.341064453125, "logps/rejected": -926.0437622070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.147315502166748, "rewards/margins": 22.847570419311523, "rewards/rejected": -27.994884490966797, "step": 2881 }, { "epoch": 1.792846034214619, "grad_norm": 0.20693425834178925, "learning_rate": 2.2349008759797146e-06, "logits/chosen": 0.8386412262916565, "logits/rejected": 2.8027706146240234, "logps/chosen": -561.4254150390625, "logps/rejected": -894.841796875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -7.587484359741211, "rewards/margins": 16.834026336669922, "rewards/rejected": -24.421510696411133, "step": 2882 }, { "epoch": 1.7934681181959564, "grad_norm": 0.0012074284022673965, "learning_rate": 2.23374827109267e-06, "logits/chosen": -1.2442289590835571, "logits/rejected": 3.0291409492492676, "logps/chosen": -401.638916015625, "logps/rejected": -942.6603393554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.9297685623168945, "rewards/margins": 30.98455810546875, "rewards/rejected": -38.91432189941406, "step": 2883 }, { "epoch": 1.794090202177294, "grad_norm": 0.03824201226234436, "learning_rate": 2.232595666205625e-06, "logits/chosen": 0.026376813650131226, "logits/rejected": 4.139131546020508, "logps/chosen": -498.52325439453125, "logps/rejected": -1008.8857421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.575507164001465, "rewards/margins": 26.977291107177734, "rewards/rejected": -34.552799224853516, "step": 2884 }, { "epoch": 1.7947122861586315, "grad_norm": 0.002526791300624609, "learning_rate": 2.2314430613185803e-06, "logits/chosen": 0.8544110655784607, "logits/rejected": 3.109708070755005, "logps/chosen": -500.810791015625, "logps/rejected": -909.2841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.763897895812988, "rewards/margins": 28.4268798828125, "rewards/rejected": -33.19077682495117, "step": 2885 }, { "epoch": 1.7953343701399689, "grad_norm": 0.004643771797418594, "learning_rate": 2.2302904564315355e-06, "logits/chosen": -0.9598997831344604, "logits/rejected": 4.1503777503967285, "logps/chosen": -420.0594482421875, "logps/rejected": -983.3630981445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.483625888824463, "rewards/margins": 30.00240707397461, "rewards/rejected": -35.48603439331055, "step": 2886 }, { "epoch": 1.7959564541213062, "grad_norm": 0.023572798818349838, "learning_rate": 2.2291378515444907e-06, "logits/chosen": 0.6749930381774902, "logits/rejected": 3.5571882724761963, "logps/chosen": -455.864501953125, "logps/rejected": -821.3405151367188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.834256172180176, "rewards/margins": 26.151960372924805, "rewards/rejected": -31.986217498779297, "step": 2887 }, { "epoch": 1.796578538102644, "grad_norm": 17.47933578491211, "learning_rate": 2.227985246657446e-06, "logits/chosen": 1.6231681108474731, "logits/rejected": 3.904000997543335, "logps/chosen": -675.1061401367188, "logps/rejected": -1025.37939453125, "loss": 0.1002, "rewards/accuracies": 0.875, "rewards/chosen": -3.872825860977173, "rewards/margins": 26.913835525512695, "rewards/rejected": -30.786663055419922, "step": 2888 }, { "epoch": 1.7972006220839813, "grad_norm": 7.277772510860814e-06, "learning_rate": 2.2268326417704016e-06, "logits/chosen": 0.35814934968948364, "logits/rejected": 2.3936288356781006, "logps/chosen": -615.1605224609375, "logps/rejected": -1072.79541015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.465442657470703, "rewards/margins": 29.792701721191406, "rewards/rejected": -39.25814437866211, "step": 2889 }, { "epoch": 1.7978227060653187, "grad_norm": 0.00016307276382576674, "learning_rate": 2.2256800368833564e-06, "logits/chosen": 0.3945973515510559, "logits/rejected": 3.6338953971862793, "logps/chosen": -555.1265869140625, "logps/rejected": -1015.112548828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.311851501464844, "rewards/margins": 23.719341278076172, "rewards/rejected": -32.03119659423828, "step": 2890 }, { "epoch": 1.7984447900466562, "grad_norm": 0.0038027490954846144, "learning_rate": 2.2245274319963116e-06, "logits/chosen": 0.6874729990959167, "logits/rejected": 2.82660174369812, "logps/chosen": -535.7744140625, "logps/rejected": -943.1077880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.335187911987305, "rewards/margins": 28.816408157348633, "rewards/rejected": -38.15159606933594, "step": 2891 }, { "epoch": 1.7990668740279938, "grad_norm": 1.2254604371264577e-06, "learning_rate": 2.223374827109267e-06, "logits/chosen": 2.0987632274627686, "logits/rejected": 4.418678283691406, "logps/chosen": -547.7008056640625, "logps/rejected": -925.8990478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.061464309692383, "rewards/margins": 27.80548667907715, "rewards/rejected": -35.86695098876953, "step": 2892 }, { "epoch": 1.7996889580093312, "grad_norm": 0.05550967901945114, "learning_rate": 2.222222222222222e-06, "logits/chosen": -0.3484824299812317, "logits/rejected": 2.3431625366210938, "logps/chosen": -382.5537109375, "logps/rejected": -790.096923828125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.595591068267822, "rewards/margins": 22.2501220703125, "rewards/rejected": -26.845712661743164, "step": 2893 }, { "epoch": 1.8003110419906687, "grad_norm": 1.9200422229914693e-07, "learning_rate": 2.2210696173351777e-06, "logits/chosen": 1.3279170989990234, "logits/rejected": 2.890540838241577, "logps/chosen": -693.2498779296875, "logps/rejected": -1096.75, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.892668724060059, "rewards/margins": 33.936702728271484, "rewards/rejected": -45.82937240600586, "step": 2894 }, { "epoch": 1.8009331259720063, "grad_norm": 0.023555485531687737, "learning_rate": 2.219917012448133e-06, "logits/chosen": 0.3982091546058655, "logits/rejected": 2.7005228996276855, "logps/chosen": -511.42529296875, "logps/rejected": -1003.4261474609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.058162689208984, "rewards/margins": 32.33118438720703, "rewards/rejected": -40.389347076416016, "step": 2895 }, { "epoch": 1.8015552099533436, "grad_norm": 0.32222017645835876, "learning_rate": 2.218764407561088e-06, "logits/chosen": 2.824542999267578, "logits/rejected": 3.9358267784118652, "logps/chosen": -637.0455322265625, "logps/rejected": -999.7955932617188, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -8.393836975097656, "rewards/margins": 29.571090698242188, "rewards/rejected": -37.96493148803711, "step": 2896 }, { "epoch": 1.8021772939346812, "grad_norm": 5.3932286391500384e-05, "learning_rate": 2.2176118026740434e-06, "logits/chosen": 0.723086416721344, "logits/rejected": 3.5180208683013916, "logps/chosen": -524.3895263671875, "logps/rejected": -977.4048461914062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.150003433227539, "rewards/margins": 27.52701187133789, "rewards/rejected": -36.6770133972168, "step": 2897 }, { "epoch": 1.8027993779160187, "grad_norm": 1.2058066129684448, "learning_rate": 2.2164591977869986e-06, "logits/chosen": 3.3061118125915527, "logits/rejected": 4.3138508796691895, "logps/chosen": -613.2428588867188, "logps/rejected": -913.3850708007812, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -12.50782585144043, "rewards/margins": 20.202499389648438, "rewards/rejected": -32.7103271484375, "step": 2898 }, { "epoch": 1.803421461897356, "grad_norm": 0.019831910729408264, "learning_rate": 2.215306592899954e-06, "logits/chosen": -0.23215629160404205, "logits/rejected": 0.6817746162414551, "logps/chosen": -534.7474365234375, "logps/rejected": -899.2763061523438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.072684288024902, "rewards/margins": 29.39617156982422, "rewards/rejected": -36.46885681152344, "step": 2899 }, { "epoch": 1.8040435458786936, "grad_norm": 0.02706284634768963, "learning_rate": 2.214153988012909e-06, "logits/chosen": 0.3160700798034668, "logits/rejected": 4.163125991821289, "logps/chosen": -294.6250915527344, "logps/rejected": -732.25732421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.0198540687561035, "rewards/margins": 23.535429000854492, "rewards/rejected": -28.55528450012207, "step": 2900 }, { "epoch": 1.8046656298600312, "grad_norm": 0.303785502910614, "learning_rate": 2.2130013831258647e-06, "logits/chosen": 0.5170513391494751, "logits/rejected": 2.3560218811035156, "logps/chosen": -593.2955322265625, "logps/rejected": -927.296142578125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -8.172282218933105, "rewards/margins": 24.161550521850586, "rewards/rejected": -32.333831787109375, "step": 2901 }, { "epoch": 1.8052877138413685, "grad_norm": 39.95273208618164, "learning_rate": 2.21184877823882e-06, "logits/chosen": 0.7138420343399048, "logits/rejected": 3.270815372467041, "logps/chosen": -482.2649841308594, "logps/rejected": -907.3980102539062, "loss": 1.3448, "rewards/accuracies": 0.875, "rewards/chosen": -8.218573570251465, "rewards/margins": 23.526567459106445, "rewards/rejected": -31.745140075683594, "step": 2902 }, { "epoch": 1.805909797822706, "grad_norm": 0.3934561610221863, "learning_rate": 2.210696173351775e-06, "logits/chosen": -2.243908405303955, "logits/rejected": 1.5933899879455566, "logps/chosen": -375.66131591796875, "logps/rejected": -851.6378784179688, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.891318321228027, "rewards/margins": 26.80710792541504, "rewards/rejected": -31.69842529296875, "step": 2903 }, { "epoch": 1.8065318818040437, "grad_norm": 0.023749463260173798, "learning_rate": 2.2095435684647304e-06, "logits/chosen": 1.9159235954284668, "logits/rejected": 3.28167724609375, "logps/chosen": -548.1072387695312, "logps/rejected": -928.1199951171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.026692390441895, "rewards/margins": 27.322006225585938, "rewards/rejected": -35.348697662353516, "step": 2904 }, { "epoch": 1.807153965785381, "grad_norm": 2.144883394241333, "learning_rate": 2.2083909635776856e-06, "logits/chosen": -0.10407549142837524, "logits/rejected": 4.395783424377441, "logps/chosen": -437.296630859375, "logps/rejected": -834.9159545898438, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -9.79471492767334, "rewards/margins": 17.42055892944336, "rewards/rejected": -27.215274810791016, "step": 2905 }, { "epoch": 1.8077760497667184, "grad_norm": 5.723523432976663e-09, "learning_rate": 2.207238358690641e-06, "logits/chosen": -3.5189616680145264, "logits/rejected": 4.164684295654297, "logps/chosen": -288.29217529296875, "logps/rejected": -1121.138427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.8310956954956055, "rewards/margins": 34.23997497558594, "rewards/rejected": -42.071067810058594, "step": 2906 }, { "epoch": 1.8083981337480561, "grad_norm": 5.7696780686455895e-08, "learning_rate": 2.206085753803596e-06, "logits/chosen": -1.8566597700119019, "logits/rejected": 3.9218180179595947, "logps/chosen": -331.6076354980469, "logps/rejected": -1117.03857421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.040053844451904, "rewards/margins": 40.208106994628906, "rewards/rejected": -46.24816131591797, "step": 2907 }, { "epoch": 1.8090202177293935, "grad_norm": 0.02445216104388237, "learning_rate": 2.2049331489165517e-06, "logits/chosen": -3.414188861846924, "logits/rejected": 3.4807980060577393, "logps/chosen": -381.92279052734375, "logps/rejected": -999.1416625976562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.60594367980957, "rewards/margins": 25.19023895263672, "rewards/rejected": -29.796180725097656, "step": 2908 }, { "epoch": 1.8096423017107308, "grad_norm": 5.386016845703125, "learning_rate": 2.203780544029507e-06, "logits/chosen": -0.25347068905830383, "logits/rejected": 2.615142345428467, "logps/chosen": -591.55712890625, "logps/rejected": -973.9298095703125, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": -7.074479103088379, "rewards/margins": 24.832969665527344, "rewards/rejected": -31.90744972229004, "step": 2909 }, { "epoch": 1.8102643856920684, "grad_norm": 0.0004156986833550036, "learning_rate": 2.202627939142462e-06, "logits/chosen": 2.4946722984313965, "logits/rejected": 3.7645764350891113, "logps/chosen": -769.0728759765625, "logps/rejected": -982.890869140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.196980476379395, "rewards/margins": 22.227109909057617, "rewards/rejected": -34.42408752441406, "step": 2910 }, { "epoch": 1.810886469673406, "grad_norm": 0.0488068051636219, "learning_rate": 2.2014753342554174e-06, "logits/chosen": -0.22399890422821045, "logits/rejected": 3.620121479034424, "logps/chosen": -438.82940673828125, "logps/rejected": -879.4267578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.7626495361328125, "rewards/margins": 23.414827346801758, "rewards/rejected": -30.17747688293457, "step": 2911 }, { "epoch": 1.8115085536547433, "grad_norm": 0.025857780128717422, "learning_rate": 2.2003227293683726e-06, "logits/chosen": -0.0801013708114624, "logits/rejected": 3.76338791847229, "logps/chosen": -366.40399169921875, "logps/rejected": -822.9530029296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.393184661865234, "rewards/margins": 23.762935638427734, "rewards/rejected": -31.15612030029297, "step": 2912 }, { "epoch": 1.8121306376360808, "grad_norm": 2.441636004490988e-09, "learning_rate": 2.199170124481328e-06, "logits/chosen": 1.1459236145019531, "logits/rejected": 4.223677158355713, "logps/chosen": -656.9322509765625, "logps/rejected": -1171.351318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.336624145507812, "rewards/margins": 33.33731460571289, "rewards/rejected": -45.6739387512207, "step": 2913 }, { "epoch": 1.8127527216174184, "grad_norm": 9.004888852359727e-06, "learning_rate": 2.198017519594283e-06, "logits/chosen": -1.1242594718933105, "logits/rejected": 3.08487606048584, "logps/chosen": -347.3804016113281, "logps/rejected": -911.5828247070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.066286563873291, "rewards/margins": 29.527799606323242, "rewards/rejected": -34.594085693359375, "step": 2914 }, { "epoch": 1.8133748055987557, "grad_norm": 9.333149137091823e-06, "learning_rate": 2.1968649147072387e-06, "logits/chosen": 0.8879589438438416, "logits/rejected": 2.641096353530884, "logps/chosen": -611.4288330078125, "logps/rejected": -1026.958251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.871919631958008, "rewards/margins": 27.208112716674805, "rewards/rejected": -41.08003234863281, "step": 2915 }, { "epoch": 1.8139968895800933, "grad_norm": 5.846538066864014, "learning_rate": 2.195712309820194e-06, "logits/chosen": 0.099272221326828, "logits/rejected": 3.766204833984375, "logps/chosen": -506.4801940917969, "logps/rejected": -1010.7666015625, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": -10.206683158874512, "rewards/margins": 27.946022033691406, "rewards/rejected": -38.15270233154297, "step": 2916 }, { "epoch": 1.8146189735614309, "grad_norm": 0.007777743507176638, "learning_rate": 2.194559704933149e-06, "logits/chosen": -0.36318880319595337, "logits/rejected": 1.6212220191955566, "logps/chosen": -473.5294189453125, "logps/rejected": -822.025634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.75701904296875, "rewards/margins": 25.177345275878906, "rewards/rejected": -31.934364318847656, "step": 2917 }, { "epoch": 1.8152410575427682, "grad_norm": 3.3365886338287964e-05, "learning_rate": 2.1934071000461044e-06, "logits/chosen": 1.2993437051773071, "logits/rejected": 3.2163896560668945, "logps/chosen": -691.1989135742188, "logps/rejected": -1104.626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.866948127746582, "rewards/margins": 35.912864685058594, "rewards/rejected": -44.77981185913086, "step": 2918 }, { "epoch": 1.8158631415241058, "grad_norm": 0.006478574126958847, "learning_rate": 2.1922544951590596e-06, "logits/chosen": -2.5405850410461426, "logits/rejected": 1.3741693496704102, "logps/chosen": -430.7892150878906, "logps/rejected": -984.012451171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.790828704833984, "rewards/margins": 29.503963470458984, "rewards/rejected": -37.29479217529297, "step": 2919 }, { "epoch": 1.8164852255054433, "grad_norm": 0.00021122633188497275, "learning_rate": 2.191101890272015e-06, "logits/chosen": -0.9110089540481567, "logits/rejected": 3.0883960723876953, "logps/chosen": -460.5048522949219, "logps/rejected": -1125.7283935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.782463073730469, "rewards/margins": 36.10568618774414, "rewards/rejected": -41.88814926147461, "step": 2920 }, { "epoch": 1.8171073094867807, "grad_norm": 0.003954702522605658, "learning_rate": 2.18994928538497e-06, "logits/chosen": -2.058516025543213, "logits/rejected": 2.3706719875335693, "logps/chosen": -356.7776794433594, "logps/rejected": -962.4862670898438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.8709609508514404, "rewards/margins": 34.379554748535156, "rewards/rejected": -38.25051498413086, "step": 2921 }, { "epoch": 1.8177293934681182, "grad_norm": 1.9064320440520532e-05, "learning_rate": 2.1887966804979253e-06, "logits/chosen": 0.29505228996276855, "logits/rejected": 3.59993314743042, "logps/chosen": -487.88775634765625, "logps/rejected": -985.3267822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.258945465087891, "rewards/margins": 32.4913444519043, "rewards/rejected": -38.75028991699219, "step": 2922 }, { "epoch": 1.8183514774494558, "grad_norm": 28.823314666748047, "learning_rate": 2.187644075610881e-06, "logits/chosen": -0.5781339406967163, "logits/rejected": 2.4247617721557617, "logps/chosen": -513.3623657226562, "logps/rejected": -1042.141845703125, "loss": 0.4131, "rewards/accuracies": 0.875, "rewards/chosen": -7.648505687713623, "rewards/margins": 31.26897430419922, "rewards/rejected": -38.917476654052734, "step": 2923 }, { "epoch": 1.8189735614307931, "grad_norm": 0.005049367900937796, "learning_rate": 2.186491470723836e-06, "logits/chosen": 0.6467719078063965, "logits/rejected": 4.127745151519775, "logps/chosen": -454.96112060546875, "logps/rejected": -907.7899169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.762445449829102, "rewards/margins": 27.605257034301758, "rewards/rejected": -32.36770248413086, "step": 2924 }, { "epoch": 1.8195956454121305, "grad_norm": 0.5330542325973511, "learning_rate": 2.1853388658367914e-06, "logits/chosen": -1.1520477533340454, "logits/rejected": 3.201488971710205, "logps/chosen": -368.5358581542969, "logps/rejected": -902.8729248046875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -3.463862419128418, "rewards/margins": 32.56816864013672, "rewards/rejected": -36.03202819824219, "step": 2925 }, { "epoch": 1.8202177293934683, "grad_norm": 0.12580925226211548, "learning_rate": 2.1841862609497466e-06, "logits/chosen": 1.035218358039856, "logits/rejected": 2.849552631378174, "logps/chosen": -667.6565551757812, "logps/rejected": -1076.946044921875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -10.761651039123535, "rewards/margins": 23.467363357543945, "rewards/rejected": -34.22901153564453, "step": 2926 }, { "epoch": 1.8208398133748056, "grad_norm": 0.020777558907866478, "learning_rate": 2.183033656062702e-06, "logits/chosen": 0.5492273569107056, "logits/rejected": 3.572721481323242, "logps/chosen": -448.18756103515625, "logps/rejected": -810.0396728515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.7897562980651855, "rewards/margins": 19.51482582092285, "rewards/rejected": -26.304582595825195, "step": 2927 }, { "epoch": 1.821461897356143, "grad_norm": 5.220741149969399e-05, "learning_rate": 2.181881051175657e-06, "logits/chosen": 1.1220362186431885, "logits/rejected": 3.1610453128814697, "logps/chosen": -531.509521484375, "logps/rejected": -936.9639892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.363021373748779, "rewards/margins": 30.606876373291016, "rewards/rejected": -36.96989822387695, "step": 2928 }, { "epoch": 1.8220839813374805, "grad_norm": 0.004893193952739239, "learning_rate": 2.1807284462886123e-06, "logits/chosen": 0.5991615653038025, "logits/rejected": 4.349677085876465, "logps/chosen": -508.3122253417969, "logps/rejected": -951.9404907226562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.923589706420898, "rewards/margins": 27.25507164001465, "rewards/rejected": -35.17866134643555, "step": 2929 }, { "epoch": 1.822706065318818, "grad_norm": 0.015286240726709366, "learning_rate": 2.179575841401568e-06, "logits/chosen": 0.10364526510238647, "logits/rejected": 3.790127992630005, "logps/chosen": -611.4168701171875, "logps/rejected": -1101.678955078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.22943115234375, "rewards/margins": 28.932636260986328, "rewards/rejected": -37.16206741333008, "step": 2930 }, { "epoch": 1.8233281493001554, "grad_norm": 2.1218961876456888e-07, "learning_rate": 2.178423236514523e-06, "logits/chosen": -2.1993157863616943, "logits/rejected": 3.7309844493865967, "logps/chosen": -365.35125732421875, "logps/rejected": -1016.9790649414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.300674438476562, "rewards/margins": 31.86829376220703, "rewards/rejected": -40.168968200683594, "step": 2931 }, { "epoch": 1.823950233281493, "grad_norm": 0.011282279156148434, "learning_rate": 2.1772706316274784e-06, "logits/chosen": -1.7666422128677368, "logits/rejected": 1.8686922788619995, "logps/chosen": -422.510498046875, "logps/rejected": -993.54248046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.541168212890625, "rewards/margins": 30.807161331176758, "rewards/rejected": -40.34832763671875, "step": 2932 }, { "epoch": 1.8245723172628305, "grad_norm": 0.0015131094260141253, "learning_rate": 2.1761180267404336e-06, "logits/chosen": -1.056628942489624, "logits/rejected": 3.1450870037078857, "logps/chosen": -361.94989013671875, "logps/rejected": -995.957275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.591294765472412, "rewards/margins": 30.778053283691406, "rewards/rejected": -36.36935043334961, "step": 2933 }, { "epoch": 1.8251944012441679, "grad_norm": 1.8307373523712158, "learning_rate": 2.174965421853389e-06, "logits/chosen": -0.6615819931030273, "logits/rejected": 2.4734442234039307, "logps/chosen": -519.839599609375, "logps/rejected": -949.9840087890625, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -9.32272720336914, "rewards/margins": 23.967910766601562, "rewards/rejected": -33.29063415527344, "step": 2934 }, { "epoch": 1.8258164852255054, "grad_norm": 0.0006201690994203091, "learning_rate": 2.173812816966344e-06, "logits/chosen": 4.023318290710449, "logits/rejected": 3.745248317718506, "logps/chosen": -664.39013671875, "logps/rejected": -877.3580322265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.520995140075684, "rewards/margins": 26.294607162475586, "rewards/rejected": -31.815603256225586, "step": 2935 }, { "epoch": 1.826438569206843, "grad_norm": 0.00011524202272994444, "learning_rate": 2.1726602120792993e-06, "logits/chosen": 0.4154716730117798, "logits/rejected": 4.476164817810059, "logps/chosen": -483.99884033203125, "logps/rejected": -1042.3800048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.124652862548828, "rewards/margins": 30.8934326171875, "rewards/rejected": -39.01808166503906, "step": 2936 }, { "epoch": 1.8270606531881803, "grad_norm": 6.123228073120117, "learning_rate": 2.171507607192255e-06, "logits/chosen": 2.2037289142608643, "logits/rejected": 2.833683490753174, "logps/chosen": -729.1688842773438, "logps/rejected": -1059.06494140625, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": -10.01978588104248, "rewards/margins": 24.942218780517578, "rewards/rejected": -34.962005615234375, "step": 2937 }, { "epoch": 1.827682737169518, "grad_norm": 1.062633714354888e-06, "learning_rate": 2.17035500230521e-06, "logits/chosen": 0.33711880445480347, "logits/rejected": 3.4437479972839355, "logps/chosen": -421.8076171875, "logps/rejected": -1000.681640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.936567306518555, "rewards/margins": 35.271427154541016, "rewards/rejected": -44.2079963684082, "step": 2938 }, { "epoch": 1.8283048211508555, "grad_norm": 1.1224675178527832, "learning_rate": 2.1692023974181654e-06, "logits/chosen": -2.050677537918091, "logits/rejected": 2.308436870574951, "logps/chosen": -409.9349060058594, "logps/rejected": -905.05419921875, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -7.486188888549805, "rewards/margins": 25.246501922607422, "rewards/rejected": -32.732688903808594, "step": 2939 }, { "epoch": 1.8289269051321928, "grad_norm": 6.059678554534912, "learning_rate": 2.1680497925311206e-06, "logits/chosen": 1.1956887245178223, "logits/rejected": 3.2358450889587402, "logps/chosen": -490.0911865234375, "logps/rejected": -807.5560913085938, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": -6.075892448425293, "rewards/margins": 21.591156005859375, "rewards/rejected": -27.667049407958984, "step": 2940 }, { "epoch": 1.8295489891135304, "grad_norm": 0.0003045987687073648, "learning_rate": 2.166897187644076e-06, "logits/chosen": 1.3800091743469238, "logits/rejected": 3.362034797668457, "logps/chosen": -597.3182373046875, "logps/rejected": -1002.1866455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.463356018066406, "rewards/margins": 30.19609832763672, "rewards/rejected": -43.659454345703125, "step": 2941 }, { "epoch": 1.830171073094868, "grad_norm": 0.0026972335763275623, "learning_rate": 2.165744582757031e-06, "logits/chosen": -1.7000247240066528, "logits/rejected": 3.2369625568389893, "logps/chosen": -315.9631652832031, "logps/rejected": -950.5713500976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.224560737609863, "rewards/margins": 28.419212341308594, "rewards/rejected": -33.64377212524414, "step": 2942 }, { "epoch": 1.8307931570762053, "grad_norm": 0.0006514900014735758, "learning_rate": 2.1645919778699863e-06, "logits/chosen": 1.6406525373458862, "logits/rejected": 2.945081949234009, "logps/chosen": -550.1603393554688, "logps/rejected": -932.3032836914062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.099654197692871, "rewards/margins": 28.81879425048828, "rewards/rejected": -37.91844940185547, "step": 2943 }, { "epoch": 1.8314152410575426, "grad_norm": 3.3904993534088135, "learning_rate": 2.1634393729829415e-06, "logits/chosen": 1.3965401649475098, "logits/rejected": 2.7653379440307617, "logps/chosen": -546.5840454101562, "logps/rejected": -861.8883666992188, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": -6.872170925140381, "rewards/margins": 23.634138107299805, "rewards/rejected": -30.506309509277344, "step": 2944 }, { "epoch": 1.8320373250388804, "grad_norm": 0.0024480209685862064, "learning_rate": 2.162286768095897e-06, "logits/chosen": -0.25313782691955566, "logits/rejected": 2.7993435859680176, "logps/chosen": -516.3167724609375, "logps/rejected": -857.3118286132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.381752967834473, "rewards/margins": 21.019704818725586, "rewards/rejected": -29.401458740234375, "step": 2945 }, { "epoch": 1.8326594090202177, "grad_norm": 2.8514961503134373e-09, "learning_rate": 2.1611341632088524e-06, "logits/chosen": -1.2172906398773193, "logits/rejected": 2.9055662155151367, "logps/chosen": -480.29754638671875, "logps/rejected": -1059.2327880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.863322734832764, "rewards/margins": 32.67042922973633, "rewards/rejected": -40.53375244140625, "step": 2946 }, { "epoch": 1.833281493001555, "grad_norm": 3.847206971840933e-05, "learning_rate": 2.1599815583218076e-06, "logits/chosen": 0.5809682011604309, "logits/rejected": 3.172386646270752, "logps/chosen": -445.71368408203125, "logps/rejected": -870.1534423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.905163764953613, "rewards/margins": 29.116558074951172, "rewards/rejected": -36.02172088623047, "step": 2947 }, { "epoch": 1.8339035769828926, "grad_norm": 0.11857543140649796, "learning_rate": 2.158828953434763e-06, "logits/chosen": -0.6377525329589844, "logits/rejected": 3.9645566940307617, "logps/chosen": -480.59234619140625, "logps/rejected": -1024.75390625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.692419528961182, "rewards/margins": 25.763336181640625, "rewards/rejected": -31.45575523376465, "step": 2948 }, { "epoch": 1.8345256609642302, "grad_norm": 5.8594279289245605, "learning_rate": 2.157676348547718e-06, "logits/chosen": -2.2111575603485107, "logits/rejected": 3.7058584690093994, "logps/chosen": -376.0509338378906, "logps/rejected": -938.4148559570312, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": -10.637496948242188, "rewards/margins": 27.1528377532959, "rewards/rejected": -37.79033279418945, "step": 2949 }, { "epoch": 1.8351477449455675, "grad_norm": 0.3535236716270447, "learning_rate": 2.1565237436606733e-06, "logits/chosen": 0.1966608762741089, "logits/rejected": 2.6298699378967285, "logps/chosen": -603.3723754882812, "logps/rejected": -920.42138671875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -10.434402465820312, "rewards/margins": 24.166688919067383, "rewards/rejected": -34.60109329223633, "step": 2950 }, { "epoch": 1.835769828926905, "grad_norm": 1.51795320562087e-05, "learning_rate": 2.1553711387736285e-06, "logits/chosen": -1.9474167823791504, "logits/rejected": 1.8401274681091309, "logps/chosen": -547.075927734375, "logps/rejected": -1213.5771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.760676383972168, "rewards/margins": 37.2432746887207, "rewards/rejected": -52.00395202636719, "step": 2951 }, { "epoch": 1.8363919129082427, "grad_norm": 0.005809712689369917, "learning_rate": 2.154218533886584e-06, "logits/chosen": -1.4853514432907104, "logits/rejected": 3.4599609375, "logps/chosen": -325.3935852050781, "logps/rejected": -844.361572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.286863327026367, "rewards/margins": 23.762916564941406, "rewards/rejected": -30.049781799316406, "step": 2952 }, { "epoch": 1.83701399688958, "grad_norm": 1.4774353076063562e-05, "learning_rate": 2.1530659289995394e-06, "logits/chosen": 0.5670173168182373, "logits/rejected": 3.9193413257598877, "logps/chosen": -564.1218872070312, "logps/rejected": -1089.5821533203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.208772659301758, "rewards/margins": 39.558135986328125, "rewards/rejected": -50.766910552978516, "step": 2953 }, { "epoch": 1.8376360808709176, "grad_norm": 0.0008804783574305475, "learning_rate": 2.1519133241124946e-06, "logits/chosen": -1.3862698078155518, "logits/rejected": 4.039997577667236, "logps/chosen": -367.4657287597656, "logps/rejected": -995.6171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.9530906677246094, "rewards/margins": 34.095638275146484, "rewards/rejected": -38.04873275756836, "step": 2954 }, { "epoch": 1.8382581648522551, "grad_norm": 0.0034357395488768816, "learning_rate": 2.15076071922545e-06, "logits/chosen": 0.04776197671890259, "logits/rejected": 2.4046387672424316, "logps/chosen": -530.6160888671875, "logps/rejected": -937.1896362304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.533966064453125, "rewards/margins": 22.469799041748047, "rewards/rejected": -36.003761291503906, "step": 2955 }, { "epoch": 1.8388802488335925, "grad_norm": 1.5653957234462723e-05, "learning_rate": 2.149608114338405e-06, "logits/chosen": -0.19805996119976044, "logits/rejected": 3.1254231929779053, "logps/chosen": -556.4609375, "logps/rejected": -1079.94775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.580520629882812, "rewards/margins": 30.80333137512207, "rewards/rejected": -43.38385009765625, "step": 2956 }, { "epoch": 1.83950233281493, "grad_norm": 27.43037986755371, "learning_rate": 2.1484555094513603e-06, "logits/chosen": 2.576160430908203, "logits/rejected": 3.2386300563812256, "logps/chosen": -607.7432861328125, "logps/rejected": -908.0758056640625, "loss": 0.374, "rewards/accuracies": 0.875, "rewards/chosen": -10.76134967803955, "rewards/margins": 23.454696655273438, "rewards/rejected": -34.21604919433594, "step": 2957 }, { "epoch": 1.8401244167962676, "grad_norm": 6.398452281951904, "learning_rate": 2.1473029045643155e-06, "logits/chosen": -0.6255086064338684, "logits/rejected": 3.2505578994750977, "logps/chosen": -477.28216552734375, "logps/rejected": -866.9014892578125, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": -8.571666717529297, "rewards/margins": 24.20078468322754, "rewards/rejected": -32.7724494934082, "step": 2958 }, { "epoch": 1.840746500777605, "grad_norm": 7.195072157628601e-06, "learning_rate": 2.1461502996772707e-06, "logits/chosen": -1.8814363479614258, "logits/rejected": 3.9996814727783203, "logps/chosen": -305.8305969238281, "logps/rejected": -1075.4239501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.674073219299316, "rewards/margins": 39.340084075927734, "rewards/rejected": -44.01416015625, "step": 2959 }, { "epoch": 1.8413685847589425, "grad_norm": 3.382543127372628e-06, "learning_rate": 2.144997694790226e-06, "logits/chosen": -0.4814430773258209, "logits/rejected": 3.463700771331787, "logps/chosen": -494.40863037109375, "logps/rejected": -1069.7625732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.398022651672363, "rewards/margins": 35.85649108886719, "rewards/rejected": -44.2545166015625, "step": 2960 }, { "epoch": 1.84199066874028, "grad_norm": 0.47370555996894836, "learning_rate": 2.143845089903181e-06, "logits/chosen": -2.028714895248413, "logits/rejected": 3.4398715496063232, "logps/chosen": -326.89898681640625, "logps/rejected": -946.994873046875, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -5.010538101196289, "rewards/margins": 31.887008666992188, "rewards/rejected": -36.897544860839844, "step": 2961 }, { "epoch": 1.8426127527216174, "grad_norm": 0.9828805923461914, "learning_rate": 2.1426924850161364e-06, "logits/chosen": 2.851536750793457, "logits/rejected": 3.195272445678711, "logps/chosen": -591.3557739257812, "logps/rejected": -907.9046630859375, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -12.238224029541016, "rewards/margins": 25.548540115356445, "rewards/rejected": -37.78676223754883, "step": 2962 }, { "epoch": 1.8432348367029547, "grad_norm": 2.494321194035365e-08, "learning_rate": 2.1415398801290916e-06, "logits/chosen": 0.753627359867096, "logits/rejected": 3.689297914505005, "logps/chosen": -475.31829833984375, "logps/rejected": -1157.6439208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.471179008483887, "rewards/margins": 39.312171936035156, "rewards/rejected": -47.783355712890625, "step": 2963 }, { "epoch": 1.8438569206842925, "grad_norm": 0.002211152808740735, "learning_rate": 2.1403872752420473e-06, "logits/chosen": 0.5270100831985474, "logits/rejected": 2.783477306365967, "logps/chosen": -550.474609375, "logps/rejected": -890.4298706054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.78561782836914, "rewards/margins": 24.508562088012695, "rewards/rejected": -36.29418182373047, "step": 2964 }, { "epoch": 1.8444790046656299, "grad_norm": 21.535110473632812, "learning_rate": 2.1392346703550025e-06, "logits/chosen": -2.338355302810669, "logits/rejected": 0.6437476277351379, "logps/chosen": -490.95294189453125, "logps/rejected": -1015.9381103515625, "loss": 0.1576, "rewards/accuracies": 0.875, "rewards/chosen": -12.412555694580078, "rewards/margins": 26.055999755859375, "rewards/rejected": -38.46855545043945, "step": 2965 }, { "epoch": 1.8451010886469672, "grad_norm": 1.2399395018292125e-05, "learning_rate": 2.1380820654679577e-06, "logits/chosen": 1.6357576847076416, "logits/rejected": 3.4656600952148438, "logps/chosen": -639.3941040039062, "logps/rejected": -1004.3380737304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.001319885253906, "rewards/margins": 29.696578979492188, "rewards/rejected": -37.697898864746094, "step": 2966 }, { "epoch": 1.845723172628305, "grad_norm": 48.81344985961914, "learning_rate": 2.136929460580913e-06, "logits/chosen": 2.8996856212615967, "logits/rejected": 3.9725353717803955, "logps/chosen": -725.06103515625, "logps/rejected": -951.90576171875, "loss": 2.1172, "rewards/accuracies": 0.875, "rewards/chosen": -10.822303771972656, "rewards/margins": 21.377300262451172, "rewards/rejected": -32.19960403442383, "step": 2967 }, { "epoch": 1.8463452566096423, "grad_norm": 0.06352223455905914, "learning_rate": 2.135776855693868e-06, "logits/chosen": -1.567836046218872, "logits/rejected": 3.9059090614318848, "logps/chosen": -474.1429443359375, "logps/rejected": -1246.232666015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.337193489074707, "rewards/margins": 42.75863265991211, "rewards/rejected": -49.0958251953125, "step": 2968 }, { "epoch": 1.8469673405909797, "grad_norm": 0.023700682446360588, "learning_rate": 2.1346242508068234e-06, "logits/chosen": 2.8687596321105957, "logits/rejected": 2.6395351886749268, "logps/chosen": -682.3429565429688, "logps/rejected": -916.9752807617188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.061613082885742, "rewards/margins": 25.777587890625, "rewards/rejected": -34.839202880859375, "step": 2969 }, { "epoch": 1.8475894245723172, "grad_norm": 0.2039632946252823, "learning_rate": 2.1334716459197786e-06, "logits/chosen": 1.3555502891540527, "logits/rejected": 4.361049175262451, "logps/chosen": -466.4350891113281, "logps/rejected": -843.00341796875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -11.455344200134277, "rewards/margins": 20.132835388183594, "rewards/rejected": -31.588180541992188, "step": 2970 }, { "epoch": 1.8482115085536548, "grad_norm": 2.145111999141136e-08, "learning_rate": 2.1323190410327343e-06, "logits/chosen": -0.839304506778717, "logits/rejected": 2.2905335426330566, "logps/chosen": -520.3160400390625, "logps/rejected": -1045.048095703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.831880569458008, "rewards/margins": 34.59035873413086, "rewards/rejected": -42.4222412109375, "step": 2971 }, { "epoch": 1.8488335925349921, "grad_norm": 15.03640365600586, "learning_rate": 2.1311664361456895e-06, "logits/chosen": 0.5526972413063049, "logits/rejected": 2.9396677017211914, "logps/chosen": -543.9615478515625, "logps/rejected": -1039.202392578125, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": -12.499490737915039, "rewards/margins": 31.284423828125, "rewards/rejected": -43.783912658691406, "step": 2972 }, { "epoch": 1.8494556765163297, "grad_norm": 6.252834339193214e-08, "learning_rate": 2.1300138312586447e-06, "logits/chosen": -1.3490384817123413, "logits/rejected": 1.9333854913711548, "logps/chosen": -523.3990478515625, "logps/rejected": -1060.8382568359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.62447738647461, "rewards/margins": 31.01898765563965, "rewards/rejected": -40.643463134765625, "step": 2973 }, { "epoch": 1.8500777604976673, "grad_norm": 0.47883057594299316, "learning_rate": 2.1288612263716e-06, "logits/chosen": 0.83540278673172, "logits/rejected": 3.4483134746551514, "logps/chosen": -649.0042724609375, "logps/rejected": -1084.9862060546875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -9.918928146362305, "rewards/margins": 27.286611557006836, "rewards/rejected": -37.20553970336914, "step": 2974 }, { "epoch": 1.8506998444790046, "grad_norm": 0.08642778545618057, "learning_rate": 2.127708621484555e-06, "logits/chosen": 1.8379980325698853, "logits/rejected": 2.3691537380218506, "logps/chosen": -613.3511962890625, "logps/rejected": -848.9517211914062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -9.631416320800781, "rewards/margins": 22.10049819946289, "rewards/rejected": -31.731916427612305, "step": 2975 }, { "epoch": 1.8513219284603422, "grad_norm": 37.839088439941406, "learning_rate": 2.1265560165975104e-06, "logits/chosen": 1.2364885807037354, "logits/rejected": 2.4372706413269043, "logps/chosen": -585.241455078125, "logps/rejected": -831.3947143554688, "loss": 0.5229, "rewards/accuracies": 0.75, "rewards/chosen": -10.798276901245117, "rewards/margins": 21.3815860748291, "rewards/rejected": -32.17986297607422, "step": 2976 }, { "epoch": 1.8519440124416797, "grad_norm": 0.09387121349573135, "learning_rate": 2.1254034117104656e-06, "logits/chosen": 0.7087660431861877, "logits/rejected": 3.379647970199585, "logps/chosen": -636.924072265625, "logps/rejected": -1199.20947265625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.0158891677856445, "rewards/margins": 37.23964309692383, "rewards/rejected": -43.255531311035156, "step": 2977 }, { "epoch": 1.852566096423017, "grad_norm": 11.74792766571045, "learning_rate": 2.1242508068234213e-06, "logits/chosen": 0.6603624820709229, "logits/rejected": 3.59389591217041, "logps/chosen": -573.1044311523438, "logps/rejected": -1002.54296875, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": -12.622593879699707, "rewards/margins": 24.730194091796875, "rewards/rejected": -37.352787017822266, "step": 2978 }, { "epoch": 1.8531881804043546, "grad_norm": 5.23779344803188e-05, "learning_rate": 2.1230982019363765e-06, "logits/chosen": 0.8842979669570923, "logits/rejected": 3.95865535736084, "logps/chosen": -411.5668640136719, "logps/rejected": -885.6905517578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.89336109161377, "rewards/margins": 30.12948226928711, "rewards/rejected": -39.02284240722656, "step": 2979 }, { "epoch": 1.8538102643856922, "grad_norm": 9.278995513916016, "learning_rate": 2.1219455970493317e-06, "logits/chosen": 2.5439319610595703, "logits/rejected": 4.2560038566589355, "logps/chosen": -725.4393310546875, "logps/rejected": -1119.859375, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": -10.018448829650879, "rewards/margins": 28.870258331298828, "rewards/rejected": -38.888702392578125, "step": 2980 }, { "epoch": 1.8544323483670295, "grad_norm": 5.287654314400925e-09, "learning_rate": 2.120792992162287e-06, "logits/chosen": -1.8039573431015015, "logits/rejected": 2.1132235527038574, "logps/chosen": -485.0125427246094, "logps/rejected": -1005.99072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.288414001464844, "rewards/margins": 36.67291259765625, "rewards/rejected": -46.961326599121094, "step": 2981 }, { "epoch": 1.8550544323483669, "grad_norm": 0.4025889039039612, "learning_rate": 2.119640387275242e-06, "logits/chosen": 2.9755101203918457, "logits/rejected": 4.695067405700684, "logps/chosen": -648.7351684570312, "logps/rejected": -1055.5948486328125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -11.618903160095215, "rewards/margins": 28.467288970947266, "rewards/rejected": -40.08618927001953, "step": 2982 }, { "epoch": 1.8556765163297047, "grad_norm": 36.6614875793457, "learning_rate": 2.1184877823881974e-06, "logits/chosen": 2.5858490467071533, "logits/rejected": 3.5514469146728516, "logps/chosen": -845.2623901367188, "logps/rejected": -1315.9478759765625, "loss": 0.3482, "rewards/accuracies": 0.875, "rewards/chosen": -15.051424026489258, "rewards/margins": 32.73891830444336, "rewards/rejected": -47.790340423583984, "step": 2983 }, { "epoch": 1.856298600311042, "grad_norm": 29.610349655151367, "learning_rate": 2.1173351775011526e-06, "logits/chosen": -0.57902592420578, "logits/rejected": 3.0780577659606934, "logps/chosen": -524.6580810546875, "logps/rejected": -986.159423828125, "loss": 0.3359, "rewards/accuracies": 0.875, "rewards/chosen": -12.902968406677246, "rewards/margins": 26.206405639648438, "rewards/rejected": -39.109375, "step": 2984 }, { "epoch": 1.8569206842923793, "grad_norm": 0.00036348786670714617, "learning_rate": 2.116182572614108e-06, "logits/chosen": 1.5172300338745117, "logits/rejected": 3.862931489944458, "logps/chosen": -601.9778442382812, "logps/rejected": -1014.2064208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.697236061096191, "rewards/margins": 31.744977951049805, "rewards/rejected": -41.44221115112305, "step": 2985 }, { "epoch": 1.8575427682737171, "grad_norm": 0.6304136514663696, "learning_rate": 2.1150299677270635e-06, "logits/chosen": 1.1943728923797607, "logits/rejected": 3.9687066078186035, "logps/chosen": -582.301513671875, "logps/rejected": -1053.023681640625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -7.7523956298828125, "rewards/margins": 34.65993881225586, "rewards/rejected": -42.41233444213867, "step": 2986 }, { "epoch": 1.8581648522550545, "grad_norm": 0.0008032044279389083, "learning_rate": 2.1138773628400187e-06, "logits/chosen": -2.6957528591156006, "logits/rejected": 1.5795737504959106, "logps/chosen": -363.6121826171875, "logps/rejected": -886.4146728515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.061027526855469, "rewards/margins": 30.693744659423828, "rewards/rejected": -36.7547721862793, "step": 2987 }, { "epoch": 1.8587869362363918, "grad_norm": 0.0013717946130782366, "learning_rate": 2.112724757952974e-06, "logits/chosen": -0.3157472610473633, "logits/rejected": 2.1115522384643555, "logps/chosen": -401.7276916503906, "logps/rejected": -775.2745361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.6762285232543945, "rewards/margins": 28.42954444885254, "rewards/rejected": -34.10577392578125, "step": 2988 }, { "epoch": 1.8594090202177294, "grad_norm": 8.54127677030192e-07, "learning_rate": 2.111572153065929e-06, "logits/chosen": 1.5187461376190186, "logits/rejected": 3.9361824989318848, "logps/chosen": -424.12322998046875, "logps/rejected": -852.22802734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.521132946014404, "rewards/margins": 30.52448272705078, "rewards/rejected": -38.045616149902344, "step": 2989 }, { "epoch": 1.860031104199067, "grad_norm": 0.00021709220891352743, "learning_rate": 2.1104195481788844e-06, "logits/chosen": 0.541998028755188, "logits/rejected": 2.679088592529297, "logps/chosen": -595.886962890625, "logps/rejected": -1007.0936279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.939725875854492, "rewards/margins": 26.004108428955078, "rewards/rejected": -38.9438362121582, "step": 2990 }, { "epoch": 1.8606531881804043, "grad_norm": 0.042757876217365265, "learning_rate": 2.1092669432918396e-06, "logits/chosen": 0.20492124557495117, "logits/rejected": 2.4277796745300293, "logps/chosen": -542.8641357421875, "logps/rejected": -1056.92333984375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.894628524780273, "rewards/margins": 35.557220458984375, "rewards/rejected": -45.45185089111328, "step": 2991 }, { "epoch": 1.8612752721617418, "grad_norm": 3.564086675643921, "learning_rate": 2.108114338404795e-06, "logits/chosen": 0.8177498579025269, "logits/rejected": 2.0094380378723145, "logps/chosen": -678.0740966796875, "logps/rejected": -905.52587890625, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -12.322629928588867, "rewards/margins": 17.72020149230957, "rewards/rejected": -30.042829513549805, "step": 2992 }, { "epoch": 1.8618973561430794, "grad_norm": 27.561216354370117, "learning_rate": 2.1069617335177505e-06, "logits/chosen": -2.124131202697754, "logits/rejected": 2.3365206718444824, "logps/chosen": -346.3985290527344, "logps/rejected": -939.9415283203125, "loss": 0.9759, "rewards/accuracies": 0.875, "rewards/chosen": -7.955907821655273, "rewards/margins": 25.818593978881836, "rewards/rejected": -33.77450180053711, "step": 2993 }, { "epoch": 1.8625194401244167, "grad_norm": 2.435544013977051, "learning_rate": 2.1058091286307057e-06, "logits/chosen": -1.4700162410736084, "logits/rejected": 1.1362886428833008, "logps/chosen": -505.6947326660156, "logps/rejected": -871.2249755859375, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": -11.700376510620117, "rewards/margins": 23.613595962524414, "rewards/rejected": -35.31397247314453, "step": 2994 }, { "epoch": 1.8631415241057543, "grad_norm": 7.134440898895264, "learning_rate": 2.104656523743661e-06, "logits/chosen": -0.5249059796333313, "logits/rejected": 2.251185417175293, "logps/chosen": -566.6573486328125, "logps/rejected": -883.6015625, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": -6.398397445678711, "rewards/margins": 25.780942916870117, "rewards/rejected": -32.17934036254883, "step": 2995 }, { "epoch": 1.8637636080870918, "grad_norm": 6.2458311731461436e-06, "learning_rate": 2.103503918856616e-06, "logits/chosen": -1.0155529975891113, "logits/rejected": 3.2504520416259766, "logps/chosen": -421.27655029296875, "logps/rejected": -1078.7958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.56519889831543, "rewards/margins": 31.937530517578125, "rewards/rejected": -40.50273132324219, "step": 2996 }, { "epoch": 1.8643856920684292, "grad_norm": 0.0005573639646172523, "learning_rate": 2.1023513139695714e-06, "logits/chosen": 1.2448923587799072, "logits/rejected": 2.8676905632019043, "logps/chosen": -564.3466186523438, "logps/rejected": -998.9130859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.508292198181152, "rewards/margins": 28.037822723388672, "rewards/rejected": -38.546112060546875, "step": 2997 }, { "epoch": 1.8650077760497668, "grad_norm": 9.827252824834432e-07, "learning_rate": 2.1011987090825266e-06, "logits/chosen": 1.301634669303894, "logits/rejected": 2.054259777069092, "logps/chosen": -562.0250244140625, "logps/rejected": -966.8475341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.389965057373047, "rewards/margins": 34.89493179321289, "rewards/rejected": -44.28489685058594, "step": 2998 }, { "epoch": 1.8656298600311043, "grad_norm": 36.70895004272461, "learning_rate": 2.100046104195482e-06, "logits/chosen": 1.1578892469406128, "logits/rejected": 4.549923896789551, "logps/chosen": -650.369384765625, "logps/rejected": -1236.646728515625, "loss": 0.7256, "rewards/accuracies": 0.875, "rewards/chosen": -10.710254669189453, "rewards/margins": 37.79402160644531, "rewards/rejected": -48.5042724609375, "step": 2999 }, { "epoch": 1.8662519440124417, "grad_norm": 6.277427466727659e-09, "learning_rate": 2.0988934993084375e-06, "logits/chosen": -1.7158828973770142, "logits/rejected": 2.6323788166046143, "logps/chosen": -478.2867431640625, "logps/rejected": -1147.33447265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.997095108032227, "rewards/margins": 36.653255462646484, "rewards/rejected": -45.65034484863281, "step": 3000 }, { "epoch": 1.866874027993779, "grad_norm": 0.0013673205394297838, "learning_rate": 2.0977408944213927e-06, "logits/chosen": 0.34744882583618164, "logits/rejected": 2.0286967754364014, "logps/chosen": -579.9356079101562, "logps/rejected": -969.3897705078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.04820728302002, "rewards/margins": 29.78260040283203, "rewards/rejected": -42.830806732177734, "step": 3001 }, { "epoch": 1.8674961119751168, "grad_norm": 32.87213134765625, "learning_rate": 2.096588289534348e-06, "logits/chosen": 1.6131173372268677, "logits/rejected": 4.298799991607666, "logps/chosen": -596.7760620117188, "logps/rejected": -1074.724853515625, "loss": 0.4465, "rewards/accuracies": 0.875, "rewards/chosen": -14.154102325439453, "rewards/margins": 29.750553131103516, "rewards/rejected": -43.904659271240234, "step": 3002 }, { "epoch": 1.8681181959564541, "grad_norm": 0.20293423533439636, "learning_rate": 2.095435684647303e-06, "logits/chosen": -1.015268325805664, "logits/rejected": 4.6616339683532715, "logps/chosen": -418.6412353515625, "logps/rejected": -1064.1949462890625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -8.623247146606445, "rewards/margins": 35.75114440917969, "rewards/rejected": -44.3743896484375, "step": 3003 }, { "epoch": 1.8687402799377915, "grad_norm": 0.10534223169088364, "learning_rate": 2.0942830797602584e-06, "logits/chosen": -0.0824170708656311, "logits/rejected": 2.3265902996063232, "logps/chosen": -361.28765869140625, "logps/rejected": -665.5305786132812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.109169006347656, "rewards/margins": 13.994619369506836, "rewards/rejected": -23.103790283203125, "step": 3004 }, { "epoch": 1.8693623639191292, "grad_norm": 0.5241131782531738, "learning_rate": 2.0931304748732136e-06, "logits/chosen": 1.0985878705978394, "logits/rejected": 3.5785131454467773, "logps/chosen": -479.8888244628906, "logps/rejected": -926.8046264648438, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -8.3170166015625, "rewards/margins": 29.623743057250977, "rewards/rejected": -37.940765380859375, "step": 3005 }, { "epoch": 1.8699844479004666, "grad_norm": 4.3681578972609714e-05, "learning_rate": 2.091977869986169e-06, "logits/chosen": 1.0532994270324707, "logits/rejected": 4.94828987121582, "logps/chosen": -617.1984252929688, "logps/rejected": -1163.953857421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.137100219726562, "rewards/margins": 31.24260139465332, "rewards/rejected": -40.37969970703125, "step": 3006 }, { "epoch": 1.870606531881804, "grad_norm": 4.708970300271176e-05, "learning_rate": 2.090825265099124e-06, "logits/chosen": 1.2389825582504272, "logits/rejected": 3.942110776901245, "logps/chosen": -384.44927978515625, "logps/rejected": -990.743896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.256569862365723, "rewards/margins": 43.04841613769531, "rewards/rejected": -51.30498504638672, "step": 3007 }, { "epoch": 1.8712286158631415, "grad_norm": 0.00026734487619251013, "learning_rate": 2.0896726602120797e-06, "logits/chosen": -0.2838389277458191, "logits/rejected": 2.5366568565368652, "logps/chosen": -485.586181640625, "logps/rejected": -973.17626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.219608306884766, "rewards/margins": 31.877403259277344, "rewards/rejected": -41.09701156616211, "step": 3008 }, { "epoch": 1.871850699844479, "grad_norm": 0.7158253192901611, "learning_rate": 2.088520055325035e-06, "logits/chosen": 0.17250674962997437, "logits/rejected": 1.608120322227478, "logps/chosen": -633.865966796875, "logps/rejected": -921.3824462890625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -10.764053344726562, "rewards/margins": 23.544639587402344, "rewards/rejected": -34.308692932128906, "step": 3009 }, { "epoch": 1.8724727838258164, "grad_norm": 5.755442543886602e-05, "learning_rate": 2.08736745043799e-06, "logits/chosen": -2.124953031539917, "logits/rejected": 2.750650644302368, "logps/chosen": -467.41168212890625, "logps/rejected": -1126.162841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.77512264251709, "rewards/margins": 39.48204803466797, "rewards/rejected": -51.257171630859375, "step": 3010 }, { "epoch": 1.873094867807154, "grad_norm": 37.73600387573242, "learning_rate": 2.0862148455509454e-06, "logits/chosen": 2.946049451828003, "logits/rejected": 2.970284938812256, "logps/chosen": -768.961181640625, "logps/rejected": -1059.467529296875, "loss": 0.3578, "rewards/accuracies": 0.875, "rewards/chosen": -16.543516159057617, "rewards/margins": 29.980506896972656, "rewards/rejected": -46.52402114868164, "step": 3011 }, { "epoch": 1.8737169517884915, "grad_norm": 9.90644645690918, "learning_rate": 2.0850622406639006e-06, "logits/chosen": -1.2075892686843872, "logits/rejected": 3.742737054824829, "logps/chosen": -424.56170654296875, "logps/rejected": -896.1334838867188, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": -4.003117561340332, "rewards/margins": 22.584789276123047, "rewards/rejected": -26.587905883789062, "step": 3012 }, { "epoch": 1.8743390357698289, "grad_norm": 1.2684682815233828e-07, "learning_rate": 2.083909635776856e-06, "logits/chosen": 0.23507288098335266, "logits/rejected": 3.0083565711975098, "logps/chosen": -632.8001708984375, "logps/rejected": -1120.076416015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.256636619567871, "rewards/margins": 25.77067756652832, "rewards/rejected": -38.02731704711914, "step": 3013 }, { "epoch": 1.8749611197511664, "grad_norm": 1.4789420366287231, "learning_rate": 2.082757030889811e-06, "logits/chosen": 1.3540959358215332, "logits/rejected": 2.8773014545440674, "logps/chosen": -539.6065673828125, "logps/rejected": -920.03271484375, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -8.775294303894043, "rewards/margins": 28.458703994750977, "rewards/rejected": -37.23400115966797, "step": 3014 }, { "epoch": 1.875583203732504, "grad_norm": 5.803546088145595e-08, "learning_rate": 2.0816044260027667e-06, "logits/chosen": 1.6014816761016846, "logits/rejected": 2.7704672813415527, "logps/chosen": -544.0638427734375, "logps/rejected": -985.9021606445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.202249526977539, "rewards/margins": 32.040164947509766, "rewards/rejected": -42.24241638183594, "step": 3015 }, { "epoch": 1.8762052877138413, "grad_norm": 0.0022250423207879066, "learning_rate": 2.080451821115722e-06, "logits/chosen": -0.5401831269264221, "logits/rejected": 2.8969578742980957, "logps/chosen": -490.593994140625, "logps/rejected": -882.6023559570312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.626242637634277, "rewards/margins": 24.396589279174805, "rewards/rejected": -32.022830963134766, "step": 3016 }, { "epoch": 1.8768273716951789, "grad_norm": 3.121169356745668e-05, "learning_rate": 2.079299216228677e-06, "logits/chosen": -1.2195719480514526, "logits/rejected": 3.7740817070007324, "logps/chosen": -373.0020446777344, "logps/rejected": -971.7380981445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.059677124023438, "rewards/margins": 32.931053161621094, "rewards/rejected": -40.99073028564453, "step": 3017 }, { "epoch": 1.8774494556765164, "grad_norm": 7.038599014282227, "learning_rate": 2.0781466113416324e-06, "logits/chosen": 2.310530185699463, "logits/rejected": 3.1102864742279053, "logps/chosen": -638.824951171875, "logps/rejected": -905.7316284179688, "loss": 0.0928, "rewards/accuracies": 0.875, "rewards/chosen": -10.308221817016602, "rewards/margins": 23.656431198120117, "rewards/rejected": -33.96465301513672, "step": 3018 }, { "epoch": 1.8780715396578538, "grad_norm": 0.0008783753728494048, "learning_rate": 2.0769940064545876e-06, "logits/chosen": 2.319756269454956, "logits/rejected": 4.028102874755859, "logps/chosen": -705.4632568359375, "logps/rejected": -1112.9202880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.343297958374023, "rewards/margins": 29.65229606628418, "rewards/rejected": -42.9955940246582, "step": 3019 }, { "epoch": 1.8786936236391913, "grad_norm": 0.14819443225860596, "learning_rate": 2.075841401567543e-06, "logits/chosen": 1.2726922035217285, "logits/rejected": 2.144050121307373, "logps/chosen": -630.6361083984375, "logps/rejected": -954.7571411132812, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -12.983515739440918, "rewards/margins": 23.956439971923828, "rewards/rejected": -36.9399528503418, "step": 3020 }, { "epoch": 1.879315707620529, "grad_norm": 0.07809139043092728, "learning_rate": 2.074688796680498e-06, "logits/chosen": 2.2689342498779297, "logits/rejected": 4.868353843688965, "logps/chosen": -604.720458984375, "logps/rejected": -1018.6021118164062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.717222690582275, "rewards/margins": 28.454551696777344, "rewards/rejected": -35.171775817871094, "step": 3021 }, { "epoch": 1.8799377916018662, "grad_norm": 9.459550346946344e-05, "learning_rate": 2.0735361917934537e-06, "logits/chosen": 1.7424232959747314, "logits/rejected": 3.0601165294647217, "logps/chosen": -488.897216796875, "logps/rejected": -926.98291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.733386993408203, "rewards/margins": 33.84270477294922, "rewards/rejected": -41.57609558105469, "step": 3022 }, { "epoch": 1.8805598755832036, "grad_norm": 0.04000190272927284, "learning_rate": 2.072383586906409e-06, "logits/chosen": 1.3352985382080078, "logits/rejected": 3.4352452754974365, "logps/chosen": -559.6813354492188, "logps/rejected": -924.271728515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.983686447143555, "rewards/margins": 25.625911712646484, "rewards/rejected": -37.609596252441406, "step": 3023 }, { "epoch": 1.8811819595645414, "grad_norm": 0.10560665279626846, "learning_rate": 2.071230982019364e-06, "logits/chosen": 1.071738600730896, "logits/rejected": 2.951690196990967, "logps/chosen": -527.6966552734375, "logps/rejected": -841.054443359375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -10.558955192565918, "rewards/margins": 20.486169815063477, "rewards/rejected": -31.04512596130371, "step": 3024 }, { "epoch": 1.8818040435458787, "grad_norm": 0.0009212340810336173, "learning_rate": 2.070078377132319e-06, "logits/chosen": -1.7650898694992065, "logits/rejected": 1.4234349727630615, "logps/chosen": -414.21185302734375, "logps/rejected": -1025.5460205078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.888592720031738, "rewards/margins": 40.60430145263672, "rewards/rejected": -50.49289321899414, "step": 3025 }, { "epoch": 1.882426127527216, "grad_norm": 0.12818951904773712, "learning_rate": 2.068925772245274e-06, "logits/chosen": 2.31978178024292, "logits/rejected": 3.1467909812927246, "logps/chosen": -750.0935668945312, "logps/rejected": -1013.3674926757812, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -14.906643867492676, "rewards/margins": 22.728199005126953, "rewards/rejected": -37.63484191894531, "step": 3026 }, { "epoch": 1.8830482115085536, "grad_norm": 0.23764050006866455, "learning_rate": 2.06777316735823e-06, "logits/chosen": -1.2640528678894043, "logits/rejected": 2.4256398677825928, "logps/chosen": -436.44805908203125, "logps/rejected": -976.5224609375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.2282075881958, "rewards/margins": 34.364280700683594, "rewards/rejected": -42.59248733520508, "step": 3027 }, { "epoch": 1.8836702954898912, "grad_norm": 0.12176292389631271, "learning_rate": 2.066620562471185e-06, "logits/chosen": 0.8552002906799316, "logits/rejected": 4.851562023162842, "logps/chosen": -543.6951904296875, "logps/rejected": -1165.0093994140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -11.13092041015625, "rewards/margins": 32.57614517211914, "rewards/rejected": -43.707069396972656, "step": 3028 }, { "epoch": 1.8842923794712285, "grad_norm": 0.0009258187492378056, "learning_rate": 2.0654679575841403e-06, "logits/chosen": -1.5283629894256592, "logits/rejected": 2.731228828430176, "logps/chosen": -348.7073669433594, "logps/rejected": -837.8958740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.846237659454346, "rewards/margins": 25.294355392456055, "rewards/rejected": -30.140592575073242, "step": 3029 }, { "epoch": 1.884914463452566, "grad_norm": 0.0059629688039422035, "learning_rate": 2.0643153526970955e-06, "logits/chosen": 0.30039146542549133, "logits/rejected": 3.9497108459472656, "logps/chosen": -519.6970825195312, "logps/rejected": -1043.4434814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.718129634857178, "rewards/margins": 31.629749298095703, "rewards/rejected": -39.347877502441406, "step": 3030 }, { "epoch": 1.8855365474339036, "grad_norm": 8.508398605044931e-05, "learning_rate": 2.0631627478100507e-06, "logits/chosen": -2.2366878986358643, "logits/rejected": 0.8939638733863831, "logps/chosen": -473.6304931640625, "logps/rejected": -957.8397827148438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.143477439880371, "rewards/margins": 27.332979202270508, "rewards/rejected": -38.47645568847656, "step": 3031 }, { "epoch": 1.886158631415241, "grad_norm": 0.000677596777677536, "learning_rate": 2.062010142923006e-06, "logits/chosen": 2.1652817726135254, "logits/rejected": 3.804422616958618, "logps/chosen": -669.2681884765625, "logps/rejected": -1044.4951171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.551870346069336, "rewards/margins": 30.823772430419922, "rewards/rejected": -42.375640869140625, "step": 3032 }, { "epoch": 1.8867807153965785, "grad_norm": 2.0379013221827336e-05, "learning_rate": 2.060857538035961e-06, "logits/chosen": 2.494349479675293, "logits/rejected": 3.617753505706787, "logps/chosen": -645.7994384765625, "logps/rejected": -1071.411376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.979326248168945, "rewards/margins": 29.537216186523438, "rewards/rejected": -43.516544342041016, "step": 3033 }, { "epoch": 1.887402799377916, "grad_norm": 4.401593969305395e-07, "learning_rate": 2.059704933148917e-06, "logits/chosen": 1.1647454500198364, "logits/rejected": 4.53623628616333, "logps/chosen": -665.0794067382812, "logps/rejected": -1135.864501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.783852577209473, "rewards/margins": 31.66611099243164, "rewards/rejected": -46.44996643066406, "step": 3034 }, { "epoch": 1.8880248833592534, "grad_norm": 2.0084574222564697, "learning_rate": 2.058552328261872e-06, "logits/chosen": 1.410960078239441, "logits/rejected": 1.4644067287445068, "logps/chosen": -776.1810302734375, "logps/rejected": -915.4017333984375, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -9.6139497756958, "rewards/margins": 21.933719635009766, "rewards/rejected": -31.547666549682617, "step": 3035 }, { "epoch": 1.888646967340591, "grad_norm": 0.4876706600189209, "learning_rate": 2.0573997233748273e-06, "logits/chosen": -1.973346471786499, "logits/rejected": 3.7605323791503906, "logps/chosen": -389.125244140625, "logps/rejected": -962.2559814453125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -10.528801918029785, "rewards/margins": 30.331979751586914, "rewards/rejected": -40.860782623291016, "step": 3036 }, { "epoch": 1.8892690513219286, "grad_norm": 0.0003601381031330675, "learning_rate": 2.0562471184877825e-06, "logits/chosen": 2.1486668586730957, "logits/rejected": 3.886343002319336, "logps/chosen": -646.3818359375, "logps/rejected": -1020.0104370117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.100616455078125, "rewards/margins": 27.22791290283203, "rewards/rejected": -37.328529357910156, "step": 3037 }, { "epoch": 1.889891135303266, "grad_norm": 0.005591857247054577, "learning_rate": 2.0550945136007377e-06, "logits/chosen": -1.573172926902771, "logits/rejected": 1.094953179359436, "logps/chosen": -499.50726318359375, "logps/rejected": -951.7015380859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.768106937408447, "rewards/margins": 31.28192901611328, "rewards/rejected": -39.05003356933594, "step": 3038 }, { "epoch": 1.8905132192846035, "grad_norm": 2.937760591506958, "learning_rate": 2.053941908713693e-06, "logits/chosen": 1.1670812368392944, "logits/rejected": 2.585752010345459, "logps/chosen": -494.4090576171875, "logps/rejected": -959.474853515625, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -11.01852798461914, "rewards/margins": 28.86224365234375, "rewards/rejected": -39.880775451660156, "step": 3039 }, { "epoch": 1.891135303265941, "grad_norm": 0.011783706955611706, "learning_rate": 2.052789303826648e-06, "logits/chosen": -1.6418266296386719, "logits/rejected": 3.052611827850342, "logps/chosen": -261.74237060546875, "logps/rejected": -709.05126953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.9691755771636963, "rewards/margins": 29.911666870117188, "rewards/rejected": -33.88084411621094, "step": 3040 }, { "epoch": 1.8917573872472784, "grad_norm": 8.802350748737808e-06, "learning_rate": 2.051636698939604e-06, "logits/chosen": -1.3013746738433838, "logits/rejected": 3.662790060043335, "logps/chosen": -374.2080993652344, "logps/rejected": -957.27783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.102154731750488, "rewards/margins": 32.80384063720703, "rewards/rejected": -37.9059944152832, "step": 3041 }, { "epoch": 1.8923794712286157, "grad_norm": 0.0001305158803006634, "learning_rate": 2.050484094052559e-06, "logits/chosen": 0.9714987874031067, "logits/rejected": 3.3573224544525146, "logps/chosen": -463.06201171875, "logps/rejected": -1027.607177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.532149314880371, "rewards/margins": 34.60115051269531, "rewards/rejected": -45.13330078125, "step": 3042 }, { "epoch": 1.8930015552099535, "grad_norm": 3.116247171419673e-05, "learning_rate": 2.0493314891655143e-06, "logits/chosen": -4.505119323730469, "logits/rejected": 1.4350619316101074, "logps/chosen": -233.6446075439453, "logps/rejected": -951.873291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.3743133544921875, "rewards/margins": 34.8604850769043, "rewards/rejected": -41.23480224609375, "step": 3043 }, { "epoch": 1.8936236391912908, "grad_norm": 6.368558729263896e-07, "learning_rate": 2.0481788842784695e-06, "logits/chosen": -0.6455403566360474, "logits/rejected": 2.493967056274414, "logps/chosen": -473.7528991699219, "logps/rejected": -1045.281982421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.817072868347168, "rewards/margins": 33.52661895751953, "rewards/rejected": -43.343692779541016, "step": 3044 }, { "epoch": 1.8942457231726282, "grad_norm": 35.97359085083008, "learning_rate": 2.0470262793914247e-06, "logits/chosen": 0.24772866070270538, "logits/rejected": 3.34269642829895, "logps/chosen": -497.4903259277344, "logps/rejected": -1017.6675415039062, "loss": 0.5899, "rewards/accuracies": 0.875, "rewards/chosen": -7.941030025482178, "rewards/margins": 32.35401916503906, "rewards/rejected": -40.2950439453125, "step": 3045 }, { "epoch": 1.8948678071539657, "grad_norm": 0.0004061897052451968, "learning_rate": 2.04587367450438e-06, "logits/chosen": 0.5643148422241211, "logits/rejected": 1.8211342096328735, "logps/chosen": -652.9474487304688, "logps/rejected": -934.273193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.59850788116455, "rewards/margins": 28.603839874267578, "rewards/rejected": -37.20234680175781, "step": 3046 }, { "epoch": 1.8954898911353033, "grad_norm": 4.04508637075196e-06, "learning_rate": 2.044721069617335e-06, "logits/chosen": 0.7040514945983887, "logits/rejected": 3.3264479637145996, "logps/chosen": -418.761474609375, "logps/rejected": -1029.391845703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.296758651733398, "rewards/margins": 40.41428756713867, "rewards/rejected": -49.71105194091797, "step": 3047 }, { "epoch": 1.8961119751166406, "grad_norm": 0.11154375970363617, "learning_rate": 2.0435684647302904e-06, "logits/chosen": 1.0263574123382568, "logits/rejected": 3.8444151878356934, "logps/chosen": -580.5594482421875, "logps/rejected": -1068.85302734375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -11.889599800109863, "rewards/margins": 32.126556396484375, "rewards/rejected": -44.01615905761719, "step": 3048 }, { "epoch": 1.8967340590979782, "grad_norm": 0.3560849130153656, "learning_rate": 2.042415859843246e-06, "logits/chosen": -0.8153203725814819, "logits/rejected": 3.3516063690185547, "logps/chosen": -435.34503173828125, "logps/rejected": -901.9808959960938, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -7.1541008949279785, "rewards/margins": 23.67719078063965, "rewards/rejected": -30.83129119873047, "step": 3049 }, { "epoch": 1.8973561430793158, "grad_norm": 0.004634470213204622, "learning_rate": 2.0412632549562013e-06, "logits/chosen": 2.4827990531921387, "logits/rejected": 3.3610711097717285, "logps/chosen": -683.9464111328125, "logps/rejected": -1042.1004638671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.414421081542969, "rewards/margins": 26.508174896240234, "rewards/rejected": -36.9225959777832, "step": 3050 }, { "epoch": 1.8979782270606531, "grad_norm": 9.769371899892576e-06, "learning_rate": 2.0401106500691565e-06, "logits/chosen": -0.8136694431304932, "logits/rejected": 3.5954878330230713, "logps/chosen": -504.87060546875, "logps/rejected": -1096.439208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.774595260620117, "rewards/margins": 35.71419143676758, "rewards/rejected": -44.488792419433594, "step": 3051 }, { "epoch": 1.8986003110419907, "grad_norm": 0.7546086311340332, "learning_rate": 2.0389580451821117e-06, "logits/chosen": 0.20620578527450562, "logits/rejected": 2.4939072132110596, "logps/chosen": -550.8204345703125, "logps/rejected": -966.3890380859375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -9.345878601074219, "rewards/margins": 29.434968948364258, "rewards/rejected": -38.780845642089844, "step": 3052 }, { "epoch": 1.8992223950233282, "grad_norm": 8.60761547088623, "learning_rate": 2.037805440295067e-06, "logits/chosen": 0.5855912566184998, "logits/rejected": 3.697741746902466, "logps/chosen": -476.2772216796875, "logps/rejected": -933.6175537109375, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": -10.937131881713867, "rewards/margins": 27.20270538330078, "rewards/rejected": -38.139835357666016, "step": 3053 }, { "epoch": 1.8998444790046656, "grad_norm": 1.1274566531938035e-05, "learning_rate": 2.036652835408022e-06, "logits/chosen": 0.19531968235969543, "logits/rejected": 2.492250919342041, "logps/chosen": -621.8941650390625, "logps/rejected": -1169.3470458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.341142654418945, "rewards/margins": 36.63648223876953, "rewards/rejected": -48.977622985839844, "step": 3054 }, { "epoch": 1.9004665629860031, "grad_norm": 0.10121874511241913, "learning_rate": 2.0355002305209774e-06, "logits/chosen": -0.518656849861145, "logits/rejected": 2.535217523574829, "logps/chosen": -494.6761779785156, "logps/rejected": -863.6760864257812, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -6.946184158325195, "rewards/margins": 22.766103744506836, "rewards/rejected": -29.71228790283203, "step": 3055 }, { "epoch": 1.9010886469673407, "grad_norm": 0.0062729050405323505, "learning_rate": 2.034347625633933e-06, "logits/chosen": -0.871635377407074, "logits/rejected": 4.4975104331970215, "logps/chosen": -346.7241516113281, "logps/rejected": -954.4545288085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.209202766418457, "rewards/margins": 28.649002075195312, "rewards/rejected": -35.85820388793945, "step": 3056 }, { "epoch": 1.901710730948678, "grad_norm": 2.5780183321444383e-08, "learning_rate": 2.0331950207468883e-06, "logits/chosen": 0.47149020433425903, "logits/rejected": 4.374908447265625, "logps/chosen": -649.23583984375, "logps/rejected": -1248.1553955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.42192554473877, "rewards/margins": 35.7809944152832, "rewards/rejected": -48.202919006347656, "step": 3057 }, { "epoch": 1.9023328149300156, "grad_norm": 3.1426994340222336e-09, "learning_rate": 2.0320424158598435e-06, "logits/chosen": -1.6505568027496338, "logits/rejected": 1.8247491121292114, "logps/chosen": -557.5438232421875, "logps/rejected": -1146.343505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.328878402709961, "rewards/margins": 38.459896087646484, "rewards/rejected": -48.78877258300781, "step": 3058 }, { "epoch": 1.9029548989113532, "grad_norm": 6.946622477244091e-08, "learning_rate": 2.0308898109727987e-06, "logits/chosen": -1.7206653356552124, "logits/rejected": 3.2438573837280273, "logps/chosen": -463.35784912109375, "logps/rejected": -1229.4114990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.979928970336914, "rewards/margins": 44.707908630371094, "rewards/rejected": -56.687835693359375, "step": 3059 }, { "epoch": 1.9035769828926905, "grad_norm": 0.12513019144535065, "learning_rate": 2.029737206085754e-06, "logits/chosen": -1.8191311359405518, "logits/rejected": 2.4682679176330566, "logps/chosen": -370.4996643066406, "logps/rejected": -905.6737060546875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -6.9366374015808105, "rewards/margins": 27.555641174316406, "rewards/rejected": -34.492279052734375, "step": 3060 }, { "epoch": 1.9041990668740278, "grad_norm": 2.9927703508292325e-05, "learning_rate": 2.028584601198709e-06, "logits/chosen": -1.5963267087936401, "logits/rejected": 3.524061679840088, "logps/chosen": -300.2033386230469, "logps/rejected": -916.1325073242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.946621894836426, "rewards/margins": 32.93095016479492, "rewards/rejected": -38.87757110595703, "step": 3061 }, { "epoch": 1.9048211508553656, "grad_norm": 0.006748128682374954, "learning_rate": 2.0274319963116644e-06, "logits/chosen": 1.2222062349319458, "logits/rejected": 3.575316905975342, "logps/chosen": -649.7758178710938, "logps/rejected": -1057.25732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.95299243927002, "rewards/margins": 27.27806282043457, "rewards/rejected": -42.231056213378906, "step": 3062 }, { "epoch": 1.905443234836703, "grad_norm": 20.295888900756836, "learning_rate": 2.02627939142462e-06, "logits/chosen": 2.645158052444458, "logits/rejected": 4.364293575286865, "logps/chosen": -514.44384765625, "logps/rejected": -851.8877563476562, "loss": 0.7349, "rewards/accuracies": 0.875, "rewards/chosen": -7.85225772857666, "rewards/margins": 20.194347381591797, "rewards/rejected": -28.04660415649414, "step": 3063 }, { "epoch": 1.9060653188180403, "grad_norm": 0.026905400678515434, "learning_rate": 2.0251267865375752e-06, "logits/chosen": -2.920731544494629, "logits/rejected": 3.2363901138305664, "logps/chosen": -209.95999145507812, "logps/rejected": -882.4180297851562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.916708946228027, "rewards/margins": 32.43791580200195, "rewards/rejected": -37.35462951660156, "step": 3064 }, { "epoch": 1.9066874027993779, "grad_norm": 1.3152927749615628e-05, "learning_rate": 2.0239741816505305e-06, "logits/chosen": -0.07215797901153564, "logits/rejected": 3.6048831939697266, "logps/chosen": -531.9854736328125, "logps/rejected": -1120.064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.932154655456543, "rewards/margins": 31.065196990966797, "rewards/rejected": -39.99734878540039, "step": 3065 }, { "epoch": 1.9073094867807154, "grad_norm": 22.387847900390625, "learning_rate": 2.0228215767634857e-06, "logits/chosen": 1.1178255081176758, "logits/rejected": 2.6430320739746094, "logps/chosen": -586.69921875, "logps/rejected": -957.328125, "loss": 0.1829, "rewards/accuracies": 0.875, "rewards/chosen": -9.733922958374023, "rewards/margins": 27.749778747558594, "rewards/rejected": -37.48370361328125, "step": 3066 }, { "epoch": 1.9079315707620528, "grad_norm": 7.578842442512723e-09, "learning_rate": 2.021668971876441e-06, "logits/chosen": 3.0557453632354736, "logits/rejected": 3.5825586318969727, "logps/chosen": -559.4607543945312, "logps/rejected": -944.7230224609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.9096903800964355, "rewards/margins": 32.48735809326172, "rewards/rejected": -39.39704895019531, "step": 3067 }, { "epoch": 1.9085536547433903, "grad_norm": 18.07485580444336, "learning_rate": 2.020516366989396e-06, "logits/chosen": 3.178652763366699, "logits/rejected": 2.9502663612365723, "logps/chosen": -688.4437255859375, "logps/rejected": -898.4486694335938, "loss": 0.0585, "rewards/accuracies": 1.0, "rewards/chosen": -11.985757827758789, "rewards/margins": 20.856191635131836, "rewards/rejected": -32.841949462890625, "step": 3068 }, { "epoch": 1.909175738724728, "grad_norm": 0.6955883502960205, "learning_rate": 2.0193637621023514e-06, "logits/chosen": 2.94408917427063, "logits/rejected": 2.766674041748047, "logps/chosen": -792.5679931640625, "logps/rejected": -956.5133056640625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -15.01635456085205, "rewards/margins": 19.517358779907227, "rewards/rejected": -34.533714294433594, "step": 3069 }, { "epoch": 1.9097978227060652, "grad_norm": 15.6476469039917, "learning_rate": 2.0182111572153066e-06, "logits/chosen": 1.1466944217681885, "logits/rejected": 3.3508975505828857, "logps/chosen": -475.7377014160156, "logps/rejected": -933.6702270507812, "loss": 0.0944, "rewards/accuracies": 0.875, "rewards/chosen": -7.101260185241699, "rewards/margins": 30.59676742553711, "rewards/rejected": -37.698028564453125, "step": 3070 }, { "epoch": 1.9104199066874028, "grad_norm": 0.03882289305329323, "learning_rate": 2.0170585523282622e-06, "logits/chosen": 0.9645749926567078, "logits/rejected": 3.7685093879699707, "logps/chosen": -419.4703674316406, "logps/rejected": -873.78564453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.286841869354248, "rewards/margins": 28.846656799316406, "rewards/rejected": -34.13349914550781, "step": 3071 }, { "epoch": 1.9110419906687404, "grad_norm": 23.7106990814209, "learning_rate": 2.0159059474412175e-06, "logits/chosen": 4.3552751541137695, "logits/rejected": 5.19343376159668, "logps/chosen": -798.3819580078125, "logps/rejected": -1081.6822509765625, "loss": 0.3422, "rewards/accuracies": 0.875, "rewards/chosen": -15.164212226867676, "rewards/margins": 20.923892974853516, "rewards/rejected": -36.088104248046875, "step": 3072 }, { "epoch": 1.9116640746500777, "grad_norm": 0.0004619990650098771, "learning_rate": 2.0147533425541727e-06, "logits/chosen": 2.130751848220825, "logits/rejected": 4.413289546966553, "logps/chosen": -547.0361328125, "logps/rejected": -903.9354248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.0716118812561035, "rewards/margins": 24.975723266601562, "rewards/rejected": -32.047332763671875, "step": 3073 }, { "epoch": 1.9122861586314153, "grad_norm": 2.6988864476606977e-08, "learning_rate": 2.013600737667128e-06, "logits/chosen": 1.7085301876068115, "logits/rejected": 4.056130409240723, "logps/chosen": -657.5714111328125, "logps/rejected": -1185.6339111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.055585861206055, "rewards/margins": 36.13258743286133, "rewards/rejected": -46.18817138671875, "step": 3074 }, { "epoch": 1.9129082426127528, "grad_norm": 1.0772266705316724e-06, "learning_rate": 2.012448132780083e-06, "logits/chosen": 1.6624712944030762, "logits/rejected": 4.163967609405518, "logps/chosen": -429.5469970703125, "logps/rejected": -896.1236572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.713283061981201, "rewards/margins": 30.268901824951172, "rewards/rejected": -37.98218536376953, "step": 3075 }, { "epoch": 1.9135303265940902, "grad_norm": 0.003041618037968874, "learning_rate": 2.0112955278930384e-06, "logits/chosen": 2.202765703201294, "logits/rejected": 4.012975215911865, "logps/chosen": -629.578857421875, "logps/rejected": -1021.6544189453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.357513427734375, "rewards/margins": 25.704200744628906, "rewards/rejected": -35.06171417236328, "step": 3076 }, { "epoch": 1.9141524105754277, "grad_norm": 3.149693727493286, "learning_rate": 2.0101429230059936e-06, "logits/chosen": 2.2744429111480713, "logits/rejected": 2.2045953273773193, "logps/chosen": -632.7249145507812, "logps/rejected": -869.91650390625, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -10.000911712646484, "rewards/margins": 22.652973175048828, "rewards/rejected": -32.65388488769531, "step": 3077 }, { "epoch": 1.9147744945567653, "grad_norm": 15.868894577026367, "learning_rate": 2.0089903181189492e-06, "logits/chosen": 1.046784520149231, "logits/rejected": 3.364591598510742, "logps/chosen": -626.5619506835938, "logps/rejected": -1132.443115234375, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": -10.214308738708496, "rewards/margins": 35.30830383300781, "rewards/rejected": -45.522613525390625, "step": 3078 }, { "epoch": 1.9153965785381026, "grad_norm": 15.966095924377441, "learning_rate": 2.0078377132319045e-06, "logits/chosen": -0.5236443281173706, "logits/rejected": 2.8816840648651123, "logps/chosen": -622.166259765625, "logps/rejected": -1035.40380859375, "loss": 0.2252, "rewards/accuracies": 0.875, "rewards/chosen": -6.653708457946777, "rewards/margins": 31.47915267944336, "rewards/rejected": -38.13285827636719, "step": 3079 }, { "epoch": 1.91601866251944, "grad_norm": 5.1122162403771654e-05, "learning_rate": 2.0066851083448597e-06, "logits/chosen": -0.1377563625574112, "logits/rejected": 1.8764601945877075, "logps/chosen": -588.8012084960938, "logps/rejected": -952.9384765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.899794578552246, "rewards/margins": 25.850767135620117, "rewards/rejected": -38.75056076049805, "step": 3080 }, { "epoch": 1.9166407465007778, "grad_norm": 2.693913698196411, "learning_rate": 2.005532503457815e-06, "logits/chosen": -2.1489388942718506, "logits/rejected": 1.2460088729858398, "logps/chosen": -409.9424743652344, "logps/rejected": -908.951416015625, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -10.152788162231445, "rewards/margins": 23.960559844970703, "rewards/rejected": -34.113346099853516, "step": 3081 }, { "epoch": 1.917262830482115, "grad_norm": 0.2958039343357086, "learning_rate": 2.00437989857077e-06, "logits/chosen": -1.046022891998291, "logits/rejected": 3.3509743213653564, "logps/chosen": -481.7524719238281, "logps/rejected": -1104.011962890625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -8.942147254943848, "rewards/margins": 38.9442253112793, "rewards/rejected": -47.88637161254883, "step": 3082 }, { "epoch": 1.9178849144634524, "grad_norm": 9.022804988489952e-06, "learning_rate": 2.0032272936837254e-06, "logits/chosen": 0.3423982858657837, "logits/rejected": 3.9999098777770996, "logps/chosen": -417.267822265625, "logps/rejected": -853.1267700195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.099996566772461, "rewards/margins": 27.42827796936035, "rewards/rejected": -35.52827453613281, "step": 3083 }, { "epoch": 1.91850699844479, "grad_norm": 8.890567779541016, "learning_rate": 2.0020746887966806e-06, "logits/chosen": 3.4113690853118896, "logits/rejected": 4.016191482543945, "logps/chosen": -667.6809692382812, "logps/rejected": -883.475830078125, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": -11.454672813415527, "rewards/margins": 20.978595733642578, "rewards/rejected": -32.43326950073242, "step": 3084 }, { "epoch": 1.9191290824261276, "grad_norm": 2.1542064132518135e-07, "learning_rate": 2.0009220839096362e-06, "logits/chosen": 0.25448083877563477, "logits/rejected": 4.301416397094727, "logps/chosen": -445.3734436035156, "logps/rejected": -1053.107666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.527276992797852, "rewards/margins": 33.589111328125, "rewards/rejected": -42.11638641357422, "step": 3085 }, { "epoch": 1.919751166407465, "grad_norm": 0.019370652735233307, "learning_rate": 1.9997694790225915e-06, "logits/chosen": -0.40544599294662476, "logits/rejected": 3.474851131439209, "logps/chosen": -469.44781494140625, "logps/rejected": -912.0271606445312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.017045974731445, "rewards/margins": 25.133575439453125, "rewards/rejected": -34.15061950683594, "step": 3086 }, { "epoch": 1.9203732503888025, "grad_norm": 2.7604433853412047e-05, "learning_rate": 1.9986168741355467e-06, "logits/chosen": -2.633376121520996, "logits/rejected": 2.72335147857666, "logps/chosen": -397.2959289550781, "logps/rejected": -1001.3154907226562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.045672416687012, "rewards/margins": 33.706565856933594, "rewards/rejected": -41.75223922729492, "step": 3087 }, { "epoch": 1.92099533437014, "grad_norm": 0.5521590709686279, "learning_rate": 1.997464269248502e-06, "logits/chosen": -2.533043384552002, "logits/rejected": 2.634922981262207, "logps/chosen": -496.95086669921875, "logps/rejected": -1041.037353515625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -6.341783046722412, "rewards/margins": 29.893619537353516, "rewards/rejected": -36.23540496826172, "step": 3088 }, { "epoch": 1.9216174183514774, "grad_norm": 33.23648452758789, "learning_rate": 1.996311664361457e-06, "logits/chosen": 0.3715195059776306, "logits/rejected": 3.6420249938964844, "logps/chosen": -478.12640380859375, "logps/rejected": -843.5733032226562, "loss": 0.4117, "rewards/accuracies": 0.875, "rewards/chosen": -6.215129852294922, "rewards/margins": 21.32658576965332, "rewards/rejected": -27.541715621948242, "step": 3089 }, { "epoch": 1.922239502332815, "grad_norm": 40.58311462402344, "learning_rate": 1.9951590594744124e-06, "logits/chosen": 1.1373742818832397, "logits/rejected": 4.414701461791992, "logps/chosen": -538.0445556640625, "logps/rejected": -901.5621337890625, "loss": 0.9431, "rewards/accuracies": 0.875, "rewards/chosen": -10.4848051071167, "rewards/margins": 15.416234970092773, "rewards/rejected": -25.901039123535156, "step": 3090 }, { "epoch": 1.9228615863141525, "grad_norm": 0.08620551228523254, "learning_rate": 1.9940064545873676e-06, "logits/chosen": -0.5909771919250488, "logits/rejected": 2.6995272636413574, "logps/chosen": -428.00830078125, "logps/rejected": -923.3511962890625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.33791732788086, "rewards/margins": 27.793746948242188, "rewards/rejected": -36.13166046142578, "step": 3091 }, { "epoch": 1.9234836702954898, "grad_norm": 0.00960856769233942, "learning_rate": 1.9928538497003232e-06, "logits/chosen": 0.8783894181251526, "logits/rejected": 3.604226589202881, "logps/chosen": -500.4090270996094, "logps/rejected": -950.4783325195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.384622573852539, "rewards/margins": 30.068050384521484, "rewards/rejected": -34.452674865722656, "step": 3092 }, { "epoch": 1.9241057542768274, "grad_norm": 1.3824650049209595, "learning_rate": 1.991701244813278e-06, "logits/chosen": 0.8421534299850464, "logits/rejected": 3.5944361686706543, "logps/chosen": -551.550048828125, "logps/rejected": -1028.826416015625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -5.340576648712158, "rewards/margins": 25.5235595703125, "rewards/rejected": -30.864137649536133, "step": 3093 }, { "epoch": 1.924727838258165, "grad_norm": 1.3520052561943885e-05, "learning_rate": 1.9905486399262333e-06, "logits/chosen": 1.369136095046997, "logits/rejected": 3.906341075897217, "logps/chosen": -490.1999206542969, "logps/rejected": -1115.05517578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.051058769226074, "rewards/margins": 40.5749397277832, "rewards/rejected": -50.62599563598633, "step": 3094 }, { "epoch": 1.9253499222395023, "grad_norm": 2.0884041786193848, "learning_rate": 1.9893960350391885e-06, "logits/chosen": -0.2868638336658478, "logits/rejected": 3.4033193588256836, "logps/chosen": -459.6829528808594, "logps/rejected": -1025.7183837890625, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -6.993990421295166, "rewards/margins": 31.40522575378418, "rewards/rejected": -38.39921569824219, "step": 3095 }, { "epoch": 1.9259720062208399, "grad_norm": 0.04092787951231003, "learning_rate": 1.9882434301521437e-06, "logits/chosen": -0.23980551958084106, "logits/rejected": 3.9187815189361572, "logps/chosen": -501.53662109375, "logps/rejected": -1109.9281005859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.170280456542969, "rewards/margins": 35.897464752197266, "rewards/rejected": -45.06774139404297, "step": 3096 }, { "epoch": 1.9265940902021774, "grad_norm": 0.00010406249202787876, "learning_rate": 1.9870908252650994e-06, "logits/chosen": 3.265599250793457, "logits/rejected": 3.0254709720611572, "logps/chosen": -635.1368408203125, "logps/rejected": -902.3892822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.115925312042236, "rewards/margins": 24.14023780822754, "rewards/rejected": -29.256162643432617, "step": 3097 }, { "epoch": 1.9272161741835148, "grad_norm": 0.0016180593520402908, "learning_rate": 1.9859382203780546e-06, "logits/chosen": 0.5095869302749634, "logits/rejected": 2.9197235107421875, "logps/chosen": -520.6793212890625, "logps/rejected": -1036.481201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.886094093322754, "rewards/margins": 36.49535369873047, "rewards/rejected": -43.38145065307617, "step": 3098 }, { "epoch": 1.927838258164852, "grad_norm": 0.0008051490876823664, "learning_rate": 1.98478561549101e-06, "logits/chosen": -3.078697919845581, "logits/rejected": 3.5038082599639893, "logps/chosen": -302.9356689453125, "logps/rejected": -906.822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.008868217468262, "rewards/margins": 21.944965362548828, "rewards/rejected": -27.953832626342773, "step": 3099 }, { "epoch": 1.9284603421461899, "grad_norm": 1.0987540690621245e-06, "learning_rate": 1.983633010603965e-06, "logits/chosen": -1.677296757698059, "logits/rejected": 2.7449183464050293, "logps/chosen": -345.772705078125, "logps/rejected": -854.6087036132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.282731533050537, "rewards/margins": 30.456693649291992, "rewards/rejected": -36.73942565917969, "step": 3100 }, { "epoch": 1.9290824261275272, "grad_norm": 2.140411853790283, "learning_rate": 1.9824804057169203e-06, "logits/chosen": -0.08920153230428696, "logits/rejected": 3.9136552810668945, "logps/chosen": -519.4923706054688, "logps/rejected": -951.7150268554688, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -8.870290756225586, "rewards/margins": 23.316768646240234, "rewards/rejected": -32.18705749511719, "step": 3101 }, { "epoch": 1.9297045101088646, "grad_norm": 4.3668413162231445, "learning_rate": 1.9813278008298755e-06, "logits/chosen": 2.294924020767212, "logits/rejected": 3.8346686363220215, "logps/chosen": -517.449462890625, "logps/rejected": -826.955810546875, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": -13.303144454956055, "rewards/margins": 21.79949188232422, "rewards/rejected": -35.10263442993164, "step": 3102 }, { "epoch": 1.9303265940902021, "grad_norm": 35.314605712890625, "learning_rate": 1.9801751959428307e-06, "logits/chosen": 1.7527735233306885, "logits/rejected": 2.813300371170044, "logps/chosen": -544.0867919921875, "logps/rejected": -810.4888916015625, "loss": 0.6331, "rewards/accuracies": 0.75, "rewards/chosen": -5.789369583129883, "rewards/margins": 20.098064422607422, "rewards/rejected": -25.887434005737305, "step": 3103 }, { "epoch": 1.9309486780715397, "grad_norm": 0.37775611877441406, "learning_rate": 1.9790225910557864e-06, "logits/chosen": 2.292999505996704, "logits/rejected": 4.234982490539551, "logps/chosen": -575.2119140625, "logps/rejected": -962.66455078125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -10.68335247039795, "rewards/margins": 23.792911529541016, "rewards/rejected": -34.47626876831055, "step": 3104 }, { "epoch": 1.931570762052877, "grad_norm": 38.52018356323242, "learning_rate": 1.9778699861687416e-06, "logits/chosen": -0.1604076474905014, "logits/rejected": 0.9690057039260864, "logps/chosen": -589.8624267578125, "logps/rejected": -806.6043701171875, "loss": 0.6047, "rewards/accuracies": 0.875, "rewards/chosen": -6.873226165771484, "rewards/margins": 14.048189163208008, "rewards/rejected": -20.921417236328125, "step": 3105 }, { "epoch": 1.9321928460342146, "grad_norm": 0.0012302246177569032, "learning_rate": 1.976717381281697e-06, "logits/chosen": -2.3391997814178467, "logits/rejected": 3.5974345207214355, "logps/chosen": -271.0357360839844, "logps/rejected": -858.4124755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.7468342781066895, "rewards/margins": 24.664321899414062, "rewards/rejected": -30.41115951538086, "step": 3106 }, { "epoch": 1.9328149300155522, "grad_norm": 0.13548584282398224, "learning_rate": 1.975564776394652e-06, "logits/chosen": -1.131691575050354, "logits/rejected": 2.9434633255004883, "logps/chosen": -432.55963134765625, "logps/rejected": -941.6983032226562, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.376041412353516, "rewards/margins": 30.401823043823242, "rewards/rejected": -38.777862548828125, "step": 3107 }, { "epoch": 1.9334370139968895, "grad_norm": 0.0010253810323774815, "learning_rate": 1.9744121715076073e-06, "logits/chosen": 1.5464913845062256, "logits/rejected": 1.995990514755249, "logps/chosen": -640.5809936523438, "logps/rejected": -906.3209228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.729105472564697, "rewards/margins": 24.215728759765625, "rewards/rejected": -31.944835662841797, "step": 3108 }, { "epoch": 1.934059097978227, "grad_norm": 1.0012019872665405, "learning_rate": 1.9732595666205625e-06, "logits/chosen": 0.8782744407653809, "logits/rejected": 3.3917250633239746, "logps/chosen": -675.33544921875, "logps/rejected": -1131.438720703125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -13.280045509338379, "rewards/margins": 30.720184326171875, "rewards/rejected": -44.00022888183594, "step": 3109 }, { "epoch": 1.9346811819595646, "grad_norm": 0.0008394105243496597, "learning_rate": 1.9721069617335177e-06, "logits/chosen": 1.2346922159194946, "logits/rejected": 1.7204153537750244, "logps/chosen": -612.9161376953125, "logps/rejected": -899.165283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.591879844665527, "rewards/margins": 25.975309371948242, "rewards/rejected": -36.56719207763672, "step": 3110 }, { "epoch": 1.935303265940902, "grad_norm": 0.01554886344820261, "learning_rate": 1.970954356846473e-06, "logits/chosen": -1.2211658954620361, "logits/rejected": 3.6318647861480713, "logps/chosen": -531.0762329101562, "logps/rejected": -1141.5849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.235472202301025, "rewards/margins": 30.65101432800293, "rewards/rejected": -35.8864860534668, "step": 3111 }, { "epoch": 1.9359253499222395, "grad_norm": 4.545105934143066, "learning_rate": 1.9698017519594286e-06, "logits/chosen": -2.037519693374634, "logits/rejected": 2.981616497039795, "logps/chosen": -454.7665710449219, "logps/rejected": -1027.3382568359375, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -7.1797332763671875, "rewards/margins": 27.61724281311035, "rewards/rejected": -34.79697799682617, "step": 3112 }, { "epoch": 1.936547433903577, "grad_norm": 22.48046112060547, "learning_rate": 1.968649147072384e-06, "logits/chosen": -3.1317570209503174, "logits/rejected": 2.2430827617645264, "logps/chosen": -233.67686462402344, "logps/rejected": -833.810546875, "loss": 0.3444, "rewards/accuracies": 0.875, "rewards/chosen": -5.681829452514648, "rewards/margins": 26.82794189453125, "rewards/rejected": -32.509769439697266, "step": 3113 }, { "epoch": 1.9371695178849144, "grad_norm": 1.1949235158681404e-05, "learning_rate": 1.967496542185339e-06, "logits/chosen": 2.1707959175109863, "logits/rejected": 2.928690195083618, "logps/chosen": -630.8195190429688, "logps/rejected": -919.963623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.479073524475098, "rewards/margins": 28.746137619018555, "rewards/rejected": -35.22521209716797, "step": 3114 }, { "epoch": 1.937791601866252, "grad_norm": 0.00946116354316473, "learning_rate": 1.9663439372982943e-06, "logits/chosen": -0.10625946521759033, "logits/rejected": 3.2997217178344727, "logps/chosen": -392.2223815917969, "logps/rejected": -870.4689331054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.0400776863098145, "rewards/margins": 26.645164489746094, "rewards/rejected": -31.68524169921875, "step": 3115 }, { "epoch": 1.9384136858475896, "grad_norm": 0.0013832555850967765, "learning_rate": 1.9651913324112495e-06, "logits/chosen": 2.235701084136963, "logits/rejected": 5.511145114898682, "logps/chosen": -671.1697998046875, "logps/rejected": -1116.8763427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.155223846435547, "rewards/margins": 26.336801528930664, "rewards/rejected": -36.492027282714844, "step": 3116 }, { "epoch": 1.939035769828927, "grad_norm": 0.0005869021988473833, "learning_rate": 1.9640387275242047e-06, "logits/chosen": -2.2219884395599365, "logits/rejected": 2.0224733352661133, "logps/chosen": -325.9281921386719, "logps/rejected": -870.1353759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.242877006530762, "rewards/margins": 30.151485443115234, "rewards/rejected": -34.39436340332031, "step": 3117 }, { "epoch": 1.9396578538102642, "grad_norm": 30.43111801147461, "learning_rate": 1.96288612263716e-06, "logits/chosen": 0.6214398145675659, "logits/rejected": 3.964189291000366, "logps/chosen": -570.104736328125, "logps/rejected": -1154.1322021484375, "loss": 0.1592, "rewards/accuracies": 0.875, "rewards/chosen": -11.116994857788086, "rewards/margins": 30.15409278869629, "rewards/rejected": -41.27109146118164, "step": 3118 }, { "epoch": 1.940279937791602, "grad_norm": 27.62190055847168, "learning_rate": 1.9617335177501156e-06, "logits/chosen": 2.633063793182373, "logits/rejected": 5.510431289672852, "logps/chosen": -557.0858154296875, "logps/rejected": -1135.85205078125, "loss": 0.2993, "rewards/accuracies": 0.875, "rewards/chosen": -6.610867977142334, "rewards/margins": 38.47575378417969, "rewards/rejected": -45.08662033081055, "step": 3119 }, { "epoch": 1.9409020217729394, "grad_norm": 28.94077491760254, "learning_rate": 1.960580912863071e-06, "logits/chosen": -1.3939131498336792, "logits/rejected": 3.7747256755828857, "logps/chosen": -485.427001953125, "logps/rejected": -1010.84375, "loss": 0.2913, "rewards/accuracies": 0.875, "rewards/chosen": -5.9063720703125, "rewards/margins": 21.018043518066406, "rewards/rejected": -26.924415588378906, "step": 3120 }, { "epoch": 1.9415241057542767, "grad_norm": 40.993804931640625, "learning_rate": 1.959428307976026e-06, "logits/chosen": 2.295240640640259, "logits/rejected": 4.34235143661499, "logps/chosen": -642.120361328125, "logps/rejected": -1075.3221435546875, "loss": 0.3546, "rewards/accuracies": 0.875, "rewards/chosen": -8.789778709411621, "rewards/margins": 22.575544357299805, "rewards/rejected": -31.36532211303711, "step": 3121 }, { "epoch": 1.9421461897356143, "grad_norm": 0.021397702395915985, "learning_rate": 1.9582757030889812e-06, "logits/chosen": -3.1025195121765137, "logits/rejected": 3.167198419570923, "logps/chosen": -357.3218994140625, "logps/rejected": -972.1842041015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.450490951538086, "rewards/margins": 24.834686279296875, "rewards/rejected": -31.285175323486328, "step": 3122 }, { "epoch": 1.9427682737169518, "grad_norm": 0.4374660849571228, "learning_rate": 1.9571230982019365e-06, "logits/chosen": 1.4612170457839966, "logits/rejected": 4.9650397300720215, "logps/chosen": -562.9573364257812, "logps/rejected": -1090.174560546875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -7.2041778564453125, "rewards/margins": 26.764659881591797, "rewards/rejected": -33.96883773803711, "step": 3123 }, { "epoch": 1.9433903576982892, "grad_norm": 0.0014534497167915106, "learning_rate": 1.9559704933148917e-06, "logits/chosen": -1.5175460577011108, "logits/rejected": 3.670214891433716, "logps/chosen": -357.6202392578125, "logps/rejected": -976.2584838867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.8302741050720215, "rewards/margins": 30.991336822509766, "rewards/rejected": -35.82160949707031, "step": 3124 }, { "epoch": 1.9440124416796267, "grad_norm": 40.93093490600586, "learning_rate": 1.954817888427847e-06, "logits/chosen": 0.8629130721092224, "logits/rejected": 3.1365485191345215, "logps/chosen": -601.42626953125, "logps/rejected": -1025.9072265625, "loss": 0.9715, "rewards/accuracies": 0.75, "rewards/chosen": -8.362449645996094, "rewards/margins": 26.06359100341797, "rewards/rejected": -34.42604064941406, "step": 3125 }, { "epoch": 1.9446345256609643, "grad_norm": 12.326611518859863, "learning_rate": 1.9536652835408026e-06, "logits/chosen": 2.843836784362793, "logits/rejected": 4.174145698547363, "logps/chosen": -592.7425537109375, "logps/rejected": -801.8739013671875, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": -9.317679405212402, "rewards/margins": 20.747962951660156, "rewards/rejected": -30.065643310546875, "step": 3126 }, { "epoch": 1.9452566096423016, "grad_norm": 0.009106865152716637, "learning_rate": 1.952512678653758e-06, "logits/chosen": 0.664067804813385, "logits/rejected": 2.8401622772216797, "logps/chosen": -483.0638732910156, "logps/rejected": -1060.47265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.1976847648620605, "rewards/margins": 36.17652893066406, "rewards/rejected": -43.37421417236328, "step": 3127 }, { "epoch": 1.9458786936236392, "grad_norm": 0.00035235649556852877, "learning_rate": 1.951360073766713e-06, "logits/chosen": 1.572568416595459, "logits/rejected": 3.067289352416992, "logps/chosen": -589.0814208984375, "logps/rejected": -878.741943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.191118240356445, "rewards/margins": 20.378524780273438, "rewards/rejected": -34.56964111328125, "step": 3128 }, { "epoch": 1.9465007776049768, "grad_norm": 2.083462823065929e-05, "learning_rate": 1.9502074688796682e-06, "logits/chosen": -1.6689331531524658, "logits/rejected": 3.7483890056610107, "logps/chosen": -418.681640625, "logps/rejected": -1105.7725830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.608135223388672, "rewards/margins": 34.870338439941406, "rewards/rejected": -41.47846984863281, "step": 3129 }, { "epoch": 1.947122861586314, "grad_norm": 1.515984296798706, "learning_rate": 1.9490548639926235e-06, "logits/chosen": -1.502424716949463, "logits/rejected": 3.913059711456299, "logps/chosen": -441.2313232421875, "logps/rejected": -1136.94677734375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -8.84985065460205, "rewards/margins": 34.735313415527344, "rewards/rejected": -43.585166931152344, "step": 3130 }, { "epoch": 1.9477449455676517, "grad_norm": 0.0010944779496639967, "learning_rate": 1.9479022591055787e-06, "logits/chosen": -1.6574513912200928, "logits/rejected": 2.5761234760284424, "logps/chosen": -434.24237060546875, "logps/rejected": -995.3330688476562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.173585891723633, "rewards/margins": 29.175065994262695, "rewards/rejected": -37.34865188598633, "step": 3131 }, { "epoch": 1.9483670295489892, "grad_norm": 0.08957312256097794, "learning_rate": 1.946749654218534e-06, "logits/chosen": 2.9277377128601074, "logits/rejected": 3.3973681926727295, "logps/chosen": -821.0192260742188, "logps/rejected": -1138.6099853515625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -11.827193260192871, "rewards/margins": 28.56563949584961, "rewards/rejected": -40.3928337097168, "step": 3132 }, { "epoch": 1.9489891135303266, "grad_norm": 5.443854433906381e-07, "learning_rate": 1.945597049331489e-06, "logits/chosen": 1.8225369453430176, "logits/rejected": 5.566961765289307, "logps/chosen": -481.93804931640625, "logps/rejected": -1042.929443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.911165714263916, "rewards/margins": 30.441822052001953, "rewards/rejected": -36.35298538208008, "step": 3133 }, { "epoch": 1.9496111975116641, "grad_norm": 3.7663533021259354e-06, "learning_rate": 1.944444444444445e-06, "logits/chosen": 0.8368363380432129, "logits/rejected": 3.7206904888153076, "logps/chosen": -463.287109375, "logps/rejected": -858.33349609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.503768444061279, "rewards/margins": 25.793119430541992, "rewards/rejected": -33.29689025878906, "step": 3134 }, { "epoch": 1.9502332814930017, "grad_norm": 1.1044361514223056e-07, "learning_rate": 1.9432918395574e-06, "logits/chosen": -2.337228298187256, "logits/rejected": 3.89192795753479, "logps/chosen": -342.0597229003906, "logps/rejected": -1077.766845703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.909173965454102, "rewards/margins": 34.26679229736328, "rewards/rejected": -42.175968170166016, "step": 3135 }, { "epoch": 1.950855365474339, "grad_norm": 25.050539016723633, "learning_rate": 1.9421392346703552e-06, "logits/chosen": 2.8053271770477295, "logits/rejected": 2.157046318054199, "logps/chosen": -723.6729736328125, "logps/rejected": -894.4259033203125, "loss": 0.2076, "rewards/accuracies": 0.875, "rewards/chosen": -11.69310474395752, "rewards/margins": 18.876235961914062, "rewards/rejected": -30.569339752197266, "step": 3136 }, { "epoch": 1.9514774494556764, "grad_norm": 0.028986474499106407, "learning_rate": 1.9409866297833105e-06, "logits/chosen": -3.2278780937194824, "logits/rejected": 0.645659327507019, "logps/chosen": -316.598876953125, "logps/rejected": -778.5980224609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.194002151489258, "rewards/margins": 28.001659393310547, "rewards/rejected": -32.19565963745117, "step": 3137 }, { "epoch": 1.9520995334370141, "grad_norm": 0.00019788251665886492, "learning_rate": 1.9398340248962657e-06, "logits/chosen": 1.0286705493927002, "logits/rejected": 2.940277338027954, "logps/chosen": -495.791748046875, "logps/rejected": -904.9774169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.718324661254883, "rewards/margins": 27.192028045654297, "rewards/rejected": -39.91035461425781, "step": 3138 }, { "epoch": 1.9527216174183515, "grad_norm": 4.4513002649182454e-05, "learning_rate": 1.938681420009221e-06, "logits/chosen": -0.6334162950515747, "logits/rejected": 2.7107186317443848, "logps/chosen": -450.05792236328125, "logps/rejected": -958.1580810546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.791568279266357, "rewards/margins": 30.65761947631836, "rewards/rejected": -38.449188232421875, "step": 3139 }, { "epoch": 1.9533437013996888, "grad_norm": 0.0005697354790754616, "learning_rate": 1.937528815122176e-06, "logits/chosen": -1.5794157981872559, "logits/rejected": 3.3874337673187256, "logps/chosen": -283.36279296875, "logps/rejected": -919.2586669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.818148136138916, "rewards/margins": 35.102237701416016, "rewards/rejected": -38.92038345336914, "step": 3140 }, { "epoch": 1.9539657853810264, "grad_norm": 3.034701347351074, "learning_rate": 1.936376210235132e-06, "logits/chosen": 0.9317562580108643, "logits/rejected": 2.288961410522461, "logps/chosen": -659.7003173828125, "logps/rejected": -990.6065063476562, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -7.969992160797119, "rewards/margins": 22.054443359375, "rewards/rejected": -30.024436950683594, "step": 3141 }, { "epoch": 1.954587869362364, "grad_norm": 0.00017606717301532626, "learning_rate": 1.935223605348087e-06, "logits/chosen": 2.3919968605041504, "logits/rejected": 3.5712153911590576, "logps/chosen": -615.6132202148438, "logps/rejected": -1001.5966186523438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.834830284118652, "rewards/margins": 30.781444549560547, "rewards/rejected": -41.61627197265625, "step": 3142 }, { "epoch": 1.9552099533437013, "grad_norm": 1.5589220083711552e-06, "learning_rate": 1.9340710004610422e-06, "logits/chosen": -0.3756348490715027, "logits/rejected": 3.8530774116516113, "logps/chosen": -409.1357116699219, "logps/rejected": -897.506591796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.981670379638672, "rewards/margins": 29.417194366455078, "rewards/rejected": -38.39886474609375, "step": 3143 }, { "epoch": 1.9558320373250389, "grad_norm": 33.71767044067383, "learning_rate": 1.9329183955739975e-06, "logits/chosen": -3.283298969268799, "logits/rejected": -0.33032527565956116, "logps/chosen": -383.6517333984375, "logps/rejected": -775.3797607421875, "loss": 0.8141, "rewards/accuracies": 0.875, "rewards/chosen": -6.489877700805664, "rewards/margins": 20.71088409423828, "rewards/rejected": -27.200763702392578, "step": 3144 }, { "epoch": 1.9564541213063764, "grad_norm": 6.639469862790293e-09, "learning_rate": 1.9317657906869527e-06, "logits/chosen": -0.8790050745010376, "logits/rejected": 1.0028407573699951, "logps/chosen": -523.29443359375, "logps/rejected": -927.624755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.112761497497559, "rewards/margins": 33.71839141845703, "rewards/rejected": -38.831153869628906, "step": 3145 }, { "epoch": 1.9570762052877138, "grad_norm": 0.0003128540702164173, "learning_rate": 1.930613185799908e-06, "logits/chosen": -1.3517966270446777, "logits/rejected": 1.9820481538772583, "logps/chosen": -559.878173828125, "logps/rejected": -969.4806518554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.006365776062012, "rewards/margins": 28.887584686279297, "rewards/rejected": -40.893951416015625, "step": 3146 }, { "epoch": 1.9576982892690513, "grad_norm": 9.801250416785479e-05, "learning_rate": 1.929460580912863e-06, "logits/chosen": 0.4533102512359619, "logits/rejected": 2.669489860534668, "logps/chosen": -600.5856323242188, "logps/rejected": -1008.6813354492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.19776439666748, "rewards/margins": 30.614797592163086, "rewards/rejected": -39.812564849853516, "step": 3147 }, { "epoch": 1.9583203732503889, "grad_norm": 0.0008385455585084856, "learning_rate": 1.9283079760258188e-06, "logits/chosen": -0.3274199366569519, "logits/rejected": 1.4190573692321777, "logps/chosen": -520.9417724609375, "logps/rejected": -1010.8619995117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.59363079071045, "rewards/margins": 29.788921356201172, "rewards/rejected": -40.38255310058594, "step": 3148 }, { "epoch": 1.9589424572317262, "grad_norm": 0.3622640371322632, "learning_rate": 1.927155371138774e-06, "logits/chosen": -0.10235399007797241, "logits/rejected": 3.619846820831299, "logps/chosen": -502.5452880859375, "logps/rejected": -1006.097412109375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -6.385444164276123, "rewards/margins": 32.80233383178711, "rewards/rejected": -39.18777847290039, "step": 3149 }, { "epoch": 1.9595645412130638, "grad_norm": 2.1730106709583197e-06, "learning_rate": 1.9260027662517292e-06, "logits/chosen": -0.025127731263637543, "logits/rejected": 2.28237247467041, "logps/chosen": -590.5953369140625, "logps/rejected": -1034.8394775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.482217788696289, "rewards/margins": 31.997419357299805, "rewards/rejected": -40.479637145996094, "step": 3150 }, { "epoch": 1.9601866251944013, "grad_norm": 8.769025802612305, "learning_rate": 1.9248501613646845e-06, "logits/chosen": 0.180405855178833, "logits/rejected": 1.3878958225250244, "logps/chosen": -582.6483154296875, "logps/rejected": -890.475341796875, "loss": 0.0525, "rewards/accuracies": 1.0, "rewards/chosen": -13.298641204833984, "rewards/margins": 20.171621322631836, "rewards/rejected": -33.47026443481445, "step": 3151 }, { "epoch": 1.9608087091757387, "grad_norm": 0.0005307064857333899, "learning_rate": 1.9236975564776397e-06, "logits/chosen": 0.7239381670951843, "logits/rejected": 3.4129738807678223, "logps/chosen": -372.01153564453125, "logps/rejected": -790.05712890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.203645706176758, "rewards/margins": 25.585803985595703, "rewards/rejected": -32.789451599121094, "step": 3152 }, { "epoch": 1.9614307931570762, "grad_norm": 8.47744083404541, "learning_rate": 1.922544951590595e-06, "logits/chosen": -0.6052994132041931, "logits/rejected": 1.3863511085510254, "logps/chosen": -600.923095703125, "logps/rejected": -1011.9559936523438, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": -9.772683143615723, "rewards/margins": 23.889484405517578, "rewards/rejected": -33.66217041015625, "step": 3153 }, { "epoch": 1.9620528771384138, "grad_norm": 2.698514967036658e-09, "learning_rate": 1.92139234670355e-06, "logits/chosen": -3.1321492195129395, "logits/rejected": 3.270890951156616, "logps/chosen": -285.3479309082031, "logps/rejected": -935.8294677734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.258288860321045, "rewards/margins": 38.576446533203125, "rewards/rejected": -43.83473587036133, "step": 3154 }, { "epoch": 1.9626749611197511, "grad_norm": 1.4546620832334156e-06, "learning_rate": 1.9202397418165058e-06, "logits/chosen": -1.279468059539795, "logits/rejected": 2.263674736022949, "logps/chosen": -536.9735717773438, "logps/rejected": -1118.637939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.917298316955566, "rewards/margins": 36.86639404296875, "rewards/rejected": -47.78369140625, "step": 3155 }, { "epoch": 1.9632970451010885, "grad_norm": 0.13870863616466522, "learning_rate": 1.919087136929461e-06, "logits/chosen": 2.533172130584717, "logits/rejected": 3.0230398178100586, "logps/chosen": -699.2755737304688, "logps/rejected": -861.6014404296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -10.90634536743164, "rewards/margins": 22.009307861328125, "rewards/rejected": -32.915653228759766, "step": 3156 }, { "epoch": 1.9639191290824263, "grad_norm": 0.0025135388132184744, "learning_rate": 1.9179345320424162e-06, "logits/chosen": -0.8763086199760437, "logits/rejected": 3.799420118331909, "logps/chosen": -385.1059265136719, "logps/rejected": -963.9544677734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.895961761474609, "rewards/margins": 32.50831985473633, "rewards/rejected": -40.40427780151367, "step": 3157 }, { "epoch": 1.9645412130637636, "grad_norm": 1.0180851859331597e-05, "learning_rate": 1.9167819271553715e-06, "logits/chosen": 2.6572232246398926, "logits/rejected": 4.152245998382568, "logps/chosen": -740.9976806640625, "logps/rejected": -1174.274169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.86086654663086, "rewards/margins": 31.476028442382812, "rewards/rejected": -44.336891174316406, "step": 3158 }, { "epoch": 1.965163297045101, "grad_norm": 0.10397597402334213, "learning_rate": 1.9156293222683267e-06, "logits/chosen": 2.5373735427856445, "logits/rejected": 2.5043015480041504, "logps/chosen": -691.72265625, "logps/rejected": -875.3533325195312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -9.654552459716797, "rewards/margins": 25.71814727783203, "rewards/rejected": -35.37269973754883, "step": 3159 }, { "epoch": 1.9657853810264385, "grad_norm": 1.2996370060136542e-05, "learning_rate": 1.914476717381282e-06, "logits/chosen": -0.1440131962299347, "logits/rejected": 3.557115316390991, "logps/chosen": -502.76531982421875, "logps/rejected": -1063.85302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.445473670959473, "rewards/margins": 33.03824234008789, "rewards/rejected": -44.48371505737305, "step": 3160 }, { "epoch": 1.966407465007776, "grad_norm": 0.0939781591296196, "learning_rate": 1.913324112494237e-06, "logits/chosen": 1.9109691381454468, "logits/rejected": 4.250695705413818, "logps/chosen": -513.4877319335938, "logps/rejected": -964.0062866210938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -10.212984085083008, "rewards/margins": 28.262313842773438, "rewards/rejected": -38.47529602050781, "step": 3161 }, { "epoch": 1.9670295489891134, "grad_norm": 16.613849639892578, "learning_rate": 1.9121715076071924e-06, "logits/chosen": -2.215022563934326, "logits/rejected": 3.2000844478607178, "logps/chosen": -508.90728759765625, "logps/rejected": -1212.5, "loss": 0.0634, "rewards/accuracies": 1.0, "rewards/chosen": -12.177558898925781, "rewards/margins": 36.37632751464844, "rewards/rejected": -48.55388641357422, "step": 3162 }, { "epoch": 1.967651632970451, "grad_norm": 29.307523727416992, "learning_rate": 1.9110189027201476e-06, "logits/chosen": 0.535607099533081, "logits/rejected": 1.110487699508667, "logps/chosen": -581.4873657226562, "logps/rejected": -877.1924438476562, "loss": 0.1411, "rewards/accuracies": 0.875, "rewards/chosen": -10.076355934143066, "rewards/margins": 24.285526275634766, "rewards/rejected": -34.361881256103516, "step": 3163 }, { "epoch": 1.9682737169517885, "grad_norm": 12.159449577331543, "learning_rate": 1.909866297833103e-06, "logits/chosen": 2.109978199005127, "logits/rejected": 2.998699903488159, "logps/chosen": -659.17919921875, "logps/rejected": -887.56396484375, "loss": 0.1495, "rewards/accuracies": 0.875, "rewards/chosen": -9.49515151977539, "rewards/margins": 19.975492477416992, "rewards/rejected": -29.470645904541016, "step": 3164 }, { "epoch": 1.9688958009331259, "grad_norm": 3.3401072869310156e-05, "learning_rate": 1.908713692946058e-06, "logits/chosen": -0.23816433548927307, "logits/rejected": 4.679553985595703, "logps/chosen": -519.406982421875, "logps/rejected": -1159.0615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.095431327819824, "rewards/margins": 38.48530578613281, "rewards/rejected": -49.58073425292969, "step": 3165 }, { "epoch": 1.9695178849144634, "grad_norm": 3.759016564686135e-08, "learning_rate": 1.9075610880590133e-06, "logits/chosen": 0.09053629636764526, "logits/rejected": 3.3123433589935303, "logps/chosen": -542.893310546875, "logps/rejected": -1076.65576171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.824052810668945, "rewards/margins": 34.43450927734375, "rewards/rejected": -46.25856018066406, "step": 3166 }, { "epoch": 1.970139968895801, "grad_norm": 9.492687968304381e-05, "learning_rate": 1.9064084831719687e-06, "logits/chosen": -2.267563819885254, "logits/rejected": 1.2888798713684082, "logps/chosen": -436.4515380859375, "logps/rejected": -974.1395263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.326604843139648, "rewards/margins": 34.898399353027344, "rewards/rejected": -41.225006103515625, "step": 3167 }, { "epoch": 1.9707620528771383, "grad_norm": 0.00013881105405744165, "learning_rate": 1.905255878284924e-06, "logits/chosen": -1.0494887828826904, "logits/rejected": 1.077904224395752, "logps/chosen": -445.4752197265625, "logps/rejected": -931.4921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.989595413208008, "rewards/margins": 30.04443359375, "rewards/rejected": -39.034027099609375, "step": 3168 }, { "epoch": 1.971384136858476, "grad_norm": 0.00014189987268764526, "learning_rate": 1.9041032733978794e-06, "logits/chosen": 0.719484806060791, "logits/rejected": 1.8682856559753418, "logps/chosen": -647.4837036132812, "logps/rejected": -1057.12109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.586885452270508, "rewards/margins": 31.471981048583984, "rewards/rejected": -47.058868408203125, "step": 3169 }, { "epoch": 1.9720062208398135, "grad_norm": 11.673099517822266, "learning_rate": 1.9029506685108346e-06, "logits/chosen": -1.9770435094833374, "logits/rejected": 2.3703324794769287, "logps/chosen": -462.7710876464844, "logps/rejected": -1014.6849975585938, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": -8.472570419311523, "rewards/margins": 35.858558654785156, "rewards/rejected": -44.33112716674805, "step": 3170 }, { "epoch": 1.9726283048211508, "grad_norm": 3.072976184625986e-08, "learning_rate": 1.9017980636237898e-06, "logits/chosen": 1.2289338111877441, "logits/rejected": 3.1060218811035156, "logps/chosen": -621.6762084960938, "logps/rejected": -1122.5240478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.191411018371582, "rewards/margins": 35.472572326660156, "rewards/rejected": -45.663978576660156, "step": 3171 }, { "epoch": 1.9732503888024884, "grad_norm": 8.41129894979531e-06, "learning_rate": 1.900645458736745e-06, "logits/chosen": -0.29541754722595215, "logits/rejected": 2.464428424835205, "logps/chosen": -483.1417541503906, "logps/rejected": -993.467529296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.057136058807373, "rewards/margins": 33.919532775878906, "rewards/rejected": -39.97666931152344, "step": 3172 }, { "epoch": 1.973872472783826, "grad_norm": 0.007365781348198652, "learning_rate": 1.8994928538497005e-06, "logits/chosen": -0.018617630004882812, "logits/rejected": 3.561922550201416, "logps/chosen": -358.08642578125, "logps/rejected": -819.01806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.528288841247559, "rewards/margins": 30.811264038085938, "rewards/rejected": -36.33955383300781, "step": 3173 }, { "epoch": 1.9744945567651633, "grad_norm": 0.005658108275383711, "learning_rate": 1.8983402489626557e-06, "logits/chosen": -1.1695606708526611, "logits/rejected": 2.8908729553222656, "logps/chosen": -511.5259094238281, "logps/rejected": -1040.5850830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.388712882995605, "rewards/margins": 33.64862823486328, "rewards/rejected": -42.03733825683594, "step": 3174 }, { "epoch": 1.9751166407465006, "grad_norm": 0.0003242559905629605, "learning_rate": 1.897187644075611e-06, "logits/chosen": 0.11923173069953918, "logits/rejected": 3.546078681945801, "logps/chosen": -502.07183837890625, "logps/rejected": -1198.700927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.976399898529053, "rewards/margins": 43.29308319091797, "rewards/rejected": -51.26948547363281, "step": 3175 }, { "epoch": 1.9757387247278384, "grad_norm": 62.495479583740234, "learning_rate": 1.8960350391885661e-06, "logits/chosen": 2.8545210361480713, "logits/rejected": 3.2460947036743164, "logps/chosen": -766.4078979492188, "logps/rejected": -1039.130859375, "loss": 0.9158, "rewards/accuracies": 0.875, "rewards/chosen": -19.115346908569336, "rewards/margins": 22.217802047729492, "rewards/rejected": -41.33314895629883, "step": 3176 }, { "epoch": 1.9763608087091757, "grad_norm": 8.45496015244862e-06, "learning_rate": 1.8948824343015216e-06, "logits/chosen": -3.7129292488098145, "logits/rejected": 1.566472053527832, "logps/chosen": -306.4184265136719, "logps/rejected": -932.2579345703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.600948333740234, "rewards/margins": 31.574434280395508, "rewards/rejected": -40.17538070678711, "step": 3177 }, { "epoch": 1.976982892690513, "grad_norm": 0.7843475937843323, "learning_rate": 1.8937298294144768e-06, "logits/chosen": 1.6225887537002563, "logits/rejected": 4.008382797241211, "logps/chosen": -745.7958984375, "logps/rejected": -1071.2774658203125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -14.640724182128906, "rewards/margins": 25.535783767700195, "rewards/rejected": -40.17650604248047, "step": 3178 }, { "epoch": 1.9776049766718506, "grad_norm": 2.848709357294865e-07, "learning_rate": 1.892577224527432e-06, "logits/chosen": -1.1793930530548096, "logits/rejected": 1.1641390323638916, "logps/chosen": -529.8934326171875, "logps/rejected": -1047.27001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.412664413452148, "rewards/margins": 36.38139343261719, "rewards/rejected": -47.794063568115234, "step": 3179 }, { "epoch": 1.9782270606531882, "grad_norm": 0.10748720169067383, "learning_rate": 1.8914246196403875e-06, "logits/chosen": 0.003985404968261719, "logits/rejected": 3.4207515716552734, "logps/chosen": -571.9930419921875, "logps/rejected": -1027.877197265625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.69100570678711, "rewards/margins": 28.41872787475586, "rewards/rejected": -37.10973358154297, "step": 3180 }, { "epoch": 1.9788491446345255, "grad_norm": 0.0021116528660058975, "learning_rate": 1.8902720147533427e-06, "logits/chosen": 0.023240089416503906, "logits/rejected": 4.275704383850098, "logps/chosen": -391.3278503417969, "logps/rejected": -927.5806884765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.604829788208008, "rewards/margins": 29.584720611572266, "rewards/rejected": -38.18954849243164, "step": 3181 }, { "epoch": 1.979471228615863, "grad_norm": 2.9487199348920967e-09, "learning_rate": 1.889119409866298e-06, "logits/chosen": -0.26054155826568604, "logits/rejected": 3.042192220687866, "logps/chosen": -555.6188354492188, "logps/rejected": -1068.11572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.144253730773926, "rewards/margins": 34.46765899658203, "rewards/rejected": -43.61191177368164, "step": 3182 }, { "epoch": 1.9800933125972007, "grad_norm": 0.008355970494449139, "learning_rate": 1.8879668049792531e-06, "logits/chosen": 2.2057981491088867, "logits/rejected": 2.4470415115356445, "logps/chosen": -720.2744750976562, "logps/rejected": -873.899658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.665620803833008, "rewards/margins": 22.65416717529297, "rewards/rejected": -34.31978988647461, "step": 3183 }, { "epoch": 1.980715396578538, "grad_norm": 2.728590488433838, "learning_rate": 1.8868142000922086e-06, "logits/chosen": -1.2559614181518555, "logits/rejected": 3.207569122314453, "logps/chosen": -475.935302734375, "logps/rejected": -1004.422607421875, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -7.180890083312988, "rewards/margins": 29.82106590270996, "rewards/rejected": -37.001953125, "step": 3184 }, { "epoch": 1.9813374805598756, "grad_norm": 1.7883297687149025e-06, "learning_rate": 1.8856615952051638e-06, "logits/chosen": -1.5023488998413086, "logits/rejected": 4.09722375869751, "logps/chosen": -365.15325927734375, "logps/rejected": -1063.2884521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.08966064453125, "rewards/margins": 36.919166564941406, "rewards/rejected": -49.00883102416992, "step": 3185 }, { "epoch": 1.9819595645412131, "grad_norm": 21.962324142456055, "learning_rate": 1.884508990318119e-06, "logits/chosen": -1.3031964302062988, "logits/rejected": 2.3771743774414062, "logps/chosen": -550.832275390625, "logps/rejected": -880.1903686523438, "loss": 0.1747, "rewards/accuracies": 0.875, "rewards/chosen": -11.590015411376953, "rewards/margins": 20.501266479492188, "rewards/rejected": -32.091285705566406, "step": 3186 }, { "epoch": 1.9825816485225505, "grad_norm": 12.981456756591797, "learning_rate": 1.8833563854310745e-06, "logits/chosen": 2.1112420558929443, "logits/rejected": 2.808957576751709, "logps/chosen": -717.0997314453125, "logps/rejected": -899.928955078125, "loss": 0.0672, "rewards/accuracies": 1.0, "rewards/chosen": -12.56185531616211, "rewards/margins": 15.117238998413086, "rewards/rejected": -27.679094314575195, "step": 3187 }, { "epoch": 1.983203732503888, "grad_norm": 0.0059814429841935635, "learning_rate": 1.8822037805440297e-06, "logits/chosen": -1.7570008039474487, "logits/rejected": 1.8395037651062012, "logps/chosen": -407.08367919921875, "logps/rejected": -1051.62255859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.58604621887207, "rewards/margins": 34.558738708496094, "rewards/rejected": -47.14478302001953, "step": 3188 }, { "epoch": 1.9838258164852256, "grad_norm": 0.15041697025299072, "learning_rate": 1.881051175656985e-06, "logits/chosen": 1.5227668285369873, "logits/rejected": 3.454390287399292, "logps/chosen": -638.9949951171875, "logps/rejected": -938.1869506835938, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -9.065284729003906, "rewards/margins": 27.189281463623047, "rewards/rejected": -36.25456619262695, "step": 3189 }, { "epoch": 1.984447900466563, "grad_norm": 0.41454270482063293, "learning_rate": 1.8798985707699401e-06, "logits/chosen": 0.650727391242981, "logits/rejected": 3.691972494125366, "logps/chosen": -396.9961853027344, "logps/rejected": -893.4281616210938, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -8.56352424621582, "rewards/margins": 25.493627548217773, "rewards/rejected": -34.057151794433594, "step": 3190 }, { "epoch": 1.9850699844479005, "grad_norm": 0.7126598358154297, "learning_rate": 1.8787459658828956e-06, "logits/chosen": 1.2246778011322021, "logits/rejected": 2.014221668243408, "logps/chosen": -533.6785888671875, "logps/rejected": -979.2806396484375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -11.698402404785156, "rewards/margins": 32.90081787109375, "rewards/rejected": -44.599220275878906, "step": 3191 }, { "epoch": 1.985692068429238, "grad_norm": 0.03736700490117073, "learning_rate": 1.8775933609958508e-06, "logits/chosen": -0.6881626844406128, "logits/rejected": 3.6436805725097656, "logps/chosen": -356.96368408203125, "logps/rejected": -787.3362426757812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.223313331604004, "rewards/margins": 28.010154724121094, "rewards/rejected": -36.23346710205078, "step": 3192 }, { "epoch": 1.9863141524105754, "grad_norm": 30.400585174560547, "learning_rate": 1.876440756108806e-06, "logits/chosen": 3.5226330757141113, "logits/rejected": 4.738844394683838, "logps/chosen": -752.00146484375, "logps/rejected": -1067.257080078125, "loss": 0.3187, "rewards/accuracies": 0.875, "rewards/chosen": -10.198256492614746, "rewards/margins": 26.144824981689453, "rewards/rejected": -36.34307861328125, "step": 3193 }, { "epoch": 1.9869362363919127, "grad_norm": 1.7005794048309326, "learning_rate": 1.8752881512217612e-06, "logits/chosen": -0.09759989380836487, "logits/rejected": 1.11489999294281, "logps/chosen": -441.220703125, "logps/rejected": -719.19287109375, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -9.71569538116455, "rewards/margins": 21.517513275146484, "rewards/rejected": -31.23320960998535, "step": 3194 }, { "epoch": 1.9875583203732505, "grad_norm": 0.014652427285909653, "learning_rate": 1.8741355463347167e-06, "logits/chosen": 0.6133676767349243, "logits/rejected": 2.9346764087677, "logps/chosen": -650.8162841796875, "logps/rejected": -1050.29736328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.254842758178711, "rewards/margins": 30.56330108642578, "rewards/rejected": -43.818145751953125, "step": 3195 }, { "epoch": 1.9881804043545879, "grad_norm": 0.00039215991273522377, "learning_rate": 1.872982941447672e-06, "logits/chosen": -1.609758973121643, "logits/rejected": 2.8189010620117188, "logps/chosen": -491.3603210449219, "logps/rejected": -1019.6839599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.190193176269531, "rewards/margins": 33.10917663574219, "rewards/rejected": -44.29936599731445, "step": 3196 }, { "epoch": 1.9888024883359252, "grad_norm": 54.50910568237305, "learning_rate": 1.8718303365606271e-06, "logits/chosen": 1.285512924194336, "logits/rejected": 2.98002290725708, "logps/chosen": -510.360595703125, "logps/rejected": -891.5095825195312, "loss": 0.8237, "rewards/accuracies": 0.875, "rewards/chosen": -11.826030731201172, "rewards/margins": 25.11355209350586, "rewards/rejected": -36.93958282470703, "step": 3197 }, { "epoch": 1.989424572317263, "grad_norm": 3.051857493119314e-05, "learning_rate": 1.8706777316735826e-06, "logits/chosen": -0.6766510009765625, "logits/rejected": 2.1143531799316406, "logps/chosen": -480.74951171875, "logps/rejected": -1022.847900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.219396591186523, "rewards/margins": 33.2050666809082, "rewards/rejected": -43.42446517944336, "step": 3198 }, { "epoch": 1.9900466562986003, "grad_norm": 0.0019154062028974295, "learning_rate": 1.8695251267865378e-06, "logits/chosen": -0.9002442955970764, "logits/rejected": 3.898540496826172, "logps/chosen": -527.63720703125, "logps/rejected": -1305.6282958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.581677436828613, "rewards/margins": 49.44828796386719, "rewards/rejected": -61.02996826171875, "step": 3199 }, { "epoch": 1.9906687402799377, "grad_norm": 0.3469063639640808, "learning_rate": 1.868372521899493e-06, "logits/chosen": -0.160893514752388, "logits/rejected": 2.917184829711914, "logps/chosen": -553.887939453125, "logps/rejected": -994.0776977539062, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -9.018745422363281, "rewards/margins": 31.039907455444336, "rewards/rejected": -40.05865478515625, "step": 3200 }, { "epoch": 1.9912908242612752, "grad_norm": 9.508553375781048e-06, "learning_rate": 1.8672199170124482e-06, "logits/chosen": 0.7386203408241272, "logits/rejected": 3.1600570678710938, "logps/chosen": -444.47576904296875, "logps/rejected": -991.1026611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.113765716552734, "rewards/margins": 33.536102294921875, "rewards/rejected": -42.64986801147461, "step": 3201 }, { "epoch": 1.9919129082426128, "grad_norm": 36.303627014160156, "learning_rate": 1.8660673121254037e-06, "logits/chosen": 1.1433557271957397, "logits/rejected": 3.8839259147644043, "logps/chosen": -570.5936889648438, "logps/rejected": -945.875732421875, "loss": 0.8348, "rewards/accuracies": 0.875, "rewards/chosen": -11.896246910095215, "rewards/margins": 20.941421508789062, "rewards/rejected": -32.837669372558594, "step": 3202 }, { "epoch": 1.9925349922239501, "grad_norm": 0.05175577849149704, "learning_rate": 1.864914707238359e-06, "logits/chosen": -0.03805255889892578, "logits/rejected": 1.907156229019165, "logps/chosen": -590.1661376953125, "logps/rejected": -947.9773559570312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -14.847996711730957, "rewards/margins": 21.76512336730957, "rewards/rejected": -36.613121032714844, "step": 3203 }, { "epoch": 1.9931570762052877, "grad_norm": 4.170333340880461e-05, "learning_rate": 1.8637621023513141e-06, "logits/chosen": -1.174929141998291, "logits/rejected": 1.828955054283142, "logps/chosen": -365.89617919921875, "logps/rejected": -905.7449340820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.122086524963379, "rewards/margins": 31.113018035888672, "rewards/rejected": -38.235107421875, "step": 3204 }, { "epoch": 1.9937791601866253, "grad_norm": 1.7264932539173827e-11, "learning_rate": 1.8626094974642693e-06, "logits/chosen": 2.5377910137176514, "logits/rejected": 4.4260687828063965, "logps/chosen": -697.2733154296875, "logps/rejected": -1260.3101806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.474903106689453, "rewards/margins": 38.30769729614258, "rewards/rejected": -49.78260040283203, "step": 3205 }, { "epoch": 1.9944012441679626, "grad_norm": 1.0426287651062012, "learning_rate": 1.8614568925772248e-06, "logits/chosen": -0.8568588495254517, "logits/rejected": 0.989220380783081, "logps/chosen": -438.51177978515625, "logps/rejected": -781.666259765625, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -10.024365425109863, "rewards/margins": 24.262752532958984, "rewards/rejected": -34.28711700439453, "step": 3206 }, { "epoch": 1.9950233281493002, "grad_norm": 17.793310165405273, "learning_rate": 1.86030428769018e-06, "logits/chosen": 0.980941891670227, "logits/rejected": 1.8689048290252686, "logps/chosen": -675.7926025390625, "logps/rejected": -916.1185913085938, "loss": 0.0549, "rewards/accuracies": 1.0, "rewards/chosen": -17.662540435791016, "rewards/margins": 18.985729217529297, "rewards/rejected": -36.64826965332031, "step": 3207 }, { "epoch": 1.9956454121306377, "grad_norm": 0.10001546144485474, "learning_rate": 1.8591516828031352e-06, "logits/chosen": 1.4320085048675537, "logits/rejected": 3.508606433868408, "logps/chosen": -679.5264282226562, "logps/rejected": -1101.302978515625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -15.195636749267578, "rewards/margins": 26.399478912353516, "rewards/rejected": -41.595115661621094, "step": 3208 }, { "epoch": 1.996267496111975, "grad_norm": 3.449564610491507e-05, "learning_rate": 1.8579990779160907e-06, "logits/chosen": -0.9296298027038574, "logits/rejected": 3.058635950088501, "logps/chosen": -398.24456787109375, "logps/rejected": -1040.71044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.891507148742676, "rewards/margins": 38.04267120361328, "rewards/rejected": -46.934181213378906, "step": 3209 }, { "epoch": 1.9968895800933126, "grad_norm": 9.294652409153059e-05, "learning_rate": 1.856846473029046e-06, "logits/chosen": 1.0654292106628418, "logits/rejected": 1.9909114837646484, "logps/chosen": -664.6588745117188, "logps/rejected": -1054.145751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.01765251159668, "rewards/margins": 31.69105339050293, "rewards/rejected": -45.70870590209961, "step": 3210 }, { "epoch": 1.9975116640746502, "grad_norm": 0.5575461387634277, "learning_rate": 1.8556938681420011e-06, "logits/chosen": 3.3123221397399902, "logits/rejected": 2.042398452758789, "logps/chosen": -661.1849365234375, "logps/rejected": -802.16162109375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -9.65969181060791, "rewards/margins": 23.294269561767578, "rewards/rejected": -32.95396423339844, "step": 3211 }, { "epoch": 1.9981337480559875, "grad_norm": 5.9800062444992363e-05, "learning_rate": 1.8545412632549563e-06, "logits/chosen": 0.1124124526977539, "logits/rejected": -0.4729093611240387, "logps/chosen": -529.6085205078125, "logps/rejected": -770.8993530273438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.632349014282227, "rewards/margins": 20.30960464477539, "rewards/rejected": -32.941951751708984, "step": 3212 }, { "epoch": 1.9987558320373249, "grad_norm": 0.42236706614494324, "learning_rate": 1.8533886583679118e-06, "logits/chosen": 1.4251984357833862, "logits/rejected": 3.264601469039917, "logps/chosen": -543.8138427734375, "logps/rejected": -847.8862915039062, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -8.262736320495605, "rewards/margins": 20.891653060913086, "rewards/rejected": -29.154390335083008, "step": 3213 }, { "epoch": 1.9993779160186627, "grad_norm": 36.19878387451172, "learning_rate": 1.852236053480867e-06, "logits/chosen": -1.031221866607666, "logits/rejected": 2.1314122676849365, "logps/chosen": -418.09295654296875, "logps/rejected": -987.6477661132812, "loss": 0.1566, "rewards/accuracies": 0.875, "rewards/chosen": -5.54425048828125, "rewards/margins": 33.829002380371094, "rewards/rejected": -39.373252868652344, "step": 3214 }, { "epoch": 2.0, "grad_norm": 2.246525632187968e-08, "learning_rate": 1.8510834485938222e-06, "logits/chosen": 1.2374851703643799, "logits/rejected": 3.5844645500183105, "logps/chosen": -616.0235595703125, "logps/rejected": -1126.6861572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.12330436706543, "rewards/margins": 36.971405029296875, "rewards/rejected": -52.09471130371094, "step": 3215 }, { "epoch": 2.0006220839813373, "grad_norm": 1.0097607472392411e-10, "learning_rate": 1.8499308437067775e-06, "logits/chosen": -0.8583577871322632, "logits/rejected": 4.101155757904053, "logps/chosen": -515.8357543945312, "logps/rejected": -1275.03466796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.491175651550293, "rewards/margins": 47.675228118896484, "rewards/rejected": -58.166404724121094, "step": 3216 }, { "epoch": 2.001244167962675, "grad_norm": 4.448748586582951e-05, "learning_rate": 1.8487782388197329e-06, "logits/chosen": 0.23793944716453552, "logits/rejected": 3.973947048187256, "logps/chosen": -618.2573852539062, "logps/rejected": -1212.6055908203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.788324356079102, "rewards/margins": 40.03203582763672, "rewards/rejected": -50.82035827636719, "step": 3217 }, { "epoch": 2.0018662519440125, "grad_norm": 7.491581345675513e-05, "learning_rate": 1.8476256339326881e-06, "logits/chosen": 1.447487473487854, "logits/rejected": 4.082512378692627, "logps/chosen": -616.44384765625, "logps/rejected": -1120.1319580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.434648513793945, "rewards/margins": 38.114646911621094, "rewards/rejected": -49.54929733276367, "step": 3218 }, { "epoch": 2.00248833592535, "grad_norm": 3.84054183086846e-05, "learning_rate": 1.8464730290456433e-06, "logits/chosen": 1.8149409294128418, "logits/rejected": 4.784121513366699, "logps/chosen": -467.8046875, "logps/rejected": -872.73779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.681539535522461, "rewards/margins": 28.198171615600586, "rewards/rejected": -36.87971496582031, "step": 3219 }, { "epoch": 2.0031104199066876, "grad_norm": 0.043713755905628204, "learning_rate": 1.8453204241585988e-06, "logits/chosen": 0.8241747617721558, "logits/rejected": 3.4915554523468018, "logps/chosen": -591.9862670898438, "logps/rejected": -1083.1734619140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.339737892150879, "rewards/margins": 31.408842086791992, "rewards/rejected": -41.74858093261719, "step": 3220 }, { "epoch": 2.003732503888025, "grad_norm": 1.5662541272831731e-06, "learning_rate": 1.844167819271554e-06, "logits/chosen": -3.3587021827697754, "logits/rejected": 3.4004125595092773, "logps/chosen": -293.3941650390625, "logps/rejected": -941.4876708984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.155046463012695, "rewards/margins": 30.374483108520508, "rewards/rejected": -37.5295295715332, "step": 3221 }, { "epoch": 2.0043545878693623, "grad_norm": 7.130699634552002, "learning_rate": 1.8430152143845092e-06, "logits/chosen": -1.5600241422653198, "logits/rejected": 2.922963857650757, "logps/chosen": -547.20849609375, "logps/rejected": -1075.0660400390625, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": -14.9271821975708, "rewards/margins": 28.489486694335938, "rewards/rejected": -43.41667175292969, "step": 3222 }, { "epoch": 2.0049766718507, "grad_norm": 1.1248069142766326e-09, "learning_rate": 1.8418626094974645e-06, "logits/chosen": -2.2901358604431152, "logits/rejected": 4.997872829437256, "logps/chosen": -412.37506103515625, "logps/rejected": -1133.8994140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.321676254272461, "rewards/margins": 33.20163345336914, "rewards/rejected": -43.52330780029297, "step": 3223 }, { "epoch": 2.0055987558320374, "grad_norm": 1.6254787169600604e-06, "learning_rate": 1.8407100046104199e-06, "logits/chosen": 0.942064642906189, "logits/rejected": 4.401899814605713, "logps/chosen": -590.219970703125, "logps/rejected": -1120.31787109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.195745468139648, "rewards/margins": 31.794509887695312, "rewards/rejected": -42.990257263183594, "step": 3224 }, { "epoch": 2.0062208398133747, "grad_norm": 2.278909960296005e-05, "learning_rate": 1.8395573997233751e-06, "logits/chosen": -2.379765033721924, "logits/rejected": 3.167349338531494, "logps/chosen": -284.0023193359375, "logps/rejected": -844.4675903320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.441474437713623, "rewards/margins": 31.908851623535156, "rewards/rejected": -35.35032653808594, "step": 3225 }, { "epoch": 2.006842923794712, "grad_norm": 1.1007481813430786, "learning_rate": 1.8384047948363303e-06, "logits/chosen": -0.15077215433120728, "logits/rejected": 2.8373799324035645, "logps/chosen": -441.18719482421875, "logps/rejected": -808.1421508789062, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -11.933378219604492, "rewards/margins": 23.44791030883789, "rewards/rejected": -35.381290435791016, "step": 3226 }, { "epoch": 2.00746500777605, "grad_norm": 6.995454100433562e-07, "learning_rate": 1.8372521899492856e-06, "logits/chosen": -0.511811375617981, "logits/rejected": 3.8422083854675293, "logps/chosen": -386.70245361328125, "logps/rejected": -960.3453369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.07375955581665, "rewards/margins": 31.684070587158203, "rewards/rejected": -37.75782775878906, "step": 3227 }, { "epoch": 2.008087091757387, "grad_norm": 1.6627997589946375e-11, "learning_rate": 1.836099585062241e-06, "logits/chosen": 0.5001524686813354, "logits/rejected": 1.829136848449707, "logps/chosen": -693.051025390625, "logps/rejected": -1198.63330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.901176452636719, "rewards/margins": 39.069000244140625, "rewards/rejected": -52.97017288208008, "step": 3228 }, { "epoch": 2.0087091757387245, "grad_norm": 0.2621017396450043, "learning_rate": 1.834946980175196e-06, "logits/chosen": -0.9760973453521729, "logits/rejected": 3.2193968296051025, "logps/chosen": -499.0474853515625, "logps/rejected": -1006.688232421875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -10.02197265625, "rewards/margins": 30.229379653930664, "rewards/rejected": -40.25135040283203, "step": 3229 }, { "epoch": 2.0093312597200623, "grad_norm": 0.003114903811365366, "learning_rate": 1.8337943752881512e-06, "logits/chosen": -2.958038568496704, "logits/rejected": 2.047264575958252, "logps/chosen": -356.1661071777344, "logps/rejected": -884.7626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.416285991668701, "rewards/margins": 27.496191024780273, "rewards/rejected": -34.912479400634766, "step": 3230 }, { "epoch": 2.0099533437013997, "grad_norm": 0.003051260020583868, "learning_rate": 1.8326417704011065e-06, "logits/chosen": -0.30247652530670166, "logits/rejected": 2.661752462387085, "logps/chosen": -552.0884399414062, "logps/rejected": -1117.947021484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.436690330505371, "rewards/margins": 34.702999114990234, "rewards/rejected": -47.13969039916992, "step": 3231 }, { "epoch": 2.010575427682737, "grad_norm": 0.03463459387421608, "learning_rate": 1.831489165514062e-06, "logits/chosen": 0.2040191888809204, "logits/rejected": 3.293041706085205, "logps/chosen": -549.462890625, "logps/rejected": -954.4931640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.499516487121582, "rewards/margins": 24.763486862182617, "rewards/rejected": -35.26300811767578, "step": 3232 }, { "epoch": 2.011197511664075, "grad_norm": 5.8191501011606306e-05, "learning_rate": 1.8303365606270171e-06, "logits/chosen": 0.7268747091293335, "logits/rejected": 0.9204151034355164, "logps/chosen": -638.6798095703125, "logps/rejected": -936.39453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.820808410644531, "rewards/margins": 26.461711883544922, "rewards/rejected": -39.28252029418945, "step": 3233 }, { "epoch": 2.011819595645412, "grad_norm": 1.3442055246670748e-09, "learning_rate": 1.8291839557399723e-06, "logits/chosen": -2.53243088722229, "logits/rejected": 2.388972282409668, "logps/chosen": -442.279541015625, "logps/rejected": -1245.7723388671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.95213508605957, "rewards/margins": 50.84278106689453, "rewards/rejected": -62.794918060302734, "step": 3234 }, { "epoch": 2.0124416796267495, "grad_norm": 20.80923080444336, "learning_rate": 1.8280313508529276e-06, "logits/chosen": 1.5385990142822266, "logits/rejected": 4.438154220581055, "logps/chosen": -615.07470703125, "logps/rejected": -1099.8695068359375, "loss": 0.1284, "rewards/accuracies": 0.875, "rewards/chosen": -7.9859466552734375, "rewards/margins": 32.458778381347656, "rewards/rejected": -40.44472122192383, "step": 3235 }, { "epoch": 2.0130637636080873, "grad_norm": 0.001437014085240662, "learning_rate": 1.826878745965883e-06, "logits/chosen": 1.5975340604782104, "logits/rejected": 2.7934255599975586, "logps/chosen": -651.9356079101562, "logps/rejected": -1084.415771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.168201446533203, "rewards/margins": 37.92890167236328, "rewards/rejected": -48.097103118896484, "step": 3236 }, { "epoch": 2.0136858475894246, "grad_norm": 0.022464267909526825, "learning_rate": 1.8257261410788382e-06, "logits/chosen": -0.4906814992427826, "logits/rejected": 4.151356220245361, "logps/chosen": -384.986083984375, "logps/rejected": -939.2615966796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.647461891174316, "rewards/margins": 29.363422393798828, "rewards/rejected": -40.01088333129883, "step": 3237 }, { "epoch": 2.014307931570762, "grad_norm": 0.0007543100509792566, "learning_rate": 1.8245735361917935e-06, "logits/chosen": 0.009873226284980774, "logits/rejected": 2.72328519821167, "logps/chosen": -531.1657104492188, "logps/rejected": -1049.25439453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.855783462524414, "rewards/margins": 33.60731887817383, "rewards/rejected": -45.46310043334961, "step": 3238 }, { "epoch": 2.0149300155520997, "grad_norm": 0.1094256192445755, "learning_rate": 1.8234209313047487e-06, "logits/chosen": 0.3360966444015503, "logits/rejected": 4.504080772399902, "logps/chosen": -476.1607666015625, "logps/rejected": -967.1328125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.062921047210693, "rewards/margins": 27.761829376220703, "rewards/rejected": -34.82475280761719, "step": 3239 }, { "epoch": 2.015552099533437, "grad_norm": 0.0464254654943943, "learning_rate": 1.8222683264177041e-06, "logits/chosen": 1.2491750717163086, "logits/rejected": 3.8947272300720215, "logps/chosen": -432.4212646484375, "logps/rejected": -855.244873046875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.43021011352539, "rewards/margins": 22.83322525024414, "rewards/rejected": -33.26343536376953, "step": 3240 }, { "epoch": 2.0161741835147744, "grad_norm": 21.57063865661621, "learning_rate": 1.8211157215306593e-06, "logits/chosen": 1.8107213973999023, "logits/rejected": 4.095378398895264, "logps/chosen": -570.1275634765625, "logps/rejected": -953.5574951171875, "loss": 0.1983, "rewards/accuracies": 0.875, "rewards/chosen": -9.565531730651855, "rewards/margins": 23.652999877929688, "rewards/rejected": -33.21853256225586, "step": 3241 }, { "epoch": 2.016796267496112, "grad_norm": 8.953101314546075e-06, "learning_rate": 1.8199631166436146e-06, "logits/chosen": 3.1453516483306885, "logits/rejected": 3.4556429386138916, "logps/chosen": -581.106689453125, "logps/rejected": -922.162841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.440402030944824, "rewards/margins": 27.36144256591797, "rewards/rejected": -39.80184555053711, "step": 3242 }, { "epoch": 2.0174183514774495, "grad_norm": 0.07161377370357513, "learning_rate": 1.81881051175657e-06, "logits/chosen": 1.7500510215759277, "logits/rejected": 3.6969451904296875, "logps/chosen": -632.09765625, "logps/rejected": -1028.01953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.109907150268555, "rewards/margins": 28.601673126220703, "rewards/rejected": -40.71158218383789, "step": 3243 }, { "epoch": 2.018040435458787, "grad_norm": 9.346132173959631e-06, "learning_rate": 1.8176579068695252e-06, "logits/chosen": -0.8983386754989624, "logits/rejected": 2.404198169708252, "logps/chosen": -537.7662353515625, "logps/rejected": -1138.3782958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.997988700866699, "rewards/margins": 38.091087341308594, "rewards/rejected": -46.089073181152344, "step": 3244 }, { "epoch": 2.018662519440124, "grad_norm": 0.00020862782548647374, "learning_rate": 1.8165053019824805e-06, "logits/chosen": 0.0696406364440918, "logits/rejected": 3.6975560188293457, "logps/chosen": -413.0459289550781, "logps/rejected": -961.5831909179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.714500904083252, "rewards/margins": 35.98335266113281, "rewards/rejected": -43.69785690307617, "step": 3245 }, { "epoch": 2.019284603421462, "grad_norm": 0.004104145802557468, "learning_rate": 1.8153526970954357e-06, "logits/chosen": -0.13870498538017273, "logits/rejected": 4.5534257888793945, "logps/chosen": -294.36688232421875, "logps/rejected": -854.3531494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.45639181137085, "rewards/margins": 26.36554718017578, "rewards/rejected": -32.821937561035156, "step": 3246 }, { "epoch": 2.0199066874027993, "grad_norm": 3.7008983326813905e-06, "learning_rate": 1.8142000922083911e-06, "logits/chosen": 0.7690372467041016, "logits/rejected": 2.658047914505005, "logps/chosen": -630.7483520507812, "logps/rejected": -1073.966064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.671428680419922, "rewards/margins": 31.08428955078125, "rewards/rejected": -42.75571823120117, "step": 3247 }, { "epoch": 2.0205287713841367, "grad_norm": 0.07437684386968613, "learning_rate": 1.8130474873213463e-06, "logits/chosen": 0.5262776613235474, "logits/rejected": 2.2410149574279785, "logps/chosen": -577.2639770507812, "logps/rejected": -973.701416015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -10.476058959960938, "rewards/margins": 30.7393798828125, "rewards/rejected": -41.21543884277344, "step": 3248 }, { "epoch": 2.0211508553654745, "grad_norm": 1.5357197523117065, "learning_rate": 1.8118948824343016e-06, "logits/chosen": 1.70393705368042, "logits/rejected": 3.8450446128845215, "logps/chosen": -481.91070556640625, "logps/rejected": -1086.33935546875, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -8.68971061706543, "rewards/margins": 40.52362060546875, "rewards/rejected": -49.21332931518555, "step": 3249 }, { "epoch": 2.021772939346812, "grad_norm": 5.022254834230466e-10, "learning_rate": 1.810742277547257e-06, "logits/chosen": -2.6166677474975586, "logits/rejected": 2.7093167304992676, "logps/chosen": -303.3418273925781, "logps/rejected": -1076.861083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.309881210327148, "rewards/margins": 42.87946319580078, "rewards/rejected": -49.1893424987793, "step": 3250 }, { "epoch": 2.022395023328149, "grad_norm": 5.978255271911621, "learning_rate": 1.8095896726602122e-06, "logits/chosen": 0.09582383185625076, "logits/rejected": 1.5433762073516846, "logps/chosen": -550.7335205078125, "logps/rejected": -959.3916625976562, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -10.984166145324707, "rewards/margins": 23.924039840698242, "rewards/rejected": -34.908203125, "step": 3251 }, { "epoch": 2.023017107309487, "grad_norm": 0.005254325456917286, "learning_rate": 1.8084370677731675e-06, "logits/chosen": -0.8678733706474304, "logits/rejected": 2.8479673862457275, "logps/chosen": -551.5311279296875, "logps/rejected": -1241.876708984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.364095687866211, "rewards/margins": 42.996192932128906, "rewards/rejected": -56.360286712646484, "step": 3252 }, { "epoch": 2.0236391912908243, "grad_norm": 0.026929447427392006, "learning_rate": 1.8072844628861227e-06, "logits/chosen": -1.9292072057724, "logits/rejected": 2.830716848373413, "logps/chosen": -461.7641906738281, "logps/rejected": -977.4414672851562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.050446510314941, "rewards/margins": 23.504653930664062, "rewards/rejected": -30.55510139465332, "step": 3253 }, { "epoch": 2.0242612752721616, "grad_norm": 35.80807876586914, "learning_rate": 1.8061318579990781e-06, "logits/chosen": 0.03676527738571167, "logits/rejected": 2.638430595397949, "logps/chosen": -380.2243347167969, "logps/rejected": -805.8291015625, "loss": 0.3045, "rewards/accuracies": 0.875, "rewards/chosen": -13.26235580444336, "rewards/margins": 25.53285026550293, "rewards/rejected": -38.795204162597656, "step": 3254 }, { "epoch": 2.0248833592534994, "grad_norm": 0.05008547008037567, "learning_rate": 1.8049792531120333e-06, "logits/chosen": -0.5707646608352661, "logits/rejected": 3.5213394165039062, "logps/chosen": -453.5271301269531, "logps/rejected": -1093.7734375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.335705757141113, "rewards/margins": 42.951393127441406, "rewards/rejected": -48.2870979309082, "step": 3255 }, { "epoch": 2.0255054432348367, "grad_norm": 16.085357666015625, "learning_rate": 1.8038266482249886e-06, "logits/chosen": 2.096897602081299, "logits/rejected": 2.4062466621398926, "logps/chosen": -702.8720092773438, "logps/rejected": -930.53564453125, "loss": 0.091, "rewards/accuracies": 0.875, "rewards/chosen": -12.675012588500977, "rewards/margins": 20.07527732849121, "rewards/rejected": -32.75028991699219, "step": 3256 }, { "epoch": 2.026127527216174, "grad_norm": 0.00011278959573246539, "learning_rate": 1.8026740433379438e-06, "logits/chosen": 2.4136128425598145, "logits/rejected": 3.5406317710876465, "logps/chosen": -606.5416259765625, "logps/rejected": -968.1136474609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.13314437866211, "rewards/margins": 29.38345718383789, "rewards/rejected": -42.5166015625, "step": 3257 }, { "epoch": 2.026749611197512, "grad_norm": 0.0005387061974033713, "learning_rate": 1.8015214384508992e-06, "logits/chosen": -0.6184181571006775, "logits/rejected": 2.548027992248535, "logps/chosen": -573.9535522460938, "logps/rejected": -1113.246337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.945446014404297, "rewards/margins": 35.821556091308594, "rewards/rejected": -50.76700210571289, "step": 3258 }, { "epoch": 2.027371695178849, "grad_norm": 0.00011549627379281446, "learning_rate": 1.8003688335638544e-06, "logits/chosen": 3.146450996398926, "logits/rejected": 3.83984375, "logps/chosen": -783.1937255859375, "logps/rejected": -1222.3145751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.651394844055176, "rewards/margins": 37.6688232421875, "rewards/rejected": -53.320220947265625, "step": 3259 }, { "epoch": 2.0279937791601865, "grad_norm": 0.006031322292983532, "learning_rate": 1.7992162286768097e-06, "logits/chosen": -1.1792547702789307, "logits/rejected": 0.5242654085159302, "logps/chosen": -531.763916015625, "logps/rejected": -996.9135131835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.301654815673828, "rewards/margins": 33.844364166259766, "rewards/rejected": -44.146018981933594, "step": 3260 }, { "epoch": 2.0286158631415243, "grad_norm": 0.005457498598843813, "learning_rate": 1.7980636237897651e-06, "logits/chosen": -0.2665392756462097, "logits/rejected": 4.692824363708496, "logps/chosen": -368.7724914550781, "logps/rejected": -965.8126220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.76845645904541, "rewards/margins": 28.322364807128906, "rewards/rejected": -36.0908203125, "step": 3261 }, { "epoch": 2.0292379471228617, "grad_norm": 1.9818975488306023e-05, "learning_rate": 1.7969110189027203e-06, "logits/chosen": 0.24156644940376282, "logits/rejected": 2.885503053665161, "logps/chosen": -686.6699829101562, "logps/rejected": -1015.3580322265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.752955436706543, "rewards/margins": 30.202939987182617, "rewards/rejected": -42.95589828491211, "step": 3262 }, { "epoch": 2.029860031104199, "grad_norm": 4.516140563737281e-07, "learning_rate": 1.7957584140156756e-06, "logits/chosen": 1.4977079629898071, "logits/rejected": 4.380037307739258, "logps/chosen": -616.4777221679688, "logps/rejected": -1208.46826171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.326394081115723, "rewards/margins": 39.57026672363281, "rewards/rejected": -52.89665603637695, "step": 3263 }, { "epoch": 2.0304821150855363, "grad_norm": 7.894611826486653e-07, "learning_rate": 1.7946058091286308e-06, "logits/chosen": 1.232468843460083, "logits/rejected": 3.525763511657715, "logps/chosen": -512.837646484375, "logps/rejected": -1061.3662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.058338165283203, "rewards/margins": 38.82855224609375, "rewards/rejected": -47.88689422607422, "step": 3264 }, { "epoch": 2.031104199066874, "grad_norm": 0.46222686767578125, "learning_rate": 1.7934532042415862e-06, "logits/chosen": 0.8871013522148132, "logits/rejected": 3.695953607559204, "logps/chosen": -575.14892578125, "logps/rejected": -972.8055419921875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -10.340843200683594, "rewards/margins": 26.107912063598633, "rewards/rejected": -36.448753356933594, "step": 3265 }, { "epoch": 2.0317262830482115, "grad_norm": 0.00018285130499862134, "learning_rate": 1.7923005993545414e-06, "logits/chosen": 0.45569974184036255, "logits/rejected": 2.566760301589966, "logps/chosen": -628.85400390625, "logps/rejected": -1045.263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.879467010498047, "rewards/margins": 29.43852996826172, "rewards/rejected": -41.317996978759766, "step": 3266 }, { "epoch": 2.032348367029549, "grad_norm": 4.834855644730851e-05, "learning_rate": 1.7911479944674967e-06, "logits/chosen": 1.829100489616394, "logits/rejected": 3.4712557792663574, "logps/chosen": -811.4913330078125, "logps/rejected": -1266.06103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.349407196044922, "rewards/margins": 36.72393798828125, "rewards/rejected": -52.07334518432617, "step": 3267 }, { "epoch": 2.0329704510108866, "grad_norm": 11.432663917541504, "learning_rate": 1.789995389580452e-06, "logits/chosen": -2.6283178329467773, "logits/rejected": 2.5972487926483154, "logps/chosen": -460.37530517578125, "logps/rejected": -983.4398803710938, "loss": 0.0675, "rewards/accuracies": 1.0, "rewards/chosen": -10.121039390563965, "rewards/margins": 27.89306640625, "rewards/rejected": -38.01410675048828, "step": 3268 }, { "epoch": 2.033592534992224, "grad_norm": 5.5460273870266974e-05, "learning_rate": 1.7888427846934073e-06, "logits/chosen": 0.5614759922027588, "logits/rejected": 2.7525126934051514, "logps/chosen": -524.7399291992188, "logps/rejected": -1027.02294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.975117683410645, "rewards/margins": 32.141090393066406, "rewards/rejected": -41.1162109375, "step": 3269 }, { "epoch": 2.0342146189735613, "grad_norm": 0.0012390101328492165, "learning_rate": 1.7876901798063626e-06, "logits/chosen": 0.3229471743106842, "logits/rejected": 2.942484140396118, "logps/chosen": -524.5540771484375, "logps/rejected": -1004.2750244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.311838626861572, "rewards/margins": 36.90262985229492, "rewards/rejected": -42.21446990966797, "step": 3270 }, { "epoch": 2.034836702954899, "grad_norm": 0.002542484551668167, "learning_rate": 1.7865375749193178e-06, "logits/chosen": 0.5060664415359497, "logits/rejected": 4.053060531616211, "logps/chosen": -528.406005859375, "logps/rejected": -1041.412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.696809768676758, "rewards/margins": 26.39920425415039, "rewards/rejected": -35.09601593017578, "step": 3271 }, { "epoch": 2.0354587869362364, "grad_norm": 0.0006695652264170349, "learning_rate": 1.7853849700322732e-06, "logits/chosen": 1.4988611936569214, "logits/rejected": 1.3970237970352173, "logps/chosen": -627.5516967773438, "logps/rejected": -843.8731689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.545269012451172, "rewards/margins": 24.648292541503906, "rewards/rejected": -37.19355773925781, "step": 3272 }, { "epoch": 2.0360808709175737, "grad_norm": 0.011035816743969917, "learning_rate": 1.7842323651452284e-06, "logits/chosen": -2.0264055728912354, "logits/rejected": 2.839123487472534, "logps/chosen": -318.1818542480469, "logps/rejected": -856.6058959960938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.792394161224365, "rewards/margins": 23.563783645629883, "rewards/rejected": -30.356178283691406, "step": 3273 }, { "epoch": 2.0367029548989115, "grad_norm": 0.00011552633077371866, "learning_rate": 1.7830797602581837e-06, "logits/chosen": 0.0791429877281189, "logits/rejected": 3.939262866973877, "logps/chosen": -413.80029296875, "logps/rejected": -987.649169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.731544017791748, "rewards/margins": 38.70989990234375, "rewards/rejected": -44.441444396972656, "step": 3274 }, { "epoch": 2.037325038880249, "grad_norm": 0.19351507723331451, "learning_rate": 1.7819271553711389e-06, "logits/chosen": 0.4769981801509857, "logits/rejected": 2.4208312034606934, "logps/chosen": -651.798095703125, "logps/rejected": -1105.50830078125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -12.518138885498047, "rewards/margins": 32.211429595947266, "rewards/rejected": -44.72956848144531, "step": 3275 }, { "epoch": 2.037947122861586, "grad_norm": 1.6189000362487604e-08, "learning_rate": 1.7807745504840943e-06, "logits/chosen": 0.6967576146125793, "logits/rejected": 4.596070289611816, "logps/chosen": -495.2347412109375, "logps/rejected": -1113.9619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.830886840820312, "rewards/margins": 35.10674285888672, "rewards/rejected": -43.93762969970703, "step": 3276 }, { "epoch": 2.038569206842924, "grad_norm": 3.4301083360332996e-05, "learning_rate": 1.7796219455970496e-06, "logits/chosen": 0.39109957218170166, "logits/rejected": 3.4164555072784424, "logps/chosen": -509.5931091308594, "logps/rejected": -1002.6499633789062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.291837692260742, "rewards/margins": 33.81525421142578, "rewards/rejected": -46.107093811035156, "step": 3277 }, { "epoch": 2.0391912908242613, "grad_norm": 0.714177131652832, "learning_rate": 1.7784693407100048e-06, "logits/chosen": 2.7856783866882324, "logits/rejected": 3.950831413269043, "logps/chosen": -630.0526123046875, "logps/rejected": -963.7937622070312, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -9.394908905029297, "rewards/margins": 25.152563095092773, "rewards/rejected": -34.5474739074707, "step": 3278 }, { "epoch": 2.0398133748055987, "grad_norm": 0.05921145901083946, "learning_rate": 1.77731673582296e-06, "logits/chosen": -0.07561540603637695, "logits/rejected": 1.551999807357788, "logps/chosen": -601.9862060546875, "logps/rejected": -1023.5673828125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -14.382827758789062, "rewards/margins": 30.679702758789062, "rewards/rejected": -45.06253433227539, "step": 3279 }, { "epoch": 2.0404354587869364, "grad_norm": 0.019323257729411125, "learning_rate": 1.7761641309359154e-06, "logits/chosen": -1.1875865459442139, "logits/rejected": 2.3914713859558105, "logps/chosen": -543.6219482421875, "logps/rejected": -960.1560668945312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.5393648147583, "rewards/margins": 24.20549774169922, "rewards/rejected": -32.74486541748047, "step": 3280 }, { "epoch": 2.041057542768274, "grad_norm": 0.0016454965807497501, "learning_rate": 1.7750115260488707e-06, "logits/chosen": 3.365413188934326, "logits/rejected": 4.608278274536133, "logps/chosen": -793.6561889648438, "logps/rejected": -1262.06640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.539536476135254, "rewards/margins": 35.477149963378906, "rewards/rejected": -48.016685485839844, "step": 3281 }, { "epoch": 2.041679626749611, "grad_norm": 0.00505446782335639, "learning_rate": 1.7738589211618259e-06, "logits/chosen": 1.5577607154846191, "logits/rejected": 1.9160778522491455, "logps/chosen": -656.432861328125, "logps/rejected": -947.4666748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.272841453552246, "rewards/margins": 25.096763610839844, "rewards/rejected": -34.369606018066406, "step": 3282 }, { "epoch": 2.0423017107309485, "grad_norm": 4.4753107886208454e-07, "learning_rate": 1.7727063162747813e-06, "logits/chosen": 0.7870282530784607, "logits/rejected": 4.364411354064941, "logps/chosen": -598.6383056640625, "logps/rejected": -1166.548828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.089197158813477, "rewards/margins": 34.72199249267578, "rewards/rejected": -47.811187744140625, "step": 3283 }, { "epoch": 2.0429237947122862, "grad_norm": 0.0002891090407501906, "learning_rate": 1.7715537113877366e-06, "logits/chosen": 0.13575269281864166, "logits/rejected": 2.0142300128936768, "logps/chosen": -677.8521728515625, "logps/rejected": -1039.75830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.51553726196289, "rewards/margins": 25.58846092224121, "rewards/rejected": -41.10399627685547, "step": 3284 }, { "epoch": 2.0435458786936236, "grad_norm": 1.2564136397941184e-07, "learning_rate": 1.7704011065006918e-06, "logits/chosen": -3.027099847793579, "logits/rejected": 4.060395240783691, "logps/chosen": -396.2166748046875, "logps/rejected": -1203.6392822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.413979530334473, "rewards/margins": 37.810646057128906, "rewards/rejected": -45.22462463378906, "step": 3285 }, { "epoch": 2.044167962674961, "grad_norm": 29.117576599121094, "learning_rate": 1.769248501613647e-06, "logits/chosen": 0.6583254933357239, "logits/rejected": 2.5260579586029053, "logps/chosen": -604.4136962890625, "logps/rejected": -879.0269165039062, "loss": 0.3192, "rewards/accuracies": 0.875, "rewards/chosen": -8.834407806396484, "rewards/margins": 22.25692367553711, "rewards/rejected": -31.091331481933594, "step": 3286 }, { "epoch": 2.0447900466562987, "grad_norm": 0.0004560309462249279, "learning_rate": 1.7680958967266024e-06, "logits/chosen": 0.19445538520812988, "logits/rejected": 2.462914228439331, "logps/chosen": -562.7223510742188, "logps/rejected": -1123.900634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.67230224609375, "rewards/margins": 36.951412200927734, "rewards/rejected": -49.62371826171875, "step": 3287 }, { "epoch": 2.045412130637636, "grad_norm": 0.005535934120416641, "learning_rate": 1.7669432918395577e-06, "logits/chosen": -0.4256298840045929, "logits/rejected": 2.6482839584350586, "logps/chosen": -507.5963134765625, "logps/rejected": -932.339599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.090707778930664, "rewards/margins": 28.398929595947266, "rewards/rejected": -38.4896354675293, "step": 3288 }, { "epoch": 2.0460342146189734, "grad_norm": 1.1572271585464478, "learning_rate": 1.7657906869525129e-06, "logits/chosen": 0.5644538998603821, "logits/rejected": 1.5637774467468262, "logps/chosen": -629.345458984375, "logps/rejected": -1000.5640869140625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -13.741057395935059, "rewards/margins": 32.58911895751953, "rewards/rejected": -46.330177307128906, "step": 3289 }, { "epoch": 2.046656298600311, "grad_norm": 1.0075103044509888, "learning_rate": 1.7646380820654681e-06, "logits/chosen": 1.313632845878601, "logits/rejected": 3.4829115867614746, "logps/chosen": -548.7958984375, "logps/rejected": -918.281005859375, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -9.908761978149414, "rewards/margins": 25.477642059326172, "rewards/rejected": -35.38640594482422, "step": 3290 }, { "epoch": 2.0472783825816485, "grad_norm": 3.8606403904850595e-06, "learning_rate": 1.7634854771784235e-06, "logits/chosen": -1.0874159336090088, "logits/rejected": 3.1560041904449463, "logps/chosen": -512.3287963867188, "logps/rejected": -1114.589599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.583832740783691, "rewards/margins": 35.8284912109375, "rewards/rejected": -45.412322998046875, "step": 3291 }, { "epoch": 2.047900466562986, "grad_norm": 1.0839836761533661e-07, "learning_rate": 1.7623328722913788e-06, "logits/chosen": 0.6895605325698853, "logits/rejected": 2.0665345191955566, "logps/chosen": -607.904296875, "logps/rejected": -1044.9071044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.638944625854492, "rewards/margins": 32.77497100830078, "rewards/rejected": -46.413917541503906, "step": 3292 }, { "epoch": 2.0485225505443236, "grad_norm": 0.1251257359981537, "learning_rate": 1.761180267404334e-06, "logits/chosen": -0.2770364284515381, "logits/rejected": 2.772951364517212, "logps/chosen": -615.9885864257812, "logps/rejected": -1054.5169677734375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -12.234400749206543, "rewards/margins": 32.018253326416016, "rewards/rejected": -44.252655029296875, "step": 3293 }, { "epoch": 2.049144634525661, "grad_norm": 34.915321350097656, "learning_rate": 1.7600276625172894e-06, "logits/chosen": -1.7377254962921143, "logits/rejected": 0.7142691612243652, "logps/chosen": -563.9683837890625, "logps/rejected": -1082.34619140625, "loss": 0.2039, "rewards/accuracies": 0.875, "rewards/chosen": -14.72260570526123, "rewards/margins": 31.14119529724121, "rewards/rejected": -45.863800048828125, "step": 3294 }, { "epoch": 2.0497667185069983, "grad_norm": 5.692875862121582, "learning_rate": 1.7588750576302447e-06, "logits/chosen": -1.636199712753296, "logits/rejected": 2.049520254135132, "logps/chosen": -406.2523193359375, "logps/rejected": -944.5439453125, "loss": 0.0803, "rewards/accuracies": 1.0, "rewards/chosen": -11.766059875488281, "rewards/margins": 29.158157348632812, "rewards/rejected": -40.924217224121094, "step": 3295 }, { "epoch": 2.050388802488336, "grad_norm": 5.243829946266487e-05, "learning_rate": 1.7577224527431997e-06, "logits/chosen": 3.2839221954345703, "logits/rejected": 4.257046699523926, "logps/chosen": -686.9495849609375, "logps/rejected": -1010.24462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.418400764465332, "rewards/margins": 29.747093200683594, "rewards/rejected": -41.16549301147461, "step": 3296 }, { "epoch": 2.0510108864696734, "grad_norm": 0.005440168082714081, "learning_rate": 1.756569847856155e-06, "logits/chosen": 1.3700108528137207, "logits/rejected": 2.389298677444458, "logps/chosen": -632.6429443359375, "logps/rejected": -1019.9666748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.622817993164062, "rewards/margins": 30.42552947998047, "rewards/rejected": -42.04834747314453, "step": 3297 }, { "epoch": 2.051632970451011, "grad_norm": 28.08281707763672, "learning_rate": 1.7554172429691101e-06, "logits/chosen": -0.11010386794805527, "logits/rejected": 3.2671074867248535, "logps/chosen": -394.72784423828125, "logps/rejected": -937.2503662109375, "loss": 0.5615, "rewards/accuracies": 0.875, "rewards/chosen": -9.893918991088867, "rewards/margins": 30.8527774810791, "rewards/rejected": -40.74669647216797, "step": 3298 }, { "epoch": 2.0522550544323486, "grad_norm": 6.840485002612695e-05, "learning_rate": 1.7542646380820656e-06, "logits/chosen": 2.731478214263916, "logits/rejected": 4.624332427978516, "logps/chosen": -590.8049926757812, "logps/rejected": -916.9151000976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.759002685546875, "rewards/margins": 25.12192153930664, "rewards/rejected": -34.88092041015625, "step": 3299 }, { "epoch": 2.052877138413686, "grad_norm": 0.005580862518399954, "learning_rate": 1.7531120331950208e-06, "logits/chosen": -1.3007276058197021, "logits/rejected": 2.0316357612609863, "logps/chosen": -466.2745361328125, "logps/rejected": -922.46044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.656286239624023, "rewards/margins": 25.383224487304688, "rewards/rejected": -36.03950881958008, "step": 3300 }, { "epoch": 2.0534992223950232, "grad_norm": 3.2947580814361572, "learning_rate": 1.751959428307976e-06, "logits/chosen": -1.4168139696121216, "logits/rejected": 1.8039307594299316, "logps/chosen": -466.0013732910156, "logps/rejected": -957.6085205078125, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -8.91163444519043, "rewards/margins": 30.08930206298828, "rewards/rejected": -39.000938415527344, "step": 3301 }, { "epoch": 2.054121306376361, "grad_norm": 0.0028978725895285606, "learning_rate": 1.7508068234209312e-06, "logits/chosen": 0.9679357409477234, "logits/rejected": 2.403862237930298, "logps/chosen": -657.3065185546875, "logps/rejected": -984.7806396484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.79493236541748, "rewards/margins": 24.7813720703125, "rewards/rejected": -34.57630157470703, "step": 3302 }, { "epoch": 2.0547433903576984, "grad_norm": 0.0019351892406120896, "learning_rate": 1.7496542185338867e-06, "logits/chosen": 2.3574891090393066, "logits/rejected": 4.697743892669678, "logps/chosen": -553.681884765625, "logps/rejected": -966.7857055664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.650367736816406, "rewards/margins": 25.174945831298828, "rewards/rejected": -35.825313568115234, "step": 3303 }, { "epoch": 2.0553654743390357, "grad_norm": 0.10551003366708755, "learning_rate": 1.7485016136468419e-06, "logits/chosen": 1.1634535789489746, "logits/rejected": 2.7659077644348145, "logps/chosen": -536.2604370117188, "logps/rejected": -869.1211547851562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.846975326538086, "rewards/margins": 19.269046783447266, "rewards/rejected": -27.11602210998535, "step": 3304 }, { "epoch": 2.055987558320373, "grad_norm": 0.004820887930691242, "learning_rate": 1.7473490087597971e-06, "logits/chosen": 2.5007271766662598, "logits/rejected": 2.895714282989502, "logps/chosen": -633.784912109375, "logps/rejected": -966.4913330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.281123161315918, "rewards/margins": 29.767860412597656, "rewards/rejected": -36.04898452758789, "step": 3305 }, { "epoch": 2.056609642301711, "grad_norm": 4.788481237483211e-05, "learning_rate": 1.7461964038727526e-06, "logits/chosen": -1.0666640996932983, "logits/rejected": 2.2224929332733154, "logps/chosen": -327.1462707519531, "logps/rejected": -840.1002197265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.442052364349365, "rewards/margins": 33.94697189331055, "rewards/rejected": -39.38902282714844, "step": 3306 }, { "epoch": 2.057231726283048, "grad_norm": 0.0034600174985826015, "learning_rate": 1.7450437989857078e-06, "logits/chosen": 0.42211639881134033, "logits/rejected": 4.9143571853637695, "logps/chosen": -468.89068603515625, "logps/rejected": -1009.6427001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.933347225189209, "rewards/margins": 27.643991470336914, "rewards/rejected": -35.57733917236328, "step": 3307 }, { "epoch": 2.0578538102643855, "grad_norm": 0.00637791259214282, "learning_rate": 1.743891194098663e-06, "logits/chosen": 0.0010982751846313477, "logits/rejected": 0.983103334903717, "logps/chosen": -619.0855712890625, "logps/rejected": -880.09619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.18000316619873, "rewards/margins": 24.80661964416504, "rewards/rejected": -35.98662185668945, "step": 3308 }, { "epoch": 2.0584758942457233, "grad_norm": 2.640011916810181e-05, "learning_rate": 1.7427385892116182e-06, "logits/chosen": 0.31347817182540894, "logits/rejected": 3.793872117996216, "logps/chosen": -429.73309326171875, "logps/rejected": -894.3707885742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.5145440101623535, "rewards/margins": 27.934934616088867, "rewards/rejected": -34.44947814941406, "step": 3309 }, { "epoch": 2.0590979782270606, "grad_norm": 0.08523139357566833, "learning_rate": 1.7415859843245737e-06, "logits/chosen": 1.4979889392852783, "logits/rejected": 2.8870656490325928, "logps/chosen": -652.359375, "logps/rejected": -1072.613037109375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -13.882624626159668, "rewards/margins": 29.20603370666504, "rewards/rejected": -43.08865737915039, "step": 3310 }, { "epoch": 2.059720062208398, "grad_norm": 1.8696160316467285, "learning_rate": 1.7404333794375289e-06, "logits/chosen": -1.357941746711731, "logits/rejected": 5.028107166290283, "logps/chosen": -366.6391296386719, "logps/rejected": -1070.263916015625, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -11.512500762939453, "rewards/margins": 30.921201705932617, "rewards/rejected": -42.4337043762207, "step": 3311 }, { "epoch": 2.0603421461897358, "grad_norm": 4.39236537204124e-05, "learning_rate": 1.7392807745504841e-06, "logits/chosen": -0.5997156500816345, "logits/rejected": 3.348984956741333, "logps/chosen": -473.5180969238281, "logps/rejected": -960.445556640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.715170860290527, "rewards/margins": 27.744476318359375, "rewards/rejected": -34.45964813232422, "step": 3312 }, { "epoch": 2.060964230171073, "grad_norm": 10.655519485473633, "learning_rate": 1.7381281696634396e-06, "logits/chosen": 2.1897690296173096, "logits/rejected": 4.3341875076293945, "logps/chosen": -429.31158447265625, "logps/rejected": -806.4674072265625, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": -4.688857078552246, "rewards/margins": 27.4539737701416, "rewards/rejected": -32.1428337097168, "step": 3313 }, { "epoch": 2.0615863141524104, "grad_norm": 17.07646369934082, "learning_rate": 1.7369755647763948e-06, "logits/chosen": 1.3907215595245361, "logits/rejected": 2.856285333633423, "logps/chosen": -531.179443359375, "logps/rejected": -748.7376708984375, "loss": 0.0693, "rewards/accuracies": 1.0, "rewards/chosen": -8.567995071411133, "rewards/margins": 19.450092315673828, "rewards/rejected": -28.01808738708496, "step": 3314 }, { "epoch": 2.0622083981337482, "grad_norm": 0.2608616054058075, "learning_rate": 1.73582295988935e-06, "logits/chosen": -3.7539005279541016, "logits/rejected": 3.1033551692962646, "logps/chosen": -227.97682189941406, "logps/rejected": -840.460205078125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -3.802971363067627, "rewards/margins": 24.070194244384766, "rewards/rejected": -27.873165130615234, "step": 3315 }, { "epoch": 2.0628304821150856, "grad_norm": 0.00168923893943429, "learning_rate": 1.7346703550023052e-06, "logits/chosen": -0.24053560197353363, "logits/rejected": 1.6045324802398682, "logps/chosen": -540.8382568359375, "logps/rejected": -906.5202026367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.838738441467285, "rewards/margins": 26.211814880371094, "rewards/rejected": -37.05055236816406, "step": 3316 }, { "epoch": 2.063452566096423, "grad_norm": 19.610145568847656, "learning_rate": 1.7335177501152607e-06, "logits/chosen": 1.4513070583343506, "logits/rejected": 2.9208927154541016, "logps/chosen": -546.4124145507812, "logps/rejected": -900.282470703125, "loss": 0.1005, "rewards/accuracies": 0.875, "rewards/chosen": -8.553874969482422, "rewards/margins": 29.052043914794922, "rewards/rejected": -37.605918884277344, "step": 3317 }, { "epoch": 2.0640746500777607, "grad_norm": 4.993141919840127e-05, "learning_rate": 1.7323651452282159e-06, "logits/chosen": -4.351912498474121, "logits/rejected": 2.076814651489258, "logps/chosen": -291.6829833984375, "logps/rejected": -931.6229248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.039106369018555, "rewards/margins": 25.343379974365234, "rewards/rejected": -30.38248634338379, "step": 3318 }, { "epoch": 2.064696734059098, "grad_norm": 0.47496530413627625, "learning_rate": 1.7312125403411711e-06, "logits/chosen": -1.0925979614257812, "logits/rejected": 2.5883841514587402, "logps/chosen": -294.2505187988281, "logps/rejected": -662.4988403320312, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -3.046053886413574, "rewards/margins": 20.85523223876953, "rewards/rejected": -23.901287078857422, "step": 3319 }, { "epoch": 2.0653188180404354, "grad_norm": 0.0007322177407331765, "learning_rate": 1.7300599354541263e-06, "logits/chosen": -1.0787445306777954, "logits/rejected": 3.1141369342803955, "logps/chosen": -682.3014526367188, "logps/rejected": -1381.6591796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.946907043457031, "rewards/margins": 40.99354934692383, "rewards/rejected": -53.940452575683594, "step": 3320 }, { "epoch": 2.065940902021773, "grad_norm": 0.7636861205101013, "learning_rate": 1.7289073305670818e-06, "logits/chosen": -2.0277695655822754, "logits/rejected": 1.954400897026062, "logps/chosen": -455.1981201171875, "logps/rejected": -830.0636596679688, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -7.5624308586120605, "rewards/margins": 14.610343933105469, "rewards/rejected": -22.172775268554688, "step": 3321 }, { "epoch": 2.0665629860031105, "grad_norm": 0.0060247681103646755, "learning_rate": 1.727754725680037e-06, "logits/chosen": 0.31880348920822144, "logits/rejected": 2.8384175300598145, "logps/chosen": -446.51092529296875, "logps/rejected": -844.023681640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.652673721313477, "rewards/margins": 25.704570770263672, "rewards/rejected": -35.357242584228516, "step": 3322 }, { "epoch": 2.067185069984448, "grad_norm": 0.015240863896906376, "learning_rate": 1.7266021207929922e-06, "logits/chosen": -1.3900054693222046, "logits/rejected": 2.368161916732788, "logps/chosen": -461.9625244140625, "logps/rejected": -941.9267578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.914365768432617, "rewards/margins": 31.33608055114746, "rewards/rejected": -38.25044631958008, "step": 3323 }, { "epoch": 2.067807153965785, "grad_norm": 7.038326543806761e-07, "learning_rate": 1.7254495159059477e-06, "logits/chosen": -1.3869534730911255, "logits/rejected": 2.0281357765197754, "logps/chosen": -487.2365417480469, "logps/rejected": -1085.301513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.76966381072998, "rewards/margins": 36.14636993408203, "rewards/rejected": -44.91603469848633, "step": 3324 }, { "epoch": 2.068429237947123, "grad_norm": 0.047666098922491074, "learning_rate": 1.7242969110189029e-06, "logits/chosen": 2.2894299030303955, "logits/rejected": 3.7836549282073975, "logps/chosen": -689.18798828125, "logps/rejected": -1073.497802734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.957679748535156, "rewards/margins": 26.87308120727539, "rewards/rejected": -40.83075714111328, "step": 3325 }, { "epoch": 2.0690513219284603, "grad_norm": 1.717453734784158e-08, "learning_rate": 1.7231443061318581e-06, "logits/chosen": 1.9342631101608276, "logits/rejected": 2.905841588973999, "logps/chosen": -699.0042724609375, "logps/rejected": -1064.35498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.291414260864258, "rewards/margins": 30.3492488861084, "rewards/rejected": -41.640663146972656, "step": 3326 }, { "epoch": 2.0696734059097976, "grad_norm": 0.1288936734199524, "learning_rate": 1.7219917012448133e-06, "logits/chosen": 3.1761245727539062, "logits/rejected": 3.488016128540039, "logps/chosen": -561.826904296875, "logps/rejected": -791.1370849609375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -8.260245323181152, "rewards/margins": 22.008047103881836, "rewards/rejected": -30.268291473388672, "step": 3327 }, { "epoch": 2.0702954898911354, "grad_norm": 4.1853404075808953e-10, "learning_rate": 1.7208390963577688e-06, "logits/chosen": 1.2564480304718018, "logits/rejected": 4.029335021972656, "logps/chosen": -612.748291015625, "logps/rejected": -1131.76025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.627924919128418, "rewards/margins": 35.778465270996094, "rewards/rejected": -46.40639114379883, "step": 3328 }, { "epoch": 2.0709175738724728, "grad_norm": 0.004606824833899736, "learning_rate": 1.719686491470724e-06, "logits/chosen": 1.6285595893859863, "logits/rejected": 4.791367530822754, "logps/chosen": -513.8687744140625, "logps/rejected": -1024.88525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.402234077453613, "rewards/margins": 32.802608489990234, "rewards/rejected": -41.20484161376953, "step": 3329 }, { "epoch": 2.07153965785381, "grad_norm": 0.041494857519865036, "learning_rate": 1.7185338865836792e-06, "logits/chosen": -1.059615969657898, "logits/rejected": 2.409153938293457, "logps/chosen": -484.38726806640625, "logps/rejected": -1028.5125732421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.8254194259643555, "rewards/margins": 32.270713806152344, "rewards/rejected": -39.09613037109375, "step": 3330 }, { "epoch": 2.072161741835148, "grad_norm": 0.1615261733531952, "learning_rate": 1.7173812816966344e-06, "logits/chosen": -0.8603125214576721, "logits/rejected": 3.1495981216430664, "logps/chosen": -456.7115173339844, "logps/rejected": -837.7604370117188, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -8.486283302307129, "rewards/margins": 18.79230308532715, "rewards/rejected": -27.27858543395996, "step": 3331 }, { "epoch": 2.0727838258164852, "grad_norm": 1.1796037142630666e-05, "learning_rate": 1.7162286768095899e-06, "logits/chosen": 2.506577491760254, "logits/rejected": 4.20957612991333, "logps/chosen": -557.8585205078125, "logps/rejected": -940.8349609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.77398681640625, "rewards/margins": 27.37487030029297, "rewards/rejected": -39.14885711669922, "step": 3332 }, { "epoch": 2.0734059097978226, "grad_norm": 0.10842271149158478, "learning_rate": 1.715076071922545e-06, "logits/chosen": 1.1956841945648193, "logits/rejected": 2.755904197692871, "logps/chosen": -685.4227905273438, "logps/rejected": -1112.8021240234375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -14.265910148620605, "rewards/margins": 30.79898452758789, "rewards/rejected": -45.06489562988281, "step": 3333 }, { "epoch": 2.0740279937791604, "grad_norm": 0.09202323108911514, "learning_rate": 1.7139234670355003e-06, "logits/chosen": 0.308301717042923, "logits/rejected": 3.1039891242980957, "logps/chosen": -546.5222778320312, "logps/rejected": -1041.5968017578125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -12.068633079528809, "rewards/margins": 29.088891983032227, "rewards/rejected": -41.15752410888672, "step": 3334 }, { "epoch": 2.0746500777604977, "grad_norm": 1.3221635526861064e-05, "learning_rate": 1.7127708621484558e-06, "logits/chosen": 2.04056453704834, "logits/rejected": 3.642124652862549, "logps/chosen": -615.4324951171875, "logps/rejected": -992.840087890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.023149490356445, "rewards/margins": 27.139328002929688, "rewards/rejected": -40.162479400634766, "step": 3335 }, { "epoch": 2.075272161741835, "grad_norm": 0.001122101442888379, "learning_rate": 1.711618257261411e-06, "logits/chosen": 2.35383939743042, "logits/rejected": 4.766437530517578, "logps/chosen": -628.700439453125, "logps/rejected": -1054.2471923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.110316276550293, "rewards/margins": 25.99763298034668, "rewards/rejected": -37.107948303222656, "step": 3336 }, { "epoch": 2.075894245723173, "grad_norm": 0.00026429022545926273, "learning_rate": 1.7104656523743662e-06, "logits/chosen": 0.6324070692062378, "logits/rejected": 3.297117233276367, "logps/chosen": -642.6302490234375, "logps/rejected": -1133.9599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.002674102783203, "rewards/margins": 33.64592361450195, "rewards/rejected": -44.648597717285156, "step": 3337 }, { "epoch": 2.07651632970451, "grad_norm": 0.2311714142560959, "learning_rate": 1.7093130474873214e-06, "logits/chosen": 1.3173719644546509, "logits/rejected": 4.45114803314209, "logps/chosen": -575.9068603515625, "logps/rejected": -1084.0186767578125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -12.138073921203613, "rewards/margins": 27.18564224243164, "rewards/rejected": -39.32371520996094, "step": 3338 }, { "epoch": 2.0771384136858475, "grad_norm": 1.0550995284575038e-05, "learning_rate": 1.7081604426002769e-06, "logits/chosen": -1.0990958213806152, "logits/rejected": 1.1802140474319458, "logps/chosen": -554.207275390625, "logps/rejected": -947.847900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.284149169921875, "rewards/margins": 32.44597244262695, "rewards/rejected": -44.730125427246094, "step": 3339 }, { "epoch": 2.0777604976671853, "grad_norm": 0.31715869903564453, "learning_rate": 1.707007837713232e-06, "logits/chosen": 0.9920371770858765, "logits/rejected": 4.3308634757995605, "logps/chosen": -586.5159912109375, "logps/rejected": -1092.0472412109375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -11.493955612182617, "rewards/margins": 28.599620819091797, "rewards/rejected": -40.09357452392578, "step": 3340 }, { "epoch": 2.0783825816485226, "grad_norm": 0.0001579702802700922, "learning_rate": 1.7058552328261873e-06, "logits/chosen": 1.2911337614059448, "logits/rejected": 3.543045997619629, "logps/chosen": -585.599609375, "logps/rejected": -1032.3271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.259281158447266, "rewards/margins": 31.118511199951172, "rewards/rejected": -40.37779235839844, "step": 3341 }, { "epoch": 2.07900466562986, "grad_norm": 7.858145245620562e-09, "learning_rate": 1.7047026279391426e-06, "logits/chosen": -0.6377176642417908, "logits/rejected": 2.4123430252075195, "logps/chosen": -489.8824768066406, "logps/rejected": -1049.6268310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.102771759033203, "rewards/margins": 35.268489837646484, "rewards/rejected": -43.37126159667969, "step": 3342 }, { "epoch": 2.0796267496111973, "grad_norm": 8.685908881034266e-09, "learning_rate": 1.703550023052098e-06, "logits/chosen": 0.5078180432319641, "logits/rejected": 3.226416826248169, "logps/chosen": -583.5855102539062, "logps/rejected": -1118.15673828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.772845268249512, "rewards/margins": 36.71405029296875, "rewards/rejected": -44.48689270019531, "step": 3343 }, { "epoch": 2.080248833592535, "grad_norm": 0.0039777737110853195, "learning_rate": 1.7023974181650532e-06, "logits/chosen": 0.7485368251800537, "logits/rejected": 3.2258193492889404, "logps/chosen": -613.4325561523438, "logps/rejected": -1030.727783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.436695098876953, "rewards/margins": 24.86806869506836, "rewards/rejected": -36.30476379394531, "step": 3344 }, { "epoch": 2.0808709175738724, "grad_norm": 0.1329488307237625, "learning_rate": 1.7012448132780084e-06, "logits/chosen": 0.3994715213775635, "logits/rejected": 1.6882116794586182, "logps/chosen": -573.140869140625, "logps/rejected": -1020.3795166015625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -9.334638595581055, "rewards/margins": 33.32188415527344, "rewards/rejected": -42.656524658203125, "step": 3345 }, { "epoch": 2.0814930015552098, "grad_norm": 4.0168673876905814e-05, "learning_rate": 1.7000922083909639e-06, "logits/chosen": -1.8198471069335938, "logits/rejected": 2.12861967086792, "logps/chosen": -425.8285827636719, "logps/rejected": -976.2532958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.025663375854492, "rewards/margins": 29.32131004333496, "rewards/rejected": -39.34697723388672, "step": 3346 }, { "epoch": 2.0821150855365476, "grad_norm": 0.0009763347334228456, "learning_rate": 1.698939603503919e-06, "logits/chosen": -0.5600918531417847, "logits/rejected": 3.825688600540161, "logps/chosen": -500.5871276855469, "logps/rejected": -1128.254638671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.031661987304688, "rewards/margins": 36.95503234863281, "rewards/rejected": -48.9866943359375, "step": 3347 }, { "epoch": 2.082737169517885, "grad_norm": 5.5919431360962335e-06, "learning_rate": 1.6977869986168743e-06, "logits/chosen": 1.264890193939209, "logits/rejected": 4.474057674407959, "logps/chosen": -614.8546142578125, "logps/rejected": -1293.04052734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.77999496459961, "rewards/margins": 41.97685241699219, "rewards/rejected": -56.7568473815918, "step": 3348 }, { "epoch": 2.0833592534992222, "grad_norm": 0.012092916294932365, "learning_rate": 1.6966343937298295e-06, "logits/chosen": 1.270735263824463, "logits/rejected": 2.718186855316162, "logps/chosen": -504.2120666503906, "logps/rejected": -853.6555786132812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.743149757385254, "rewards/margins": 22.66098403930664, "rewards/rejected": -29.404136657714844, "step": 3349 }, { "epoch": 2.08398133748056, "grad_norm": 27.2022705078125, "learning_rate": 1.695481788842785e-06, "logits/chosen": -0.6622167825698853, "logits/rejected": 3.0031518936157227, "logps/chosen": -539.7035522460938, "logps/rejected": -962.2923583984375, "loss": 0.181, "rewards/accuracies": 0.875, "rewards/chosen": -11.674617767333984, "rewards/margins": 25.50714111328125, "rewards/rejected": -37.181758880615234, "step": 3350 }, { "epoch": 2.0846034214618974, "grad_norm": 0.1362578123807907, "learning_rate": 1.6943291839557402e-06, "logits/chosen": 0.1251983642578125, "logits/rejected": 1.8397150039672852, "logps/chosen": -611.310791015625, "logps/rejected": -951.3914794921875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -11.344130516052246, "rewards/margins": 19.910261154174805, "rewards/rejected": -31.254390716552734, "step": 3351 }, { "epoch": 2.0852255054432347, "grad_norm": 0.47021928429603577, "learning_rate": 1.6931765790686954e-06, "logits/chosen": -0.5348381996154785, "logits/rejected": 5.024543285369873, "logps/chosen": -355.5381164550781, "logps/rejected": -895.0904541015625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.548956394195557, "rewards/margins": 22.768566131591797, "rewards/rejected": -28.317522048950195, "step": 3352 }, { "epoch": 2.0858475894245725, "grad_norm": 0.0482005700469017, "learning_rate": 1.6920239741816507e-06, "logits/chosen": -1.475197672843933, "logits/rejected": 1.649541974067688, "logps/chosen": -457.4234619140625, "logps/rejected": -933.4058837890625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -13.065631866455078, "rewards/margins": 29.441650390625, "rewards/rejected": -42.50728225708008, "step": 3353 }, { "epoch": 2.08646967340591, "grad_norm": 0.024374065920710564, "learning_rate": 1.690871369294606e-06, "logits/chosen": 0.28265321254730225, "logits/rejected": 3.3512191772460938, "logps/chosen": -564.5770263671875, "logps/rejected": -1192.9810791015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.895007133483887, "rewards/margins": 35.95161819458008, "rewards/rejected": -42.846622467041016, "step": 3354 }, { "epoch": 2.087091757387247, "grad_norm": 0.04075371474027634, "learning_rate": 1.6897187644075613e-06, "logits/chosen": 2.054584503173828, "logits/rejected": 3.301548480987549, "logps/chosen": -597.0030517578125, "logps/rejected": -926.091552734375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.193412780761719, "rewards/margins": 23.50974464416504, "rewards/rejected": -32.703155517578125, "step": 3355 }, { "epoch": 2.087713841368585, "grad_norm": 0.08765062689781189, "learning_rate": 1.6885661595205165e-06, "logits/chosen": 0.8265056610107422, "logits/rejected": 2.3820858001708984, "logps/chosen": -497.92254638671875, "logps/rejected": -863.9046020507812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.941751480102539, "rewards/margins": 28.759737014770508, "rewards/rejected": -36.70149230957031, "step": 3356 }, { "epoch": 2.0883359253499223, "grad_norm": 6.599282187380595e-06, "learning_rate": 1.687413554633472e-06, "logits/chosen": -1.7757030725479126, "logits/rejected": 3.9701287746429443, "logps/chosen": -296.7178649902344, "logps/rejected": -905.2001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.456328868865967, "rewards/margins": 29.916818618774414, "rewards/rejected": -34.373146057128906, "step": 3357 }, { "epoch": 2.0889580093312596, "grad_norm": 0.0847984030842781, "learning_rate": 1.6862609497464272e-06, "logits/chosen": -0.9706917405128479, "logits/rejected": 3.0828208923339844, "logps/chosen": -487.76605224609375, "logps/rejected": -959.8536987304688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.820488452911377, "rewards/margins": 30.454444885253906, "rewards/rejected": -36.274932861328125, "step": 3358 }, { "epoch": 2.0895800933125974, "grad_norm": 3.220686994609423e-05, "learning_rate": 1.6851083448593824e-06, "logits/chosen": 0.18418824672698975, "logits/rejected": 3.0592291355133057, "logps/chosen": -435.98345947265625, "logps/rejected": -890.0090942382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.9094061851501465, "rewards/margins": 27.20081329345703, "rewards/rejected": -34.110225677490234, "step": 3359 }, { "epoch": 2.0902021772939348, "grad_norm": 3.048164742835979e-08, "learning_rate": 1.6839557399723377e-06, "logits/chosen": -2.1238396167755127, "logits/rejected": 1.927706241607666, "logps/chosen": -496.24481201171875, "logps/rejected": -1024.53955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.999995231628418, "rewards/margins": 35.04389953613281, "rewards/rejected": -43.04389953613281, "step": 3360 }, { "epoch": 2.090824261275272, "grad_norm": 0.4297603964805603, "learning_rate": 1.682803135085293e-06, "logits/chosen": 1.2899434566497803, "logits/rejected": 1.664573073387146, "logps/chosen": -559.8187255859375, "logps/rejected": -939.0767822265625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -7.20655632019043, "rewards/margins": 28.083295822143555, "rewards/rejected": -35.289852142333984, "step": 3361 }, { "epoch": 2.0914463452566094, "grad_norm": 2.5927631668309914e-06, "learning_rate": 1.6816505301982483e-06, "logits/chosen": -0.8606783151626587, "logits/rejected": 1.8166348934173584, "logps/chosen": -439.0218200683594, "logps/rejected": -962.8194580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.483155250549316, "rewards/margins": 36.84089660644531, "rewards/rejected": -44.32405090332031, "step": 3362 }, { "epoch": 2.0920684292379472, "grad_norm": 0.004372260067611933, "learning_rate": 1.6804979253112035e-06, "logits/chosen": -1.8572988510131836, "logits/rejected": 2.5752906799316406, "logps/chosen": -525.1903076171875, "logps/rejected": -1165.911865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.921345710754395, "rewards/margins": 36.948970794677734, "rewards/rejected": -45.87031936645508, "step": 3363 }, { "epoch": 2.0926905132192846, "grad_norm": 0.0018370038596913218, "learning_rate": 1.6793453204241586e-06, "logits/chosen": 0.052701711654663086, "logits/rejected": 2.7828869819641113, "logps/chosen": -501.08148193359375, "logps/rejected": -904.0264892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.115810394287109, "rewards/margins": 22.015249252319336, "rewards/rejected": -29.131057739257812, "step": 3364 }, { "epoch": 2.093312597200622, "grad_norm": 7.0257151492114644e-06, "learning_rate": 1.6781927155371138e-06, "logits/chosen": 1.1880685091018677, "logits/rejected": 2.866147756576538, "logps/chosen": -572.2569580078125, "logps/rejected": -942.1068115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.220047950744629, "rewards/margins": 26.508953094482422, "rewards/rejected": -39.729000091552734, "step": 3365 }, { "epoch": 2.0939346811819597, "grad_norm": 0.17728877067565918, "learning_rate": 1.6770401106500692e-06, "logits/chosen": 1.9238035678863525, "logits/rejected": 4.696476936340332, "logps/chosen": -634.623779296875, "logps/rejected": -1072.55908203125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -11.632692337036133, "rewards/margins": 28.64840316772461, "rewards/rejected": -40.281097412109375, "step": 3366 }, { "epoch": 2.094556765163297, "grad_norm": 0.02830907702445984, "learning_rate": 1.6758875057630244e-06, "logits/chosen": -2.150949001312256, "logits/rejected": 2.483363628387451, "logps/chosen": -298.2734069824219, "logps/rejected": -914.7220458984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.8420822620391846, "rewards/margins": 32.87299346923828, "rewards/rejected": -36.71507263183594, "step": 3367 }, { "epoch": 2.0951788491446344, "grad_norm": 0.008550606667995453, "learning_rate": 1.6747349008759797e-06, "logits/chosen": -0.103985995054245, "logits/rejected": 2.6301164627075195, "logps/chosen": -555.9187622070312, "logps/rejected": -1000.498291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.633752822875977, "rewards/margins": 32.593162536621094, "rewards/rejected": -39.2269172668457, "step": 3368 }, { "epoch": 2.095800933125972, "grad_norm": 2.6436568077770062e-05, "learning_rate": 1.673582295988935e-06, "logits/chosen": -0.7505025863647461, "logits/rejected": 3.1368703842163086, "logps/chosen": -460.7519226074219, "logps/rejected": -933.426513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.978192329406738, "rewards/margins": 23.834163665771484, "rewards/rejected": -32.812355041503906, "step": 3369 }, { "epoch": 2.0964230171073095, "grad_norm": 4.5708739015992705e-09, "learning_rate": 1.6724296911018903e-06, "logits/chosen": -0.32886290550231934, "logits/rejected": 4.3517279624938965, "logps/chosen": -455.4180908203125, "logps/rejected": -1061.9700927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.620471000671387, "rewards/margins": 33.597320556640625, "rewards/rejected": -41.21778869628906, "step": 3370 }, { "epoch": 2.097045101088647, "grad_norm": 0.005900989286601543, "learning_rate": 1.6712770862148456e-06, "logits/chosen": -0.8842804431915283, "logits/rejected": 2.112257957458496, "logps/chosen": -372.7100524902344, "logps/rejected": -772.2435913085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.723258018493652, "rewards/margins": 25.396638870239258, "rewards/rejected": -32.119895935058594, "step": 3371 }, { "epoch": 2.0976671850699846, "grad_norm": 0.00877196155488491, "learning_rate": 1.6701244813278008e-06, "logits/chosen": 3.7252695560455322, "logits/rejected": 4.46113395690918, "logps/chosen": -819.5318603515625, "logps/rejected": -1156.776123046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.046567916870117, "rewards/margins": 26.81656265258789, "rewards/rejected": -41.863128662109375, "step": 3372 }, { "epoch": 2.098289269051322, "grad_norm": 0.02428249642252922, "learning_rate": 1.6689718764407562e-06, "logits/chosen": -1.1189624071121216, "logits/rejected": 1.4620922803878784, "logps/chosen": -482.97113037109375, "logps/rejected": -990.8453979492188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -13.332776069641113, "rewards/margins": 33.3018798828125, "rewards/rejected": -46.63465881347656, "step": 3373 }, { "epoch": 2.0989113530326593, "grad_norm": 0.001983237685635686, "learning_rate": 1.6678192715537114e-06, "logits/chosen": 0.7820011973381042, "logits/rejected": 4.40946626663208, "logps/chosen": -550.2945556640625, "logps/rejected": -1139.11083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.764097213745117, "rewards/margins": 37.623233795166016, "rewards/rejected": -50.3873291015625, "step": 3374 }, { "epoch": 2.099533437013997, "grad_norm": 2.0523664545635256e-07, "learning_rate": 1.6666666666666667e-06, "logits/chosen": -0.4806283414363861, "logits/rejected": 4.106095790863037, "logps/chosen": -491.4980163574219, "logps/rejected": -1238.3123779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.256150245666504, "rewards/margins": 43.39604568481445, "rewards/rejected": -52.65219497680664, "step": 3375 }, { "epoch": 2.1001555209953344, "grad_norm": 0.03712281957268715, "learning_rate": 1.665514061779622e-06, "logits/chosen": -0.9505767822265625, "logits/rejected": 1.9326263666152954, "logps/chosen": -391.64251708984375, "logps/rejected": -812.211181640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.85351037979126, "rewards/margins": 31.238975524902344, "rewards/rejected": -36.09248733520508, "step": 3376 }, { "epoch": 2.1007776049766718, "grad_norm": 15.193599700927734, "learning_rate": 1.6643614568925773e-06, "logits/chosen": 0.22194261848926544, "logits/rejected": 4.298823833465576, "logps/chosen": -400.57037353515625, "logps/rejected": -1007.1203002929688, "loss": 0.0787, "rewards/accuracies": 1.0, "rewards/chosen": -5.647911548614502, "rewards/margins": 27.597047805786133, "rewards/rejected": -33.244956970214844, "step": 3377 }, { "epoch": 2.1013996889580095, "grad_norm": 0.00013249566836748272, "learning_rate": 1.6632088520055325e-06, "logits/chosen": -0.22353875637054443, "logits/rejected": 3.013558864593506, "logps/chosen": -383.1407775878906, "logps/rejected": -873.5018310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.963991165161133, "rewards/margins": 32.24616241455078, "rewards/rejected": -39.21015167236328, "step": 3378 }, { "epoch": 2.102021772939347, "grad_norm": 0.010166036896407604, "learning_rate": 1.6620562471184878e-06, "logits/chosen": 2.5323030948638916, "logits/rejected": 3.62776517868042, "logps/chosen": -519.2050170898438, "logps/rejected": -897.3560180664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.967037200927734, "rewards/margins": 29.247364044189453, "rewards/rejected": -39.21440124511719, "step": 3379 }, { "epoch": 2.1026438569206842, "grad_norm": 0.010199688374996185, "learning_rate": 1.6609036422314432e-06, "logits/chosen": -0.9682947397232056, "logits/rejected": 2.1650946140289307, "logps/chosen": -393.393798828125, "logps/rejected": -769.2060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.042436599731445, "rewards/margins": 21.88577651977539, "rewards/rejected": -30.928213119506836, "step": 3380 }, { "epoch": 2.1032659409020216, "grad_norm": 3.648558731583762e-06, "learning_rate": 1.6597510373443984e-06, "logits/chosen": -1.0206509828567505, "logits/rejected": 1.8315527439117432, "logps/chosen": -546.6287231445312, "logps/rejected": -1130.8519287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.928306579589844, "rewards/margins": 32.370277404785156, "rewards/rejected": -44.298583984375, "step": 3381 }, { "epoch": 2.1038880248833594, "grad_norm": 7.011471439000161e-07, "learning_rate": 1.6585984324573537e-06, "logits/chosen": 0.48149287700653076, "logits/rejected": 3.7991549968719482, "logps/chosen": -459.12738037109375, "logps/rejected": -922.0175170898438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.526046752929688, "rewards/margins": 27.56385040283203, "rewards/rejected": -39.08989715576172, "step": 3382 }, { "epoch": 2.1045101088646967, "grad_norm": 0.5247215628623962, "learning_rate": 1.6574458275703089e-06, "logits/chosen": 2.620265245437622, "logits/rejected": 3.077489137649536, "logps/chosen": -687.5137329101562, "logps/rejected": -940.1102905273438, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -11.058935165405273, "rewards/margins": 25.865474700927734, "rewards/rejected": -36.924407958984375, "step": 3383 }, { "epoch": 2.105132192846034, "grad_norm": 0.48539650440216064, "learning_rate": 1.6562932226832643e-06, "logits/chosen": -0.010341644287109375, "logits/rejected": 0.931721031665802, "logps/chosen": -578.7786254882812, "logps/rejected": -866.927734375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -10.459258079528809, "rewards/margins": 21.96541976928711, "rewards/rejected": -32.42467498779297, "step": 3384 }, { "epoch": 2.105754276827372, "grad_norm": 0.00542947044596076, "learning_rate": 1.6551406177962195e-06, "logits/chosen": 1.2294321060180664, "logits/rejected": 3.682133436203003, "logps/chosen": -672.1929931640625, "logps/rejected": -1026.928955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.094335556030273, "rewards/margins": 23.090843200683594, "rewards/rejected": -36.1851806640625, "step": 3385 }, { "epoch": 2.106376360808709, "grad_norm": 0.00023597065592184663, "learning_rate": 1.6539880129091748e-06, "logits/chosen": -0.3777759075164795, "logits/rejected": 2.5276408195495605, "logps/chosen": -501.2518310546875, "logps/rejected": -1042.737060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.490774154663086, "rewards/margins": 31.137483596801758, "rewards/rejected": -42.628257751464844, "step": 3386 }, { "epoch": 2.1069984447900465, "grad_norm": 5.728524411097169e-05, "learning_rate": 1.6528354080221302e-06, "logits/chosen": -1.2409263849258423, "logits/rejected": 4.19053316116333, "logps/chosen": -437.630126953125, "logps/rejected": -1081.5054931640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.32150650024414, "rewards/margins": 29.635597229003906, "rewards/rejected": -39.95710372924805, "step": 3387 }, { "epoch": 2.1076205287713843, "grad_norm": 9.662021511758212e-07, "learning_rate": 1.6516828031350854e-06, "logits/chosen": -0.9957299828529358, "logits/rejected": 4.041959285736084, "logps/chosen": -295.1841125488281, "logps/rejected": -854.8515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.7770609855651855, "rewards/margins": 31.328670501708984, "rewards/rejected": -35.10573196411133, "step": 3388 }, { "epoch": 2.1082426127527216, "grad_norm": 0.015900224447250366, "learning_rate": 1.6505301982480407e-06, "logits/chosen": 0.956409752368927, "logits/rejected": 2.972102642059326, "logps/chosen": -722.85546875, "logps/rejected": -1107.1051025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.309172630310059, "rewards/margins": 29.619922637939453, "rewards/rejected": -43.92909240722656, "step": 3389 }, { "epoch": 2.108864696734059, "grad_norm": 9.378606796264648, "learning_rate": 1.6493775933609959e-06, "logits/chosen": -0.19955027103424072, "logits/rejected": 2.5805864334106445, "logps/chosen": -541.113525390625, "logps/rejected": -942.9413452148438, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": -7.94508695602417, "rewards/margins": 25.10898208618164, "rewards/rejected": -33.05406951904297, "step": 3390 }, { "epoch": 2.1094867807153967, "grad_norm": 20.579364776611328, "learning_rate": 1.6482249884739513e-06, "logits/chosen": 0.4862733781337738, "logits/rejected": 2.504495859146118, "logps/chosen": -518.5618896484375, "logps/rejected": -958.5907592773438, "loss": 0.6058, "rewards/accuracies": 0.875, "rewards/chosen": -11.772661209106445, "rewards/margins": 31.137737274169922, "rewards/rejected": -42.910400390625, "step": 3391 }, { "epoch": 2.110108864696734, "grad_norm": 0.0027981658931821585, "learning_rate": 1.6470723835869065e-06, "logits/chosen": 0.07086589932441711, "logits/rejected": 2.659494161605835, "logps/chosen": -546.3900756835938, "logps/rejected": -847.4092407226562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.78062629699707, "rewards/margins": 18.225074768066406, "rewards/rejected": -29.005701065063477, "step": 3392 }, { "epoch": 2.1107309486780714, "grad_norm": 0.0020019779913127422, "learning_rate": 1.6459197786998618e-06, "logits/chosen": 0.425553560256958, "logits/rejected": 3.445070266723633, "logps/chosen": -598.8272705078125, "logps/rejected": -1064.2998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.582368850708008, "rewards/margins": 30.0719051361084, "rewards/rejected": -41.654273986816406, "step": 3393 }, { "epoch": 2.111353032659409, "grad_norm": 10.503533363342285, "learning_rate": 1.644767173812817e-06, "logits/chosen": 0.857345461845398, "logits/rejected": 3.003244638442993, "logps/chosen": -442.4684753417969, "logps/rejected": -779.4535522460938, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": -5.216536521911621, "rewards/margins": 25.76353645324707, "rewards/rejected": -30.980072021484375, "step": 3394 }, { "epoch": 2.1119751166407466, "grad_norm": 0.1677616685628891, "learning_rate": 1.6436145689257724e-06, "logits/chosen": -0.8002194166183472, "logits/rejected": 3.6724236011505127, "logps/chosen": -200.62562561035156, "logps/rejected": -812.24609375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.35098123550415, "rewards/margins": 32.549068450927734, "rewards/rejected": -36.90005111694336, "step": 3395 }, { "epoch": 2.112597200622084, "grad_norm": 1.996715582208708e-06, "learning_rate": 1.6424619640387277e-06, "logits/chosen": 0.9634472131729126, "logits/rejected": 3.9523916244506836, "logps/chosen": -513.762451171875, "logps/rejected": -1041.6044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.443663597106934, "rewards/margins": 33.710235595703125, "rewards/rejected": -43.153900146484375, "step": 3396 }, { "epoch": 2.1132192846034217, "grad_norm": 0.031665410846471786, "learning_rate": 1.6413093591516829e-06, "logits/chosen": 0.4123764634132385, "logits/rejected": 2.6448452472686768, "logps/chosen": -600.843505859375, "logps/rejected": -983.0840454101562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.967991828918457, "rewards/margins": 24.33465576171875, "rewards/rejected": -33.30264663696289, "step": 3397 }, { "epoch": 2.113841368584759, "grad_norm": 0.022291993722319603, "learning_rate": 1.6401567542646383e-06, "logits/chosen": -0.33354711532592773, "logits/rejected": 3.285107135772705, "logps/chosen": -541.2667236328125, "logps/rejected": -1024.5386962890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.661428451538086, "rewards/margins": 31.472816467285156, "rewards/rejected": -43.134246826171875, "step": 3398 }, { "epoch": 2.1144634525660964, "grad_norm": 2.6611015796661377, "learning_rate": 1.6390041493775935e-06, "logits/chosen": -0.07903802394866943, "logits/rejected": 2.2965927124023438, "logps/chosen": -485.8436279296875, "logps/rejected": -1028.42138671875, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -8.215842247009277, "rewards/margins": 34.56201934814453, "rewards/rejected": -42.777862548828125, "step": 3399 }, { "epoch": 2.1150855365474337, "grad_norm": 3.3976443774008658e-06, "learning_rate": 1.6378515444905488e-06, "logits/chosen": 0.8789270520210266, "logits/rejected": 2.5680906772613525, "logps/chosen": -475.8021240234375, "logps/rejected": -883.010498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.538758754730225, "rewards/margins": 30.40880584716797, "rewards/rejected": -37.947566986083984, "step": 3400 }, { "epoch": 2.1157076205287715, "grad_norm": 1.922818410093896e-05, "learning_rate": 1.636698939603504e-06, "logits/chosen": -0.7577810883522034, "logits/rejected": 3.907153844833374, "logps/chosen": -377.5826721191406, "logps/rejected": -1134.6827392578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.530715942382812, "rewards/margins": 43.018455505371094, "rewards/rejected": -54.549171447753906, "step": 3401 }, { "epoch": 2.116329704510109, "grad_norm": 9.223941802978516, "learning_rate": 1.6355463347164594e-06, "logits/chosen": -1.7244789600372314, "logits/rejected": 2.035127878189087, "logps/chosen": -496.82421875, "logps/rejected": -924.963134765625, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": -9.925373077392578, "rewards/margins": 29.017684936523438, "rewards/rejected": -38.943058013916016, "step": 3402 }, { "epoch": 2.116951788491446, "grad_norm": 0.3632352352142334, "learning_rate": 1.6343937298294146e-06, "logits/chosen": 0.20769548416137695, "logits/rejected": 2.323042154312134, "logps/chosen": -590.65380859375, "logps/rejected": -1021.50634765625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -7.967580795288086, "rewards/margins": 28.152793884277344, "rewards/rejected": -36.12037658691406, "step": 3403 }, { "epoch": 2.117573872472784, "grad_norm": 0.0003732674231287092, "learning_rate": 1.6332411249423699e-06, "logits/chosen": -0.18807101249694824, "logits/rejected": 3.297891616821289, "logps/chosen": -446.2677001953125, "logps/rejected": -963.2024536132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.80330753326416, "rewards/margins": 34.15058135986328, "rewards/rejected": -40.953887939453125, "step": 3404 }, { "epoch": 2.1181959564541213, "grad_norm": 0.023219313472509384, "learning_rate": 1.632088520055325e-06, "logits/chosen": 1.1393150091171265, "logits/rejected": 3.2419910430908203, "logps/chosen": -510.16485595703125, "logps/rejected": -958.189453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.300631046295166, "rewards/margins": 33.69567108154297, "rewards/rejected": -39.99630355834961, "step": 3405 }, { "epoch": 2.1188180404354586, "grad_norm": 0.09577471017837524, "learning_rate": 1.6309359151682805e-06, "logits/chosen": -0.7650696635246277, "logits/rejected": 2.7050087451934814, "logps/chosen": -434.15020751953125, "logps/rejected": -878.897216796875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.322816848754883, "rewards/margins": 26.0329647064209, "rewards/rejected": -31.35578155517578, "step": 3406 }, { "epoch": 2.1194401244167964, "grad_norm": 0.00028814279357902706, "learning_rate": 1.6297833102812358e-06, "logits/chosen": 0.2786529064178467, "logits/rejected": 3.1810197830200195, "logps/chosen": -590.8251953125, "logps/rejected": -1019.7881469726562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.678223609924316, "rewards/margins": 23.088512420654297, "rewards/rejected": -36.76673889160156, "step": 3407 }, { "epoch": 2.1200622083981338, "grad_norm": 0.001266329549252987, "learning_rate": 1.628630705394191e-06, "logits/chosen": 0.22288648784160614, "logits/rejected": 3.409573554992676, "logps/chosen": -519.0152587890625, "logps/rejected": -972.437744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.787503719329834, "rewards/margins": 30.214773178100586, "rewards/rejected": -37.00227737426758, "step": 3408 }, { "epoch": 2.120684292379471, "grad_norm": 0.003725613933056593, "learning_rate": 1.6274781005071464e-06, "logits/chosen": -0.06311874091625214, "logits/rejected": 2.947871446609497, "logps/chosen": -577.43115234375, "logps/rejected": -1169.604736328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.592594146728516, "rewards/margins": 36.942161560058594, "rewards/rejected": -45.534751892089844, "step": 3409 }, { "epoch": 2.121306376360809, "grad_norm": 0.0010662629501894116, "learning_rate": 1.6263254956201016e-06, "logits/chosen": 0.02673649787902832, "logits/rejected": 1.3852254152297974, "logps/chosen": -534.2376708984375, "logps/rejected": -950.355712890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.195487976074219, "rewards/margins": 28.430917739868164, "rewards/rejected": -40.62640380859375, "step": 3410 }, { "epoch": 2.121928460342146, "grad_norm": 0.11500487476587296, "learning_rate": 1.6251728907330569e-06, "logits/chosen": -1.8423373699188232, "logits/rejected": 1.2112541198730469, "logps/chosen": -461.163818359375, "logps/rejected": -946.5360107421875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.386780261993408, "rewards/margins": 32.830833435058594, "rewards/rejected": -40.217613220214844, "step": 3411 }, { "epoch": 2.1225505443234836, "grad_norm": 12.089064598083496, "learning_rate": 1.624020285846012e-06, "logits/chosen": 2.3610942363739014, "logits/rejected": 4.344902992248535, "logps/chosen": -795.449951171875, "logps/rejected": -1055.771484375, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": -6.550273418426514, "rewards/margins": 17.250858306884766, "rewards/rejected": -23.801132202148438, "step": 3412 }, { "epoch": 2.1231726283048213, "grad_norm": 0.4581535756587982, "learning_rate": 1.6228676809589675e-06, "logits/chosen": -0.11082451045513153, "logits/rejected": 2.212818145751953, "logps/chosen": -644.3795166015625, "logps/rejected": -961.1854248046875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -11.852404594421387, "rewards/margins": 21.750226974487305, "rewards/rejected": -33.602630615234375, "step": 3413 }, { "epoch": 2.1237947122861587, "grad_norm": 5.110677193442825e-06, "learning_rate": 1.6217150760719228e-06, "logits/chosen": -0.10364311933517456, "logits/rejected": 3.4550118446350098, "logps/chosen": -476.4292907714844, "logps/rejected": -1043.4764404296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.02131462097168, "rewards/margins": 33.4831428527832, "rewards/rejected": -40.50445556640625, "step": 3414 }, { "epoch": 2.124416796267496, "grad_norm": 1.0429807240086575e-08, "learning_rate": 1.620562471184878e-06, "logits/chosen": 0.379156231880188, "logits/rejected": 3.817188262939453, "logps/chosen": -701.6832885742188, "logps/rejected": -1382.80029296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.1474609375, "rewards/margins": 36.24828338623047, "rewards/rejected": -48.395748138427734, "step": 3415 }, { "epoch": 2.125038880248834, "grad_norm": 3.745609262750804e-07, "learning_rate": 1.6194098662978332e-06, "logits/chosen": -0.8152501583099365, "logits/rejected": 2.818958044052124, "logps/chosen": -384.119384765625, "logps/rejected": -905.9598999023438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.373373031616211, "rewards/margins": 32.04167175292969, "rewards/rejected": -38.415042877197266, "step": 3416 }, { "epoch": 2.125660964230171, "grad_norm": 0.004811655264347792, "learning_rate": 1.6182572614107886e-06, "logits/chosen": -2.818270444869995, "logits/rejected": 4.097588062286377, "logps/chosen": -214.08514404296875, "logps/rejected": -970.8871459960938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.288216590881348, "rewards/margins": 32.85670852661133, "rewards/rejected": -37.14492416381836, "step": 3417 }, { "epoch": 2.1262830482115085, "grad_norm": 3.656461715698242, "learning_rate": 1.6171046565237439e-06, "logits/chosen": -0.7900360226631165, "logits/rejected": 4.9393696784973145, "logps/chosen": -380.90673828125, "logps/rejected": -1163.0694580078125, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -2.9158077239990234, "rewards/margins": 39.61353302001953, "rewards/rejected": -42.52934265136719, "step": 3418 }, { "epoch": 2.126905132192846, "grad_norm": 0.00022827822249382734, "learning_rate": 1.615952051636699e-06, "logits/chosen": -0.14041665196418762, "logits/rejected": 0.7800382971763611, "logps/chosen": -509.66851806640625, "logps/rejected": -845.5088500976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.2577290534973145, "rewards/margins": 25.382957458496094, "rewards/rejected": -32.64068603515625, "step": 3419 }, { "epoch": 2.1275272161741836, "grad_norm": 0.004320131614804268, "learning_rate": 1.6147994467496545e-06, "logits/chosen": -0.5010417699813843, "logits/rejected": 3.5509495735168457, "logps/chosen": -485.992431640625, "logps/rejected": -963.5689697265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.654045104980469, "rewards/margins": 28.273672103881836, "rewards/rejected": -36.92771530151367, "step": 3420 }, { "epoch": 2.128149300155521, "grad_norm": 1.9214366986375353e-08, "learning_rate": 1.6136468418626098e-06, "logits/chosen": -1.4153292179107666, "logits/rejected": 2.755247116088867, "logps/chosen": -371.322998046875, "logps/rejected": -966.787841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.08968448638916, "rewards/margins": 36.248077392578125, "rewards/rejected": -41.337764739990234, "step": 3421 }, { "epoch": 2.1287713841368583, "grad_norm": 6.000399288552671e-09, "learning_rate": 1.612494236975565e-06, "logits/chosen": -2.731515407562256, "logits/rejected": 2.4720258712768555, "logps/chosen": -431.3476867675781, "logps/rejected": -1083.6611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.9705119132995605, "rewards/margins": 29.507789611816406, "rewards/rejected": -37.478302001953125, "step": 3422 }, { "epoch": 2.129393468118196, "grad_norm": 0.264360249042511, "learning_rate": 1.6113416320885202e-06, "logits/chosen": -0.4833368957042694, "logits/rejected": 2.7763330936431885, "logps/chosen": -526.0587768554688, "logps/rejected": -1065.353271484375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -11.281364440917969, "rewards/margins": 33.10901641845703, "rewards/rejected": -44.390380859375, "step": 3423 }, { "epoch": 2.1300155520995334, "grad_norm": 23.688011169433594, "learning_rate": 1.6101890272014756e-06, "logits/chosen": -0.3603689968585968, "logits/rejected": 3.46712064743042, "logps/chosen": -398.64654541015625, "logps/rejected": -885.201904296875, "loss": 0.1506, "rewards/accuracies": 0.875, "rewards/chosen": -8.763185501098633, "rewards/margins": 26.965011596679688, "rewards/rejected": -35.72819900512695, "step": 3424 }, { "epoch": 2.1306376360808708, "grad_norm": 0.0001959709479706362, "learning_rate": 1.6090364223144309e-06, "logits/chosen": 0.1824033260345459, "logits/rejected": 2.3923895359039307, "logps/chosen": -480.4774169921875, "logps/rejected": -901.778076171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.293122291564941, "rewards/margins": 25.791786193847656, "rewards/rejected": -38.08490753173828, "step": 3425 }, { "epoch": 2.1312597200622085, "grad_norm": 2.947682787635131e-06, "learning_rate": 1.607883817427386e-06, "logits/chosen": 1.6706695556640625, "logits/rejected": 1.5100889205932617, "logps/chosen": -709.5021362304688, "logps/rejected": -1123.13623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.592023849487305, "rewards/margins": 37.92151641845703, "rewards/rejected": -50.51354217529297, "step": 3426 }, { "epoch": 2.131881804043546, "grad_norm": 8.702866580279078e-06, "learning_rate": 1.6067312125403415e-06, "logits/chosen": -3.008915424346924, "logits/rejected": 3.2472410202026367, "logps/chosen": -362.72802734375, "logps/rejected": -1140.3369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.616853713989258, "rewards/margins": 39.08551025390625, "rewards/rejected": -47.70236587524414, "step": 3427 }, { "epoch": 2.132503888024883, "grad_norm": 8.457972580799833e-05, "learning_rate": 1.6055786076532967e-06, "logits/chosen": 0.2832450866699219, "logits/rejected": 5.077991008758545, "logps/chosen": -337.720458984375, "logps/rejected": -969.3193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.811567783355713, "rewards/margins": 29.737640380859375, "rewards/rejected": -34.5492057800293, "step": 3428 }, { "epoch": 2.133125972006221, "grad_norm": 0.019046053290367126, "learning_rate": 1.604426002766252e-06, "logits/chosen": 1.4136080741882324, "logits/rejected": 2.035309314727783, "logps/chosen": -560.418212890625, "logps/rejected": -939.0333251953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.986815929412842, "rewards/margins": 34.09614181518555, "rewards/rejected": -41.08295440673828, "step": 3429 }, { "epoch": 2.1337480559875583, "grad_norm": 0.00037207978311926126, "learning_rate": 1.6032733978792072e-06, "logits/chosen": 1.2080271244049072, "logits/rejected": 3.942293643951416, "logps/chosen": -448.4974365234375, "logps/rejected": -820.5440673828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.193816184997559, "rewards/margins": 23.287883758544922, "rewards/rejected": -32.4817008972168, "step": 3430 }, { "epoch": 2.1343701399688957, "grad_norm": 5.110921859741211, "learning_rate": 1.6021207929921626e-06, "logits/chosen": -1.5018537044525146, "logits/rejected": 1.764155626296997, "logps/chosen": -333.2330322265625, "logps/rejected": -845.230712890625, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": -7.84320068359375, "rewards/margins": 27.2366943359375, "rewards/rejected": -35.07989501953125, "step": 3431 }, { "epoch": 2.1349922239502335, "grad_norm": 0.10313411802053452, "learning_rate": 1.6009681881051176e-06, "logits/chosen": 0.2874959111213684, "logits/rejected": 3.128005027770996, "logps/chosen": -493.38836669921875, "logps/rejected": -862.8633422851562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.106266021728516, "rewards/margins": 22.064449310302734, "rewards/rejected": -29.170717239379883, "step": 3432 }, { "epoch": 2.135614307931571, "grad_norm": 0.5469374060630798, "learning_rate": 1.5998155832180729e-06, "logits/chosen": 0.9954251050949097, "logits/rejected": 3.078653335571289, "logps/chosen": -632.5187377929688, "logps/rejected": -1006.222412109375, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -16.858264923095703, "rewards/margins": 20.12818717956543, "rewards/rejected": -36.9864501953125, "step": 3433 }, { "epoch": 2.136236391912908, "grad_norm": 0.04896441847085953, "learning_rate": 1.598662978331028e-06, "logits/chosen": -0.063498854637146, "logits/rejected": 3.525094509124756, "logps/chosen": -441.2791442871094, "logps/rejected": -863.0089111328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.556578636169434, "rewards/margins": 21.468883514404297, "rewards/rejected": -29.025463104248047, "step": 3434 }, { "epoch": 2.136858475894246, "grad_norm": 8.239752787631005e-06, "learning_rate": 1.5975103734439833e-06, "logits/chosen": -0.23733371496200562, "logits/rejected": 0.8966051936149597, "logps/chosen": -587.71337890625, "logps/rejected": -949.0718994140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.718193054199219, "rewards/margins": 26.919227600097656, "rewards/rejected": -37.637420654296875, "step": 3435 }, { "epoch": 2.1374805598755833, "grad_norm": 0.00015162007184699178, "learning_rate": 1.5963577685569388e-06, "logits/chosen": 0.4512918293476105, "logits/rejected": 3.2689778804779053, "logps/chosen": -581.7296752929688, "logps/rejected": -1036.16357421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.346375465393066, "rewards/margins": 32.81805419921875, "rewards/rejected": -42.1644287109375, "step": 3436 }, { "epoch": 2.1381026438569206, "grad_norm": 0.00044463237281888723, "learning_rate": 1.595205163669894e-06, "logits/chosen": -0.19343051314353943, "logits/rejected": 1.666266679763794, "logps/chosen": -576.584716796875, "logps/rejected": -1199.6351318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.0612592697143555, "rewards/margins": 40.66623306274414, "rewards/rejected": -45.72749328613281, "step": 3437 }, { "epoch": 2.138724727838258, "grad_norm": 6.797229161747964e-06, "learning_rate": 1.5940525587828492e-06, "logits/chosen": 2.218825340270996, "logits/rejected": 3.9413962364196777, "logps/chosen": -811.3069458007812, "logps/rejected": -1181.102294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.166580200195312, "rewards/margins": 31.07561492919922, "rewards/rejected": -44.24219512939453, "step": 3438 }, { "epoch": 2.1393468118195957, "grad_norm": 4.010344491689466e-05, "learning_rate": 1.5928999538958046e-06, "logits/chosen": 0.6854308843612671, "logits/rejected": 3.5298032760620117, "logps/chosen": -543.1837158203125, "logps/rejected": -993.5977783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.933540344238281, "rewards/margins": 31.57440185546875, "rewards/rejected": -42.50794219970703, "step": 3439 }, { "epoch": 2.139968895800933, "grad_norm": 4.72462797164917, "learning_rate": 1.5917473490087599e-06, "logits/chosen": 0.36550286412239075, "logits/rejected": -0.1433129608631134, "logps/chosen": -605.7955322265625, "logps/rejected": -873.3226928710938, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -10.314057350158691, "rewards/margins": 30.274394989013672, "rewards/rejected": -40.58845520019531, "step": 3440 }, { "epoch": 2.1405909797822704, "grad_norm": 2.0681025603153103e-07, "learning_rate": 1.590594744121715e-06, "logits/chosen": 0.44030457735061646, "logits/rejected": 1.914415955543518, "logps/chosen": -468.28497314453125, "logps/rejected": -858.67626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.888754844665527, "rewards/margins": 29.416000366210938, "rewards/rejected": -39.30475616455078, "step": 3441 }, { "epoch": 2.141213063763608, "grad_norm": 0.01545824483036995, "learning_rate": 1.5894421392346703e-06, "logits/chosen": 2.361666202545166, "logits/rejected": 3.3521080017089844, "logps/chosen": -671.66455078125, "logps/rejected": -1096.104248046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -16.397945404052734, "rewards/margins": 27.711437225341797, "rewards/rejected": -44.10938262939453, "step": 3442 }, { "epoch": 2.1418351477449455, "grad_norm": 0.4221373498439789, "learning_rate": 1.5882895343476258e-06, "logits/chosen": -1.1415373086929321, "logits/rejected": 2.070000410079956, "logps/chosen": -366.7683410644531, "logps/rejected": -928.0114135742188, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -8.439271926879883, "rewards/margins": 36.66456985473633, "rewards/rejected": -45.103843688964844, "step": 3443 }, { "epoch": 2.142457231726283, "grad_norm": 0.022570105269551277, "learning_rate": 1.587136929460581e-06, "logits/chosen": -1.1626898050308228, "logits/rejected": 2.8129069805145264, "logps/chosen": -484.2257995605469, "logps/rejected": -1054.75, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.110090255737305, "rewards/margins": 33.480899810791016, "rewards/rejected": -40.59099197387695, "step": 3444 }, { "epoch": 2.1430793157076207, "grad_norm": 0.1024412214756012, "learning_rate": 1.5859843245735362e-06, "logits/chosen": -1.3715943098068237, "logits/rejected": 1.9006588459014893, "logps/chosen": -504.47418212890625, "logps/rejected": -1045.1580810546875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -12.661272048950195, "rewards/margins": 30.334453582763672, "rewards/rejected": -42.9957275390625, "step": 3445 }, { "epoch": 2.143701399688958, "grad_norm": 5.425422668457031, "learning_rate": 1.5848317196864914e-06, "logits/chosen": -0.059883177280426025, "logits/rejected": 2.410794734954834, "logps/chosen": -361.6816711425781, "logps/rejected": -728.779052734375, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": -6.959995269775391, "rewards/margins": 21.38794708251953, "rewards/rejected": -28.347942352294922, "step": 3446 }, { "epoch": 2.1443234836702953, "grad_norm": 0.0003700493252836168, "learning_rate": 1.5836791147994469e-06, "logits/chosen": 0.10252094268798828, "logits/rejected": 3.9793825149536133, "logps/chosen": -514.8165283203125, "logps/rejected": -1008.554443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.945167541503906, "rewards/margins": 31.51917266845703, "rewards/rejected": -43.4643440246582, "step": 3447 }, { "epoch": 2.144945567651633, "grad_norm": 32.00484085083008, "learning_rate": 1.582526509912402e-06, "logits/chosen": 3.398038148880005, "logits/rejected": 2.6710593700408936, "logps/chosen": -794.04541015625, "logps/rejected": -1034.32861328125, "loss": 0.783, "rewards/accuracies": 0.875, "rewards/chosen": -13.406631469726562, "rewards/margins": 20.88903045654297, "rewards/rejected": -34.29566192626953, "step": 3448 }, { "epoch": 2.1455676516329705, "grad_norm": 0.0014944367576390505, "learning_rate": 1.5813739050253573e-06, "logits/chosen": -0.21664124727249146, "logits/rejected": 1.0657644271850586, "logps/chosen": -538.9214477539062, "logps/rejected": -952.3651123046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.446172714233398, "rewards/margins": 31.40801239013672, "rewards/rejected": -42.854183197021484, "step": 3449 }, { "epoch": 2.146189735614308, "grad_norm": 0.009861108846962452, "learning_rate": 1.5802213001383128e-06, "logits/chosen": 2.1326751708984375, "logits/rejected": 3.010751962661743, "logps/chosen": -634.6590576171875, "logps/rejected": -910.4227294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.147429466247559, "rewards/margins": 24.965938568115234, "rewards/rejected": -36.11336898803711, "step": 3450 }, { "epoch": 2.1468118195956456, "grad_norm": 12.68981647491455, "learning_rate": 1.579068695251268e-06, "logits/chosen": 0.1347132921218872, "logits/rejected": 3.76957368850708, "logps/chosen": -437.06329345703125, "logps/rejected": -1030.0235595703125, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": -6.579730033874512, "rewards/margins": 32.528709411621094, "rewards/rejected": -39.108436584472656, "step": 3451 }, { "epoch": 2.147433903576983, "grad_norm": 0.004146880470216274, "learning_rate": 1.5779160903642232e-06, "logits/chosen": -3.4257616996765137, "logits/rejected": 0.663922905921936, "logps/chosen": -254.12570190429688, "logps/rejected": -789.0089721679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.897040843963623, "rewards/margins": 25.073740005493164, "rewards/rejected": -30.970783233642578, "step": 3452 }, { "epoch": 2.1480559875583203, "grad_norm": 0.004583262838423252, "learning_rate": 1.5767634854771784e-06, "logits/chosen": 1.2491068840026855, "logits/rejected": 4.228781223297119, "logps/chosen": -570.4674072265625, "logps/rejected": -1028.893310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.680624008178711, "rewards/margins": 32.67273712158203, "rewards/rejected": -42.353355407714844, "step": 3453 }, { "epoch": 2.148678071539658, "grad_norm": 1.6916555978241377e-05, "learning_rate": 1.5756108805901339e-06, "logits/chosen": 1.6966251134872437, "logits/rejected": 2.9529733657836914, "logps/chosen": -604.318359375, "logps/rejected": -914.4246826171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.783700942993164, "rewards/margins": 26.6322021484375, "rewards/rejected": -38.4159049987793, "step": 3454 }, { "epoch": 2.1493001555209954, "grad_norm": 0.05122312530875206, "learning_rate": 1.574458275703089e-06, "logits/chosen": -3.1102006435394287, "logits/rejected": 1.0001440048217773, "logps/chosen": -423.76300048828125, "logps/rejected": -1014.8577880859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.17207145690918, "rewards/margins": 37.166168212890625, "rewards/rejected": -47.33823776245117, "step": 3455 }, { "epoch": 2.1499222395023327, "grad_norm": 0.00029756067669950426, "learning_rate": 1.5733056708160443e-06, "logits/chosen": -1.2222700119018555, "logits/rejected": 3.7177681922912598, "logps/chosen": -603.027587890625, "logps/rejected": -1247.6982421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.437413215637207, "rewards/margins": 37.02989196777344, "rewards/rejected": -48.46730422973633, "step": 3456 }, { "epoch": 2.15054432348367, "grad_norm": 0.04954065755009651, "learning_rate": 1.5721530659289995e-06, "logits/chosen": 1.6569437980651855, "logits/rejected": 3.8350000381469727, "logps/chosen": -643.8868408203125, "logps/rejected": -1028.9217529296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.637311935424805, "rewards/margins": 28.497699737548828, "rewards/rejected": -38.135009765625, "step": 3457 }, { "epoch": 2.151166407465008, "grad_norm": 3.508955478668213, "learning_rate": 1.571000461041955e-06, "logits/chosen": -2.0015316009521484, "logits/rejected": 0.9972792863845825, "logps/chosen": -548.6949462890625, "logps/rejected": -1086.0467529296875, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": -13.5714111328125, "rewards/margins": 30.41091537475586, "rewards/rejected": -43.98232650756836, "step": 3458 }, { "epoch": 2.151788491446345, "grad_norm": 8.965987035480794e-06, "learning_rate": 1.5698478561549102e-06, "logits/chosen": 1.6384658813476562, "logits/rejected": 4.505178451538086, "logps/chosen": -584.2413330078125, "logps/rejected": -1142.488525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.16094970703125, "rewards/margins": 36.01255798339844, "rewards/rejected": -43.17350769042969, "step": 3459 }, { "epoch": 2.1524105754276825, "grad_norm": 0.00012471464287955314, "learning_rate": 1.5686952512678654e-06, "logits/chosen": 2.938779592514038, "logits/rejected": 4.181041717529297, "logps/chosen": -635.9864501953125, "logps/rejected": -1054.688720703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.60373306274414, "rewards/margins": 34.09574508666992, "rewards/rejected": -44.69947814941406, "step": 3460 }, { "epoch": 2.1530326594090203, "grad_norm": 2.1145135065125942e-07, "learning_rate": 1.5675426463808209e-06, "logits/chosen": 1.1438682079315186, "logits/rejected": 3.93155574798584, "logps/chosen": -453.353759765625, "logps/rejected": -957.1611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.293970108032227, "rewards/margins": 37.22956848144531, "rewards/rejected": -45.523536682128906, "step": 3461 }, { "epoch": 2.1536547433903577, "grad_norm": 6.02866823302961e-13, "learning_rate": 1.566390041493776e-06, "logits/chosen": -1.322139024734497, "logits/rejected": 3.6064188480377197, "logps/chosen": -487.96136474609375, "logps/rejected": -1197.617431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.6765775680542, "rewards/margins": 41.77438735961914, "rewards/rejected": -50.450965881347656, "step": 3462 }, { "epoch": 2.154276827371695, "grad_norm": 0.08622624725103378, "learning_rate": 1.5652374366067313e-06, "logits/chosen": 1.3654158115386963, "logits/rejected": 2.3155899047851562, "logps/chosen": -730.7118530273438, "logps/rejected": -1029.7301025390625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -16.9762020111084, "rewards/margins": 20.96411895751953, "rewards/rejected": -37.9403190612793, "step": 3463 }, { "epoch": 2.154898911353033, "grad_norm": 0.009618532843887806, "learning_rate": 1.5640848317196865e-06, "logits/chosen": 0.9912912845611572, "logits/rejected": 2.8535585403442383, "logps/chosen": -479.0268859863281, "logps/rejected": -854.8507080078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.03948450088501, "rewards/margins": 29.229816436767578, "rewards/rejected": -36.2692985534668, "step": 3464 }, { "epoch": 2.15552099533437, "grad_norm": 0.10931253433227539, "learning_rate": 1.562932226832642e-06, "logits/chosen": -0.7888144254684448, "logits/rejected": 3.1281023025512695, "logps/chosen": -411.3115234375, "logps/rejected": -841.8549194335938, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -8.393655776977539, "rewards/margins": 25.66720199584961, "rewards/rejected": -34.06085968017578, "step": 3465 }, { "epoch": 2.1561430793157075, "grad_norm": 0.00019762212468776852, "learning_rate": 1.5617796219455972e-06, "logits/chosen": -1.8151626586914062, "logits/rejected": 1.8206268548965454, "logps/chosen": -378.3106384277344, "logps/rejected": -955.4374389648438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.059768676757812, "rewards/margins": 30.776660919189453, "rewards/rejected": -38.836429595947266, "step": 3466 }, { "epoch": 2.1567651632970453, "grad_norm": 1.8507220147512271e-06, "learning_rate": 1.5606270170585524e-06, "logits/chosen": 1.1831034421920776, "logits/rejected": 3.867389678955078, "logps/chosen": -505.9981994628906, "logps/rejected": -989.8499755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.651363372802734, "rewards/margins": 34.708984375, "rewards/rejected": -42.36034393310547, "step": 3467 }, { "epoch": 2.1573872472783826, "grad_norm": 0.004638043697923422, "learning_rate": 1.5594744121715076e-06, "logits/chosen": -1.4236783981323242, "logits/rejected": 2.9329476356506348, "logps/chosen": -343.78955078125, "logps/rejected": -819.7637939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.332006454467773, "rewards/margins": 25.30170440673828, "rewards/rejected": -32.63371276855469, "step": 3468 }, { "epoch": 2.15800933125972, "grad_norm": 10.412405014038086, "learning_rate": 1.558321807284463e-06, "logits/chosen": -0.23219335079193115, "logits/rejected": 2.9971132278442383, "logps/chosen": -595.6908569335938, "logps/rejected": -1006.50537109375, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": -8.477470397949219, "rewards/margins": 21.702659606933594, "rewards/rejected": -30.180130004882812, "step": 3469 }, { "epoch": 2.1586314152410577, "grad_norm": 0.003561669262126088, "learning_rate": 1.5571692023974183e-06, "logits/chosen": 0.5902441740036011, "logits/rejected": 1.6300718784332275, "logps/chosen": -610.6525268554688, "logps/rejected": -981.94580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.881414413452148, "rewards/margins": 30.17713165283203, "rewards/rejected": -45.05854034423828, "step": 3470 }, { "epoch": 2.159253499222395, "grad_norm": 0.0019642633851617575, "learning_rate": 1.5560165975103735e-06, "logits/chosen": 2.9218506813049316, "logits/rejected": 4.19120979309082, "logps/chosen": -704.3328247070312, "logps/rejected": -1116.96240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.29806900024414, "rewards/margins": 34.05742645263672, "rewards/rejected": -46.35549545288086, "step": 3471 }, { "epoch": 2.1598755832037324, "grad_norm": 4.151289867415642e-10, "learning_rate": 1.554863992623329e-06, "logits/chosen": 0.576399564743042, "logits/rejected": 0.6314655542373657, "logps/chosen": -596.7716064453125, "logps/rejected": -1056.8067626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.739250183105469, "rewards/margins": 37.79853057861328, "rewards/rejected": -48.53778076171875, "step": 3472 }, { "epoch": 2.16049766718507, "grad_norm": 35.283565521240234, "learning_rate": 1.5537113877362842e-06, "logits/chosen": -1.2791149616241455, "logits/rejected": 1.7859420776367188, "logps/chosen": -557.95166015625, "logps/rejected": -1006.7755126953125, "loss": 0.14, "rewards/accuracies": 0.875, "rewards/chosen": -12.688139915466309, "rewards/margins": 21.129825592041016, "rewards/rejected": -33.81796646118164, "step": 3473 }, { "epoch": 2.1611197511664075, "grad_norm": 0.09894830733537674, "learning_rate": 1.5525587828492394e-06, "logits/chosen": -0.0892077088356018, "logits/rejected": 1.6869478225708008, "logps/chosen": -536.9102172851562, "logps/rejected": -892.3147583007812, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -11.899477005004883, "rewards/margins": 23.70627212524414, "rewards/rejected": -35.60574722290039, "step": 3474 }, { "epoch": 2.161741835147745, "grad_norm": 2.0548529624938965, "learning_rate": 1.5514061779621946e-06, "logits/chosen": 1.3693408966064453, "logits/rejected": 2.579724073410034, "logps/chosen": -536.6299438476562, "logps/rejected": -680.3751220703125, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -7.347484588623047, "rewards/margins": 14.572774887084961, "rewards/rejected": -21.920257568359375, "step": 3475 }, { "epoch": 2.162363919129082, "grad_norm": 0.0224276315420866, "learning_rate": 1.55025357307515e-06, "logits/chosen": 0.5975302457809448, "logits/rejected": 4.302839279174805, "logps/chosen": -536.1854248046875, "logps/rejected": -939.0042724609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.972611427307129, "rewards/margins": 20.14980125427246, "rewards/rejected": -28.122413635253906, "step": 3476 }, { "epoch": 2.16298600311042, "grad_norm": 0.021996496245265007, "learning_rate": 1.5491009681881053e-06, "logits/chosen": -0.18943238258361816, "logits/rejected": 2.7617297172546387, "logps/chosen": -565.2724609375, "logps/rejected": -1072.9237060546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.456770896911621, "rewards/margins": 34.614444732666016, "rewards/rejected": -42.07121658325195, "step": 3477 }, { "epoch": 2.1636080870917573, "grad_norm": 1.227846602169791e-09, "learning_rate": 1.5479483633010605e-06, "logits/chosen": -1.9056190252304077, "logits/rejected": 2.281365394592285, "logps/chosen": -429.4873046875, "logps/rejected": -1025.097900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.220512390136719, "rewards/margins": 37.59743118286133, "rewards/rejected": -47.81794738769531, "step": 3478 }, { "epoch": 2.1642301710730947, "grad_norm": 5.147202500666026e-06, "learning_rate": 1.5467957584140158e-06, "logits/chosen": 0.15378397703170776, "logits/rejected": 4.166439056396484, "logps/chosen": -535.3529052734375, "logps/rejected": -1126.279052734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.468807220458984, "rewards/margins": 37.814125061035156, "rewards/rejected": -45.28293228149414, "step": 3479 }, { "epoch": 2.1648522550544325, "grad_norm": 6.843938899692148e-05, "learning_rate": 1.5456431535269712e-06, "logits/chosen": -0.10889559984207153, "logits/rejected": 2.361340045928955, "logps/chosen": -568.2152099609375, "logps/rejected": -1050.350341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.513697624206543, "rewards/margins": 35.53955841064453, "rewards/rejected": -50.053253173828125, "step": 3480 }, { "epoch": 2.16547433903577, "grad_norm": 0.0102471224963665, "learning_rate": 1.5444905486399264e-06, "logits/chosen": 0.7401965856552124, "logits/rejected": 3.1332480907440186, "logps/chosen": -639.3670654296875, "logps/rejected": -1051.7347412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.7382612228393555, "rewards/margins": 27.60154914855957, "rewards/rejected": -35.339813232421875, "step": 3481 }, { "epoch": 2.166096423017107, "grad_norm": 0.00041303454781882465, "learning_rate": 1.5433379437528816e-06, "logits/chosen": -0.9328422546386719, "logits/rejected": 3.0686655044555664, "logps/chosen": -263.8511047363281, "logps/rejected": -749.5258178710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.905520915985107, "rewards/margins": 27.547443389892578, "rewards/rejected": -32.452964782714844, "step": 3482 }, { "epoch": 2.166718506998445, "grad_norm": 2.258577325164879e-09, "learning_rate": 1.542185338865837e-06, "logits/chosen": 2.8638532161712646, "logits/rejected": 3.6541571617126465, "logps/chosen": -686.989990234375, "logps/rejected": -1098.45458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.719980239868164, "rewards/margins": 35.45842742919922, "rewards/rejected": -48.17841339111328, "step": 3483 }, { "epoch": 2.1673405909797823, "grad_norm": 0.046206485480070114, "learning_rate": 1.5410327339787923e-06, "logits/chosen": 0.6019414067268372, "logits/rejected": 4.246374130249023, "logps/chosen": -591.260009765625, "logps/rejected": -1131.1484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.808977127075195, "rewards/margins": 32.77920150756836, "rewards/rejected": -46.58818054199219, "step": 3484 }, { "epoch": 2.1679626749611196, "grad_norm": 1.8323400020599365, "learning_rate": 1.5398801290917475e-06, "logits/chosen": 3.3164963722229004, "logits/rejected": 4.026242256164551, "logps/chosen": -558.5890502929688, "logps/rejected": -857.2715454101562, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -7.3273725509643555, "rewards/margins": 24.035884857177734, "rewards/rejected": -31.36325454711914, "step": 3485 }, { "epoch": 2.1685847589424574, "grad_norm": 0.0004539400397334248, "learning_rate": 1.5387275242047028e-06, "logits/chosen": 0.121127188205719, "logits/rejected": 3.4484081268310547, "logps/chosen": -671.3143310546875, "logps/rejected": -1044.74609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.50837516784668, "rewards/margins": 31.49677276611328, "rewards/rejected": -40.005149841308594, "step": 3486 }, { "epoch": 2.1692068429237947, "grad_norm": 4.255609198366983e-09, "learning_rate": 1.5375749193176582e-06, "logits/chosen": -0.1348937749862671, "logits/rejected": 4.0312323570251465, "logps/chosen": -372.6984558105469, "logps/rejected": -992.5408935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.464959621429443, "rewards/margins": 36.84416580200195, "rewards/rejected": -42.30912399291992, "step": 3487 }, { "epoch": 2.169828926905132, "grad_norm": 0.03619871661067009, "learning_rate": 1.5364223144306134e-06, "logits/chosen": -0.1388590782880783, "logits/rejected": 0.4599158763885498, "logps/chosen": -498.08544921875, "logps/rejected": -965.0997314453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.397851943969727, "rewards/margins": 33.254859924316406, "rewards/rejected": -42.6527099609375, "step": 3488 }, { "epoch": 2.17045101088647, "grad_norm": 0.0035405848175287247, "learning_rate": 1.5352697095435686e-06, "logits/chosen": 1.951183795928955, "logits/rejected": 3.039306640625, "logps/chosen": -680.4669189453125, "logps/rejected": -1072.6722412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.782991409301758, "rewards/margins": 27.88431739807129, "rewards/rejected": -41.66730880737305, "step": 3489 }, { "epoch": 2.171073094867807, "grad_norm": 0.2370285987854004, "learning_rate": 1.534117104656524e-06, "logits/chosen": 2.453774929046631, "logits/rejected": 4.862796783447266, "logps/chosen": -830.1021728515625, "logps/rejected": -1309.16845703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -14.799744606018066, "rewards/margins": 32.265045166015625, "rewards/rejected": -47.064788818359375, "step": 3490 }, { "epoch": 2.1716951788491445, "grad_norm": 0.01857941411435604, "learning_rate": 1.5329644997694793e-06, "logits/chosen": 1.4445005655288696, "logits/rejected": 3.2285866737365723, "logps/chosen": -647.0142822265625, "logps/rejected": -1020.450439453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.633213996887207, "rewards/margins": 27.143659591674805, "rewards/rejected": -38.77687072753906, "step": 3491 }, { "epoch": 2.1723172628304823, "grad_norm": 6.384193693520501e-05, "learning_rate": 1.5318118948824345e-06, "logits/chosen": 0.22042182087898254, "logits/rejected": 3.2595787048339844, "logps/chosen": -557.39599609375, "logps/rejected": -1137.6495361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.450281143188477, "rewards/margins": 40.4174919128418, "rewards/rejected": -49.867767333984375, "step": 3492 }, { "epoch": 2.1729393468118197, "grad_norm": 0.024043237790465355, "learning_rate": 1.5306592899953897e-06, "logits/chosen": -0.896227240562439, "logits/rejected": 2.541123390197754, "logps/chosen": -508.51806640625, "logps/rejected": -1031.8397216796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.197294235229492, "rewards/margins": 32.25333786010742, "rewards/rejected": -39.45063018798828, "step": 3493 }, { "epoch": 2.173561430793157, "grad_norm": 0.1542564034461975, "learning_rate": 1.5295066851083452e-06, "logits/chosen": 1.0172396898269653, "logits/rejected": 2.698474407196045, "logps/chosen": -455.7784423828125, "logps/rejected": -922.6686401367188, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -13.295679092407227, "rewards/margins": 33.03749084472656, "rewards/rejected": -46.33317184448242, "step": 3494 }, { "epoch": 2.1741835147744943, "grad_norm": 3.4196689128875732, "learning_rate": 1.5283540802213004e-06, "logits/chosen": -1.5701600313186646, "logits/rejected": 0.7547981142997742, "logps/chosen": -522.6417236328125, "logps/rejected": -887.583251953125, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -10.89109992980957, "rewards/margins": 22.884368896484375, "rewards/rejected": -33.77547073364258, "step": 3495 }, { "epoch": 2.174805598755832, "grad_norm": 2.236021041870117, "learning_rate": 1.5272014753342556e-06, "logits/chosen": 2.0567288398742676, "logits/rejected": 3.635549306869507, "logps/chosen": -610.8658447265625, "logps/rejected": -1079.648193359375, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -11.349096298217773, "rewards/margins": 30.307376861572266, "rewards/rejected": -41.656471252441406, "step": 3496 }, { "epoch": 2.1754276827371695, "grad_norm": 19.92326545715332, "learning_rate": 1.5260488704472109e-06, "logits/chosen": -0.9514064788818359, "logits/rejected": 2.288212537765503, "logps/chosen": -524.08349609375, "logps/rejected": -1010.56298828125, "loss": 0.0905, "rewards/accuracies": 0.875, "rewards/chosen": -7.694738388061523, "rewards/margins": 33.93772888183594, "rewards/rejected": -41.632469177246094, "step": 3497 }, { "epoch": 2.176049766718507, "grad_norm": 0.17036820948123932, "learning_rate": 1.5248962655601663e-06, "logits/chosen": -2.106801748275757, "logits/rejected": 4.079695224761963, "logps/chosen": -369.3140869140625, "logps/rejected": -1085.123779296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.724808692932129, "rewards/margins": 30.634748458862305, "rewards/rejected": -39.35955810546875, "step": 3498 }, { "epoch": 2.1766718506998446, "grad_norm": 1.4225753545761108, "learning_rate": 1.5237436606731215e-06, "logits/chosen": 0.6853106617927551, "logits/rejected": 3.601655960083008, "logps/chosen": -548.959716796875, "logps/rejected": -991.363037109375, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -12.702157974243164, "rewards/margins": 26.26728630065918, "rewards/rejected": -38.969444274902344, "step": 3499 }, { "epoch": 2.177293934681182, "grad_norm": 0.2219383269548416, "learning_rate": 1.5225910557860765e-06, "logits/chosen": 0.3559526205062866, "logits/rejected": 3.7226366996765137, "logps/chosen": -449.0332946777344, "logps/rejected": -1052.79443359375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.326770782470703, "rewards/margins": 35.569976806640625, "rewards/rejected": -43.89674758911133, "step": 3500 }, { "epoch": 2.1779160186625193, "grad_norm": 35.52800750732422, "learning_rate": 1.5214384508990318e-06, "logits/chosen": 1.4980649948120117, "logits/rejected": 4.63604736328125, "logps/chosen": -586.69580078125, "logps/rejected": -1035.624267578125, "loss": 0.4524, "rewards/accuracies": 0.875, "rewards/chosen": -10.078465461730957, "rewards/margins": 31.11182975769043, "rewards/rejected": -41.19029235839844, "step": 3501 }, { "epoch": 2.178538102643857, "grad_norm": 1.1494609708506687e-08, "learning_rate": 1.5202858460119872e-06, "logits/chosen": -0.8052943348884583, "logits/rejected": 3.309441089630127, "logps/chosen": -453.7115173339844, "logps/rejected": -980.9556884765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.90156364440918, "rewards/margins": 32.78784942626953, "rewards/rejected": -41.689414978027344, "step": 3502 }, { "epoch": 2.1791601866251944, "grad_norm": 0.46481531858444214, "learning_rate": 1.5191332411249424e-06, "logits/chosen": 0.530727744102478, "logits/rejected": 1.2075937986373901, "logps/chosen": -715.234619140625, "logps/rejected": -1115.29931640625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -15.125618934631348, "rewards/margins": 25.974449157714844, "rewards/rejected": -41.100067138671875, "step": 3503 }, { "epoch": 2.1797822706065317, "grad_norm": 0.0032227826304733753, "learning_rate": 1.5179806362378976e-06, "logits/chosen": 0.8939819931983948, "logits/rejected": 2.9865517616271973, "logps/chosen": -603.0927734375, "logps/rejected": -1192.5281982421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.699723243713379, "rewards/margins": 38.47848892211914, "rewards/rejected": -47.1782112121582, "step": 3504 }, { "epoch": 2.1804043545878695, "grad_norm": 0.009147186763584614, "learning_rate": 1.5168280313508529e-06, "logits/chosen": 3.3015215396881104, "logits/rejected": 1.3482310771942139, "logps/chosen": -738.5409545898438, "logps/rejected": -845.2588500976562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.378486633300781, "rewards/margins": 20.573898315429688, "rewards/rejected": -33.95238494873047, "step": 3505 }, { "epoch": 2.181026438569207, "grad_norm": 6.6773923208529595e-06, "learning_rate": 1.5156754264638083e-06, "logits/chosen": -2.602396011352539, "logits/rejected": 2.857754707336426, "logps/chosen": -372.400146484375, "logps/rejected": -1036.571044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.945173263549805, "rewards/margins": 36.59382247924805, "rewards/rejected": -44.53899383544922, "step": 3506 }, { "epoch": 2.181648522550544, "grad_norm": 2.750762462615967, "learning_rate": 1.5145228215767635e-06, "logits/chosen": 1.6972405910491943, "logits/rejected": 0.563304603099823, "logps/chosen": -580.1192016601562, "logps/rejected": -831.8045654296875, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -10.198119163513184, "rewards/margins": 26.365081787109375, "rewards/rejected": -36.56319808959961, "step": 3507 }, { "epoch": 2.182270606531882, "grad_norm": 0.04510970413684845, "learning_rate": 1.5133702166897188e-06, "logits/chosen": 1.1926695108413696, "logits/rejected": 2.631222724914551, "logps/chosen": -626.7939453125, "logps/rejected": -989.72119140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.63274097442627, "rewards/margins": 27.476999282836914, "rewards/rejected": -38.109737396240234, "step": 3508 }, { "epoch": 2.1828926905132193, "grad_norm": 0.1802457571029663, "learning_rate": 1.512217611802674e-06, "logits/chosen": -3.372152328491211, "logits/rejected": 0.6782333254814148, "logps/chosen": -441.4357604980469, "logps/rejected": -1135.593505859375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -9.280879974365234, "rewards/margins": 39.73485565185547, "rewards/rejected": -49.0157356262207, "step": 3509 }, { "epoch": 2.1835147744945567, "grad_norm": 0.005605627316981554, "learning_rate": 1.5110650069156294e-06, "logits/chosen": -2.9359774589538574, "logits/rejected": 4.364621162414551, "logps/chosen": -342.99444580078125, "logps/rejected": -1007.0554809570312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.298252105712891, "rewards/margins": 34.19009780883789, "rewards/rejected": -41.48834991455078, "step": 3510 }, { "epoch": 2.1841368584758944, "grad_norm": 14.325651168823242, "learning_rate": 1.5099124020285846e-06, "logits/chosen": 0.26200932264328003, "logits/rejected": 3.8386402130126953, "logps/chosen": -659.2767944335938, "logps/rejected": -1086.6053466796875, "loss": 0.0841, "rewards/accuracies": 1.0, "rewards/chosen": -10.393942832946777, "rewards/margins": 32.03108215332031, "rewards/rejected": -42.42502212524414, "step": 3511 }, { "epoch": 2.184758942457232, "grad_norm": 58.25017547607422, "learning_rate": 1.5087597971415399e-06, "logits/chosen": 2.0818560123443604, "logits/rejected": 3.9699525833129883, "logps/chosen": -757.948486328125, "logps/rejected": -1054.1805419921875, "loss": 0.3581, "rewards/accuracies": 0.875, "rewards/chosen": -15.566535949707031, "rewards/margins": 27.65494155883789, "rewards/rejected": -43.221473693847656, "step": 3512 }, { "epoch": 2.185381026438569, "grad_norm": 20.528139114379883, "learning_rate": 1.5076071922544953e-06, "logits/chosen": 1.878481149673462, "logits/rejected": 3.6764373779296875, "logps/chosen": -720.4259643554688, "logps/rejected": -1064.9002685546875, "loss": 0.1138, "rewards/accuracies": 0.875, "rewards/chosen": -15.61282730102539, "rewards/margins": 23.589069366455078, "rewards/rejected": -39.20189666748047, "step": 3513 }, { "epoch": 2.1860031104199065, "grad_norm": 7.523076055804268e-05, "learning_rate": 1.5064545873674505e-06, "logits/chosen": 0.2912430763244629, "logits/rejected": 4.2857818603515625, "logps/chosen": -687.9354858398438, "logps/rejected": -1222.1806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.304641723632812, "rewards/margins": 32.19824981689453, "rewards/rejected": -43.502891540527344, "step": 3514 }, { "epoch": 2.1866251944012443, "grad_norm": 0.00023828446865081787, "learning_rate": 1.5053019824804058e-06, "logits/chosen": -0.37384992837905884, "logits/rejected": 3.28475284576416, "logps/chosen": -357.19329833984375, "logps/rejected": -861.1342163085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.559617519378662, "rewards/margins": 26.56270980834961, "rewards/rejected": -34.12232971191406, "step": 3515 }, { "epoch": 2.1872472783825816, "grad_norm": 2.8168491553515196e-05, "learning_rate": 1.504149377593361e-06, "logits/chosen": -0.1479528546333313, "logits/rejected": 3.6196656227111816, "logps/chosen": -423.3016662597656, "logps/rejected": -1042.63134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.85476016998291, "rewards/margins": 35.65232467651367, "rewards/rejected": -41.50708770751953, "step": 3516 }, { "epoch": 2.187869362363919, "grad_norm": 0.0005493653588928282, "learning_rate": 1.5029967727063164e-06, "logits/chosen": -0.6248797178268433, "logits/rejected": 2.49371337890625, "logps/chosen": -552.1177368164062, "logps/rejected": -1033.814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.576824188232422, "rewards/margins": 29.918533325195312, "rewards/rejected": -41.495357513427734, "step": 3517 }, { "epoch": 2.1884914463452567, "grad_norm": 0.05042627453804016, "learning_rate": 1.5018441678192716e-06, "logits/chosen": -2.2561912536621094, "logits/rejected": 2.464254856109619, "logps/chosen": -401.0296630859375, "logps/rejected": -1022.3479614257812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.056978225708008, "rewards/margins": 33.46770477294922, "rewards/rejected": -41.524681091308594, "step": 3518 }, { "epoch": 2.189113530326594, "grad_norm": 0.0019069320987910032, "learning_rate": 1.5006915629322269e-06, "logits/chosen": 0.8858805298805237, "logits/rejected": 2.8581292629241943, "logps/chosen": -599.8334350585938, "logps/rejected": -1049.4576416015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.778345584869385, "rewards/margins": 33.91278839111328, "rewards/rejected": -40.691131591796875, "step": 3519 }, { "epoch": 2.1897356143079314, "grad_norm": 0.05174775794148445, "learning_rate": 1.499538958045182e-06, "logits/chosen": 0.19469568133354187, "logits/rejected": 2.310316562652588, "logps/chosen": -497.45458984375, "logps/rejected": -1019.6409912109375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.488957405090332, "rewards/margins": 33.4749870300293, "rewards/rejected": -44.96394348144531, "step": 3520 }, { "epoch": 2.190357698289269, "grad_norm": 0.6884315609931946, "learning_rate": 1.4983863531581375e-06, "logits/chosen": -2.5548439025878906, "logits/rejected": 1.6989535093307495, "logps/chosen": -362.3365173339844, "logps/rejected": -820.731689453125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -6.82147741317749, "rewards/margins": 25.978748321533203, "rewards/rejected": -32.80022430419922, "step": 3521 }, { "epoch": 2.1909797822706065, "grad_norm": 0.0009601087076589465, "learning_rate": 1.4972337482710927e-06, "logits/chosen": 1.2198925018310547, "logits/rejected": 2.9722330570220947, "logps/chosen": -732.6044921875, "logps/rejected": -1182.9945068359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.560627937316895, "rewards/margins": 33.31861114501953, "rewards/rejected": -44.87923812866211, "step": 3522 }, { "epoch": 2.191601866251944, "grad_norm": 0.27258390188217163, "learning_rate": 1.496081143384048e-06, "logits/chosen": -0.42896389961242676, "logits/rejected": 3.6511716842651367, "logps/chosen": -614.4691772460938, "logps/rejected": -1194.15576171875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -11.832674026489258, "rewards/margins": 40.16541290283203, "rewards/rejected": -51.99808883666992, "step": 3523 }, { "epoch": 2.1922239502332816, "grad_norm": 5.141660690307617, "learning_rate": 1.4949285384970034e-06, "logits/chosen": 0.1440633237361908, "logits/rejected": 3.3952267169952393, "logps/chosen": -544.6929321289062, "logps/rejected": -929.3987426757812, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": -10.516995429992676, "rewards/margins": 21.96893310546875, "rewards/rejected": -32.48592758178711, "step": 3524 }, { "epoch": 2.192846034214619, "grad_norm": 0.30395951867103577, "learning_rate": 1.4937759336099586e-06, "logits/chosen": -0.002476602792739868, "logits/rejected": 1.466905951499939, "logps/chosen": -614.0319213867188, "logps/rejected": -995.0291748046875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -17.193775177001953, "rewards/margins": 24.900527954101562, "rewards/rejected": -42.09429931640625, "step": 3525 }, { "epoch": 2.1934681181959563, "grad_norm": 30.426597595214844, "learning_rate": 1.4926233287229139e-06, "logits/chosen": -0.20709985494613647, "logits/rejected": 2.804947853088379, "logps/chosen": -548.359619140625, "logps/rejected": -1074.06884765625, "loss": 0.3336, "rewards/accuracies": 0.875, "rewards/chosen": -10.806556701660156, "rewards/margins": 33.09553146362305, "rewards/rejected": -43.9020881652832, "step": 3526 }, { "epoch": 2.194090202177294, "grad_norm": 0.0884014144539833, "learning_rate": 1.491470723835869e-06, "logits/chosen": 2.929100513458252, "logits/rejected": 2.9884305000305176, "logps/chosen": -615.928466796875, "logps/rejected": -890.3482666015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -9.841999053955078, "rewards/margins": 27.70352554321289, "rewards/rejected": -37.54552459716797, "step": 3527 }, { "epoch": 2.1947122861586315, "grad_norm": 0.031009182333946228, "learning_rate": 1.4903181189488245e-06, "logits/chosen": 1.8256065845489502, "logits/rejected": 3.060570478439331, "logps/chosen": -713.8429565429688, "logps/rejected": -1134.923828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.26293659210205, "rewards/margins": 36.25966262817383, "rewards/rejected": -45.52259826660156, "step": 3528 }, { "epoch": 2.195334370139969, "grad_norm": 0.001949359430000186, "learning_rate": 1.4891655140617797e-06, "logits/chosen": -0.24708634614944458, "logits/rejected": 2.8352670669555664, "logps/chosen": -556.1632690429688, "logps/rejected": -1012.2968139648438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.142890930175781, "rewards/margins": 31.787410736083984, "rewards/rejected": -36.93030548095703, "step": 3529 }, { "epoch": 2.1959564541213066, "grad_norm": 0.008359517902135849, "learning_rate": 1.488012909174735e-06, "logits/chosen": -1.7454040050506592, "logits/rejected": 2.6669399738311768, "logps/chosen": -508.08868408203125, "logps/rejected": -1210.76953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.647208213806152, "rewards/margins": 36.04959487915039, "rewards/rejected": -45.69680404663086, "step": 3530 }, { "epoch": 2.196578538102644, "grad_norm": 5.639753197783648e-08, "learning_rate": 1.4868603042876902e-06, "logits/chosen": -1.8157124519348145, "logits/rejected": 3.0563266277313232, "logps/chosen": -342.3044738769531, "logps/rejected": -1138.9464111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.511418342590332, "rewards/margins": 37.18653869628906, "rewards/rejected": -43.697959899902344, "step": 3531 }, { "epoch": 2.1972006220839813, "grad_norm": 0.13938391208648682, "learning_rate": 1.4857076994006456e-06, "logits/chosen": -0.25851160287857056, "logits/rejected": 3.7246131896972656, "logps/chosen": -504.6614990234375, "logps/rejected": -1072.123291015625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -10.598262786865234, "rewards/margins": 29.225683212280273, "rewards/rejected": -39.823944091796875, "step": 3532 }, { "epoch": 2.1978227060653186, "grad_norm": 0.28094029426574707, "learning_rate": 1.4845550945136009e-06, "logits/chosen": -0.2853562831878662, "logits/rejected": 0.4411306381225586, "logps/chosen": -508.27093505859375, "logps/rejected": -726.7429809570312, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -9.232595443725586, "rewards/margins": 19.054052352905273, "rewards/rejected": -28.286649703979492, "step": 3533 }, { "epoch": 2.1984447900466564, "grad_norm": 1.8335808249503316e-07, "learning_rate": 1.483402489626556e-06, "logits/chosen": -0.7460952401161194, "logits/rejected": 4.086732864379883, "logps/chosen": -472.76861572265625, "logps/rejected": -1131.452392578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.650875568389893, "rewards/margins": 39.26519012451172, "rewards/rejected": -45.91606521606445, "step": 3534 }, { "epoch": 2.1990668740279937, "grad_norm": 0.0018125500064343214, "learning_rate": 1.4822498847395115e-06, "logits/chosen": 0.30365845561027527, "logits/rejected": 3.7823288440704346, "logps/chosen": -560.137451171875, "logps/rejected": -1056.3480224609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.162646293640137, "rewards/margins": 33.35071563720703, "rewards/rejected": -44.51335906982422, "step": 3535 }, { "epoch": 2.199688958009331, "grad_norm": 0.0004930261638946831, "learning_rate": 1.4810972798524667e-06, "logits/chosen": 3.154604911804199, "logits/rejected": 2.6136741638183594, "logps/chosen": -609.3040161132812, "logps/rejected": -887.94482421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.351789474487305, "rewards/margins": 26.85038185119629, "rewards/rejected": -41.20217514038086, "step": 3536 }, { "epoch": 2.200311041990669, "grad_norm": 0.13322575390338898, "learning_rate": 1.479944674965422e-06, "logits/chosen": 1.2375845909118652, "logits/rejected": 2.894868850708008, "logps/chosen": -503.1761474609375, "logps/rejected": -812.9686889648438, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -12.426453590393066, "rewards/margins": 21.795515060424805, "rewards/rejected": -34.22196960449219, "step": 3537 }, { "epoch": 2.200933125972006, "grad_norm": 0.002348793437704444, "learning_rate": 1.4787920700783772e-06, "logits/chosen": 0.6488022804260254, "logits/rejected": 0.9976472854614258, "logps/chosen": -719.718505859375, "logps/rejected": -1167.53515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.721811294555664, "rewards/margins": 38.567386627197266, "rewards/rejected": -53.28919982910156, "step": 3538 }, { "epoch": 2.2015552099533435, "grad_norm": 8.701942277866692e-09, "learning_rate": 1.4776394651913326e-06, "logits/chosen": -2.041717767715454, "logits/rejected": 2.4373598098754883, "logps/chosen": -499.24285888671875, "logps/rejected": -1228.697998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.595501899719238, "rewards/margins": 39.86658477783203, "rewards/rejected": -53.46208572387695, "step": 3539 }, { "epoch": 2.2021772939346813, "grad_norm": 0.0034426345955580473, "learning_rate": 1.4764868603042879e-06, "logits/chosen": -0.7779150009155273, "logits/rejected": 2.817213535308838, "logps/chosen": -485.165771484375, "logps/rejected": -1033.595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.714851379394531, "rewards/margins": 30.76633644104004, "rewards/rejected": -39.4811897277832, "step": 3540 }, { "epoch": 2.2027993779160187, "grad_norm": 7.423425267916173e-05, "learning_rate": 1.475334255417243e-06, "logits/chosen": 1.6081801652908325, "logits/rejected": 3.639782428741455, "logps/chosen": -629.6107177734375, "logps/rejected": -1070.0302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.458868026733398, "rewards/margins": 27.184024810791016, "rewards/rejected": -37.64289474487305, "step": 3541 }, { "epoch": 2.203421461897356, "grad_norm": 10.609362602233887, "learning_rate": 1.4741816505301983e-06, "logits/chosen": 1.6137707233428955, "logits/rejected": 3.130721092224121, "logps/chosen": -699.2923583984375, "logps/rejected": -1112.9217529296875, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": -12.161538124084473, "rewards/margins": 30.89093589782715, "rewards/rejected": -43.05247497558594, "step": 3542 }, { "epoch": 2.2040435458786938, "grad_norm": 0.0009758673259057105, "learning_rate": 1.4730290456431537e-06, "logits/chosen": -2.5931081771850586, "logits/rejected": 2.1481778621673584, "logps/chosen": -423.0606689453125, "logps/rejected": -1051.5849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.25644302368164, "rewards/margins": 34.31974792480469, "rewards/rejected": -45.576194763183594, "step": 3543 }, { "epoch": 2.204665629860031, "grad_norm": 5.5923883337527514e-05, "learning_rate": 1.471876440756109e-06, "logits/chosen": -0.7050518989562988, "logits/rejected": 1.3252148628234863, "logps/chosen": -466.5789489746094, "logps/rejected": -930.0560302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.0426225662231445, "rewards/margins": 27.657917022705078, "rewards/rejected": -34.70054244995117, "step": 3544 }, { "epoch": 2.2052877138413685, "grad_norm": 13.390966415405273, "learning_rate": 1.4707238358690642e-06, "logits/chosen": -0.6056236624717712, "logits/rejected": 2.167903423309326, "logps/chosen": -402.4957275390625, "logps/rejected": -790.405029296875, "loss": 0.0767, "rewards/accuracies": 1.0, "rewards/chosen": -5.958978652954102, "rewards/margins": 24.03119468688965, "rewards/rejected": -29.99017333984375, "step": 3545 }, { "epoch": 2.2059097978227062, "grad_norm": 19.667755126953125, "learning_rate": 1.4695712309820196e-06, "logits/chosen": 0.19798988103866577, "logits/rejected": 3.2702062129974365, "logps/chosen": -622.1126098632812, "logps/rejected": -1035.2138671875, "loss": 0.1538, "rewards/accuracies": 0.875, "rewards/chosen": -10.878641128540039, "rewards/margins": 27.598281860351562, "rewards/rejected": -38.476924896240234, "step": 3546 }, { "epoch": 2.2065318818040436, "grad_norm": 33.967308044433594, "learning_rate": 1.4684186260949748e-06, "logits/chosen": 1.461820363998413, "logits/rejected": 1.3853689432144165, "logps/chosen": -551.6245727539062, "logps/rejected": -700.6829833984375, "loss": 0.2145, "rewards/accuracies": 0.875, "rewards/chosen": -9.361347198486328, "rewards/margins": 20.84174156188965, "rewards/rejected": -30.203088760375977, "step": 3547 }, { "epoch": 2.207153965785381, "grad_norm": 3.286201533114763e-08, "learning_rate": 1.46726602120793e-06, "logits/chosen": -0.11517900228500366, "logits/rejected": 2.263479709625244, "logps/chosen": -598.5966796875, "logps/rejected": -1188.088134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.82167911529541, "rewards/margins": 40.516597747802734, "rewards/rejected": -48.338279724121094, "step": 3548 }, { "epoch": 2.2077760497667187, "grad_norm": 2.1776845455169678, "learning_rate": 1.4661134163208853e-06, "logits/chosen": -0.0010031461715698242, "logits/rejected": 0.2503492832183838, "logps/chosen": -622.16064453125, "logps/rejected": -973.8892211914062, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -14.452604293823242, "rewards/margins": 29.204866409301758, "rewards/rejected": -43.657470703125, "step": 3549 }, { "epoch": 2.208398133748056, "grad_norm": 0.0007999642984941602, "learning_rate": 1.4649608114338407e-06, "logits/chosen": 2.285749912261963, "logits/rejected": 3.295093059539795, "logps/chosen": -692.86572265625, "logps/rejected": -972.4854736328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.163134574890137, "rewards/margins": 21.923898696899414, "rewards/rejected": -37.087032318115234, "step": 3550 }, { "epoch": 2.2090202177293934, "grad_norm": 0.0001855127193266526, "learning_rate": 1.463808206546796e-06, "logits/chosen": -1.0556727647781372, "logits/rejected": 2.513211727142334, "logps/chosen": -542.8209838867188, "logps/rejected": -1080.12060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.84273910522461, "rewards/margins": 34.13514709472656, "rewards/rejected": -42.97788619995117, "step": 3551 }, { "epoch": 2.2096423017107307, "grad_norm": 0.2574240565299988, "learning_rate": 1.4626556016597512e-06, "logits/chosen": -0.01851367950439453, "logits/rejected": 4.146186351776123, "logps/chosen": -456.16351318359375, "logps/rejected": -1008.7926635742188, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -10.954230308532715, "rewards/margins": 23.800861358642578, "rewards/rejected": -34.75509262084961, "step": 3552 }, { "epoch": 2.2102643856920685, "grad_norm": 0.03723328188061714, "learning_rate": 1.4615029967727066e-06, "logits/chosen": 1.8422104120254517, "logits/rejected": 3.3534679412841797, "logps/chosen": -674.9481201171875, "logps/rejected": -1139.601806640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.71571159362793, "rewards/margins": 39.695892333984375, "rewards/rejected": -50.41160202026367, "step": 3553 }, { "epoch": 2.210886469673406, "grad_norm": 5.6771368690533563e-05, "learning_rate": 1.4603503918856618e-06, "logits/chosen": 0.5690759420394897, "logits/rejected": 3.7443671226501465, "logps/chosen": -590.1197509765625, "logps/rejected": -1096.5736083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.7386474609375, "rewards/margins": 32.21901321411133, "rewards/rejected": -46.95766067504883, "step": 3554 }, { "epoch": 2.211508553654743, "grad_norm": 2.097417350910291e-09, "learning_rate": 1.459197786998617e-06, "logits/chosen": -0.7746211886405945, "logits/rejected": 2.040572166442871, "logps/chosen": -559.7862548828125, "logps/rejected": -1210.46240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.710878372192383, "rewards/margins": 44.146278381347656, "rewards/rejected": -56.857154846191406, "step": 3555 }, { "epoch": 2.212130637636081, "grad_norm": 1.0708082337496094e-10, "learning_rate": 1.4580451821115723e-06, "logits/chosen": 0.0021647214889526367, "logits/rejected": 4.682831764221191, "logps/chosen": -480.7462158203125, "logps/rejected": -1162.875732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.413545608520508, "rewards/margins": 42.82459259033203, "rewards/rejected": -50.238136291503906, "step": 3556 }, { "epoch": 2.2127527216174183, "grad_norm": 0.5422202348709106, "learning_rate": 1.4568925772245277e-06, "logits/chosen": 1.54587721824646, "logits/rejected": 2.656994104385376, "logps/chosen": -693.7167358398438, "logps/rejected": -941.8779296875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -11.093391418457031, "rewards/margins": 20.14789581298828, "rewards/rejected": -31.241287231445312, "step": 3557 }, { "epoch": 2.2133748055987557, "grad_norm": 1.6232826709747314, "learning_rate": 1.455739972337483e-06, "logits/chosen": 1.7829499244689941, "logits/rejected": 3.495352268218994, "logps/chosen": -649.2700805664062, "logps/rejected": -998.00244140625, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -12.135042190551758, "rewards/margins": 21.831436157226562, "rewards/rejected": -33.96647644042969, "step": 3558 }, { "epoch": 2.2139968895800934, "grad_norm": 0.001622863463126123, "learning_rate": 1.4545873674504382e-06, "logits/chosen": -1.1253859996795654, "logits/rejected": 2.9452152252197266, "logps/chosen": -589.4984741210938, "logps/rejected": -1159.3382568359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.697577476501465, "rewards/margins": 32.21438980102539, "rewards/rejected": -44.91196823120117, "step": 3559 }, { "epoch": 2.214618973561431, "grad_norm": 9.289454396821384e-07, "learning_rate": 1.4534347625633934e-06, "logits/chosen": -0.6384005546569824, "logits/rejected": 3.2510910034179688, "logps/chosen": -389.4755554199219, "logps/rejected": -893.7344970703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.662978172302246, "rewards/margins": 32.42913818359375, "rewards/rejected": -39.09211730957031, "step": 3560 }, { "epoch": 2.215241057542768, "grad_norm": 0.006149108987301588, "learning_rate": 1.4522821576763488e-06, "logits/chosen": 2.7856993675231934, "logits/rejected": 3.0821292400360107, "logps/chosen": -736.5506591796875, "logps/rejected": -1034.728515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.966891765594482, "rewards/margins": 28.720930099487305, "rewards/rejected": -35.68782043457031, "step": 3561 }, { "epoch": 2.215863141524106, "grad_norm": 2.35528302192688, "learning_rate": 1.451129552789304e-06, "logits/chosen": 1.4770406484603882, "logits/rejected": 4.091999053955078, "logps/chosen": -556.71435546875, "logps/rejected": -976.85791015625, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -13.796460151672363, "rewards/margins": 25.918916702270508, "rewards/rejected": -39.71537780761719, "step": 3562 }, { "epoch": 2.2164852255054432, "grad_norm": 0.032568614929914474, "learning_rate": 1.4499769479022593e-06, "logits/chosen": 1.27259361743927, "logits/rejected": 2.114039659500122, "logps/chosen": -574.3934936523438, "logps/rejected": -933.9754638671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.658870697021484, "rewards/margins": 28.712207794189453, "rewards/rejected": -40.37107849121094, "step": 3563 }, { "epoch": 2.2171073094867806, "grad_norm": 6.021178705850616e-06, "learning_rate": 1.4488243430152147e-06, "logits/chosen": 0.02378600835800171, "logits/rejected": 3.564028263092041, "logps/chosen": -474.9874267578125, "logps/rejected": -1052.9521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.039875507354736, "rewards/margins": 36.47960662841797, "rewards/rejected": -43.51948547363281, "step": 3564 }, { "epoch": 2.2177293934681184, "grad_norm": 0.5587344765663147, "learning_rate": 1.44767173812817e-06, "logits/chosen": -1.4509693384170532, "logits/rejected": 2.435011148452759, "logps/chosen": -430.926513671875, "logps/rejected": -884.4074096679688, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -7.396395683288574, "rewards/margins": 19.22774887084961, "rewards/rejected": -26.624147415161133, "step": 3565 }, { "epoch": 2.2183514774494557, "grad_norm": 0.11176523566246033, "learning_rate": 1.4465191332411252e-06, "logits/chosen": 1.9386296272277832, "logits/rejected": 2.4904065132141113, "logps/chosen": -648.920166015625, "logps/rejected": -935.1029052734375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -10.740001678466797, "rewards/margins": 25.321617126464844, "rewards/rejected": -36.06161880493164, "step": 3566 }, { "epoch": 2.218973561430793, "grad_norm": 0.0056029753759503365, "learning_rate": 1.4453665283540804e-06, "logits/chosen": 1.5881600379943848, "logits/rejected": 4.458906650543213, "logps/chosen": -502.69049072265625, "logps/rejected": -960.2064819335938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.035712242126465, "rewards/margins": 29.94509506225586, "rewards/rejected": -39.98080825805664, "step": 3567 }, { "epoch": 2.219595645412131, "grad_norm": 0.002757622394710779, "learning_rate": 1.4442139234670354e-06, "logits/chosen": 2.278160572052002, "logits/rejected": 4.256825923919678, "logps/chosen": -753.8636474609375, "logps/rejected": -1213.9505615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.023143768310547, "rewards/margins": 33.02347183227539, "rewards/rejected": -48.04661560058594, "step": 3568 }, { "epoch": 2.220217729393468, "grad_norm": 0.0002265808143420145, "learning_rate": 1.4430613185799909e-06, "logits/chosen": 0.007342390716075897, "logits/rejected": 1.5686745643615723, "logps/chosen": -709.2378540039062, "logps/rejected": -1120.9376220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.142677307128906, "rewards/margins": 34.99839782714844, "rewards/rejected": -47.141075134277344, "step": 3569 }, { "epoch": 2.2208398133748055, "grad_norm": 0.001974264159798622, "learning_rate": 1.441908713692946e-06, "logits/chosen": -2.754730701446533, "logits/rejected": -0.30940690636634827, "logps/chosen": -438.1318054199219, "logps/rejected": -972.1848754882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.748628616333008, "rewards/margins": 31.43011474609375, "rewards/rejected": -41.178741455078125, "step": 3570 }, { "epoch": 2.221461897356143, "grad_norm": 0.003989961929619312, "learning_rate": 1.4407561088059013e-06, "logits/chosen": -2.1097288131713867, "logits/rejected": 3.656294584274292, "logps/chosen": -410.3033752441406, "logps/rejected": -1147.75341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.758270263671875, "rewards/margins": 38.09152603149414, "rewards/rejected": -45.84979248046875, "step": 3571 }, { "epoch": 2.2220839813374806, "grad_norm": 13.953394889831543, "learning_rate": 1.4396035039188565e-06, "logits/chosen": 0.9869332313537598, "logits/rejected": 3.99074649810791, "logps/chosen": -527.534912109375, "logps/rejected": -1001.27001953125, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": -8.06263542175293, "rewards/margins": 32.09954071044922, "rewards/rejected": -40.162174224853516, "step": 3572 }, { "epoch": 2.222706065318818, "grad_norm": 0.041325412690639496, "learning_rate": 1.438450899031812e-06, "logits/chosen": -1.103442907333374, "logits/rejected": 3.769481658935547, "logps/chosen": -384.44708251953125, "logps/rejected": -904.393798828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.902695655822754, "rewards/margins": 27.485366821289062, "rewards/rejected": -37.388065338134766, "step": 3573 }, { "epoch": 2.2233281493001553, "grad_norm": 8.32625971725065e-07, "learning_rate": 1.4372982941447672e-06, "logits/chosen": -1.5317946672439575, "logits/rejected": 2.258638858795166, "logps/chosen": -471.34619140625, "logps/rejected": -1037.48876953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.764509201049805, "rewards/margins": 33.39957809448242, "rewards/rejected": -42.164085388183594, "step": 3574 }, { "epoch": 2.223950233281493, "grad_norm": 0.0006140259793028235, "learning_rate": 1.4361456892577224e-06, "logits/chosen": 3.8182244300842285, "logits/rejected": 3.684861183166504, "logps/chosen": -797.9357299804688, "logps/rejected": -1206.4022216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.626452445983887, "rewards/margins": 34.950870513916016, "rewards/rejected": -49.57732391357422, "step": 3575 }, { "epoch": 2.2245723172628304, "grad_norm": 0.0012920801527798176, "learning_rate": 1.4349930843706778e-06, "logits/chosen": 1.275935173034668, "logits/rejected": 2.668433904647827, "logps/chosen": -682.9571533203125, "logps/rejected": -1124.5068359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.28410530090332, "rewards/margins": 31.946544647216797, "rewards/rejected": -44.23065185546875, "step": 3576 }, { "epoch": 2.225194401244168, "grad_norm": 0.09917772561311722, "learning_rate": 1.433840479483633e-06, "logits/chosen": -0.4102671146392822, "logits/rejected": 2.216341495513916, "logps/chosen": -454.63818359375, "logps/rejected": -1024.591796875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.388709545135498, "rewards/margins": 33.88431167602539, "rewards/rejected": -41.27302169799805, "step": 3577 }, { "epoch": 2.2258164852255056, "grad_norm": 1.067162065737648e-05, "learning_rate": 1.4326878745965883e-06, "logits/chosen": -1.766862392425537, "logits/rejected": 2.5478920936584473, "logps/chosen": -330.01385498046875, "logps/rejected": -957.0518798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.559136867523193, "rewards/margins": 33.83004379272461, "rewards/rejected": -40.38917922973633, "step": 3578 }, { "epoch": 2.226438569206843, "grad_norm": 0.04910597577691078, "learning_rate": 1.4315352697095435e-06, "logits/chosen": -0.8890432119369507, "logits/rejected": 2.860970973968506, "logps/chosen": -427.6197814941406, "logps/rejected": -959.2499389648438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.983651161193848, "rewards/margins": 27.483963012695312, "rewards/rejected": -34.467613220214844, "step": 3579 }, { "epoch": 2.2270606531881803, "grad_norm": 7.004135568422498e-06, "learning_rate": 1.430382664822499e-06, "logits/chosen": 0.7989118695259094, "logits/rejected": 4.2649664878845215, "logps/chosen": -388.40765380859375, "logps/rejected": -975.993408203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.377376556396484, "rewards/margins": 30.533058166503906, "rewards/rejected": -38.91043472290039, "step": 3580 }, { "epoch": 2.227682737169518, "grad_norm": 0.020427517592906952, "learning_rate": 1.4292300599354542e-06, "logits/chosen": 0.2810332775115967, "logits/rejected": 3.973931312561035, "logps/chosen": -458.1094970703125, "logps/rejected": -945.5018920898438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.795637130737305, "rewards/margins": 29.356910705566406, "rewards/rejected": -39.15254592895508, "step": 3581 }, { "epoch": 2.2283048211508554, "grad_norm": 6.906030654907227, "learning_rate": 1.4280774550484094e-06, "logits/chosen": -1.385702133178711, "logits/rejected": 1.523589849472046, "logps/chosen": -461.1471862792969, "logps/rejected": -835.5704345703125, "loss": 0.054, "rewards/accuracies": 1.0, "rewards/chosen": -9.676605224609375, "rewards/margins": 27.159587860107422, "rewards/rejected": -36.8361930847168, "step": 3582 }, { "epoch": 2.2289269051321927, "grad_norm": 4.475971698760986, "learning_rate": 1.4269248501613646e-06, "logits/chosen": 2.221020460128784, "logits/rejected": 2.2579550743103027, "logps/chosen": -595.5123901367188, "logps/rejected": -788.226318359375, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": -8.11880111694336, "rewards/margins": 20.095670700073242, "rewards/rejected": -28.21446990966797, "step": 3583 }, { "epoch": 2.2295489891135305, "grad_norm": 8.492868630582961e-08, "learning_rate": 1.42577224527432e-06, "logits/chosen": 3.062131881713867, "logits/rejected": 3.5542662143707275, "logps/chosen": -652.6409301757812, "logps/rejected": -1119.76123046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.757875442504883, "rewards/margins": 34.784400939941406, "rewards/rejected": -46.542274475097656, "step": 3584 }, { "epoch": 2.230171073094868, "grad_norm": 0.07185564935207367, "learning_rate": 1.4246196403872753e-06, "logits/chosen": 0.11531239748001099, "logits/rejected": 3.3846631050109863, "logps/chosen": -489.6289978027344, "logps/rejected": -867.05078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.214015007019043, "rewards/margins": 20.78061294555664, "rewards/rejected": -28.994626998901367, "step": 3585 }, { "epoch": 2.230793157076205, "grad_norm": 24.446683883666992, "learning_rate": 1.4234670355002305e-06, "logits/chosen": 0.3510545492172241, "logits/rejected": 3.017117500305176, "logps/chosen": -532.3301391601562, "logps/rejected": -973.9273071289062, "loss": 0.212, "rewards/accuracies": 0.875, "rewards/chosen": -7.277337074279785, "rewards/margins": 32.72923278808594, "rewards/rejected": -40.006568908691406, "step": 3586 }, { "epoch": 2.231415241057543, "grad_norm": 0.00026219518622383475, "learning_rate": 1.422314430613186e-06, "logits/chosen": 0.8821203708648682, "logits/rejected": 3.143800735473633, "logps/chosen": -518.6629638671875, "logps/rejected": -997.9705810546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.694472312927246, "rewards/margins": 35.719844818115234, "rewards/rejected": -46.41431427001953, "step": 3587 }, { "epoch": 2.2320373250388803, "grad_norm": 0.060858700424432755, "learning_rate": 1.4211618257261412e-06, "logits/chosen": 1.9092174768447876, "logits/rejected": 4.185993194580078, "logps/chosen": -546.7835693359375, "logps/rejected": -965.928955078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.56574535369873, "rewards/margins": 29.70265007019043, "rewards/rejected": -39.268394470214844, "step": 3588 }, { "epoch": 2.2326594090202176, "grad_norm": 36.87541580200195, "learning_rate": 1.4200092208390964e-06, "logits/chosen": -1.4777759313583374, "logits/rejected": 3.420780658721924, "logps/chosen": -500.4163818359375, "logps/rejected": -1042.6434326171875, "loss": 0.5542, "rewards/accuracies": 0.875, "rewards/chosen": -6.277176856994629, "rewards/margins": 30.546443939208984, "rewards/rejected": -36.82362365722656, "step": 3589 }, { "epoch": 2.233281493001555, "grad_norm": 0.6003460884094238, "learning_rate": 1.4188566159520516e-06, "logits/chosen": -0.2000950276851654, "logits/rejected": 3.9854304790496826, "logps/chosen": -382.73809814453125, "logps/rejected": -948.181640625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -4.3645100593566895, "rewards/margins": 31.50813865661621, "rewards/rejected": -35.872650146484375, "step": 3590 }, { "epoch": 2.2339035769828928, "grad_norm": 0.3768901526927948, "learning_rate": 1.417704011065007e-06, "logits/chosen": -3.7208173274993896, "logits/rejected": -0.07973974943161011, "logps/chosen": -310.0085754394531, "logps/rejected": -711.6843872070312, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.959542274475098, "rewards/margins": 24.725475311279297, "rewards/rejected": -29.685020446777344, "step": 3591 }, { "epoch": 2.23452566096423, "grad_norm": 3.6291356086730957, "learning_rate": 1.4165514061779623e-06, "logits/chosen": -1.0591273307800293, "logits/rejected": 1.9402902126312256, "logps/chosen": -393.0709533691406, "logps/rejected": -802.1497802734375, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -8.485906600952148, "rewards/margins": 24.194093704223633, "rewards/rejected": -32.68000030517578, "step": 3592 }, { "epoch": 2.2351477449455674, "grad_norm": 0.09102648496627808, "learning_rate": 1.4153988012909175e-06, "logits/chosen": 0.13417690992355347, "logits/rejected": 3.4707531929016113, "logps/chosen": -512.4104614257812, "logps/rejected": -935.8575439453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -13.069864273071289, "rewards/margins": 24.324996948242188, "rewards/rejected": -37.394859313964844, "step": 3593 }, { "epoch": 2.2357698289269052, "grad_norm": 9.27643632167019e-05, "learning_rate": 1.4142461964038727e-06, "logits/chosen": 0.36809054017066956, "logits/rejected": 4.21954345703125, "logps/chosen": -596.3086547851562, "logps/rejected": -1222.6021728515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.957084655761719, "rewards/margins": 40.438499450683594, "rewards/rejected": -50.39558029174805, "step": 3594 }, { "epoch": 2.2363919129082426, "grad_norm": 0.04157000035047531, "learning_rate": 1.4130935915168282e-06, "logits/chosen": 1.3686137199401855, "logits/rejected": 3.7084567546844482, "logps/chosen": -624.705322265625, "logps/rejected": -1096.7337646484375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.065147399902344, "rewards/margins": 31.175518035888672, "rewards/rejected": -39.24066162109375, "step": 3595 }, { "epoch": 2.23701399688958, "grad_norm": 0.00014478390221484005, "learning_rate": 1.4119409866297834e-06, "logits/chosen": 2.2126994132995605, "logits/rejected": 3.524190902709961, "logps/chosen": -674.3311157226562, "logps/rejected": -1038.1907958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.26594352722168, "rewards/margins": 28.325271606445312, "rewards/rejected": -39.591217041015625, "step": 3596 }, { "epoch": 2.2376360808709177, "grad_norm": 0.001759856822900474, "learning_rate": 1.4107883817427386e-06, "logits/chosen": 0.9845359921455383, "logits/rejected": 2.5849828720092773, "logps/chosen": -755.294189453125, "logps/rejected": -1175.09912109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.71370792388916, "rewards/margins": 28.954248428344727, "rewards/rejected": -44.6679573059082, "step": 3597 }, { "epoch": 2.238258164852255, "grad_norm": 1.3494692439053324e-06, "learning_rate": 1.409635776855694e-06, "logits/chosen": 2.1158127784729004, "logits/rejected": 3.403364419937134, "logps/chosen": -700.1041259765625, "logps/rejected": -1197.26611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.83034610748291, "rewards/margins": 38.487022399902344, "rewards/rejected": -49.31736755371094, "step": 3598 }, { "epoch": 2.2388802488335924, "grad_norm": 2.0819742679595947, "learning_rate": 1.4084831719686493e-06, "logits/chosen": -2.9958994388580322, "logits/rejected": 2.014313220977783, "logps/chosen": -350.19500732421875, "logps/rejected": -974.436279296875, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -6.576704025268555, "rewards/margins": 32.14638900756836, "rewards/rejected": -38.72309494018555, "step": 3599 }, { "epoch": 2.23950233281493, "grad_norm": 0.08503194898366928, "learning_rate": 1.4073305670816045e-06, "logits/chosen": 3.557544708251953, "logits/rejected": 4.321353435516357, "logps/chosen": -751.0997314453125, "logps/rejected": -1050.1031494140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -12.91750431060791, "rewards/margins": 23.742610931396484, "rewards/rejected": -36.66011428833008, "step": 3600 }, { "epoch": 2.2401244167962675, "grad_norm": 0.005394472740590572, "learning_rate": 1.4061779621945597e-06, "logits/chosen": 1.0073996782302856, "logits/rejected": 2.4288978576660156, "logps/chosen": -554.6205444335938, "logps/rejected": -939.342041015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.75421142578125, "rewards/margins": 23.233583450317383, "rewards/rejected": -34.98779296875, "step": 3601 }, { "epoch": 2.240746500777605, "grad_norm": 2.5468521736016214e-10, "learning_rate": 1.4050253573075152e-06, "logits/chosen": -0.81147301197052, "logits/rejected": 1.3437249660491943, "logps/chosen": -541.517822265625, "logps/rejected": -1117.1673583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.313809394836426, "rewards/margins": 40.41972732543945, "rewards/rejected": -49.73353576660156, "step": 3602 }, { "epoch": 2.2413685847589426, "grad_norm": 40.08323669433594, "learning_rate": 1.4038727524204704e-06, "logits/chosen": -2.013843297958374, "logits/rejected": 2.682800054550171, "logps/chosen": -492.3447570800781, "logps/rejected": -1076.1142578125, "loss": 0.6501, "rewards/accuracies": 0.875, "rewards/chosen": -6.914434909820557, "rewards/margins": 30.04220962524414, "rewards/rejected": -36.95664978027344, "step": 3603 }, { "epoch": 2.24199066874028, "grad_norm": 1.0822905904817048e-09, "learning_rate": 1.4027201475334256e-06, "logits/chosen": -3.263639450073242, "logits/rejected": 1.29646897315979, "logps/chosen": -425.00909423828125, "logps/rejected": -1020.944091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.435742378234863, "rewards/margins": 41.301151275634766, "rewards/rejected": -48.73689270019531, "step": 3604 }, { "epoch": 2.2426127527216173, "grad_norm": 6.468580722808838, "learning_rate": 1.4015675426463808e-06, "logits/chosen": 1.4337022304534912, "logits/rejected": 2.421602249145508, "logps/chosen": -648.6536254882812, "logps/rejected": -909.980224609375, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": -11.684656143188477, "rewards/margins": 25.83026885986328, "rewards/rejected": -37.514923095703125, "step": 3605 }, { "epoch": 2.243234836702955, "grad_norm": 0.004268967546522617, "learning_rate": 1.4004149377593363e-06, "logits/chosen": -1.3249869346618652, "logits/rejected": 2.7571754455566406, "logps/chosen": -425.77606201171875, "logps/rejected": -983.8963012695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.982041358947754, "rewards/margins": 32.781829833984375, "rewards/rejected": -40.76387023925781, "step": 3606 }, { "epoch": 2.2438569206842924, "grad_norm": 0.02053695172071457, "learning_rate": 1.3992623328722915e-06, "logits/chosen": 3.154069423675537, "logits/rejected": 2.9946210384368896, "logps/chosen": -675.71484375, "logps/rejected": -986.4381713867188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.358978271484375, "rewards/margins": 27.62502670288086, "rewards/rejected": -42.9840087890625, "step": 3607 }, { "epoch": 2.2444790046656298, "grad_norm": 0.036014728248119354, "learning_rate": 1.3981097279852467e-06, "logits/chosen": 0.1260930299758911, "logits/rejected": 3.830151081085205, "logps/chosen": -530.7362670898438, "logps/rejected": -1056.0645751953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.860038757324219, "rewards/margins": 32.02716064453125, "rewards/rejected": -41.88720703125, "step": 3608 }, { "epoch": 2.245101088646967, "grad_norm": 0.06608090549707413, "learning_rate": 1.3969571230982022e-06, "logits/chosen": 1.1909940242767334, "logits/rejected": 3.821021556854248, "logps/chosen": -635.0616455078125, "logps/rejected": -1033.9156494140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -16.48543930053711, "rewards/margins": 23.562484741210938, "rewards/rejected": -40.04792785644531, "step": 3609 }, { "epoch": 2.245723172628305, "grad_norm": 0.004007376730442047, "learning_rate": 1.3958045182111574e-06, "logits/chosen": -1.2943001985549927, "logits/rejected": 2.278123378753662, "logps/chosen": -464.9049072265625, "logps/rejected": -1044.6634521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.284080505371094, "rewards/margins": 35.11205291748047, "rewards/rejected": -43.39613342285156, "step": 3610 }, { "epoch": 2.2463452566096422, "grad_norm": 4.8539391173108015e-06, "learning_rate": 1.3946519133241126e-06, "logits/chosen": 1.5299954414367676, "logits/rejected": 3.668931722640991, "logps/chosen": -711.9598999023438, "logps/rejected": -1167.881103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.675633430480957, "rewards/margins": 41.793670654296875, "rewards/rejected": -52.46930694580078, "step": 3611 }, { "epoch": 2.2469673405909796, "grad_norm": 0.10625211149454117, "learning_rate": 1.3934993084370678e-06, "logits/chosen": 0.4892570972442627, "logits/rejected": 3.181586503982544, "logps/chosen": -609.27490234375, "logps/rejected": -1167.6185302734375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -10.308204650878906, "rewards/margins": 34.49951171875, "rewards/rejected": -44.80772018432617, "step": 3612 }, { "epoch": 2.2475894245723174, "grad_norm": 0.0002764218661468476, "learning_rate": 1.3923467035500233e-06, "logits/chosen": -0.6855688095092773, "logits/rejected": 4.369061470031738, "logps/chosen": -483.9892883300781, "logps/rejected": -1195.517333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.535987854003906, "rewards/margins": 36.80906295776367, "rewards/rejected": -47.345054626464844, "step": 3613 }, { "epoch": 2.2482115085536547, "grad_norm": 0.04360436648130417, "learning_rate": 1.3911940986629785e-06, "logits/chosen": -1.0055224895477295, "logits/rejected": 1.2355737686157227, "logps/chosen": -570.8172607421875, "logps/rejected": -991.0896606445312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -14.466044425964355, "rewards/margins": 27.260345458984375, "rewards/rejected": -41.72639083862305, "step": 3614 }, { "epoch": 2.248833592534992, "grad_norm": 0.01460373867303133, "learning_rate": 1.3900414937759337e-06, "logits/chosen": 2.5443148612976074, "logits/rejected": 3.125467300415039, "logps/chosen": -656.630859375, "logps/rejected": -941.162841796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.8216552734375, "rewards/margins": 26.766983032226562, "rewards/rejected": -35.58863830566406, "step": 3615 }, { "epoch": 2.24945567651633, "grad_norm": 0.040251053869724274, "learning_rate": 1.3888888888888892e-06, "logits/chosen": 1.715376853942871, "logits/rejected": 2.761099338531494, "logps/chosen": -572.9150390625, "logps/rejected": -828.9449462890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.75190544128418, "rewards/margins": 23.62649917602539, "rewards/rejected": -34.37840270996094, "step": 3616 }, { "epoch": 2.250077760497667, "grad_norm": 1.6713193329298548e-10, "learning_rate": 1.3877362840018444e-06, "logits/chosen": 0.12911105155944824, "logits/rejected": 3.3643741607666016, "logps/chosen": -502.5730285644531, "logps/rejected": -1142.19482421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.952110290527344, "rewards/margins": 42.303550720214844, "rewards/rejected": -52.25566482543945, "step": 3617 }, { "epoch": 2.2506998444790045, "grad_norm": 13.921367645263672, "learning_rate": 1.3865836791147996e-06, "logits/chosen": -0.6900714635848999, "logits/rejected": 3.1058475971221924, "logps/chosen": -525.9616088867188, "logps/rejected": -1013.4569091796875, "loss": 0.0692, "rewards/accuracies": 1.0, "rewards/chosen": -9.485834121704102, "rewards/margins": 29.015771865844727, "rewards/rejected": -38.50160598754883, "step": 3618 }, { "epoch": 2.2513219284603423, "grad_norm": 0.06612343341112137, "learning_rate": 1.3854310742277548e-06, "logits/chosen": 0.1767033338546753, "logits/rejected": 3.644655466079712, "logps/chosen": -579.5123291015625, "logps/rejected": -1140.45263671875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.06546401977539, "rewards/margins": 27.96875, "rewards/rejected": -39.034217834472656, "step": 3619 }, { "epoch": 2.2519440124416796, "grad_norm": 0.0021795639768242836, "learning_rate": 1.3842784693407103e-06, "logits/chosen": -0.03955802321434021, "logits/rejected": 3.334292411804199, "logps/chosen": -396.38818359375, "logps/rejected": -994.3783569335938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.11500072479248, "rewards/margins": 34.99541473388672, "rewards/rejected": -45.11041259765625, "step": 3620 }, { "epoch": 2.252566096423017, "grad_norm": 5.8073277614312246e-05, "learning_rate": 1.3831258644536655e-06, "logits/chosen": 1.6798940896987915, "logits/rejected": 3.420667886734009, "logps/chosen": -564.4171752929688, "logps/rejected": -992.0733642578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.620368003845215, "rewards/margins": 30.430877685546875, "rewards/rejected": -43.051246643066406, "step": 3621 }, { "epoch": 2.2531881804043548, "grad_norm": 0.0003387883771210909, "learning_rate": 1.3819732595666207e-06, "logits/chosen": -2.5185070037841797, "logits/rejected": 2.5935957431793213, "logps/chosen": -357.0126647949219, "logps/rejected": -965.9202880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.179485321044922, "rewards/margins": 34.00236511230469, "rewards/rejected": -41.181846618652344, "step": 3622 }, { "epoch": 2.253810264385692, "grad_norm": 0.001127618015743792, "learning_rate": 1.380820654679576e-06, "logits/chosen": 1.6329503059387207, "logits/rejected": 4.3267927169799805, "logps/chosen": -644.5361328125, "logps/rejected": -1160.6734619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.97283935546875, "rewards/margins": 32.02046203613281, "rewards/rejected": -41.99330520629883, "step": 3623 }, { "epoch": 2.2544323483670294, "grad_norm": 0.0018201852217316628, "learning_rate": 1.3796680497925314e-06, "logits/chosen": -1.341170072555542, "logits/rejected": 3.551229476928711, "logps/chosen": -598.9361572265625, "logps/rejected": -1234.8564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.985949516296387, "rewards/margins": 32.776268005371094, "rewards/rejected": -43.76221466064453, "step": 3624 }, { "epoch": 2.255054432348367, "grad_norm": 0.0016448667738586664, "learning_rate": 1.3785154449054866e-06, "logits/chosen": 0.6106576919555664, "logits/rejected": 1.8452892303466797, "logps/chosen": -522.1275024414062, "logps/rejected": -885.4285888671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.987752437591553, "rewards/margins": 24.238380432128906, "rewards/rejected": -31.226133346557617, "step": 3625 }, { "epoch": 2.2556765163297046, "grad_norm": 0.018660522997379303, "learning_rate": 1.3773628400184418e-06, "logits/chosen": -1.0240919589996338, "logits/rejected": 1.6847429275512695, "logps/chosen": -570.3428955078125, "logps/rejected": -1055.249755859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.822690963745117, "rewards/margins": 32.19108581542969, "rewards/rejected": -45.01377487182617, "step": 3626 }, { "epoch": 2.256298600311042, "grad_norm": 0.037954483181238174, "learning_rate": 1.3762102351313973e-06, "logits/chosen": 1.0277810096740723, "logits/rejected": 1.5962250232696533, "logps/chosen": -531.65966796875, "logps/rejected": -910.9324951171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.959372520446777, "rewards/margins": 29.962890625, "rewards/rejected": -35.922264099121094, "step": 3627 }, { "epoch": 2.2569206842923792, "grad_norm": 0.0028481753543019295, "learning_rate": 1.3750576302443525e-06, "logits/chosen": 1.854551076889038, "logits/rejected": 2.4571213722229004, "logps/chosen": -669.2379150390625, "logps/rejected": -1053.6103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.859801292419434, "rewards/margins": 27.96354866027832, "rewards/rejected": -38.82334899902344, "step": 3628 }, { "epoch": 2.257542768273717, "grad_norm": 0.03748118877410889, "learning_rate": 1.3739050253573077e-06, "logits/chosen": -2.21651029586792, "logits/rejected": 1.614790678024292, "logps/chosen": -392.2901611328125, "logps/rejected": -836.7474365234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.908618927001953, "rewards/margins": 18.771080017089844, "rewards/rejected": -26.679697036743164, "step": 3629 }, { "epoch": 2.2581648522550544, "grad_norm": 0.24403417110443115, "learning_rate": 1.372752420470263e-06, "logits/chosen": -0.4138451814651489, "logits/rejected": 2.3937408924102783, "logps/chosen": -594.5587768554688, "logps/rejected": -991.3351440429688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -14.23100471496582, "rewards/margins": 25.721485137939453, "rewards/rejected": -39.95248794555664, "step": 3630 }, { "epoch": 2.258786936236392, "grad_norm": 1.4337886568682734e-05, "learning_rate": 1.3715998155832184e-06, "logits/chosen": 0.9790940284729004, "logits/rejected": 3.165156126022339, "logps/chosen": -626.1618041992188, "logps/rejected": -1086.665283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.662452697753906, "rewards/margins": 33.48451232910156, "rewards/rejected": -48.146968841552734, "step": 3631 }, { "epoch": 2.2594090202177295, "grad_norm": 0.0001606412697583437, "learning_rate": 1.3704472106961736e-06, "logits/chosen": 0.9021769762039185, "logits/rejected": 2.9399120807647705, "logps/chosen": -685.17138671875, "logps/rejected": -1164.349853515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.966161727905273, "rewards/margins": 27.477081298828125, "rewards/rejected": -38.44324493408203, "step": 3632 }, { "epoch": 2.260031104199067, "grad_norm": 0.002752943430095911, "learning_rate": 1.3692946058091288e-06, "logits/chosen": 0.6641960144042969, "logits/rejected": 2.6874494552612305, "logps/chosen": -600.1417236328125, "logps/rejected": -970.0240478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.83272933959961, "rewards/margins": 30.065876007080078, "rewards/rejected": -40.89860534667969, "step": 3633 }, { "epoch": 2.260653188180404, "grad_norm": 0.00016964755195658654, "learning_rate": 1.368142000922084e-06, "logits/chosen": -1.6499991416931152, "logits/rejected": 2.953476667404175, "logps/chosen": -455.2291259765625, "logps/rejected": -1102.061279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.72672176361084, "rewards/margins": 37.105812072753906, "rewards/rejected": -45.83253479003906, "step": 3634 }, { "epoch": 2.261275272161742, "grad_norm": 6.942920549590781e-07, "learning_rate": 1.366989396035039e-06, "logits/chosen": -0.9813140034675598, "logits/rejected": 2.0054101943969727, "logps/chosen": -452.45037841796875, "logps/rejected": -1088.82275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.222711563110352, "rewards/margins": 38.482872009277344, "rewards/rejected": -46.70558547973633, "step": 3635 }, { "epoch": 2.2618973561430793, "grad_norm": 46.79713439941406, "learning_rate": 1.3658367911479945e-06, "logits/chosen": 1.9303909540176392, "logits/rejected": 3.462801456451416, "logps/chosen": -649.6270751953125, "logps/rejected": -1096.564453125, "loss": 0.6223, "rewards/accuracies": 0.875, "rewards/chosen": -11.076848983764648, "rewards/margins": 28.809837341308594, "rewards/rejected": -39.88668441772461, "step": 3636 }, { "epoch": 2.2625194401244166, "grad_norm": 0.006422718986868858, "learning_rate": 1.3646841862609497e-06, "logits/chosen": 0.4042189121246338, "logits/rejected": 3.6927623748779297, "logps/chosen": -472.4306335449219, "logps/rejected": -1149.659423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.388535499572754, "rewards/margins": 36.08953094482422, "rewards/rejected": -44.47806167602539, "step": 3637 }, { "epoch": 2.2631415241057544, "grad_norm": 0.04757959023118019, "learning_rate": 1.363531581373905e-06, "logits/chosen": 1.9163646697998047, "logits/rejected": 2.9809160232543945, "logps/chosen": -683.015625, "logps/rejected": -975.9616088867188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.581829071044922, "rewards/margins": 26.310970306396484, "rewards/rejected": -35.892799377441406, "step": 3638 }, { "epoch": 2.2637636080870918, "grad_norm": 2.0798819605261087e-05, "learning_rate": 1.3623789764868604e-06, "logits/chosen": -2.256040096282959, "logits/rejected": -0.2808629274368286, "logps/chosen": -461.394775390625, "logps/rejected": -837.761962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.2075300216674805, "rewards/margins": 27.08609962463379, "rewards/rejected": -34.29363250732422, "step": 3639 }, { "epoch": 2.264385692068429, "grad_norm": 0.0005983648006804287, "learning_rate": 1.3612263715998156e-06, "logits/chosen": 1.4837771654129028, "logits/rejected": 2.5212080478668213, "logps/chosen": -584.5574951171875, "logps/rejected": -896.6384887695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.758228302001953, "rewards/margins": 25.834224700927734, "rewards/rejected": -35.59245300292969, "step": 3640 }, { "epoch": 2.265007776049767, "grad_norm": 0.01774199865758419, "learning_rate": 1.3600737667127708e-06, "logits/chosen": 1.4264962673187256, "logits/rejected": 1.4264075756072998, "logps/chosen": -644.2673950195312, "logps/rejected": -845.5146484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.36091423034668, "rewards/margins": 20.632631301879883, "rewards/rejected": -30.99354362487793, "step": 3641 }, { "epoch": 2.2656298600311042, "grad_norm": 0.09057408571243286, "learning_rate": 1.358921161825726e-06, "logits/chosen": 0.2792913317680359, "logits/rejected": 2.1801884174346924, "logps/chosen": -623.1153564453125, "logps/rejected": -980.2783203125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -10.632190704345703, "rewards/margins": 21.55695343017578, "rewards/rejected": -32.189144134521484, "step": 3642 }, { "epoch": 2.2662519440124416, "grad_norm": 0.025631356984376907, "learning_rate": 1.3577685569386815e-06, "logits/chosen": 0.3164311647415161, "logits/rejected": 3.2249999046325684, "logps/chosen": -546.7294921875, "logps/rejected": -909.2017211914062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.281386852264404, "rewards/margins": 24.865476608276367, "rewards/rejected": -32.1468620300293, "step": 3643 }, { "epoch": 2.2668740279937794, "grad_norm": 23.9825439453125, "learning_rate": 1.3566159520516367e-06, "logits/chosen": 1.6409586668014526, "logits/rejected": 2.854196786880493, "logps/chosen": -668.395263671875, "logps/rejected": -926.1879272460938, "loss": 0.1152, "rewards/accuracies": 0.875, "rewards/chosen": -12.461042404174805, "rewards/margins": 21.571842193603516, "rewards/rejected": -34.03288269042969, "step": 3644 }, { "epoch": 2.2674961119751167, "grad_norm": 0.14037181437015533, "learning_rate": 1.355463347164592e-06, "logits/chosen": -1.410860300064087, "logits/rejected": 3.208120822906494, "logps/chosen": -488.7926330566406, "logps/rejected": -946.1668701171875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -11.458749771118164, "rewards/margins": 26.631572723388672, "rewards/rejected": -38.09032440185547, "step": 3645 }, { "epoch": 2.268118195956454, "grad_norm": 0.25334399938583374, "learning_rate": 1.3543107422775472e-06, "logits/chosen": -1.4088850021362305, "logits/rejected": 2.939032554626465, "logps/chosen": -360.3502502441406, "logps/rejected": -973.505859375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -9.68812084197998, "rewards/margins": 29.315006256103516, "rewards/rejected": -39.00312423706055, "step": 3646 }, { "epoch": 2.2687402799377914, "grad_norm": 0.15394827723503113, "learning_rate": 1.3531581373905026e-06, "logits/chosen": 0.8405669331550598, "logits/rejected": 3.7781574726104736, "logps/chosen": -464.24200439453125, "logps/rejected": -895.093505859375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -10.683003425598145, "rewards/margins": 27.76809310913086, "rewards/rejected": -38.45109939575195, "step": 3647 }, { "epoch": 2.269362363919129, "grad_norm": 0.17489789426326752, "learning_rate": 1.3520055325034578e-06, "logits/chosen": -1.095194697380066, "logits/rejected": 1.75686776638031, "logps/chosen": -472.18212890625, "logps/rejected": -951.9043579101562, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.827794075012207, "rewards/margins": 31.027143478393555, "rewards/rejected": -38.85493850708008, "step": 3648 }, { "epoch": 2.2699844479004665, "grad_norm": 0.0005933581851422787, "learning_rate": 1.350852927616413e-06, "logits/chosen": -2.832789659500122, "logits/rejected": 2.9136810302734375, "logps/chosen": -395.674560546875, "logps/rejected": -1046.361572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.632064819335938, "rewards/margins": 30.736907958984375, "rewards/rejected": -40.36897659301758, "step": 3649 }, { "epoch": 2.2706065318818043, "grad_norm": 3.9638190269470215, "learning_rate": 1.3497003227293685e-06, "logits/chosen": 0.07114283740520477, "logits/rejected": 3.7599639892578125, "logps/chosen": -526.3443603515625, "logps/rejected": -1021.1272583007812, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -10.873167991638184, "rewards/margins": 27.13154411315918, "rewards/rejected": -38.00471496582031, "step": 3650 }, { "epoch": 2.2712286158631416, "grad_norm": 2.231683083664393e-06, "learning_rate": 1.3485477178423237e-06, "logits/chosen": -0.4136146306991577, "logits/rejected": 3.6203694343566895, "logps/chosen": -429.990234375, "logps/rejected": -1029.20166015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.910269737243652, "rewards/margins": 41.052101135253906, "rewards/rejected": -48.96236801147461, "step": 3651 }, { "epoch": 2.271850699844479, "grad_norm": 2.4136397769325413e-05, "learning_rate": 1.347395112955279e-06, "logits/chosen": -1.157287836074829, "logits/rejected": 1.3219666481018066, "logps/chosen": -509.8829040527344, "logps/rejected": -1024.821044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.58205795288086, "rewards/margins": 38.33229064941406, "rewards/rejected": -47.91435241699219, "step": 3652 }, { "epoch": 2.2724727838258163, "grad_norm": 2.5297253181122414e-10, "learning_rate": 1.3462425080682342e-06, "logits/chosen": 1.1511964797973633, "logits/rejected": 3.777742385864258, "logps/chosen": -628.3382568359375, "logps/rejected": -1225.38427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.839183807373047, "rewards/margins": 38.61802673339844, "rewards/rejected": -48.45721435546875, "step": 3653 }, { "epoch": 2.273094867807154, "grad_norm": 36.658748626708984, "learning_rate": 1.3450899031811896e-06, "logits/chosen": 1.0936846733093262, "logits/rejected": 2.0753893852233887, "logps/chosen": -478.0395812988281, "logps/rejected": -874.01025390625, "loss": 0.7303, "rewards/accuracies": 0.875, "rewards/chosen": -10.23668098449707, "rewards/margins": 24.970584869384766, "rewards/rejected": -35.20726776123047, "step": 3654 }, { "epoch": 2.2737169517884914, "grad_norm": 0.8795388340950012, "learning_rate": 1.3439372982941448e-06, "logits/chosen": -0.4980417788028717, "logits/rejected": 2.420764684677124, "logps/chosen": -432.43310546875, "logps/rejected": -798.7457275390625, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -11.809070587158203, "rewards/margins": 18.36756706237793, "rewards/rejected": -30.176637649536133, "step": 3655 }, { "epoch": 2.2743390357698288, "grad_norm": 2.9704248905181885, "learning_rate": 1.3427846934071e-06, "logits/chosen": 1.417180061340332, "logits/rejected": 3.072629690170288, "logps/chosen": -572.999755859375, "logps/rejected": -979.4898681640625, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -7.005555629730225, "rewards/margins": 30.64610481262207, "rewards/rejected": -37.65166091918945, "step": 3656 }, { "epoch": 2.2749611197511665, "grad_norm": 0.036785803735256195, "learning_rate": 1.3416320885200553e-06, "logits/chosen": -2.2123665809631348, "logits/rejected": 1.8106573820114136, "logps/chosen": -414.67205810546875, "logps/rejected": -904.7940673828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.934468746185303, "rewards/margins": 26.992454528808594, "rewards/rejected": -31.926923751831055, "step": 3657 }, { "epoch": 2.275583203732504, "grad_norm": 6.124455451965332, "learning_rate": 1.3404794836330107e-06, "logits/chosen": 2.324453830718994, "logits/rejected": 3.3866355419158936, "logps/chosen": -636.7662963867188, "logps/rejected": -936.2508544921875, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": -8.091182708740234, "rewards/margins": 23.152149200439453, "rewards/rejected": -31.243331909179688, "step": 3658 }, { "epoch": 2.2762052877138412, "grad_norm": 51.40892028808594, "learning_rate": 1.339326878745966e-06, "logits/chosen": -0.03686082363128662, "logits/rejected": 4.1776814460754395, "logps/chosen": -506.9464416503906, "logps/rejected": -916.13818359375, "loss": 2.0112, "rewards/accuracies": 0.875, "rewards/chosen": -11.185665130615234, "rewards/margins": 27.882423400878906, "rewards/rejected": -39.06808853149414, "step": 3659 }, { "epoch": 2.276827371695179, "grad_norm": 0.000569489726331085, "learning_rate": 1.3381742738589212e-06, "logits/chosen": 0.07180686295032501, "logits/rejected": 4.02716588973999, "logps/chosen": -540.2124633789062, "logps/rejected": -1095.65576171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.912123680114746, "rewards/margins": 28.8727970123291, "rewards/rejected": -42.7849235534668, "step": 3660 }, { "epoch": 2.2774494556765164, "grad_norm": 13.124598503112793, "learning_rate": 1.3370216689718766e-06, "logits/chosen": 0.4328814744949341, "logits/rejected": 3.6302595138549805, "logps/chosen": -453.68194580078125, "logps/rejected": -852.2915649414062, "loss": 0.054, "rewards/accuracies": 1.0, "rewards/chosen": -6.634185791015625, "rewards/margins": 22.16658592224121, "rewards/rejected": -28.800771713256836, "step": 3661 }, { "epoch": 2.2780715396578537, "grad_norm": 0.0037217868957668543, "learning_rate": 1.3358690640848318e-06, "logits/chosen": 0.699317216873169, "logits/rejected": 4.026709079742432, "logps/chosen": -473.6370544433594, "logps/rejected": -785.733642578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.310136318206787, "rewards/margins": 21.60482406616211, "rewards/rejected": -28.914958953857422, "step": 3662 }, { "epoch": 2.2786936236391915, "grad_norm": 0.5339617729187012, "learning_rate": 1.334716459197787e-06, "logits/chosen": 1.6939356327056885, "logits/rejected": 3.958897590637207, "logps/chosen": -640.4788818359375, "logps/rejected": -973.7042846679688, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -8.455921173095703, "rewards/margins": 20.49496078491211, "rewards/rejected": -28.950881958007812, "step": 3663 }, { "epoch": 2.279315707620529, "grad_norm": 20.766817092895508, "learning_rate": 1.3335638543107423e-06, "logits/chosen": 0.04786163568496704, "logits/rejected": 4.414052486419678, "logps/chosen": -427.1124267578125, "logps/rejected": -1074.2518310546875, "loss": 0.0934, "rewards/accuracies": 0.875, "rewards/chosen": -9.711767196655273, "rewards/margins": 34.70350646972656, "rewards/rejected": -44.4152717590332, "step": 3664 }, { "epoch": 2.279937791601866, "grad_norm": 0.001962431240826845, "learning_rate": 1.3324112494236977e-06, "logits/chosen": 1.640702247619629, "logits/rejected": 2.41336989402771, "logps/chosen": -536.8012084960938, "logps/rejected": -902.0858154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.044398784637451, "rewards/margins": 28.664670944213867, "rewards/rejected": -33.70907211303711, "step": 3665 }, { "epoch": 2.2805598755832035, "grad_norm": 0.03671610355377197, "learning_rate": 1.331258644536653e-06, "logits/chosen": -1.8564445972442627, "logits/rejected": 0.5851628184318542, "logps/chosen": -477.62872314453125, "logps/rejected": -787.21337890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.753632545471191, "rewards/margins": 22.151416778564453, "rewards/rejected": -31.905048370361328, "step": 3666 }, { "epoch": 2.2811819595645413, "grad_norm": 0.02714928612112999, "learning_rate": 1.3301060396496082e-06, "logits/chosen": -1.0161186456680298, "logits/rejected": 1.8596889972686768, "logps/chosen": -418.14984130859375, "logps/rejected": -873.814697265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.727672576904297, "rewards/margins": 25.745969772338867, "rewards/rejected": -32.47364044189453, "step": 3667 }, { "epoch": 2.2818040435458786, "grad_norm": 0.0004676782409660518, "learning_rate": 1.3289534347625634e-06, "logits/chosen": -0.7326065301895142, "logits/rejected": 2.5730059146881104, "logps/chosen": -493.71246337890625, "logps/rejected": -933.0047607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.3746919631958, "rewards/margins": 27.02305030822754, "rewards/rejected": -36.397743225097656, "step": 3668 }, { "epoch": 2.2824261275272164, "grad_norm": 2.2877783578678645e-07, "learning_rate": 1.3278008298755188e-06, "logits/chosen": -0.7244665026664734, "logits/rejected": 3.4912476539611816, "logps/chosen": -472.2742004394531, "logps/rejected": -1081.2777099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.408008575439453, "rewards/margins": 34.40707778930664, "rewards/rejected": -43.815086364746094, "step": 3669 }, { "epoch": 2.2830482115085537, "grad_norm": 4.795114705302694e-07, "learning_rate": 1.326648224988474e-06, "logits/chosen": -0.1821054220199585, "logits/rejected": 3.4906506538391113, "logps/chosen": -430.23553466796875, "logps/rejected": -934.4568481445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.48881721496582, "rewards/margins": 31.01263427734375, "rewards/rejected": -39.50144958496094, "step": 3670 }, { "epoch": 2.283670295489891, "grad_norm": 0.00012371873890515417, "learning_rate": 1.3254956201014293e-06, "logits/chosen": -1.8504576683044434, "logits/rejected": 3.2555201053619385, "logps/chosen": -415.14703369140625, "logps/rejected": -1054.1136474609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.276524543762207, "rewards/margins": 35.696468353271484, "rewards/rejected": -37.972991943359375, "step": 3671 }, { "epoch": 2.2842923794712284, "grad_norm": 0.0600084587931633, "learning_rate": 1.3243430152143847e-06, "logits/chosen": 2.238312244415283, "logits/rejected": 2.5055103302001953, "logps/chosen": -765.4758911132812, "logps/rejected": -1044.9024658203125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -16.58240509033203, "rewards/margins": 22.25725555419922, "rewards/rejected": -38.83966064453125, "step": 3672 }, { "epoch": 2.284914463452566, "grad_norm": 5.0989089012146, "learning_rate": 1.32319041032734e-06, "logits/chosen": 1.9975833892822266, "logits/rejected": 3.660245180130005, "logps/chosen": -602.4431762695312, "logps/rejected": -978.6756591796875, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": -9.570245742797852, "rewards/margins": 28.15796661376953, "rewards/rejected": -37.728214263916016, "step": 3673 }, { "epoch": 2.2855365474339036, "grad_norm": 0.001256530056707561, "learning_rate": 1.3220378054402952e-06, "logits/chosen": -0.7467068433761597, "logits/rejected": 1.4600367546081543, "logps/chosen": -402.07843017578125, "logps/rejected": -849.447998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.446656703948975, "rewards/margins": 30.50889015197754, "rewards/rejected": -35.95554733276367, "step": 3674 }, { "epoch": 2.286158631415241, "grad_norm": 0.01776723749935627, "learning_rate": 1.3208852005532504e-06, "logits/chosen": -2.924178123474121, "logits/rejected": 0.3132919669151306, "logps/chosen": -428.3075256347656, "logps/rejected": -909.0307006835938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.815388679504395, "rewards/margins": 30.344745635986328, "rewards/rejected": -39.160133361816406, "step": 3675 }, { "epoch": 2.2867807153965787, "grad_norm": 0.021917715668678284, "learning_rate": 1.3197325956662058e-06, "logits/chosen": -1.6217856407165527, "logits/rejected": 3.1677675247192383, "logps/chosen": -456.10174560546875, "logps/rejected": -1087.2021484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.842765808105469, "rewards/margins": 33.89425277709961, "rewards/rejected": -42.73701477050781, "step": 3676 }, { "epoch": 2.287402799377916, "grad_norm": 37.577537536621094, "learning_rate": 1.318579990779161e-06, "logits/chosen": 0.7215325832366943, "logits/rejected": 3.7547783851623535, "logps/chosen": -687.1287841796875, "logps/rejected": -1236.029541015625, "loss": 0.3868, "rewards/accuracies": 0.875, "rewards/chosen": -10.96998405456543, "rewards/margins": 34.49425506591797, "rewards/rejected": -45.46424102783203, "step": 3677 }, { "epoch": 2.2880248833592534, "grad_norm": 0.00026209407951682806, "learning_rate": 1.3174273858921163e-06, "logits/chosen": -2.5744283199310303, "logits/rejected": 2.5794479846954346, "logps/chosen": -319.0574951171875, "logps/rejected": -996.4590454101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.7528181076049805, "rewards/margins": 33.89647674560547, "rewards/rejected": -38.6492919921875, "step": 3678 }, { "epoch": 2.288646967340591, "grad_norm": 3.9976178811684804e-08, "learning_rate": 1.3162747810050717e-06, "logits/chosen": -1.0795128345489502, "logits/rejected": 2.9608840942382812, "logps/chosen": -441.7596740722656, "logps/rejected": -1100.30078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.616857051849365, "rewards/margins": 30.922849655151367, "rewards/rejected": -37.539703369140625, "step": 3679 }, { "epoch": 2.2892690513219285, "grad_norm": 0.026899321004748344, "learning_rate": 1.315122176118027e-06, "logits/chosen": 1.6019718647003174, "logits/rejected": 3.604966878890991, "logps/chosen": -444.7810363769531, "logps/rejected": -873.5284423828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.15468978881836, "rewards/margins": 30.436019897460938, "rewards/rejected": -38.5907096862793, "step": 3680 }, { "epoch": 2.289891135303266, "grad_norm": 0.000182815216248855, "learning_rate": 1.3139695712309822e-06, "logits/chosen": -2.478875160217285, "logits/rejected": 2.828974962234497, "logps/chosen": -309.87603759765625, "logps/rejected": -1102.597900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.028837203979492, "rewards/margins": 46.795127868652344, "rewards/rejected": -52.82395935058594, "step": 3681 }, { "epoch": 2.2905132192846036, "grad_norm": 17.601627349853516, "learning_rate": 1.3128169663439374e-06, "logits/chosen": 1.8089146614074707, "logits/rejected": 2.5463128089904785, "logps/chosen": -628.1402587890625, "logps/rejected": -938.7257080078125, "loss": 0.1131, "rewards/accuracies": 0.875, "rewards/chosen": -10.305665969848633, "rewards/margins": 21.779991149902344, "rewards/rejected": -32.085655212402344, "step": 3682 }, { "epoch": 2.291135303265941, "grad_norm": 2.2101880858826917e-06, "learning_rate": 1.3116643614568928e-06, "logits/chosen": 0.36230725049972534, "logits/rejected": 3.0662708282470703, "logps/chosen": -495.42108154296875, "logps/rejected": -1016.244873046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6825504302978516, "rewards/margins": 34.046356201171875, "rewards/rejected": -37.72890853881836, "step": 3683 }, { "epoch": 2.2917573872472783, "grad_norm": 6.515670520457206e-06, "learning_rate": 1.310511756569848e-06, "logits/chosen": -1.1638526916503906, "logits/rejected": 3.4499146938323975, "logps/chosen": -362.0567932128906, "logps/rejected": -958.96044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.989096641540527, "rewards/margins": 33.7954216003418, "rewards/rejected": -40.78451919555664, "step": 3684 }, { "epoch": 2.2923794712286156, "grad_norm": 1.5253701803885633e-06, "learning_rate": 1.3093591516828033e-06, "logits/chosen": -0.05208313465118408, "logits/rejected": 4.282369613647461, "logps/chosen": -385.57952880859375, "logps/rejected": -955.6923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.062870979309082, "rewards/margins": 30.697647094726562, "rewards/rejected": -40.760520935058594, "step": 3685 }, { "epoch": 2.2930015552099534, "grad_norm": 2.5477407689322717e-05, "learning_rate": 1.3082065467957585e-06, "logits/chosen": 0.05571731925010681, "logits/rejected": 4.041232585906982, "logps/chosen": -421.4884033203125, "logps/rejected": -906.364013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.396073341369629, "rewards/margins": 27.576976776123047, "rewards/rejected": -32.973052978515625, "step": 3686 }, { "epoch": 2.2936236391912908, "grad_norm": 5.022000550525263e-05, "learning_rate": 1.307053941908714e-06, "logits/chosen": 1.8593449592590332, "logits/rejected": 4.012054443359375, "logps/chosen": -597.22509765625, "logps/rejected": -1045.4674072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.028395652770996, "rewards/margins": 36.970951080322266, "rewards/rejected": -47.99934387207031, "step": 3687 }, { "epoch": 2.2942457231726285, "grad_norm": 53.18421936035156, "learning_rate": 1.3059013370216692e-06, "logits/chosen": -2.442203998565674, "logits/rejected": 2.732808828353882, "logps/chosen": -360.65948486328125, "logps/rejected": -986.3630981445312, "loss": 0.3469, "rewards/accuracies": 0.875, "rewards/chosen": -9.006263732910156, "rewards/margins": 34.367088317871094, "rewards/rejected": -43.37335205078125, "step": 3688 }, { "epoch": 2.294867807153966, "grad_norm": 0.0002621853200253099, "learning_rate": 1.3047487321346244e-06, "logits/chosen": -0.44340795278549194, "logits/rejected": 3.2911224365234375, "logps/chosen": -488.1886901855469, "logps/rejected": -1053.859130859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.731563568115234, "rewards/margins": 30.538379669189453, "rewards/rejected": -39.26993942260742, "step": 3689 }, { "epoch": 2.295489891135303, "grad_norm": 5.142366409301758, "learning_rate": 1.3035961272475798e-06, "logits/chosen": -0.5201451778411865, "logits/rejected": 3.9743244647979736, "logps/chosen": -435.2750244140625, "logps/rejected": -923.180908203125, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -9.475119590759277, "rewards/margins": 20.42343521118164, "rewards/rejected": -29.8985538482666, "step": 3690 }, { "epoch": 2.2961119751166406, "grad_norm": 0.06476642191410065, "learning_rate": 1.302443522360535e-06, "logits/chosen": 0.49854815006256104, "logits/rejected": 3.6201465129852295, "logps/chosen": -443.612060546875, "logps/rejected": -888.561767578125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.8632965087890625, "rewards/margins": 26.700164794921875, "rewards/rejected": -34.56346130371094, "step": 3691 }, { "epoch": 2.2967340590979783, "grad_norm": 5.63675121156848e-06, "learning_rate": 1.3012909174734903e-06, "logits/chosen": -2.861495018005371, "logits/rejected": 3.953035593032837, "logps/chosen": -448.1300048828125, "logps/rejected": -1275.9620361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.200240135192871, "rewards/margins": 41.76849365234375, "rewards/rejected": -48.96873474121094, "step": 3692 }, { "epoch": 2.2973561430793157, "grad_norm": 36.50901794433594, "learning_rate": 1.3001383125864455e-06, "logits/chosen": 1.098271369934082, "logits/rejected": 5.046222686767578, "logps/chosen": -546.1812744140625, "logps/rejected": -1057.7093505859375, "loss": 0.6451, "rewards/accuracies": 0.875, "rewards/chosen": -12.096063613891602, "rewards/margins": 29.526365280151367, "rewards/rejected": -41.6224250793457, "step": 3693 }, { "epoch": 2.297978227060653, "grad_norm": 1.0181839570577722e-05, "learning_rate": 1.298985707699401e-06, "logits/chosen": -1.1529643535614014, "logits/rejected": 2.750061511993408, "logps/chosen": -423.95098876953125, "logps/rejected": -995.2757568359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.522923469543457, "rewards/margins": 31.389129638671875, "rewards/rejected": -39.912052154541016, "step": 3694 }, { "epoch": 2.298600311041991, "grad_norm": 4.717450792668387e-06, "learning_rate": 1.2978331028123562e-06, "logits/chosen": -3.618411064147949, "logits/rejected": 3.5521435737609863, "logps/chosen": -414.3927001953125, "logps/rejected": -1095.0592041015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.211316108703613, "rewards/margins": 30.482646942138672, "rewards/rejected": -37.69396209716797, "step": 3695 }, { "epoch": 2.299222395023328, "grad_norm": 0.20213347673416138, "learning_rate": 1.2966804979253114e-06, "logits/chosen": 1.5525379180908203, "logits/rejected": 3.612203598022461, "logps/chosen": -565.8038940429688, "logps/rejected": -1033.8426513671875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -13.375448226928711, "rewards/margins": 29.46129608154297, "rewards/rejected": -42.83674621582031, "step": 3696 }, { "epoch": 2.2998444790046655, "grad_norm": 0.07893336564302444, "learning_rate": 1.2955278930382666e-06, "logits/chosen": 0.6578959226608276, "logits/rejected": 3.951193332672119, "logps/chosen": -583.0952758789062, "logps/rejected": -1030.0029296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.314534187316895, "rewards/margins": 26.324974060058594, "rewards/rejected": -38.63950729370117, "step": 3697 }, { "epoch": 2.3004665629860033, "grad_norm": 0.0008938809623941779, "learning_rate": 1.294375288151222e-06, "logits/chosen": -0.3660479187965393, "logits/rejected": 3.558814287185669, "logps/chosen": -483.8442077636719, "logps/rejected": -1079.2012939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.533079147338867, "rewards/margins": 39.29969787597656, "rewards/rejected": -48.83277893066406, "step": 3698 }, { "epoch": 2.3010886469673406, "grad_norm": 1.2173177003860474, "learning_rate": 1.2932226832641773e-06, "logits/chosen": 1.2994377613067627, "logits/rejected": 2.7357635498046875, "logps/chosen": -549.0673828125, "logps/rejected": -878.2985229492188, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -7.204733371734619, "rewards/margins": 22.077877044677734, "rewards/rejected": -29.282608032226562, "step": 3699 }, { "epoch": 2.301710730948678, "grad_norm": 0.00022056486341170967, "learning_rate": 1.2920700783771325e-06, "logits/chosen": -1.6370930671691895, "logits/rejected": 2.731771469116211, "logps/chosen": -494.51861572265625, "logps/rejected": -1008.6734619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.655258178710938, "rewards/margins": 25.27872085571289, "rewards/rejected": -33.93397903442383, "step": 3700 }, { "epoch": 2.3023328149300157, "grad_norm": 0.0015736103523522615, "learning_rate": 1.290917473490088e-06, "logits/chosen": 0.6173830032348633, "logits/rejected": 3.2660341262817383, "logps/chosen": -566.87548828125, "logps/rejected": -1051.01904296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.942275524139404, "rewards/margins": 29.693340301513672, "rewards/rejected": -36.635616302490234, "step": 3701 }, { "epoch": 2.302954898911353, "grad_norm": 0.06366714090108871, "learning_rate": 1.2897648686030432e-06, "logits/chosen": -3.899381637573242, "logits/rejected": 1.5277197360992432, "logps/chosen": -265.9858093261719, "logps/rejected": -905.1071166992188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.637141704559326, "rewards/margins": 31.69051170349121, "rewards/rejected": -35.32765197753906, "step": 3702 }, { "epoch": 2.3035769828926904, "grad_norm": 0.08645754307508469, "learning_rate": 1.2886122637159982e-06, "logits/chosen": -0.9479325413703918, "logits/rejected": 3.7041172981262207, "logps/chosen": -424.2788391113281, "logps/rejected": -986.81396484375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.513542175292969, "rewards/margins": 28.364822387695312, "rewards/rejected": -34.878360748291016, "step": 3703 }, { "epoch": 2.3041990668740278, "grad_norm": 3.70295765605988e-06, "learning_rate": 1.2874596588289534e-06, "logits/chosen": -1.080538272857666, "logits/rejected": 2.7991552352905273, "logps/chosen": -477.34503173828125, "logps/rejected": -989.5711669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.353320121765137, "rewards/margins": 30.877479553222656, "rewards/rejected": -41.23080062866211, "step": 3704 }, { "epoch": 2.3048211508553655, "grad_norm": 0.23479920625686646, "learning_rate": 1.2863070539419086e-06, "logits/chosen": 0.2994771897792816, "logits/rejected": 1.2180674076080322, "logps/chosen": -418.5895690917969, "logps/rejected": -672.30615234375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.518365859985352, "rewards/margins": 20.739582061767578, "rewards/rejected": -29.257946014404297, "step": 3705 }, { "epoch": 2.305443234836703, "grad_norm": 0.11313124746084213, "learning_rate": 1.285154449054864e-06, "logits/chosen": -1.9298100471496582, "logits/rejected": 3.333915948867798, "logps/chosen": -434.4066162109375, "logps/rejected": -1057.1881103515625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.3718719482421875, "rewards/margins": 29.832077026367188, "rewards/rejected": -37.203948974609375, "step": 3706 }, { "epoch": 2.3060653188180407, "grad_norm": 0.0011367382248863578, "learning_rate": 1.2840018441678193e-06, "logits/chosen": -0.14702105522155762, "logits/rejected": 1.7592847347259521, "logps/chosen": -625.2750854492188, "logps/rejected": -1098.627197265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.312629699707031, "rewards/margins": 31.041439056396484, "rewards/rejected": -44.35406494140625, "step": 3707 }, { "epoch": 2.306687402799378, "grad_norm": 0.00044795998837798834, "learning_rate": 1.2828492392807745e-06, "logits/chosen": -0.23750460147857666, "logits/rejected": 3.907789468765259, "logps/chosen": -369.7756652832031, "logps/rejected": -845.3507690429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.302602767944336, "rewards/margins": 26.934505462646484, "rewards/rejected": -33.23710632324219, "step": 3708 }, { "epoch": 2.3073094867807153, "grad_norm": 5.019338459533174e-06, "learning_rate": 1.2816966343937297e-06, "logits/chosen": 3.057111978530884, "logits/rejected": 2.887117385864258, "logps/chosen": -658.5367431640625, "logps/rejected": -1031.13037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.958843231201172, "rewards/margins": 37.6265754699707, "rewards/rejected": -48.585418701171875, "step": 3709 }, { "epoch": 2.3079315707620527, "grad_norm": 0.5342081189155579, "learning_rate": 1.2805440295066852e-06, "logits/chosen": 2.534153699874878, "logits/rejected": 3.9704220294952393, "logps/chosen": -653.8551025390625, "logps/rejected": -1223.77392578125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -9.44595718383789, "rewards/margins": 41.70103454589844, "rewards/rejected": -51.14698791503906, "step": 3710 }, { "epoch": 2.3085536547433905, "grad_norm": 0.0002150165819330141, "learning_rate": 1.2793914246196404e-06, "logits/chosen": 1.2127676010131836, "logits/rejected": 4.247666358947754, "logps/chosen": -579.703125, "logps/rejected": -1159.847900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.120686531066895, "rewards/margins": 35.937217712402344, "rewards/rejected": -45.057899475097656, "step": 3711 }, { "epoch": 2.309175738724728, "grad_norm": 0.14080031216144562, "learning_rate": 1.2782388197325956e-06, "logits/chosen": -1.2376487255096436, "logits/rejected": 3.667673110961914, "logps/chosen": -463.9857482910156, "logps/rejected": -885.1686401367188, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -10.179271697998047, "rewards/margins": 16.996322631835938, "rewards/rejected": -27.175594329833984, "step": 3712 }, { "epoch": 2.309797822706065, "grad_norm": 2.617736936372239e-05, "learning_rate": 1.277086214845551e-06, "logits/chosen": 0.03214012831449509, "logits/rejected": 5.018609046936035, "logps/chosen": -528.091796875, "logps/rejected": -1093.7386474609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.017572402954102, "rewards/margins": 34.35942840576172, "rewards/rejected": -42.37700271606445, "step": 3713 }, { "epoch": 2.310419906687403, "grad_norm": 2.1107603970449418e-05, "learning_rate": 1.2759336099585063e-06, "logits/chosen": -1.4206596612930298, "logits/rejected": 3.3120598793029785, "logps/chosen": -543.6156005859375, "logps/rejected": -1077.357666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.907739639282227, "rewards/margins": 34.17880630493164, "rewards/rejected": -44.0865478515625, "step": 3714 }, { "epoch": 2.3110419906687403, "grad_norm": 0.484291136264801, "learning_rate": 1.2747810050714615e-06, "logits/chosen": 0.2637067139148712, "logits/rejected": 4.386711597442627, "logps/chosen": -595.2559814453125, "logps/rejected": -1109.7808837890625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -14.601914405822754, "rewards/margins": 31.844104766845703, "rewards/rejected": -46.446022033691406, "step": 3715 }, { "epoch": 2.3116640746500776, "grad_norm": 4.2439531000582065e-08, "learning_rate": 1.2736284001844167e-06, "logits/chosen": -1.9014339447021484, "logits/rejected": 3.727208375930786, "logps/chosen": -412.7446594238281, "logps/rejected": -1121.034423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.875720500946045, "rewards/margins": 38.0162239074707, "rewards/rejected": -43.891944885253906, "step": 3716 }, { "epoch": 2.3122861586314154, "grad_norm": 5.432393209048314e-06, "learning_rate": 1.2724757952973722e-06, "logits/chosen": 0.21426761150360107, "logits/rejected": 3.167912244796753, "logps/chosen": -476.23675537109375, "logps/rejected": -981.8463134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.670164585113525, "rewards/margins": 26.259475708007812, "rewards/rejected": -32.92964172363281, "step": 3717 }, { "epoch": 2.3129082426127527, "grad_norm": 0.10672373324632645, "learning_rate": 1.2713231904103274e-06, "logits/chosen": -1.5727660655975342, "logits/rejected": 2.5053091049194336, "logps/chosen": -513.7788696289062, "logps/rejected": -1157.0867919921875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -9.235604286193848, "rewards/margins": 41.920467376708984, "rewards/rejected": -51.156070709228516, "step": 3718 }, { "epoch": 2.31353032659409, "grad_norm": 1.4951767921447754, "learning_rate": 1.2701705855232826e-06, "logits/chosen": 1.4568402767181396, "logits/rejected": 4.061356544494629, "logps/chosen": -634.544921875, "logps/rejected": -984.9749755859375, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -12.60744857788086, "rewards/margins": 18.677616119384766, "rewards/rejected": -31.285064697265625, "step": 3719 }, { "epoch": 2.314152410575428, "grad_norm": 0.27352988719940186, "learning_rate": 1.2690179806362378e-06, "logits/chosen": -1.944580078125, "logits/rejected": 1.7055405378341675, "logps/chosen": -349.4586181640625, "logps/rejected": -630.8715209960938, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -5.460259437561035, "rewards/margins": 18.522550582885742, "rewards/rejected": -23.982810974121094, "step": 3720 }, { "epoch": 2.314774494556765, "grad_norm": 0.0011469712480902672, "learning_rate": 1.2678653757491933e-06, "logits/chosen": -1.8591368198394775, "logits/rejected": 2.5280864238739014, "logps/chosen": -326.79632568359375, "logps/rejected": -907.6506958007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.100767135620117, "rewards/margins": 30.164491653442383, "rewards/rejected": -36.2652587890625, "step": 3721 }, { "epoch": 2.3153965785381025, "grad_norm": 1.0237037713523023e-05, "learning_rate": 1.2667127708621485e-06, "logits/chosen": -3.6736159324645996, "logits/rejected": 1.2925745248794556, "logps/chosen": -230.48849487304688, "logps/rejected": -831.7344970703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.988120079040527, "rewards/margins": 32.05707550048828, "rewards/rejected": -37.045196533203125, "step": 3722 }, { "epoch": 2.31601866251944, "grad_norm": 0.000444350007455796, "learning_rate": 1.2655601659751037e-06, "logits/chosen": -1.2352526187896729, "logits/rejected": 2.333470106124878, "logps/chosen": -599.5797729492188, "logps/rejected": -1209.1802978515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.441747665405273, "rewards/margins": 37.17320251464844, "rewards/rejected": -46.614952087402344, "step": 3723 }, { "epoch": 2.3166407465007777, "grad_norm": 0.14415420591831207, "learning_rate": 1.2644075610880592e-06, "logits/chosen": -0.7493869662284851, "logits/rejected": 1.9482616186141968, "logps/chosen": -424.66912841796875, "logps/rejected": -905.3004760742188, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.350069999694824, "rewards/margins": 26.499420166015625, "rewards/rejected": -34.84949493408203, "step": 3724 }, { "epoch": 2.317262830482115, "grad_norm": 0.00046594845480285585, "learning_rate": 1.2632549562010144e-06, "logits/chosen": 1.0839658975601196, "logits/rejected": 3.9135398864746094, "logps/chosen": -416.83905029296875, "logps/rejected": -828.8883056640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.0764641761779785, "rewards/margins": 24.667072296142578, "rewards/rejected": -31.7435359954834, "step": 3725 }, { "epoch": 2.317884914463453, "grad_norm": 5.600808435701765e-05, "learning_rate": 1.2621023513139696e-06, "logits/chosen": -1.369974136352539, "logits/rejected": 3.0765304565429688, "logps/chosen": -392.28057861328125, "logps/rejected": -940.5872192382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.665092945098877, "rewards/margins": 30.048261642456055, "rewards/rejected": -36.713356018066406, "step": 3726 }, { "epoch": 2.31850699844479, "grad_norm": 0.0072428504936397076, "learning_rate": 1.2609497464269248e-06, "logits/chosen": 0.6592839360237122, "logits/rejected": 2.507603883743286, "logps/chosen": -591.06396484375, "logps/rejected": -1104.629150390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.804035186767578, "rewards/margins": 37.28501510620117, "rewards/rejected": -46.08905029296875, "step": 3727 }, { "epoch": 2.3191290824261275, "grad_norm": 1.4250758795242291e-08, "learning_rate": 1.2597971415398803e-06, "logits/chosen": -1.6458063125610352, "logits/rejected": 3.0889017581939697, "logps/chosen": -496.560546875, "logps/rejected": -1106.861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.856250286102295, "rewards/margins": 37.5319938659668, "rewards/rejected": -44.38824462890625, "step": 3728 }, { "epoch": 2.319751166407465, "grad_norm": 0.008206356316804886, "learning_rate": 1.2586445366528355e-06, "logits/chosen": 1.8181511163711548, "logits/rejected": 3.783684730529785, "logps/chosen": -530.9696044921875, "logps/rejected": -922.9013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.359803199768066, "rewards/margins": 23.870697021484375, "rewards/rejected": -37.230499267578125, "step": 3729 }, { "epoch": 2.3203732503888026, "grad_norm": 0.0004938808269798756, "learning_rate": 1.2574919317657907e-06, "logits/chosen": -0.7859050035476685, "logits/rejected": 3.8299612998962402, "logps/chosen": -334.7008972167969, "logps/rejected": -828.531982421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.748422145843506, "rewards/margins": 24.402236938476562, "rewards/rejected": -30.150657653808594, "step": 3730 }, { "epoch": 2.32099533437014, "grad_norm": 0.6841702461242676, "learning_rate": 1.256339326878746e-06, "logits/chosen": -2.255345344543457, "logits/rejected": 1.1976604461669922, "logps/chosen": -395.2082824707031, "logps/rejected": -968.39453125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -7.6640472412109375, "rewards/margins": 31.672197341918945, "rewards/rejected": -39.33624267578125, "step": 3731 }, { "epoch": 2.3216174183514773, "grad_norm": 1.9100058423759947e-08, "learning_rate": 1.2551867219917014e-06, "logits/chosen": 2.8481311798095703, "logits/rejected": 4.289071083068848, "logps/chosen": -520.2772827148438, "logps/rejected": -1032.16943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.547584533691406, "rewards/margins": 37.49254608154297, "rewards/rejected": -45.04012680053711, "step": 3732 }, { "epoch": 2.322239502332815, "grad_norm": 0.3577936291694641, "learning_rate": 1.2540341171046566e-06, "logits/chosen": -1.3577563762664795, "logits/rejected": 2.7698278427124023, "logps/chosen": -363.3661193847656, "logps/rejected": -926.281982421875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -6.647355079650879, "rewards/margins": 31.246688842773438, "rewards/rejected": -37.89404296875, "step": 3733 }, { "epoch": 2.3228615863141524, "grad_norm": 5.641165898850886e-06, "learning_rate": 1.2528815122176118e-06, "logits/chosen": 2.2829861640930176, "logits/rejected": 0.9737235307693481, "logps/chosen": -635.3814697265625, "logps/rejected": -918.7669067382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.598127365112305, "rewards/margins": 28.54684066772461, "rewards/rejected": -36.14496994018555, "step": 3734 }, { "epoch": 2.3234836702954897, "grad_norm": 0.0003707819851115346, "learning_rate": 1.2517289073305673e-06, "logits/chosen": 0.35782626271247864, "logits/rejected": 3.968404769897461, "logps/chosen": -458.44110107421875, "logps/rejected": -952.4920654296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.391024589538574, "rewards/margins": 27.741703033447266, "rewards/rejected": -34.13272476196289, "step": 3735 }, { "epoch": 2.3241057542768275, "grad_norm": 37.11952209472656, "learning_rate": 1.2505763024435225e-06, "logits/chosen": -1.1397258043289185, "logits/rejected": 0.5207135081291199, "logps/chosen": -553.2008056640625, "logps/rejected": -839.0127563476562, "loss": 1.1347, "rewards/accuracies": 0.875, "rewards/chosen": -8.864250183105469, "rewards/margins": 20.787813186645508, "rewards/rejected": -29.652063369750977, "step": 3736 }, { "epoch": 2.324727838258165, "grad_norm": 13.333917617797852, "learning_rate": 1.2494236975564777e-06, "logits/chosen": 3.0632495880126953, "logits/rejected": 2.8352396488189697, "logps/chosen": -815.4075927734375, "logps/rejected": -1069.552734375, "loss": 0.1319, "rewards/accuracies": 0.875, "rewards/chosen": -15.841561317443848, "rewards/margins": 21.505847930908203, "rewards/rejected": -37.347408294677734, "step": 3737 }, { "epoch": 2.325349922239502, "grad_norm": 0.3247649371623993, "learning_rate": 1.248271092669433e-06, "logits/chosen": 0.8813507556915283, "logits/rejected": 2.7931432723999023, "logps/chosen": -608.7620849609375, "logps/rejected": -886.5908813476562, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -10.79469108581543, "rewards/margins": 22.09004020690918, "rewards/rejected": -32.88473129272461, "step": 3738 }, { "epoch": 2.32597200622084, "grad_norm": 1.6823595762252808, "learning_rate": 1.2471184877823884e-06, "logits/chosen": 0.045462846755981445, "logits/rejected": 0.23739556968212128, "logps/chosen": -650.0916748046875, "logps/rejected": -884.3887939453125, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -14.383567810058594, "rewards/margins": 21.003597259521484, "rewards/rejected": -35.38716506958008, "step": 3739 }, { "epoch": 2.3265940902021773, "grad_norm": 0.00010003484931075945, "learning_rate": 1.2459658828953436e-06, "logits/chosen": 0.15637314319610596, "logits/rejected": 3.1810343265533447, "logps/chosen": -496.74737548828125, "logps/rejected": -963.8161010742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.369870662689209, "rewards/margins": 32.24199676513672, "rewards/rejected": -38.61186599731445, "step": 3740 }, { "epoch": 2.3272161741835147, "grad_norm": 0.00047350634122267365, "learning_rate": 1.2448132780082988e-06, "logits/chosen": -2.524026870727539, "logits/rejected": 0.31463098526000977, "logps/chosen": -502.8329162597656, "logps/rejected": -1062.771240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.350840091705322, "rewards/margins": 33.23490524291992, "rewards/rejected": -39.58574676513672, "step": 3741 }, { "epoch": 2.327838258164852, "grad_norm": 21.89085578918457, "learning_rate": 1.2436606731212543e-06, "logits/chosen": 0.7648021578788757, "logits/rejected": 2.883105754852295, "logps/chosen": -567.5341796875, "logps/rejected": -897.75537109375, "loss": 0.1512, "rewards/accuracies": 0.875, "rewards/chosen": -9.583475112915039, "rewards/margins": 23.607681274414062, "rewards/rejected": -33.19115447998047, "step": 3742 }, { "epoch": 2.32846034214619, "grad_norm": 0.10280514508485794, "learning_rate": 1.2425080682342095e-06, "logits/chosen": 0.34326276183128357, "logits/rejected": 2.023019552230835, "logps/chosen": -484.177490234375, "logps/rejected": -807.7122802734375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.493699550628662, "rewards/margins": 22.899011611938477, "rewards/rejected": -30.392711639404297, "step": 3743 }, { "epoch": 2.329082426127527, "grad_norm": 0.00037758261896669865, "learning_rate": 1.2413554633471647e-06, "logits/chosen": -0.4873642027378082, "logits/rejected": 4.205269813537598, "logps/chosen": -456.52215576171875, "logps/rejected": -1098.8558349609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.478480339050293, "rewards/margins": 35.9359130859375, "rewards/rejected": -40.41439437866211, "step": 3744 }, { "epoch": 2.329704510108865, "grad_norm": 2.557085463195108e-05, "learning_rate": 1.24020285846012e-06, "logits/chosen": -0.8406679630279541, "logits/rejected": 3.6342380046844482, "logps/chosen": -493.90692138671875, "logps/rejected": -1012.0113525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.140291213989258, "rewards/margins": 28.977548599243164, "rewards/rejected": -37.11783981323242, "step": 3745 }, { "epoch": 2.3303265940902023, "grad_norm": 0.00011655557318590581, "learning_rate": 1.2390502535730754e-06, "logits/chosen": 0.18190997838974, "logits/rejected": 2.5602996349334717, "logps/chosen": -287.9266052246094, "logps/rejected": -611.602294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.9288811683654785, "rewards/margins": 20.42547607421875, "rewards/rejected": -25.354358673095703, "step": 3746 }, { "epoch": 2.3309486780715396, "grad_norm": 0.028517745435237885, "learning_rate": 1.2378976486860306e-06, "logits/chosen": 3.981574535369873, "logits/rejected": 4.015433311462402, "logps/chosen": -651.6341552734375, "logps/rejected": -922.1114501953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.595947265625, "rewards/margins": 26.155662536621094, "rewards/rejected": -34.751609802246094, "step": 3747 }, { "epoch": 2.331570762052877, "grad_norm": 0.009231762029230595, "learning_rate": 1.2367450437989858e-06, "logits/chosen": 0.6380380392074585, "logits/rejected": 4.268522262573242, "logps/chosen": -441.7833557128906, "logps/rejected": -1023.3212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.484411239624023, "rewards/margins": 30.92462158203125, "rewards/rejected": -37.409034729003906, "step": 3748 }, { "epoch": 2.3321928460342147, "grad_norm": 3.534213277589515e-08, "learning_rate": 1.235592438911941e-06, "logits/chosen": -1.5921106338500977, "logits/rejected": 1.847858190536499, "logps/chosen": -451.6917419433594, "logps/rejected": -1046.4404296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.33981704711914, "rewards/margins": 37.05653381347656, "rewards/rejected": -45.39634704589844, "step": 3749 }, { "epoch": 2.332814930015552, "grad_norm": 0.020619157701730728, "learning_rate": 1.2344398340248965e-06, "logits/chosen": 2.333313465118408, "logits/rejected": 3.4232683181762695, "logps/chosen": -633.7111206054688, "logps/rejected": -949.9092407226562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.208894729614258, "rewards/margins": 30.374826431274414, "rewards/rejected": -38.58372497558594, "step": 3750 }, { "epoch": 2.3334370139968894, "grad_norm": 48.95095443725586, "learning_rate": 1.2332872291378517e-06, "logits/chosen": 0.785729706287384, "logits/rejected": 3.3353562355041504, "logps/chosen": -461.3485412597656, "logps/rejected": -799.4615478515625, "loss": 2.0307, "rewards/accuracies": 0.875, "rewards/chosen": -8.850112915039062, "rewards/margins": 21.994503021240234, "rewards/rejected": -30.844614028930664, "step": 3751 }, { "epoch": 2.334059097978227, "grad_norm": 0.24490414559841156, "learning_rate": 1.232134624250807e-06, "logits/chosen": -0.022008508443832397, "logits/rejected": 3.6488747596740723, "logps/chosen": -464.0959777832031, "logps/rejected": -1023.082763671875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.36753511428833, "rewards/margins": 33.127281188964844, "rewards/rejected": -39.494815826416016, "step": 3752 }, { "epoch": 2.3346811819595645, "grad_norm": 2.7800905399999465e-07, "learning_rate": 1.2309820193637624e-06, "logits/chosen": -1.0926700830459595, "logits/rejected": 4.033783912658691, "logps/chosen": -524.0411376953125, "logps/rejected": -1222.209716796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.972342014312744, "rewards/margins": 34.16625213623047, "rewards/rejected": -42.13859558105469, "step": 3753 }, { "epoch": 2.335303265940902, "grad_norm": 0.027951369062066078, "learning_rate": 1.2298294144767174e-06, "logits/chosen": 1.3177515268325806, "logits/rejected": 3.4625697135925293, "logps/chosen": -495.0713195800781, "logps/rejected": -835.1082153320312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.218081951141357, "rewards/margins": 19.00119400024414, "rewards/rejected": -24.219276428222656, "step": 3754 }, { "epoch": 2.3359253499222397, "grad_norm": 0.00014580706192646176, "learning_rate": 1.2286768095896726e-06, "logits/chosen": 2.1752769947052, "logits/rejected": 2.639923334121704, "logps/chosen": -591.5237426757812, "logps/rejected": -898.7060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.46114444732666, "rewards/margins": 24.936874389648438, "rewards/rejected": -35.39802169799805, "step": 3755 }, { "epoch": 2.336547433903577, "grad_norm": 7.0838303565979, "learning_rate": 1.227524204702628e-06, "logits/chosen": -3.95389461517334, "logits/rejected": 3.1995513439178467, "logps/chosen": -346.8486328125, "logps/rejected": -1027.637451171875, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -5.685001373291016, "rewards/margins": 31.18999671936035, "rewards/rejected": -36.875, "step": 3756 }, { "epoch": 2.3371695178849143, "grad_norm": 3.141146421432495, "learning_rate": 1.2263715998155833e-06, "logits/chosen": 0.8513885736465454, "logits/rejected": 2.2651748657226562, "logps/chosen": -532.1912231445312, "logps/rejected": -856.8719482421875, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -10.31003475189209, "rewards/margins": 20.228313446044922, "rewards/rejected": -30.53835105895996, "step": 3757 }, { "epoch": 2.337791601866252, "grad_norm": 0.02694685198366642, "learning_rate": 1.2252189949285385e-06, "logits/chosen": -2.7708561420440674, "logits/rejected": 0.9407970905303955, "logps/chosen": -405.43402099609375, "logps/rejected": -961.0596923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.2348785400390625, "rewards/margins": 29.521310806274414, "rewards/rejected": -35.75619125366211, "step": 3758 }, { "epoch": 2.3384136858475895, "grad_norm": 0.23336142301559448, "learning_rate": 1.224066390041494e-06, "logits/chosen": 1.8623909950256348, "logits/rejected": 4.030655860900879, "logps/chosen": -567.2323608398438, "logps/rejected": -971.25927734375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -12.90607738494873, "rewards/margins": 25.02992057800293, "rewards/rejected": -37.935997009277344, "step": 3759 }, { "epoch": 2.339035769828927, "grad_norm": 9.65563678741455, "learning_rate": 1.2229137851544492e-06, "logits/chosen": -1.7390412092208862, "logits/rejected": 3.4814720153808594, "logps/chosen": -331.64794921875, "logps/rejected": -900.2626953125, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -5.608267307281494, "rewards/margins": 23.121736526489258, "rewards/rejected": -28.730003356933594, "step": 3760 }, { "epoch": 2.339657853810264, "grad_norm": 0.17411291599273682, "learning_rate": 1.2217611802674044e-06, "logits/chosen": -1.2542434930801392, "logits/rejected": 0.9479058384895325, "logps/chosen": -457.4659118652344, "logps/rejected": -841.8319702148438, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.046476364135742, "rewards/margins": 23.45076560974121, "rewards/rejected": -31.497241973876953, "step": 3761 }, { "epoch": 2.340279937791602, "grad_norm": 7.5882954597473145, "learning_rate": 1.2206085753803596e-06, "logits/chosen": 3.2613232135772705, "logits/rejected": 3.1383419036865234, "logps/chosen": -656.0169677734375, "logps/rejected": -783.934814453125, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": -8.506058692932129, "rewards/margins": 16.031143188476562, "rewards/rejected": -24.537200927734375, "step": 3762 }, { "epoch": 2.3409020217729393, "grad_norm": 9.263287211069837e-06, "learning_rate": 1.219455970493315e-06, "logits/chosen": -5.158586502075195, "logits/rejected": 0.3858991861343384, "logps/chosen": -193.34307861328125, "logps/rejected": -922.373291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.7267749309539795, "rewards/margins": 30.67427635192871, "rewards/rejected": -33.40105056762695, "step": 3763 }, { "epoch": 2.341524105754277, "grad_norm": 0.0012843944132328033, "learning_rate": 1.2183033656062703e-06, "logits/chosen": 2.423126459121704, "logits/rejected": 3.8941612243652344, "logps/chosen": -614.8330078125, "logps/rejected": -967.76708984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.1685209274292, "rewards/margins": 26.197811126708984, "rewards/rejected": -37.3663330078125, "step": 3764 }, { "epoch": 2.3421461897356144, "grad_norm": 3.2148877835425083e-06, "learning_rate": 1.2171507607192255e-06, "logits/chosen": -5.507774829864502, "logits/rejected": 1.925000786781311, "logps/chosen": -232.03387451171875, "logps/rejected": -932.1917114257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.883286476135254, "rewards/margins": 28.877832412719727, "rewards/rejected": -33.7611198425293, "step": 3765 }, { "epoch": 2.3427682737169517, "grad_norm": 0.05553580820560455, "learning_rate": 1.2159981558321807e-06, "logits/chosen": 1.5427436828613281, "logits/rejected": 2.6691017150878906, "logps/chosen": -650.1654052734375, "logps/rejected": -950.7757568359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.771061897277832, "rewards/margins": 26.58896827697754, "rewards/rejected": -37.36003112792969, "step": 3766 }, { "epoch": 2.343390357698289, "grad_norm": 0.5102798342704773, "learning_rate": 1.2148455509451362e-06, "logits/chosen": 0.39026594161987305, "logits/rejected": 2.184535026550293, "logps/chosen": -531.0682373046875, "logps/rejected": -720.0987548828125, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -5.892273902893066, "rewards/margins": 13.84701919555664, "rewards/rejected": -19.73929214477539, "step": 3767 }, { "epoch": 2.344012441679627, "grad_norm": 0.000627454777713865, "learning_rate": 1.2136929460580914e-06, "logits/chosen": 0.6169248819351196, "logits/rejected": 4.225925445556641, "logps/chosen": -434.4745178222656, "logps/rejected": -1024.3651123046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.809660911560059, "rewards/margins": 35.267539978027344, "rewards/rejected": -46.07720184326172, "step": 3768 }, { "epoch": 2.344634525660964, "grad_norm": 0.0014896132051944733, "learning_rate": 1.2125403411710466e-06, "logits/chosen": -1.1250966787338257, "logits/rejected": 2.233417272567749, "logps/chosen": -375.6403503417969, "logps/rejected": -734.7423706054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.249678611755371, "rewards/margins": 25.75054931640625, "rewards/rejected": -32.00022888183594, "step": 3769 }, { "epoch": 2.3452566096423015, "grad_norm": 1.1749808663807926e-06, "learning_rate": 1.211387736284002e-06, "logits/chosen": 1.9994325637817383, "logits/rejected": 3.6004791259765625, "logps/chosen": -579.676513671875, "logps/rejected": -1031.6995849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.611101150512695, "rewards/margins": 32.72962188720703, "rewards/rejected": -43.34072494506836, "step": 3770 }, { "epoch": 2.3458786936236393, "grad_norm": 3.606675988976349e-07, "learning_rate": 1.2102351313969573e-06, "logits/chosen": -3.1550188064575195, "logits/rejected": 3.289100170135498, "logps/chosen": -272.6792297363281, "logps/rejected": -1067.193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.123805999755859, "rewards/margins": 40.23991775512695, "rewards/rejected": -45.36371994018555, "step": 3771 }, { "epoch": 2.3465007776049767, "grad_norm": 0.0012340415269136429, "learning_rate": 1.2090825265099125e-06, "logits/chosen": 0.42929190397262573, "logits/rejected": 3.3133544921875, "logps/chosen": -572.3095703125, "logps/rejected": -1064.4798583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.947774887084961, "rewards/margins": 32.553855895996094, "rewards/rejected": -43.501625061035156, "step": 3772 }, { "epoch": 2.347122861586314, "grad_norm": 1.7274918718612753e-05, "learning_rate": 1.2079299216228677e-06, "logits/chosen": 0.21922864019870758, "logits/rejected": 2.7869038581848145, "logps/chosen": -489.15069580078125, "logps/rejected": -969.3250122070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.346248626708984, "rewards/margins": 32.81340789794922, "rewards/rejected": -41.1596565246582, "step": 3773 }, { "epoch": 2.347744945567652, "grad_norm": 0.18060894310474396, "learning_rate": 1.2067773167358231e-06, "logits/chosen": -0.30459514260292053, "logits/rejected": 3.024898052215576, "logps/chosen": -569.720947265625, "logps/rejected": -1194.462646484375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -8.834023475646973, "rewards/margins": 41.31550216674805, "rewards/rejected": -50.14952850341797, "step": 3774 }, { "epoch": 2.348367029548989, "grad_norm": 1.2039680480957031, "learning_rate": 1.2056247118487784e-06, "logits/chosen": 4.079957485198975, "logits/rejected": 4.113856315612793, "logps/chosen": -827.6793212890625, "logps/rejected": -970.1021118164062, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -11.64090347290039, "rewards/margins": 18.864543914794922, "rewards/rejected": -30.505447387695312, "step": 3775 }, { "epoch": 2.3489891135303265, "grad_norm": 1.346193790435791, "learning_rate": 1.2044721069617336e-06, "logits/chosen": 2.333740711212158, "logits/rejected": 3.8003430366516113, "logps/chosen": -715.4950561523438, "logps/rejected": -994.8343505859375, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -12.749503135681152, "rewards/margins": 22.199554443359375, "rewards/rejected": -34.949058532714844, "step": 3776 }, { "epoch": 2.3496111975116643, "grad_norm": 0.28042879700660706, "learning_rate": 1.2033195020746888e-06, "logits/chosen": 0.6322274804115295, "logits/rejected": 2.010160446166992, "logps/chosen": -608.6675415039062, "logps/rejected": -957.8283081054688, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -8.078685760498047, "rewards/margins": 29.47532844543457, "rewards/rejected": -37.55401611328125, "step": 3777 }, { "epoch": 2.3502332814930016, "grad_norm": 5.428508757177042e-06, "learning_rate": 1.2021668971876443e-06, "logits/chosen": -0.7912687659263611, "logits/rejected": 2.985895872116089, "logps/chosen": -448.8441162109375, "logps/rejected": -971.3677368164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.1015825271606445, "rewards/margins": 28.909358978271484, "rewards/rejected": -34.01094055175781, "step": 3778 }, { "epoch": 2.350855365474339, "grad_norm": 5.192906246520579e-05, "learning_rate": 1.2010142923005995e-06, "logits/chosen": -0.4874184727668762, "logits/rejected": 2.1849145889282227, "logps/chosen": -482.5423583984375, "logps/rejected": -816.16796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.823034286499023, "rewards/margins": 21.69009017944336, "rewards/rejected": -31.513124465942383, "step": 3779 }, { "epoch": 2.3514774494556763, "grad_norm": 4.981512756785378e-05, "learning_rate": 1.1998616874135547e-06, "logits/chosen": 2.9160568714141846, "logits/rejected": 4.156764030456543, "logps/chosen": -776.1582641601562, "logps/rejected": -1080.82763671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.34664535522461, "rewards/margins": 29.624601364135742, "rewards/rejected": -40.971248626708984, "step": 3780 }, { "epoch": 2.352099533437014, "grad_norm": 7.727591037750244, "learning_rate": 1.1987090825265101e-06, "logits/chosen": 2.0500614643096924, "logits/rejected": 2.1399617195129395, "logps/chosen": -669.6891479492188, "logps/rejected": -894.5279541015625, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": -11.296379089355469, "rewards/margins": 20.505508422851562, "rewards/rejected": -31.80188751220703, "step": 3781 }, { "epoch": 2.3527216174183514, "grad_norm": 7.926801117719151e-06, "learning_rate": 1.1975564776394654e-06, "logits/chosen": 0.004293203353881836, "logits/rejected": 2.637110710144043, "logps/chosen": -400.6033630371094, "logps/rejected": -920.6117553710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.349189758300781, "rewards/margins": 33.85880661010742, "rewards/rejected": -43.2079963684082, "step": 3782 }, { "epoch": 2.353343701399689, "grad_norm": 0.06966577470302582, "learning_rate": 1.1964038727524206e-06, "logits/chosen": -0.31672847270965576, "logits/rejected": 3.4101476669311523, "logps/chosen": -515.6886596679688, "logps/rejected": -1000.330078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.874223232269287, "rewards/margins": 27.466777801513672, "rewards/rejected": -34.34100341796875, "step": 3783 }, { "epoch": 2.3539657853810265, "grad_norm": 8.371570587158203, "learning_rate": 1.1952512678653758e-06, "logits/chosen": 0.27338138222694397, "logits/rejected": 2.996030330657959, "logps/chosen": -582.8148193359375, "logps/rejected": -1207.8359375, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": -13.131258010864258, "rewards/margins": 40.75477600097656, "rewards/rejected": -53.88603591918945, "step": 3784 }, { "epoch": 2.354587869362364, "grad_norm": 0.6094384789466858, "learning_rate": 1.1940986629783313e-06, "logits/chosen": -1.153893232345581, "logits/rejected": 0.9266235828399658, "logps/chosen": -530.5203247070312, "logps/rejected": -979.1256713867188, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -9.635900497436523, "rewards/margins": 31.250638961791992, "rewards/rejected": -40.886539459228516, "step": 3785 }, { "epoch": 2.355209953343701, "grad_norm": 22.516544342041016, "learning_rate": 1.1929460580912865e-06, "logits/chosen": -0.9157943725585938, "logits/rejected": 3.470613956451416, "logps/chosen": -388.00054931640625, "logps/rejected": -1011.9267578125, "loss": 0.1669, "rewards/accuracies": 0.875, "rewards/chosen": -6.712441444396973, "rewards/margins": 33.6087646484375, "rewards/rejected": -40.321205139160156, "step": 3786 }, { "epoch": 2.355832037325039, "grad_norm": 0.08950123935937881, "learning_rate": 1.1917934532042417e-06, "logits/chosen": 1.4992426633834839, "logits/rejected": 2.879100799560547, "logps/chosen": -615.9576416015625, "logps/rejected": -912.9107666015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.530094623565674, "rewards/margins": 21.183012008666992, "rewards/rejected": -28.713109970092773, "step": 3787 }, { "epoch": 2.3564541213063763, "grad_norm": 33.97612762451172, "learning_rate": 1.190640848317197e-06, "logits/chosen": 3.4221155643463135, "logits/rejected": 3.3171300888061523, "logps/chosen": -829.3216552734375, "logps/rejected": -1007.546875, "loss": 0.1565, "rewards/accuracies": 0.875, "rewards/chosen": -12.417081832885742, "rewards/margins": 20.075977325439453, "rewards/rejected": -32.49305725097656, "step": 3788 }, { "epoch": 2.3570762052877137, "grad_norm": 0.1435934156179428, "learning_rate": 1.1894882434301522e-06, "logits/chosen": 0.6667469143867493, "logits/rejected": 2.7253386974334717, "logps/chosen": -493.4066162109375, "logps/rejected": -719.7052001953125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -6.772191047668457, "rewards/margins": 16.941204071044922, "rewards/rejected": -23.713396072387695, "step": 3789 }, { "epoch": 2.3576982892690515, "grad_norm": 5.4664010207261526e-08, "learning_rate": 1.1883356385431074e-06, "logits/chosen": -0.8374029397964478, "logits/rejected": 1.7547428607940674, "logps/chosen": -557.086181640625, "logps/rejected": -1037.237060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.583539009094238, "rewards/margins": 28.51294708251953, "rewards/rejected": -40.09648513793945, "step": 3790 }, { "epoch": 2.358320373250389, "grad_norm": 0.010267133824527264, "learning_rate": 1.1871830336560628e-06, "logits/chosen": 1.42992103099823, "logits/rejected": 4.12758207321167, "logps/chosen": -597.4346923828125, "logps/rejected": -991.8429565429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.071941375732422, "rewards/margins": 26.114011764526367, "rewards/rejected": -36.18595504760742, "step": 3791 }, { "epoch": 2.358942457231726, "grad_norm": 0.018346259370446205, "learning_rate": 1.186030428769018e-06, "logits/chosen": -2.803020477294922, "logits/rejected": 0.8278892636299133, "logps/chosen": -464.4111328125, "logps/rejected": -1009.1217651367188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.344085693359375, "rewards/margins": 33.9010009765625, "rewards/rejected": -42.245086669921875, "step": 3792 }, { "epoch": 2.359564541213064, "grad_norm": 0.2930425703525543, "learning_rate": 1.1848778238819733e-06, "logits/chosen": 1.185457706451416, "logits/rejected": 3.8306405544281006, "logps/chosen": -636.3871459960938, "logps/rejected": -1095.2010498046875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -14.685548782348633, "rewards/margins": 33.11660385131836, "rewards/rejected": -47.802154541015625, "step": 3793 }, { "epoch": 2.3601866251944013, "grad_norm": 2.025911453529261e-05, "learning_rate": 1.1837252189949285e-06, "logits/chosen": 1.1972532272338867, "logits/rejected": 5.457921504974365, "logps/chosen": -427.31390380859375, "logps/rejected": -903.8753662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.011835098266602, "rewards/margins": 24.26134490966797, "rewards/rejected": -31.273181915283203, "step": 3794 }, { "epoch": 2.3608087091757386, "grad_norm": 0.002198511268943548, "learning_rate": 1.182572614107884e-06, "logits/chosen": 1.3402279615402222, "logits/rejected": 2.2236735820770264, "logps/chosen": -726.2381591796875, "logps/rejected": -1059.640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.72128677368164, "rewards/margins": 26.143020629882812, "rewards/rejected": -39.86431121826172, "step": 3795 }, { "epoch": 2.3614307931570764, "grad_norm": 3.081467628479004, "learning_rate": 1.1814200092208392e-06, "logits/chosen": 2.2028040885925293, "logits/rejected": 4.056054592132568, "logps/chosen": -568.4141235351562, "logps/rejected": -1015.677978515625, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -9.455324172973633, "rewards/margins": 30.97480010986328, "rewards/rejected": -40.43012237548828, "step": 3796 }, { "epoch": 2.3620528771384137, "grad_norm": 0.00034230336314067245, "learning_rate": 1.1802674043337944e-06, "logits/chosen": 0.28844231367111206, "logits/rejected": 3.227900505065918, "logps/chosen": -488.81158447265625, "logps/rejected": -851.3491821289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.182365417480469, "rewards/margins": 25.613388061523438, "rewards/rejected": -34.795753479003906, "step": 3797 }, { "epoch": 2.362674961119751, "grad_norm": 0.2191145271062851, "learning_rate": 1.1791147994467498e-06, "logits/chosen": -0.4650258421897888, "logits/rejected": 2.9938926696777344, "logps/chosen": -462.9337158203125, "logps/rejected": -946.4666748046875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -7.004414081573486, "rewards/margins": 27.973308563232422, "rewards/rejected": -34.97772216796875, "step": 3798 }, { "epoch": 2.3632970451010884, "grad_norm": 7.152175426483154, "learning_rate": 1.177962194559705e-06, "logits/chosen": 1.1919983625411987, "logits/rejected": 1.542420506477356, "logps/chosen": -616.858642578125, "logps/rejected": -875.9241943359375, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": -8.420209884643555, "rewards/margins": 23.151426315307617, "rewards/rejected": -31.571636199951172, "step": 3799 }, { "epoch": 2.363919129082426, "grad_norm": 40.50632858276367, "learning_rate": 1.1768095896726603e-06, "logits/chosen": 1.6117632389068604, "logits/rejected": 3.467606544494629, "logps/chosen": -672.64111328125, "logps/rejected": -970.1424560546875, "loss": 0.4664, "rewards/accuracies": 0.875, "rewards/chosen": -13.211545944213867, "rewards/margins": 18.097660064697266, "rewards/rejected": -31.309207916259766, "step": 3800 }, { "epoch": 2.3645412130637635, "grad_norm": 0.0010592733742669225, "learning_rate": 1.1756569847856155e-06, "logits/chosen": -0.8161712884902954, "logits/rejected": 2.525038003921509, "logps/chosen": -487.45703125, "logps/rejected": -999.1900634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.189896583557129, "rewards/margins": 30.578916549682617, "rewards/rejected": -39.76881408691406, "step": 3801 }, { "epoch": 2.3651632970451013, "grad_norm": 0.0015007429756224155, "learning_rate": 1.174504379898571e-06, "logits/chosen": -0.36155182123184204, "logits/rejected": 3.1467785835266113, "logps/chosen": -473.1002197265625, "logps/rejected": -1033.470458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.926703453063965, "rewards/margins": 33.874717712402344, "rewards/rejected": -42.801422119140625, "step": 3802 }, { "epoch": 2.3657853810264386, "grad_norm": 1.6011492334655486e-05, "learning_rate": 1.1733517750115261e-06, "logits/chosen": -4.356368064880371, "logits/rejected": 1.788534164428711, "logps/chosen": -270.5740051269531, "logps/rejected": -933.327880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.053676605224609, "rewards/margins": 32.06562042236328, "rewards/rejected": -37.119300842285156, "step": 3803 }, { "epoch": 2.366407465007776, "grad_norm": 0.31292688846588135, "learning_rate": 1.1721991701244814e-06, "logits/chosen": -0.5484585165977478, "logits/rejected": 4.379980087280273, "logps/chosen": -338.7742919921875, "logps/rejected": -991.6512451171875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.910788536071777, "rewards/margins": 33.38456726074219, "rewards/rejected": -38.29535675048828, "step": 3804 }, { "epoch": 2.3670295489891133, "grad_norm": 0.5742166042327881, "learning_rate": 1.1710465652374368e-06, "logits/chosen": 1.608454704284668, "logits/rejected": 3.2188591957092285, "logps/chosen": -593.5069580078125, "logps/rejected": -979.3258666992188, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -9.885505676269531, "rewards/margins": 25.568574905395508, "rewards/rejected": -35.45408248901367, "step": 3805 }, { "epoch": 2.367651632970451, "grad_norm": 0.19061534106731415, "learning_rate": 1.169893960350392e-06, "logits/chosen": 0.8193325400352478, "logits/rejected": 4.64123010635376, "logps/chosen": -569.2249755859375, "logps/rejected": -1197.062744140625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -9.428016662597656, "rewards/margins": 36.00348663330078, "rewards/rejected": -45.43150329589844, "step": 3806 }, { "epoch": 2.3682737169517885, "grad_norm": 0.040032364428043365, "learning_rate": 1.1687413554633473e-06, "logits/chosen": 3.153968334197998, "logits/rejected": 4.467193126678467, "logps/chosen": -686.541259765625, "logps/rejected": -936.1351928710938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.390555381774902, "rewards/margins": 24.287389755249023, "rewards/rejected": -35.67794418334961, "step": 3807 }, { "epoch": 2.368895800933126, "grad_norm": 0.6300176978111267, "learning_rate": 1.1675887505763025e-06, "logits/chosen": -2.918030261993408, "logits/rejected": 2.932901620864868, "logps/chosen": -445.3351745605469, "logps/rejected": -1078.980224609375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -8.955216407775879, "rewards/margins": 36.79884338378906, "rewards/rejected": -45.754058837890625, "step": 3808 }, { "epoch": 2.3695178849144636, "grad_norm": 0.00032534674392081797, "learning_rate": 1.166436145689258e-06, "logits/chosen": 0.1272832453250885, "logits/rejected": 2.286240577697754, "logps/chosen": -382.1531066894531, "logps/rejected": -825.8199462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.393211364746094, "rewards/margins": 29.338640213012695, "rewards/rejected": -36.731849670410156, "step": 3809 }, { "epoch": 2.370139968895801, "grad_norm": 0.07315727323293686, "learning_rate": 1.1652835408022131e-06, "logits/chosen": -0.5689883828163147, "logits/rejected": 2.811748504638672, "logps/chosen": -570.2830200195312, "logps/rejected": -1137.9296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.510469436645508, "rewards/margins": 36.993011474609375, "rewards/rejected": -46.50347900390625, "step": 3810 }, { "epoch": 2.3707620528771383, "grad_norm": 23.180858612060547, "learning_rate": 1.1641309359151684e-06, "logits/chosen": 2.40103816986084, "logits/rejected": 5.413440704345703, "logps/chosen": -606.066650390625, "logps/rejected": -1113.716552734375, "loss": 0.1608, "rewards/accuracies": 0.875, "rewards/chosen": -10.312074661254883, "rewards/margins": 28.769729614257812, "rewards/rejected": -39.08180236816406, "step": 3811 }, { "epoch": 2.371384136858476, "grad_norm": 0.0006819193949922919, "learning_rate": 1.1629783310281236e-06, "logits/chosen": 1.8856199979782104, "logits/rejected": 3.725149631500244, "logps/chosen": -626.3080444335938, "logps/rejected": -1122.8994140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.148897171020508, "rewards/margins": 37.54759216308594, "rewards/rejected": -46.69648742675781, "step": 3812 }, { "epoch": 2.3720062208398134, "grad_norm": 27.375221252441406, "learning_rate": 1.161825726141079e-06, "logits/chosen": -0.32031482458114624, "logits/rejected": 3.144876003265381, "logps/chosen": -426.2165222167969, "logps/rejected": -1021.3775634765625, "loss": 0.3604, "rewards/accuracies": 0.875, "rewards/chosen": -9.422128677368164, "rewards/margins": 35.854530334472656, "rewards/rejected": -45.27666091918945, "step": 3813 }, { "epoch": 2.3726283048211507, "grad_norm": 0.6310350298881531, "learning_rate": 1.1606731212540343e-06, "logits/chosen": -2.3323371410369873, "logits/rejected": 1.3802802562713623, "logps/chosen": -420.9634704589844, "logps/rejected": -897.345703125, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -6.875608444213867, "rewards/margins": 27.24919319152832, "rewards/rejected": -34.12480163574219, "step": 3814 }, { "epoch": 2.3732503888024885, "grad_norm": 21.276466369628906, "learning_rate": 1.1595205163669895e-06, "logits/chosen": 1.3786993026733398, "logits/rejected": 2.2478296756744385, "logps/chosen": -536.0009765625, "logps/rejected": -677.1472778320312, "loss": 0.0837, "rewards/accuracies": 1.0, "rewards/chosen": -10.205318450927734, "rewards/margins": 18.344764709472656, "rewards/rejected": -28.55008316040039, "step": 3815 }, { "epoch": 2.373872472783826, "grad_norm": 0.13539540767669678, "learning_rate": 1.158367911479945e-06, "logits/chosen": 1.7972712516784668, "logits/rejected": 4.250777244567871, "logps/chosen": -696.4157104492188, "logps/rejected": -1102.195068359375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.6190409660339355, "rewards/margins": 29.232839584350586, "rewards/rejected": -34.85187911987305, "step": 3816 }, { "epoch": 2.374494556765163, "grad_norm": 0.0012157351011410356, "learning_rate": 1.1572153065929001e-06, "logits/chosen": 1.0089893341064453, "logits/rejected": 3.8036298751831055, "logps/chosen": -535.8660888671875, "logps/rejected": -858.5618286132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.22950553894043, "rewards/margins": 25.268936157226562, "rewards/rejected": -36.498443603515625, "step": 3817 }, { "epoch": 2.3751166407465005, "grad_norm": 7.838989404262975e-06, "learning_rate": 1.1560627017058554e-06, "logits/chosen": -3.151318311691284, "logits/rejected": 2.062565326690674, "logps/chosen": -290.9604187011719, "logps/rejected": -1047.005126953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.1016693115234375, "rewards/margins": 37.82139205932617, "rewards/rejected": -43.923065185546875, "step": 3818 }, { "epoch": 2.3757387247278383, "grad_norm": 0.09899670630693436, "learning_rate": 1.1549100968188106e-06, "logits/chosen": -1.221901535987854, "logits/rejected": 2.648987293243408, "logps/chosen": -322.9440612792969, "logps/rejected": -714.6885375976562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.6279449462890625, "rewards/margins": 21.054927825927734, "rewards/rejected": -26.682872772216797, "step": 3819 }, { "epoch": 2.3763608087091757, "grad_norm": 0.756338894367218, "learning_rate": 1.153757491931766e-06, "logits/chosen": -2.508833885192871, "logits/rejected": 3.401827573776245, "logps/chosen": -351.54278564453125, "logps/rejected": -971.5526733398438, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -7.598222732543945, "rewards/margins": 28.914169311523438, "rewards/rejected": -36.51239013671875, "step": 3820 }, { "epoch": 2.3769828926905134, "grad_norm": 0.47002577781677246, "learning_rate": 1.1526048870447213e-06, "logits/chosen": 0.4595339000225067, "logits/rejected": 2.387575149536133, "logps/chosen": -424.815185546875, "logps/rejected": -771.0914916992188, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -8.066505432128906, "rewards/margins": 25.422027587890625, "rewards/rejected": -33.48853302001953, "step": 3821 }, { "epoch": 2.377604976671851, "grad_norm": 5.515254088095389e-05, "learning_rate": 1.1514522821576765e-06, "logits/chosen": -2.9623970985412598, "logits/rejected": 2.3023247718811035, "logps/chosen": -388.5247802734375, "logps/rejected": -1023.3239135742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.27711296081543, "rewards/margins": 34.165740966796875, "rewards/rejected": -42.44285202026367, "step": 3822 }, { "epoch": 2.378227060653188, "grad_norm": 26.249530792236328, "learning_rate": 1.1502996772706317e-06, "logits/chosen": 0.5427840948104858, "logits/rejected": 3.053314208984375, "logps/chosen": -576.7138671875, "logps/rejected": -1053.32080078125, "loss": 0.1586, "rewards/accuracies": 0.875, "rewards/chosen": -11.067059516906738, "rewards/margins": 28.13719940185547, "rewards/rejected": -39.20425796508789, "step": 3823 }, { "epoch": 2.3788491446345255, "grad_norm": 0.00016750558279454708, "learning_rate": 1.149147072383587e-06, "logits/chosen": -0.615725576877594, "logits/rejected": 1.1009749174118042, "logps/chosen": -504.9210205078125, "logps/rejected": -933.6993408203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.970019340515137, "rewards/margins": 28.58142852783203, "rewards/rejected": -34.551448822021484, "step": 3824 }, { "epoch": 2.3794712286158632, "grad_norm": 0.01796378567814827, "learning_rate": 1.1479944674965422e-06, "logits/chosen": 0.3534300923347473, "logits/rejected": 1.401143193244934, "logps/chosen": -554.0580444335938, "logps/rejected": -919.526123046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.12806510925293, "rewards/margins": 26.107044219970703, "rewards/rejected": -36.235111236572266, "step": 3825 }, { "epoch": 2.3800933125972006, "grad_norm": 0.008272453211247921, "learning_rate": 1.1468418626094976e-06, "logits/chosen": 1.3457462787628174, "logits/rejected": 2.418550491333008, "logps/chosen": -616.002685546875, "logps/rejected": -895.5050659179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.227404594421387, "rewards/margins": 24.576496124267578, "rewards/rejected": -34.80390167236328, "step": 3826 }, { "epoch": 2.380715396578538, "grad_norm": 1.474096417427063, "learning_rate": 1.1456892577224528e-06, "logits/chosen": 0.9951915144920349, "logits/rejected": 3.792121410369873, "logps/chosen": -609.1929931640625, "logps/rejected": -1121.0908203125, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -8.0488920211792, "rewards/margins": 34.88578796386719, "rewards/rejected": -42.9346809387207, "step": 3827 }, { "epoch": 2.3813374805598757, "grad_norm": 4.4082865715026855, "learning_rate": 1.144536652835408e-06, "logits/chosen": 0.9247102737426758, "logits/rejected": 2.380889892578125, "logps/chosen": -692.318359375, "logps/rejected": -1045.686279296875, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": -11.291966438293457, "rewards/margins": 29.624252319335938, "rewards/rejected": -40.916221618652344, "step": 3828 }, { "epoch": 2.381959564541213, "grad_norm": 0.037429846823215485, "learning_rate": 1.1433840479483633e-06, "logits/chosen": -2.244234323501587, "logits/rejected": 3.7790298461914062, "logps/chosen": -458.18402099609375, "logps/rejected": -1182.460693359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.688623428344727, "rewards/margins": 36.90248489379883, "rewards/rejected": -44.59111022949219, "step": 3829 }, { "epoch": 2.3825816485225504, "grad_norm": 3.2809522963361815e-05, "learning_rate": 1.1422314430613187e-06, "logits/chosen": 0.8496948480606079, "logits/rejected": 3.935314655303955, "logps/chosen": -696.5582275390625, "logps/rejected": -1325.233154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.319162368774414, "rewards/margins": 40.249549865722656, "rewards/rejected": -50.56871032714844, "step": 3830 }, { "epoch": 2.383203732503888, "grad_norm": 1.7225253031938337e-05, "learning_rate": 1.141078838174274e-06, "logits/chosen": 1.3808107376098633, "logits/rejected": 1.7068805694580078, "logps/chosen": -491.5025634765625, "logps/rejected": -781.4146118164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.127446174621582, "rewards/margins": 24.558727264404297, "rewards/rejected": -30.68617057800293, "step": 3831 }, { "epoch": 2.3838258164852255, "grad_norm": 0.08149093389511108, "learning_rate": 1.1399262332872291e-06, "logits/chosen": 0.25885581970214844, "logits/rejected": 4.693746089935303, "logps/chosen": -454.82904052734375, "logps/rejected": -899.7261962890625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.123469352722168, "rewards/margins": 25.705184936523438, "rewards/rejected": -32.82865524291992, "step": 3832 }, { "epoch": 2.384447900466563, "grad_norm": 0.09399059414863586, "learning_rate": 1.1387736284001846e-06, "logits/chosen": -1.8209072351455688, "logits/rejected": 1.4271893501281738, "logps/chosen": -344.51904296875, "logps/rejected": -894.5800170898438, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.07096529006958, "rewards/margins": 31.631628036499023, "rewards/rejected": -37.70259094238281, "step": 3833 }, { "epoch": 2.3850699844479006, "grad_norm": 31.503684997558594, "learning_rate": 1.1376210235131398e-06, "logits/chosen": 1.3686542510986328, "logits/rejected": 4.16377592086792, "logps/chosen": -628.8628540039062, "logps/rejected": -1065.0404052734375, "loss": 0.2273, "rewards/accuracies": 0.875, "rewards/chosen": -11.978160858154297, "rewards/margins": 29.86235809326172, "rewards/rejected": -41.84052276611328, "step": 3834 }, { "epoch": 2.385692068429238, "grad_norm": 0.10063165426254272, "learning_rate": 1.136468418626095e-06, "logits/chosen": -2.5170469284057617, "logits/rejected": 3.1774446964263916, "logps/chosen": -293.3402404785156, "logps/rejected": -847.16455078125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -5.719707489013672, "rewards/margins": 25.059696197509766, "rewards/rejected": -30.779403686523438, "step": 3835 }, { "epoch": 2.3863141524105753, "grad_norm": 0.06871528923511505, "learning_rate": 1.1353158137390503e-06, "logits/chosen": 0.4909989833831787, "logits/rejected": 4.08764123916626, "logps/chosen": -477.0809631347656, "logps/rejected": -993.2660522460938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.19843864440918, "rewards/margins": 28.097043991088867, "rewards/rejected": -37.29548645019531, "step": 3836 }, { "epoch": 2.386936236391913, "grad_norm": 3.714032192903005e-09, "learning_rate": 1.1341632088520057e-06, "logits/chosen": 2.953510284423828, "logits/rejected": 3.4700748920440674, "logps/chosen": -828.0033569335938, "logps/rejected": -1372.037353515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.156700134277344, "rewards/margins": 40.094810485839844, "rewards/rejected": -58.25151443481445, "step": 3837 }, { "epoch": 2.3875583203732504, "grad_norm": 3.741096258163452, "learning_rate": 1.133010603964961e-06, "logits/chosen": 2.4105310440063477, "logits/rejected": 2.8894474506378174, "logps/chosen": -609.405029296875, "logps/rejected": -775.5403442382812, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -11.283312797546387, "rewards/margins": 15.006702423095703, "rewards/rejected": -26.290014266967773, "step": 3838 }, { "epoch": 2.388180404354588, "grad_norm": 0.0006594893056899309, "learning_rate": 1.1318579990779161e-06, "logits/chosen": 1.8672668933868408, "logits/rejected": 3.5272293090820312, "logps/chosen": -618.9702758789062, "logps/rejected": -1041.2235107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.7736897468566895, "rewards/margins": 31.124055862426758, "rewards/rejected": -38.897743225097656, "step": 3839 }, { "epoch": 2.3888024883359256, "grad_norm": 1.4465635633786889e-10, "learning_rate": 1.1307053941908714e-06, "logits/chosen": -0.799127459526062, "logits/rejected": 3.0651865005493164, "logps/chosen": -531.7684326171875, "logps/rejected": -1095.8377685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.762605667114258, "rewards/margins": 38.612545013427734, "rewards/rejected": -47.375152587890625, "step": 3840 }, { "epoch": 2.389424572317263, "grad_norm": 3.5946497973782243e-06, "learning_rate": 1.1295527893038268e-06, "logits/chosen": -0.4164998531341553, "logits/rejected": 2.9457249641418457, "logps/chosen": -339.09429931640625, "logps/rejected": -858.0877075195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.4706830978393555, "rewards/margins": 29.20252799987793, "rewards/rejected": -33.67321014404297, "step": 3841 }, { "epoch": 2.3900466562986002, "grad_norm": 0.37519222497940063, "learning_rate": 1.128400184416782e-06, "logits/chosen": -0.43005144596099854, "logits/rejected": 4.186233043670654, "logps/chosen": -359.9776916503906, "logps/rejected": -879.0882568359375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.06336498260498, "rewards/margins": 25.423934936523438, "rewards/rejected": -33.487300872802734, "step": 3842 }, { "epoch": 2.3906687402799376, "grad_norm": 2.3764117941027507e-05, "learning_rate": 1.1272475795297373e-06, "logits/chosen": -2.9976296424865723, "logits/rejected": 2.8074846267700195, "logps/chosen": -442.12554931640625, "logps/rejected": -1336.9715576171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.334522247314453, "rewards/margins": 46.972328186035156, "rewards/rejected": -61.306854248046875, "step": 3843 }, { "epoch": 2.3912908242612754, "grad_norm": 0.03182898834347725, "learning_rate": 1.1260949746426927e-06, "logits/chosen": 0.5130317807197571, "logits/rejected": 2.4486565589904785, "logps/chosen": -550.2568969726562, "logps/rejected": -907.3411865234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.3552885055542, "rewards/margins": 28.428510665893555, "rewards/rejected": -36.78379821777344, "step": 3844 }, { "epoch": 2.3919129082426127, "grad_norm": 0.09916325658559799, "learning_rate": 1.124942369755648e-06, "logits/chosen": -0.9081926345825195, "logits/rejected": 3.497873306274414, "logps/chosen": -487.79779052734375, "logps/rejected": -1203.325927734375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.939855575561523, "rewards/margins": 35.823936462402344, "rewards/rejected": -40.7637939453125, "step": 3845 }, { "epoch": 2.39253499222395, "grad_norm": 0.006481132935732603, "learning_rate": 1.1237897648686031e-06, "logits/chosen": -3.0908753871917725, "logits/rejected": 2.7621371746063232, "logps/chosen": -401.870849609375, "logps/rejected": -1008.1326904296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.672129154205322, "rewards/margins": 31.289287567138672, "rewards/rejected": -37.9614143371582, "step": 3846 }, { "epoch": 2.393157076205288, "grad_norm": 7.337146598729305e-06, "learning_rate": 1.1226371599815584e-06, "logits/chosen": -0.8336185216903687, "logits/rejected": 2.953880548477173, "logps/chosen": -547.261962890625, "logps/rejected": -1239.45263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.123262405395508, "rewards/margins": 46.47114181518555, "rewards/rejected": -55.59440612792969, "step": 3847 }, { "epoch": 2.393779160186625, "grad_norm": 2.9044236725894734e-05, "learning_rate": 1.1214845550945138e-06, "logits/chosen": 1.4980781078338623, "logits/rejected": 3.6079821586608887, "logps/chosen": -619.3064575195312, "logps/rejected": -1130.49755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.297479629516602, "rewards/margins": 36.90601348876953, "rewards/rejected": -49.20349884033203, "step": 3848 }, { "epoch": 2.3944012441679625, "grad_norm": 1.0253877639770508, "learning_rate": 1.120331950207469e-06, "logits/chosen": 1.0810809135437012, "logits/rejected": 4.039799690246582, "logps/chosen": -595.2739868164062, "logps/rejected": -1004.406982421875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -8.904537200927734, "rewards/margins": 25.0423583984375, "rewards/rejected": -33.946895599365234, "step": 3849 }, { "epoch": 2.3950233281493003, "grad_norm": 0.003972693812102079, "learning_rate": 1.1191793453204243e-06, "logits/chosen": 0.4832313358783722, "logits/rejected": 2.6967647075653076, "logps/chosen": -546.1810913085938, "logps/rejected": -988.8588256835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.466804504394531, "rewards/margins": 35.22352600097656, "rewards/rejected": -43.690330505371094, "step": 3850 }, { "epoch": 2.3956454121306376, "grad_norm": 0.010902078822255135, "learning_rate": 1.1180267404333795e-06, "logits/chosen": -1.4263477325439453, "logits/rejected": 1.1521247625350952, "logps/chosen": -418.4461364746094, "logps/rejected": -798.664794921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.85601806640625, "rewards/margins": 20.761884689331055, "rewards/rejected": -30.617904663085938, "step": 3851 }, { "epoch": 2.396267496111975, "grad_norm": 0.0003886328195221722, "learning_rate": 1.116874135546335e-06, "logits/chosen": 1.5249922275543213, "logits/rejected": 3.464700698852539, "logps/chosen": -502.8961181640625, "logps/rejected": -1032.2509765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.9890570640563965, "rewards/margins": 36.86589431762695, "rewards/rejected": -42.854949951171875, "step": 3852 }, { "epoch": 2.3968895800933128, "grad_norm": 0.0013575759949162602, "learning_rate": 1.1157215306592901e-06, "logits/chosen": -0.5717923641204834, "logits/rejected": 3.2800910472869873, "logps/chosen": -446.5240478515625, "logps/rejected": -898.409423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.479028701782227, "rewards/margins": 28.62911605834961, "rewards/rejected": -37.10814666748047, "step": 3853 }, { "epoch": 2.39751166407465, "grad_norm": 33.908782958984375, "learning_rate": 1.1145689257722454e-06, "logits/chosen": 1.4796611070632935, "logits/rejected": 2.9911272525787354, "logps/chosen": -736.0274658203125, "logps/rejected": -1020.3888549804688, "loss": 0.7557, "rewards/accuracies": 0.875, "rewards/chosen": -17.031911849975586, "rewards/margins": 19.36725616455078, "rewards/rejected": -36.399169921875, "step": 3854 }, { "epoch": 2.3981337480559874, "grad_norm": 4.587909643305466e-06, "learning_rate": 1.1134163208852008e-06, "logits/chosen": 0.44414985179901123, "logits/rejected": 3.9183576107025146, "logps/chosen": -463.86669921875, "logps/rejected": -967.0587768554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.996728897094727, "rewards/margins": 27.84886360168457, "rewards/rejected": -34.8455924987793, "step": 3855 }, { "epoch": 2.3987558320373252, "grad_norm": 5.694292326552386e-07, "learning_rate": 1.1122637159981558e-06, "logits/chosen": -0.7950145602226257, "logits/rejected": 3.101360559463501, "logps/chosen": -555.7838745117188, "logps/rejected": -1047.280029296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.646661758422852, "rewards/margins": 28.201671600341797, "rewards/rejected": -36.84833526611328, "step": 3856 }, { "epoch": 2.3993779160186626, "grad_norm": 4.2507390389801e-06, "learning_rate": 1.111111111111111e-06, "logits/chosen": 3.1061859130859375, "logits/rejected": 3.111227035522461, "logps/chosen": -794.248779296875, "logps/rejected": -1133.7769775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.302314758300781, "rewards/margins": 34.15514373779297, "rewards/rejected": -45.457462310791016, "step": 3857 }, { "epoch": 2.4, "grad_norm": 0.0018757757497951388, "learning_rate": 1.1099585062240665e-06, "logits/chosen": 0.4937957525253296, "logits/rejected": 3.476731300354004, "logps/chosen": -680.0784301757812, "logps/rejected": -1116.152099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.059968948364258, "rewards/margins": 34.34630584716797, "rewards/rejected": -42.40627670288086, "step": 3858 }, { "epoch": 2.4006220839813377, "grad_norm": 0.0005473470664583147, "learning_rate": 1.1088059013370217e-06, "logits/chosen": 0.9986388087272644, "logits/rejected": 3.233471155166626, "logps/chosen": -412.994873046875, "logps/rejected": -988.3786010742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.322336673736572, "rewards/margins": 37.020713806152344, "rewards/rejected": -44.343048095703125, "step": 3859 }, { "epoch": 2.401244167962675, "grad_norm": 2.3679943339516285e-08, "learning_rate": 1.107653296449977e-06, "logits/chosen": 1.7901991605758667, "logits/rejected": 3.3682918548583984, "logps/chosen": -713.5994873046875, "logps/rejected": -1195.774658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.00442886352539, "rewards/margins": 42.90016555786133, "rewards/rejected": -55.90459442138672, "step": 3860 }, { "epoch": 2.4018662519440124, "grad_norm": 52.9873046875, "learning_rate": 1.1065006915629324e-06, "logits/chosen": 0.24488091468811035, "logits/rejected": 1.826075792312622, "logps/chosen": -396.94622802734375, "logps/rejected": -682.343017578125, "loss": 0.7565, "rewards/accuracies": 0.875, "rewards/chosen": -4.514383316040039, "rewards/margins": 20.130586624145508, "rewards/rejected": -24.644969940185547, "step": 3861 }, { "epoch": 2.4024883359253497, "grad_norm": 0.00010500989446882159, "learning_rate": 1.1053480866758876e-06, "logits/chosen": -0.6162758469581604, "logits/rejected": 2.7350568771362305, "logps/chosen": -506.5060119628906, "logps/rejected": -1002.4456787109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.694830894470215, "rewards/margins": 31.44960594177246, "rewards/rejected": -37.14443588256836, "step": 3862 }, { "epoch": 2.4031104199066875, "grad_norm": 0.437195360660553, "learning_rate": 1.1041954817888428e-06, "logits/chosen": -2.221411943435669, "logits/rejected": 2.240596294403076, "logps/chosen": -397.92572021484375, "logps/rejected": -886.3359375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.60426139831543, "rewards/margins": 26.445598602294922, "rewards/rejected": -34.049861907958984, "step": 3863 }, { "epoch": 2.403732503888025, "grad_norm": 0.0330638512969017, "learning_rate": 1.103042876901798e-06, "logits/chosen": -1.5246604681015015, "logits/rejected": 3.694162368774414, "logps/chosen": -434.1704406738281, "logps/rejected": -949.82373046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.017059326171875, "rewards/margins": 29.48417854309082, "rewards/rejected": -37.50123596191406, "step": 3864 }, { "epoch": 2.404354587869362, "grad_norm": 0.0012657060287892818, "learning_rate": 1.1018902720147535e-06, "logits/chosen": -0.6471145153045654, "logits/rejected": 2.905531167984009, "logps/chosen": -530.5743408203125, "logps/rejected": -1086.3309326171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.827986717224121, "rewards/margins": 35.81536865234375, "rewards/rejected": -43.64336013793945, "step": 3865 }, { "epoch": 2.4049766718507, "grad_norm": 9.531051858857609e-08, "learning_rate": 1.1007376671277087e-06, "logits/chosen": 0.3572736084461212, "logits/rejected": 3.8673791885375977, "logps/chosen": -654.2926025390625, "logps/rejected": -1154.85888671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.924564361572266, "rewards/margins": 35.805335998535156, "rewards/rejected": -44.729896545410156, "step": 3866 }, { "epoch": 2.4055987558320373, "grad_norm": 0.0006852780352346599, "learning_rate": 1.099585062240664e-06, "logits/chosen": -1.9400222301483154, "logits/rejected": 2.281996726989746, "logps/chosen": -417.958984375, "logps/rejected": -963.850830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.913776397705078, "rewards/margins": 31.837993621826172, "rewards/rejected": -37.75177001953125, "step": 3867 }, { "epoch": 2.4062208398133746, "grad_norm": 0.7762795090675354, "learning_rate": 1.0984324573536194e-06, "logits/chosen": -1.367218255996704, "logits/rejected": 2.897139072418213, "logps/chosen": -401.431884765625, "logps/rejected": -1006.3223876953125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -8.719179153442383, "rewards/margins": 31.281005859375, "rewards/rejected": -40.000186920166016, "step": 3868 }, { "epoch": 2.4068429237947124, "grad_norm": 1.8480974176782183e-05, "learning_rate": 1.0972798524665746e-06, "logits/chosen": 0.2994877099990845, "logits/rejected": 2.9668407440185547, "logps/chosen": -570.2740478515625, "logps/rejected": -981.5447998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.978969573974609, "rewards/margins": 30.225505828857422, "rewards/rejected": -35.204471588134766, "step": 3869 }, { "epoch": 2.4074650077760498, "grad_norm": 0.4169832170009613, "learning_rate": 1.0961272475795298e-06, "logits/chosen": -2.425063133239746, "logits/rejected": 2.970721960067749, "logps/chosen": -316.7384948730469, "logps/rejected": -876.7730712890625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -5.8234100341796875, "rewards/margins": 26.405567169189453, "rewards/rejected": -32.228973388671875, "step": 3870 }, { "epoch": 2.408087091757387, "grad_norm": 0.008011666126549244, "learning_rate": 1.094974642692485e-06, "logits/chosen": -1.890117883682251, "logits/rejected": 1.9428832530975342, "logps/chosen": -498.21478271484375, "logps/rejected": -1003.7666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.868483543395996, "rewards/margins": 26.999061584472656, "rewards/rejected": -36.8675422668457, "step": 3871 }, { "epoch": 2.408709175738725, "grad_norm": 0.014141597785055637, "learning_rate": 1.0938220378054405e-06, "logits/chosen": -3.8100922107696533, "logits/rejected": 1.1407690048217773, "logps/chosen": -336.95751953125, "logps/rejected": -872.5132446289062, "loss": 0.0867, "rewards/accuracies": 0.875, "rewards/chosen": -6.890573501586914, "rewards/margins": 26.666492462158203, "rewards/rejected": -33.55706787109375, "step": 3872 }, { "epoch": 2.4093312597200622, "grad_norm": 0.0007986845448613167, "learning_rate": 1.0926694329183957e-06, "logits/chosen": -2.3703830242156982, "logits/rejected": 2.6667072772979736, "logps/chosen": -360.1922302246094, "logps/rejected": -948.6698608398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.256704330444336, "rewards/margins": 34.69185256958008, "rewards/rejected": -42.94855499267578, "step": 3873 }, { "epoch": 2.4099533437013996, "grad_norm": 0.1704350709915161, "learning_rate": 1.091516828031351e-06, "logits/chosen": 1.2536743879318237, "logits/rejected": 1.5354732275009155, "logps/chosen": -570.9561157226562, "logps/rejected": -839.4619750976562, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -11.871989250183105, "rewards/margins": 22.955930709838867, "rewards/rejected": -34.827919006347656, "step": 3874 }, { "epoch": 2.4105754276827374, "grad_norm": 0.0066949715837836266, "learning_rate": 1.0903642231443061e-06, "logits/chosen": -1.0441203117370605, "logits/rejected": 4.152888298034668, "logps/chosen": -414.62628173828125, "logps/rejected": -960.6255493164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.822000503540039, "rewards/margins": 23.903718948364258, "rewards/rejected": -31.725719451904297, "step": 3875 }, { "epoch": 2.4111975116640747, "grad_norm": 0.07872223109006882, "learning_rate": 1.0892116182572616e-06, "logits/chosen": 0.5356428623199463, "logits/rejected": 2.545949697494507, "logps/chosen": -443.02252197265625, "logps/rejected": -925.462646484375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.831305503845215, "rewards/margins": 27.70733070373535, "rewards/rejected": -34.53863525390625, "step": 3876 }, { "epoch": 2.411819595645412, "grad_norm": 0.0002740553754847497, "learning_rate": 1.0880590133702168e-06, "logits/chosen": 1.6327204704284668, "logits/rejected": 4.18537712097168, "logps/chosen": -581.2978515625, "logps/rejected": -1052.298583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.621686935424805, "rewards/margins": 25.028202056884766, "rewards/rejected": -34.64988708496094, "step": 3877 }, { "epoch": 2.41244167962675, "grad_norm": 0.010209181345999241, "learning_rate": 1.086906408483172e-06, "logits/chosen": -0.8912521600723267, "logits/rejected": 3.182377815246582, "logps/chosen": -574.5421142578125, "logps/rejected": -1049.0518798828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.360973358154297, "rewards/margins": 24.6096134185791, "rewards/rejected": -34.970584869384766, "step": 3878 }, { "epoch": 2.413063763608087, "grad_norm": 1.7555263184476644e-05, "learning_rate": 1.0857538035961275e-06, "logits/chosen": 1.3117332458496094, "logits/rejected": 5.049984931945801, "logps/chosen": -458.4806213378906, "logps/rejected": -928.1309814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.135470390319824, "rewards/margins": 27.78437614440918, "rewards/rejected": -33.91984558105469, "step": 3879 }, { "epoch": 2.4136858475894245, "grad_norm": 1.8339464664459229, "learning_rate": 1.0846011987090827e-06, "logits/chosen": -0.8407329320907593, "logits/rejected": 2.9085705280303955, "logps/chosen": -490.6579895019531, "logps/rejected": -1031.06787109375, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -7.46267032623291, "rewards/margins": 32.82110595703125, "rewards/rejected": -40.283775329589844, "step": 3880 }, { "epoch": 2.414307931570762, "grad_norm": 1.7781831047614105e-05, "learning_rate": 1.083448593822038e-06, "logits/chosen": -0.6590343713760376, "logits/rejected": 2.556335926055908, "logps/chosen": -533.1664428710938, "logps/rejected": -1097.8114013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.707242965698242, "rewards/margins": 37.932315826416016, "rewards/rejected": -48.639564514160156, "step": 3881 }, { "epoch": 2.4149300155520996, "grad_norm": 0.018035000190138817, "learning_rate": 1.0822959889349931e-06, "logits/chosen": 0.9073091149330139, "logits/rejected": 2.709404945373535, "logps/chosen": -545.5582885742188, "logps/rejected": -924.0416870117188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.252019882202148, "rewards/margins": 27.85335922241211, "rewards/rejected": -36.10538101196289, "step": 3882 }, { "epoch": 2.415552099533437, "grad_norm": 0.00019639487436506897, "learning_rate": 1.0811433840479486e-06, "logits/chosen": 0.6960997581481934, "logits/rejected": 3.6727519035339355, "logps/chosen": -628.2368774414062, "logps/rejected": -1225.331298828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.303197860717773, "rewards/margins": 40.59136199951172, "rewards/rejected": -49.89455795288086, "step": 3883 }, { "epoch": 2.4161741835147743, "grad_norm": 0.02851576916873455, "learning_rate": 1.0799907791609038e-06, "logits/chosen": 0.7198000550270081, "logits/rejected": 2.796893835067749, "logps/chosen": -575.7562255859375, "logps/rejected": -1004.9088745117188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.440130233764648, "rewards/margins": 30.360107421875, "rewards/rejected": -38.80023956298828, "step": 3884 }, { "epoch": 2.416796267496112, "grad_norm": 0.005485597066581249, "learning_rate": 1.078838174273859e-06, "logits/chosen": -1.3771247863769531, "logits/rejected": 2.6898508071899414, "logps/chosen": -405.0865478515625, "logps/rejected": -927.6209716796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.677170753479004, "rewards/margins": 31.92575454711914, "rewards/rejected": -38.60292434692383, "step": 3885 }, { "epoch": 2.4174183514774494, "grad_norm": 0.0021522575989365578, "learning_rate": 1.0776855693868142e-06, "logits/chosen": 1.534578561782837, "logits/rejected": 3.6858203411102295, "logps/chosen": -449.2783203125, "logps/rejected": -860.0929565429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.283685684204102, "rewards/margins": 24.695194244384766, "rewards/rejected": -33.978878021240234, "step": 3886 }, { "epoch": 2.4180404354587868, "grad_norm": 0.00910354033112526, "learning_rate": 1.0765329644997697e-06, "logits/chosen": -2.305527448654175, "logits/rejected": 1.6769919395446777, "logps/chosen": -355.90106201171875, "logps/rejected": -976.04443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.1197309494018555, "rewards/margins": 37.60846710205078, "rewards/rejected": -44.72819900512695, "step": 3887 }, { "epoch": 2.4186625194401246, "grad_norm": 1.590985831967373e-08, "learning_rate": 1.075380359612725e-06, "logits/chosen": -0.454487681388855, "logits/rejected": 3.099392890930176, "logps/chosen": -555.4652099609375, "logps/rejected": -1095.21240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.411281585693359, "rewards/margins": 31.243457794189453, "rewards/rejected": -37.65473937988281, "step": 3888 }, { "epoch": 2.419284603421462, "grad_norm": 0.0012218141928315163, "learning_rate": 1.0742277547256801e-06, "logits/chosen": -0.7278669476509094, "logits/rejected": 1.473168969154358, "logps/chosen": -443.8373107910156, "logps/rejected": -928.9570922851562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.763683795928955, "rewards/margins": 28.198726654052734, "rewards/rejected": -35.96240997314453, "step": 3889 }, { "epoch": 2.4199066874027992, "grad_norm": 4.316197009757161e-05, "learning_rate": 1.0730751498386354e-06, "logits/chosen": 0.8848274946212769, "logits/rejected": 3.4951579570770264, "logps/chosen": -535.738525390625, "logps/rejected": -1054.58544921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.603253364562988, "rewards/margins": 32.85493469238281, "rewards/rejected": -42.458187103271484, "step": 3890 }, { "epoch": 2.420528771384137, "grad_norm": 0.00023110301117412746, "learning_rate": 1.0719225449515906e-06, "logits/chosen": -2.57528018951416, "logits/rejected": 3.3211045265197754, "logps/chosen": -384.0837707519531, "logps/rejected": -999.553466796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.611985206604004, "rewards/margins": 29.31735610961914, "rewards/rejected": -38.92934036254883, "step": 3891 }, { "epoch": 2.4211508553654744, "grad_norm": 0.00010306945478077978, "learning_rate": 1.0707699400645458e-06, "logits/chosen": 0.6355469822883606, "logits/rejected": 3.7850501537323, "logps/chosen": -430.77984619140625, "logps/rejected": -878.0767822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.622600078582764, "rewards/margins": 27.778318405151367, "rewards/rejected": -33.400917053222656, "step": 3892 }, { "epoch": 2.4217729393468117, "grad_norm": 4.880786491412437e-06, "learning_rate": 1.0696173351775012e-06, "logits/chosen": -0.20533713698387146, "logits/rejected": 2.7818212509155273, "logps/chosen": -484.79132080078125, "logps/rejected": -1032.011474609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.599923133850098, "rewards/margins": 35.451377868652344, "rewards/rejected": -46.051307678222656, "step": 3893 }, { "epoch": 2.4223950233281495, "grad_norm": 0.0029728247318416834, "learning_rate": 1.0684647302904565e-06, "logits/chosen": -1.0218257904052734, "logits/rejected": 1.424391269683838, "logps/chosen": -387.3336181640625, "logps/rejected": -822.9134521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.986425876617432, "rewards/margins": 28.502716064453125, "rewards/rejected": -35.48914337158203, "step": 3894 }, { "epoch": 2.423017107309487, "grad_norm": 3.79450602849829e-06, "learning_rate": 1.0673121254034117e-06, "logits/chosen": 1.6423513889312744, "logits/rejected": 2.8541207313537598, "logps/chosen": -503.12030029296875, "logps/rejected": -846.1358032226562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.122786045074463, "rewards/margins": 26.406681060791016, "rewards/rejected": -33.52946472167969, "step": 3895 }, { "epoch": 2.423639191290824, "grad_norm": 0.07275314629077911, "learning_rate": 1.0661595205163671e-06, "logits/chosen": -0.7622900009155273, "logits/rejected": 3.9825644493103027, "logps/chosen": -450.01629638671875, "logps/rejected": -1051.365234375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.296741485595703, "rewards/margins": 27.42078971862793, "rewards/rejected": -35.717533111572266, "step": 3896 }, { "epoch": 2.424261275272162, "grad_norm": 1.1182972192764282, "learning_rate": 1.0650069156293224e-06, "logits/chosen": 1.8294832706451416, "logits/rejected": -0.14126360416412354, "logps/chosen": -713.759033203125, "logps/rejected": -866.5440673828125, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -9.934208869934082, "rewards/margins": 23.563343048095703, "rewards/rejected": -33.49755096435547, "step": 3897 }, { "epoch": 2.4248833592534993, "grad_norm": 4.143659680266865e-06, "learning_rate": 1.0638543107422776e-06, "logits/chosen": -1.2462234497070312, "logits/rejected": 2.659334182739258, "logps/chosen": -287.9202880859375, "logps/rejected": -765.412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.872437000274658, "rewards/margins": 28.731220245361328, "rewards/rejected": -32.60365676879883, "step": 3898 }, { "epoch": 2.4255054432348366, "grad_norm": 4.975401225237874e-06, "learning_rate": 1.0627017058552328e-06, "logits/chosen": 1.7069756984710693, "logits/rejected": 3.591083526611328, "logps/chosen": -539.0305786132812, "logps/rejected": -873.288330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.884143352508545, "rewards/margins": 27.36844253540039, "rewards/rejected": -35.252586364746094, "step": 3899 }, { "epoch": 2.426127527216174, "grad_norm": 0.014776756055653095, "learning_rate": 1.0615491009681882e-06, "logits/chosen": -0.04439480975270271, "logits/rejected": 2.4738411903381348, "logps/chosen": -548.758544921875, "logps/rejected": -1041.307373046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.613590717315674, "rewards/margins": 30.745086669921875, "rewards/rejected": -37.35867691040039, "step": 3900 }, { "epoch": 2.4267496111975118, "grad_norm": 0.00023643742315471172, "learning_rate": 1.0603964960811435e-06, "logits/chosen": 1.3689160346984863, "logits/rejected": 4.3477959632873535, "logps/chosen": -502.134033203125, "logps/rejected": -981.1270751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.091121673583984, "rewards/margins": 33.34466552734375, "rewards/rejected": -40.435787200927734, "step": 3901 }, { "epoch": 2.427371695178849, "grad_norm": 0.24226395785808563, "learning_rate": 1.0592438911940987e-06, "logits/chosen": 2.8122005462646484, "logits/rejected": 5.458620071411133, "logps/chosen": -536.3717651367188, "logps/rejected": -1013.8870849609375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -10.13599967956543, "rewards/margins": 28.089576721191406, "rewards/rejected": -38.2255744934082, "step": 3902 }, { "epoch": 2.4279937791601864, "grad_norm": 2.852854095181101e-06, "learning_rate": 1.058091286307054e-06, "logits/chosen": -1.2406141757965088, "logits/rejected": 3.444408416748047, "logps/chosen": -490.6299743652344, "logps/rejected": -1048.380859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.690634727478027, "rewards/margins": 33.460086822509766, "rewards/rejected": -45.15072250366211, "step": 3903 }, { "epoch": 2.4286158631415242, "grad_norm": 0.05775154381990433, "learning_rate": 1.0569386814200094e-06, "logits/chosen": 1.434791922569275, "logits/rejected": 2.4212846755981445, "logps/chosen": -573.909423828125, "logps/rejected": -889.8311157226562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.915290355682373, "rewards/margins": 25.13017463684082, "rewards/rejected": -32.04546356201172, "step": 3904 }, { "epoch": 2.4292379471228616, "grad_norm": 0.019974645227193832, "learning_rate": 1.0557860765329646e-06, "logits/chosen": 1.4990663528442383, "logits/rejected": 2.492790699005127, "logps/chosen": -607.529296875, "logps/rejected": -915.8584594726562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.066529273986816, "rewards/margins": 22.119138717651367, "rewards/rejected": -29.185667037963867, "step": 3905 }, { "epoch": 2.429860031104199, "grad_norm": 0.0003226126718800515, "learning_rate": 1.0546334716459198e-06, "logits/chosen": 0.2538611590862274, "logits/rejected": 3.2478034496307373, "logps/chosen": -505.583251953125, "logps/rejected": -998.1884765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.00172233581543, "rewards/margins": 26.9884033203125, "rewards/rejected": -37.99012756347656, "step": 3906 }, { "epoch": 2.4304821150855367, "grad_norm": 0.36915433406829834, "learning_rate": 1.0534808667588752e-06, "logits/chosen": 0.7074757218360901, "logits/rejected": 2.855924606323242, "logps/chosen": -600.1455688476562, "logps/rejected": -972.5783081054688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -9.70722770690918, "rewards/margins": 26.310523986816406, "rewards/rejected": -36.01775360107422, "step": 3907 }, { "epoch": 2.431104199066874, "grad_norm": 1.5006874036771478e-06, "learning_rate": 1.0523282618718305e-06, "logits/chosen": -0.2986694574356079, "logits/rejected": 2.294739246368408, "logps/chosen": -521.6359252929688, "logps/rejected": -982.6089477539062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.400278091430664, "rewards/margins": 31.926898956298828, "rewards/rejected": -42.327178955078125, "step": 3908 }, { "epoch": 2.4317262830482114, "grad_norm": 0.000786642893217504, "learning_rate": 1.0511756569847857e-06, "logits/chosen": -1.4148194789886475, "logits/rejected": 0.4607158899307251, "logps/chosen": -343.89404296875, "logps/rejected": -714.0547485351562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.373026371002197, "rewards/margins": 23.576187133789062, "rewards/rejected": -28.949214935302734, "step": 3909 }, { "epoch": 2.432348367029549, "grad_norm": 8.006913185119629, "learning_rate": 1.050023052097741e-06, "logits/chosen": 2.225314140319824, "logits/rejected": 2.388673782348633, "logps/chosen": -775.02392578125, "logps/rejected": -1070.6405029296875, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": -12.142297744750977, "rewards/margins": 25.948198318481445, "rewards/rejected": -38.09049987792969, "step": 3910 }, { "epoch": 2.4329704510108865, "grad_norm": 0.15609769523143768, "learning_rate": 1.0488704472106964e-06, "logits/chosen": 1.5960623025894165, "logits/rejected": 3.988802671432495, "logps/chosen": -606.6814575195312, "logps/rejected": -972.6724243164062, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -9.931401252746582, "rewards/margins": 21.659502029418945, "rewards/rejected": -31.590904235839844, "step": 3911 }, { "epoch": 2.433592534992224, "grad_norm": 0.0025720945559442043, "learning_rate": 1.0477178423236516e-06, "logits/chosen": -0.44914060831069946, "logits/rejected": 3.133753776550293, "logps/chosen": -466.37982177734375, "logps/rejected": -967.54736328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.205682754516602, "rewards/margins": 31.63439178466797, "rewards/rejected": -37.84007263183594, "step": 3912 }, { "epoch": 2.4342146189735616, "grad_norm": 0.0004372483235783875, "learning_rate": 1.0465652374366068e-06, "logits/chosen": 0.052958518266677856, "logits/rejected": 4.425273895263672, "logps/chosen": -494.8092041015625, "logps/rejected": -966.57080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.776680946350098, "rewards/margins": 25.838638305664062, "rewards/rejected": -32.615318298339844, "step": 3913 }, { "epoch": 2.434836702954899, "grad_norm": 0.13159841299057007, "learning_rate": 1.045412632549562e-06, "logits/chosen": 0.2504895329475403, "logits/rejected": 2.2940564155578613, "logps/chosen": -595.7615356445312, "logps/rejected": -949.5869750976562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -10.673824310302734, "rewards/margins": 22.880603790283203, "rewards/rejected": -33.55442810058594, "step": 3914 }, { "epoch": 2.4354587869362363, "grad_norm": 0.024950237944722176, "learning_rate": 1.0442600276625175e-06, "logits/chosen": -0.04401680454611778, "logits/rejected": 3.7909388542175293, "logps/chosen": -532.6128540039062, "logps/rejected": -1144.41650390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.95002269744873, "rewards/margins": 38.728492736816406, "rewards/rejected": -47.67851257324219, "step": 3915 }, { "epoch": 2.436080870917574, "grad_norm": 8.862929098540917e-05, "learning_rate": 1.0431074227754727e-06, "logits/chosen": 1.5957598686218262, "logits/rejected": 3.401139497756958, "logps/chosen": -713.1761474609375, "logps/rejected": -1047.773193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.424721717834473, "rewards/margins": 27.84493064880371, "rewards/rejected": -39.2696533203125, "step": 3916 }, { "epoch": 2.4367029548989114, "grad_norm": 3.7480590435734484e-06, "learning_rate": 1.041954817888428e-06, "logits/chosen": -0.12272043526172638, "logits/rejected": 1.3370565176010132, "logps/chosen": -578.545166015625, "logps/rejected": -850.365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.4892683029174805, "rewards/margins": 22.693389892578125, "rewards/rejected": -29.18265724182129, "step": 3917 }, { "epoch": 2.4373250388802488, "grad_norm": 47.579586029052734, "learning_rate": 1.0408022130013833e-06, "logits/chosen": 2.956791877746582, "logits/rejected": 3.071183681488037, "logps/chosen": -763.1408081054688, "logps/rejected": -979.1478881835938, "loss": 1.6257, "rewards/accuracies": 0.875, "rewards/chosen": -13.412786483764648, "rewards/margins": 21.867210388183594, "rewards/rejected": -35.279998779296875, "step": 3918 }, { "epoch": 2.437947122861586, "grad_norm": 8.301009802380577e-05, "learning_rate": 1.0396496081143386e-06, "logits/chosen": -3.1810388565063477, "logits/rejected": 0.9512239694595337, "logps/chosen": -333.6005554199219, "logps/rejected": -829.9498901367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.965550422668457, "rewards/margins": 23.553924560546875, "rewards/rejected": -28.51947593688965, "step": 3919 }, { "epoch": 2.438569206842924, "grad_norm": 8.164431619661627e-08, "learning_rate": 1.0384970032272938e-06, "logits/chosen": -1.9742231369018555, "logits/rejected": 4.877246379852295, "logps/chosen": -361.27850341796875, "logps/rejected": -1050.592041015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.751445293426514, "rewards/margins": 33.7140007019043, "rewards/rejected": -38.46544647216797, "step": 3920 }, { "epoch": 2.4391912908242612, "grad_norm": 7.0156575020519085e-06, "learning_rate": 1.037344398340249e-06, "logits/chosen": -0.6629587411880493, "logits/rejected": 3.9200730323791504, "logps/chosen": -420.0581970214844, "logps/rejected": -914.085205078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.827775478363037, "rewards/margins": 25.929183959960938, "rewards/rejected": -33.756961822509766, "step": 3921 }, { "epoch": 2.4398133748055986, "grad_norm": 4.913921657134779e-06, "learning_rate": 1.0361917934532045e-06, "logits/chosen": 0.21196871995925903, "logits/rejected": 2.7537970542907715, "logps/chosen": -604.2928466796875, "logps/rejected": -1049.777099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.621042251586914, "rewards/margins": 28.787445068359375, "rewards/rejected": -41.40848922729492, "step": 3922 }, { "epoch": 2.4404354587869364, "grad_norm": 0.0062870606780052185, "learning_rate": 1.0350391885661595e-06, "logits/chosen": -0.6903063058853149, "logits/rejected": 0.6735283136367798, "logps/chosen": -436.35601806640625, "logps/rejected": -874.8502197265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.20759391784668, "rewards/margins": 29.023616790771484, "rewards/rejected": -36.2312126159668, "step": 3923 }, { "epoch": 2.4410575427682737, "grad_norm": 2.6101699859282235e-06, "learning_rate": 1.033886583679115e-06, "logits/chosen": 0.8353837728500366, "logits/rejected": 4.180713176727295, "logps/chosen": -465.27850341796875, "logps/rejected": -924.72021484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.832421779632568, "rewards/margins": 32.02825927734375, "rewards/rejected": -38.860679626464844, "step": 3924 }, { "epoch": 2.441679626749611, "grad_norm": 5.736229013564298e-06, "learning_rate": 1.0327339787920701e-06, "logits/chosen": -1.6903998851776123, "logits/rejected": 3.183885097503662, "logps/chosen": -390.8701477050781, "logps/rejected": -957.973388671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.082308769226074, "rewards/margins": 34.455909729003906, "rewards/rejected": -39.53821563720703, "step": 3925 }, { "epoch": 2.442301710730949, "grad_norm": 0.4600825905799866, "learning_rate": 1.0315813739050254e-06, "logits/chosen": -0.5375813245773315, "logits/rejected": 2.952937126159668, "logps/chosen": -584.4176025390625, "logps/rejected": -1137.98681640625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -6.461213111877441, "rewards/margins": 27.309200286865234, "rewards/rejected": -33.77041244506836, "step": 3926 }, { "epoch": 2.442923794712286, "grad_norm": 10.128042221069336, "learning_rate": 1.0304287690179806e-06, "logits/chosen": 1.3842074871063232, "logits/rejected": 2.9823427200317383, "logps/chosen": -661.162841796875, "logps/rejected": -954.5634155273438, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": -10.083999633789062, "rewards/margins": 23.011592864990234, "rewards/rejected": -33.0955924987793, "step": 3927 }, { "epoch": 2.4435458786936235, "grad_norm": 0.00012620781490113586, "learning_rate": 1.029276164130936e-06, "logits/chosen": 1.541063904762268, "logits/rejected": 4.459494590759277, "logps/chosen": -573.2506713867188, "logps/rejected": -1087.428955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.81618595123291, "rewards/margins": 31.231414794921875, "rewards/rejected": -41.04759979248047, "step": 3928 }, { "epoch": 2.4441679626749613, "grad_norm": 0.038373615592718124, "learning_rate": 1.0281235592438912e-06, "logits/chosen": 2.5318827629089355, "logits/rejected": 3.6796557903289795, "logps/chosen": -503.311767578125, "logps/rejected": -754.3387451171875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.870074272155762, "rewards/margins": 20.023391723632812, "rewards/rejected": -27.89346694946289, "step": 3929 }, { "epoch": 2.4447900466562986, "grad_norm": 0.0009832432260736823, "learning_rate": 1.0269709543568465e-06, "logits/chosen": 1.1359636783599854, "logits/rejected": 3.896103858947754, "logps/chosen": -525.8956909179688, "logps/rejected": -1105.4205322265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.109336853027344, "rewards/margins": 34.28251647949219, "rewards/rejected": -46.391849517822266, "step": 3930 }, { "epoch": 2.445412130637636, "grad_norm": 0.3185408115386963, "learning_rate": 1.025818349469802e-06, "logits/chosen": 2.0907115936279297, "logits/rejected": 2.1502089500427246, "logps/chosen": -528.4436645507812, "logps/rejected": -699.7049560546875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -5.308149337768555, "rewards/margins": 16.63261604309082, "rewards/rejected": -21.940765380859375, "step": 3931 }, { "epoch": 2.4460342146189737, "grad_norm": 1.571313212878067e-08, "learning_rate": 1.0246657445827571e-06, "logits/chosen": -2.6080470085144043, "logits/rejected": 2.0387868881225586, "logps/chosen": -386.8277587890625, "logps/rejected": -1066.030029296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.407026290893555, "rewards/margins": 36.1947021484375, "rewards/rejected": -41.60173034667969, "step": 3932 }, { "epoch": 2.446656298600311, "grad_norm": 4.174937930656597e-05, "learning_rate": 1.0235131396957124e-06, "logits/chosen": 0.15204091370105743, "logits/rejected": 3.042105197906494, "logps/chosen": -577.6109008789062, "logps/rejected": -972.4326782226562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.046577453613281, "rewards/margins": 27.047788619995117, "rewards/rejected": -38.094364166259766, "step": 3933 }, { "epoch": 2.4472783825816484, "grad_norm": 2.9769933007628424e-06, "learning_rate": 1.0223605348086676e-06, "logits/chosen": 0.32063624262809753, "logits/rejected": 3.1496458053588867, "logps/chosen": -464.86419677734375, "logps/rejected": -989.5301513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.432979583740234, "rewards/margins": 32.06596374511719, "rewards/rejected": -39.49894714355469, "step": 3934 }, { "epoch": 2.447900466562986, "grad_norm": 0.002402025042101741, "learning_rate": 1.021207929921623e-06, "logits/chosen": 0.38487508893013, "logits/rejected": 4.500364303588867, "logps/chosen": -558.4069213867188, "logps/rejected": -1054.4920654296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.773858070373535, "rewards/margins": 25.580459594726562, "rewards/rejected": -36.35431671142578, "step": 3935 }, { "epoch": 2.4485225505443236, "grad_norm": 2.8169645247544395e-06, "learning_rate": 1.0200553250345782e-06, "logits/chosen": 0.5297533869743347, "logits/rejected": 2.569145917892456, "logps/chosen": -503.478515625, "logps/rejected": -925.1766967773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.786959648132324, "rewards/margins": 30.930150985717773, "rewards/rejected": -38.71710968017578, "step": 3936 }, { "epoch": 2.449144634525661, "grad_norm": 2.3240552764036693e-05, "learning_rate": 1.0189027201475335e-06, "logits/chosen": 0.3535512089729309, "logits/rejected": 3.5029115676879883, "logps/chosen": -500.951416015625, "logps/rejected": -1047.8524169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.535778999328613, "rewards/margins": 39.683143615722656, "rewards/rejected": -48.21892547607422, "step": 3937 }, { "epoch": 2.4497667185069982, "grad_norm": 0.000920445250812918, "learning_rate": 1.0177501152604887e-06, "logits/chosen": -0.4453803300857544, "logits/rejected": 3.93086576461792, "logps/chosen": -493.6942138671875, "logps/rejected": -1113.318603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.954602241516113, "rewards/margins": 29.42861557006836, "rewards/rejected": -39.383216857910156, "step": 3938 }, { "epoch": 2.450388802488336, "grad_norm": 0.029540177434682846, "learning_rate": 1.0165975103734441e-06, "logits/chosen": -0.2629004716873169, "logits/rejected": 0.7652404308319092, "logps/chosen": -652.7841186523438, "logps/rejected": -818.8164672851562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.874147891998291, "rewards/margins": 22.586145401000977, "rewards/rejected": -28.46029281616211, "step": 3939 }, { "epoch": 2.4510108864696734, "grad_norm": 0.22855834662914276, "learning_rate": 1.0154449054863994e-06, "logits/chosen": 0.16000841557979584, "logits/rejected": 4.105987548828125, "logps/chosen": -521.3453979492188, "logps/rejected": -991.91650390625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.033821105957031, "rewards/margins": 25.448020935058594, "rewards/rejected": -29.481842041015625, "step": 3940 }, { "epoch": 2.4516329704510107, "grad_norm": 0.000634959724266082, "learning_rate": 1.0142923005993546e-06, "logits/chosen": 0.7463221549987793, "logits/rejected": 3.813570499420166, "logps/chosen": -609.4283447265625, "logps/rejected": -1091.73388671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.175848007202148, "rewards/margins": 26.847991943359375, "rewards/rejected": -35.023841857910156, "step": 3941 }, { "epoch": 2.4522550544323485, "grad_norm": 0.12524612247943878, "learning_rate": 1.01313969571231e-06, "logits/chosen": 1.860915184020996, "logits/rejected": 4.84216833114624, "logps/chosen": -530.1736450195312, "logps/rejected": -1021.5438232421875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -9.089037895202637, "rewards/margins": 31.169198989868164, "rewards/rejected": -40.258235931396484, "step": 3942 }, { "epoch": 2.452877138413686, "grad_norm": 0.09399629384279251, "learning_rate": 1.0119870908252652e-06, "logits/chosen": 0.4427666664123535, "logits/rejected": 3.365427017211914, "logps/chosen": -567.5911865234375, "logps/rejected": -981.8191528320312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.774032592773438, "rewards/margins": 24.488821029663086, "rewards/rejected": -33.262855529785156, "step": 3943 }, { "epoch": 2.453499222395023, "grad_norm": 1.8980324268341064, "learning_rate": 1.0108344859382205e-06, "logits/chosen": 2.3022541999816895, "logits/rejected": 4.3741888999938965, "logps/chosen": -640.0008544921875, "logps/rejected": -1101.875, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -9.442511558532715, "rewards/margins": 29.548015594482422, "rewards/rejected": -38.99052810668945, "step": 3944 }, { "epoch": 2.454121306376361, "grad_norm": 0.0036252494901418686, "learning_rate": 1.0096818810511757e-06, "logits/chosen": 0.5579085350036621, "logits/rejected": 0.8060776591300964, "logps/chosen": -473.23016357421875, "logps/rejected": -832.8922729492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.750743865966797, "rewards/margins": 28.729267120361328, "rewards/rejected": -36.480010986328125, "step": 3945 }, { "epoch": 2.4547433903576983, "grad_norm": 18.020030975341797, "learning_rate": 1.0085292761641311e-06, "logits/chosen": 3.9792816638946533, "logits/rejected": 3.7229690551757812, "logps/chosen": -703.1436767578125, "logps/rejected": -970.5345458984375, "loss": 0.0927, "rewards/accuracies": 0.875, "rewards/chosen": -12.347864151000977, "rewards/margins": 25.340940475463867, "rewards/rejected": -37.688804626464844, "step": 3946 }, { "epoch": 2.4553654743390356, "grad_norm": 0.0004174646455794573, "learning_rate": 1.0073766712770863e-06, "logits/chosen": 1.2467455863952637, "logits/rejected": 3.5107333660125732, "logps/chosen": -625.0026245117188, "logps/rejected": -1077.629638671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.804898262023926, "rewards/margins": 27.94451141357422, "rewards/rejected": -40.74940872192383, "step": 3947 }, { "epoch": 2.4559875583203734, "grad_norm": 0.002254350110888481, "learning_rate": 1.0062240663900416e-06, "logits/chosen": -1.3291007280349731, "logits/rejected": 2.1017301082611084, "logps/chosen": -494.86334228515625, "logps/rejected": -916.5098266601562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.163529396057129, "rewards/margins": 22.77774429321289, "rewards/rejected": -29.941272735595703, "step": 3948 }, { "epoch": 2.4566096423017107, "grad_norm": 0.05908467620611191, "learning_rate": 1.0050714615029968e-06, "logits/chosen": 2.6013591289520264, "logits/rejected": 3.2219948768615723, "logps/chosen": -668.7972412109375, "logps/rejected": -958.492431640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.942373275756836, "rewards/margins": 26.03789520263672, "rewards/rejected": -35.98026657104492, "step": 3949 }, { "epoch": 2.457231726283048, "grad_norm": 0.041411809623241425, "learning_rate": 1.0039188566159522e-06, "logits/chosen": 1.6593561172485352, "logits/rejected": 2.5953595638275146, "logps/chosen": -699.587646484375, "logps/rejected": -979.1827392578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.904213905334473, "rewards/margins": 24.109346389770508, "rewards/rejected": -34.01355743408203, "step": 3950 }, { "epoch": 2.457853810264386, "grad_norm": 0.012357287108898163, "learning_rate": 1.0027662517289075e-06, "logits/chosen": 1.712773084640503, "logits/rejected": 3.507235288619995, "logps/chosen": -491.7207336425781, "logps/rejected": -882.9329833984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.298487663269043, "rewards/margins": 24.476160049438477, "rewards/rejected": -33.77465057373047, "step": 3951 }, { "epoch": 2.458475894245723, "grad_norm": 0.01435577031224966, "learning_rate": 1.0016136468418627e-06, "logits/chosen": -1.2487645149230957, "logits/rejected": 4.199995994567871, "logps/chosen": -386.044921875, "logps/rejected": -972.6033935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.299281597137451, "rewards/margins": 23.616321563720703, "rewards/rejected": -26.91560173034668, "step": 3952 }, { "epoch": 2.4590979782270606, "grad_norm": 8.740786142880097e-06, "learning_rate": 1.0004610419548181e-06, "logits/chosen": -2.7983384132385254, "logits/rejected": 2.749145746231079, "logps/chosen": -270.6513671875, "logps/rejected": -954.5782470703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.159430980682373, "rewards/margins": 33.22806930541992, "rewards/rejected": -37.38750457763672, "step": 3953 }, { "epoch": 2.4597200622083983, "grad_norm": 9.067174687515944e-06, "learning_rate": 9.993084370677733e-07, "logits/chosen": -3.095334529876709, "logits/rejected": 4.330985069274902, "logps/chosen": -353.81378173828125, "logps/rejected": -1149.3958740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.141190528869629, "rewards/margins": 35.44187927246094, "rewards/rejected": -40.583072662353516, "step": 3954 }, { "epoch": 2.4603421461897357, "grad_norm": 5.59990294277668e-05, "learning_rate": 9.981558321807286e-07, "logits/chosen": -1.5986160039901733, "logits/rejected": 4.103527545928955, "logps/chosen": -443.53765869140625, "logps/rejected": -1152.404052734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.053462982177734, "rewards/margins": 37.27567672729492, "rewards/rejected": -45.329139709472656, "step": 3955 }, { "epoch": 2.460964230171073, "grad_norm": 1.8919844251286122e-06, "learning_rate": 9.970032272936838e-07, "logits/chosen": -1.1473727226257324, "logits/rejected": 3.8609635829925537, "logps/chosen": -369.2405700683594, "logps/rejected": -948.0873413085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.060483932495117, "rewards/margins": 34.174312591552734, "rewards/rejected": -39.234798431396484, "step": 3956 }, { "epoch": 2.4615863141524104, "grad_norm": 1.6699502793926513e-06, "learning_rate": 9.95850622406639e-07, "logits/chosen": -1.3253543376922607, "logits/rejected": 2.317295551300049, "logps/chosen": -410.41363525390625, "logps/rejected": -983.7108154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.472805500030518, "rewards/margins": 35.197509765625, "rewards/rejected": -42.670310974121094, "step": 3957 }, { "epoch": 2.462208398133748, "grad_norm": 17.654504776000977, "learning_rate": 9.946980175195942e-07, "logits/chosen": 1.3565376996994019, "logits/rejected": 3.2873613834381104, "logps/chosen": -685.8858642578125, "logps/rejected": -1045.9437255859375, "loss": 0.0998, "rewards/accuracies": 0.875, "rewards/chosen": -3.6514360904693604, "rewards/margins": 25.847148895263672, "rewards/rejected": -29.498584747314453, "step": 3958 }, { "epoch": 2.4628304821150855, "grad_norm": 0.0009128287783823907, "learning_rate": 9.935454126325497e-07, "logits/chosen": -0.545260488986969, "logits/rejected": 3.329559803009033, "logps/chosen": -495.82403564453125, "logps/rejected": -995.2269287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.437385559082031, "rewards/margins": 28.629581451416016, "rewards/rejected": -40.06696701049805, "step": 3959 }, { "epoch": 2.463452566096423, "grad_norm": 0.07026296108961105, "learning_rate": 9.92392807745505e-07, "logits/chosen": 0.40682363510131836, "logits/rejected": 4.326059341430664, "logps/chosen": -515.2821044921875, "logps/rejected": -1210.7884521484375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.251470565795898, "rewards/margins": 39.29925537109375, "rewards/rejected": -48.550724029541016, "step": 3960 }, { "epoch": 2.4640746500777606, "grad_norm": 0.6919077634811401, "learning_rate": 9.912402028584601e-07, "logits/chosen": -1.3996939659118652, "logits/rejected": 1.9476600885391235, "logps/chosen": -503.40545654296875, "logps/rejected": -861.3834228515625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -6.144523620605469, "rewards/margins": 19.993408203125, "rewards/rejected": -26.13793182373047, "step": 3961 }, { "epoch": 2.464696734059098, "grad_norm": 0.33661141991615295, "learning_rate": 9.900875979714154e-07, "logits/chosen": -1.489673137664795, "logits/rejected": 3.2342751026153564, "logps/chosen": -340.26019287109375, "logps/rejected": -831.0149536132812, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.035558700561523, "rewards/margins": 22.35528564453125, "rewards/rejected": -30.390844345092773, "step": 3962 }, { "epoch": 2.4653188180404353, "grad_norm": 0.686103880405426, "learning_rate": 9.889349930843708e-07, "logits/chosen": -0.1386818289756775, "logits/rejected": 3.1802377700805664, "logps/chosen": -565.9710693359375, "logps/rejected": -1054.5169677734375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -7.683035850524902, "rewards/margins": 23.252742767333984, "rewards/rejected": -30.93577766418457, "step": 3963 }, { "epoch": 2.465940902021773, "grad_norm": 0.025931192561984062, "learning_rate": 9.87782388197326e-07, "logits/chosen": -0.1779177188873291, "logits/rejected": 1.1597424745559692, "logps/chosen": -480.335693359375, "logps/rejected": -774.178955078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.642938613891602, "rewards/margins": 17.06674575805664, "rewards/rejected": -22.709684371948242, "step": 3964 }, { "epoch": 2.4665629860031104, "grad_norm": 6.1447601318359375, "learning_rate": 9.866297833102812e-07, "logits/chosen": 0.5171284079551697, "logits/rejected": 2.205604314804077, "logps/chosen": -633.0610961914062, "logps/rejected": -972.0784912109375, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -7.602591037750244, "rewards/margins": 23.704069137573242, "rewards/rejected": -31.306659698486328, "step": 3965 }, { "epoch": 2.4671850699844478, "grad_norm": 0.0006154404254630208, "learning_rate": 9.854771784232365e-07, "logits/chosen": 0.5229877829551697, "logits/rejected": 3.7279510498046875, "logps/chosen": -525.7893676757812, "logps/rejected": -907.7853393554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.976510047912598, "rewards/margins": 23.866352081298828, "rewards/rejected": -30.84286117553711, "step": 3966 }, { "epoch": 2.4678071539657855, "grad_norm": 0.0013302437728270888, "learning_rate": 9.84324573536192e-07, "logits/chosen": 0.6051331758499146, "logits/rejected": 2.128033399581909, "logps/chosen": -508.7525329589844, "logps/rejected": -912.324951171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.178055763244629, "rewards/margins": 27.136598587036133, "rewards/rejected": -37.31465148925781, "step": 3967 }, { "epoch": 2.468429237947123, "grad_norm": 0.0012908873613923788, "learning_rate": 9.831719686491471e-07, "logits/chosen": 1.2176079750061035, "logits/rejected": 2.406261444091797, "logps/chosen": -675.4288330078125, "logps/rejected": -951.7867431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.866052627563477, "rewards/margins": 23.51856231689453, "rewards/rejected": -36.384620666503906, "step": 3968 }, { "epoch": 2.46905132192846, "grad_norm": 0.008842960000038147, "learning_rate": 9.820193637621024e-07, "logits/chosen": 0.6855583786964417, "logits/rejected": 3.3531930446624756, "logps/chosen": -558.2369384765625, "logps/rejected": -1022.9857788085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.03244686126709, "rewards/margins": 28.938135147094727, "rewards/rejected": -37.9705810546875, "step": 3969 }, { "epoch": 2.469673405909798, "grad_norm": 0.056224849075078964, "learning_rate": 9.808667588750578e-07, "logits/chosen": 1.5705623626708984, "logits/rejected": 3.3909265995025635, "logps/chosen": -651.3974609375, "logps/rejected": -993.1527099609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.265022277832031, "rewards/margins": 24.45557403564453, "rewards/rejected": -35.72059631347656, "step": 3970 }, { "epoch": 2.4702954898911353, "grad_norm": 12.649288177490234, "learning_rate": 9.79714153988013e-07, "logits/chosen": 0.9670317769050598, "logits/rejected": 2.3478357791900635, "logps/chosen": -656.9765625, "logps/rejected": -1015.3643798828125, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": -5.8673834800720215, "rewards/margins": 23.989347457885742, "rewards/rejected": -29.856731414794922, "step": 3971 }, { "epoch": 2.4709175738724727, "grad_norm": 0.20173630118370056, "learning_rate": 9.785615491009682e-07, "logits/chosen": 0.1819133162498474, "logits/rejected": 3.700805425643921, "logps/chosen": -584.2685546875, "logps/rejected": -1063.6900634765625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.745945930480957, "rewards/margins": 33.38615417480469, "rewards/rejected": -41.132102966308594, "step": 3972 }, { "epoch": 2.4715396578538105, "grad_norm": 0.0004406968946568668, "learning_rate": 9.774089442139235e-07, "logits/chosen": 2.5935163497924805, "logits/rejected": 3.5712637901306152, "logps/chosen": -672.4451904296875, "logps/rejected": -965.9010009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.891907691955566, "rewards/margins": 24.52800178527832, "rewards/rejected": -35.4199104309082, "step": 3973 }, { "epoch": 2.472161741835148, "grad_norm": 0.16155090928077698, "learning_rate": 9.76256339326879e-07, "logits/chosen": -0.11952055990695953, "logits/rejected": 3.0214688777923584, "logps/chosen": -518.565673828125, "logps/rejected": -985.1463623046875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -6.818488121032715, "rewards/margins": 24.315500259399414, "rewards/rejected": -31.133987426757812, "step": 3974 }, { "epoch": 2.472783825816485, "grad_norm": 1.4269421626522671e-05, "learning_rate": 9.751037344398341e-07, "logits/chosen": 0.11723226308822632, "logits/rejected": 2.796970844268799, "logps/chosen": -579.084716796875, "logps/rejected": -1109.278564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.15622329711914, "rewards/margins": 32.47679138183594, "rewards/rejected": -42.63301467895508, "step": 3975 }, { "epoch": 2.4734059097978225, "grad_norm": 0.01373250875622034, "learning_rate": 9.739511295527893e-07, "logits/chosen": 0.29508113861083984, "logits/rejected": 3.23545503616333, "logps/chosen": -455.6143798828125, "logps/rejected": -863.3740844726562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.188558578491211, "rewards/margins": 25.89873504638672, "rewards/rejected": -33.08729553222656, "step": 3976 }, { "epoch": 2.4740279937791603, "grad_norm": 0.6342032551765442, "learning_rate": 9.727985246657446e-07, "logits/chosen": -1.2206673622131348, "logits/rejected": 2.5578463077545166, "logps/chosen": -481.9017028808594, "logps/rejected": -1017.375244140625, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -12.518524169921875, "rewards/margins": 29.58747100830078, "rewards/rejected": -42.105995178222656, "step": 3977 }, { "epoch": 2.4746500777604976, "grad_norm": 6.656295681750635e-06, "learning_rate": 9.716459197787e-07, "logits/chosen": 0.7014501094818115, "logits/rejected": 3.1492178440093994, "logps/chosen": -630.0687255859375, "logps/rejected": -972.5453491210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.163032531738281, "rewards/margins": 29.593048095703125, "rewards/rejected": -38.756080627441406, "step": 3978 }, { "epoch": 2.4752721617418354, "grad_norm": 6.173561996547505e-05, "learning_rate": 9.704933148916552e-07, "logits/chosen": 1.347499966621399, "logits/rejected": 2.789525032043457, "logps/chosen": -606.9569091796875, "logps/rejected": -1078.0966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.588987350463867, "rewards/margins": 33.38792037963867, "rewards/rejected": -41.976905822753906, "step": 3979 }, { "epoch": 2.4758942457231727, "grad_norm": 0.266066312789917, "learning_rate": 9.693407100046105e-07, "logits/chosen": 0.7726764678955078, "logits/rejected": 3.6062095165252686, "logps/chosen": -460.413330078125, "logps/rejected": -1039.5133056640625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -7.640885353088379, "rewards/margins": 36.07872772216797, "rewards/rejected": -43.71961212158203, "step": 3980 }, { "epoch": 2.47651632970451, "grad_norm": 0.06791546195745468, "learning_rate": 9.68188105117566e-07, "logits/chosen": -0.8611456751823425, "logits/rejected": 4.008203983306885, "logps/chosen": -452.2554931640625, "logps/rejected": -1070.7613525390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.840206146240234, "rewards/margins": 29.160036087036133, "rewards/rejected": -35.000244140625, "step": 3981 }, { "epoch": 2.4771384136858474, "grad_norm": 4.322809843415598e-07, "learning_rate": 9.670355002305211e-07, "logits/chosen": 1.3722554445266724, "logits/rejected": 4.346024990081787, "logps/chosen": -635.0540771484375, "logps/rejected": -1267.4791259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.10350513458252, "rewards/margins": 36.77997589111328, "rewards/rejected": -48.88347625732422, "step": 3982 }, { "epoch": 2.477760497667185, "grad_norm": 1.7631964510655962e-06, "learning_rate": 9.658828953434763e-07, "logits/chosen": -1.542866826057434, "logits/rejected": 3.955106735229492, "logps/chosen": -379.46466064453125, "logps/rejected": -956.01416015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.101266384124756, "rewards/margins": 32.51762390136719, "rewards/rejected": -38.618892669677734, "step": 3983 }, { "epoch": 2.4783825816485225, "grad_norm": 0.19948433339595795, "learning_rate": 9.647302904564316e-07, "logits/chosen": -0.9693173170089722, "logits/rejected": 1.6268550157546997, "logps/chosen": -582.1820068359375, "logps/rejected": -1092.36328125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.305274963378906, "rewards/margins": 35.87574005126953, "rewards/rejected": -44.18102264404297, "step": 3984 }, { "epoch": 2.47900466562986, "grad_norm": 0.23587602376937866, "learning_rate": 9.63577685569387e-07, "logits/chosen": 1.4217019081115723, "logits/rejected": 1.7731695175170898, "logps/chosen": -605.095458984375, "logps/rejected": -938.7332763671875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -9.258292198181152, "rewards/margins": 30.493221282958984, "rewards/rejected": -39.75151443481445, "step": 3985 }, { "epoch": 2.4796267496111977, "grad_norm": 0.008622845634818077, "learning_rate": 9.624250806823422e-07, "logits/chosen": 1.8287463188171387, "logits/rejected": 4.628846168518066, "logps/chosen": -664.932861328125, "logps/rejected": -1096.986328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.658853530883789, "rewards/margins": 25.907957077026367, "rewards/rejected": -34.566810607910156, "step": 3986 }, { "epoch": 2.480248833592535, "grad_norm": 0.00017098673561122268, "learning_rate": 9.612724757952975e-07, "logits/chosen": -2.468224048614502, "logits/rejected": 3.166496515274048, "logps/chosen": -466.069091796875, "logps/rejected": -1267.13671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.445119857788086, "rewards/margins": 34.76084899902344, "rewards/rejected": -45.205970764160156, "step": 3987 }, { "epoch": 2.4808709175738723, "grad_norm": 5.38523199793417e-05, "learning_rate": 9.601198709082529e-07, "logits/chosen": 0.47611796855926514, "logits/rejected": 3.0029196739196777, "logps/chosen": -367.03289794921875, "logps/rejected": -808.068359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.614110469818115, "rewards/margins": 26.540752410888672, "rewards/rejected": -33.15486526489258, "step": 3988 }, { "epoch": 2.48149300155521, "grad_norm": 0.11246740072965622, "learning_rate": 9.589672660212081e-07, "logits/chosen": 1.517196774482727, "logits/rejected": 1.8936448097229004, "logps/chosen": -591.6846313476562, "logps/rejected": -892.910888671875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -9.089322090148926, "rewards/margins": 25.842926025390625, "rewards/rejected": -34.932247161865234, "step": 3989 }, { "epoch": 2.4821150855365475, "grad_norm": 0.001232079230248928, "learning_rate": 9.578146611341633e-07, "logits/chosen": -0.5079509019851685, "logits/rejected": 2.6780648231506348, "logps/chosen": -469.6315612792969, "logps/rejected": -972.140380859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.966546535491943, "rewards/margins": 27.8428897857666, "rewards/rejected": -34.8094367980957, "step": 3990 }, { "epoch": 2.482737169517885, "grad_norm": 13.245752334594727, "learning_rate": 9.566620562471186e-07, "logits/chosen": -0.15342366695404053, "logits/rejected": 3.0701396465301514, "logps/chosen": -640.5465087890625, "logps/rejected": -959.1682739257812, "loss": 0.0525, "rewards/accuracies": 1.0, "rewards/chosen": -13.822296142578125, "rewards/margins": 20.660457611083984, "rewards/rejected": -34.48275375366211, "step": 3991 }, { "epoch": 2.4833592534992226, "grad_norm": 4.577951884243703e-08, "learning_rate": 9.555094513600738e-07, "logits/chosen": 0.28653281927108765, "logits/rejected": 3.837348699569702, "logps/chosen": -554.57763671875, "logps/rejected": -1132.2530517578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.2543363571167, "rewards/margins": 41.47886657714844, "rewards/rejected": -50.73320007324219, "step": 3992 }, { "epoch": 2.48398133748056, "grad_norm": 0.0004249585035722703, "learning_rate": 9.54356846473029e-07, "logits/chosen": 0.6426808834075928, "logits/rejected": 3.9812469482421875, "logps/chosen": -543.9054565429688, "logps/rejected": -978.4523315429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.190975189208984, "rewards/margins": 26.68214225769043, "rewards/rejected": -33.87311553955078, "step": 3993 }, { "epoch": 2.4846034214618973, "grad_norm": 0.00025467202067375183, "learning_rate": 9.532042415859843e-07, "logits/chosen": -1.1799622774124146, "logits/rejected": 3.2941060066223145, "logps/chosen": -320.71478271484375, "logps/rejected": -886.0281372070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6708829402923584, "rewards/margins": 34.079280853271484, "rewards/rejected": -37.750160217285156, "step": 3994 }, { "epoch": 2.4852255054432346, "grad_norm": 38.0293083190918, "learning_rate": 9.520516366989397e-07, "logits/chosen": 2.034773349761963, "logits/rejected": 2.832022190093994, "logps/chosen": -488.5412902832031, "logps/rejected": -745.2386474609375, "loss": 0.2294, "rewards/accuracies": 0.875, "rewards/chosen": -3.242122173309326, "rewards/margins": 22.864137649536133, "rewards/rejected": -26.106258392333984, "step": 3995 }, { "epoch": 2.4858475894245724, "grad_norm": 0.8855314254760742, "learning_rate": 9.508990318118949e-07, "logits/chosen": -0.5881878733634949, "logits/rejected": 3.464144229888916, "logps/chosen": -478.4122314453125, "logps/rejected": -1075.0064697265625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -11.319843292236328, "rewards/margins": 38.023109436035156, "rewards/rejected": -49.34294891357422, "step": 3996 }, { "epoch": 2.4864696734059097, "grad_norm": 0.46036460995674133, "learning_rate": 9.497464269248502e-07, "logits/chosen": 0.5766007900238037, "logits/rejected": 3.380526065826416, "logps/chosen": -558.318359375, "logps/rejected": -1159.97900390625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -10.872662544250488, "rewards/margins": 37.3363037109375, "rewards/rejected": -48.20896530151367, "step": 3997 }, { "epoch": 2.4870917573872475, "grad_norm": 9.886953193927184e-06, "learning_rate": 9.485938220378055e-07, "logits/chosen": 1.076728105545044, "logits/rejected": 3.4405856132507324, "logps/chosen": -507.3965759277344, "logps/rejected": -938.2133178710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.397248268127441, "rewards/margins": 25.83782196044922, "rewards/rejected": -36.235069274902344, "step": 3998 }, { "epoch": 2.487713841368585, "grad_norm": 0.018210668116807938, "learning_rate": 9.474412171507608e-07, "logits/chosen": -0.3068583607673645, "logits/rejected": 3.235687494277954, "logps/chosen": -429.76239013671875, "logps/rejected": -877.342529296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.804065704345703, "rewards/margins": 25.279605865478516, "rewards/rejected": -34.08367156982422, "step": 3999 }, { "epoch": 2.488335925349922, "grad_norm": 0.000381840713089332, "learning_rate": 9.46288612263716e-07, "logits/chosen": -1.823272466659546, "logits/rejected": 2.943857192993164, "logps/chosen": -354.3667907714844, "logps/rejected": -927.176513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.204986572265625, "rewards/margins": 28.99197769165039, "rewards/rejected": -34.196964263916016, "step": 4000 }, { "epoch": 2.4889580093312595, "grad_norm": 0.0003201756626367569, "learning_rate": 9.451360073766713e-07, "logits/chosen": 1.1355750560760498, "logits/rejected": 4.724059104919434, "logps/chosen": -541.935302734375, "logps/rejected": -1017.6828002929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.335550785064697, "rewards/margins": 31.088001251220703, "rewards/rejected": -37.423553466796875, "step": 4001 }, { "epoch": 2.4895800933125973, "grad_norm": 0.5558943152427673, "learning_rate": 9.439834024896266e-07, "logits/chosen": 1.82474684715271, "logits/rejected": 4.334096908569336, "logps/chosen": -533.4063720703125, "logps/rejected": -910.9434814453125, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -7.618231296539307, "rewards/margins": 23.29483413696289, "rewards/rejected": -30.91306495666504, "step": 4002 }, { "epoch": 2.4902021772939347, "grad_norm": 0.09100416302680969, "learning_rate": 9.428307976025819e-07, "logits/chosen": 0.3936919569969177, "logits/rejected": 2.573723077774048, "logps/chosen": -588.6266479492188, "logps/rejected": -961.934326171875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -9.027005195617676, "rewards/margins": 23.147462844848633, "rewards/rejected": -32.174468994140625, "step": 4003 }, { "epoch": 2.490824261275272, "grad_norm": 0.05002165213227272, "learning_rate": 9.416781927155372e-07, "logits/chosen": 0.522924542427063, "logits/rejected": 3.548407554626465, "logps/chosen": -479.7599792480469, "logps/rejected": -1006.958984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.0311279296875, "rewards/margins": 27.491552352905273, "rewards/rejected": -35.522682189941406, "step": 4004 }, { "epoch": 2.49144634525661, "grad_norm": 0.1994076520204544, "learning_rate": 9.405255878284925e-07, "logits/chosen": -0.5984686017036438, "logits/rejected": 3.4913418292999268, "logps/chosen": -327.8326110839844, "logps/rejected": -711.6600341796875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -4.165585994720459, "rewards/margins": 22.621118545532227, "rewards/rejected": -26.786705017089844, "step": 4005 }, { "epoch": 2.492068429237947, "grad_norm": 0.0002858602092601359, "learning_rate": 9.393729829414478e-07, "logits/chosen": 3.4327967166900635, "logits/rejected": 5.520690441131592, "logps/chosen": -697.5659790039062, "logps/rejected": -1217.9140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.23094367980957, "rewards/margins": 35.31251525878906, "rewards/rejected": -46.54345703125, "step": 4006 }, { "epoch": 2.4926905132192845, "grad_norm": 1.2931499441037886e-05, "learning_rate": 9.38220378054403e-07, "logits/chosen": -1.832653522491455, "logits/rejected": 1.7804694175720215, "logps/chosen": -484.6544189453125, "logps/rejected": -992.6403198242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.544832229614258, "rewards/margins": 28.360626220703125, "rewards/rejected": -38.905460357666016, "step": 4007 }, { "epoch": 2.4933125972006223, "grad_norm": 0.00022435266873799264, "learning_rate": 9.370677731673583e-07, "logits/chosen": 1.0319801568984985, "logits/rejected": 1.8618669509887695, "logps/chosen": -561.7014770507812, "logps/rejected": -932.447998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.712716102600098, "rewards/margins": 28.847881317138672, "rewards/rejected": -40.56060028076172, "step": 4008 }, { "epoch": 2.4939346811819596, "grad_norm": 8.498398528899997e-05, "learning_rate": 9.359151682803136e-07, "logits/chosen": -1.1835764646530151, "logits/rejected": 3.480799674987793, "logps/chosen": -463.39447021484375, "logps/rejected": -1009.1600341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.3580193519592285, "rewards/margins": 32.19048309326172, "rewards/rejected": -38.54850387573242, "step": 4009 }, { "epoch": 2.494556765163297, "grad_norm": 0.1589866578578949, "learning_rate": 9.347625633932689e-07, "logits/chosen": -1.8735637664794922, "logits/rejected": 3.5191726684570312, "logps/chosen": -357.97406005859375, "logps/rejected": -895.1856689453125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -6.487676620483398, "rewards/margins": 25.94668197631836, "rewards/rejected": -32.434356689453125, "step": 4010 }, { "epoch": 2.4951788491446347, "grad_norm": 0.31055301427841187, "learning_rate": 9.336099585062241e-07, "logits/chosen": 0.4072756767272949, "logits/rejected": 4.024853706359863, "logps/chosen": -391.751953125, "logps/rejected": -752.5792236328125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -8.09948444366455, "rewards/margins": 15.251541137695312, "rewards/rejected": -23.351028442382812, "step": 4011 }, { "epoch": 2.495800933125972, "grad_norm": 0.0002815905027091503, "learning_rate": 9.324573536191794e-07, "logits/chosen": 0.9224781394004822, "logits/rejected": 3.1477391719818115, "logps/chosen": -464.29803466796875, "logps/rejected": -789.9879150390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.706248760223389, "rewards/margins": 18.78945541381836, "rewards/rejected": -25.495704650878906, "step": 4012 }, { "epoch": 2.4964230171073094, "grad_norm": 0.013372275978326797, "learning_rate": 9.313047487321347e-07, "logits/chosen": 1.7922581434249878, "logits/rejected": 3.2224082946777344, "logps/chosen": -683.3353881835938, "logps/rejected": -1059.2672119140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.253475189208984, "rewards/margins": 33.029808044433594, "rewards/rejected": -42.283287048339844, "step": 4013 }, { "epoch": 2.4970451010886467, "grad_norm": 1.9914675704058027e-06, "learning_rate": 9.3015214384509e-07, "logits/chosen": -2.2508482933044434, "logits/rejected": 1.2407808303833008, "logps/chosen": -380.6495666503906, "logps/rejected": -867.527099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.3229730129241943, "rewards/margins": 31.814903259277344, "rewards/rejected": -34.137874603271484, "step": 4014 }, { "epoch": 2.4976671850699845, "grad_norm": 2.6373650143796112e-06, "learning_rate": 9.289995389580453e-07, "logits/chosen": -0.6663360595703125, "logits/rejected": 2.0772130489349365, "logps/chosen": -367.455078125, "logps/rejected": -937.1649780273438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.327749252319336, "rewards/margins": 29.579191207885742, "rewards/rejected": -36.90694046020508, "step": 4015 }, { "epoch": 2.498289269051322, "grad_norm": 0.12220276147127151, "learning_rate": 9.278469340710006e-07, "logits/chosen": -0.1861596703529358, "logits/rejected": 3.5406994819641113, "logps/chosen": -510.4183654785156, "logps/rejected": -1062.176513671875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -9.92317008972168, "rewards/margins": 27.95861053466797, "rewards/rejected": -37.881778717041016, "step": 4016 }, { "epoch": 2.4989113530326597, "grad_norm": 0.0015416694805026054, "learning_rate": 9.266943291839559e-07, "logits/chosen": -1.3664590120315552, "logits/rejected": 3.208428144454956, "logps/chosen": -441.11431884765625, "logps/rejected": -867.0030517578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.116560935974121, "rewards/margins": 21.813411712646484, "rewards/rejected": -29.929973602294922, "step": 4017 }, { "epoch": 2.499533437013997, "grad_norm": 1.4782327525608707e-05, "learning_rate": 9.255417242969111e-07, "logits/chosen": 0.12820547819137573, "logits/rejected": 2.7607710361480713, "logps/chosen": -450.5256652832031, "logps/rejected": -907.6300048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.952629089355469, "rewards/margins": 28.850107192993164, "rewards/rejected": -37.802734375, "step": 4018 }, { "epoch": 2.5001555209953343, "grad_norm": 0.007651094812899828, "learning_rate": 9.243891194098664e-07, "logits/chosen": 2.0740067958831787, "logits/rejected": 3.646566390991211, "logps/chosen": -650.134521484375, "logps/rejected": -1027.8922119140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.123130798339844, "rewards/margins": 30.832536697387695, "rewards/rejected": -40.95566940307617, "step": 4019 }, { "epoch": 2.5007776049766717, "grad_norm": 0.010663536377251148, "learning_rate": 9.232365145228217e-07, "logits/chosen": 2.3909530639648438, "logits/rejected": 2.9909896850585938, "logps/chosen": -634.50146484375, "logps/rejected": -975.9163818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.708215713500977, "rewards/margins": 28.67369842529297, "rewards/rejected": -39.38191604614258, "step": 4020 }, { "epoch": 2.5013996889580095, "grad_norm": 3.569019099813886e-05, "learning_rate": 9.22083909635777e-07, "logits/chosen": 0.3791891932487488, "logits/rejected": 3.304218292236328, "logps/chosen": -655.1154174804688, "logps/rejected": -1068.7027587890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.697490692138672, "rewards/margins": 30.89398956298828, "rewards/rejected": -43.59148025512695, "step": 4021 }, { "epoch": 2.502021772939347, "grad_norm": 41.986019134521484, "learning_rate": 9.209313047487322e-07, "logits/chosen": 1.777288794517517, "logits/rejected": 1.3985198736190796, "logps/chosen": -622.2395629882812, "logps/rejected": -823.553466796875, "loss": 0.5233, "rewards/accuracies": 0.875, "rewards/chosen": -7.957545280456543, "rewards/margins": 16.908954620361328, "rewards/rejected": -24.866500854492188, "step": 4022 }, { "epoch": 2.502643856920684, "grad_norm": 1.0342598777413059e-08, "learning_rate": 9.197786998616876e-07, "logits/chosen": -1.3321645259857178, "logits/rejected": 2.8050286769866943, "logps/chosen": -496.466064453125, "logps/rejected": -1060.14453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.479050636291504, "rewards/margins": 32.64373779296875, "rewards/rejected": -41.12278747558594, "step": 4023 }, { "epoch": 2.503265940902022, "grad_norm": 0.008871527388691902, "learning_rate": 9.186260949746428e-07, "logits/chosen": -2.635526418685913, "logits/rejected": 2.7529656887054443, "logps/chosen": -385.7414245605469, "logps/rejected": -1028.83837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.784934997558594, "rewards/margins": 33.65802764892578, "rewards/rejected": -40.442962646484375, "step": 4024 }, { "epoch": 2.5038880248833593, "grad_norm": 3.446579648880288e-05, "learning_rate": 9.17473490087598e-07, "logits/chosen": 1.5855050086975098, "logits/rejected": 3.839909553527832, "logps/chosen": -599.10498046875, "logps/rejected": -959.9203491210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.241596221923828, "rewards/margins": 25.976350784301758, "rewards/rejected": -36.21794509887695, "step": 4025 }, { "epoch": 2.5045101088646966, "grad_norm": 10.38563060760498, "learning_rate": 9.163208852005532e-07, "logits/chosen": 0.5513242483139038, "logits/rejected": 2.5588459968566895, "logps/chosen": -555.89208984375, "logps/rejected": -875.7945556640625, "loss": 0.0715, "rewards/accuracies": 1.0, "rewards/chosen": -8.112430572509766, "rewards/margins": 20.556514739990234, "rewards/rejected": -28.668941497802734, "step": 4026 }, { "epoch": 2.505132192846034, "grad_norm": 0.0003059516893699765, "learning_rate": 9.151682803135086e-07, "logits/chosen": 3.751962423324585, "logits/rejected": 5.804811954498291, "logps/chosen": -769.1310424804688, "logps/rejected": -1162.177001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.83383846282959, "rewards/margins": 24.885425567626953, "rewards/rejected": -31.719261169433594, "step": 4027 }, { "epoch": 2.5057542768273717, "grad_norm": 7.580235251225531e-05, "learning_rate": 9.140156754264638e-07, "logits/chosen": 0.8473861217498779, "logits/rejected": 3.997392177581787, "logps/chosen": -625.984619140625, "logps/rejected": -1178.9935302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.654664993286133, "rewards/margins": 34.67824172973633, "rewards/rejected": -46.332908630371094, "step": 4028 }, { "epoch": 2.506376360808709, "grad_norm": 0.07776268571615219, "learning_rate": 9.128630705394191e-07, "logits/chosen": -1.9163434505462646, "logits/rejected": 4.200723171234131, "logps/chosen": -433.5632629394531, "logps/rejected": -1088.713134765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.527142524719238, "rewards/margins": 27.72796058654785, "rewards/rejected": -35.255104064941406, "step": 4029 }, { "epoch": 2.506998444790047, "grad_norm": 1.2928827345604077e-05, "learning_rate": 9.117104656523743e-07, "logits/chosen": -1.044769525527954, "logits/rejected": 1.8422355651855469, "logps/chosen": -442.69696044921875, "logps/rejected": -911.9422607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.976151466369629, "rewards/margins": 30.48032569885254, "rewards/rejected": -39.45647430419922, "step": 4030 }, { "epoch": 2.507620528771384, "grad_norm": 9.74557679001009e-07, "learning_rate": 9.105578607653297e-07, "logits/chosen": 2.877901554107666, "logits/rejected": 3.4200408458709717, "logps/chosen": -625.1708984375, "logps/rejected": -876.2762451171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.222400665283203, "rewards/margins": 25.483015060424805, "rewards/rejected": -36.705413818359375, "step": 4031 }, { "epoch": 2.5082426127527215, "grad_norm": 0.00010444582585478202, "learning_rate": 9.09405255878285e-07, "logits/chosen": 0.2044382095336914, "logits/rejected": 2.0310511589050293, "logps/chosen": -485.7327880859375, "logps/rejected": -900.3209838867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.183245658874512, "rewards/margins": 29.076282501220703, "rewards/rejected": -36.25952911376953, "step": 4032 }, { "epoch": 2.508864696734059, "grad_norm": 0.0003882237651851028, "learning_rate": 9.082526509912402e-07, "logits/chosen": -3.3604073524475098, "logits/rejected": 2.7178521156311035, "logps/chosen": -393.2567138671875, "logps/rejected": -1042.647216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.976676940917969, "rewards/margins": 30.853498458862305, "rewards/rejected": -37.830177307128906, "step": 4033 }, { "epoch": 2.5094867807153967, "grad_norm": 1.6229850053787231, "learning_rate": 9.071000461041956e-07, "logits/chosen": 1.8078299760818481, "logits/rejected": 2.902855157852173, "logps/chosen": -531.661376953125, "logps/rejected": -825.8492431640625, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -9.28935718536377, "rewards/margins": 19.35369110107422, "rewards/rejected": -28.643049240112305, "step": 4034 }, { "epoch": 2.510108864696734, "grad_norm": 1.8153090476989746, "learning_rate": 9.059474412171508e-07, "logits/chosen": 1.5191748142242432, "logits/rejected": 3.1580471992492676, "logps/chosen": -539.547607421875, "logps/rejected": -956.634521484375, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -10.679039001464844, "rewards/margins": 28.129350662231445, "rewards/rejected": -38.808387756347656, "step": 4035 }, { "epoch": 2.510730948678072, "grad_norm": 3.546811580657959, "learning_rate": 9.047948363301061e-07, "logits/chosen": -0.5803591012954712, "logits/rejected": 1.0151821374893188, "logps/chosen": -532.4237060546875, "logps/rejected": -790.9904174804688, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": -6.401581764221191, "rewards/margins": 18.95673370361328, "rewards/rejected": -25.35831642150879, "step": 4036 }, { "epoch": 2.511353032659409, "grad_norm": 23.09521484375, "learning_rate": 9.036422314430613e-07, "logits/chosen": 0.25977957248687744, "logits/rejected": 3.5475993156433105, "logps/chosen": -555.8200073242188, "logps/rejected": -1017.760498046875, "loss": 0.0648, "rewards/accuracies": 1.0, "rewards/chosen": -9.457131385803223, "rewards/margins": 29.075380325317383, "rewards/rejected": -38.532508850097656, "step": 4037 }, { "epoch": 2.5119751166407465, "grad_norm": 4.518855348578654e-05, "learning_rate": 9.024896265560167e-07, "logits/chosen": -1.4445290565490723, "logits/rejected": 3.5027639865875244, "logps/chosen": -242.11343383789062, "logps/rejected": -775.8106689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.7559638023376465, "rewards/margins": 27.689367294311523, "rewards/rejected": -31.445327758789062, "step": 4038 }, { "epoch": 2.512597200622084, "grad_norm": 0.0008194184629246593, "learning_rate": 9.013370216689719e-07, "logits/chosen": -1.9092761278152466, "logits/rejected": 3.484623908996582, "logps/chosen": -335.6897277832031, "logps/rejected": -973.5714111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.017749786376953, "rewards/margins": 33.87322998046875, "rewards/rejected": -38.8909797668457, "step": 4039 }, { "epoch": 2.5132192846034216, "grad_norm": 0.06275061517953873, "learning_rate": 9.001844167819272e-07, "logits/chosen": 1.002368450164795, "logits/rejected": 2.0995421409606934, "logps/chosen": -733.36083984375, "logps/rejected": -1132.36328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.596542358398438, "rewards/margins": 32.84700012207031, "rewards/rejected": -44.443546295166016, "step": 4040 }, { "epoch": 2.513841368584759, "grad_norm": 0.14462482929229736, "learning_rate": 8.990318118948826e-07, "logits/chosen": 3.533893346786499, "logits/rejected": 2.424785614013672, "logps/chosen": -831.066650390625, "logps/rejected": -966.3888549804688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -12.034292221069336, "rewards/margins": 20.877782821655273, "rewards/rejected": -32.91207504272461, "step": 4041 }, { "epoch": 2.5144634525660963, "grad_norm": 1.888901923763342e-09, "learning_rate": 8.978792070078378e-07, "logits/chosen": 3.350219249725342, "logits/rejected": 4.0092010498046875, "logps/chosen": -673.218017578125, "logps/rejected": -1049.03564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.207067489624023, "rewards/margins": 31.169370651245117, "rewards/rejected": -40.37643814086914, "step": 4042 }, { "epoch": 2.515085536547434, "grad_norm": 5.1957790958567784e-08, "learning_rate": 8.967266021207931e-07, "logits/chosen": -0.6556673049926758, "logits/rejected": 2.5189614295959473, "logps/chosen": -485.61602783203125, "logps/rejected": -928.2960205078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.341263771057129, "rewards/margins": 31.709503173828125, "rewards/rejected": -40.05076599121094, "step": 4043 }, { "epoch": 2.5157076205287714, "grad_norm": 0.9537517428398132, "learning_rate": 8.955739972337483e-07, "logits/chosen": -2.3854219913482666, "logits/rejected": 3.221060037612915, "logps/chosen": -317.2149658203125, "logps/rejected": -987.2667846679688, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -4.228043556213379, "rewards/margins": 37.49151611328125, "rewards/rejected": -41.71955871582031, "step": 4044 }, { "epoch": 2.5163297045101087, "grad_norm": 9.389303158968687e-06, "learning_rate": 8.944213923467037e-07, "logits/chosen": -0.7763998508453369, "logits/rejected": 1.6843314170837402, "logps/chosen": -446.32879638671875, "logps/rejected": -863.89111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.270142555236816, "rewards/margins": 29.047107696533203, "rewards/rejected": -35.31725311279297, "step": 4045 }, { "epoch": 2.516951788491446, "grad_norm": 0.00035843587829731405, "learning_rate": 8.932687874596589e-07, "logits/chosen": -2.0564632415771484, "logits/rejected": 0.005192816257476807, "logps/chosen": -446.6129150390625, "logps/rejected": -927.9066162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.743563175201416, "rewards/margins": 36.39313507080078, "rewards/rejected": -44.13669967651367, "step": 4046 }, { "epoch": 2.517573872472784, "grad_norm": 0.007599648553878069, "learning_rate": 8.921161825726142e-07, "logits/chosen": 0.054366230964660645, "logits/rejected": 3.8880622386932373, "logps/chosen": -459.69073486328125, "logps/rejected": -890.9481201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.929388046264648, "rewards/margins": 22.893136978149414, "rewards/rejected": -29.822526931762695, "step": 4047 }, { "epoch": 2.518195956454121, "grad_norm": 0.008917691186070442, "learning_rate": 8.909635776855694e-07, "logits/chosen": 1.7491514682769775, "logits/rejected": 3.895791530609131, "logps/chosen": -583.1383056640625, "logps/rejected": -971.4913330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.932445526123047, "rewards/margins": 25.728708267211914, "rewards/rejected": -35.66115188598633, "step": 4048 }, { "epoch": 2.518818040435459, "grad_norm": 1.1484187841415405, "learning_rate": 8.898109727985248e-07, "logits/chosen": -0.4084659516811371, "logits/rejected": 0.8221626281738281, "logps/chosen": -441.1219787597656, "logps/rejected": -798.0272827148438, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": -6.016278266906738, "rewards/margins": 27.955223083496094, "rewards/rejected": -33.971500396728516, "step": 4049 }, { "epoch": 2.5194401244167963, "grad_norm": 2.94388484954834, "learning_rate": 8.8865836791148e-07, "logits/chosen": 0.5502256751060486, "logits/rejected": 3.039674758911133, "logps/chosen": -525.4452514648438, "logps/rejected": -871.894775390625, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -10.72391128540039, "rewards/margins": 26.043746948242188, "rewards/rejected": -36.76765441894531, "step": 4050 }, { "epoch": 2.5200622083981337, "grad_norm": 0.008566655218601227, "learning_rate": 8.875057630244353e-07, "logits/chosen": -1.273643136024475, "logits/rejected": 3.792297601699829, "logps/chosen": -435.6821594238281, "logps/rejected": -1057.8905029296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.56712532043457, "rewards/margins": 27.878154754638672, "rewards/rejected": -37.445281982421875, "step": 4051 }, { "epoch": 2.520684292379471, "grad_norm": 0.903061032295227, "learning_rate": 8.863531581373907e-07, "logits/chosen": -1.5747716426849365, "logits/rejected": 2.0196101665496826, "logps/chosen": -382.30938720703125, "logps/rejected": -926.2044677734375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -6.719232559204102, "rewards/margins": 33.08274459838867, "rewards/rejected": -39.801979064941406, "step": 4052 }, { "epoch": 2.521306376360809, "grad_norm": 0.003625160548835993, "learning_rate": 8.852005532503459e-07, "logits/chosen": 0.947603702545166, "logits/rejected": 3.8829972743988037, "logps/chosen": -536.180419921875, "logps/rejected": -991.75146484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.84277868270874, "rewards/margins": 31.205059051513672, "rewards/rejected": -38.04783630371094, "step": 4053 }, { "epoch": 2.521928460342146, "grad_norm": 0.37235498428344727, "learning_rate": 8.840479483633012e-07, "logits/chosen": -0.7206171751022339, "logits/rejected": 2.378139019012451, "logps/chosen": -513.6537475585938, "logps/rejected": -921.5626831054688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -11.372352600097656, "rewards/margins": 25.354045867919922, "rewards/rejected": -36.72639846801758, "step": 4054 }, { "epoch": 2.522550544323484, "grad_norm": 0.06400436162948608, "learning_rate": 8.828953434762564e-07, "logits/chosen": 2.3853330612182617, "logits/rejected": 3.798861265182495, "logps/chosen": -658.4878540039062, "logps/rejected": -968.2791748046875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.424330711364746, "rewards/margins": 25.68733024597168, "rewards/rejected": -34.111663818359375, "step": 4055 }, { "epoch": 2.5231726283048213, "grad_norm": 1.104139982999186e-06, "learning_rate": 8.817427385892118e-07, "logits/chosen": -1.199948787689209, "logits/rejected": 2.331671953201294, "logps/chosen": -509.634033203125, "logps/rejected": -1042.027099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.206770896911621, "rewards/margins": 30.182756423950195, "rewards/rejected": -37.3895263671875, "step": 4056 }, { "epoch": 2.5237947122861586, "grad_norm": 0.0002867870789486915, "learning_rate": 8.80590133702167e-07, "logits/chosen": -2.356788158416748, "logits/rejected": 3.516348123550415, "logps/chosen": -356.5927734375, "logps/rejected": -1011.0084838867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.522395133972168, "rewards/margins": 32.249908447265625, "rewards/rejected": -39.772300720214844, "step": 4057 }, { "epoch": 2.524416796267496, "grad_norm": 8.064653229666874e-06, "learning_rate": 8.794375288151223e-07, "logits/chosen": -0.5611940622329712, "logits/rejected": 3.5149805545806885, "logps/chosen": -508.0080871582031, "logps/rejected": -1095.08935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.362550735473633, "rewards/margins": 35.61705780029297, "rewards/rejected": -40.97960662841797, "step": 4058 }, { "epoch": 2.5250388802488337, "grad_norm": 0.00042883484275080264, "learning_rate": 8.782849239280774e-07, "logits/chosen": 1.1045475006103516, "logits/rejected": 3.2674155235290527, "logps/chosen": -499.8591613769531, "logps/rejected": -891.91748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.630616664886475, "rewards/margins": 25.193370819091797, "rewards/rejected": -32.82398986816406, "step": 4059 }, { "epoch": 2.525660964230171, "grad_norm": 1.2721602615783922e-06, "learning_rate": 8.771323190410328e-07, "logits/chosen": 3.0189056396484375, "logits/rejected": 4.2439775466918945, "logps/chosen": -627.486083984375, "logps/rejected": -1018.70263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.340555191040039, "rewards/margins": 31.816898345947266, "rewards/rejected": -45.15745544433594, "step": 4060 }, { "epoch": 2.5262830482115084, "grad_norm": 0.6605531573295593, "learning_rate": 8.75979714153988e-07, "logits/chosen": 0.18107308447360992, "logits/rejected": 3.3815932273864746, "logps/chosen": -640.6924438476562, "logps/rejected": -1271.03173828125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -18.90386962890625, "rewards/margins": 34.992061614990234, "rewards/rejected": -53.895931243896484, "step": 4061 }, { "epoch": 2.526905132192846, "grad_norm": 0.000140212316182442, "learning_rate": 8.748271092669433e-07, "logits/chosen": 2.1460206508636475, "logits/rejected": 2.4291000366210938, "logps/chosen": -774.4283447265625, "logps/rejected": -1192.8189697265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.135278701782227, "rewards/margins": 32.91741180419922, "rewards/rejected": -44.05268859863281, "step": 4062 }, { "epoch": 2.5275272161741835, "grad_norm": 37.31892013549805, "learning_rate": 8.736745043798986e-07, "logits/chosen": -0.12430325150489807, "logits/rejected": 2.1037116050720215, "logps/chosen": -624.890380859375, "logps/rejected": -914.820556640625, "loss": 1.4105, "rewards/accuracies": 0.875, "rewards/chosen": -11.072190284729004, "rewards/margins": 18.019466400146484, "rewards/rejected": -29.091657638549805, "step": 4063 }, { "epoch": 2.528149300155521, "grad_norm": 0.008869586512446404, "learning_rate": 8.725218994928539e-07, "logits/chosen": 0.9427728652954102, "logits/rejected": 2.078561782836914, "logps/chosen": -580.1802978515625, "logps/rejected": -899.5389404296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.347139358520508, "rewards/margins": 27.37150001525879, "rewards/rejected": -39.7186393737793, "step": 4064 }, { "epoch": 2.528771384136858, "grad_norm": 0.43725720047950745, "learning_rate": 8.713692946058091e-07, "logits/chosen": -0.928475022315979, "logits/rejected": 2.0003530979156494, "logps/chosen": -554.10205078125, "logps/rejected": -907.9371948242188, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -9.801774978637695, "rewards/margins": 20.24555206298828, "rewards/rejected": -30.04732894897461, "step": 4065 }, { "epoch": 2.529393468118196, "grad_norm": 0.5570970177650452, "learning_rate": 8.702166897187644e-07, "logits/chosen": -0.08360552787780762, "logits/rejected": 2.402754306793213, "logps/chosen": -647.493896484375, "logps/rejected": -1055.98828125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -12.159369468688965, "rewards/margins": 29.314403533935547, "rewards/rejected": -41.47377014160156, "step": 4066 }, { "epoch": 2.5300155520995333, "grad_norm": 0.013835887424647808, "learning_rate": 8.690640848317198e-07, "logits/chosen": 2.6564700603485107, "logits/rejected": 4.413903713226318, "logps/chosen": -677.5421752929688, "logps/rejected": -1058.09619140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.916563987731934, "rewards/margins": 25.28716468811035, "rewards/rejected": -35.20372772216797, "step": 4067 }, { "epoch": 2.530637636080871, "grad_norm": 1.2323258715696284e-06, "learning_rate": 8.67911479944675e-07, "logits/chosen": 2.8368053436279297, "logits/rejected": 3.4437544345855713, "logps/chosen": -592.5868530273438, "logps/rejected": -943.364990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.618453979492188, "rewards/margins": 29.4837646484375, "rewards/rejected": -40.10221862792969, "step": 4068 }, { "epoch": 2.5312597200622085, "grad_norm": 0.00047519218060187995, "learning_rate": 8.667588750576303e-07, "logits/chosen": -2.7477948665618896, "logits/rejected": 3.0729355812072754, "logps/chosen": -308.309326171875, "logps/rejected": -936.3649291992188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.9158785343170166, "rewards/margins": 30.847537994384766, "rewards/rejected": -34.7634162902832, "step": 4069 }, { "epoch": 2.531881804043546, "grad_norm": 0.0025224664714187384, "learning_rate": 8.656062701705856e-07, "logits/chosen": -0.4186660647392273, "logits/rejected": 2.772813558578491, "logps/chosen": -483.03546142578125, "logps/rejected": -1053.2744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.237714767456055, "rewards/margins": 34.6431999206543, "rewards/rejected": -40.880916595458984, "step": 4070 }, { "epoch": 2.532503888024883, "grad_norm": 6.943326980035636e-07, "learning_rate": 8.644536652835409e-07, "logits/chosen": -1.8856847286224365, "logits/rejected": 2.604322671890259, "logps/chosen": -385.1220703125, "logps/rejected": -934.83447265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.365512371063232, "rewards/margins": 30.374441146850586, "rewards/rejected": -34.73995590209961, "step": 4071 }, { "epoch": 2.533125972006221, "grad_norm": 0.07411924749612808, "learning_rate": 8.633010603964961e-07, "logits/chosen": 0.5203073024749756, "logits/rejected": 2.409273862838745, "logps/chosen": -471.6881103515625, "logps/rejected": -914.433349609375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.214423656463623, "rewards/margins": 29.525753021240234, "rewards/rejected": -36.74017333984375, "step": 4072 }, { "epoch": 2.5337480559875583, "grad_norm": 8.959432307165116e-06, "learning_rate": 8.621484555094514e-07, "logits/chosen": -0.09636279940605164, "logits/rejected": 2.9820473194122314, "logps/chosen": -553.0858764648438, "logps/rejected": -1140.2852783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.259970664978027, "rewards/margins": 36.687904357910156, "rewards/rejected": -42.947872161865234, "step": 4073 }, { "epoch": 2.534370139968896, "grad_norm": 36.08942413330078, "learning_rate": 8.609958506224067e-07, "logits/chosen": -0.2618432343006134, "logits/rejected": 2.2460336685180664, "logps/chosen": -635.515380859375, "logps/rejected": -1065.712646484375, "loss": 0.4531, "rewards/accuracies": 0.875, "rewards/chosen": -13.69479751586914, "rewards/margins": 28.274282455444336, "rewards/rejected": -41.969078063964844, "step": 4074 }, { "epoch": 2.5349922239502334, "grad_norm": 0.05928949639201164, "learning_rate": 8.59843245735362e-07, "logits/chosen": 0.16672658920288086, "logits/rejected": 2.908172369003296, "logps/chosen": -507.34716796875, "logps/rejected": -994.593505859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.214090347290039, "rewards/margins": 30.99824333190918, "rewards/rejected": -41.21233367919922, "step": 4075 }, { "epoch": 2.5356143079315707, "grad_norm": 0.017843371257185936, "learning_rate": 8.586906408483172e-07, "logits/chosen": -1.8564605712890625, "logits/rejected": 0.6809482574462891, "logps/chosen": -592.2178344726562, "logps/rejected": -1067.1971435546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.823671340942383, "rewards/margins": 25.124004364013672, "rewards/rejected": -37.94767761230469, "step": 4076 }, { "epoch": 2.536236391912908, "grad_norm": 1.2402077231854491e-07, "learning_rate": 8.575380359612726e-07, "logits/chosen": -0.0989261269569397, "logits/rejected": 4.3484086990356445, "logps/chosen": -504.6379699707031, "logps/rejected": -1159.091552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.602766036987305, "rewards/margins": 37.80310821533203, "rewards/rejected": -46.40587615966797, "step": 4077 }, { "epoch": 2.536858475894246, "grad_norm": 39.21305847167969, "learning_rate": 8.563854310742279e-07, "logits/chosen": 1.0437443256378174, "logits/rejected": 2.75134539604187, "logps/chosen": -677.68212890625, "logps/rejected": -988.155517578125, "loss": 0.4726, "rewards/accuracies": 0.75, "rewards/chosen": -8.888240814208984, "rewards/margins": 20.878808975219727, "rewards/rejected": -29.76704978942871, "step": 4078 }, { "epoch": 2.537480559875583, "grad_norm": 0.02777073159813881, "learning_rate": 8.552328261871831e-07, "logits/chosen": 1.8793046474456787, "logits/rejected": 3.102606773376465, "logps/chosen": -432.9072265625, "logps/rejected": -698.9886474609375, "loss": 0.0867, "rewards/accuracies": 0.875, "rewards/chosen": -8.23219108581543, "rewards/margins": 21.439613342285156, "rewards/rejected": -29.67180633544922, "step": 4079 }, { "epoch": 2.5381026438569205, "grad_norm": 0.00032307422952726483, "learning_rate": 8.540802213001384e-07, "logits/chosen": -0.17820918560028076, "logits/rejected": 2.1350302696228027, "logps/chosen": -531.5789794921875, "logps/rejected": -941.7060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.655010223388672, "rewards/margins": 26.932619094848633, "rewards/rejected": -34.58762741088867, "step": 4080 }, { "epoch": 2.5387247278382583, "grad_norm": 0.33138206601142883, "learning_rate": 8.529276164130937e-07, "logits/chosen": -0.9489991664886475, "logits/rejected": 3.2138314247131348, "logps/chosen": -483.31146240234375, "logps/rejected": -1007.58056640625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -7.168546676635742, "rewards/margins": 27.229930877685547, "rewards/rejected": -34.39847946166992, "step": 4081 }, { "epoch": 2.5393468118195957, "grad_norm": 0.010503578931093216, "learning_rate": 8.51775011526049e-07, "logits/chosen": 1.4573490619659424, "logits/rejected": 2.668405771255493, "logps/chosen": -647.7239990234375, "logps/rejected": -1154.552001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.74909782409668, "rewards/margins": 39.35409927368164, "rewards/rejected": -50.10319137573242, "step": 4082 }, { "epoch": 2.539968895800933, "grad_norm": 0.0037981884088367224, "learning_rate": 8.506224066390042e-07, "logits/chosen": 0.5995239019393921, "logits/rejected": 2.5143589973449707, "logps/chosen": -560.736083984375, "logps/rejected": -825.2479858398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.262418746948242, "rewards/margins": 21.219337463378906, "rewards/rejected": -29.481754302978516, "step": 4083 }, { "epoch": 2.5405909797822703, "grad_norm": 0.054800570011138916, "learning_rate": 8.494698017519595e-07, "logits/chosen": 3.720679521560669, "logits/rejected": 3.344923973083496, "logps/chosen": -735.9842529296875, "logps/rejected": -956.2283325195312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.82901668548584, "rewards/margins": 21.535974502563477, "rewards/rejected": -28.364992141723633, "step": 4084 }, { "epoch": 2.541213063763608, "grad_norm": 5.5997816161834635e-06, "learning_rate": 8.483171968649148e-07, "logits/chosen": -2.129356622695923, "logits/rejected": 2.2183260917663574, "logps/chosen": -430.0462646484375, "logps/rejected": -1063.35302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.9142327308654785, "rewards/margins": 35.044158935546875, "rewards/rejected": -40.95839309692383, "step": 4085 }, { "epoch": 2.5418351477449455, "grad_norm": 1.1461665630340576, "learning_rate": 8.471645919778701e-07, "logits/chosen": -1.286311149597168, "logits/rejected": 0.3549751043319702, "logps/chosen": -487.452392578125, "logps/rejected": -805.5723876953125, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -7.289743900299072, "rewards/margins": 23.007722854614258, "rewards/rejected": -30.297466278076172, "step": 4086 }, { "epoch": 2.5424572317262832, "grad_norm": 0.21102778613567352, "learning_rate": 8.460119870908253e-07, "logits/chosen": -2.918485641479492, "logits/rejected": 0.9315952062606812, "logps/chosen": -469.7716064453125, "logps/rejected": -934.5232543945312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -8.402835845947266, "rewards/margins": 27.498199462890625, "rewards/rejected": -35.90103530883789, "step": 4087 }, { "epoch": 2.5430793157076206, "grad_norm": 9.886176109313965, "learning_rate": 8.448593822037807e-07, "logits/chosen": 1.0713324546813965, "logits/rejected": 2.689504861831665, "logps/chosen": -556.312744140625, "logps/rejected": -890.0252685546875, "loss": 0.1282, "rewards/accuracies": 0.875, "rewards/chosen": -7.405909538269043, "rewards/margins": 25.110034942626953, "rewards/rejected": -32.51594543457031, "step": 4088 }, { "epoch": 2.543701399688958, "grad_norm": 1.8649814592208713e-05, "learning_rate": 8.43706777316736e-07, "logits/chosen": 0.28653520345687866, "logits/rejected": 3.634840488433838, "logps/chosen": -469.40191650390625, "logps/rejected": -930.4893798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.568150043487549, "rewards/margins": 29.26486587524414, "rewards/rejected": -35.83301544189453, "step": 4089 }, { "epoch": 2.5443234836702953, "grad_norm": 0.00011260463361395523, "learning_rate": 8.425541724296912e-07, "logits/chosen": 2.7270326614379883, "logits/rejected": 1.5111112594604492, "logps/chosen": -816.8228149414062, "logps/rejected": -1039.581298828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.286932945251465, "rewards/margins": 25.909936904907227, "rewards/rejected": -36.196868896484375, "step": 4090 }, { "epoch": 2.544945567651633, "grad_norm": 0.0286729633808136, "learning_rate": 8.414015675426465e-07, "logits/chosen": 0.7567600011825562, "logits/rejected": 2.3368992805480957, "logps/chosen": -482.7024841308594, "logps/rejected": -1020.5185546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.911073684692383, "rewards/margins": 35.30067443847656, "rewards/rejected": -40.21174621582031, "step": 4091 }, { "epoch": 2.5455676516329704, "grad_norm": 0.0016783374594524503, "learning_rate": 8.402489626556018e-07, "logits/chosen": 0.7466878294944763, "logits/rejected": 3.6340603828430176, "logps/chosen": -568.1629028320312, "logps/rejected": -1043.47900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.166143417358398, "rewards/margins": 33.148521423339844, "rewards/rejected": -41.314666748046875, "step": 4092 }, { "epoch": 2.546189735614308, "grad_norm": 0.5343481302261353, "learning_rate": 8.390963577685569e-07, "logits/chosen": 2.2212343215942383, "logits/rejected": 4.400791645050049, "logps/chosen": -635.5879516601562, "logps/rejected": -1042.9224853515625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -9.624446868896484, "rewards/margins": 27.347734451293945, "rewards/rejected": -36.9721794128418, "step": 4093 }, { "epoch": 2.5468118195956455, "grad_norm": 0.19887447357177734, "learning_rate": 8.379437528815122e-07, "logits/chosen": 0.01381150633096695, "logits/rejected": 1.177337408065796, "logps/chosen": -422.07208251953125, "logps/rejected": -756.161865234375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -4.822951793670654, "rewards/margins": 22.88251495361328, "rewards/rejected": -27.705467224121094, "step": 4094 }, { "epoch": 2.547433903576983, "grad_norm": 0.007180861197412014, "learning_rate": 8.367911479944676e-07, "logits/chosen": 1.0952752828598022, "logits/rejected": 2.7882447242736816, "logps/chosen": -600.28759765625, "logps/rejected": -1076.0926513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.935996055603027, "rewards/margins": 35.04020690917969, "rewards/rejected": -44.976200103759766, "step": 4095 }, { "epoch": 2.54805598755832, "grad_norm": 0.00010127165296580642, "learning_rate": 8.356385431074228e-07, "logits/chosen": 0.8541593551635742, "logits/rejected": 3.7372541427612305, "logps/chosen": -481.94366455078125, "logps/rejected": -1099.032958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.173066139221191, "rewards/margins": 35.346473693847656, "rewards/rejected": -41.51953887939453, "step": 4096 }, { "epoch": 2.548678071539658, "grad_norm": 0.0006400636048056185, "learning_rate": 8.344859382203781e-07, "logits/chosen": -0.9102742671966553, "logits/rejected": 2.529773712158203, "logps/chosen": -532.9279174804688, "logps/rejected": -923.9201049804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.2139296531677246, "rewards/margins": 27.08374786376953, "rewards/rejected": -30.297679901123047, "step": 4097 }, { "epoch": 2.5493001555209953, "grad_norm": 0.0010316974949091673, "learning_rate": 8.333333333333333e-07, "logits/chosen": -0.8006272315979004, "logits/rejected": 2.707360029220581, "logps/chosen": -453.3794860839844, "logps/rejected": -1052.589111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.090571403503418, "rewards/margins": 31.266225814819336, "rewards/rejected": -40.35679626464844, "step": 4098 }, { "epoch": 2.5499222395023327, "grad_norm": 0.002551464596763253, "learning_rate": 8.321807284462887e-07, "logits/chosen": -0.6266838312149048, "logits/rejected": 3.1678671836853027, "logps/chosen": -529.3460693359375, "logps/rejected": -1069.198974609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.927661895751953, "rewards/margins": 32.658512115478516, "rewards/rejected": -44.58617401123047, "step": 4099 }, { "epoch": 2.5505443234836704, "grad_norm": 3.0759070796193555e-05, "learning_rate": 8.310281235592439e-07, "logits/chosen": 1.4877252578735352, "logits/rejected": 3.491295337677002, "logps/chosen": -675.742431640625, "logps/rejected": -984.6753540039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.454578399658203, "rewards/margins": 24.514137268066406, "rewards/rejected": -32.96871566772461, "step": 4100 }, { "epoch": 2.551166407465008, "grad_norm": 0.09172773361206055, "learning_rate": 8.298755186721992e-07, "logits/chosen": 3.627704620361328, "logits/rejected": 4.454146385192871, "logps/chosen": -761.8814697265625, "logps/rejected": -997.3330688476562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -10.802077293395996, "rewards/margins": 21.68487548828125, "rewards/rejected": -32.48695373535156, "step": 4101 }, { "epoch": 2.551788491446345, "grad_norm": 0.2365218549966812, "learning_rate": 8.287229137851544e-07, "logits/chosen": 2.003429412841797, "logits/rejected": 3.921147346496582, "logps/chosen": -653.858642578125, "logps/rejected": -891.6544189453125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.306427955627441, "rewards/margins": 15.111352920532227, "rewards/rejected": -23.41777992248535, "step": 4102 }, { "epoch": 2.5524105754276825, "grad_norm": 3.3804278700699797e-06, "learning_rate": 8.275703088981098e-07, "logits/chosen": 2.205820083618164, "logits/rejected": 3.166776657104492, "logps/chosen": -649.1009521484375, "logps/rejected": -1086.208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.009281158447266, "rewards/margins": 33.14641189575195, "rewards/rejected": -43.155696868896484, "step": 4103 }, { "epoch": 2.5530326594090202, "grad_norm": 4.083518981933594, "learning_rate": 8.264177040110651e-07, "logits/chosen": 0.7583400011062622, "logits/rejected": 2.963535785675049, "logps/chosen": -558.18701171875, "logps/rejected": -928.1401977539062, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": -10.89256763458252, "rewards/margins": 23.71649932861328, "rewards/rejected": -34.609066009521484, "step": 4104 }, { "epoch": 2.5536547433903576, "grad_norm": 2.8601727990462678e-06, "learning_rate": 8.252650991240203e-07, "logits/chosen": -2.2715840339660645, "logits/rejected": 4.363325119018555, "logps/chosen": -292.86273193359375, "logps/rejected": -984.2553100585938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.256927490234375, "rewards/margins": 29.46300506591797, "rewards/rejected": -35.71993637084961, "step": 4105 }, { "epoch": 2.5542768273716954, "grad_norm": 38.109676361083984, "learning_rate": 8.241124942369757e-07, "logits/chosen": -0.6005755662918091, "logits/rejected": 3.6361074447631836, "logps/chosen": -520.59912109375, "logps/rejected": -1044.981201171875, "loss": 0.2634, "rewards/accuracies": 0.875, "rewards/chosen": -12.620247840881348, "rewards/margins": 26.426218032836914, "rewards/rejected": -39.04646682739258, "step": 4106 }, { "epoch": 2.5548989113530327, "grad_norm": 0.018291635438799858, "learning_rate": 8.229598893499309e-07, "logits/chosen": -0.0760377049446106, "logits/rejected": 3.9962425231933594, "logps/chosen": -563.3379516601562, "logps/rejected": -1018.1607666015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.269431114196777, "rewards/margins": 21.766199111938477, "rewards/rejected": -31.03563117980957, "step": 4107 }, { "epoch": 2.55552099533437, "grad_norm": 0.09162867069244385, "learning_rate": 8.218072844628862e-07, "logits/chosen": 2.3309199810028076, "logits/rejected": 5.0157623291015625, "logps/chosen": -610.57568359375, "logps/rejected": -1068.809814453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.771368980407715, "rewards/margins": 27.12932586669922, "rewards/rejected": -36.900691986083984, "step": 4108 }, { "epoch": 2.5561430793157074, "grad_norm": 0.004627412185072899, "learning_rate": 8.206546795758414e-07, "logits/chosen": 1.278794765472412, "logits/rejected": 3.1798346042633057, "logps/chosen": -514.0498046875, "logps/rejected": -854.1100463867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.337404251098633, "rewards/margins": 23.32642364501953, "rewards/rejected": -32.66382598876953, "step": 4109 }, { "epoch": 2.556765163297045, "grad_norm": 5.289896011352539, "learning_rate": 8.195020746887968e-07, "logits/chosen": -1.7098119258880615, "logits/rejected": 1.4635165929794312, "logps/chosen": -458.4467468261719, "logps/rejected": -922.351806640625, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": -5.773290157318115, "rewards/margins": 28.729084014892578, "rewards/rejected": -34.502376556396484, "step": 4110 }, { "epoch": 2.5573872472783825, "grad_norm": 17.51113510131836, "learning_rate": 8.18349469801752e-07, "logits/chosen": 0.880405843257904, "logits/rejected": 2.761213541030884, "logps/chosen": -465.3739318847656, "logps/rejected": -771.1871337890625, "loss": 0.1645, "rewards/accuracies": 0.875, "rewards/chosen": -6.4234137535095215, "rewards/margins": 18.73223876953125, "rewards/rejected": -25.155654907226562, "step": 4111 }, { "epoch": 2.5580093312597203, "grad_norm": 0.12192290276288986, "learning_rate": 8.171968649147073e-07, "logits/chosen": -0.6888011693954468, "logits/rejected": 3.9244179725646973, "logps/chosen": -356.2187194824219, "logps/rejected": -837.59716796875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.990688323974609, "rewards/margins": 20.161766052246094, "rewards/rejected": -26.152454376220703, "step": 4112 }, { "epoch": 2.5586314152410576, "grad_norm": 0.01936480775475502, "learning_rate": 8.160442600276625e-07, "logits/chosen": -1.2800441980361938, "logits/rejected": 1.316270112991333, "logps/chosen": -446.74847412109375, "logps/rejected": -940.384033203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.286430835723877, "rewards/margins": 28.183921813964844, "rewards/rejected": -33.47035598754883, "step": 4113 }, { "epoch": 2.559253499222395, "grad_norm": 0.00013975970796309412, "learning_rate": 8.148916551406179e-07, "logits/chosen": -1.8969745635986328, "logits/rejected": 3.673656702041626, "logps/chosen": -452.6147766113281, "logps/rejected": -1037.419921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.33104419708252, "rewards/margins": 31.03522491455078, "rewards/rejected": -39.36627197265625, "step": 4114 }, { "epoch": 2.5598755832037323, "grad_norm": 3.6346375509310747e-06, "learning_rate": 8.137390502535732e-07, "logits/chosen": -0.2831879258155823, "logits/rejected": 4.18792200088501, "logps/chosen": -457.58074951171875, "logps/rejected": -1098.911376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.230313301086426, "rewards/margins": 34.850311279296875, "rewards/rejected": -44.08062744140625, "step": 4115 }, { "epoch": 2.56049766718507, "grad_norm": 0.004861139692366123, "learning_rate": 8.125864453665284e-07, "logits/chosen": 2.9572341442108154, "logits/rejected": 3.5238685607910156, "logps/chosen": -657.5555419921875, "logps/rejected": -984.3204345703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.549543380737305, "rewards/margins": 28.880319595336914, "rewards/rejected": -38.42986297607422, "step": 4116 }, { "epoch": 2.5611197511664074, "grad_norm": 5.274324621495907e-07, "learning_rate": 8.114338404794838e-07, "logits/chosen": -1.9102250337600708, "logits/rejected": 2.5244147777557373, "logps/chosen": -435.52679443359375, "logps/rejected": -883.0416259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.913174629211426, "rewards/margins": 28.199817657470703, "rewards/rejected": -35.11299133300781, "step": 4117 }, { "epoch": 2.561741835147745, "grad_norm": 0.00010716221731854603, "learning_rate": 8.10281235592439e-07, "logits/chosen": -0.9655880331993103, "logits/rejected": 2.9578256607055664, "logps/chosen": -345.596435546875, "logps/rejected": -819.7735595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.622903347015381, "rewards/margins": 25.935901641845703, "rewards/rejected": -30.55880355834961, "step": 4118 }, { "epoch": 2.5623639191290826, "grad_norm": 0.08033602684736252, "learning_rate": 8.091286307053943e-07, "logits/chosen": -2.8053464889526367, "logits/rejected": 2.981839179992676, "logps/chosen": -304.0949401855469, "logps/rejected": -941.830322265625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.969958305358887, "rewards/margins": 36.09151077270508, "rewards/rejected": -43.06147003173828, "step": 4119 }, { "epoch": 2.56298600311042, "grad_norm": 7.09777232259512e-05, "learning_rate": 8.079760258183495e-07, "logits/chosen": -1.7569762468338013, "logits/rejected": 1.9847595691680908, "logps/chosen": -415.6924133300781, "logps/rejected": -955.1690673828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.376535892486572, "rewards/margins": 31.938705444335938, "rewards/rejected": -37.31523895263672, "step": 4120 }, { "epoch": 2.5636080870917572, "grad_norm": 0.000233715123613365, "learning_rate": 8.068234209313049e-07, "logits/chosen": -2.1115782260894775, "logits/rejected": 1.553727388381958, "logps/chosen": -362.45758056640625, "logps/rejected": -961.16357421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.228207588195801, "rewards/margins": 34.83417510986328, "rewards/rejected": -39.0623779296875, "step": 4121 }, { "epoch": 2.564230171073095, "grad_norm": 1.8830749988555908, "learning_rate": 8.056708160442601e-07, "logits/chosen": 3.103546142578125, "logits/rejected": 3.7607216835021973, "logps/chosen": -695.34033203125, "logps/rejected": -887.8741455078125, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -11.275184631347656, "rewards/margins": 15.883432388305664, "rewards/rejected": -27.15861701965332, "step": 4122 }, { "epoch": 2.5648522550544324, "grad_norm": 0.0042056795209646225, "learning_rate": 8.045182111572154e-07, "logits/chosen": 0.4300188720226288, "logits/rejected": 3.1653892993927, "logps/chosen": -509.19171142578125, "logps/rejected": -915.0361938476562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.071253776550293, "rewards/margins": 25.42142677307129, "rewards/rejected": -32.492679595947266, "step": 4123 }, { "epoch": 2.5654743390357697, "grad_norm": 1.0041102170944214, "learning_rate": 8.033656062701708e-07, "logits/chosen": -0.7266380190849304, "logits/rejected": 1.9405543804168701, "logps/chosen": -543.4242553710938, "logps/rejected": -904.788330078125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -10.144115447998047, "rewards/margins": 25.701583862304688, "rewards/rejected": -35.84569549560547, "step": 4124 }, { "epoch": 2.5660964230171075, "grad_norm": 0.12149225175380707, "learning_rate": 8.02213001383126e-07, "logits/chosen": 1.6091845035552979, "logits/rejected": 3.8638434410095215, "logps/chosen": -571.7221069335938, "logps/rejected": -1004.748291015625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.035507202148438, "rewards/margins": 25.663440704345703, "rewards/rejected": -34.69894790649414, "step": 4125 }, { "epoch": 2.566718506998445, "grad_norm": 2.1839077472686768, "learning_rate": 8.010603964960813e-07, "logits/chosen": 0.6656918525695801, "logits/rejected": 4.665249824523926, "logps/chosen": -567.3508911132812, "logps/rejected": -1130.19580078125, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -8.491843223571777, "rewards/margins": 30.916309356689453, "rewards/rejected": -39.40815734863281, "step": 4126 }, { "epoch": 2.567340590979782, "grad_norm": 3.404394374229014e-05, "learning_rate": 7.999077916090364e-07, "logits/chosen": -1.7685790061950684, "logits/rejected": 2.534496307373047, "logps/chosen": -428.60205078125, "logps/rejected": -940.8599853515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.921274185180664, "rewards/margins": 29.669403076171875, "rewards/rejected": -35.590675354003906, "step": 4127 }, { "epoch": 2.5679626749611195, "grad_norm": 1.0856837034225464, "learning_rate": 7.987551867219917e-07, "logits/chosen": -0.630801796913147, "logits/rejected": 0.9611508846282959, "logps/chosen": -548.1029663085938, "logps/rejected": -929.326416015625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -6.630784511566162, "rewards/margins": 31.183979034423828, "rewards/rejected": -37.814762115478516, "step": 4128 }, { "epoch": 2.5685847589424573, "grad_norm": 0.026657918468117714, "learning_rate": 7.97602581834947e-07, "logits/chosen": 0.9505861401557922, "logits/rejected": 2.9759345054626465, "logps/chosen": -468.04266357421875, "logps/rejected": -777.418701171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.444450855255127, "rewards/margins": 22.632177352905273, "rewards/rejected": -27.076627731323242, "step": 4129 }, { "epoch": 2.5692068429237946, "grad_norm": 0.0002953787916339934, "learning_rate": 7.964499769479023e-07, "logits/chosen": -2.286724090576172, "logits/rejected": 2.6744308471679688, "logps/chosen": -414.42919921875, "logps/rejected": -985.4376220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.565918922424316, "rewards/margins": 33.906394958496094, "rewards/rejected": -40.472312927246094, "step": 4130 }, { "epoch": 2.5698289269051324, "grad_norm": 4.818135721507133e-07, "learning_rate": 7.952973720608575e-07, "logits/chosen": 0.27260005474090576, "logits/rejected": 4.570462226867676, "logps/chosen": -350.6351013183594, "logps/rejected": -968.77294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.497164726257324, "rewards/margins": 35.60032653808594, "rewards/rejected": -41.09749221801758, "step": 4131 }, { "epoch": 2.5704510108864698, "grad_norm": 5.349870207282947e-06, "learning_rate": 7.941447671738129e-07, "logits/chosen": -1.763448715209961, "logits/rejected": 3.7824950218200684, "logps/chosen": -398.9009704589844, "logps/rejected": -1069.40380859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.347376823425293, "rewards/margins": 31.30596351623535, "rewards/rejected": -40.653343200683594, "step": 4132 }, { "epoch": 2.571073094867807, "grad_norm": 0.0005738705513067544, "learning_rate": 7.929921622867681e-07, "logits/chosen": 2.2045533657073975, "logits/rejected": 4.384173393249512, "logps/chosen": -643.0028076171875, "logps/rejected": -1008.7015380859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.411272048950195, "rewards/margins": 25.97161865234375, "rewards/rejected": -38.38289260864258, "step": 4133 }, { "epoch": 2.5716951788491444, "grad_norm": 0.05061626806855202, "learning_rate": 7.918395573997234e-07, "logits/chosen": 0.16258734464645386, "logits/rejected": 1.2322566509246826, "logps/chosen": -564.4116821289062, "logps/rejected": -876.4520263671875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.655303955078125, "rewards/margins": 24.78386878967285, "rewards/rejected": -35.43917465209961, "step": 4134 }, { "epoch": 2.5723172628304822, "grad_norm": 0.00019210392201784998, "learning_rate": 7.906869525126787e-07, "logits/chosen": 0.23870764672756195, "logits/rejected": 3.414243698120117, "logps/chosen": -517.4119262695312, "logps/rejected": -1035.8192138671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.417430877685547, "rewards/margins": 25.290889739990234, "rewards/rejected": -32.70832061767578, "step": 4135 }, { "epoch": 2.5729393468118196, "grad_norm": 5.645513738272712e-06, "learning_rate": 7.89534347625634e-07, "logits/chosen": -0.7859196066856384, "logits/rejected": 4.74886417388916, "logps/chosen": -418.10443115234375, "logps/rejected": -1050.834716796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.610685348510742, "rewards/margins": 30.351909637451172, "rewards/rejected": -38.96259307861328, "step": 4136 }, { "epoch": 2.573561430793157, "grad_norm": 2.9019644260406494, "learning_rate": 7.883817427385892e-07, "logits/chosen": 2.8801445960998535, "logits/rejected": 3.751919746398926, "logps/chosen": -731.3704833984375, "logps/rejected": -1120.801025390625, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -10.350638389587402, "rewards/margins": 25.67172622680664, "rewards/rejected": -36.02236557006836, "step": 4137 }, { "epoch": 2.5741835147744947, "grad_norm": 2.1129725524815512e-08, "learning_rate": 7.872291378515445e-07, "logits/chosen": -1.025829553604126, "logits/rejected": 2.914036989212036, "logps/chosen": -440.2637939453125, "logps/rejected": -1089.4420166015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.31710147857666, "rewards/margins": 42.682861328125, "rewards/rejected": -50.999969482421875, "step": 4138 }, { "epoch": 2.574805598755832, "grad_norm": 0.3254028856754303, "learning_rate": 7.860765329644998e-07, "logits/chosen": -0.7475310564041138, "logits/rejected": 3.239452362060547, "logps/chosen": -574.8551635742188, "logps/rejected": -1061.8125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -10.53398323059082, "rewards/margins": 30.228036880493164, "rewards/rejected": -40.762020111083984, "step": 4139 }, { "epoch": 2.5754276827371694, "grad_norm": 2.3314505597227253e-06, "learning_rate": 7.849239280774551e-07, "logits/chosen": 1.8743324279785156, "logits/rejected": 4.593924522399902, "logps/chosen": -735.50341796875, "logps/rejected": -1268.7764892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.629220962524414, "rewards/margins": 36.92953872680664, "rewards/rejected": -47.55876159667969, "step": 4140 }, { "epoch": 2.576049766718507, "grad_norm": 0.32990846037864685, "learning_rate": 7.837713231904104e-07, "logits/chosen": -0.5682883262634277, "logits/rejected": 2.464709758758545, "logps/chosen": -625.6953125, "logps/rejected": -1091.140869140625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -11.468364715576172, "rewards/margins": 29.091880798339844, "rewards/rejected": -40.560245513916016, "step": 4141 }, { "epoch": 2.5766718506998445, "grad_norm": 0.0006232153391465545, "learning_rate": 7.826187183033657e-07, "logits/chosen": 0.9526250958442688, "logits/rejected": 4.00775146484375, "logps/chosen": -487.50927734375, "logps/rejected": -968.1290283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.43969440460205, "rewards/margins": 27.86787986755371, "rewards/rejected": -37.30757522583008, "step": 4142 }, { "epoch": 2.577293934681182, "grad_norm": 8.784759870650305e-07, "learning_rate": 7.81466113416321e-07, "logits/chosen": -1.9092073440551758, "logits/rejected": 1.6192662715911865, "logps/chosen": -502.8148193359375, "logps/rejected": -995.5576171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.434041976928711, "rewards/margins": 29.768796920776367, "rewards/rejected": -37.202842712402344, "step": 4143 }, { "epoch": 2.5779160186625196, "grad_norm": 0.00010087468399433419, "learning_rate": 7.803135085292762e-07, "logits/chosen": 2.192214012145996, "logits/rejected": 3.749067544937134, "logps/chosen": -692.218505859375, "logps/rejected": -1114.0401611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.803261756896973, "rewards/margins": 32.304405212402344, "rewards/rejected": -40.107666015625, "step": 4144 }, { "epoch": 2.578538102643857, "grad_norm": 2.3301174223888665e-05, "learning_rate": 7.791609036422315e-07, "logits/chosen": 0.2807917594909668, "logits/rejected": 4.68834924697876, "logps/chosen": -480.9891662597656, "logps/rejected": -1048.3934326171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.968709945678711, "rewards/margins": 30.439815521240234, "rewards/rejected": -41.40852355957031, "step": 4145 }, { "epoch": 2.5791601866251943, "grad_norm": 0.9815332889556885, "learning_rate": 7.780082987551868e-07, "logits/chosen": 0.13096857070922852, "logits/rejected": 3.9499459266662598, "logps/chosen": -571.0997924804688, "logps/rejected": -1036.63134765625, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -10.803068161010742, "rewards/margins": 27.141456604003906, "rewards/rejected": -37.944522857666016, "step": 4146 }, { "epoch": 2.5797822706065316, "grad_norm": 7.814910411834717, "learning_rate": 7.768556938681421e-07, "logits/chosen": 0.5178133249282837, "logits/rejected": 1.794018268585205, "logps/chosen": -532.0501098632812, "logps/rejected": -758.7357177734375, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": -8.415571212768555, "rewards/margins": 16.038002014160156, "rewards/rejected": -24.453575134277344, "step": 4147 }, { "epoch": 2.5804043545878694, "grad_norm": 2.3325019693487548e-09, "learning_rate": 7.757030889810973e-07, "logits/chosen": 0.5286740064620972, "logits/rejected": 3.41743540763855, "logps/chosen": -509.2735900878906, "logps/rejected": -1076.5255126953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.825985908508301, "rewards/margins": 37.71453094482422, "rewards/rejected": -42.54051971435547, "step": 4148 }, { "epoch": 2.5810264385692068, "grad_norm": 1.063908712239936e-06, "learning_rate": 7.745504840940527e-07, "logits/chosen": -1.1882922649383545, "logits/rejected": 4.256172180175781, "logps/chosen": -430.41729736328125, "logps/rejected": -1153.875732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.591100692749023, "rewards/margins": 39.84674835205078, "rewards/rejected": -48.437843322753906, "step": 4149 }, { "epoch": 2.5816485225505446, "grad_norm": 1.0655003279680386e-05, "learning_rate": 7.733978792070079e-07, "logits/chosen": 0.24003244936466217, "logits/rejected": 3.3438003063201904, "logps/chosen": -398.0082092285156, "logps/rejected": -968.8842163085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.836625099182129, "rewards/margins": 35.46302795410156, "rewards/rejected": -44.29965591430664, "step": 4150 }, { "epoch": 2.582270606531882, "grad_norm": 0.35538250207901, "learning_rate": 7.722452743199632e-07, "logits/chosen": -0.1121729165315628, "logits/rejected": 3.873859405517578, "logps/chosen": -515.9468994140625, "logps/rejected": -1137.430419921875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -9.515294075012207, "rewards/margins": 38.027626037597656, "rewards/rejected": -47.54292297363281, "step": 4151 }, { "epoch": 2.5828926905132192, "grad_norm": 0.008247281424701214, "learning_rate": 7.710926694329185e-07, "logits/chosen": 0.6036416888237, "logits/rejected": 3.62092924118042, "logps/chosen": -544.2308349609375, "logps/rejected": -1042.4071044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.00452184677124, "rewards/margins": 30.49679183959961, "rewards/rejected": -35.501312255859375, "step": 4152 }, { "epoch": 2.5835147744945566, "grad_norm": 0.019443973898887634, "learning_rate": 7.699400645458738e-07, "logits/chosen": -0.25185853242874146, "logits/rejected": 3.4223010540008545, "logps/chosen": -470.89715576171875, "logps/rejected": -1007.6849365234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.071162223815918, "rewards/margins": 29.602224349975586, "rewards/rejected": -34.67338562011719, "step": 4153 }, { "epoch": 2.5841368584758944, "grad_norm": 0.03014482371509075, "learning_rate": 7.687874596588291e-07, "logits/chosen": 0.9726011157035828, "logits/rejected": 3.408278226852417, "logps/chosen": -525.7872314453125, "logps/rejected": -966.32666015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.844058990478516, "rewards/margins": 27.53978729248047, "rewards/rejected": -35.383846282958984, "step": 4154 }, { "epoch": 2.5847589424572317, "grad_norm": 0.0024584888014942408, "learning_rate": 7.676348547717843e-07, "logits/chosen": -1.4146912097930908, "logits/rejected": -0.32575997710227966, "logps/chosen": -458.1917724609375, "logps/rejected": -849.0999145507812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.201642990112305, "rewards/margins": 30.501832962036133, "rewards/rejected": -39.70347595214844, "step": 4155 }, { "epoch": 2.585381026438569, "grad_norm": 1.9921230887121055e-07, "learning_rate": 7.664822498847396e-07, "logits/chosen": -1.1639642715454102, "logits/rejected": 1.7085157632827759, "logps/chosen": -621.3360595703125, "logps/rejected": -1296.9610595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.151021957397461, "rewards/margins": 46.671287536621094, "rewards/rejected": -54.82231140136719, "step": 4156 }, { "epoch": 2.586003110419907, "grad_norm": 1.6931274091080972e-11, "learning_rate": 7.653296449976949e-07, "logits/chosen": -1.5741413831710815, "logits/rejected": 2.825228214263916, "logps/chosen": -515.9703369140625, "logps/rejected": -1107.7664794921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.022581100463867, "rewards/margins": 36.04176330566406, "rewards/rejected": -40.0643424987793, "step": 4157 }, { "epoch": 2.586625194401244, "grad_norm": 0.12946701049804688, "learning_rate": 7.641770401106502e-07, "logits/chosen": 1.9064527750015259, "logits/rejected": 3.7687430381774902, "logps/chosen": -698.0882568359375, "logps/rejected": -973.91845703125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -9.335290908813477, "rewards/margins": 25.910003662109375, "rewards/rejected": -35.24529266357422, "step": 4158 }, { "epoch": 2.5872472783825815, "grad_norm": 28.270483016967773, "learning_rate": 7.630244352236054e-07, "logits/chosen": 1.3119711875915527, "logits/rejected": 1.9648979902267456, "logps/chosen": -762.5568237304688, "logps/rejected": -919.1646728515625, "loss": 0.1872, "rewards/accuracies": 0.875, "rewards/chosen": -12.725616455078125, "rewards/margins": 16.814102172851562, "rewards/rejected": -29.539718627929688, "step": 4159 }, { "epoch": 2.5878693623639193, "grad_norm": 1.0774549991765525e-05, "learning_rate": 7.618718303365608e-07, "logits/chosen": 0.4979928135871887, "logits/rejected": 3.370811700820923, "logps/chosen": -485.9146423339844, "logps/rejected": -1032.0020751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.306400299072266, "rewards/margins": 30.54342269897461, "rewards/rejected": -38.849822998046875, "step": 4160 }, { "epoch": 2.5884914463452566, "grad_norm": 0.6942704916000366, "learning_rate": 7.607192254495159e-07, "logits/chosen": 1.4790562391281128, "logits/rejected": 4.277195930480957, "logps/chosen": -449.6810302734375, "logps/rejected": -966.2116088867188, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -6.123173713684082, "rewards/margins": 26.700340270996094, "rewards/rejected": -32.82351303100586, "step": 4161 }, { "epoch": 2.589113530326594, "grad_norm": 0.10654214769601822, "learning_rate": 7.595666205624712e-07, "logits/chosen": -1.6439287662506104, "logits/rejected": 4.115915298461914, "logps/chosen": -498.05078125, "logps/rejected": -1107.330810546875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.457066059112549, "rewards/margins": 31.892675399780273, "rewards/rejected": -38.34973907470703, "step": 4162 }, { "epoch": 2.5897356143079318, "grad_norm": 2.1090505697429762e-07, "learning_rate": 7.584140156754264e-07, "logits/chosen": 1.5892367362976074, "logits/rejected": 3.2688088417053223, "logps/chosen": -545.9234619140625, "logps/rejected": -955.6949462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.612250328063965, "rewards/margins": 31.79043197631836, "rewards/rejected": -38.402679443359375, "step": 4163 }, { "epoch": 2.590357698289269, "grad_norm": 0.054204028099775314, "learning_rate": 7.572614107883818e-07, "logits/chosen": 1.239471673965454, "logits/rejected": 2.5942583084106445, "logps/chosen": -691.4280395507812, "logps/rejected": -923.4697265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.972702026367188, "rewards/margins": 16.35270881652832, "rewards/rejected": -25.325408935546875, "step": 4164 }, { "epoch": 2.5909797822706064, "grad_norm": 0.05186247453093529, "learning_rate": 7.56108805901337e-07, "logits/chosen": 1.7655837535858154, "logits/rejected": 3.956921100616455, "logps/chosen": -590.0676879882812, "logps/rejected": -1094.4619140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.105931282043457, "rewards/margins": 33.443580627441406, "rewards/rejected": -41.54951477050781, "step": 4165 }, { "epoch": 2.5916018662519438, "grad_norm": 0.020832421258091927, "learning_rate": 7.549562010142923e-07, "logits/chosen": -0.06915748119354248, "logits/rejected": 4.698666572570801, "logps/chosen": -433.6332702636719, "logps/rejected": -938.1735229492188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.690306663513184, "rewards/margins": 25.77753448486328, "rewards/rejected": -32.46784210205078, "step": 4166 }, { "epoch": 2.5922239502332816, "grad_norm": 0.0976177230477333, "learning_rate": 7.538035961272477e-07, "logits/chosen": 3.007206916809082, "logits/rejected": 3.8235087394714355, "logps/chosen": -775.51904296875, "logps/rejected": -1105.434814453125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -10.156403541564941, "rewards/margins": 24.128873825073242, "rewards/rejected": -34.2852783203125, "step": 4167 }, { "epoch": 2.592846034214619, "grad_norm": 0.0019352661911398172, "learning_rate": 7.526509912402029e-07, "logits/chosen": -1.8974697589874268, "logits/rejected": 3.135394334793091, "logps/chosen": -375.7920837402344, "logps/rejected": -905.7713012695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.973499298095703, "rewards/margins": 31.2205867767334, "rewards/rejected": -37.19408416748047, "step": 4168 }, { "epoch": 2.5934681181959567, "grad_norm": 0.020845483988523483, "learning_rate": 7.514983863531582e-07, "logits/chosen": 1.5700950622558594, "logits/rejected": 4.825531482696533, "logps/chosen": -542.24951171875, "logps/rejected": -868.7842407226562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.495453834533691, "rewards/margins": 17.05913734436035, "rewards/rejected": -24.55459213256836, "step": 4169 }, { "epoch": 2.594090202177294, "grad_norm": 1.3278538801841933e-07, "learning_rate": 7.503457814661134e-07, "logits/chosen": -1.3590807914733887, "logits/rejected": 3.1065785884857178, "logps/chosen": -428.7196044921875, "logps/rejected": -1092.657470703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.047929763793945, "rewards/margins": 36.87749099731445, "rewards/rejected": -44.92542266845703, "step": 4170 }, { "epoch": 2.5947122861586314, "grad_norm": 0.5693565011024475, "learning_rate": 7.491931765790688e-07, "logits/chosen": -1.617149829864502, "logits/rejected": 2.5822722911834717, "logps/chosen": -352.824951171875, "logps/rejected": -885.1978759765625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -7.319247245788574, "rewards/margins": 29.19968032836914, "rewards/rejected": -36.51892852783203, "step": 4171 }, { "epoch": 2.5953343701399687, "grad_norm": 0.016859617084264755, "learning_rate": 7.48040571692024e-07, "logits/chosen": -1.226083755493164, "logits/rejected": 1.5435649156570435, "logps/chosen": -426.6649169921875, "logps/rejected": -858.7387084960938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.303621292114258, "rewards/margins": 23.738893508911133, "rewards/rejected": -31.04251480102539, "step": 4172 }, { "epoch": 2.5959564541213065, "grad_norm": 1.0266690830817993e-09, "learning_rate": 7.468879668049793e-07, "logits/chosen": -1.7223788499832153, "logits/rejected": 3.0244596004486084, "logps/chosen": -518.0752563476562, "logps/rejected": -1203.6407470703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.524657249450684, "rewards/margins": 37.38650131225586, "rewards/rejected": -43.911155700683594, "step": 4173 }, { "epoch": 2.596578538102644, "grad_norm": 1.537376556370873e-05, "learning_rate": 7.457353619179345e-07, "logits/chosen": -2.407674789428711, "logits/rejected": 2.881885051727295, "logps/chosen": -268.8407897949219, "logps/rejected": -897.4033203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.9550981521606445, "rewards/margins": 34.330726623535156, "rewards/rejected": -39.285823822021484, "step": 4174 }, { "epoch": 2.5972006220839816, "grad_norm": 7.735238614259288e-05, "learning_rate": 7.445827570308899e-07, "logits/chosen": -0.7990444898605347, "logits/rejected": 3.784900665283203, "logps/chosen": -430.090087890625, "logps/rejected": -1010.958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.116263389587402, "rewards/margins": 29.52863311767578, "rewards/rejected": -37.6448974609375, "step": 4175 }, { "epoch": 2.597822706065319, "grad_norm": 0.20494690537452698, "learning_rate": 7.434301521438451e-07, "logits/chosen": -1.503658652305603, "logits/rejected": 3.8516783714294434, "logps/chosen": -463.29217529296875, "logps/rejected": -1050.017578125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -6.306408405303955, "rewards/margins": 33.17970275878906, "rewards/rejected": -39.486106872558594, "step": 4176 }, { "epoch": 2.5984447900466563, "grad_norm": 0.01972721703350544, "learning_rate": 7.422775472568004e-07, "logits/chosen": 0.09046536684036255, "logits/rejected": 4.269755840301514, "logps/chosen": -529.8043212890625, "logps/rejected": -1187.9677734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.779526710510254, "rewards/margins": 39.77001190185547, "rewards/rejected": -49.54953384399414, "step": 4177 }, { "epoch": 2.5990668740279936, "grad_norm": 0.051523786038160324, "learning_rate": 7.411249423697558e-07, "logits/chosen": 1.389330267906189, "logits/rejected": 4.2536211013793945, "logps/chosen": -682.2140502929688, "logps/rejected": -1222.2752685546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.666873931884766, "rewards/margins": 33.9172477722168, "rewards/rejected": -46.58412170410156, "step": 4178 }, { "epoch": 2.5996889580093314, "grad_norm": 0.0033380291424691677, "learning_rate": 7.39972337482711e-07, "logits/chosen": 1.5119990110397339, "logits/rejected": 3.1918954849243164, "logps/chosen": -627.9554443359375, "logps/rejected": -1112.017822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.91726303100586, "rewards/margins": 32.120941162109375, "rewards/rejected": -47.038204193115234, "step": 4179 }, { "epoch": 2.6003110419906688, "grad_norm": 0.20266841351985931, "learning_rate": 7.388197325956663e-07, "logits/chosen": 1.38478422164917, "logits/rejected": 3.9541900157928467, "logps/chosen": -599.4830322265625, "logps/rejected": -1026.7489013671875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -7.972454071044922, "rewards/margins": 31.13959503173828, "rewards/rejected": -39.11205291748047, "step": 4180 }, { "epoch": 2.600933125972006, "grad_norm": 35.50774002075195, "learning_rate": 7.376671277086215e-07, "logits/chosen": 0.7602481245994568, "logits/rejected": 2.566112995147705, "logps/chosen": -683.4918823242188, "logps/rejected": -1027.614501953125, "loss": 0.3128, "rewards/accuracies": 0.875, "rewards/chosen": -13.960643768310547, "rewards/margins": 21.349445343017578, "rewards/rejected": -35.310089111328125, "step": 4181 }, { "epoch": 2.601555209953344, "grad_norm": 0.0006896215490996838, "learning_rate": 7.365145228215769e-07, "logits/chosen": -0.33327698707580566, "logits/rejected": 1.4698470830917358, "logps/chosen": -489.9069519042969, "logps/rejected": -799.717041015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.703483581542969, "rewards/margins": 22.704559326171875, "rewards/rejected": -32.408042907714844, "step": 4182 }, { "epoch": 2.6021772939346812, "grad_norm": 1.4685786962509155, "learning_rate": 7.353619179345321e-07, "logits/chosen": 2.564486265182495, "logits/rejected": 5.060420989990234, "logps/chosen": -447.52593994140625, "logps/rejected": -886.5484619140625, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": -6.470832347869873, "rewards/margins": 30.235698699951172, "rewards/rejected": -36.70652770996094, "step": 4183 }, { "epoch": 2.6027993779160186, "grad_norm": 8.54348618304357e-05, "learning_rate": 7.342093130474874e-07, "logits/chosen": 1.4889253377914429, "logits/rejected": 3.0097999572753906, "logps/chosen": -586.3075561523438, "logps/rejected": -1041.024658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.637677192687988, "rewards/margins": 31.820865631103516, "rewards/rejected": -40.45854187011719, "step": 4184 }, { "epoch": 2.603421461897356, "grad_norm": 13.519923210144043, "learning_rate": 7.330567081604426e-07, "logits/chosen": -0.6276093125343323, "logits/rejected": 3.1709046363830566, "logps/chosen": -478.00921630859375, "logps/rejected": -950.1629028320312, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": -7.828385353088379, "rewards/margins": 25.66582489013672, "rewards/rejected": -33.49420928955078, "step": 4185 }, { "epoch": 2.6040435458786937, "grad_norm": 1.037954278082509e-09, "learning_rate": 7.31904103273398e-07, "logits/chosen": -0.8776130080223083, "logits/rejected": 3.4253296852111816, "logps/chosen": -388.2559509277344, "logps/rejected": -1020.8163452148438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.2152509689331055, "rewards/margins": 38.56673049926758, "rewards/rejected": -44.781982421875, "step": 4186 }, { "epoch": 2.604665629860031, "grad_norm": 0.04148663580417633, "learning_rate": 7.307514983863533e-07, "logits/chosen": -2.193047046661377, "logits/rejected": 2.826978921890259, "logps/chosen": -340.1023254394531, "logps/rejected": -961.1428833007812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.069295883178711, "rewards/margins": 31.567243576049805, "rewards/rejected": -37.636539459228516, "step": 4187 }, { "epoch": 2.605287713841369, "grad_norm": 0.0005553133087232709, "learning_rate": 7.295988934993085e-07, "logits/chosen": 0.2553267478942871, "logits/rejected": 2.0529046058654785, "logps/chosen": -596.5380249023438, "logps/rejected": -1032.4805908203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.72260856628418, "rewards/margins": 35.551048278808594, "rewards/rejected": -46.27365493774414, "step": 4188 }, { "epoch": 2.605909797822706, "grad_norm": 0.00013095911708660424, "learning_rate": 7.284462886122639e-07, "logits/chosen": 0.29065048694610596, "logits/rejected": 3.1158859729766846, "logps/chosen": -503.50836181640625, "logps/rejected": -957.869384765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.637614727020264, "rewards/margins": 27.772708892822266, "rewards/rejected": -34.41032409667969, "step": 4189 }, { "epoch": 2.6065318818040435, "grad_norm": 0.001143694738857448, "learning_rate": 7.272936837252191e-07, "logits/chosen": -0.21890854835510254, "logits/rejected": 3.120697498321533, "logps/chosen": -594.9986572265625, "logps/rejected": -1057.1905517578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.957072257995605, "rewards/margins": 29.239805221557617, "rewards/rejected": -41.196876525878906, "step": 4190 }, { "epoch": 2.607153965785381, "grad_norm": 9.123002087108034e-08, "learning_rate": 7.261410788381744e-07, "logits/chosen": 3.8174686431884766, "logits/rejected": 4.417840003967285, "logps/chosen": -737.0982055664062, "logps/rejected": -1032.296142578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.874161720275879, "rewards/margins": 29.188743591308594, "rewards/rejected": -37.06290817260742, "step": 4191 }, { "epoch": 2.6077760497667186, "grad_norm": 0.0028207304421812296, "learning_rate": 7.249884739511296e-07, "logits/chosen": -0.5841156244277954, "logits/rejected": 2.005009889602661, "logps/chosen": -495.2757568359375, "logps/rejected": -946.2617797851562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.724778175354004, "rewards/margins": 30.42086410522461, "rewards/rejected": -39.14564514160156, "step": 4192 }, { "epoch": 2.608398133748056, "grad_norm": 0.032835643738508224, "learning_rate": 7.23835869064085e-07, "logits/chosen": 0.36219626665115356, "logits/rejected": 3.0236282348632812, "logps/chosen": -582.063720703125, "logps/rejected": -1049.0797119140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.190608978271484, "rewards/margins": 28.47592544555664, "rewards/rejected": -38.666534423828125, "step": 4193 }, { "epoch": 2.6090202177293937, "grad_norm": 9.084986651863858e-10, "learning_rate": 7.226832641770402e-07, "logits/chosen": -0.41260039806365967, "logits/rejected": 3.4702682495117188, "logps/chosen": -510.906982421875, "logps/rejected": -1089.784912109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.021139144897461, "rewards/margins": 36.50783157348633, "rewards/rejected": -45.528968811035156, "step": 4194 }, { "epoch": 2.609642301710731, "grad_norm": 0.00016271619824692607, "learning_rate": 7.215306592899954e-07, "logits/chosen": -1.4239130020141602, "logits/rejected": 3.2922325134277344, "logps/chosen": -454.74200439453125, "logps/rejected": -1156.666748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.663017272949219, "rewards/margins": 35.54042053222656, "rewards/rejected": -45.20343780517578, "step": 4195 }, { "epoch": 2.6102643856920684, "grad_norm": 0.011077502742409706, "learning_rate": 7.203780544029507e-07, "logits/chosen": 0.8665391802787781, "logits/rejected": 3.879478931427002, "logps/chosen": -521.7965087890625, "logps/rejected": -966.2922973632812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.905708312988281, "rewards/margins": 27.83869171142578, "rewards/rejected": -36.74440002441406, "step": 4196 }, { "epoch": 2.6108864696734058, "grad_norm": 3.622733856900595e-05, "learning_rate": 7.19225449515906e-07, "logits/chosen": 0.4364289343357086, "logits/rejected": 3.5153493881225586, "logps/chosen": -476.50048828125, "logps/rejected": -972.5951538085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.1321916580200195, "rewards/margins": 33.4731330871582, "rewards/rejected": -38.605323791503906, "step": 4197 }, { "epoch": 2.6115085536547435, "grad_norm": 0.9528161883354187, "learning_rate": 7.180728446288612e-07, "logits/chosen": 2.507401943206787, "logits/rejected": 3.8850791454315186, "logps/chosen": -719.9276733398438, "logps/rejected": -1035.9569091796875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -9.24326229095459, "rewards/margins": 26.770294189453125, "rewards/rejected": -36.01355743408203, "step": 4198 }, { "epoch": 2.612130637636081, "grad_norm": 1.709455204945698e-06, "learning_rate": 7.169202397418165e-07, "logits/chosen": -4.1853742599487305, "logits/rejected": 3.332636594772339, "logps/chosen": -352.78643798828125, "logps/rejected": -1202.97021484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.660493850708008, "rewards/margins": 36.10469055175781, "rewards/rejected": -40.76518249511719, "step": 4199 }, { "epoch": 2.6127527216174182, "grad_norm": 0.04437851160764694, "learning_rate": 7.157676348547718e-07, "logits/chosen": -1.551134467124939, "logits/rejected": 1.4714164733886719, "logps/chosen": -543.7584838867188, "logps/rejected": -1100.864013671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.794875621795654, "rewards/margins": 33.39886474609375, "rewards/rejected": -41.19374084472656, "step": 4200 }, { "epoch": 2.613374805598756, "grad_norm": 2.520590305328369, "learning_rate": 7.146150299677271e-07, "logits/chosen": 0.9646257162094116, "logits/rejected": 3.8723604679107666, "logps/chosen": -569.6227416992188, "logps/rejected": -960.802734375, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -8.596766471862793, "rewards/margins": 24.772981643676758, "rewards/rejected": -33.3697509765625, "step": 4201 }, { "epoch": 2.6139968895800934, "grad_norm": 0.025293413549661636, "learning_rate": 7.134624250806823e-07, "logits/chosen": 0.3296043872833252, "logits/rejected": 3.7700836658477783, "logps/chosen": -566.0994873046875, "logps/rejected": -1046.325439453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.096038818359375, "rewards/margins": 27.5986328125, "rewards/rejected": -36.694671630859375, "step": 4202 }, { "epoch": 2.6146189735614307, "grad_norm": 0.2624208629131317, "learning_rate": 7.123098201936376e-07, "logits/chosen": 1.185504674911499, "logits/rejected": 3.8389434814453125, "logps/chosen": -528.7069702148438, "logps/rejected": -997.303466796875, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -8.241028785705566, "rewards/margins": 23.622182846069336, "rewards/rejected": -31.86321258544922, "step": 4203 }, { "epoch": 2.615241057542768, "grad_norm": 0.08798151463270187, "learning_rate": 7.11157215306593e-07, "logits/chosen": -4.298510551452637, "logits/rejected": 1.096667766571045, "logps/chosen": -258.92425537109375, "logps/rejected": -887.087646484375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.425273895263672, "rewards/margins": 31.15479278564453, "rewards/rejected": -35.5800666809082, "step": 4204 }, { "epoch": 2.615863141524106, "grad_norm": 0.2985028326511383, "learning_rate": 7.100046104195482e-07, "logits/chosen": 1.4374034404754639, "logits/rejected": 4.240245342254639, "logps/chosen": -624.4095458984375, "logps/rejected": -1162.5343017578125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.385326385498047, "rewards/margins": 32.3258056640625, "rewards/rejected": -40.71113204956055, "step": 4205 }, { "epoch": 2.616485225505443, "grad_norm": 16.65906524658203, "learning_rate": 7.088520055325035e-07, "logits/chosen": 1.678907036781311, "logits/rejected": 4.096469402313232, "logps/chosen": -486.4000244140625, "logps/rejected": -908.15576171875, "loss": 0.0819, "rewards/accuracies": 1.0, "rewards/chosen": -5.184075355529785, "rewards/margins": 24.814620971679688, "rewards/rejected": -29.998699188232422, "step": 4206 }, { "epoch": 2.617107309486781, "grad_norm": 0.00014799633936490864, "learning_rate": 7.076994006454588e-07, "logits/chosen": 0.9702072739601135, "logits/rejected": 3.7747740745544434, "logps/chosen": -652.5638427734375, "logps/rejected": -1131.428955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.863973617553711, "rewards/margins": 33.608184814453125, "rewards/rejected": -42.47215270996094, "step": 4207 }, { "epoch": 2.6177293934681183, "grad_norm": 8.743905345909297e-05, "learning_rate": 7.065467957584141e-07, "logits/chosen": 2.3410634994506836, "logits/rejected": 2.0488579273223877, "logps/chosen": -680.400146484375, "logps/rejected": -1035.8314208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.8915433883667, "rewards/margins": 28.13915252685547, "rewards/rejected": -37.030696868896484, "step": 4208 }, { "epoch": 2.6183514774494556, "grad_norm": 5.205459956414416e-07, "learning_rate": 7.053941908713693e-07, "logits/chosen": -0.7137738466262817, "logits/rejected": 3.436619997024536, "logps/chosen": -405.1341247558594, "logps/rejected": -900.43310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.368278980255127, "rewards/margins": 26.511754989624023, "rewards/rejected": -31.880033493041992, "step": 4209 }, { "epoch": 2.618973561430793, "grad_norm": 0.01867208629846573, "learning_rate": 7.042415859843246e-07, "logits/chosen": -0.14740872383117676, "logits/rejected": 0.6668645739555359, "logps/chosen": -590.059326171875, "logps/rejected": -878.8848876953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.67483901977539, "rewards/margins": 27.501344680786133, "rewards/rejected": -36.176185607910156, "step": 4210 }, { "epoch": 2.6195956454121307, "grad_norm": 0.36106786131858826, "learning_rate": 7.030889810972799e-07, "logits/chosen": -1.5582423210144043, "logits/rejected": 1.5341298580169678, "logps/chosen": -435.0976257324219, "logps/rejected": -801.2057495117188, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -8.826573371887207, "rewards/margins": 23.22080421447754, "rewards/rejected": -32.04737854003906, "step": 4211 }, { "epoch": 2.620217729393468, "grad_norm": 0.06946823000907898, "learning_rate": 7.019363762102352e-07, "logits/chosen": -0.030036628246307373, "logits/rejected": 3.2860264778137207, "logps/chosen": -477.8688049316406, "logps/rejected": -938.0089111328125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.1399126052856445, "rewards/margins": 25.384279251098633, "rewards/rejected": -30.524192810058594, "step": 4212 }, { "epoch": 2.620839813374806, "grad_norm": 4.984333038330078, "learning_rate": 7.007837713231904e-07, "logits/chosen": -2.0527803897857666, "logits/rejected": 2.763148307800293, "logps/chosen": -382.4664306640625, "logps/rejected": -986.9502563476562, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": -7.928335189819336, "rewards/margins": 27.36969757080078, "rewards/rejected": -35.298030853271484, "step": 4213 }, { "epoch": 2.621461897356143, "grad_norm": 0.27239108085632324, "learning_rate": 6.996311664361458e-07, "logits/chosen": 0.09156519174575806, "logits/rejected": 4.214585304260254, "logps/chosen": -508.62384033203125, "logps/rejected": -1109.5693359375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -10.311394691467285, "rewards/margins": 32.705902099609375, "rewards/rejected": -43.017295837402344, "step": 4214 }, { "epoch": 2.6220839813374806, "grad_norm": 5.2649149438366294e-05, "learning_rate": 6.984785615491011e-07, "logits/chosen": -0.7687242031097412, "logits/rejected": 2.682828903198242, "logps/chosen": -498.4187927246094, "logps/rejected": -976.85107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.1673665046691895, "rewards/margins": 24.873815536499023, "rewards/rejected": -32.04118347167969, "step": 4215 }, { "epoch": 2.622706065318818, "grad_norm": 0.0028433254919946194, "learning_rate": 6.973259566620563e-07, "logits/chosen": 0.5688213109970093, "logits/rejected": 1.6913548707962036, "logps/chosen": -529.4207763671875, "logps/rejected": -961.5525512695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.866388320922852, "rewards/margins": 31.74820327758789, "rewards/rejected": -40.61458969116211, "step": 4216 }, { "epoch": 2.6233281493001557, "grad_norm": 0.11423134058713913, "learning_rate": 6.961733517750116e-07, "logits/chosen": 1.4282294511795044, "logits/rejected": 3.881552219390869, "logps/chosen": -537.96533203125, "logps/rejected": -983.5394897460938, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -8.764850616455078, "rewards/margins": 26.096805572509766, "rewards/rejected": -34.86166000366211, "step": 4217 }, { "epoch": 2.623950233281493, "grad_norm": 2.352363480895292e-05, "learning_rate": 6.950207468879669e-07, "logits/chosen": 1.5548824071884155, "logits/rejected": 3.7878293991088867, "logps/chosen": -506.8604736328125, "logps/rejected": -974.7597045898438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.410656929016113, "rewards/margins": 29.260358810424805, "rewards/rejected": -36.671016693115234, "step": 4218 }, { "epoch": 2.6245723172628304, "grad_norm": 0.002875835634768009, "learning_rate": 6.938681420009222e-07, "logits/chosen": 1.1211720705032349, "logits/rejected": 1.8514307737350464, "logps/chosen": -586.6343994140625, "logps/rejected": -912.5908813476562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.89848518371582, "rewards/margins": 24.800308227539062, "rewards/rejected": -35.698795318603516, "step": 4219 }, { "epoch": 2.625194401244168, "grad_norm": 29.071430206298828, "learning_rate": 6.927155371138774e-07, "logits/chosen": 1.4410120248794556, "logits/rejected": 5.300469875335693, "logps/chosen": -619.1028442382812, "logps/rejected": -1141.4307861328125, "loss": 0.1668, "rewards/accuracies": 0.875, "rewards/chosen": -11.94244384765625, "rewards/margins": 26.28636932373047, "rewards/rejected": -38.22881317138672, "step": 4220 }, { "epoch": 2.6258164852255055, "grad_norm": 0.12146138399839401, "learning_rate": 6.915629322268328e-07, "logits/chosen": 0.5489065051078796, "logits/rejected": 2.0479743480682373, "logps/chosen": -506.0579833984375, "logps/rejected": -892.6707153320312, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -7.431719779968262, "rewards/margins": 28.322105407714844, "rewards/rejected": -35.75382614135742, "step": 4221 }, { "epoch": 2.626438569206843, "grad_norm": 48.736846923828125, "learning_rate": 6.90410327339788e-07, "logits/chosen": -0.009646058082580566, "logits/rejected": 3.260806083679199, "logps/chosen": -595.076171875, "logps/rejected": -1000.2465209960938, "loss": 0.2363, "rewards/accuracies": 0.875, "rewards/chosen": -10.931441307067871, "rewards/margins": 22.281841278076172, "rewards/rejected": -33.21328353881836, "step": 4222 }, { "epoch": 2.62706065318818, "grad_norm": 0.11191926151514053, "learning_rate": 6.892577224527433e-07, "logits/chosen": 0.5235657691955566, "logits/rejected": 1.8013144731521606, "logps/chosen": -515.4962158203125, "logps/rejected": -802.4577026367188, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -6.3546319007873535, "rewards/margins": 20.22857093811035, "rewards/rejected": -26.583202362060547, "step": 4223 }, { "epoch": 2.627682737169518, "grad_norm": 6.194474266862926e-09, "learning_rate": 6.881051175656986e-07, "logits/chosen": -1.7713825702667236, "logits/rejected": 0.584291398525238, "logps/chosen": -435.9535827636719, "logps/rejected": -888.5193481445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.4394989013671875, "rewards/margins": 29.70003318786621, "rewards/rejected": -37.13953399658203, "step": 4224 }, { "epoch": 2.6283048211508553, "grad_norm": 0.0005869278102181852, "learning_rate": 6.869525126786539e-07, "logits/chosen": 0.9013730883598328, "logits/rejected": 3.2150769233703613, "logps/chosen": -431.56793212890625, "logps/rejected": -943.7103271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.473167419433594, "rewards/margins": 33.50733184814453, "rewards/rejected": -41.980499267578125, "step": 4225 }, { "epoch": 2.628926905132193, "grad_norm": 3.5658481121063232, "learning_rate": 6.857999077916092e-07, "logits/chosen": 1.838148832321167, "logits/rejected": 2.4090681076049805, "logps/chosen": -568.6758422851562, "logps/rejected": -890.5926513671875, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -7.6999053955078125, "rewards/margins": 27.453750610351562, "rewards/rejected": -35.153656005859375, "step": 4226 }, { "epoch": 2.6295489891135304, "grad_norm": 0.2820209562778473, "learning_rate": 6.846473029045644e-07, "logits/chosen": -0.5001500844955444, "logits/rejected": 3.3401925563812256, "logps/chosen": -506.65838623046875, "logps/rejected": -1072.645751953125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -6.734173774719238, "rewards/margins": 38.09546661376953, "rewards/rejected": -44.82963943481445, "step": 4227 }, { "epoch": 2.6301710730948678, "grad_norm": 0.4728274345397949, "learning_rate": 6.834946980175195e-07, "logits/chosen": 1.5491831302642822, "logits/rejected": 2.317342519760132, "logps/chosen": -675.6054077148438, "logps/rejected": -868.92236328125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -9.160175323486328, "rewards/margins": 17.74323844909668, "rewards/rejected": -26.903411865234375, "step": 4228 }, { "epoch": 2.630793157076205, "grad_norm": 9.116470336914062, "learning_rate": 6.823420931304749e-07, "logits/chosen": -1.4375827312469482, "logits/rejected": 2.915127754211426, "logps/chosen": -476.7994384765625, "logps/rejected": -1074.4156494140625, "loss": 0.0514, "rewards/accuracies": 1.0, "rewards/chosen": -8.458136558532715, "rewards/margins": 31.899354934692383, "rewards/rejected": -40.35749053955078, "step": 4229 }, { "epoch": 2.631415241057543, "grad_norm": 5.379433787311427e-07, "learning_rate": 6.811894882434302e-07, "logits/chosen": -0.5646690130233765, "logits/rejected": 2.736670970916748, "logps/chosen": -390.4752197265625, "logps/rejected": -953.4425048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.021817684173584, "rewards/margins": 34.986270904541016, "rewards/rejected": -41.008087158203125, "step": 4230 }, { "epoch": 2.63203732503888, "grad_norm": 0.08292002230882645, "learning_rate": 6.800368833563854e-07, "logits/chosen": -0.05222213268280029, "logits/rejected": 3.956101179122925, "logps/chosen": -325.4035339355469, "logps/rejected": -900.9954833984375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.976347923278809, "rewards/margins": 30.259437561035156, "rewards/rejected": -36.23578643798828, "step": 4231 }, { "epoch": 2.632659409020218, "grad_norm": 0.009427044540643692, "learning_rate": 6.788842784693408e-07, "logits/chosen": 1.4390075206756592, "logits/rejected": 3.480053186416626, "logps/chosen": -663.7787475585938, "logps/rejected": -959.8733520507812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.171966552734375, "rewards/margins": 21.327939987182617, "rewards/rejected": -31.49990463256836, "step": 4232 }, { "epoch": 2.6332814930015553, "grad_norm": 0.06294838339090347, "learning_rate": 6.77731673582296e-07, "logits/chosen": -0.3899800181388855, "logits/rejected": 3.326247215270996, "logps/chosen": -421.4250793457031, "logps/rejected": -1036.8131103515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.207780361175537, "rewards/margins": 35.97682189941406, "rewards/rejected": -40.184600830078125, "step": 4233 }, { "epoch": 2.6339035769828927, "grad_norm": 0.1459599733352661, "learning_rate": 6.765790686952513e-07, "logits/chosen": 0.2932206392288208, "logits/rejected": 3.4834704399108887, "logps/chosen": -529.5743408203125, "logps/rejected": -1094.4615478515625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -12.644756317138672, "rewards/margins": 33.56782150268555, "rewards/rejected": -46.21257781982422, "step": 4234 }, { "epoch": 2.63452566096423, "grad_norm": 3.442986098889378e-06, "learning_rate": 6.754264638082065e-07, "logits/chosen": -1.2240034341812134, "logits/rejected": 2.0704448223114014, "logps/chosen": -433.75238037109375, "logps/rejected": -1079.27978515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.548064231872559, "rewards/margins": 40.05144119262695, "rewards/rejected": -45.59950637817383, "step": 4235 }, { "epoch": 2.635147744945568, "grad_norm": 2.317560911178589, "learning_rate": 6.742738589211619e-07, "logits/chosen": -0.23785221576690674, "logits/rejected": 4.1717119216918945, "logps/chosen": -499.7503662109375, "logps/rejected": -1082.9619140625, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -6.164165496826172, "rewards/margins": 29.362163543701172, "rewards/rejected": -35.526329040527344, "step": 4236 }, { "epoch": 2.635769828926905, "grad_norm": 0.22036594152450562, "learning_rate": 6.731212540341171e-07, "logits/chosen": 1.2997395992279053, "logits/rejected": 2.5405406951904297, "logps/chosen": -779.3462524414062, "logps/rejected": -1011.1041870117188, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -13.838093757629395, "rewards/margins": 20.889070510864258, "rewards/rejected": -34.72716522216797, "step": 4237 }, { "epoch": 2.6363919129082425, "grad_norm": 0.002922237850725651, "learning_rate": 6.719686491470724e-07, "logits/chosen": -0.7122418880462646, "logits/rejected": 2.964954376220703, "logps/chosen": -419.7462463378906, "logps/rejected": -898.150634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.0358076095581055, "rewards/margins": 24.98333168029785, "rewards/rejected": -31.019140243530273, "step": 4238 }, { "epoch": 2.6370139968895803, "grad_norm": 0.96134352684021, "learning_rate": 6.708160442600276e-07, "logits/chosen": 0.594042181968689, "logits/rejected": 2.318164348602295, "logps/chosen": -532.7374877929688, "logps/rejected": -886.8975830078125, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -12.02415657043457, "rewards/margins": 23.79339599609375, "rewards/rejected": -35.81755447387695, "step": 4239 }, { "epoch": 2.6376360808709176, "grad_norm": 44.25870132446289, "learning_rate": 6.69663439372983e-07, "logits/chosen": 2.3529224395751953, "logits/rejected": 1.3345285654067993, "logps/chosen": -757.313232421875, "logps/rejected": -974.56298828125, "loss": 0.6363, "rewards/accuracies": 0.875, "rewards/chosen": -13.141068458557129, "rewards/margins": 16.502092361450195, "rewards/rejected": -29.643163681030273, "step": 4240 }, { "epoch": 2.638258164852255, "grad_norm": 22.81918716430664, "learning_rate": 6.685108344859383e-07, "logits/chosen": 2.233950614929199, "logits/rejected": 3.8848304748535156, "logps/chosen": -753.2386474609375, "logps/rejected": -1093.29833984375, "loss": 0.1278, "rewards/accuracies": 0.875, "rewards/chosen": -10.840153694152832, "rewards/margins": 23.329586029052734, "rewards/rejected": -34.16973876953125, "step": 4241 }, { "epoch": 2.6388802488335923, "grad_norm": 0.006664901971817017, "learning_rate": 6.673582295988935e-07, "logits/chosen": 0.8757166266441345, "logits/rejected": 1.6427512168884277, "logps/chosen": -549.896728515625, "logps/rejected": -811.0697631835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.502927780151367, "rewards/margins": 22.282499313354492, "rewards/rejected": -30.78542709350586, "step": 4242 }, { "epoch": 2.63950233281493, "grad_norm": 0.22620686888694763, "learning_rate": 6.662056247118489e-07, "logits/chosen": 0.053068965673446655, "logits/rejected": 2.0805678367614746, "logps/chosen": -502.27984619140625, "logps/rejected": -819.5906982421875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -9.042535781860352, "rewards/margins": 20.76211166381836, "rewards/rejected": -29.804649353027344, "step": 4243 }, { "epoch": 2.6401244167962674, "grad_norm": 0.23999188840389252, "learning_rate": 6.650530198248041e-07, "logits/chosen": 2.5088069438934326, "logits/rejected": 3.2695717811584473, "logps/chosen": -784.7813110351562, "logps/rejected": -889.4437255859375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.6802592277526855, "rewards/margins": 17.840471267700195, "rewards/rejected": -25.520729064941406, "step": 4244 }, { "epoch": 2.640746500777605, "grad_norm": 0.001498526893556118, "learning_rate": 6.639004149377594e-07, "logits/chosen": 1.8689944744110107, "logits/rejected": 4.283603191375732, "logps/chosen": -619.4450073242188, "logps/rejected": -1062.1727294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.174517631530762, "rewards/margins": 32.78430938720703, "rewards/rejected": -42.95882797241211, "step": 4245 }, { "epoch": 2.6413685847589425, "grad_norm": 1.6747339032008313e-05, "learning_rate": 6.627478100507146e-07, "logits/chosen": 1.833518385887146, "logits/rejected": 3.115447759628296, "logps/chosen": -584.89404296875, "logps/rejected": -947.2235717773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.330589294433594, "rewards/margins": 30.510927200317383, "rewards/rejected": -37.841514587402344, "step": 4246 }, { "epoch": 2.64199066874028, "grad_norm": 0.00851528998464346, "learning_rate": 6.6159520516367e-07, "logits/chosen": -2.964029312133789, "logits/rejected": 2.7679834365844727, "logps/chosen": -350.8724365234375, "logps/rejected": -992.7243041992188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.659777641296387, "rewards/margins": 33.55049133300781, "rewards/rejected": -42.21027374267578, "step": 4247 }, { "epoch": 2.642612752721617, "grad_norm": 0.0012625380186364055, "learning_rate": 6.604426002766252e-07, "logits/chosen": -0.7499178051948547, "logits/rejected": 2.6562108993530273, "logps/chosen": -449.5132751464844, "logps/rejected": -996.1448364257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.215665817260742, "rewards/margins": 30.846622467041016, "rewards/rejected": -36.062286376953125, "step": 4248 }, { "epoch": 2.643234836702955, "grad_norm": 9.423551716736256e-09, "learning_rate": 6.592899953895805e-07, "logits/chosen": -0.9357733130455017, "logits/rejected": 3.503596067428589, "logps/chosen": -386.92279052734375, "logps/rejected": -1057.938232421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.58002233505249, "rewards/margins": 38.17155838012695, "rewards/rejected": -45.75157928466797, "step": 4249 }, { "epoch": 2.6438569206842923, "grad_norm": 0.0967218279838562, "learning_rate": 6.581373905025359e-07, "logits/chosen": 0.3321017622947693, "logits/rejected": 2.5047409534454346, "logps/chosen": -614.6363525390625, "logps/rejected": -1019.7175903320312, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -10.16756534576416, "rewards/margins": 29.467573165893555, "rewards/rejected": -39.63513946533203, "step": 4250 }, { "epoch": 2.64447900466563, "grad_norm": 0.010432337410748005, "learning_rate": 6.569847856154911e-07, "logits/chosen": -0.8032090663909912, "logits/rejected": 1.1724261045455933, "logps/chosen": -439.4854736328125, "logps/rejected": -797.3453369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.029415607452393, "rewards/margins": 22.670787811279297, "rewards/rejected": -29.70020294189453, "step": 4251 }, { "epoch": 2.6451010886469675, "grad_norm": 0.0005230961251072586, "learning_rate": 6.558321807284464e-07, "logits/chosen": -1.4568581581115723, "logits/rejected": 2.3403358459472656, "logps/chosen": -377.98309326171875, "logps/rejected": -865.370361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.655060291290283, "rewards/margins": 29.115367889404297, "rewards/rejected": -35.77043151855469, "step": 4252 }, { "epoch": 2.645723172628305, "grad_norm": 0.31717145442962646, "learning_rate": 6.546795758414016e-07, "logits/chosen": -1.226088285446167, "logits/rejected": 2.644490957260132, "logps/chosen": -423.13421630859375, "logps/rejected": -875.599853515625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -7.5810723304748535, "rewards/margins": 20.505836486816406, "rewards/rejected": -28.086910247802734, "step": 4253 }, { "epoch": 2.646345256609642, "grad_norm": 0.00606823805719614, "learning_rate": 6.53526970954357e-07, "logits/chosen": -1.1096882820129395, "logits/rejected": 3.2076497077941895, "logps/chosen": -490.0120849609375, "logps/rejected": -1097.8856201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.587215900421143, "rewards/margins": 33.65058898925781, "rewards/rejected": -41.23780059814453, "step": 4254 }, { "epoch": 2.64696734059098, "grad_norm": 0.004925860557705164, "learning_rate": 6.523743660673122e-07, "logits/chosen": -1.5207152366638184, "logits/rejected": 0.664267361164093, "logps/chosen": -440.29119873046875, "logps/rejected": -877.4176025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.222831726074219, "rewards/margins": 26.639625549316406, "rewards/rejected": -35.862457275390625, "step": 4255 }, { "epoch": 2.6475894245723173, "grad_norm": 20.94559097290039, "learning_rate": 6.512217611802675e-07, "logits/chosen": -0.6214895248413086, "logits/rejected": 2.9224040508270264, "logps/chosen": -532.4989013671875, "logps/rejected": -1065.483642578125, "loss": 0.1216, "rewards/accuracies": 0.875, "rewards/chosen": -9.967597007751465, "rewards/margins": 25.820096969604492, "rewards/rejected": -35.78769302368164, "step": 4256 }, { "epoch": 2.6482115085536546, "grad_norm": 0.10524414479732513, "learning_rate": 6.500691562932227e-07, "logits/chosen": 1.6900782585144043, "logits/rejected": 5.192551136016846, "logps/chosen": -601.140625, "logps/rejected": -1032.5745849609375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -14.735567092895508, "rewards/margins": 22.786855697631836, "rewards/rejected": -37.522422790527344, "step": 4257 }, { "epoch": 2.6488335925349924, "grad_norm": 0.00883454643189907, "learning_rate": 6.489165514061781e-07, "logits/chosen": 1.3252453804016113, "logits/rejected": 3.5295932292938232, "logps/chosen": -600.4019775390625, "logps/rejected": -1145.2413330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.716447830200195, "rewards/margins": 39.48173141479492, "rewards/rejected": -48.19818115234375, "step": 4258 }, { "epoch": 2.6494556765163297, "grad_norm": 1.2012814295303542e-05, "learning_rate": 6.477639465191333e-07, "logits/chosen": -1.301712155342102, "logits/rejected": 3.1368601322174072, "logps/chosen": -518.835205078125, "logps/rejected": -1065.5616455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.9481201171875, "rewards/margins": 32.27141571044922, "rewards/rejected": -40.219539642333984, "step": 4259 }, { "epoch": 2.650077760497667, "grad_norm": 5.147276897332631e-05, "learning_rate": 6.466113416320886e-07, "logits/chosen": -0.972149670124054, "logits/rejected": 3.9147684574127197, "logps/chosen": -478.2498779296875, "logps/rejected": -1068.1552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.849447250366211, "rewards/margins": 34.413818359375, "rewards/rejected": -43.263267517089844, "step": 4260 }, { "epoch": 2.6506998444790044, "grad_norm": 32.1840934753418, "learning_rate": 6.45458736745044e-07, "logits/chosen": 0.1018313467502594, "logits/rejected": 1.8838446140289307, "logps/chosen": -594.6072998046875, "logps/rejected": -981.5043334960938, "loss": 0.2721, "rewards/accuracies": 0.875, "rewards/chosen": -8.896601676940918, "rewards/margins": 23.032018661499023, "rewards/rejected": -31.928619384765625, "step": 4261 }, { "epoch": 2.651321928460342, "grad_norm": 0.035230863839387894, "learning_rate": 6.443061318579991e-07, "logits/chosen": -1.6414093971252441, "logits/rejected": 2.646522283554077, "logps/chosen": -417.251220703125, "logps/rejected": -1044.8880615234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.817538261413574, "rewards/margins": 37.852577209472656, "rewards/rejected": -42.67011642456055, "step": 4262 }, { "epoch": 2.6519440124416795, "grad_norm": 1.312959341248643e-07, "learning_rate": 6.431535269709543e-07, "logits/chosen": -0.18359392881393433, "logits/rejected": 3.251091718673706, "logps/chosen": -436.75213623046875, "logps/rejected": -925.1771240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.020618438720703, "rewards/margins": 33.024742126464844, "rewards/rejected": -39.04536437988281, "step": 4263 }, { "epoch": 2.6525660964230173, "grad_norm": 0.014321111142635345, "learning_rate": 6.420009220839096e-07, "logits/chosen": -2.6356351375579834, "logits/rejected": 2.7543556690216064, "logps/chosen": -335.8462829589844, "logps/rejected": -937.9099731445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.071978569030762, "rewards/margins": 26.716182708740234, "rewards/rejected": -32.78816223144531, "step": 4264 }, { "epoch": 2.6531881804043547, "grad_norm": 0.016877740621566772, "learning_rate": 6.408483171968649e-07, "logits/chosen": 1.5884143114089966, "logits/rejected": 2.4657225608825684, "logps/chosen": -485.8375549316406, "logps/rejected": -776.3801879882812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.200467109680176, "rewards/margins": 20.69041633605957, "rewards/rejected": -29.890884399414062, "step": 4265 }, { "epoch": 2.653810264385692, "grad_norm": 0.009289245121181011, "learning_rate": 6.396957123098202e-07, "logits/chosen": -1.7212367057800293, "logits/rejected": 0.07681708037853241, "logps/chosen": -541.7117309570312, "logps/rejected": -913.375732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.911618232727051, "rewards/margins": 26.93120574951172, "rewards/rejected": -34.84282302856445, "step": 4266 }, { "epoch": 2.6544323483670293, "grad_norm": 1.5398443053982191e-07, "learning_rate": 6.385431074227755e-07, "logits/chosen": 2.1448214054107666, "logits/rejected": 4.380794048309326, "logps/chosen": -633.0781860351562, "logps/rejected": -1132.920654296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.251069068908691, "rewards/margins": 30.245288848876953, "rewards/rejected": -40.49635696411133, "step": 4267 }, { "epoch": 2.655054432348367, "grad_norm": 0.00034798384876921773, "learning_rate": 6.373905025357307e-07, "logits/chosen": -0.5650123357772827, "logits/rejected": 3.188966989517212, "logps/chosen": -424.6273193359375, "logps/rejected": -955.9008178710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.289674758911133, "rewards/margins": 34.93653869628906, "rewards/rejected": -40.22621154785156, "step": 4268 }, { "epoch": 2.6556765163297045, "grad_norm": 3.408320903778076, "learning_rate": 6.362378976486861e-07, "logits/chosen": 0.25153806805610657, "logits/rejected": 2.325662612915039, "logps/chosen": -574.9864501953125, "logps/rejected": -909.0303955078125, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -6.206477165222168, "rewards/margins": 25.306110382080078, "rewards/rejected": -31.51258659362793, "step": 4269 }, { "epoch": 2.6562986003110423, "grad_norm": 0.0008155781542882323, "learning_rate": 6.350852927616413e-07, "logits/chosen": 1.7965188026428223, "logits/rejected": 3.6532082557678223, "logps/chosen": -685.572509765625, "logps/rejected": -1040.648193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.555252075195312, "rewards/margins": 24.258285522460938, "rewards/rejected": -33.813533782958984, "step": 4270 }, { "epoch": 2.6569206842923796, "grad_norm": 0.0809859037399292, "learning_rate": 6.339326878745966e-07, "logits/chosen": 2.1448750495910645, "logits/rejected": 3.0449576377868652, "logps/chosen": -635.3004150390625, "logps/rejected": -1008.4331665039062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.257442474365234, "rewards/margins": 29.437252044677734, "rewards/rejected": -37.6946907043457, "step": 4271 }, { "epoch": 2.657542768273717, "grad_norm": 0.06920414417982101, "learning_rate": 6.327800829875519e-07, "logits/chosen": 0.02657720446586609, "logits/rejected": 0.1941850781440735, "logps/chosen": -641.6112060546875, "logps/rejected": -884.4364624023438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.948108673095703, "rewards/margins": 23.97146224975586, "rewards/rejected": -35.91957092285156, "step": 4272 }, { "epoch": 2.6581648522550543, "grad_norm": 1.436835527420044, "learning_rate": 6.316274781005072e-07, "logits/chosen": 0.5097701549530029, "logits/rejected": 2.3904218673706055, "logps/chosen": -774.1339111328125, "logps/rejected": -1126.98681640625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -15.555923461914062, "rewards/margins": 26.73133087158203, "rewards/rejected": -42.287254333496094, "step": 4273 }, { "epoch": 2.658786936236392, "grad_norm": 0.03071177937090397, "learning_rate": 6.304748732134624e-07, "logits/chosen": 1.7224812507629395, "logits/rejected": 5.085427284240723, "logps/chosen": -598.9939575195312, "logps/rejected": -1024.482177734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.686386108398438, "rewards/margins": 31.341815948486328, "rewards/rejected": -40.028202056884766, "step": 4274 }, { "epoch": 2.6594090202177294, "grad_norm": 4.7121588977461215e-06, "learning_rate": 6.293222683264177e-07, "logits/chosen": -0.3162381649017334, "logits/rejected": 2.9769697189331055, "logps/chosen": -521.7335815429688, "logps/rejected": -1134.259521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.706584930419922, "rewards/margins": 37.74868392944336, "rewards/rejected": -46.45526885986328, "step": 4275 }, { "epoch": 2.6600311041990667, "grad_norm": 2.563671588897705, "learning_rate": 6.28169663439373e-07, "logits/chosen": 1.0807900428771973, "logits/rejected": 2.9324049949645996, "logps/chosen": -572.3258056640625, "logps/rejected": -898.4095458984375, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -10.48967170715332, "rewards/margins": 23.60786247253418, "rewards/rejected": -34.0975341796875, "step": 4276 }, { "epoch": 2.6606531881804045, "grad_norm": 0.10538554191589355, "learning_rate": 6.270170585523283e-07, "logits/chosen": 2.7478926181793213, "logits/rejected": 4.047188758850098, "logps/chosen": -658.8651733398438, "logps/rejected": -1066.6217041015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.698392868041992, "rewards/margins": 29.148723602294922, "rewards/rejected": -36.84711456298828, "step": 4277 }, { "epoch": 2.661275272161742, "grad_norm": 1.1563502550125122, "learning_rate": 6.258644536652836e-07, "logits/chosen": -0.7198779582977295, "logits/rejected": 1.4708242416381836, "logps/chosen": -403.05499267578125, "logps/rejected": -645.0602416992188, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -6.093057155609131, "rewards/margins": 15.91860580444336, "rewards/rejected": -22.01166343688965, "step": 4278 }, { "epoch": 2.661897356143079, "grad_norm": 0.0004465764795895666, "learning_rate": 6.247118487782389e-07, "logits/chosen": -1.3386738300323486, "logits/rejected": 3.840517520904541, "logps/chosen": -447.1226501464844, "logps/rejected": -1076.5313720703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.7728443145751953, "rewards/margins": 30.563493728637695, "rewards/rejected": -34.33633804321289, "step": 4279 }, { "epoch": 2.6625194401244165, "grad_norm": 0.08408825099468231, "learning_rate": 6.235592438911942e-07, "logits/chosen": 0.062289535999298096, "logits/rejected": 1.252807855606079, "logps/chosen": -457.46234130859375, "logps/rejected": -842.30517578125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.778433322906494, "rewards/margins": 23.938804626464844, "rewards/rejected": -31.71723747253418, "step": 4280 }, { "epoch": 2.6631415241057543, "grad_norm": 5.8949448430212215e-05, "learning_rate": 6.224066390041494e-07, "logits/chosen": -1.4612160921096802, "logits/rejected": 3.185497283935547, "logps/chosen": -463.0811462402344, "logps/rejected": -1052.7869873046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.640124797821045, "rewards/margins": 29.35283660888672, "rewards/rejected": -35.992958068847656, "step": 4281 }, { "epoch": 2.6637636080870917, "grad_norm": 0.01346707995980978, "learning_rate": 6.212540341171047e-07, "logits/chosen": 1.049379825592041, "logits/rejected": 1.272294282913208, "logps/chosen": -743.18115234375, "logps/rejected": -1055.4158935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.955890655517578, "rewards/margins": 30.826642990112305, "rewards/rejected": -41.782535552978516, "step": 4282 }, { "epoch": 2.6643856920684295, "grad_norm": 14.5199556350708, "learning_rate": 6.2010142923006e-07, "logits/chosen": -0.46947622299194336, "logits/rejected": 3.629220962524414, "logps/chosen": -535.6415405273438, "logps/rejected": -1027.4896240234375, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": -10.940256118774414, "rewards/margins": 23.86663818359375, "rewards/rejected": -34.80689239501953, "step": 4283 }, { "epoch": 2.665007776049767, "grad_norm": 0.0023726599756628275, "learning_rate": 6.189488243430153e-07, "logits/chosen": 0.3610234558582306, "logits/rejected": 2.9661874771118164, "logps/chosen": -427.2809753417969, "logps/rejected": -845.6876220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.674696922302246, "rewards/margins": 25.183311462402344, "rewards/rejected": -32.858009338378906, "step": 4284 }, { "epoch": 2.665629860031104, "grad_norm": 40.15415573120117, "learning_rate": 6.177962194559705e-07, "logits/chosen": -0.06265552341938019, "logits/rejected": 2.3903772830963135, "logps/chosen": -485.881591796875, "logps/rejected": -884.794921875, "loss": 1.5431, "rewards/accuracies": 0.875, "rewards/chosen": -9.829121589660645, "rewards/margins": 25.358768463134766, "rewards/rejected": -35.18789291381836, "step": 4285 }, { "epoch": 2.6662519440124415, "grad_norm": 0.6707028746604919, "learning_rate": 6.166436145689259e-07, "logits/chosen": 0.2432861328125, "logits/rejected": 2.304138660430908, "logps/chosen": -518.6677856445312, "logps/rejected": -877.533447265625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -11.821560859680176, "rewards/margins": 27.72966766357422, "rewards/rejected": -39.55122756958008, "step": 4286 }, { "epoch": 2.6668740279937793, "grad_norm": 2.2541589714819565e-05, "learning_rate": 6.154910096818812e-07, "logits/chosen": -0.033841878175735474, "logits/rejected": 2.4506354331970215, "logps/chosen": -722.0032958984375, "logps/rejected": -1138.4862060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.616849899291992, "rewards/margins": 28.83795738220215, "rewards/rejected": -40.45480728149414, "step": 4287 }, { "epoch": 2.6674961119751166, "grad_norm": 0.0012312207836657763, "learning_rate": 6.143384047948363e-07, "logits/chosen": -2.3437626361846924, "logits/rejected": 2.3316233158111572, "logps/chosen": -242.3722381591797, "logps/rejected": -801.2603759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.0629377365112305, "rewards/margins": 33.32283020019531, "rewards/rejected": -38.385765075683594, "step": 4288 }, { "epoch": 2.6681181959564544, "grad_norm": 0.00012092386896256357, "learning_rate": 6.131857999077916e-07, "logits/chosen": 0.732921302318573, "logits/rejected": 3.0997118949890137, "logps/chosen": -423.0313415527344, "logps/rejected": -842.7108764648438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.711177825927734, "rewards/margins": 28.025440216064453, "rewards/rejected": -34.73662185668945, "step": 4289 }, { "epoch": 2.6687402799377917, "grad_norm": 0.4224456548690796, "learning_rate": 6.12033195020747e-07, "logits/chosen": 0.03448355197906494, "logits/rejected": 1.8089535236358643, "logps/chosen": -472.80010986328125, "logps/rejected": -842.5576171875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -8.802806854248047, "rewards/margins": 22.627397537231445, "rewards/rejected": -31.43020248413086, "step": 4290 }, { "epoch": 2.669362363919129, "grad_norm": 2.015106446151549e-07, "learning_rate": 6.108805901337022e-07, "logits/chosen": -2.042537212371826, "logits/rejected": 2.324296474456787, "logps/chosen": -361.269287109375, "logps/rejected": -1081.171630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.318136692047119, "rewards/margins": 41.60655212402344, "rewards/rejected": -48.92469024658203, "step": 4291 }, { "epoch": 2.6699844479004664, "grad_norm": 0.09945330768823624, "learning_rate": 6.097279852466575e-07, "logits/chosen": -1.8823904991149902, "logits/rejected": 3.3184216022491455, "logps/chosen": -398.1663818359375, "logps/rejected": -925.0068359375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.197899341583252, "rewards/margins": 24.55364990234375, "rewards/rejected": -29.751550674438477, "step": 4292 }, { "epoch": 2.670606531881804, "grad_norm": 0.008346091955900192, "learning_rate": 6.085753803596127e-07, "logits/chosen": 0.5325585603713989, "logits/rejected": 1.5247480869293213, "logps/chosen": -607.7169799804688, "logps/rejected": -879.230224609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.277191162109375, "rewards/margins": 23.130996704101562, "rewards/rejected": -34.40818786621094, "step": 4293 }, { "epoch": 2.6712286158631415, "grad_norm": 0.059563759714365005, "learning_rate": 6.074227754725681e-07, "logits/chosen": 1.1496225595474243, "logits/rejected": 2.825745105743408, "logps/chosen": -553.5374145507812, "logps/rejected": -909.6398315429688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.7813825607299805, "rewards/margins": 24.857816696166992, "rewards/rejected": -32.63920211791992, "step": 4294 }, { "epoch": 2.671850699844479, "grad_norm": 0.03193175047636032, "learning_rate": 6.062701705855233e-07, "logits/chosen": 1.2888555526733398, "logits/rejected": 1.7476494312286377, "logps/chosen": -692.4351806640625, "logps/rejected": -1036.4249267578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.082293510437012, "rewards/margins": 30.193002700805664, "rewards/rejected": -45.27529525756836, "step": 4295 }, { "epoch": 2.6724727838258167, "grad_norm": 6.026350456522778e-05, "learning_rate": 6.051175656984786e-07, "logits/chosen": -1.0907959938049316, "logits/rejected": 1.9298566579818726, "logps/chosen": -275.3390808105469, "logps/rejected": -685.6051635742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.2236406803131104, "rewards/margins": 23.281383514404297, "rewards/rejected": -25.505023956298828, "step": 4296 }, { "epoch": 2.673094867807154, "grad_norm": 28.11423683166504, "learning_rate": 6.039649608114339e-07, "logits/chosen": -2.3258821964263916, "logits/rejected": 1.7522019147872925, "logps/chosen": -397.2674255371094, "logps/rejected": -995.8222045898438, "loss": 0.5865, "rewards/accuracies": 0.875, "rewards/chosen": -7.696331024169922, "rewards/margins": 34.149314880371094, "rewards/rejected": -41.845645904541016, "step": 4297 }, { "epoch": 2.6737169517884913, "grad_norm": 0.022131238132715225, "learning_rate": 6.028123559243892e-07, "logits/chosen": 1.659833312034607, "logits/rejected": 2.845536231994629, "logps/chosen": -592.1507568359375, "logps/rejected": -993.156982421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.496284484863281, "rewards/margins": 26.269489288330078, "rewards/rejected": -36.76577377319336, "step": 4298 }, { "epoch": 2.6743390357698287, "grad_norm": 0.003015428315848112, "learning_rate": 6.016597510373444e-07, "logits/chosen": 0.17328539490699768, "logits/rejected": 3.8262343406677246, "logps/chosen": -496.84283447265625, "logps/rejected": -1015.587646484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.074370384216309, "rewards/margins": 27.95060920715332, "rewards/rejected": -38.02498245239258, "step": 4299 }, { "epoch": 2.6749611197511665, "grad_norm": 0.28797683119773865, "learning_rate": 6.005071461502997e-07, "logits/chosen": 1.7421389818191528, "logits/rejected": 3.9504408836364746, "logps/chosen": -627.1280517578125, "logps/rejected": -947.1497192382812, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -11.968725204467773, "rewards/margins": 22.320602416992188, "rewards/rejected": -34.289329528808594, "step": 4300 }, { "epoch": 2.675583203732504, "grad_norm": 33.13662338256836, "learning_rate": 5.993545412632551e-07, "logits/chosen": 0.9550960659980774, "logits/rejected": 3.220900297164917, "logps/chosen": -402.09564208984375, "logps/rejected": -741.6087036132812, "loss": 0.3385, "rewards/accuracies": 0.875, "rewards/chosen": -8.56776237487793, "rewards/margins": 21.219226837158203, "rewards/rejected": -29.7869873046875, "step": 4301 }, { "epoch": 2.6762052877138416, "grad_norm": 0.04643818736076355, "learning_rate": 5.982019363762103e-07, "logits/chosen": -1.7970244884490967, "logits/rejected": 2.1972224712371826, "logps/chosen": -345.9138488769531, "logps/rejected": -744.7276611328125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.161243438720703, "rewards/margins": 22.81188201904297, "rewards/rejected": -29.973127365112305, "step": 4302 }, { "epoch": 2.676827371695179, "grad_norm": 3.3912474606268006e-08, "learning_rate": 5.970493314891656e-07, "logits/chosen": 0.6632111072540283, "logits/rejected": 4.397903919219971, "logps/chosen": -500.2611999511719, "logps/rejected": -1201.0867919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.991384506225586, "rewards/margins": 40.941043853759766, "rewards/rejected": -48.932430267333984, "step": 4303 }, { "epoch": 2.6774494556765163, "grad_norm": 3.7330451011657715, "learning_rate": 5.958967266021209e-07, "logits/chosen": 2.151575803756714, "logits/rejected": 3.545137405395508, "logps/chosen": -677.5784912109375, "logps/rejected": -964.5394287109375, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -13.74315071105957, "rewards/margins": 20.06719970703125, "rewards/rejected": -33.81035232543945, "step": 4304 }, { "epoch": 2.6780715396578536, "grad_norm": 0.014254480600357056, "learning_rate": 5.947441217150761e-07, "logits/chosen": 1.86721670627594, "logits/rejected": 3.160707950592041, "logps/chosen": -618.7957153320312, "logps/rejected": -1003.787109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.814743041992188, "rewards/margins": 26.127132415771484, "rewards/rejected": -37.94187927246094, "step": 4305 }, { "epoch": 2.6786936236391914, "grad_norm": 0.6654040813446045, "learning_rate": 5.935915168280314e-07, "logits/chosen": -1.4605464935302734, "logits/rejected": 2.656975507736206, "logps/chosen": -592.65185546875, "logps/rejected": -1145.5179443359375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -10.840351104736328, "rewards/margins": 33.5650749206543, "rewards/rejected": -44.405426025390625, "step": 4306 }, { "epoch": 2.6793157076205287, "grad_norm": 4.114058017730713, "learning_rate": 5.924389119409866e-07, "logits/chosen": -1.903027057647705, "logits/rejected": 2.2601137161254883, "logps/chosen": -370.8560791015625, "logps/rejected": -853.153076171875, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -7.281823635101318, "rewards/margins": 29.14508819580078, "rewards/rejected": -36.426910400390625, "step": 4307 }, { "epoch": 2.6799377916018665, "grad_norm": 0.0327109694480896, "learning_rate": 5.91286307053942e-07, "logits/chosen": -0.14002388715744019, "logits/rejected": 3.2458126544952393, "logps/chosen": -501.65673828125, "logps/rejected": -1063.7379150390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.754554748535156, "rewards/margins": 33.78563690185547, "rewards/rejected": -44.540191650390625, "step": 4308 }, { "epoch": 2.680559875583204, "grad_norm": 0.024720264598727226, "learning_rate": 5.901337021668972e-07, "logits/chosen": 1.7462257146835327, "logits/rejected": 4.350376605987549, "logps/chosen": -506.29852294921875, "logps/rejected": -969.9747924804688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.304910659790039, "rewards/margins": 30.68954849243164, "rewards/rejected": -37.99446105957031, "step": 4309 }, { "epoch": 2.681181959564541, "grad_norm": 2.5972089767456055, "learning_rate": 5.889810972798525e-07, "logits/chosen": 1.9655194282531738, "logits/rejected": 3.740055561065674, "logps/chosen": -506.2864074707031, "logps/rejected": -855.5678100585938, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -9.148515701293945, "rewards/margins": 22.7083683013916, "rewards/rejected": -31.856887817382812, "step": 4310 }, { "epoch": 2.6818040435458785, "grad_norm": 0.045466069132089615, "learning_rate": 5.878284923928077e-07, "logits/chosen": 0.3978482484817505, "logits/rejected": 4.564357280731201, "logps/chosen": -550.11181640625, "logps/rejected": -1136.945068359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.231959342956543, "rewards/margins": 34.99109649658203, "rewards/rejected": -43.223052978515625, "step": 4311 }, { "epoch": 2.6824261275272163, "grad_norm": 1.9469133860638976e-07, "learning_rate": 5.866758875057631e-07, "logits/chosen": -0.7411868572235107, "logits/rejected": 3.781776189804077, "logps/chosen": -480.9675598144531, "logps/rejected": -1085.2298583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.085660934448242, "rewards/margins": 36.49788284301758, "rewards/rejected": -42.58354949951172, "step": 4312 }, { "epoch": 2.6830482115085537, "grad_norm": 0.014485174790024757, "learning_rate": 5.855232826187184e-07, "logits/chosen": 1.9372586011886597, "logits/rejected": 2.3054134845733643, "logps/chosen": -616.2396240234375, "logps/rejected": -815.8970947265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.824966430664062, "rewards/margins": 19.153440475463867, "rewards/rejected": -27.97840690612793, "step": 4313 }, { "epoch": 2.683670295489891, "grad_norm": 0.05898513272404671, "learning_rate": 5.843706777316736e-07, "logits/chosen": 1.3665399551391602, "logits/rejected": 3.381399154663086, "logps/chosen": -555.00341796875, "logps/rejected": -955.5720825195312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.5432586669921875, "rewards/margins": 29.97431755065918, "rewards/rejected": -35.517578125, "step": 4314 }, { "epoch": 2.684292379471229, "grad_norm": 1.3017769560974557e-05, "learning_rate": 5.83218072844629e-07, "logits/chosen": -2.1644983291625977, "logits/rejected": 1.2285041809082031, "logps/chosen": -467.1058654785156, "logps/rejected": -1055.35400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.010286331176758, "rewards/margins": 35.792327880859375, "rewards/rejected": -45.8026123046875, "step": 4315 }, { "epoch": 2.684914463452566, "grad_norm": 3.057724952697754, "learning_rate": 5.820654679575842e-07, "logits/chosen": -0.4935888648033142, "logits/rejected": 2.8731842041015625, "logps/chosen": -602.6840209960938, "logps/rejected": -1008.2568359375, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -6.988034248352051, "rewards/margins": 27.111400604248047, "rewards/rejected": -34.09943771362305, "step": 4316 }, { "epoch": 2.6855365474339035, "grad_norm": 0.0020969114266335964, "learning_rate": 5.809128630705395e-07, "logits/chosen": -1.1788969039916992, "logits/rejected": 1.914081335067749, "logps/chosen": -526.4003295898438, "logps/rejected": -928.8703002929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.427753925323486, "rewards/margins": 26.911319732666016, "rewards/rejected": -34.33907699584961, "step": 4317 }, { "epoch": 2.686158631415241, "grad_norm": 0.1595601886510849, "learning_rate": 5.797602581834947e-07, "logits/chosen": -0.34268662333488464, "logits/rejected": 3.1043288707733154, "logps/chosen": -554.5541381835938, "logps/rejected": -1099.8505859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.969440937042236, "rewards/margins": 32.59528350830078, "rewards/rejected": -40.564727783203125, "step": 4318 }, { "epoch": 2.6867807153965786, "grad_norm": 0.0005578529671765864, "learning_rate": 5.786076532964501e-07, "logits/chosen": -1.6664105653762817, "logits/rejected": 1.3208072185516357, "logps/chosen": -356.27099609375, "logps/rejected": -855.5338134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.887505054473877, "rewards/margins": 26.851097106933594, "rewards/rejected": -32.73860168457031, "step": 4319 }, { "epoch": 2.687402799377916, "grad_norm": 0.017177654430270195, "learning_rate": 5.774550484094053e-07, "logits/chosen": 2.851513624191284, "logits/rejected": 3.758535861968994, "logps/chosen": -751.7113037109375, "logps/rejected": -1025.2801513671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.320971488952637, "rewards/margins": 24.983421325683594, "rewards/rejected": -35.30438995361328, "step": 4320 }, { "epoch": 2.6880248833592537, "grad_norm": 1.4212414026260376, "learning_rate": 5.763024435223606e-07, "logits/chosen": -1.5535696744918823, "logits/rejected": 2.343139886856079, "logps/chosen": -427.4454345703125, "logps/rejected": -885.6641845703125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -11.263914108276367, "rewards/margins": 28.99860191345215, "rewards/rejected": -40.26251220703125, "step": 4321 }, { "epoch": 2.688646967340591, "grad_norm": 2.3622999378858367e-07, "learning_rate": 5.751498386353159e-07, "logits/chosen": -1.9685401916503906, "logits/rejected": 4.371389389038086, "logps/chosen": -284.93084716796875, "logps/rejected": -861.482666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.5671565532684326, "rewards/margins": 29.86787223815918, "rewards/rejected": -32.435028076171875, "step": 4322 }, { "epoch": 2.6892690513219284, "grad_norm": 0.2193666398525238, "learning_rate": 5.739972337482711e-07, "logits/chosen": 0.7725979685783386, "logits/rejected": 2.998575448989868, "logps/chosen": -511.34527587890625, "logps/rejected": -871.75927734375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -9.83578109741211, "rewards/margins": 22.641033172607422, "rewards/rejected": -32.47681427001953, "step": 4323 }, { "epoch": 2.6898911353032657, "grad_norm": 0.0006468931096605957, "learning_rate": 5.728446288612264e-07, "logits/chosen": -0.7275848388671875, "logits/rejected": 0.5731906890869141, "logps/chosen": -552.6305541992188, "logps/rejected": -949.397705078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.355508804321289, "rewards/margins": 27.570133209228516, "rewards/rejected": -36.92564392089844, "step": 4324 }, { "epoch": 2.6905132192846035, "grad_norm": 0.6407142281532288, "learning_rate": 5.716920239741816e-07, "logits/chosen": -2.447523355484009, "logits/rejected": 0.9723237156867981, "logps/chosen": -477.42950439453125, "logps/rejected": -824.8375854492188, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -4.186221599578857, "rewards/margins": 21.85727310180664, "rewards/rejected": -26.04349136352539, "step": 4325 }, { "epoch": 2.691135303265941, "grad_norm": 2.10383404919412e-06, "learning_rate": 5.70539419087137e-07, "logits/chosen": -1.00802743434906, "logits/rejected": 3.259660243988037, "logps/chosen": -507.43731689453125, "logps/rejected": -1148.611083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.593570709228516, "rewards/margins": 36.32147216796875, "rewards/rejected": -44.91504669189453, "step": 4326 }, { "epoch": 2.6917573872472786, "grad_norm": 0.01024108286947012, "learning_rate": 5.693868142000923e-07, "logits/chosen": -0.2035624384880066, "logits/rejected": 3.430694103240967, "logps/chosen": -612.5335693359375, "logps/rejected": -1008.0703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.98814868927002, "rewards/margins": 26.49085235595703, "rewards/rejected": -35.479000091552734, "step": 4327 }, { "epoch": 2.692379471228616, "grad_norm": 0.0010729931527748704, "learning_rate": 5.682342093130475e-07, "logits/chosen": 1.0878336429595947, "logits/rejected": 2.785027027130127, "logps/chosen": -598.495849609375, "logps/rejected": -1020.3568725585938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.128654479980469, "rewards/margins": 30.003232955932617, "rewards/rejected": -38.13188934326172, "step": 4328 }, { "epoch": 2.6930015552099533, "grad_norm": 4.427639484405518, "learning_rate": 5.670816044260028e-07, "logits/chosen": 2.2699520587921143, "logits/rejected": 3.813429832458496, "logps/chosen": -613.0655517578125, "logps/rejected": -893.4889526367188, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -10.60464096069336, "rewards/margins": 25.513607025146484, "rewards/rejected": -36.11824417114258, "step": 4329 }, { "epoch": 2.6936236391912907, "grad_norm": 0.0002954995143227279, "learning_rate": 5.659289995389581e-07, "logits/chosen": -1.6988110542297363, "logits/rejected": 3.1969075202941895, "logps/chosen": -366.23980712890625, "logps/rejected": -999.2174682617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.072360038757324, "rewards/margins": 27.513774871826172, "rewards/rejected": -34.58613586425781, "step": 4330 }, { "epoch": 2.6942457231726284, "grad_norm": 1.4735642671585083, "learning_rate": 5.647763946519134e-07, "logits/chosen": 0.06061077117919922, "logits/rejected": 3.5560169219970703, "logps/chosen": -548.827392578125, "logps/rejected": -1081.3961181640625, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -8.822676658630371, "rewards/margins": 34.31581497192383, "rewards/rejected": -43.13848876953125, "step": 4331 }, { "epoch": 2.694867807153966, "grad_norm": 0.0016329983482137322, "learning_rate": 5.636237897648686e-07, "logits/chosen": 1.7709248065948486, "logits/rejected": 2.6699976921081543, "logps/chosen": -579.9384765625, "logps/rejected": -892.5574340820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.768032073974609, "rewards/margins": 26.118501663208008, "rewards/rejected": -33.886531829833984, "step": 4332 }, { "epoch": 2.695489891135303, "grad_norm": 0.009099734015762806, "learning_rate": 5.62471184877824e-07, "logits/chosen": 1.7401294708251953, "logits/rejected": 3.335153102874756, "logps/chosen": -595.302490234375, "logps/rejected": -866.0000610351562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.306253433227539, "rewards/margins": 18.30873680114746, "rewards/rejected": -28.614992141723633, "step": 4333 }, { "epoch": 2.696111975116641, "grad_norm": 31.583667755126953, "learning_rate": 5.613185799907792e-07, "logits/chosen": -0.46423840522766113, "logits/rejected": 3.5936923027038574, "logps/chosen": -613.9580078125, "logps/rejected": -1366.32666015625, "loss": 0.3121, "rewards/accuracies": 0.875, "rewards/chosen": -11.536033630371094, "rewards/margins": 39.808067321777344, "rewards/rejected": -51.34410095214844, "step": 4334 }, { "epoch": 2.6967340590979783, "grad_norm": 0.0018100934103131294, "learning_rate": 5.601659751037345e-07, "logits/chosen": -0.2569233775138855, "logits/rejected": 2.3816943168640137, "logps/chosen": -449.564453125, "logps/rejected": -746.4315185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.061365127563477, "rewards/margins": 23.170801162719727, "rewards/rejected": -29.23216438293457, "step": 4335 }, { "epoch": 2.6973561430793156, "grad_norm": 9.191144840769994e-07, "learning_rate": 5.590133702166897e-07, "logits/chosen": 2.2798638343811035, "logits/rejected": 4.180978775024414, "logps/chosen": -511.2894287109375, "logps/rejected": -893.4908447265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.659784317016602, "rewards/margins": 29.60740089416504, "rewards/rejected": -40.26718521118164, "step": 4336 }, { "epoch": 2.697978227060653, "grad_norm": 8.901963610696839e-09, "learning_rate": 5.578607653296451e-07, "logits/chosen": -1.8893110752105713, "logits/rejected": 4.048734664916992, "logps/chosen": -402.8697509765625, "logps/rejected": -1160.6318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.05605697631836, "rewards/margins": 40.020328521728516, "rewards/rejected": -51.076385498046875, "step": 4337 }, { "epoch": 2.6986003110419907, "grad_norm": 17.029449462890625, "learning_rate": 5.567081604426004e-07, "logits/chosen": 1.0266895294189453, "logits/rejected": 2.2230403423309326, "logps/chosen": -618.4317016601562, "logps/rejected": -924.9429931640625, "loss": 0.1044, "rewards/accuracies": 1.0, "rewards/chosen": -15.199832916259766, "rewards/margins": 21.95380210876465, "rewards/rejected": -37.15363693237305, "step": 4338 }, { "epoch": 2.699222395023328, "grad_norm": 0.0002132367080776021, "learning_rate": 5.555555555555555e-07, "logits/chosen": -0.0829852819442749, "logits/rejected": 3.98368501663208, "logps/chosen": -583.0263061523438, "logps/rejected": -1198.749755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.993731498718262, "rewards/margins": 35.4033088684082, "rewards/rejected": -46.39704132080078, "step": 4339 }, { "epoch": 2.699844479004666, "grad_norm": 0.432494580745697, "learning_rate": 5.544029506685108e-07, "logits/chosen": 1.575255036354065, "logits/rejected": 2.5109803676605225, "logps/chosen": -545.0003662109375, "logps/rejected": -840.7965698242188, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -6.728708267211914, "rewards/margins": 22.338848114013672, "rewards/rejected": -29.06755828857422, "step": 4340 }, { "epoch": 2.700466562986003, "grad_norm": 0.13241006433963776, "learning_rate": 5.532503457814662e-07, "logits/chosen": -0.6970356106758118, "logits/rejected": 0.8758705854415894, "logps/chosen": -580.6110229492188, "logps/rejected": -915.1602783203125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -10.863323211669922, "rewards/margins": 21.47724151611328, "rewards/rejected": -32.3405647277832, "step": 4341 }, { "epoch": 2.7010886469673405, "grad_norm": 0.00010155046038562432, "learning_rate": 5.520977408944214e-07, "logits/chosen": -1.4229021072387695, "logits/rejected": 1.0566169023513794, "logps/chosen": -456.4710388183594, "logps/rejected": -824.149169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.78543758392334, "rewards/margins": 26.162662506103516, "rewards/rejected": -31.948097229003906, "step": 4342 }, { "epoch": 2.701710730948678, "grad_norm": 3.846147792613275e-12, "learning_rate": 5.509451360073767e-07, "logits/chosen": 0.1892796754837036, "logits/rejected": 3.4125478267669678, "logps/chosen": -550.337158203125, "logps/rejected": -1173.93212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.669839859008789, "rewards/margins": 40.65834045410156, "rewards/rejected": -49.328182220458984, "step": 4343 }, { "epoch": 2.7023328149300156, "grad_norm": 0.17231476306915283, "learning_rate": 5.49792531120332e-07, "logits/chosen": 2.59763240814209, "logits/rejected": 3.944274425506592, "logps/chosen": -720.7247924804688, "logps/rejected": -1106.485595703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -8.691411972045898, "rewards/margins": 26.218000411987305, "rewards/rejected": -34.9094123840332, "step": 4344 }, { "epoch": 2.702954898911353, "grad_norm": 7.438750617438927e-05, "learning_rate": 5.486399262332873e-07, "logits/chosen": -0.7106736302375793, "logits/rejected": 3.2730722427368164, "logps/chosen": -424.83203125, "logps/rejected": -997.3363037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.957796096801758, "rewards/margins": 32.552249908447266, "rewards/rejected": -41.510047912597656, "step": 4345 }, { "epoch": 2.7035769828926908, "grad_norm": 8.103870641207322e-07, "learning_rate": 5.474873213462425e-07, "logits/chosen": 0.1780504584312439, "logits/rejected": 2.3171892166137695, "logps/chosen": -485.7713317871094, "logps/rejected": -894.5025634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.043638229370117, "rewards/margins": 26.9890193939209, "rewards/rejected": -34.032657623291016, "step": 4346 }, { "epoch": 2.704199066874028, "grad_norm": 4.82890427520033e-05, "learning_rate": 5.463347164591978e-07, "logits/chosen": -0.6762113571166992, "logits/rejected": 3.4721696376800537, "logps/chosen": -483.78619384765625, "logps/rejected": -1048.41748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.2759599685668945, "rewards/margins": 34.23728942871094, "rewards/rejected": -39.51325225830078, "step": 4347 }, { "epoch": 2.7048211508553655, "grad_norm": 0.0010803096229210496, "learning_rate": 5.451821115721531e-07, "logits/chosen": -0.5647343397140503, "logits/rejected": 3.118108034133911, "logps/chosen": -507.31219482421875, "logps/rejected": -1145.51220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.005629539489746, "rewards/margins": 35.81145477294922, "rewards/rejected": -41.81708526611328, "step": 4348 }, { "epoch": 2.705443234836703, "grad_norm": 0.03288925066590309, "learning_rate": 5.440295066851084e-07, "logits/chosen": -0.9186725616455078, "logits/rejected": 1.690115213394165, "logps/chosen": -437.88800048828125, "logps/rejected": -734.5858154296875, "loss": 0.0866, "rewards/accuracies": 0.875, "rewards/chosen": -5.427893161773682, "rewards/margins": 18.78788185119629, "rewards/rejected": -24.215774536132812, "step": 4349 }, { "epoch": 2.7060653188180406, "grad_norm": 1.1404665201553144e-05, "learning_rate": 5.428769017980637e-07, "logits/chosen": -1.5076873302459717, "logits/rejected": 3.565868377685547, "logps/chosen": -260.5267333984375, "logps/rejected": -897.2808227539062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.2050933837890625, "rewards/margins": 32.63178253173828, "rewards/rejected": -36.836875915527344, "step": 4350 }, { "epoch": 2.706687402799378, "grad_norm": 5.416849489847664e-06, "learning_rate": 5.41724296911019e-07, "logits/chosen": -1.6162619590759277, "logits/rejected": 3.268805742263794, "logps/chosen": -361.4254455566406, "logps/rejected": -998.807861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.2600626945495605, "rewards/margins": 31.510923385620117, "rewards/rejected": -37.7709846496582, "step": 4351 }, { "epoch": 2.7073094867807153, "grad_norm": 0.08973027765750885, "learning_rate": 5.405716920239743e-07, "logits/chosen": -0.4660053551197052, "logits/rejected": 1.4866576194763184, "logps/chosen": -479.0644836425781, "logps/rejected": -909.916748046875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.39615249633789, "rewards/margins": 30.067825317382812, "rewards/rejected": -38.4639778137207, "step": 4352 }, { "epoch": 2.707931570762053, "grad_norm": 8.65224046719959e-06, "learning_rate": 5.394190871369295e-07, "logits/chosen": -0.05632424354553223, "logits/rejected": 3.01267671585083, "logps/chosen": -523.555419921875, "logps/rejected": -921.5439453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.667513847351074, "rewards/margins": 25.65847396850586, "rewards/rejected": -37.32598876953125, "step": 4353 }, { "epoch": 2.7085536547433904, "grad_norm": 31.374597549438477, "learning_rate": 5.382664822498848e-07, "logits/chosen": 1.0141353607177734, "logits/rejected": 2.414851427078247, "logps/chosen": -523.1561279296875, "logps/rejected": -741.9181518554688, "loss": 0.4007, "rewards/accuracies": 0.875, "rewards/chosen": -6.2355732917785645, "rewards/margins": 15.42635726928711, "rewards/rejected": -21.661930084228516, "step": 4354 }, { "epoch": 2.7091757387247277, "grad_norm": 0.00032730703242123127, "learning_rate": 5.371138773628401e-07, "logits/chosen": -1.0623137950897217, "logits/rejected": 1.790582537651062, "logps/chosen": -350.048828125, "logps/rejected": -757.1088256835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.168100357055664, "rewards/margins": 23.775062561035156, "rewards/rejected": -29.943164825439453, "step": 4355 }, { "epoch": 2.709797822706065, "grad_norm": 5.184572219848633, "learning_rate": 5.359612724757953e-07, "logits/chosen": -0.6515331268310547, "logits/rejected": 2.19075083732605, "logps/chosen": -407.2192077636719, "logps/rejected": -849.9705200195312, "loss": 0.0331, "rewards/accuracies": 1.0, "rewards/chosen": -4.066438674926758, "rewards/margins": 25.576189041137695, "rewards/rejected": -29.64262580871582, "step": 4356 }, { "epoch": 2.710419906687403, "grad_norm": 30.339340209960938, "learning_rate": 5.348086675887506e-07, "logits/chosen": 0.5228167176246643, "logits/rejected": 1.639932632446289, "logps/chosen": -682.2943115234375, "logps/rejected": -1072.2547607421875, "loss": 0.1604, "rewards/accuracies": 0.875, "rewards/chosen": -11.397087097167969, "rewards/margins": 25.680978775024414, "rewards/rejected": -37.07806396484375, "step": 4357 }, { "epoch": 2.71104199066874, "grad_norm": 0.006391817703843117, "learning_rate": 5.336560627017058e-07, "logits/chosen": 1.4407583475112915, "logits/rejected": 3.6678295135498047, "logps/chosen": -530.218994140625, "logps/rejected": -1002.1013793945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.938558578491211, "rewards/margins": 32.15758514404297, "rewards/rejected": -41.09614562988281, "step": 4358 }, { "epoch": 2.711664074650078, "grad_norm": 0.00031976267928257585, "learning_rate": 5.325034578146612e-07, "logits/chosen": 1.9172717332839966, "logits/rejected": 3.0557916164398193, "logps/chosen": -624.5967407226562, "logps/rejected": -958.0452880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.343128204345703, "rewards/margins": 23.5101318359375, "rewards/rejected": -31.85325813293457, "step": 4359 }, { "epoch": 2.7122861586314153, "grad_norm": 0.2668883800506592, "learning_rate": 5.313508529276164e-07, "logits/chosen": -2.6089425086975098, "logits/rejected": 3.2084131240844727, "logps/chosen": -315.37908935546875, "logps/rejected": -944.3118896484375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.8562798500061035, "rewards/margins": 26.441707611083984, "rewards/rejected": -32.29798889160156, "step": 4360 }, { "epoch": 2.7129082426127527, "grad_norm": 0.12905797362327576, "learning_rate": 5.301982480405717e-07, "logits/chosen": -3.010551929473877, "logits/rejected": 1.19906485080719, "logps/chosen": -329.3901672363281, "logps/rejected": -893.433837890625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.323816776275635, "rewards/margins": 28.397506713867188, "rewards/rejected": -35.7213249206543, "step": 4361 }, { "epoch": 2.71353032659409, "grad_norm": 0.02742931619286537, "learning_rate": 5.29045643153527e-07, "logits/chosen": 0.43825221061706543, "logits/rejected": 2.532029628753662, "logps/chosen": -554.6838989257812, "logps/rejected": -1005.2730102539062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.257204532623291, "rewards/margins": 28.405176162719727, "rewards/rejected": -33.66238021850586, "step": 4362 }, { "epoch": 2.7141524105754278, "grad_norm": 0.008566766045987606, "learning_rate": 5.278930382664823e-07, "logits/chosen": -1.9675729274749756, "logits/rejected": 2.2211780548095703, "logps/chosen": -344.1619567871094, "logps/rejected": -868.6703491210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.205824851989746, "rewards/margins": 20.807655334472656, "rewards/rejected": -29.013479232788086, "step": 4363 }, { "epoch": 2.714774494556765, "grad_norm": 0.11387711763381958, "learning_rate": 5.267404333794376e-07, "logits/chosen": -0.49191659688949585, "logits/rejected": 3.8870506286621094, "logps/chosen": -493.625, "logps/rejected": -1037.103759765625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.6121931076049805, "rewards/margins": 28.777477264404297, "rewards/rejected": -35.389671325683594, "step": 4364 }, { "epoch": 2.715396578538103, "grad_norm": 0.0002502195711713284, "learning_rate": 5.255878284923928e-07, "logits/chosen": 0.7257356643676758, "logits/rejected": 3.592747688293457, "logps/chosen": -431.878173828125, "logps/rejected": -801.303466796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.508918762207031, "rewards/margins": 22.942853927612305, "rewards/rejected": -30.451770782470703, "step": 4365 }, { "epoch": 2.7160186625194402, "grad_norm": 2.134223461151123, "learning_rate": 5.244352236053482e-07, "logits/chosen": 2.029142379760742, "logits/rejected": 3.5649404525756836, "logps/chosen": -642.5010986328125, "logps/rejected": -990.53173828125, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -8.469472885131836, "rewards/margins": 29.194623947143555, "rewards/rejected": -37.66409683227539, "step": 4366 }, { "epoch": 2.7166407465007776, "grad_norm": 0.014666501432657242, "learning_rate": 5.232826187183034e-07, "logits/chosen": 0.12603336572647095, "logits/rejected": 2.8921942710876465, "logps/chosen": -487.640380859375, "logps/rejected": -989.414306640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.62231731414795, "rewards/margins": 25.92264175415039, "rewards/rejected": -38.544960021972656, "step": 4367 }, { "epoch": 2.717262830482115, "grad_norm": 0.15587158501148224, "learning_rate": 5.221300138312587e-07, "logits/chosen": -0.2560267150402069, "logits/rejected": 2.216273307800293, "logps/chosen": -526.2840576171875, "logps/rejected": -870.5951538085938, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -8.446675300598145, "rewards/margins": 24.011863708496094, "rewards/rejected": -32.45853805541992, "step": 4368 }, { "epoch": 2.7178849144634527, "grad_norm": 7.080076102283783e-06, "learning_rate": 5.20977408944214e-07, "logits/chosen": -2.0199179649353027, "logits/rejected": 2.431148052215576, "logps/chosen": -395.56085205078125, "logps/rejected": -969.3992919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.538591384887695, "rewards/margins": 36.18790054321289, "rewards/rejected": -42.72649383544922, "step": 4369 }, { "epoch": 2.71850699844479, "grad_norm": 2.202810492235585e-06, "learning_rate": 5.198248040571693e-07, "logits/chosen": -2.0969204902648926, "logits/rejected": 1.8270785808563232, "logps/chosen": -397.2028503417969, "logps/rejected": -844.73876953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.526747703552246, "rewards/margins": 28.255794525146484, "rewards/rejected": -35.78254318237305, "step": 4370 }, { "epoch": 2.7191290824261274, "grad_norm": 0.0031678418163210154, "learning_rate": 5.186721991701245e-07, "logits/chosen": 3.7845406532287598, "logits/rejected": 3.3417134284973145, "logps/chosen": -703.6448974609375, "logps/rejected": -964.923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.213335990905762, "rewards/margins": 25.453189849853516, "rewards/rejected": -38.666526794433594, "step": 4371 }, { "epoch": 2.719751166407465, "grad_norm": 0.00013797474093735218, "learning_rate": 5.175195942830797e-07, "logits/chosen": 0.5534344911575317, "logits/rejected": 3.253185272216797, "logps/chosen": -467.7929992675781, "logps/rejected": -796.52197265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.673547744750977, "rewards/margins": 23.377058029174805, "rewards/rejected": -29.05060577392578, "step": 4372 }, { "epoch": 2.7203732503888025, "grad_norm": 0.17591939866542816, "learning_rate": 5.163669893960351e-07, "logits/chosen": -2.0380706787109375, "logits/rejected": 3.314939022064209, "logps/chosen": -471.1107177734375, "logps/rejected": -1084.6961669921875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -9.316633224487305, "rewards/margins": 33.47694396972656, "rewards/rejected": -42.7935791015625, "step": 4373 }, { "epoch": 2.72099533437014, "grad_norm": 1.0730345820775256e-05, "learning_rate": 5.152143845089903e-07, "logits/chosen": -1.1209220886230469, "logits/rejected": 1.6226786375045776, "logps/chosen": -454.8038024902344, "logps/rejected": -1002.0517578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.348759651184082, "rewards/margins": 34.19036102294922, "rewards/rejected": -42.539119720458984, "step": 4374 }, { "epoch": 2.721617418351477, "grad_norm": 8.572400838602334e-05, "learning_rate": 5.140617796219456e-07, "logits/chosen": 1.7390587329864502, "logits/rejected": 3.3332607746124268, "logps/chosen": -618.982666015625, "logps/rejected": -1069.13720703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.608708381652832, "rewards/margins": 30.47606086730957, "rewards/rejected": -43.08477020263672, "step": 4375 }, { "epoch": 2.722239502332815, "grad_norm": 1.4397851089142932e-07, "learning_rate": 5.12909174734901e-07, "logits/chosen": -1.9789304733276367, "logits/rejected": 2.7752182483673096, "logps/chosen": -425.43988037109375, "logps/rejected": -1153.5184326171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.725929260253906, "rewards/margins": 41.87081527709961, "rewards/rejected": -49.596744537353516, "step": 4376 }, { "epoch": 2.7228615863141523, "grad_norm": 2.710033550101798e-05, "learning_rate": 5.117565698478562e-07, "logits/chosen": 0.715092658996582, "logits/rejected": 1.869808316230774, "logps/chosen": -517.2560424804688, "logps/rejected": -950.8013916015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.218734264373779, "rewards/margins": 30.94306182861328, "rewards/rejected": -36.16179656982422, "step": 4377 }, { "epoch": 2.72348367029549, "grad_norm": 0.2727895677089691, "learning_rate": 5.106039649608115e-07, "logits/chosen": 0.3004588186740875, "logits/rejected": 4.007742404937744, "logps/chosen": -517.6912231445312, "logps/rejected": -988.560302734375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.110770225524902, "rewards/margins": 24.466075897216797, "rewards/rejected": -32.57684326171875, "step": 4378 }, { "epoch": 2.7241057542768274, "grad_norm": 5.683639301423682e-07, "learning_rate": 5.094513600737667e-07, "logits/chosen": -2.6395342350006104, "logits/rejected": 1.850848913192749, "logps/chosen": -472.1200256347656, "logps/rejected": -1047.662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.703500747680664, "rewards/margins": 36.30821228027344, "rewards/rejected": -46.011714935302734, "step": 4379 }, { "epoch": 2.724727838258165, "grad_norm": 0.015439534559845924, "learning_rate": 5.082987551867221e-07, "logits/chosen": -1.267011046409607, "logits/rejected": 3.612168550491333, "logps/chosen": -418.2823486328125, "logps/rejected": -964.836669921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.654834747314453, "rewards/margins": 27.616714477539062, "rewards/rejected": -36.271549224853516, "step": 4380 }, { "epoch": 2.725349922239502, "grad_norm": 0.00018610125698614866, "learning_rate": 5.071461502996773e-07, "logits/chosen": 0.0017415881156921387, "logits/rejected": 2.2741801738739014, "logps/chosen": -552.3558349609375, "logps/rejected": -904.6097412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.071257591247559, "rewards/margins": 26.6653995513916, "rewards/rejected": -35.736656188964844, "step": 4381 }, { "epoch": 2.72597200622084, "grad_norm": 0.07816386222839355, "learning_rate": 5.059935454126326e-07, "logits/chosen": -2.1264593601226807, "logits/rejected": 2.295696496963501, "logps/chosen": -424.7033386230469, "logps/rejected": -1066.138671875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.95219898223877, "rewards/margins": 36.07676315307617, "rewards/rejected": -45.028961181640625, "step": 4382 }, { "epoch": 2.7265940902021772, "grad_norm": 0.1827636957168579, "learning_rate": 5.048409405255878e-07, "logits/chosen": 0.587814211845398, "logits/rejected": 1.550323247909546, "logps/chosen": -459.2896728515625, "logps/rejected": -797.319091796875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -9.950087547302246, "rewards/margins": 27.087358474731445, "rewards/rejected": -37.037445068359375, "step": 4383 }, { "epoch": 2.727216174183515, "grad_norm": 0.018102193251252174, "learning_rate": 5.036883356385432e-07, "logits/chosen": -1.172330617904663, "logits/rejected": 2.191157817840576, "logps/chosen": -502.16253662109375, "logps/rejected": -955.5294799804688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.382427215576172, "rewards/margins": 27.705997467041016, "rewards/rejected": -37.08842849731445, "step": 4384 }, { "epoch": 2.7278382581648524, "grad_norm": 0.0841863602399826, "learning_rate": 5.025357307514984e-07, "logits/chosen": 1.7941412925720215, "logits/rejected": 2.9506564140319824, "logps/chosen": -797.8958129882812, "logps/rejected": -1073.514892578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -14.37309455871582, "rewards/margins": 23.541181564331055, "rewards/rejected": -37.914276123046875, "step": 4385 }, { "epoch": 2.7284603421461897, "grad_norm": 0.05608074739575386, "learning_rate": 5.013831258644537e-07, "logits/chosen": 0.6043331623077393, "logits/rejected": 1.9576436281204224, "logps/chosen": -613.4882202148438, "logps/rejected": -1038.0992431640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.326175689697266, "rewards/margins": 32.297691345214844, "rewards/rejected": -39.623870849609375, "step": 4386 }, { "epoch": 2.729082426127527, "grad_norm": 0.09061995893716812, "learning_rate": 5.002305209774091e-07, "logits/chosen": 2.2635750770568848, "logits/rejected": 3.9595303535461426, "logps/chosen": -664.990966796875, "logps/rejected": -1020.809326171875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.448055267333984, "rewards/margins": 25.402923583984375, "rewards/rejected": -32.85097885131836, "step": 4387 }, { "epoch": 2.729704510108865, "grad_norm": 0.0019873238634318113, "learning_rate": 4.990779160903643e-07, "logits/chosen": -3.2694735527038574, "logits/rejected": 1.7833689451217651, "logps/chosen": -427.5120849609375, "logps/rejected": -1065.823486328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.329551696777344, "rewards/margins": 34.55972671508789, "rewards/rejected": -44.889278411865234, "step": 4388 }, { "epoch": 2.730326594090202, "grad_norm": 0.08666915446519852, "learning_rate": 4.979253112033195e-07, "logits/chosen": -1.542961597442627, "logits/rejected": 2.0044682025909424, "logps/chosen": -452.8818664550781, "logps/rejected": -1085.8095703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.248226642608643, "rewards/margins": 37.20758056640625, "rewards/rejected": -43.455806732177734, "step": 4389 }, { "epoch": 2.7309486780715395, "grad_norm": 9.85058879852295, "learning_rate": 4.967727063162748e-07, "logits/chosen": 1.9844534397125244, "logits/rejected": 3.299943685531616, "logps/chosen": -642.399169921875, "logps/rejected": -1097.474609375, "loss": 0.0413, "rewards/accuracies": 1.0, "rewards/chosen": -11.411534309387207, "rewards/margins": 29.400665283203125, "rewards/rejected": -40.81220245361328, "step": 4390 }, { "epoch": 2.7315707620528773, "grad_norm": 0.009679671376943588, "learning_rate": 4.956201014292301e-07, "logits/chosen": 0.39165472984313965, "logits/rejected": 4.176124572753906, "logps/chosen": -613.4664306640625, "logps/rejected": -1131.4798583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.620540142059326, "rewards/margins": 28.63414192199707, "rewards/rejected": -35.25468444824219, "step": 4391 }, { "epoch": 2.7321928460342146, "grad_norm": 6.542808478116058e-06, "learning_rate": 4.944674965421854e-07, "logits/chosen": 1.009442925453186, "logits/rejected": 2.719297409057617, "logps/chosen": -727.4083251953125, "logps/rejected": -1082.810791015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.238219261169434, "rewards/margins": 30.47528076171875, "rewards/rejected": -41.7135009765625, "step": 4392 }, { "epoch": 2.732814930015552, "grad_norm": 1.4480074644088745, "learning_rate": 4.933148916551406e-07, "logits/chosen": -0.8078665733337402, "logits/rejected": 2.1155059337615967, "logps/chosen": -473.40826416015625, "logps/rejected": -780.3966064453125, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -6.8958587646484375, "rewards/margins": 15.986709594726562, "rewards/rejected": -22.882568359375, "step": 4393 }, { "epoch": 2.7334370139968893, "grad_norm": 0.0005617666174657643, "learning_rate": 4.92162286768096e-07, "logits/chosen": -1.7720906734466553, "logits/rejected": 2.736274003982544, "logps/chosen": -399.5456848144531, "logps/rejected": -1101.6707763671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.80557918548584, "rewards/margins": 35.24809265136719, "rewards/rejected": -42.053672790527344, "step": 4394 }, { "epoch": 2.734059097978227, "grad_norm": 0.002742925425991416, "learning_rate": 4.910096818810512e-07, "logits/chosen": -1.521073341369629, "logits/rejected": 2.829493999481201, "logps/chosen": -433.4801330566406, "logps/rejected": -994.6790771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.705349445343018, "rewards/margins": 33.4708251953125, "rewards/rejected": -40.176177978515625, "step": 4395 }, { "epoch": 2.7346811819595644, "grad_norm": 0.03669571876525879, "learning_rate": 4.898570769940065e-07, "logits/chosen": -1.7508848905563354, "logits/rejected": 1.0612797737121582, "logps/chosen": -548.3234252929688, "logps/rejected": -1094.0908203125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -12.284712791442871, "rewards/margins": 34.96992492675781, "rewards/rejected": -47.254634857177734, "step": 4396 }, { "epoch": 2.7353032659409022, "grad_norm": 0.10406485944986343, "learning_rate": 4.887044721069617e-07, "logits/chosen": -0.0743609368801117, "logits/rejected": 4.600546836853027, "logps/chosen": -567.2088623046875, "logps/rejected": -1132.5198974609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.125041961669922, "rewards/margins": 28.387611389160156, "rewards/rejected": -37.51264953613281, "step": 4397 }, { "epoch": 2.7359253499222396, "grad_norm": 0.18575268983840942, "learning_rate": 4.875518672199171e-07, "logits/chosen": -0.16322201490402222, "logits/rejected": 4.049489974975586, "logps/chosen": -525.8208618164062, "logps/rejected": -1135.487060546875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -8.548019409179688, "rewards/margins": 35.76894760131836, "rewards/rejected": -44.31696701049805, "step": 4398 }, { "epoch": 2.736547433903577, "grad_norm": 0.05440472811460495, "learning_rate": 4.863992623328723e-07, "logits/chosen": -0.8141235113143921, "logits/rejected": 4.115907192230225, "logps/chosen": -467.80584716796875, "logps/rejected": -1172.6866455078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.733298301696777, "rewards/margins": 39.912601470947266, "rewards/rejected": -50.645896911621094, "step": 4399 }, { "epoch": 2.7371695178849142, "grad_norm": 1.0240157166663266e-08, "learning_rate": 4.852466574458276e-07, "logits/chosen": -3.2142281532287598, "logits/rejected": 3.145437479019165, "logps/chosen": -326.4961242675781, "logps/rejected": -974.7950439453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.854359149932861, "rewards/margins": 35.600135803222656, "rewards/rejected": -43.454498291015625, "step": 4400 }, { "epoch": 2.737791601866252, "grad_norm": 19.847810745239258, "learning_rate": 4.84094052558783e-07, "logits/chosen": 0.38483256101608276, "logits/rejected": 2.9243876934051514, "logps/chosen": -568.7760620117188, "logps/rejected": -860.8410034179688, "loss": 0.1395, "rewards/accuracies": 0.875, "rewards/chosen": -11.1824369430542, "rewards/margins": 18.987302780151367, "rewards/rejected": -30.169742584228516, "step": 4401 }, { "epoch": 2.7384136858475894, "grad_norm": 0.007317651528865099, "learning_rate": 4.829414476717382e-07, "logits/chosen": 0.06382274627685547, "logits/rejected": 2.503971576690674, "logps/chosen": -651.1002197265625, "logps/rejected": -989.615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.62253189086914, "rewards/margins": 26.692480087280273, "rewards/rejected": -36.31501007080078, "step": 4402 }, { "epoch": 2.739035769828927, "grad_norm": 0.0010312871308997273, "learning_rate": 4.817888427846935e-07, "logits/chosen": 0.0007169246673583984, "logits/rejected": 3.0172805786132812, "logps/chosen": -429.4022216796875, "logps/rejected": -827.8827514648438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.983163833618164, "rewards/margins": 23.772552490234375, "rewards/rejected": -29.755714416503906, "step": 4403 }, { "epoch": 2.7396578538102645, "grad_norm": 2.757115602493286, "learning_rate": 4.806362378976487e-07, "logits/chosen": 0.38977721333503723, "logits/rejected": 2.635216236114502, "logps/chosen": -659.2049560546875, "logps/rejected": -903.1165771484375, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": -7.251816749572754, "rewards/margins": 14.301630973815918, "rewards/rejected": -21.553447723388672, "step": 4404 }, { "epoch": 2.740279937791602, "grad_norm": 0.03386557102203369, "learning_rate": 4.794836330106041e-07, "logits/chosen": 2.021665573120117, "logits/rejected": 3.6409528255462646, "logps/chosen": -555.582763671875, "logps/rejected": -898.3143310546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.261466979980469, "rewards/margins": 22.672351837158203, "rewards/rejected": -29.933818817138672, "step": 4405 }, { "epoch": 2.740902021772939, "grad_norm": 0.00021831082995049655, "learning_rate": 4.783310281235593e-07, "logits/chosen": -1.1020753383636475, "logits/rejected": 2.6016407012939453, "logps/chosen": -515.0692749023438, "logps/rejected": -1046.1241455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.593591213226318, "rewards/margins": 31.335071563720703, "rewards/rejected": -38.92866134643555, "step": 4406 }, { "epoch": 2.741524105754277, "grad_norm": 1.8972793817520142, "learning_rate": 4.771784232365145e-07, "logits/chosen": 3.971691846847534, "logits/rejected": 4.097461223602295, "logps/chosen": -809.5933227539062, "logps/rejected": -1073.0018310546875, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -8.114361763000488, "rewards/margins": 22.81982421875, "rewards/rejected": -30.934188842773438, "step": 4407 }, { "epoch": 2.7421461897356143, "grad_norm": 3.0011490252945805e-06, "learning_rate": 4.7602581834946984e-07, "logits/chosen": -0.5607624650001526, "logits/rejected": 0.6232677102088928, "logps/chosen": -511.8693542480469, "logps/rejected": -843.8756103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.825777530670166, "rewards/margins": 25.076919555664062, "rewards/rejected": -32.9026985168457, "step": 4408 }, { "epoch": 2.7427682737169516, "grad_norm": 51.671817779541016, "learning_rate": 4.748732134624251e-07, "logits/chosen": 0.5230734944343567, "logits/rejected": 2.3547849655151367, "logps/chosen": -531.5377807617188, "logps/rejected": -943.6278076171875, "loss": 0.3686, "rewards/accuracies": 0.875, "rewards/chosen": -10.170397758483887, "rewards/margins": 27.10638999938965, "rewards/rejected": -37.27678680419922, "step": 4409 }, { "epoch": 2.7433903576982894, "grad_norm": 1.494800550005948e-08, "learning_rate": 4.737206085753804e-07, "logits/chosen": 1.9931923151016235, "logits/rejected": 3.8050477504730225, "logps/chosen": -661.6253662109375, "logps/rejected": -1054.855224609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.947713851928711, "rewards/margins": 32.363162994384766, "rewards/rejected": -43.31087875366211, "step": 4410 }, { "epoch": 2.7440124416796268, "grad_norm": 2.0981217403459596e-06, "learning_rate": 4.7256800368833567e-07, "logits/chosen": 0.8072133660316467, "logits/rejected": 4.635097026824951, "logps/chosen": -504.37249755859375, "logps/rejected": -1036.0225830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.078475952148438, "rewards/margins": 30.97477149963379, "rewards/rejected": -40.053245544433594, "step": 4411 }, { "epoch": 2.744634525660964, "grad_norm": 1.6119975043693557e-05, "learning_rate": 4.7141539880129095e-07, "logits/chosen": 0.30014097690582275, "logits/rejected": 2.809941291809082, "logps/chosen": -473.0091857910156, "logps/rejected": -881.51611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.943344116210938, "rewards/margins": 25.527257919311523, "rewards/rejected": -34.47060012817383, "step": 4412 }, { "epoch": 2.7452566096423014, "grad_norm": 0.10137374699115753, "learning_rate": 4.702627939142462e-07, "logits/chosen": 1.4836374521255493, "logits/rejected": 3.366061210632324, "logps/chosen": -570.8435668945312, "logps/rejected": -958.2086181640625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -11.571364402770996, "rewards/margins": 27.339611053466797, "rewards/rejected": -38.910972595214844, "step": 4413 }, { "epoch": 2.7458786936236392, "grad_norm": 0.029282765462994576, "learning_rate": 4.691101890272015e-07, "logits/chosen": -0.6731588840484619, "logits/rejected": 2.8208022117614746, "logps/chosen": -574.0529174804688, "logps/rejected": -1081.0164794921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.4242472648620605, "rewards/margins": 30.81102752685547, "rewards/rejected": -38.23527526855469, "step": 4414 }, { "epoch": 2.7465007776049766, "grad_norm": 0.43205007910728455, "learning_rate": 4.679575841401568e-07, "logits/chosen": 0.5426101088523865, "logits/rejected": 2.9084668159484863, "logps/chosen": -659.4035034179688, "logps/rejected": -1100.140625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -10.613444328308105, "rewards/margins": 25.915149688720703, "rewards/rejected": -36.528594970703125, "step": 4415 }, { "epoch": 2.7471228615863144, "grad_norm": 8.718097524251789e-05, "learning_rate": 4.6680497925311206e-07, "logits/chosen": -3.586723804473877, "logits/rejected": -0.23334333300590515, "logps/chosen": -410.1449890136719, "logps/rejected": -930.001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.797764301300049, "rewards/margins": 33.449073791503906, "rewards/rejected": -38.2468376159668, "step": 4416 }, { "epoch": 2.7477449455676517, "grad_norm": 0.34267765283584595, "learning_rate": 4.6565237436606734e-07, "logits/chosen": 2.4590861797332764, "logits/rejected": 4.483171463012695, "logps/chosen": -570.5287475585938, "logps/rejected": -870.416015625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -7.81176233291626, "rewards/margins": 20.070743560791016, "rewards/rejected": -27.882505416870117, "step": 4417 }, { "epoch": 2.748367029548989, "grad_norm": 10.427054405212402, "learning_rate": 4.6449976947902267e-07, "logits/chosen": -0.17008568346500397, "logits/rejected": 2.956789016723633, "logps/chosen": -639.9490966796875, "logps/rejected": -1002.490478515625, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": -10.339786529541016, "rewards/margins": 24.56946563720703, "rewards/rejected": -34.90925216674805, "step": 4418 }, { "epoch": 2.7489891135303264, "grad_norm": 0.023447509855031967, "learning_rate": 4.6334716459197795e-07, "logits/chosen": 1.941941738128662, "logits/rejected": 5.219603061676025, "logps/chosen": -556.0087280273438, "logps/rejected": -958.0289306640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.532724380493164, "rewards/margins": 26.292644500732422, "rewards/rejected": -33.82536697387695, "step": 4419 }, { "epoch": 2.749611197511664, "grad_norm": 0.8535892963409424, "learning_rate": 4.621945597049332e-07, "logits/chosen": 0.18626666069030762, "logits/rejected": 3.3300750255584717, "logps/chosen": -559.193603515625, "logps/rejected": -988.401611328125, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -11.761933326721191, "rewards/margins": 21.86284065246582, "rewards/rejected": -33.62477111816406, "step": 4420 }, { "epoch": 2.7502332814930015, "grad_norm": 0.0011312151327729225, "learning_rate": 4.610419548178885e-07, "logits/chosen": 0.1989070177078247, "logits/rejected": 4.104744911193848, "logps/chosen": -471.160400390625, "logps/rejected": -1040.41552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.696352005004883, "rewards/margins": 33.65787887573242, "rewards/rejected": -44.35422897338867, "step": 4421 }, { "epoch": 2.7508553654743393, "grad_norm": 0.02870684675872326, "learning_rate": 4.598893499308438e-07, "logits/chosen": -2.251566171646118, "logits/rejected": 3.607893705368042, "logps/chosen": -386.98480224609375, "logps/rejected": -1097.69384765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.63227653503418, "rewards/margins": 32.436458587646484, "rewards/rejected": -41.06873321533203, "step": 4422 }, { "epoch": 2.7514774494556766, "grad_norm": 41.48528289794922, "learning_rate": 4.58736745043799e-07, "logits/chosen": -2.2110953330993652, "logits/rejected": 2.933452606201172, "logps/chosen": -511.3760070800781, "logps/rejected": -1112.38916015625, "loss": 0.5971, "rewards/accuracies": 0.875, "rewards/chosen": -8.268409729003906, "rewards/margins": 28.931161880493164, "rewards/rejected": -37.1995735168457, "step": 4423 }, { "epoch": 2.752099533437014, "grad_norm": 0.0033406876027584076, "learning_rate": 4.575841401567543e-07, "logits/chosen": -0.09376183152198792, "logits/rejected": 2.7225241661071777, "logps/chosen": -523.5830078125, "logps/rejected": -1034.5584716796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.04576301574707, "rewards/margins": 32.29011154174805, "rewards/rejected": -41.33587646484375, "step": 4424 }, { "epoch": 2.7527216174183513, "grad_norm": 0.0003896571579389274, "learning_rate": 4.5643153526970956e-07, "logits/chosen": -0.23421478271484375, "logits/rejected": 3.9801552295684814, "logps/chosen": -536.8067626953125, "logps/rejected": -1102.0413818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.031307220458984, "rewards/margins": 35.871116638183594, "rewards/rejected": -44.90242004394531, "step": 4425 }, { "epoch": 2.753343701399689, "grad_norm": 0.13651464879512787, "learning_rate": 4.5527893038266484e-07, "logits/chosen": 1.7377511262893677, "logits/rejected": 2.5145273208618164, "logps/chosen": -729.896484375, "logps/rejected": -944.122802734375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -10.97828483581543, "rewards/margins": 22.620149612426758, "rewards/rejected": -33.59843444824219, "step": 4426 }, { "epoch": 2.7539657853810264, "grad_norm": 0.0006300930981524289, "learning_rate": 4.541263254956201e-07, "logits/chosen": -1.6989445686340332, "logits/rejected": 4.08779239654541, "logps/chosen": -300.9146728515625, "logps/rejected": -1021.2611083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.203524112701416, "rewards/margins": 39.75101089477539, "rewards/rejected": -43.95453643798828, "step": 4427 }, { "epoch": 2.7545878693623638, "grad_norm": 7.86553391662892e-06, "learning_rate": 4.529737206085754e-07, "logits/chosen": 1.9402748346328735, "logits/rejected": 3.665144443511963, "logps/chosen": -575.7200927734375, "logps/rejected": -916.6759643554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.00657844543457, "rewards/margins": 26.892932891845703, "rewards/rejected": -35.899513244628906, "step": 4428 }, { "epoch": 2.7552099533437016, "grad_norm": 2.7728248824132606e-05, "learning_rate": 4.5182111572153067e-07, "logits/chosen": 2.1965889930725098, "logits/rejected": 2.6689367294311523, "logps/chosen": -504.8052062988281, "logps/rejected": -885.7496337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.474845886230469, "rewards/margins": 29.399259567260742, "rewards/rejected": -38.874107360839844, "step": 4429 }, { "epoch": 2.755832037325039, "grad_norm": 0.06133288890123367, "learning_rate": 4.5066851083448595e-07, "logits/chosen": 1.7043501138687134, "logits/rejected": 3.910205125808716, "logps/chosen": -513.8598022460938, "logps/rejected": -825.887451171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -8.094804763793945, "rewards/margins": 23.070405960083008, "rewards/rejected": -31.165210723876953, "step": 4430 }, { "epoch": 2.7564541213063762, "grad_norm": 0.05586942657828331, "learning_rate": 4.495159059474413e-07, "logits/chosen": 0.8189829587936401, "logits/rejected": 2.6572515964508057, "logps/chosen": -621.164794921875, "logps/rejected": -890.162353515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -11.553102493286133, "rewards/margins": 17.43643569946289, "rewards/rejected": -28.989540100097656, "step": 4431 }, { "epoch": 2.7570762052877136, "grad_norm": 0.06295454502105713, "learning_rate": 4.4836330106039656e-07, "logits/chosen": 0.7370268106460571, "logits/rejected": 2.5953383445739746, "logps/chosen": -539.400390625, "logps/rejected": -952.5663452148438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.905994415283203, "rewards/margins": 27.762670516967773, "rewards/rejected": -36.668663024902344, "step": 4432 }, { "epoch": 2.7576982892690514, "grad_norm": 26.892684936523438, "learning_rate": 4.4721069617335183e-07, "logits/chosen": -2.1785507202148438, "logits/rejected": 2.2861528396606445, "logps/chosen": -470.45953369140625, "logps/rejected": -901.58642578125, "loss": 0.1868, "rewards/accuracies": 0.875, "rewards/chosen": -7.75625467300415, "rewards/margins": 17.602706909179688, "rewards/rejected": -25.358963012695312, "step": 4433 }, { "epoch": 2.7583203732503887, "grad_norm": 6.219872375368141e-06, "learning_rate": 4.460580912863071e-07, "logits/chosen": 1.8374208211898804, "logits/rejected": 3.0011253356933594, "logps/chosen": -687.8511352539062, "logps/rejected": -1078.059814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.497608184814453, "rewards/margins": 29.768260955810547, "rewards/rejected": -44.265869140625, "step": 4434 }, { "epoch": 2.7589424572317265, "grad_norm": 0.0023374557495117188, "learning_rate": 4.449054863992624e-07, "logits/chosen": -0.7675460577011108, "logits/rejected": 4.285062789916992, "logps/chosen": -476.12841796875, "logps/rejected": -1053.994140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.0022873878479, "rewards/margins": 27.987964630126953, "rewards/rejected": -33.99024963378906, "step": 4435 }, { "epoch": 2.759564541213064, "grad_norm": 6.796044181101024e-05, "learning_rate": 4.4375288151221767e-07, "logits/chosen": -3.5497636795043945, "logits/rejected": 2.2416834831237793, "logps/chosen": -369.75775146484375, "logps/rejected": -1062.5948486328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.879331588745117, "rewards/margins": 31.077716827392578, "rewards/rejected": -38.95705032348633, "step": 4436 }, { "epoch": 2.760186625194401, "grad_norm": 5.898369015433502e-10, "learning_rate": 4.4260027662517294e-07, "logits/chosen": 0.45849087834358215, "logits/rejected": 2.6137187480926514, "logps/chosen": -578.7003173828125, "logps/rejected": -978.3519287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.692137241363525, "rewards/margins": 34.749053955078125, "rewards/rejected": -40.441192626953125, "step": 4437 }, { "epoch": 2.7608087091757385, "grad_norm": 0.0017215252155438066, "learning_rate": 4.414476717381282e-07, "logits/chosen": -1.5381698608398438, "logits/rejected": 3.0950610637664795, "logps/chosen": -471.3437194824219, "logps/rejected": -1032.288330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.139705657958984, "rewards/margins": 33.45232009887695, "rewards/rejected": -44.59202575683594, "step": 4438 }, { "epoch": 2.7614307931570763, "grad_norm": 3.451422691345215, "learning_rate": 4.402950668510835e-07, "logits/chosen": 0.4012671113014221, "logits/rejected": 2.3096537590026855, "logps/chosen": -562.41796875, "logps/rejected": -933.130126953125, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -12.880684852600098, "rewards/margins": 20.48157501220703, "rewards/rejected": -33.36226272583008, "step": 4439 }, { "epoch": 2.7620528771384136, "grad_norm": 0.32829925417900085, "learning_rate": 4.391424619640387e-07, "logits/chosen": 0.8047013878822327, "logits/rejected": 4.585402488708496, "logps/chosen": -422.2301025390625, "logps/rejected": -914.16748046875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -7.537834644317627, "rewards/margins": 23.84255599975586, "rewards/rejected": -31.380390167236328, "step": 4440 }, { "epoch": 2.7626749611197514, "grad_norm": 0.0013651353074237704, "learning_rate": 4.37989857076994e-07, "logits/chosen": -1.0214922428131104, "logits/rejected": 3.771331787109375, "logps/chosen": -444.5788269042969, "logps/rejected": -1036.6204833984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.631132125854492, "rewards/margins": 32.456321716308594, "rewards/rejected": -44.08745193481445, "step": 4441 }, { "epoch": 2.7632970451010888, "grad_norm": 0.030825063586235046, "learning_rate": 4.368372521899493e-07, "logits/chosen": 1.3546017408370972, "logits/rejected": 4.663175106048584, "logps/chosen": -685.845458984375, "logps/rejected": -1188.11474609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.846428871154785, "rewards/margins": 31.212594985961914, "rewards/rejected": -39.059024810791016, "step": 4442 }, { "epoch": 2.763919129082426, "grad_norm": 1.188171625137329, "learning_rate": 4.3568464730290456e-07, "logits/chosen": 0.717941164970398, "logits/rejected": 3.334520101547241, "logps/chosen": -399.744140625, "logps/rejected": -762.46923828125, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -5.898489952087402, "rewards/margins": 20.447917938232422, "rewards/rejected": -26.346406936645508, "step": 4443 }, { "epoch": 2.7645412130637634, "grad_norm": 1.4317882061004639, "learning_rate": 4.345320424158599e-07, "logits/chosen": 1.807151198387146, "logits/rejected": 3.9615252017974854, "logps/chosen": -687.8176879882812, "logps/rejected": -1120.4713134765625, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -12.907222747802734, "rewards/margins": 27.348201751708984, "rewards/rejected": -40.25542449951172, "step": 4444 }, { "epoch": 2.765163297045101, "grad_norm": 0.00014324445510283113, "learning_rate": 4.3337943752881517e-07, "logits/chosen": 0.6386379599571228, "logits/rejected": 3.6459169387817383, "logps/chosen": -518.3583984375, "logps/rejected": -903.1107788085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.458932876586914, "rewards/margins": 24.56821632385254, "rewards/rejected": -33.02714538574219, "step": 4445 }, { "epoch": 2.7657853810264386, "grad_norm": 2.535586190788308e-06, "learning_rate": 4.3222683264177044e-07, "logits/chosen": -0.27193400263786316, "logits/rejected": 1.8687951564788818, "logps/chosen": -444.036376953125, "logps/rejected": -907.522216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.857758045196533, "rewards/margins": 32.78850555419922, "rewards/rejected": -40.646263122558594, "step": 4446 }, { "epoch": 2.766407465007776, "grad_norm": 0.7894816994667053, "learning_rate": 4.310742277547257e-07, "logits/chosen": -0.8826746940612793, "logits/rejected": 2.4022674560546875, "logps/chosen": -437.020263671875, "logps/rejected": -951.0142822265625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -5.185009956359863, "rewards/margins": 29.816877365112305, "rewards/rejected": -35.001888275146484, "step": 4447 }, { "epoch": 2.7670295489891137, "grad_norm": 4.225468001095578e-05, "learning_rate": 4.29921622867681e-07, "logits/chosen": 0.9749264717102051, "logits/rejected": 2.5457139015197754, "logps/chosen": -483.837646484375, "logps/rejected": -826.4235229492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.304150104522705, "rewards/margins": 26.983564376831055, "rewards/rejected": -33.28771209716797, "step": 4448 }, { "epoch": 2.767651632970451, "grad_norm": 8.3215105405543e-05, "learning_rate": 4.287690179806363e-07, "logits/chosen": -1.8269789218902588, "logits/rejected": 2.920605182647705, "logps/chosen": -286.757080078125, "logps/rejected": -869.705810546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.869862079620361, "rewards/margins": 28.93183708190918, "rewards/rejected": -35.801700592041016, "step": 4449 }, { "epoch": 2.7682737169517884, "grad_norm": 0.00034528595278970897, "learning_rate": 4.2761641309359155e-07, "logits/chosen": 1.6075466871261597, "logits/rejected": 3.9959115982055664, "logps/chosen": -627.35205078125, "logps/rejected": -1138.360107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.445072174072266, "rewards/margins": 32.44855880737305, "rewards/rejected": -43.89363098144531, "step": 4450 }, { "epoch": 2.7688958009331257, "grad_norm": 0.00025485100923106074, "learning_rate": 4.2646380820654683e-07, "logits/chosen": 0.14457732439041138, "logits/rejected": 3.260528564453125, "logps/chosen": -507.02459716796875, "logps/rejected": -1023.6856689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.381438255310059, "rewards/margins": 34.37653732299805, "rewards/rejected": -42.757972717285156, "step": 4451 }, { "epoch": 2.7695178849144635, "grad_norm": 0.012433369643986225, "learning_rate": 4.253112033195021e-07, "logits/chosen": 0.6450670957565308, "logits/rejected": 2.940737009048462, "logps/chosen": -562.45263671875, "logps/rejected": -1125.9609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.368751525878906, "rewards/margins": 34.92451858520508, "rewards/rejected": -42.29327392578125, "step": 4452 }, { "epoch": 2.770139968895801, "grad_norm": 6.034801117493771e-05, "learning_rate": 4.241585984324574e-07, "logits/chosen": 1.3404961824417114, "logits/rejected": 3.4674181938171387, "logps/chosen": -577.1162109375, "logps/rejected": -947.2723999023438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.663448333740234, "rewards/margins": 31.020864486694336, "rewards/rejected": -39.6843147277832, "step": 4453 }, { "epoch": 2.7707620528771386, "grad_norm": 38.70772933959961, "learning_rate": 4.2300599354541266e-07, "logits/chosen": -0.14834386110305786, "logits/rejected": 2.671330213546753, "logps/chosen": -505.04937744140625, "logps/rejected": -908.2825927734375, "loss": 0.4691, "rewards/accuracies": 0.875, "rewards/chosen": -8.919454574584961, "rewards/margins": 24.19499397277832, "rewards/rejected": -33.114444732666016, "step": 4454 }, { "epoch": 2.771384136858476, "grad_norm": 0.008293528109788895, "learning_rate": 4.21853388658368e-07, "logits/chosen": -1.5592821836471558, "logits/rejected": 2.4315786361694336, "logps/chosen": -591.888671875, "logps/rejected": -1101.0301513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.967531204223633, "rewards/margins": 32.232948303222656, "rewards/rejected": -43.20048141479492, "step": 4455 }, { "epoch": 2.7720062208398133, "grad_norm": 0.4753230810165405, "learning_rate": 4.2070078377132327e-07, "logits/chosen": -1.0049407482147217, "logits/rejected": 1.6527700424194336, "logps/chosen": -504.166748046875, "logps/rejected": -987.4268798828125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -9.276983261108398, "rewards/margins": 26.664608001708984, "rewards/rejected": -35.94158935546875, "step": 4456 }, { "epoch": 2.7726283048211506, "grad_norm": 0.0030523869208991528, "learning_rate": 4.1954817888427844e-07, "logits/chosen": -0.23812870681285858, "logits/rejected": 2.4745800495147705, "logps/chosen": -390.5793762207031, "logps/rejected": -880.6826171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.854804515838623, "rewards/margins": 28.500247955322266, "rewards/rejected": -35.35505294799805, "step": 4457 }, { "epoch": 2.7732503888024884, "grad_norm": 1.311348825083769e-07, "learning_rate": 4.183955739972338e-07, "logits/chosen": 0.2634304463863373, "logits/rejected": 2.0421676635742188, "logps/chosen": -627.1038208007812, "logps/rejected": -1146.34423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.411871910095215, "rewards/margins": 36.43199920654297, "rewards/rejected": -46.843868255615234, "step": 4458 }, { "epoch": 2.7738724727838258, "grad_norm": 0.019252663478255272, "learning_rate": 4.1724296911018905e-07, "logits/chosen": -0.02178037166595459, "logits/rejected": 2.942920207977295, "logps/chosen": -406.655517578125, "logps/rejected": -904.6901245117188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.956550598144531, "rewards/margins": 27.37434959411621, "rewards/rejected": -36.330902099609375, "step": 4459 }, { "epoch": 2.7744945567651635, "grad_norm": 0.012821480631828308, "learning_rate": 4.1609036422314433e-07, "logits/chosen": 1.586922526359558, "logits/rejected": 0.9899404048919678, "logps/chosen": -728.854736328125, "logps/rejected": -999.9464111328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.720413208007812, "rewards/margins": 25.15831184387207, "rewards/rejected": -36.878726959228516, "step": 4460 }, { "epoch": 2.775116640746501, "grad_norm": 0.0007586081046611071, "learning_rate": 4.149377593360996e-07, "logits/chosen": 1.5977089405059814, "logits/rejected": 3.58918833732605, "logps/chosen": -558.2069702148438, "logps/rejected": -910.0013427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.199235916137695, "rewards/margins": 22.81451416015625, "rewards/rejected": -27.013748168945312, "step": 4461 }, { "epoch": 2.7757387247278382, "grad_norm": 0.08420630544424057, "learning_rate": 4.137851544490549e-07, "logits/chosen": 0.057189732789993286, "logits/rejected": 1.9672027826309204, "logps/chosen": -508.4757080078125, "logps/rejected": -935.8024291992188, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.608369827270508, "rewards/margins": 26.938785552978516, "rewards/rejected": -33.547157287597656, "step": 4462 }, { "epoch": 2.7763608087091756, "grad_norm": 4.442808130988851e-05, "learning_rate": 4.1263254956201016e-07, "logits/chosen": 2.554918050765991, "logits/rejected": 3.760105848312378, "logps/chosen": -675.0526123046875, "logps/rejected": -1031.32080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.590851783752441, "rewards/margins": 26.76144027709961, "rewards/rejected": -34.352291107177734, "step": 4463 }, { "epoch": 2.7769828926905133, "grad_norm": 29.725894927978516, "learning_rate": 4.1147994467496544e-07, "logits/chosen": 2.8413314819335938, "logits/rejected": 5.103183746337891, "logps/chosen": -694.7820434570312, "logps/rejected": -1091.3458251953125, "loss": 0.2112, "rewards/accuracies": 0.875, "rewards/chosen": -10.534599304199219, "rewards/margins": 25.128934860229492, "rewards/rejected": -35.66353225708008, "step": 4464 }, { "epoch": 2.7776049766718507, "grad_norm": 2.02376651763916, "learning_rate": 4.103273397879207e-07, "logits/chosen": -0.6693498492240906, "logits/rejected": 1.4553828239440918, "logps/chosen": -387.061279296875, "logps/rejected": -801.5008544921875, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -3.7194764614105225, "rewards/margins": 25.815839767456055, "rewards/rejected": -29.535316467285156, "step": 4465 }, { "epoch": 2.778227060653188, "grad_norm": 0.07350405305624008, "learning_rate": 4.09174734900876e-07, "logits/chosen": 0.9615667462348938, "logits/rejected": 3.035022258758545, "logps/chosen": -574.1995239257812, "logps/rejected": -965.72607421875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.539543151855469, "rewards/margins": 29.673593521118164, "rewards/rejected": -35.213134765625, "step": 4466 }, { "epoch": 2.778849144634526, "grad_norm": 1.4751883745193481, "learning_rate": 4.080221300138313e-07, "logits/chosen": -1.4429993629455566, "logits/rejected": 0.7418410778045654, "logps/chosen": -419.0539245605469, "logps/rejected": -773.825439453125, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": -8.026326179504395, "rewards/margins": 21.981840133666992, "rewards/rejected": -30.008167266845703, "step": 4467 }, { "epoch": 2.779471228615863, "grad_norm": 4.9203539674635977e-05, "learning_rate": 4.068695251267866e-07, "logits/chosen": -1.568742275238037, "logits/rejected": 1.5835626125335693, "logps/chosen": -339.059814453125, "logps/rejected": -995.8690185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.751646518707275, "rewards/margins": 37.416934967041016, "rewards/rejected": -42.168582916259766, "step": 4468 }, { "epoch": 2.7800933125972005, "grad_norm": 0.014542018994688988, "learning_rate": 4.057169202397419e-07, "logits/chosen": 0.9699658751487732, "logits/rejected": 0.5567260980606079, "logps/chosen": -613.11572265625, "logps/rejected": -860.3040771484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.781393051147461, "rewards/margins": 26.79910659790039, "rewards/rejected": -35.58049774169922, "step": 4469 }, { "epoch": 2.780715396578538, "grad_norm": 0.810660183429718, "learning_rate": 4.0456431535269716e-07, "logits/chosen": 1.6214731931686401, "logits/rejected": 1.755047082901001, "logps/chosen": -628.4044799804688, "logps/rejected": -830.4536743164062, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -9.023491859436035, "rewards/margins": 22.268905639648438, "rewards/rejected": -31.292400360107422, "step": 4470 }, { "epoch": 2.7813374805598756, "grad_norm": 0.725084662437439, "learning_rate": 4.0341171046565244e-07, "logits/chosen": -2.426719903945923, "logits/rejected": 3.6258738040924072, "logps/chosen": -349.99017333984375, "logps/rejected": -1080.127685546875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -5.08149528503418, "rewards/margins": 35.7020149230957, "rewards/rejected": -40.78350830078125, "step": 4471 }, { "epoch": 2.781959564541213, "grad_norm": 0.2610771656036377, "learning_rate": 4.022591055786077e-07, "logits/chosen": -2.8877463340759277, "logits/rejected": 0.4570479989051819, "logps/chosen": -481.33953857421875, "logps/rejected": -955.6806030273438, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -9.037692070007324, "rewards/margins": 28.204362869262695, "rewards/rejected": -37.24205780029297, "step": 4472 }, { "epoch": 2.7825816485225507, "grad_norm": 2.493027925491333, "learning_rate": 4.01106500691563e-07, "logits/chosen": -1.6067466735839844, "logits/rejected": 3.3695433139801025, "logps/chosen": -488.156005859375, "logps/rejected": -1194.8486328125, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": -11.307488441467285, "rewards/margins": 40.60865020751953, "rewards/rejected": -51.916141510009766, "step": 4473 }, { "epoch": 2.783203732503888, "grad_norm": 24.57866668701172, "learning_rate": 3.999538958045182e-07, "logits/chosen": -0.47966277599334717, "logits/rejected": 3.1841583251953125, "logps/chosen": -600.14013671875, "logps/rejected": -1150.5703125, "loss": 0.1397, "rewards/accuracies": 0.875, "rewards/chosen": -10.52299976348877, "rewards/margins": 30.642662048339844, "rewards/rejected": -41.1656608581543, "step": 4474 }, { "epoch": 2.7838258164852254, "grad_norm": 0.0020669519435614347, "learning_rate": 3.988012909174735e-07, "logits/chosen": -0.7197089195251465, "logits/rejected": 2.831148624420166, "logps/chosen": -545.8742065429688, "logps/rejected": -1166.651123046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.606180191040039, "rewards/margins": 38.95869064331055, "rewards/rejected": -46.56487274169922, "step": 4475 }, { "epoch": 2.7844479004665628, "grad_norm": 1.5602360008415417e-06, "learning_rate": 3.976486860304288e-07, "logits/chosen": -0.7320283651351929, "logits/rejected": 2.878640651702881, "logps/chosen": -419.9617919921875, "logps/rejected": -982.1232299804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.648433685302734, "rewards/margins": 34.59681701660156, "rewards/rejected": -40.24524688720703, "step": 4476 }, { "epoch": 2.7850699844479005, "grad_norm": 0.0013564362889155746, "learning_rate": 3.9649608114338405e-07, "logits/chosen": 0.333895206451416, "logits/rejected": 4.933841705322266, "logps/chosen": -350.03466796875, "logps/rejected": -985.833740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.056970119476318, "rewards/margins": 32.718589782714844, "rewards/rejected": -39.77556610107422, "step": 4477 }, { "epoch": 2.785692068429238, "grad_norm": 0.7813869118690491, "learning_rate": 3.9534347625633933e-07, "logits/chosen": -0.41498851776123047, "logits/rejected": 3.3700923919677734, "logps/chosen": -596.0548095703125, "logps/rejected": -1053.457275390625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -6.659764766693115, "rewards/margins": 23.89316177368164, "rewards/rejected": -30.55292510986328, "step": 4478 }, { "epoch": 2.7863141524105757, "grad_norm": 0.2598787546157837, "learning_rate": 3.941908713692946e-07, "logits/chosen": -2.235633134841919, "logits/rejected": 0.9133669137954712, "logps/chosen": -472.2154541015625, "logps/rejected": -889.4255981445312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -6.542850494384766, "rewards/margins": 19.466215133666992, "rewards/rejected": -26.00906753540039, "step": 4479 }, { "epoch": 2.786936236391913, "grad_norm": 1.2075815200805664, "learning_rate": 3.930382664822499e-07, "logits/chosen": 2.1868131160736084, "logits/rejected": 1.7234981060028076, "logps/chosen": -726.6332397460938, "logps/rejected": -897.5386962890625, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -5.894367694854736, "rewards/margins": 21.893020629882812, "rewards/rejected": -27.787389755249023, "step": 4480 }, { "epoch": 2.7875583203732504, "grad_norm": 0.0008331398130394518, "learning_rate": 3.918856615952052e-07, "logits/chosen": -2.639051914215088, "logits/rejected": 2.49995493888855, "logps/chosen": -304.4178466796875, "logps/rejected": -796.910400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.245255947113037, "rewards/margins": 26.872276306152344, "rewards/rejected": -34.11753463745117, "step": 4481 }, { "epoch": 2.7881804043545877, "grad_norm": 0.01270584762096405, "learning_rate": 3.907330567081605e-07, "logits/chosen": 1.176623821258545, "logits/rejected": 2.877804756164551, "logps/chosen": -648.4818115234375, "logps/rejected": -917.58740234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.145034790039062, "rewards/margins": 25.277450561523438, "rewards/rejected": -35.4224853515625, "step": 4482 }, { "epoch": 2.7888024883359255, "grad_norm": 0.6171900033950806, "learning_rate": 3.8958045182111577e-07, "logits/chosen": 0.6409913301467896, "logits/rejected": 1.7398476600646973, "logps/chosen": -533.915771484375, "logps/rejected": -829.6786499023438, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -11.03984260559082, "rewards/margins": 21.923110961914062, "rewards/rejected": -32.96295166015625, "step": 4483 }, { "epoch": 2.789424572317263, "grad_norm": 0.49171316623687744, "learning_rate": 3.8842784693407105e-07, "logits/chosen": -0.3405499756336212, "logits/rejected": 1.7967162132263184, "logps/chosen": -491.95098876953125, "logps/rejected": -943.3385620117188, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -6.940799713134766, "rewards/margins": 28.583152770996094, "rewards/rejected": -35.52395248413086, "step": 4484 }, { "epoch": 2.7900466562986, "grad_norm": 1.9615070812051272e-07, "learning_rate": 3.872752420470263e-07, "logits/chosen": 0.26819831132888794, "logits/rejected": 4.310084819793701, "logps/chosen": -454.3849182128906, "logps/rejected": -1021.4901733398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.043222427368164, "rewards/margins": 30.55453109741211, "rewards/rejected": -39.597755432128906, "step": 4485 }, { "epoch": 2.790668740279938, "grad_norm": 0.460784912109375, "learning_rate": 3.861226371599816e-07, "logits/chosen": -1.8704692125320435, "logits/rejected": 2.6920218467712402, "logps/chosen": -410.005126953125, "logps/rejected": -965.3362426757812, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -6.3175435066223145, "rewards/margins": 34.1805534362793, "rewards/rejected": -40.49809265136719, "step": 4486 }, { "epoch": 2.7912908242612753, "grad_norm": 5.327062535798177e-05, "learning_rate": 3.849700322729369e-07, "logits/chosen": -2.625596523284912, "logits/rejected": 1.3865752220153809, "logps/chosen": -388.6800842285156, "logps/rejected": -889.3480224609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.086573600769043, "rewards/margins": 23.496519088745117, "rewards/rejected": -29.583093643188477, "step": 4487 }, { "epoch": 2.7919129082426126, "grad_norm": 32.0152702331543, "learning_rate": 3.8381742738589216e-07, "logits/chosen": 0.7535173892974854, "logits/rejected": 2.33378005027771, "logps/chosen": -550.8919677734375, "logps/rejected": -866.2294921875, "loss": 0.5518, "rewards/accuracies": 0.875, "rewards/chosen": -8.010148048400879, "rewards/margins": 23.95358657836914, "rewards/rejected": -31.963733673095703, "step": 4488 }, { "epoch": 2.79253499222395, "grad_norm": 0.002230971585959196, "learning_rate": 3.8266482249884744e-07, "logits/chosen": -0.4587140381336212, "logits/rejected": -0.24935391545295715, "logps/chosen": -423.4178161621094, "logps/rejected": -713.5352783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.603036880493164, "rewards/margins": 25.637386322021484, "rewards/rejected": -34.240421295166016, "step": 4489 }, { "epoch": 2.7931570762052877, "grad_norm": 0.02404855750501156, "learning_rate": 3.815122176118027e-07, "logits/chosen": 1.2902207374572754, "logits/rejected": 2.8667750358581543, "logps/chosen": -725.5426635742188, "logps/rejected": -1105.2545166015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.346529006958008, "rewards/margins": 27.529224395751953, "rewards/rejected": -40.875755310058594, "step": 4490 }, { "epoch": 2.793779160186625, "grad_norm": 0.003159100888296962, "learning_rate": 3.8035961272475794e-07, "logits/chosen": -0.68532794713974, "logits/rejected": 3.2057533264160156, "logps/chosen": -442.5166015625, "logps/rejected": -1119.332275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.750860214233398, "rewards/margins": 41.39916229248047, "rewards/rejected": -52.1500244140625, "step": 4491 }, { "epoch": 2.794401244167963, "grad_norm": 0.9106011390686035, "learning_rate": 3.792070078377132e-07, "logits/chosen": 2.084537982940674, "logits/rejected": 3.1576859951019287, "logps/chosen": -626.247802734375, "logps/rejected": -874.1102905273438, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -8.74012565612793, "rewards/margins": 18.01773452758789, "rewards/rejected": -26.75786018371582, "step": 4492 }, { "epoch": 2.7950233281493, "grad_norm": 0.0044584497809410095, "learning_rate": 3.780544029506685e-07, "logits/chosen": 2.107027769088745, "logits/rejected": 4.1379594802856445, "logps/chosen": -637.359130859375, "logps/rejected": -1030.0760498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.089095115661621, "rewards/margins": 30.950481414794922, "rewards/rejected": -41.039573669433594, "step": 4493 }, { "epoch": 2.7956454121306376, "grad_norm": 3.15701836370863e-05, "learning_rate": 3.769017980636238e-07, "logits/chosen": -1.1554534435272217, "logits/rejected": 2.479703664779663, "logps/chosen": -464.4109802246094, "logps/rejected": -899.0807495117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.289630889892578, "rewards/margins": 26.90167999267578, "rewards/rejected": -34.19131088256836, "step": 4494 }, { "epoch": 2.796267496111975, "grad_norm": 2.108475200657267e-06, "learning_rate": 3.757491931765791e-07, "logits/chosen": 0.3373467028141022, "logits/rejected": 3.91178297996521, "logps/chosen": -618.9368286132812, "logps/rejected": -1257.044189453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.883026599884033, "rewards/margins": 38.584205627441406, "rewards/rejected": -45.46723175048828, "step": 4495 }, { "epoch": 2.7968895800933127, "grad_norm": 0.10682783275842667, "learning_rate": 3.745965882895344e-07, "logits/chosen": 0.47683557868003845, "logits/rejected": 2.9689526557922363, "logps/chosen": -599.391357421875, "logps/rejected": -1023.8408203125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -11.146797180175781, "rewards/margins": 27.955692291259766, "rewards/rejected": -39.10249328613281, "step": 4496 }, { "epoch": 2.79751166407465, "grad_norm": 2.7964730262756348, "learning_rate": 3.7344398340248966e-07, "logits/chosen": 0.1441301703453064, "logits/rejected": 2.867811679840088, "logps/chosen": -398.3563232421875, "logps/rejected": -792.4764404296875, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -7.013823986053467, "rewards/margins": 22.58769416809082, "rewards/rejected": -29.601516723632812, "step": 4497 }, { "epoch": 2.798133748055988, "grad_norm": 2.440301250317134e-05, "learning_rate": 3.7229137851544494e-07, "logits/chosen": 2.6866021156311035, "logits/rejected": 4.955380439758301, "logps/chosen": -597.2278442382812, "logps/rejected": -968.24755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.944086074829102, "rewards/margins": 27.064598083496094, "rewards/rejected": -38.00868606567383, "step": 4498 }, { "epoch": 2.798755832037325, "grad_norm": 35.87205505371094, "learning_rate": 3.711387736284002e-07, "logits/chosen": 1.2314321994781494, "logits/rejected": 3.3555502891540527, "logps/chosen": -576.2147216796875, "logps/rejected": -910.40869140625, "loss": 0.3366, "rewards/accuracies": 0.875, "rewards/chosen": -9.402275085449219, "rewards/margins": 21.1616153717041, "rewards/rejected": -30.563892364501953, "step": 4499 }, { "epoch": 2.7993779160186625, "grad_norm": 0.0009058943251147866, "learning_rate": 3.699861687413555e-07, "logits/chosen": -0.4183881878852844, "logits/rejected": 1.6836124658584595, "logps/chosen": -588.3963012695312, "logps/rejected": -1056.568603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.816722869873047, "rewards/margins": 34.03743362426758, "rewards/rejected": -41.854156494140625, "step": 4500 } ], "logging_steps": 1, "max_steps": 4821, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }